PyPI - sequenzo - Versions diffs - 0.1.31__cp310-cp310-macosx_10_9_x86_64.whl - Mend

sequenzo 0.1.31__cp310-cp310-macosx_10_9_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (299) hide show

sequenzo/big_data/clara/utils/wfcmdd.py ADDED Viewed

@@ -0,0 +1,205 @@
+"""
+@Author  : 李欣怡
+@File    : wfcmdd.py
+@Time    : 2024/12/28 13:38
+@Desc    :
+"""
+import numpy as np
+import pandas as pd
+import warnings
+def wfcmdd(diss, memb, weights=None, method="FCMdd", m=2, dnoise=None, eta=None, alpha=0.001,
+           iter_max=100, verbose=False, dlambda=None):
+    # Setting and checking argument values
+    METHODS = ["NCdd", "HNCdd", "FCMdd", "PCMdd"]
+    if method not in METHODS:
+        raise ValueError(f" [!] Method must be one of {METHODS}.")
+    # TODO:源码中没有 weights = null 时的处理
+    if weights is None:
+        weights = np.ones(len(diss), dtype=int)
+    # R 源码中只定义未使用
+    # pweights = weights / np.sum(weights)
+    d = np.array(diss)
+    n = d.shape[0]
+    if method == "NCdd":
+        if dnoise is None and dlambda is None:
+            raise ValueError(" [!] Must provide a value for dnoise or dlambda.")
+        if dlambda is not None:
+            dnoise = 1
+    elif method == "HNCdd":
+        if dnoise is None:
+            raise ValueError(" [!] Must provide a value for dnoise.")
+        m = 1
+    elif method == "PCMdd":
+        if eta is None:
+            raise ValueError(" [!] Must provide a vector of values for eta.")
+    # Checking the membership matrix (memb)
+    if isinstance(memb, (pd.DataFrame, np.ndarray)):  # Check if memb is matrix or dataframe-like
+        if memb.shape[0] != d.shape[1]:
+            raise ValueError(" [!] The number of rows in memb must be the same as the number of rows and columns of d.")
+        u = memb.to_numpy() if isinstance(memb, pd.DataFrame) else memb
+    elif isinstance(memb, list) and all(isinstance(x, (int, float)) for x in memb):
+    # else if (is.vector(memb) && is.numeric(memb))
+        u = np.zeros((n, len(memb)))
+        for k in range(len(memb)):
+            u[memb[k], k] = 1
+    else:
+        raise ValueError("[!] Provide a number, a vector of seeds, or membership matrix for mobile clusters.")
+    kMov = u.shape[1]
+    med = np.full(kMov, np.nan)
+    if method == "PCMdd" and len(eta) != kMov:
+        raise ValueError(" [!] Vector of reference distances (eta) must have a length equal to the number of clusters.")
+    if method in ["NCdd", "HNCdd"]:
+        # u <- cbind(u, vector("numeric", length = n))
+        u = np.hstack([u, np.zeros((n, 1))])
+    kMovNC = u.shape[1]
+    # print("kMovNC = ", kMovNC)
+    uPrev = np.zeros((n, kMovNC))
+    if dlambda is not None:
+        kdiv = kMov * np.sum(weights)
+    dist2med = np.zeros((n, kMovNC))
+    # print("dist2med = ", dist2med)
+    if method in ["NCdd", "HNCdd"]:
+        dist2med[:, kMovNC - 1] = dnoise
+    continue_flag = True
+    iter_count = 1
+    uPrev2 = 0
+    # print("u = ", u)
+    # print("d = ", d)
+    # print("med = ", med)
+    while continue_flag:
+        # Finding centers
+        for k in range(kMov):
+            # candidates < - which(apply(u[, -k, drop=FALSE], 1, max) < 1 & (!1:n % in %med[0:(k - 1)]))
+            # med[k] < - candidates[which.min((u[, k] ^ m * weights) % * % d[, candidates])]
+            # dist2med[, k] < - d[, med[k]]
+            u_removed_k = np.delete(u, k, axis=1)      # 去掉第 k 列
+            max_per_row = np.max(u_removed_k, axis=1)  # 每行的最大值
+            # 查找最大值小于 1 的行
+            candidates = np.where((max_per_row < 1) & (~np.isin(np.arange(1, len(u) + 1), med[:k])))[0]
+            # print("candidates = ", candidates)
+            u_k_m = u[:, k] ** m
+            # print("u_k_m = ", u_k_m)
+            # 按照权重与距离矩阵进行矩阵乘法
+            weighted_u_k_m = u_k_m * weights
+            # print("weighted_u_k_m =", weighted_u_k_m)
+            # 从 d 中选择 candidates 列
+            d_candidates = d[:, candidates]
+            # print("d_candidates =", d_candidates)
+            # 进行矩阵乘法
+            product = weighted_u_k_m @ d_candidates
+            # print("product = ", product)
+            # 选取最小值对应的索引
+            min_index = np.argmin(product)
+            # print("min_index = ", min_index)
+            med[k] = candidates[min_index]  # 更新 med[k]
+            # print("med[k] = ", med[k])
+            dist2med[:, k] = d[:, int(med[k])]
+            # print("dist2med[:, k] = ", dist2med[:, k])
+        # Updating dnoise for adaptive dnoise clustering
+        if dlambda is not None and method == "NCdd":
+            dnoise = dlambda * np.sum(dist2med[:, :-1] * weights[:, None]) / (kMov * np.sum(weights))
+            dist2med[:, kMovNC - 1] = dnoise
+        # Updating membership
+        if method == "HNCdd":
+            d2cm = np.hstack([dist2med, np.full((dist2med.shape[0], 1), dnoise)])
+            u = np.zeros_like(u)
+            minC = np.argmin(d2cm, axis=1)
+            for i in range(len(minC)):
+                u[i, minC[i]] = 1
+        elif method in ["FCMdd", "NCdd"]:
+            with warnings.catch_warnings():
+                warnings.simplefilter("ignore")
+                # dist2med_safe = np.where(dist2med == 0, 1e-10, dist2med)
+                # TODO : 不显示中间报错
+                u = (1 / dist2med) ** (1 / (m - 1))
+                u /= np.sum(u, axis=1, keepdims=True)
+                u[dist2med == 0] = 1
+        elif method == "PCMdd":
+            for k in range(kMov):
+                u[:, k] = 1 / (1 + (dist2med[:, k] / eta[k]) ** (1 / (m - 1)))
+            u[dist2med == 0] = 1
+        # Checking convergence
+        if iter_count > 2:
+            continue_flag = np.max(np.abs(u - uPrev)) > alpha and iter_count <= iter_max \
+                            and np.max(np.abs(u - uPrev2)) > alpha
+        if continue_flag:
+            uPrev2 = uPrev
+            uPrev = u
+            iter_count += 1
+            if verbose:
+                print(".", end="")
+    # Calculate the functional value
+    if method in ["NCdd", "FCMdd"]:
+        functional = np.sum(dist2med * (u ** m) * weights[:, None])
+    elif method == "HNCdd":
+        functional = np.sum(dist2med * (u ** m) * weights[:, None])
+    elif method == "PCMdd":
+        functional = 0
+        for k in range(kMov):
+            functional += np.sum(dist2med[:, k] * (u[:, k] ** m) * weights) + np.sum(
+                eta[k] * (1 - u[:, k]) ** m * weights)
+    if verbose:
+        print(f"\nIterations: {iter_count}, Functional: {functional}")
+    mobile_centers = med[:kMov]
+    return {
+        "dnoise": dnoise,
+        "memb": u,
+        "mobileCenters": mobile_centers,
+        "functional": functional
+    }
+if __name__ == "__main__":
+    diss = np.array([[0.0, 1.0, 2.0],
+                     [1.0, 0.0, 1.0],
+                     [2.0, 1.0, 0.0]])
+    diss = pd.DataFrame(diss)
+    memb = np.array([[0.7, 0.3],
+                     [0.2, 0.8],
+                     [0.5, 0.5]])
+    result = wfcmdd(diss=diss, memb=memb, method="FCMdd")
+    print("result['dnoise'] = ", result['dnoise'])
+    print("result['memb'] =")
+    print(result['memb'])
+    print("result['mobileCenters'] = ", result['mobileCenters'])
+    print("result['functional'] = ", result['functional'])

sequenzo/big_data/clara/visualization.py ADDED Viewed

@@ -0,0 +1,88 @@
+"""
+@Author  : Yuqi Liang 梁彧祺
+@File    : visualization.py
+@Time    : 04/04/2025 15:21
+@Desc    :
+"""
+import matplotlib.pyplot as plt
+import seaborn as sns
+import numpy as np
+import pandas as pd
+def plot_scores_from_dataframe(df,
+                               k_col="k",
+                               metrics=None,
+                               norm="zscore",
+                               title="CLARA Cluster Quality Metrics",
+                               palette="Set2",
+                               line_width=2,
+                               style="whitegrid",
+                               xlabel="Number of Clusters",
+                               ylabel="Normalized Score",
+                               grid=True,
+                               save_as=None,
+                               dpi=200,
+                               figsize=(12, 8)):
+    """
+    Plot clustering metrics directly from a summary DataFrame (e.g., loaded from CSV).
+    :param df: DataFrame with clustering metrics. Must include a 'k' column.
+    :param k_col: Column name indicating the number of clusters.
+    :param metrics: List of metric columns to plot. If None, auto-detect numeric columns.
+    :param norm: Normalization method for plotting ('zscore', 'range', or 'none')
+    :param title: Plot title
+    :param palette: Color palette for the plot
+    :param line_width: Width of plotted lines
+    :param style: Seaborn style for the plot
+    :param xlabel: X-axis label
+    :param ylabel: Y-axis label
+    :param grid: Whether to show grid lines
+    :param save_as: File path to save the plot (optional)
+    :param dpi: DPI for saved image
+    :param figsize: Figure size in inches
+    """
+    df = df.copy()
+    df = df.sort_values(by=k_col)
+    if metrics is None:
+        metrics = df.select_dtypes(include=[float, int]).columns.tolist()
+        blacklist = ["Best iter", k_col] # Removed best iter as it is not part of the indicators for cluster quality evaluation
+        metrics = [m for m in metrics if m not in blacklist]
+    normed = {}
+    for metric in metrics:
+        values = df[metric].values.astype(float)
+        if norm == "zscore":
+            mean = np.nanmean(values)
+            std = np.nanstd(values)
+            normed[metric] = (values - mean) / std if std > 0 else values
+        elif norm == "range":
+            min_val = np.nanmin(values)
+            max_val = np.nanmax(values)
+            normed[metric] = (values - min_val) / (max_val - min_val) if max_val > min_val else values
+        else:
+            normed[metric] = values
+    sns.set(style=style)
+    palette_colors = sns.color_palette(palette, len(metrics))
+    plt.figure(figsize=figsize)
+    for idx, metric in enumerate(metrics):
+        plt.plot(df[k_col], normed[metric],
+                 label=metric,
+                 linewidth=line_width,
+                 color=palette_colors[idx])
+    plt.title(title, fontsize=14, fontweight="bold")
+    plt.xlabel(xlabel)
+    plt.ylabel(ylabel)
+    plt.xticks(df[k_col])
+    plt.grid(grid, linestyle="--", alpha=0.6)
+    plt.legend(title="Metric", fontsize=10)
+    plt.tight_layout()
+    if save_as:
+        plt.savefig(save_as, dpi=dpi)
+    plt.show()

sequenzo/clustering/KMedoids.py ADDED Viewed

@@ -0,0 +1,178 @@
+"""
+@Author  : 李欣怡 Xinyi Li
+@File    : KMedoids.py
+@Time    : 2025/2/8 11:53
+@Desc    :
+"""
+import numpy as np
+from scipy.cluster.hierarchy import cut_tree
+import importlib
+import sequenzo.clustering.clustering_c_code
+clustering_c_code = importlib.import_module("sequenzo.clustering.clustering_c_code")
+from sequenzo.clustering.utils.disscenter import disscentertrim
+def KMedoids(diss, k, weights=None, npass=1, initialclust=None, method='PAMonce', cluster_only=False, verbose=True):
+    # Lazily import the c_code module to avoid circular dependencies during installation
+    # from .__init__ import _import_c_code
+    # c_code = _import_c_code()
+    # Convert method to integer if it's a string
+    method_original = method
+    if isinstance(method, str):
+        method = method.lower()
+        method_map = ["kmedoids", "pam", "pamonce"]
+        if method in method_map:
+            method = method_map.index(method) + 1  # 1-based index
+    if not (isinstance(method, int) and method in {1, 2, 3}):
+        raise ValueError(f"[!] Unknown clustering method: {method_original}.")
+    if verbose:
+        method_names = ["KMedoids", "PAM", "PAMonce"]
+        method_name = method_names[method - 1]
+        print(f"[>] Starting KMedoids clustering (method: {method_name}, k={k})...")
+    nelements = diss.shape[0]
+    if nelements != diss.shape[1]:
+        raise ValueError(f"[!] Dissipation matrix has {nelements} elements.")
+    def internal_random_sample(nelements, k):
+        return np.random.choice(nelements, k, replace=False)  # 0-based 直接适用
+    if weights is None:
+        weights = np.ones(diss.shape[1], dtype=float)
+    if len(weights) != nelements:
+        raise ValueError(f"[!] 'weights' should be a vector of length {nelements}.")
+    if initialclust is None:
+        initialclust = internal_random_sample(nelements, k)
+    else:
+        if _validate_linkage_matrix(initialclust):
+            # initialclust = fcluster(initialclust, k, criterion='maxclust')  # 1-based 索引
+            initialclust = cut_tree(initialclust, n_clusters=k).flatten() + 1  # 1-based 索引
+        # TODO : 现在已经得到一个组了，为什么不用这个组当作 PAMonce/PAM 算法的初始化？反而利用这个组去选中心点？
+        #  初始化中心点的必要性为什么大于组？初始化中心点无论好不好，最后经过不断迭代肯定能选出较好的
+        # TODO : 就算想要从子样本扩展到全数据，入口参数的这个组也是可以的呀？
+        if len(initialclust) == nelements:
+            initialclust = disscentertrim(diss=diss, group=initialclust, medoids_index="first", weights=weights)
+            if len(initialclust) != k:
+                raise ValueError(f"[!] 'initialclust' should be a vector of cluster membership with k={k}.")
+        npass = 0
+    if len(initialclust) != k:
+        raise ValueError(f"[!] 'initialclust' should be a vector of medoids index of length :{k}.")
+    if isinstance(initialclust, list):
+        initialclust = np.asarray(initialclust)
+    if np.any((initialclust >= nelements) | (initialclust < 0)):
+        raise ValueError(f"[!] Starting medoids should be in 1:{nelements}")
+    if npass < 0:
+        raise ValueError("[!] 'npass' should be greater than 0")
+    if k < 2 or k > nelements:
+        raise ValueError(f" [!] 'k' should be in [2, {nelements}]")
+    if method == 1:   # KMedoid
+        memb = clustering_c_code.KMedoid(nelements,
+                                         diss.astype(np.float64),
+                                         initialclust.astype(np.int32),
+                                         npass,
+                                         weights.astype(np.float64))
+    elif method == 2:  # PAM
+        memb = clustering_c_code.PAM(nelements,
+                                     diss.astype(np.float64),
+                                     initialclust.astype(np.int32),
+                                     npass,
+                                     weights.astype(np.float64))
+    else:   # PAMonce
+        memb = clustering_c_code.PAMonce(nelements,
+                                         diss.astype(np.float64),
+                                         initialclust.astype(np.int32),
+                                         npass,
+                                         weights.astype(np.float64))
+    memb_matrix = memb.runclusterloop()
+    if verbose:
+        print("[>] Computed Successfully.")
+    return memb_matrix
+def _validate_linkage_matrix(initialclust):
+    """
+    Check that the passed matrix matches the linkage matrix type requirements
+    """
+    if not isinstance(initialclust, np.ndarray):
+        return False    # Linkage matrix must be a NumPy array
+    if initialclust.ndim != 2 or initialclust.shape[1] != 4:
+        return False    # Linkage matrix must be a 2D array with 4 columns
+    if initialclust.dtype != np.float64:
+        return False    # Linkage matrix 'Z' must contain doubles (np.float64).
+    return True
+if __name__ == '__main__':
+    # Manual demo: builds a distance matrix from a local synthetic dataset and
+    # clusters it with KMedoids. Relies on names pulled in by the wildcard
+    # import below (SequenceData, get_distance_matrix, ...).
+    # TODO : importing KMedoids under Python 3.11 runs into a numpy problem
+    # TODO : the KMedoids module could not be found in sequenzo 0.1.14 (an __init__ problem, now fixed)
+    from sequenzo import *
+    import pandas as pd
+    # =====
+    #  CO2
+    # =====
+    # df = load_dataset('country_co2_emissions')
+    # time = list(df.columns)[1:]
+    # states = ['Very Low', 'Low', 'Middle', 'High', 'Very High']
+    # sequence_data = SequenceData(df, time=time, id_col="country", states=states)
+    # =========
+    # synthetic
+    # =========
+    # NOTE(review): absolute path on the author's machine; the demo only runs there.
+    df = pd.read_csv("/Users/xinyi/Projects/sequenzo/sequenzo/data_and_output/orignal data/not_real_detailed_data/synthetic_detailed_U5_N10000.csv")
+    _time = list(df.columns)[2:]
+    states = ["Data", "Data science", "Hardware", "Research", "Software", "Support & test", "Systems & infrastructure"]
+    df = df[['id', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10']]
+    sequence_data = SequenceData(df, time=_time, id_col="id", states=states)
+    om = get_distance_matrix(sequence_data, method="OM", sm="TRATE", indel="auto")
+    # Only referenced by the commented-out Example 2 below.
+    centroid_indices = [0, 50, 100, 150, 190]
+    n_pass = 1
+    weights = np.ones(len(om))
+    # Example 1: KMedoids algorithm without specifying the center point
+    # clustering = KMedoids(diss=om,
+    #                       k=5,
+    #                       method='KMedoids',
+    #                       npass=n_pass,
+    #                       weights=weights)
+    #
+    # # Example 2: PAM algorithm with a specified center point
+    # clustering = KMedoids(diss=om,
+    #                       k=5,
+    #                       method='PAM',
+    #                       initialclust=centroid_indices,
+    #                       npass=n_pass,
+    #                       weights=weights)
+    # Example 3: PAMonce algorithm with default parameters
+    clustering = KMedoids(diss=om,
+                          k=5,
+                          method='PAMonce',
+                          npass=n_pass,
+                          weights=weights)
+    print(clustering)
+    # Print the smallest and largest distinct value of the result.
+    uniq = np.unique(clustering)
+    print(uniq.min(), uniq.max())

sequenzo/clustering/__init__.py ADDED Viewed

@@ -0,0 +1,30 @@
+"""
+@Author  : Yuqi Liang 梁彧祺
+@File    : __init__.py
+@Time    : 27/02/2025 09:58
+@Desc    :
+"""
+from .hierarchical_clustering import Cluster, ClusterResults, ClusterQuality
+from .KMedoids import KMedoids
+def _import_c_code():
+    """Lazily import the c_code module to avoid circular dependencies during installation"""
+    try:
+        # Import built pybind11 extension placed under this package
+        from sequenzo.clustering import clustering_c_code
+        return clustering_c_code
+    except ImportError:
+        # If the C extension cannot be imported, return None
+        print(
+            "Warning: The C++ extension (c_code) could not be imported. Please ensure the extension module is compiled correctly.")
+        return None
+# Public names of the clustering package: these are what
+# `from sequenzo.clustering import *` exports.
+__all__ = [
+    "Cluster",
+    "ClusterResults",
+    "ClusterQuality",
+    "KMedoids",
+    # Add other functions as needed
+]

sequenzo/clustering/clustering_c_code.cpython-310-darwin.so ADDED Viewed

Binary file