sequenzo 0.1.17-cp39-cp39-macosx_10_9_universal2.whl → 0.1.18-cp39-cp39-macosx_10_9_universal2.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of sequenzo might be problematic.
- sequenzo/__init__.py +25 -1
- sequenzo/big_data/clara/clara.py +1 -1
- sequenzo/big_data/clara/utils/get_weighted_diss.c +157 -157
- sequenzo/big_data/clara/utils/get_weighted_diss.cpython-39-darwin.so +0 -0
- sequenzo/clustering/hierarchical_clustering.py +202 -8
- sequenzo/define_sequence_data.py +34 -2
- sequenzo/dissimilarity_measures/c_code.cpython-39-darwin.so +0 -0
- sequenzo/dissimilarity_measures/get_substitution_cost_matrix.py +1 -1
- sequenzo/dissimilarity_measures/src/DHDdistance.cpp +13 -37
- sequenzo/dissimilarity_measures/src/LCPdistance.cpp +13 -37
- sequenzo/dissimilarity_measures/src/OMdistance.cpp +12 -47
- sequenzo/dissimilarity_measures/src/OMspellDistance.cpp +103 -67
- sequenzo/dissimilarity_measures/src/dp_utils.h +160 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_arithmetic.hpp +41 -16
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_complex.hpp +4 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_details.hpp +7 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_logical.hpp +10 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_math.hpp +127 -43
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_memory.hpp +30 -2
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_swizzle.hpp +174 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_trigo.hpp +14 -5
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx.hpp +111 -54
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx2.hpp +131 -9
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp +11 -113
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp +39 -7
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512f.hpp +336 -30
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp +9 -37
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi2.hpp +58 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common.hpp +1 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common_fwd.hpp +35 -2
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_constants.hpp +3 -1
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_emulated.hpp +17 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp +13 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp +18 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma4.hpp +13 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_isa.hpp +8 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon.hpp +363 -34
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon64.hpp +7 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_rvv.hpp +13 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_scalar.hpp +41 -4
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse2.hpp +252 -16
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse3.hpp +9 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_ssse3.hpp +12 -1
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sve.hpp +7 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_vsx.hpp +892 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_wasm.hpp +78 -1
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_arch.hpp +3 -1
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_config.hpp +13 -2
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_cpuid.hpp +5 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_inline.hpp +5 -1
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_all_registers.hpp +2 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_api.hpp +64 -1
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch.hpp +36 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_rvv_register.hpp +40 -31
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_traits.hpp +8 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_vsx_register.hpp +77 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/xsimd.hpp +6 -0
- sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.c +157 -157
- sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.cpython-39-darwin.so +0 -0
- sequenzo/dissimilarity_measures/utils/seqconc.c +157 -157
- sequenzo/dissimilarity_measures/utils/seqconc.cpython-39-darwin.so +0 -0
- sequenzo/dissimilarity_measures/utils/seqdss.c +157 -157
- sequenzo/dissimilarity_measures/utils/seqdss.cpython-39-darwin.so +0 -0
- sequenzo/dissimilarity_measures/utils/seqdur.c +157 -157
- sequenzo/dissimilarity_measures/utils/seqdur.cpython-39-darwin.so +0 -0
- sequenzo/dissimilarity_measures/utils/seqlength.c +157 -157
- sequenzo/dissimilarity_measures/utils/seqlength.cpython-39-darwin.so +0 -0
- sequenzo/sequence_characteristics/__init__.py +4 -0
- sequenzo/sequence_characteristics/complexity_index.py +17 -57
- sequenzo/sequence_characteristics/overall_cross_sectional_entropy.py +177 -111
- sequenzo/sequence_characteristics/plot_characteristics.py +30 -11
- sequenzo/sequence_characteristics/simple_characteristics.py +1 -0
- sequenzo/sequence_characteristics/state_frequencies_and_entropy_per_sequence.py +9 -3
- sequenzo/sequence_characteristics/turbulence.py +47 -67
- sequenzo/sequence_characteristics/variance_of_spell_durations.py +19 -9
- sequenzo/sequence_characteristics/within_sequence_entropy.py +5 -58
- sequenzo/visualization/plot_sequence_index.py +58 -35
- sequenzo/visualization/plot_state_distribution.py +57 -36
- sequenzo/with_event_history_analysis/__init__.py +35 -0
- sequenzo/with_event_history_analysis/sequence_analysis_multi_state_model.py +850 -0
- sequenzo/with_event_history_analysis/sequence_history_analysis.py +283 -0
- {sequenzo-0.1.17.dist-info → sequenzo-0.1.18.dist-info}/METADATA +7 -6
- {sequenzo-0.1.17.dist-info → sequenzo-0.1.18.dist-info}/RECORD +86 -79
- {sequenzo-0.1.17.dist-info → sequenzo-0.1.18.dist-info}/WHEEL +0 -0
- {sequenzo-0.1.17.dist-info → sequenzo-0.1.18.dist-info}/licenses/LICENSE +0 -0
- {sequenzo-0.1.17.dist-info → sequenzo-0.1.18.dist-info}/top_level.txt +0 -0
Binary file
@@ -75,6 +75,12 @@ from scipy.spatial.distance import squareform
 # sklearn metrics no longer needed - using C++ implementation
 from fastcluster import linkage
 
+import rpy2.robjects as ro
+from rpy2.robjects import numpy2ri
+from rpy2.robjects.packages import importr
+from rpy2.robjects.conversion import localconverter
+from rpy2.robjects import FloatVector
+
 # Import C++ cluster quality functions
 try:
     from . import clustering_c_code
@@ -84,7 +90,7 @@ except ImportError:
     print("[!] Warning: C++ cluster quality functions not available. Using Python fallback.")
 
 # Corrected imports: Use relative imports *within* the package.
-from …
+from sequenzo.visualization.utils import save_and_show_results
 
 # Global flag to ensure Ward warning is only shown once per session
 _WARD_WARNING_SHOWN = False
@@ -259,6 +265,79 @@ def _clean_distance_matrix(matrix):
     return matrix
 
 
+def _hclust_to_linkage_matrix(linkage_matrix):
+    """
+    Convert an R `hclust` object to a SciPy-compatible linkage matrix.
+
+    This function takes an `hclust` object returned by R (e.g., from
+    `fastcluster::hclust`) and converts it into the standard linkage matrix
+    format used by SciPy (`scipy.cluster.hierarchy.linkage`), which can be
+    used for dendrogram plotting or further clustering analysis in Python.
+
+    Parameters
+    ----------
+    linkage_matrix : rpy2.robjects.ListVector
+        An R `hclust` object. Expected to contain at least the following fields:
+        - 'merge': ndarray of shape (n-1, 2), indicating which clusters are merged
+          at each step (negative indices for original observations,
+          positive indices for previously merged clusters).
+        - 'height': ndarray of shape (n-1,), distances at which merges occur.
+        - 'order': ordering of the leaves.
+
+    Returns
+    -------
+    Z : numpy.ndarray, shape (n-1, 4), dtype=float
+        A SciPy-compatible linkage matrix where each row represents a merge:
+        - Z[i, 0] : index of the first cluster (0-based)
+        - Z[i, 1] : index of the second cluster (0-based)
+        - Z[i, 2] : distance between the merged clusters
+        - Z[i, 3] : total number of original samples in the newly formed cluster
+
+    Notes
+    -----
+    - The conversion handles the difference in indexing:
+      - In R's `hclust`, negative numbers in 'merge' indicate original samples
+        and positive numbers indicate previously merged clusters (1-based).
+      - In the returned SciPy linkage matrix, all indices are converted to 0-based.
+    - The function iteratively tracks cluster sizes to populate the fourth column
+      (sample counts) required by SciPy.
+    """
+
+    n = len(linkage_matrix.rx2("order"))  # number of samples
+    merge = np.array(linkage_matrix.rx2("merge"), dtype=int)  # (n-1, 2)
+    height = np.array(linkage_matrix.rx2("height"), dtype=float)
+
+    cluster_sizes = np.ones(n, dtype=int)  # every original sample starts with size 1
+    Z = np.zeros((n - 1, 4), dtype=float)
+
+    for i in range(n - 1):
+        a, b = merge[i]
+
+        # In R's hclust, negative entries refer to original samples
+        if a < 0:
+            idx1 = -a - 1  # convert to 0-based
+            size1 = 1
+        else:
+            idx1 = n + a - 1  # previously merged cluster, 0-based
+            size1 = cluster_sizes[idx1]
+
+        if b < 0:
+            idx2 = -b - 1
+            size2 = 1
+        else:
+            idx2 = n + b - 1
+            size2 = cluster_sizes[idx2]
+
+        Z[i, 0] = idx1
+        Z[i, 1] = idx2
+        Z[i, 2] = height[i]
+        Z[i, 3] = size1 + size2
+
+        # Record the size of the newly formed cluster for later merges
+        cluster_sizes = np.append(cluster_sizes, size1 + size2)
+
+    return Z
+
 class Cluster:
     def __init__(self,
                  matrix,
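The new helper only converts the R object; the sketch below shows how the resulting matrix plugs back into SciPy. It is a minimal, hypothetical usage example: it assumes a local R installation plus rpy2, uses base R's hclust rather than the fastcluster R package, and assumes the private helper can be imported from sequenzo.clustering.hierarchical_clustering.

    # Hypothetical usage sketch; requires R + rpy2, import path is an assumption.
    import numpy as np
    import rpy2.robjects as ro
    from scipy.cluster.hierarchy import fcluster

    from sequenzo.clustering.hierarchical_clustering import _hclust_to_linkage_matrix

    # A tiny symmetric distance matrix for 4 observations
    D = np.array([[0.0, 1.0, 4.0, 5.0],
                  [1.0, 0.0, 3.0, 6.0],
                  [4.0, 3.0, 0.0, 2.0],
                  [5.0, 6.0, 2.0, 0.0]])

    # Build an R 'dist' object and cluster with base R's hclust
    d_r = ro.r['as.dist'](ro.r.matrix(ro.FloatVector(D.flatten('F')),
                                      nrow=D.shape[0], ncol=D.shape[1]))
    hc = ro.r['hclust'](d_r, method="ward.D")

    # Convert to a SciPy linkage matrix and cut the tree into 2 clusters
    Z = _hclust_to_linkage_matrix(hc)
    print(Z)                                        # (n-1, 4) linkage matrix
    print(fcluster(Z, t=2, criterion='maxclust'))   # 1-based cluster labels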
@@ -358,12 +437,27 @@ class Cluster:
         try:
             # Map our method names to fastcluster's expected method names
             fastcluster_method = self._map_method_name(self.clustering_method)
-
-
+
+            if self.clustering_method == "ward_d" or self.clustering_method == "ward":
+                fastcluster_r = importr("fastcluster")
+
+                # Convert full_matrix into an R matrix (built directly from the Python array)
+                # so that rpy2 does not fail on very long vectors.
+                # Flatten with 'F' to force column-major order, matching R's memory layout.
+                full_matrix_r = ro.r.matrix(ro.FloatVector(self.full_matrix.flatten('F')),
+                                            nrow=self.full_matrix.shape[0], ncol=self.full_matrix.shape[1])
+                r_om = ro.r['as.dist'](full_matrix_r)
+
+                linkage_matrix = fastcluster_r.hclust(r_om, method="ward.D")
+
+                linkage_matrix = _hclust_to_linkage_matrix(linkage_matrix)
+
+            else:
+                linkage_matrix = linkage(self.condensed_matrix, method=fastcluster_method)
+
             # Apply Ward D correction if needed (divide distances by 2 for classic Ward)
-            if self.clustering_method == "ward_d":
-
-
+            # if self.clustering_method == "ward_d":
+            #     linkage_matrix = self._apply_ward_d_correction(linkage_matrix)
+
         except Exception as e:
             raise RuntimeError(
                 f"Failed to compute linkage with method '{self.clustering_method}'. "
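The comment about column-major order is the crux of the R round trip: R's matrix() fills values column by column, so the NumPy array has to be flattened in Fortran ('F') order for every value to land in the same cell. A small self-contained check of that layout argument (pure NumPy, no rpy2 involved):

    # Why flatten('F'): R lays a matrix out column by column.
    import numpy as np

    A = np.arange(6, dtype=float).reshape(2, 3)   # [[0,1,2],[3,4,5]]

    flat_c = A.flatten('C')                       # row-major:    0 1 2 3 4 5
    flat_f = A.flatten('F')                       # column-major: 0 3 1 4 2 5

    # Rebuilding column-by-column, as R's matrix(nrow=2, ncol=3) does:
    print(np.array_equal(flat_f.reshape(A.shape, order='F'), A))  # True
    print(np.array_equal(flat_c.reshape(A.shape, order='F'), A))  # False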
@@ -1080,5 +1174,105 @@ class ClusterResults:
         save_and_show_results(save_as, dpi)
 
 
-
-
+# For xinyi's test, because she can't debug in Jupyter:
+# Traceback (most recent call last):
+#   File "/Applications/PyCharm.app/Contents/plugins/python-ce/helpers/pydev/_pydevd_bundle/pydevd_comm.py", line 736, in make_thread_stack_str
+#     append('file="%s" line="%s">' % (make_valid_xml_value(my_file), lineno))
+#   File "/Applications/PyCharm.app/Contents/plugins/python-ce/helpers/pydev/_pydevd_bundle/pydevd_xml.py", line 36, in make_valid_xml_value
+#     return s.replace("&", "&amp;").replace('<', '&lt;').replace('>', '&gt;').replace('"', '&quot;')
+# AttributeError: 'tuple' object has no attribute 'replace'
+
+if __name__ == '__main__':
+    # Import necessary libraries
+    # Your calling code (e.g., in a script or notebook)
+
+    from sequenzo import *  # Import the package, give it a short alias
+    import pandas as pd  # Data manipulation
+    import numpy as np
+
+    # List all the available datasets in Sequenzo
+    # Now access functions using the alias:
+    print('Available datasets in Sequenzo: ', list_datasets())
+
+    # Load the data that we would like to explore in this tutorial
+    # `df` is short for `dataframe`, which is a common variable name for a dataset
+    # df = load_dataset('country_co2_emissions')
+    df = load_dataset('mvad')
+
+    # Time columns
+    time_list = ['Jul.93', 'Aug.93', 'Sep.93', 'Oct.93', 'Nov.93', 'Dec.93',
+                 'Jan.94', 'Feb.94', 'Mar.94', 'Apr.94', 'May.94', 'Jun.94', 'Jul.94',
+                 'Aug.94', 'Sep.94', 'Oct.94', 'Nov.94', 'Dec.94', 'Jan.95', 'Feb.95',
+                 'Mar.95', 'Apr.95', 'May.95', 'Jun.95', 'Jul.95', 'Aug.95', 'Sep.95',
+                 'Oct.95', 'Nov.95', 'Dec.95', 'Jan.96', 'Feb.96', 'Mar.96', 'Apr.96',
+                 'May.96', 'Jun.96', 'Jul.96', 'Aug.96', 'Sep.96', 'Oct.96', 'Nov.96',
+                 'Dec.96', 'Jan.97', 'Feb.97', 'Mar.97', 'Apr.97', 'May.97', 'Jun.97',
+                 'Jul.97', 'Aug.97', 'Sep.97', 'Oct.97', 'Nov.97', 'Dec.97', 'Jan.98',
+                 'Feb.98', 'Mar.98', 'Apr.98', 'May.98', 'Jun.98', 'Jul.98', 'Aug.98',
+                 'Sep.98', 'Oct.98', 'Nov.98', 'Dec.98', 'Jan.99', 'Feb.99', 'Mar.99',
+                 'Apr.99', 'May.99', 'Jun.99']
+
+    # Method 1: use pandas to collect all unique values
+    time_states_df = df[time_list]
+    all_unique_states = set()
+
+    for col in time_list:
+        unique_vals = df[col].dropna().unique()  # Remove NaN values
+        all_unique_states.update(unique_vals)
+
+    # Convert to a sorted list
+    states = sorted(list(all_unique_states))
+    print("All unique states:")
+    for i, state in enumerate(states, 1):
+        print(f"{i:2d}. {state}")
+
+    print(f"\nstates list:")
+    print(f"states = {states}")
+
+    # Create a SequenceData object
+
+    # Define the time-span variable
+    time_list = ['Jul.93', 'Aug.93', 'Sep.93', 'Oct.93', 'Nov.93', 'Dec.93',
+                 'Jan.94', 'Feb.94', 'Mar.94', 'Apr.94', 'May.94', 'Jun.94', 'Jul.94',
+                 'Aug.94', 'Sep.94', 'Oct.94', 'Nov.94', 'Dec.94', 'Jan.95', 'Feb.95',
+                 'Mar.95', 'Apr.95', 'May.95', 'Jun.95', 'Jul.95', 'Aug.95', 'Sep.95',
+                 'Oct.95', 'Nov.95', 'Dec.95', 'Jan.96', 'Feb.96', 'Mar.96', 'Apr.96',
+                 'May.96', 'Jun.96', 'Jul.96', 'Aug.96', 'Sep.96', 'Oct.96', 'Nov.96',
+                 'Dec.96', 'Jan.97', 'Feb.97', 'Mar.97', 'Apr.97', 'May.97', 'Jun.97',
+                 'Jul.97', 'Aug.97', 'Sep.97', 'Oct.97', 'Nov.97', 'Dec.97', 'Jan.98',
+                 'Feb.98', 'Mar.98', 'Apr.98', 'May.98', 'Jun.98', 'Jul.98', 'Aug.98',
+                 'Sep.98', 'Oct.98', 'Nov.98', 'Dec.98', 'Jan.99', 'Feb.99', 'Mar.99',
+                 'Apr.99', 'May.99', 'Jun.99']
+
+    states = ['FE', 'HE', 'employment', 'joblessness', 'school', 'training']
+    labels = ['further education', 'higher education', 'employment', 'joblessness', 'school', 'training']
+
+    # TODO: add error handling: if a parameter does not exist, ask the user to pass the right ones
+    # sequence_data = SequenceData(df, time=time, time_type="year", id_col="country", ids=df['country'].values, states=states)
+
+    sequence_data = SequenceData(df,
+                                 time=time_list,
+                                 id_col="id",
+                                 states=states,
+                                 labels=labels,
+                                 )
+
+    om = get_distance_matrix(sequence_data,
+                             method="OM",
+                             sm="CONSTANT",
+                             indel=1)
+
+    cluster = Cluster(om, sequence_data.ids, clustering_method='ward_d')
+    cluster.plot_dendrogram(xlabel="Individuals", ylabel="Distance")
+
+    # Create a ClusterQuality object to evaluate clustering quality
+    cluster_quality = ClusterQuality(cluster)
+    cluster_quality.compute_cluster_quality_scores()
+    cluster_quality.plot_cqi_scores(norm='zscore')
+    summary_table = cluster_quality.get_cqi_table()
+    print(summary_table)
+
+    table = cluster_quality.get_cluster_range_table()
+    # table.to_csv("cluster_quality_table.csv")
+
+    print(table)
sequenzo/define_sequence_data.py
CHANGED
@@ -325,7 +325,23 @@ class SequenceData:
         if non_missing_states <= 20:
             non_missing_color_list = sns.color_palette("Spectral", non_missing_states)
         else:
-
+            # Use a more elegant color palette for many states - combination of viridis and pastel colors
+            if non_missing_states <= 40:
+                # Use viridis for up to 40 states (more colorful than cubehelix)
+                non_missing_color_list = sns.color_palette("viridis", non_missing_states)
+            else:
+                # For very large state counts, use a custom palette combining multiple schemes
+                viridis_colors = sns.color_palette("viridis", min(non_missing_states // 2, 20))
+                pastel_colors = sns.color_palette("Set3", min(non_missing_states // 2, 12))
+                tab20_colors = sns.color_palette("tab20", min(non_missing_states // 3, 20))
+
+                # Combine and extend the palette
+                combined_colors = viridis_colors + pastel_colors + tab20_colors
+                # If we need more colors, cycle through the combined palette
+                while len(combined_colors) < non_missing_states:
+                    combined_colors.extend(combined_colors[:min(len(combined_colors), non_missing_states - len(combined_colors))])
+
+                non_missing_color_list = combined_colors[:non_missing_states]
 
         if reverse_colors:
             non_missing_color_list = list(reversed(non_missing_color_list))
@@ -342,7 +358,23 @@ class SequenceData:
         if num_states <= 20:
             color_list = sns.color_palette("Spectral", num_states)
         else:
-
+            # Use a more elegant color palette for many states - combination of viridis and pastel colors
+            if num_states <= 40:
+                # Use viridis for up to 40 states (more colorful than cubehelix)
+                color_list = sns.color_palette("viridis", num_states)
+            else:
+                # For very large state counts, use a custom palette combining multiple schemes
+                viridis_colors = sns.color_palette("viridis", min(num_states // 2, 20))
+                pastel_colors = sns.color_palette("Set3", min(num_states // 2, 12))
+                tab20_colors = sns.color_palette("tab20", min(num_states // 3, 20))
+
+                # Combine and extend the palette
+                combined_colors = viridis_colors + pastel_colors + tab20_colors
+                # If we need more colors, cycle through the combined palette
+                while len(combined_colors) < num_states:
+                    combined_colors.extend(combined_colors[:min(len(combined_colors), num_states - len(combined_colors))])
+
+                color_list = combined_colors[:num_states]
 
         if reverse_colors:
             color_list = list(reversed(color_list))
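The while loop above keeps recycling the combined palette until it covers the requested number of states, then truncates it. A standalone sketch of that logic for an illustrative 60-state case (requires seaborn; the variable names simply mirror the diff and are otherwise arbitrary):

    import seaborn as sns

    num_states = 60

    viridis_colors = sns.color_palette("viridis", min(num_states // 2, 20))
    pastel_colors = sns.color_palette("Set3", min(num_states // 2, 12))
    tab20_colors = sns.color_palette("tab20", min(num_states // 3, 20))

    combined_colors = list(viridis_colors) + list(pastel_colors) + list(tab20_colors)

    # Cycle through the combined palette until there is one colour per state
    while len(combined_colors) < num_states:
        needed = num_states - len(combined_colors)
        combined_colors.extend(combined_colors[:min(len(combined_colors), needed)])

    color_list = combined_colors[:num_states]
    print(len(color_list))  # 60: one colour per state, repeats start after the 52 base colours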
Binary file
@@ -142,7 +142,7 @@ def get_substitution_cost_matrix(seqdata, method, cval=None, miss_cost=None, tim
     # ================================
     if method in ["INDELS", "INDELSLOG"]:
         if time_varying:
-            indels = get_cross_sectional_entropy(seqdata)['Frequencies']
+            indels = get_cross_sectional_entropy(seqdata, return_format="dict")['Frequencies']
         else:
             ww = seqdata.weights
             if ww is None:
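The time-varying INDELS/INDELSLOG branch now requests a dict from get_cross_sectional_entropy and reads its 'Frequencies' entry, i.e. the distribution of states at each time point. The sketch below only illustrates what such cross-sectional frequencies look like when computed with pandas; the column and state names are invented and the exact structure sequenzo returns may differ.

    import pandas as pd

    df = pd.DataFrame({
        "t1": ["school", "school", "employment", "school"],
        "t2": ["school", "employment", "employment", "training"],
    })

    # Share of each state within each time column (states absent at a time point get 0)
    frequencies = pd.DataFrame({
        col: df[col].value_counts(normalize=True) for col in df.columns
    }).fillna(0.0)

    print(frequencies)
    # e.g. 'school' has frequency 0.75 at t1 and 0.25 at t2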
@@ -4,6 +4,7 @@
 #include <cmath>
 #include <iostream>
 #include "utils.h"
+#include "dp_utils.h"
 #ifdef _OPENMP
 #include <omp.h>
 #endif
@@ -104,26 +105,11 @@ public:
 
     py::array_t<double> compute_all_distances() {
         try {
-
-
-
-
-
-            for (int i = 0; i < nseq; i++) {
-                for (int j = i; j < nseq; j++) {
-                    buffer(i, j) = compute_distance(i, j);
-                }
-            }
-        }
-
-        #pragma omp for schedule(static)
-        for (int i = 0; i < nseq; ++i) {
-            for (int j = i + 1; j < nseq; ++j) {
-                buffer(j, i) = buffer(i, j);
-            }
-        }
-
-        return dist_matrix;
+            return dp_utils::compute_all_distances_simple(
+                nseq,
+                dist_matrix,
+                [this](int i, int j){ return this->compute_distance(i, j); }
+            );
         } catch (const std::exception& e) {
             py::print("Error in compute_all_distances: ", e.what());
             throw;
@@ -132,23 +118,13 @@ public:
 
     py::array_t<double> compute_refseq_distances() {
         try {
-
-
-
-
-
-
-
-            if(is == rseq){
-                buffer(is, rseq-rseq1) = 0;
-            }else{
-                buffer(is, rseq-rseq1) = compute_distance(is, rseq);
-            }
-        }
-    }
-
-        return refdist_matrix;
+            return dp_utils::compute_refseq_distances_simple(
+                nseq,
+                rseq1,
+                rseq2,
+                refdist_matrix,
+                [this](int is, int rseq){ return this->compute_distance(is, rseq); }
+            );
         } catch (const std::exception& e) {
             py::print("Error in compute_all_distances: ", e.what());
             throw;
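These hunks replace the hand-written "fill the upper triangle, then mirror it" loops with helpers from the new dp_utils.h, whose actual signatures are not shown in this diff. A conceptual Python/NumPy sketch of the pattern the helper appears to centralise:

    import numpy as np

    def compute_all_distances_simple(nseq, compute_distance):
        dist = np.zeros((nseq, nseq))
        for i in range(nseq):
            for j in range(i, nseq):          # upper triangle, including the diagonal
                dist[i, j] = compute_distance(i, j)
        # Mirror into the lower triangle so callers get a full symmetric matrix
        dist += np.triu(dist, k=1).T
        return dist

    # Toy usage: "sequences" are integers, distance is the absolute difference
    print(compute_all_distances_simple(4, lambda i, j: abs(i - j)))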
@@ -3,6 +3,7 @@
 #include <vector>
 #include <iostream>
 #include "utils.h"
+#include "dp_utils.h"
 
 namespace py = pybind11;
 
@@ -71,26 +72,11 @@ public:
 
     py::array_t<double> compute_all_distances() {
         try {
-
-
-
-
-
-            for (int i = 0; i < nseq; i++) {
-                for (int j = i; j < nseq; j++) {
-                    buffer(i, j) = compute_distance(i, j);
-                }
-            }
-        }
-
-        #pragma omp for schedule(static)
-        for (int i = 0; i < nseq; ++i) {
-            for (int j = i + 1; j < nseq; ++j) {
-                buffer(j, i) = buffer(i, j);
-            }
-        }
-
-        return dist_matrix;
+            return dp_utils::compute_all_distances_simple(
+                nseq,
+                dist_matrix,
+                [this](int i, int j){ return this->compute_distance(i, j); }
+            );
         } catch (const std::exception& e) {
             py::print("Error in compute_all_distances: ", e.what());
             throw;
@@ -99,23 +85,13 @@ public:
 
     py::array_t<double> compute_refseq_distances() {
         try {
-
-
-
-
-
-
-
-            if(is == rseq){
-                buffer(is, rseq-rseq1) = 0;
-            }else{
-                buffer(is, rseq-rseq1) = compute_distance(is, rseq);
-            }
-        }
-    }
-
-        return refdist_matrix;
+            return dp_utils::compute_refseq_distances_simple(
+                nseq,
+                rseq1,
+                rseq2,
+                refdist_matrix,
+                [this](int is, int rseq){ return this->compute_distance(is, rseq); }
+            );
         } catch (const std::exception& e) {
             py::print("Error in compute_all_distances: ", e.what());
             throw;
@@ -5,6 +5,7 @@
 #include <cmath>
 #include <iostream>
 #include "utils.h"
+#include "dp_utils.h"
 #ifdef _OPENMP
 #include <omp.h>
 #endif
@@ -71,24 +72,6 @@ public:
         }
     }
 
-    // Aligned allocation helpers
-    #ifdef _WIN32
-    inline double* aligned_alloc_double(size_t size, size_t align=64) {
-        return reinterpret_cast<double*>(_aligned_malloc(size * sizeof(double), align));
-    }
-    inline void aligned_free_double(double* ptr) {
-        _aligned_free(ptr);
-    }
-    #else
-    inline double* aligned_alloc_double(size_t size, size_t align=64) {
-        void* ptr = nullptr;
-        if(posix_memalign(&ptr, align, size*sizeof(double)) != 0) throw std::bad_alloc();
-        return reinterpret_cast<double*>(ptr);
-    }
-    inline void aligned_free_double(double* ptr) { free(ptr); }
-    #endif
-
-
     double compute_distance(int is, int js, double* prev, double* curr) {
         try {
             auto ptr_len = seqlength.unchecked<1>();
@@ -198,34 +181,14 @@ public:
 
     py::array_t<double> compute_all_distances() {
         try {
-
-
-
-
-
-
-            double* curr = aligned_alloc_double(fmatsize);
-
-            #pragma omp for schedule(static)
-            for (int i = 0; i < nseq; i++) {
-                for (int j = i; j < nseq; j++) {
-                    buffer(i, j) = compute_distance(i, j, prev, curr);
-                }
-            }
-
-            aligned_free_double(prev);
-            aligned_free_double(curr);
-        }
-
-        // Symmetric fill
-        #pragma omp parallel for schedule(static)
-        for(int i = 0; i < nseq; i++) {
-            for(int j = i+1; j < nseq; j++) {
-                buffer(j, i) = buffer(i, j);
+            return dp_utils::compute_all_distances(
+                nseq,
+                fmatsize,
+                dist_matrix,
+                [this](int i, int j, double* prev, double* curr) {
+                    return this->compute_distance(i, j, prev, curr);
                 }
-
-
-        return dist_matrix;
+            );
         } catch (const std::exception& e) {
             py::print("Error in compute_all_distances: ", e.what());
             throw;
@@ -238,8 +201,8 @@ public:
 
         #pragma omp parallel
         {
-            double* prev = aligned_alloc_double(
-            double* curr = aligned_alloc_double(
+            double* prev = dp_utils::aligned_alloc_double(static_cast<size_t>(fmatsize));
+            double* curr = dp_utils::aligned_alloc_double(static_cast<size_t>(fmatsize));
 
             #pragma omp for schedule(static)
             for (int rseq = rseq1; rseq < rseq2; rseq ++) {
@@ -252,6 +215,8 @@ public:
                     buffer(is, rseq - rseq1) = cmpres;
                 }
             }
+            dp_utils::aligned_free_double(prev);
+            dp_utils::aligned_free_double(curr);
         }
 
         return refdist_matrix;
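This kernel passes per-thread prev/curr scratch rows into compute_distance and now allocates and frees them through dp_utils. That is the standard rolling two-row dynamic-programming buffer, which keeps memory at O(sequence length) instead of storing the full DP table. A hedged Python sketch of the same idea for plain unit-cost edit distance (not sequenzo's actual cost model):

    # Two-row (prev/curr) DP buffer, shown for Levenshtein distance with unit costs.
    def edit_distance_two_rows(a, b, indel=1.0, sub=1.0):
        prev = [j * indel for j in range(len(b) + 1)]  # row 0 of the DP table
        curr = [0.0] * (len(b) + 1)

        for i in range(1, len(a) + 1):
            curr[0] = i * indel
            for j in range(1, len(b) + 1):
                cost = 0.0 if a[i - 1] == b[j - 1] else sub
                curr[j] = min(prev[j] + indel,       # deletion
                              curr[j - 1] + indel,   # insertion
                              prev[j - 1] + cost)    # match / substitution
            prev, curr = curr, prev                  # reuse the two buffers

        return prev[len(b)]

    print(edit_distance_two_rows("school", "skill"))  # 4.0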