PyPI - sequenzo - Versions diffs - 0.1.31__cp310-cp310-macosx_10_9_x86_64.whl - Mend

sequenzo 0.1.31__cp310-cp310-macosx_10_9_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (299) hide show

sequenzo/multidomain/idcd.py ADDED Viewed

@@ -0,0 +1,139 @@
+"""
+@Author  : Yuqi Liang 梁彧祺
+@File    : idcd.py
+@Time    : 15/04/2025 16:38
+@Desc    :
+    IDCD strategy for multidomain sequence analysis in Python, with custom time, states, and labels.
+"""
+from typing import List, Dict
+import pandas as pd
+from sequenzo.define_sequence_data import SequenceData
def _generate_combined_sequence_from_csv(csv_paths: List[str],
                                         time_cols: List[str],
                                         id_col: str = "id") -> pd.DataFrame:
    """
    Load one CSV per domain and fuse them into a single multidomain sequence table.

    Every file is sorted by ``id_col`` and the domains are then combined row by row:
    for each time column the per-domain states are converted with ``str`` and joined
    with ``'+'`` (e.g. ``"1+a"``). Only combinations that actually occur in the data
    are produced.

    Parameters:
        csv_paths: List of file paths, each containing one domain's sequence data
        time_cols: Time columns to extract and align
        id_col: ID column to align on
    Returns:
        combined_df: DataFrame whose first column is ``id_col`` (sorted) followed by
        one column per entry of ``time_cols`` holding the combined states
    Raises:
        ValueError: If ``csv_paths`` is empty, a CSV cannot be read, a CSV is missing
            the ID column or any time column, or the domains do not contain exactly
            the same IDs (row-wise combination would otherwise pair up unrelated
            sequences).
    """
    if not csv_paths:
        raise ValueError("`csv_paths` is empty: at least one domain CSV is required.")

    domain_dfs = []
    for path in csv_paths:
        try:
            df = pd.read_csv(path)
        except Exception as e:
            # Keep the original cause attached for easier debugging.
            raise ValueError(f"Failed to read CSV at '{path}': {str(e)}") from e
        # Check if ID column exists
        if id_col not in df.columns:
            raise ValueError(
                f"Missing ID column '{id_col}' in file: {path}\n"
                f"Available columns: {list(df.columns)}"
            )
        # Check if all time columns exist
        missing_cols = [col for col in time_cols if col not in df.columns]
        if missing_cols:
            raise ValueError(
                f"Missing time columns {missing_cols} in file: {path}\n"
                f"Available columns: {list(df.columns)}"
            )
        # A stable sort keeps the relative order of rows that share an ID.
        domain_dfs.append(
            df.sort_values(by=id_col, kind="mergesort").reset_index(drop=True)
        )

    # The domains are combined by row position, so after sorting every file must
    # list exactly the same IDs; otherwise states of different units would be fused.
    reference_ids = domain_dfs[0][id_col].tolist()
    for path, df in zip(csv_paths[1:], domain_dfs[1:]):
        if df[id_col].tolist() != reference_ids:
            raise ValueError(
                f"IDs in column '{id_col}' of file: {path} do not match those of "
                f"file: {csv_paths[0]}\n"
                f"All domain CSVs must contain exactly the same set of IDs."
            )

    # Build each combined time column, then transpose back to one list per row.
    combined_columns = [
        ['+'.join(map(str, states))
         for states in zip(*(df[t].tolist() for df in domain_dfs))]
        for t in time_cols
    ]
    if combined_columns:
        combined_matrix = [list(row) for row in zip(*combined_columns)]
    else:
        # No time columns requested: keep one (empty) row per ID.
        combined_matrix = [[] for _ in reference_ids]

    combined_df = pd.DataFrame(combined_matrix, columns=time_cols)
    combined_df.insert(0, id_col, domain_dfs[0][id_col].values)
    return combined_df
def create_idcd_sequence_from_csvs(
    csv_paths: List[str],
    time_cols: List[str],
    id_col: str = "id",
    domain_state_labels: List[Dict] = None
) -> SequenceData:
    """
    Build an IDCD-style SequenceData object from several single-domain CSV files.

    The domains are fused into joint states ("a+b+..."), the observed joint states
    are tabulated (frequency and percentage, printed to stdout), and the combined
    table is wrapped in a SequenceData whose alphabet is exactly the set of observed
    joint states, ordered by decreasing frequency.

    Parameters:
    - csv_paths: List of paths to domain CSVs
    - time_cols: List of time column names to use
    - id_col: ID column name
    - domain_state_labels: Optional list with one dictionary per domain, mapping raw
      state values to labels. A token made only of digits is looked up as an int,
      any other token as a string; tokens without a usable mapping keep their raw text.
    Returns:
    - SequenceData object with expanded alphabet of observed joint states
    """
    combined_df = _generate_combined_sequence_from_csv(csv_paths, time_cols, id_col=id_col)

    # Tabulate only the joint states that actually occur.
    every_cell = combined_df[time_cols].values.ravel()
    state_counts = pd.Series(every_cell).value_counts()
    state_shares = state_counts / len(every_cell) * 100

    def _pretty(joint_state):
        """Translate one joint state into its human-readable label."""
        pieces = []
        for domain_pos, raw in enumerate(joint_state.split("+")):
            try:
                lookup = int(raw) if raw.isdigit() else raw
                piece = domain_state_labels[domain_pos].get(lookup, str(raw))
            except Exception:
                # No (usable) mapping for this domain: fall back to the raw token.
                piece = str(raw)
            pieces.append(piece)
        # Spaces around the separator make the joined label easier to read.
        return ' + '.join(pieces)

    if not domain_state_labels:
        pretty_labels = state_counts.index.tolist()
    else:
        pretty_labels = [_pretty(joint_state) for joint_state in state_counts.index]

    # Print frequency + proportion table
    summary = pd.DataFrame({
        "State": state_counts.index,
        "Label": pretty_labels,
        "Frequency": state_counts.values,
        "Proportion (%)": state_shares.round(2)
    })
    print("\n[IDCD] Observed Combined States Frequency Table:")
    print(summary.to_string(index=False))

    return SequenceData(
        data=combined_df,
        time=time_cols,
        states=state_counts.index.tolist(),
        labels=pretty_labels,
        id_col=id_col
    )

sequenzo/multidomain/linked_polyad.py ADDED Viewed

@@ -0,0 +1,292 @@
+"""
+@Author  : Yuqi Liang 梁彧祺
+@File    : linked_polyad.py
+@Time    : 28/04/2025 21:19
+@Desc    :
+    This module implements the full Python version of Tim Liao and Gilbert Ritschard's
+    seqpolyads function (R version 1.0.2, 29.12.20) for linked polyadic sequence analysis.
+    Provided functionalities:
+    1. Customizable pairwise weighting (pair_weights)
+    2. Support for role-specific weights (role_weights)
+    3. Support for weighted sampling (weights)
+    4. Randomization method selection: a=1 (resample sequences), a=2 (resample states)
+    5. Multi-core parallel processing (n_jobs)
+    6. Full reproducibility via random_seed control
+    7. Outputs include observed distances, randomized distances, U, V, V>0.95 dummy, and mean observed/random distances
+    All calculations faithfully replicate the logic and outputs of the original R implementation.
+    Note:
+    You may encounter the following error during execution, especially when running the script inside PyCharm:
+    Traceback (most recent call last):
+      File "/Applications/PyCharm.app/Contents/plugins/python/helpers/pydev/_pydevd_bundle/pydevd_comm.py", line 293, in _on_run
+        r = self.sock.recv(1024)
+    OSError: [Errno 9] Bad file descriptor
+    This error is related to PyCharm's debugger trying to manage communication sockets while multiprocessing or background progress bars (like tqdm) are active.
+    It does not affect the actual computation or results of the linked_polyad function. You can safely ignore it.
+    To suppress it or avoid seeing it:
+    Run the script outside the PyCharm debugger (e.g., from terminal or using “Run” instead of “Debug”).
+    Alternatively, disable progress bars or multiprocessing (e.g., set n_jobs=1 and disable=True in tqdm, if available in the function).
+"""
+import numpy as np
+import random
+from typing import List, Dict, Union, Tuple, Any
+from tqdm import tqdm
+from joblib import Parallel, delayed
+import pandas as pd
+from sequenzo.dissimilarity_measures import get_distance_matrix
+from sequenzo.define_sequence_data import SequenceData
import multiprocessing
import platform
import warnings

# Import-time side effect: silence every UserWarning for the whole process.
warnings.filterwarnings("ignore", category=UserWarning)

# Import-time side effect: force the "fork" start method everywhere except Windows.
# NOTE(review): the Python docs describe "fork" as unsafe on macOS, and forcing it here
# overrides whatever the host application configured -- confirm this is still wanted.
if platform.system() != "Windows":
    multiprocessing.set_start_method("fork", force=True)
def linked_polyadic_sequence_analysis(seqlist: List[SequenceData],
                                      a: int = 1,
                                      method: str = "OM",
                                      distance_parameters: dict = None,
                                      weights: np.ndarray = None,
                                      rand_weight_type: int = 1,
                                      role_weights: List[float] = None,
                                      pair_weights: np.ndarray = None,
                                      T: int = 1000,
                                      random_seed: int = 36963,
                                      replace: bool = True,
                                      n_jobs: int = 1,
                                      verbose: bool = True,
                                      return_df: bool = False,
                                      return_merged_seqdata: bool = False) -> Union[Dict,
                                                                                    pd.DataFrame,
                                                                                    Tuple[Dict, SequenceData],
                                                                                    Tuple[pd.DataFrame, SequenceData]
                                                                                ]:
    """
    Calculate U and V statistics for linked polyadic sequence data.

    The sequences of all roles are stacked into one SequenceData object, all pairwise
    dissimilarities are computed once, and the within-polyad distance of every observed
    polyad is compared with the distances of ``T`` randomly assembled polyads.

    :param seqlist: List of SequenceData objects (one per role, at least two), all with
        the same number of sequences and the same number of time steps. Row ``i`` of
        every object is treated as a member of polyad ``i``.
    :param a: Randomization type. 1 = resample sequences; 2 = resample states within sequences.
    :param method: Distance measure method ('HAM', 'OM', 'CHI2', etc.).
    :param distance_parameters: Dictionary of additional keyword arguments for distance calculation.
    :param weights: Sampling weights for sequences when generating random polyads
        (length = number of polyads). They are rescaled to sum to 1; default is uniform.
    :param rand_weight_type: Strategy for computing randomization weights (1 = uniform, 2 = sample-weight-based).
    :param role_weights: Role-specific importance weights for different sequence sources
        (only used when ``rand_weight_type=2``; default ``1/P`` per role).
    :param pair_weights: Pairwise weights for distance averaging, one per role pair in
        upper-triangle order (default: all ones).
    :param T: Number of randomizations performed.
    :param random_seed: Seed for random number generator to ensure reproducibility
        (randomization ``i`` uses ``random_seed + i``).
    :param replace: Whether to sample with replacement during randomization.
    :param n_jobs: Number of parallel workers for randomization; set to -1 to use all CPUs.
    :param verbose: Whether to display a progress bar during randomization.
    :param return_df: If True, return results as a pandas DataFrame (ObservedDist, U, V, V>0.95).
    :param return_merged_seqdata: If True, also return the merged SequenceData object used internally for distance computation.
    :return:
        - default (both flags False): dictionary containing
            - 'mean.dist': Mean observed and random distances
            - 'U': Array of U values (mean random distance - observed distance)
            - 'V': Array of V values (proportion observed < random)
            - 'V.95': Binary array where V > 0.95
            - 'observed.dist': Array of observed polyadic distances
            - 'random.dist': Array of randomized polyadic distances
        - If `return_df=True` and `return_merged_seqdata=False`: a pandas DataFrame with columns
          [ObservedDist, U, V, V>0.95], indexed by PolyadID (starting at 1)
        - If `return_df=False` and `return_merged_seqdata=True`: a tuple (result_dict, merged_seqdata)
        - If `return_df=True` and `return_merged_seqdata=True`: a tuple (result_df, merged_seqdata)
    :raises TypeError: If an element of ``seqlist`` is not a SequenceData object.
    :raises ValueError: If fewer than two roles are given, the roles differ in size,
        ``a`` is not 1 or 2, ``T`` is not positive, or ``weights`` is malformed.
    """
    if distance_parameters is None:
        distance_parameters = {}
    # Fail fast instead of discovering a bad option inside the (possibly parallel) workers.
    if a not in (1, 2):
        raise ValueError("Invalid randomization type 'a'. Should be 1 or 2.")
    if T < 1:
        raise ValueError("`T` must be a positive number of randomizations.")

    print("[Step 1] Validating sequence data inputs...")
    P = len(seqlist)
    if P < 2:
        raise ValueError("`seqlist` must contain at least two SequenceData objects (one per role).")
    # Explicit raises instead of `assert`, which is stripped when Python runs with -O.
    for position, sd in enumerate(seqlist):
        if not isinstance(sd, SequenceData):
            raise TypeError(
                f"seqlist[{position}] must be a SequenceData object, got {type(sd).__name__}."
            )
    n = seqlist[0].n_sequences
    seq_length = seqlist[0].n_steps
    for position, sd in enumerate(seqlist):
        if sd.n_sequences != n or sd.n_steps != seq_length:
            raise ValueError(
                f"seqlist[{position}] has {sd.n_sequences} sequences of length {sd.n_steps}; "
                f"expected {n} sequences of length {seq_length} as in seqlist[0]."
            )

    if role_weights is None:
        role_weights = [1.0 / P] * P
    role_weights = np.array(role_weights)
    if pair_weights is None:
        pair_weights = np.ones(P * (P - 1) // 2)

    if weights is None:
        weights = np.ones(n) / n
    else:
        weights = np.asarray(weights, dtype=float)
        if weights.shape != (n,):
            raise ValueError(f"`weights` must have length {n} (one weight per polyad).")
        total_weight = weights.sum()
        if np.any(weights < 0) or not total_weight > 0:
            raise ValueError("`weights` must be non-negative and sum to a positive value.")
        # rng.choice requires probabilities; the U/V weights below are scale-invariant.
        weights = weights / total_weight

    print(f"[Step 2] Constructing merged polyadic sequence data... (Total polyads: {n}, Roles per polyad: {P})")
    tagged_dfs = []
    for i, sd in enumerate(seqlist):
        df = sd.to_dataframe().copy()
        # Role-tagged IDs keep sequences from different roles distinguishable after stacking.
        df["__id__"] = [f"R{i}_{j}" for j in range(sd.n_sequences)]
        tagged_dfs.append(df)
    data_concat = pd.concat(tagged_dfs, axis=0).reset_index(drop=True)
    labels = seqlist[0].labels
    merged_seqdata = SequenceData(
        data=data_concat,
        time=seqlist[0].time,
        # NOTE(review): assumes to_dataframe() yields states coded 1..k -- confirm in SequenceData.
        states=list(range(1, len(seqlist[0].states) + 1)),
        labels=labels,
        id_col="__id__"
    )

    print("[Step 3] Computing all pairwise dissimilarities using method:", method)
    alldist = np.asarray(get_distance_matrix(merged_seqdata, method=method, **distance_parameters))
    print("         -> Dissimilarity matrix shape:", alldist.shape)

    # Row offset of each role inside the stacked data: role p occupies rows [n*p, n*(p+1)).
    cj = np.array([n * p for p in range(P)])

    def weighted_mean(mat):
        # Weighted average over the upper triangle, i.e. over each unordered role pair once.
        return np.average(mat[np.triu_indices(P, 1)], weights=pair_weights)

    # Loop-invariant for a=2: the stacked state table without the ID column.
    state_frame = merged_seqdata.to_dataframe().drop(columns="__id__") if a == 2 else None

    print(f"[Step 4] Starting {T} randomized polyad simulations (randomization type: a={a})...")

    def random_sample_once(i):
        """Draw one random polyad; return (its distance, the sampled member indices)."""
        local_rng = np.random.default_rng(random_seed + i)
        sampled = local_rng.choice(n, size=P, replace=replace, p=weights)
        sample_indices = cj + sampled
        if a == 1:
            mat = alldist[np.ix_(sample_indices, sample_indices)]
            return weighted_mean(mat), sampled
        # a == 2: resample the states inside each sampled sequence and recompute distances.
        sampled_df = state_frame.iloc[sample_indices].reset_index(drop=True)
        shuffled = sampled_df.apply(lambda row: local_rng.choice(row, size=seq_length, replace=replace),
                                    axis=1, result_type="broadcast")
        shuffled["__id__"] = [f"Rand_{i}_{j}" for j in range(len(shuffled))]
        seq_shuffled = SequenceData(
            data=shuffled,
            time=merged_seqdata.time,
            states=merged_seqdata.states,
            labels=merged_seqdata.labels,
            id_col="__id__"
        )
        dmat = np.asarray(get_distance_matrix(seq_shuffled, method=method, **distance_parameters))
        return weighted_mean(dmat), sampled

    iterator = tqdm(range(T), desc="-> Randomizing polyads") if verbose else range(T)
    # The sampled indices are *returned* by each task: writing them into a shared array
    # from inside the task is lost when joblib runs the tasks in worker processes.
    outcomes = Parallel(n_jobs=n_jobs)(delayed(random_sample_once)(i) for i in iterator)
    random_dists = np.array([dist for dist, _ in outcomes])
    l_m = np.array([sampled for _, sampled in outcomes], dtype=int).reshape(T, P)

    print("[Step 5] Computing observed distances for each polyad...")
    observed_dists = []
    for i in range(n):
        indices = cj + i
        mat = alldist[np.ix_(indices, indices)]
        observed_dists.append(weighted_mean(mat))
    observed_dists = np.array(observed_dists)

    print("[Step 6] Calculating U, V, and significance threshold V > 0.95...")
    if rand_weight_type == 2:
        # Weight of a random polyad: role-weighted share of its members' sampling weights.
        sampled_weights = weights[l_m]
        p_weights = sampled_weights.sum(axis=1)
        l_weights = (sampled_weights * role_weights / p_weights[:, None]).sum(axis=1)
    else:
        l_weights = np.ones(T)
    l_weights /= np.sum(l_weights)

    mean_rand_dist = np.sum(random_dists * l_weights)
    U = mean_rand_dist - observed_dists
    V = np.array([np.sum((observed < random_dists) * l_weights) for observed in observed_dists])
    V_95 = (V > 0.95).astype(int)

    print(
        f"[Step 7] Final summary: mean observed = {np.mean(observed_dists):.2f}, mean randomized = {mean_rand_dist:.2f}")
    print(f"         -> Significant polyads (V > 0.95): {np.sum(V_95)} / {n}")

    result = {
        "mean.dist": {"Obs": np.mean(observed_dists), "Rand": mean_rand_dist},
        "U": U,
        "V": V,
        "V.95": V_95,
        "observed.dist": observed_dists,
        "random.dist": random_dists
    }

    if return_df:
        result_df = pd.DataFrame({
            'ObservedDist': result['observed.dist'],
            'U': result['U'],
            'V': result['V'],
            'V>0.95': result['V.95']
        }, index=pd.RangeIndex(start=1, stop=len(result['U']) + 1, name="PolyadID"))
        if return_merged_seqdata:
            print("[Step 8] Returning result_df and merged_seqdata.")
            print("[Program Completed] Please continue to use result_df and merged_seqdata for further analysis.")
            return result_df, merged_seqdata
        print("[Step 8] Returning result_df only.")
        print("[Info] Program completed successfully.")
        print("[Info] For further analysis like clustering or sequence visualization,")
        print("       please re-run this function with `return_merged_seqdata=True`.")
        return result_df

    if return_merged_seqdata:
        print("[Step 8] Returning result_dict and merged_seqdata.")
        print("[Program Completed] Please continue to use result_dict and merged_seqdata for further analysis.")
        return result, merged_seqdata

    # Default call: hand back the documented dictionary instead of discarding the
    # finished computation with an exception.
    print("[Step 8] Returning result_dict only.")
    print("[Info] Program completed successfully.")
    print("[Info] You can now explore the result_dict for detailed statistics (U, V, observed distances, etc.).")
    print("[Info] For further analysis like clustering or sequence visualization,")
    print("       please re-run this function with `return_merged_seqdata=True`.")
    print("       Example:")
    print("           result, merged_seq = linked_polyadic_sequence_analysis(..., return_merged_seqdata=True)")
    return result


if __name__ == '__main__':
    pass

sequenzo/openmp_setup.py ADDED Viewed

@@ -0,0 +1,233 @@
+#!/usr/bin/env python3
+"""
+@Author  : Yuqi Liang 梁彧祺
+@File    : openmp_setup.py
+@Time    : 07/10/2025 10:42
+@Desc    :
+OpenMP Setup for Apple Silicon Macs
+This module provides automatic OpenMP dependency management for Apple Silicon Macs.
+It ensures that libomp is available for parallel computation without requiring
+manual user intervention.
+"""
+import sys
+import os
+import subprocess
+import platform
+import ctypes
+from pathlib import Path
def check_libomp_availability():
    """
    Report whether the libomp shared library can be loaded.

    The bare name ``libomp.dylib`` is tried first (default loader search); if that
    fails, the two usual Homebrew locations are tried, skipping paths that do not exist.

    Returns:
        bool: True if libomp could be loaded from any location, False otherwise
    """
    def _can_load(target):
        # ctypes raises OSError when the library is missing or cannot be loaded.
        try:
            ctypes.CDLL(target)
        except OSError:
            return False
        return True

    if _can_load('libomp.dylib'):
        return True

    candidate_paths = (
        '/opt/homebrew/lib/libomp.dylib',  # Apple Silicon
        '/usr/local/lib/libomp.dylib',     # Intel Mac
    )
    return any(os.path.exists(candidate) and _can_load(candidate)
               for candidate in candidate_paths)
def check_homebrew_available():
    """
    Report whether the ``brew`` command can be executed.

    Runs ``brew --version`` with its output discarded.

    Returns:
        bool: True if the command ran and exited successfully; False if the
        executable was not found or it exited with a non-zero status
    """
    probe = ['brew', '--version']
    try:
        subprocess.run(probe,
                       stdout=subprocess.DEVNULL,
                       stderr=subprocess.DEVNULL,
                       check=True)
    except (subprocess.CalledProcessError, FileNotFoundError):
        return False
    else:
        return True
def install_libomp_via_homebrew():
    """
    Install libomp via Homebrew (``brew install libomp``).

    The command's output is captured; when the installation fails, brew's error
    output is printed so the user can see why.

    NOTE(review): this installs software on the user's machine; callers should be
    sure the user expects that.

    Returns:
        bool: True if installation successful, False otherwise (brew exited with a
        non-zero status, could not be started, or any other error occurred)
    """
    try:
        print("🔧 Installing libomp via Homebrew...")
        subprocess.run(['brew', 'install', 'libomp'],
                       check=True,
                       stdout=subprocess.PIPE,
                       stderr=subprocess.PIPE,
                       text=True)
        print("[>] libomp installed successfully!")
        return True
    except subprocess.CalledProcessError as e:
        print(f"[>] libomp installation failed: {e}")
        # The exit status alone is rarely actionable; surface what brew reported.
        details = (e.stderr or "").strip()
        if details:
            print(f"[>] Homebrew output:\n{details}")
        return False
    except Exception as e:
        # Best-effort helper: e.g. brew missing from PATH must not crash the caller.
        print(f"[>] Error during installation: {e}")
        return False
def setup_openmp_environment():
    """
    Set up OpenMP environment variables for Apple Silicon.

    Prepends the Homebrew libomp directories to ``DYLD_LIBRARY_PATH``, ``LDFLAGS``
    and ``CPPFLAGS`` in ``os.environ``. The libomp-specific prefix
    (``brew --prefix libomp``) is preferred because Homebrew ships libomp keg-only,
    i.e. it is not linked into the global ``<prefix>/lib``; the global prefix is
    used only if the formula-specific query fails.

    Existing values are preserved, no dangling separator is left when a variable was
    previously unset, and an entry that is already present is not added twice.

    NOTE(review): changing DYLD_LIBRARY_PATH here presumably only affects child
    processes, not libraries loaded by the current interpreter -- confirm.

    Returns:
        bool: True if setup successful, False otherwise
    """
    def _brew_prefix(*formula):
        # Ask Homebrew for an installation prefix (global, or of the given formula).
        completed = subprocess.run(['brew', '--prefix', *formula],
                                   capture_output=True, text=True, check=True)
        return completed.stdout.strip()

    def _prepend(name, value, separator):
        # Put `value` in front of the variable unless it is already listed.
        current = os.environ.get(name, '')
        if not current:
            os.environ[name] = value
        elif value not in current.split(separator):
            os.environ[name] = f"{value}{separator}{current}"

    try:
        # Get Homebrew prefix (formula-specific first, see docstring)
        try:
            homebrew_prefix = _brew_prefix('libomp')
        except subprocess.CalledProcessError:
            homebrew_prefix = _brew_prefix()

        # Set environment variables
        lib_path = f"{homebrew_prefix}/lib"
        include_path = f"{homebrew_prefix}/include"
        _prepend('DYLD_LIBRARY_PATH', lib_path, ':')
        _prepend('LDFLAGS', f"-L{lib_path}", ' ')
        _prepend('CPPFLAGS', f"-I{include_path}", ' ')

        print(f"[>] OpenMP environment variables set")
        print(f"   - Library path: {lib_path}")
        print(f"   - Include path: {include_path}")
        return True
    except Exception as e:
        # Best-effort: brew missing or failing must not crash the caller.
        print(f"[>] Failed to set environment variables: {e}")
        return False
def ensure_openmp_support():
    """
    Make sure OpenMP (libomp) is usable on Apple Silicon Macs.

    On any other platform or architecture, and inside a Conda environment, nothing is
    done and True is returned. Otherwise the function checks, in order: whether libomp
    already loads, whether Homebrew exists, whether Homebrew already has libomp
    installed, and finally tries to install libomp through Homebrew. Progress and
    manual instructions are printed along the way.

    Returns:
        bool: True if OpenMP is considered available (or no setup is needed),
        False if Homebrew is missing or the automatic installation failed
    """
    # Nothing to do unless this is macOS on Apple Silicon.
    if sys.platform != 'darwin' or platform.machine() != 'arm64':
        return True

    # Conda manages its own OpenMP runtime; do not interfere.
    if os.environ.get('CONDA_DEFAULT_ENV'):
        print("[>] Detected Conda environment, skipping OpenMP auto-setup")
        return True

    print("[>] Detected Apple Silicon Mac, checking OpenMP support...")

    if check_libomp_availability():
        print("[>] OpenMP support is available")
        return True

    if not check_homebrew_available():
        print("""
[>] OpenMP Dependency Detection
On Apple Silicon Mac, Sequenzo requires OpenMP support for parallel computation.
Please run the following command to install OpenMP support:
    brew install libomp
If you don't have Homebrew installed, please visit https://brew.sh to install Homebrew first.
        """)
        return False

    # `brew list libomp` exits non-zero when the formula is not installed.
    try:
        subprocess.run(['brew', 'list', 'libomp'],
                       stdout=subprocess.DEVNULL,
                       stderr=subprocess.DEVNULL,
                       check=True)
    except subprocess.CalledProcessError:
        already_installed = False
    else:
        already_installed = True

    if already_installed:
        print("[>] libomp is already installed via Homebrew")
        setup_openmp_environment()
        return True

    # Not installed yet: attempt the automatic installation.
    if not install_libomp_via_homebrew():
        print("""
[>] Automatic OpenMP installation failed
Please manually run the following command:
    brew install libomp
After installation, please restart Python or re-import sequenzo.
        """)
        return False

    setup_openmp_environment()
    return True
def get_openmp_status():
    """
    Collect a snapshot of the OpenMP-related state of this machine.

    Returns:
        dict: with the keys 'platform', 'architecture', 'is_apple_silicon',
        'libomp_available', 'homebrew_available' and 'conda_environment'
    """
    current_platform = sys.platform
    architecture = platform.machine()
    on_apple_silicon = current_platform == 'darwin' and architecture == 'arm64'

    status = {}
    status['platform'] = current_platform
    status['architecture'] = architecture
    status['is_apple_silicon'] = on_apple_silicon
    status['libomp_available'] = check_libomp_availability()
    status['homebrew_available'] = check_homebrew_available()
    # Any non-empty CONDA_DEFAULT_ENV counts as "inside a Conda environment".
    status['conda_environment'] = bool(os.environ.get('CONDA_DEFAULT_ENV'))
    return status
if __name__ == "__main__":
    # Script entry point: run the setup, then print a short status report.
    if ensure_openmp_support():
        print("[>] OpenMP support is ready!")
    else:
        print("[>] OpenMP support unavailable, will use serial computation")

    report = get_openmp_status()
    print("\n[>] System Status:")
    report_rows = (
        ("Platform", 'platform'),
        ("Architecture", 'architecture'),
        ("Apple Silicon", 'is_apple_silicon'),
        ("libomp available", 'libomp_available'),
        ("Homebrew available", 'homebrew_available'),
        ("Conda environment", 'conda_environment'),
    )
    for caption, key in report_rows:
        print(f"   - {caption}: {report[key]}")