PyPI - sequenzo - Versions diffs - 0.1.31__cp310-cp310-macosx_10_9_x86_64.whl - Mend

sequenzo 0.1.31__cp310-cp310-macosx_10_9_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (299) hide show

sequenzo/dissimilarity_measures/get_distance_matrix.py ADDED Viewed

@@ -0,0 +1,762 @@
+"""
+@Author  : Xinyi Li 李欣怡
+@File    : get_distance_matrix.py
+@Time    : 2024/11/10 19:55
+@Desc    : Computes pairwise dissimilarities between sequences or dissimilarity from a reference sequence.
+            Several dissimilarity measures can be chosen,
+            including optimal matching (OM) and many of its variants, distance based on the count of common attributes,
+            and distances between state distributions within sequences.
+        :params
+            seqdata        : State sequence object of class stslist
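+            method         : String. The dissimilarity measure to use.
+                            It can be "OM", "OMloc", "OMslen",  "OMspell", "OMstran", "HAM", "DHD",
+                            "CHI2", "EUCLID", "LCS", "LCP", "RLCP", "LCPspell", "RLCPspell",
+                            "NMS", "NMSMST", "SVRspell", or "TWED".
+                            Note: get_distance_matrix() in this module currently accepts only "OM", "OMspell",
+                            "HAM", "DHD", "LCP", "RLCP", "LCPspell", and "RLCPspell"; any other value raises a ValueError.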
+            refseq         : Default: None (NULL in TraMineR). The baseline sequence to compute the distances from.
+                            (1)When an integer, the index of a sequence in seqdata or 0 for the most frequent sequence.
+                            (2)When a state sequence object, it must contain a single sequence and have the same alphabet as seqdata.
+                            (3)When a list, it must be a list of two sets of indexes of seqdata rows.
+                            Note: get_distance_matrix() in this module currently implements only form (3);
+                            any other non-None value raises a ValueError.
+            norm           : Default: "none". The normalization to use when method is one of
+                            {"OM", "OMloc", "OMslen", "OMspell", "OMstran", "TWED", "HAM", "DHD",
+                            "LCS", "LCP", "RLCP", "LCPspell", "RLCPspell", "CHI2", "EUCLID"}.
+                            (1)It can be "none", "auto", or,
+                                except for "CHI2" and "EUCLID", "maxlength", "gmean", "maxdist", or "YujianBo".
+                            (2)"auto" is equivalent to
+                                1) "maxlength" when method is one of "OM", "HAM", or "DHD",
+                                2)"gmean"  when method is one of "LCS", "LCP", "RLCP", "LCPspell", or "RLCPspell",
+                                3) YujianBo when method is one of "OMloc", "OMslen", "OMspell", "OMstran", "TWED".
+            indel          : Insertion/deletion  cost(s).
+                            Applies when method is one of "OM", "OMslen", "OMspell", or "OMstran".
+                            (1)The single state-independent insertion/deletion cost when a double.
+                            (2)The state-dependent insertion/deletion costs when a vector of doubles.
+                                The vector should contain an indel cost by state in the order of the alphabet.
+                            (3)When "auto", the indel is set as max(sm)/2 when sm is a matrix
+                                and is computed by means of seqcost when sm is a string specifying a cost method.
+            sm             : Substitution costs. Default: NULL.
+                            (1)The substitution-cost matrix when a matrix
+                                and method is one of "OM", "OMloc", "OMslen", "OMspell", "OMstran", "HAM", or "TWED".
+                            (2)The series of the substitution-cost matrices when an array and method = "DHD".
+                                They are grouped in a 3-dimensional array with the third index referring to the position in the sequence.
+                            (3)One of the strings "CONSTANT", "INDELS", "INDELSLOG", or "TRATE".
+                                Designates a seqcost method to build sm. "CONSTANT" is not relevant for "DHD".
+                            sm is mandatory when method is one of "OM", "OMloc", "OMslen", "OMspell", "OMstran", or "TWED".
+                            sm is autogenerated when method is one of "HAM" or "DHD" and sm = NULL.
+            full_matrix    : Default: True. When refseq is None, if True, the full distance matrix is returned
+                            (as a pandas DataFrame indexed by the sequence ids),
+                            if False, a condensed 1-D distance vector is returned (built with scipy's squareform),
+                            which is the counterpart of an object of class dist in R,
+                            that is, a vector containing only values from one triangle of the distance matrix.
+                            Condensed vectors are smaller and can be passed directly as arguments to most clustering functions.
+            tpow           : Default: 1.0.
+                            The exponential weight of spell length when method is one of "OMspell", "NMSMST", or "SVRspell".
+            expcost        : Default: 0.5. The cost of spell length transformation when method = "OMloc", "OMspell", "LCPspell", or "RLCPspell".
+                            It must be positive. The exact interpretation is distance-dependent.
+            weighted       : Default: TRUE. When method is "CHI2" or when sm is a string (method),
+                            should the distributions of the states account for the sequence weights in seqdata?
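+            check_max_size : Logical. Should the computation stop when the maximum allowed number of unique sequences is exceeded?
+                            Note: the corresponding check is currently commented out in get_distance_matrix(), so this argument has no effect.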
+"""
+import gc
+import time
+import warnings
+from scipy.spatial.distance import pdist, squareform
+import numpy as np
+import pandas as pd
+from sequenzo.define_sequence_data import SequenceData
+# Module-level flag intended to record whether the notice about the removed
+# 'with_missing' argument has already been shown.
+with_missing_warned = False
+def get_distance_matrix(seqdata=None, method=None, refseq=None, norm="none", indel="auto", sm=None, full_matrix=True,
+                        tpow=1.0, expcost=0.5, weighted=True, check_max_size=True, opts=None, **kwargs):
+    """
+    Compute dissimilarities between the sequences stored in ``seqdata``.
+
+    Distances are computed by the compiled ``c_code`` extension on the unique
+    sequences only and are then expanded back to all sequences.
+
+    :param seqdata: ``SequenceData`` instance (mandatory).
+    :param method: one of "OM", "OMspell", "HAM", "DHD", "LCP", "RLCP",
+        "LCPspell", "RLCPspell". The deprecated "OMopt" is mapped to "OM";
+        "LCSopt" is mapped to "LCS", which is then rejected as unsupported.
+    :param refseq: ``None`` for all pairwise distances, or a list of two sets
+        of 0-based row indexes for distances between the two subsets.
+    :param norm: "none", "auto", "maxlength", "gmean", "maxdist" or "YujianBo".
+        A bool is accepted as a deprecated value (True -> "auto", False -> "none").
+    :param indel: a number, a numeric vector (list / ndarray) with one cost per
+        state, or "auto".
+    :param sm: ``None``, a numpy array, or one of the strings "TRATE",
+        "CONSTANT", "INDELS", "INDELSLOG" (case-insensitive).
+    :param full_matrix: when ``refseq`` is None, truthy returns the full square
+        DataFrame, falsy returns the condensed vector from ``squareform``.
+    :param tpow: exponent applied to spell durations for the spell-based methods.
+    :param expcost: spell-length cost for "OMspell", "LCPspell", "RLCPspell";
+        negative values are rejected.
+    :param weighted: forwarded to ``get_substitution_cost_matrix`` when ``sm``
+        is generated.
+    :param check_max_size: currently unused (the size check is commented out).
+    :param opts: optional dict. When given, ``seqdata``, ``method``, ``refseq``
+        and ``sm`` are taken from it unconditionally; the other options fall
+        back to their defaults only when missing or None.
+    :param kwargs: only 'with_missing' is recognised; it is ignored with a notice.
+    :return: a pandas DataFrame (full matrix, or set-1 x set-2 matrix when
+        ``refseq`` is given) or a condensed 1-D distance vector.
+    :raises ValueError: on missing or invalid arguments.
+    """
+    # The deprecation flag lives at module level; without this declaration the
+    # assignment further down would only create a throw-away local variable.
+    global with_missing_warned
+    from .utils.seqconc import seqconc
+    from .utils.seqdss import seqdss
+    from .utils.seqdur import seqdur
+    from .utils.seqlength import seqlength
+    from . import get_substitution_cost_matrix
+    # Lazily import the c_code module to avoid circular dependencies during installation
+    from .__init__ import _import_c_code
+    c_code = _import_c_code()
+    gc.collect()                           # garbage collection
+    if opts is not None:
+        def _opt(key, default):
+            # Use the default only when the option is absent or None.
+            # `value or default` would silently discard legitimate falsy
+            # values such as full_matrix=False, weighted=False or expcost=0,
+            # and would raise on a numpy-array indel.
+            value = opts.get(key)
+            return default if value is None else value
+        seqdata = opts.get('seqdata')
+        method = opts.get('method')
+        refseq = opts.get('refseq')
+        norm = _opt('norm', "none")
+        indel = _opt('indel', "auto")
+        sm = opts.get('sm')
+        full_matrix = _opt('full_matrix', True)
+        tpow = _opt('tpow', 1.0)
+        expcost = _opt('expcost', 0.5)
+        weighted = _opt('weighted', True)
+        check_max_size = _opt('check_max_size', True)
+    if 'with_missing' in kwargs:
+        print("[!] 'with_missing' has been removed and is ignored.")
+        print("    Missing values are always included by default, consistent with TraMineR.")
+        with_missing_warned = True
+    # ======================================
+    # Check Arguments With Deprecated Values
+    # ======================================
+    # the version in 2017
+    # check method
+    deprecated_methods = ["OMopt", "LCSopt"]
+    if method in deprecated_methods:
+        print(f"[!] Warning: {method} is deprecated.\n")
+        if method == "OMopt":
+            method = "OM"
+            print(f"[!] 'method' is set to \"OM\" which is equivalent.")
+        elif method == "LCSopt":
+            method = "LCS"
+            print(f"[!] 'method' is set to \"LCS\" which is equivalent.")
+    # check norm
+    if isinstance(norm, bool):
+        norm = "auto" if norm else "none"
+        print("[!] Warning: 'norm' has a deprecated value, TRUE changed into 'auto', FALSE into 'none'.\n")
+    # ===========================================
+    # Check For Arguments That Need To Be Defined
+    # ===========================================
+    if seqdata is None:
+        raise ValueError("[!] The 'seqdata' parameter is missing.")
+    if method is None:
+        raise ValueError("[!] The 'method' parameter is missing.")
+    # ====================
+    # Check Argument Types
+    # ====================
+    if not isinstance(seqdata, SequenceData):
+        raise ValueError("[!] 'seqdata' must be a state sequence object created with SequenceData")
+    nseqs = seqdata.seqdata.shape[0]
+    nstates = len(seqdata.states)
+    seqs_dlens = np.unique(seqlength(seqdata))
+    # check method
+    om_methods = ["OM", "OMspell"]
+    methods = om_methods + ["HAM", "DHD", "LCP", "RLCP", "LCPspell", "RLCPspell"]
+    if method not in methods:
+        raise ValueError(f"[!] Invalid 'method': {method}. Expected one of {methods}")
+    # check refseq
+    if refseq is not None:
+        # if list of two sets of indexes, we will compute pairwise distances between the two sets
+        if isinstance(refseq, list) and len(refseq) > 1:
+            if len(refseq) > 2:
+                print("[!] Warning: Only first two elements of the 'refseq' list are used.\n")
+            for ref in refseq[:2]:
+                # numpy integers are accepted as well as plain ints
+                if any(not isinstance(x, (int, np.integer)) or x < 0 for x in ref):
+                    raise ValueError(
+                        "[x] When 'refseq' is a list, it must contain two sets of indexes with positive integer values.")
+                # Indexes are used as 0-based row positions below, so the
+                # largest valid index is nseqs - 1.
+                if max(ref, default=-1) >= nseqs:
+                    raise ValueError("[x] Some indexes in 'refseq' are out of range.")
+            refseq_type = "sets"
+        else:
+            raise ValueError("[!] Invalid 'refseq' value.")
+    else:
+        refseq_type = "none"
+    # check for empty sequences
+    sdur = seqdur(seqdata)
+    emptyseq = np.where(np.isnan(sdur[:, 0]))[0]
+    if len(emptyseq) > 0:
+        if method == "OMloc":
+            raise ValueError(f"[!] Error: empty sequences in method 'OMloc': {emptyseq}.")
+        else:
+            print(f"[!] Warning: empty sequences {emptyseq}.\n")
+    print(f"[>] Processing {nseqs} sequences with {nstates} unique states.")
+    # check norm
+    norms = ["auto", "none", "maxlength", "gmean", "maxdist", "YujianBo"]
+    if norm not in norms:
+        raise ValueError(f"[!] 'norm' should be in {norms}.")
+    # check indel
+    # indel_type: "number", "vector", "auto"
+    # must be after including missing values as an additional state (nstates)
+    # all but NMS, NMSMST, SVRspell
+    if isinstance(indel, (int, float, np.number)):
+        indel_type = "number"
+    elif isinstance(indel, (np.ndarray, list)):
+        # A plain list has no ``dtype`` attribute, so inspect it through an array view.
+        if not np.issubdtype(np.asarray(indel).dtype, np.number):
+            raise ValueError("[!] indel")
+        if len(indel) != nstates:
+            raise ValueError("[!] When a vector, 'indel' must contain a cost for each state.")
+        indel_type = "vector"
+    elif indel == "auto":
+        indel_type = "auto"
+    else:
+        raise ValueError("[!] indel")
+    # check sm
+    # Must be after sanity checks on 'indel'
+    # Add here new seqcost() method names
+    # sm.type:
+    #   "none" :
+    #   "matrix" : "OM", "OMloc", "OMslen", "OMspell", "OMstran", "HAM", "DHD" or "TWED".
+    #   "method" : "TRATE", "CONSTANT", "INDELS", "INDELSLOG"
+    sm_methods = ["TRATE", "CONSTANT", "INDELS", "INDELSLOG"]
+    if sm is not None:
+        # NOTE(review): a 3-D array is classified as "matrix" and a 1-D array as
+        # "array", while the module docstring describes the DHD input as a
+        # 3-dimensional array - confirm which layout the C++ code expects.
+        if isinstance(sm, np.ndarray) and (sm.ndim == 2 or sm.ndim == 3):
+            sm_type = "matrix"
+        elif isinstance(sm, np.ndarray) and sm.ndim == 1:
+            sm_type = "array"
+        elif isinstance(sm, str):
+            sm = sm.upper()
+            if sm not in sm_methods:
+                raise ValueError(f"[!] Invalid 'sm' value, must be one of {sm_methods}.")
+            if method == "OM" and (sm == "INDELSLOG" or sm == "INDELS"):
+                raise ValueError(f"[!] 'sm = \"{sm}\"' is not relevant for OM now, consider TRATE or CONSTANT instead.")
+            sm_type = "method"
+        else:
+            raise ValueError("[!] 'sm' must be of a valid type (matrix, array, method).")
+    else:
+        sm_type = "none"
+    # ===================================
+    # Check Arguments Not Yet Implemented
+    # ===================================
+    # norm: all but  SVRspell, NMS, NMSMST
+    if norm != "none" and method not in ["OM", "OMspell", "HAM", "DHD", "LCP", "RLCP", "LCPspell", "RLCPspell"]:
+        raise ValueError(f"[x] norm is not matched with {method}.")
+    # ===============================
+    # Check Method Specific Arguments
+    # ===============================
+    # 1. OMspell, LCPspell, RLCPspell
+    if method in ["OMspell"] and expcost < 0:
+        raise ValueError("[x] 'expcost' must be positive.")
+    if method in ["LCPspell", "RLCPspell"] and expcost < 0:
+        raise ValueError("[x] 'expcost' must be non-negative for LCPspell/RLCPspell (use 0 to ignore duration).")
+    # 2. DHD
+    if method == "DHD":
+        if sm_type == "method" and sm == "CONSTANT":
+            raise ValueError("[!] 'sm = \"CONSTANT\"' is not relevant for DHD, consider HAM instead.")
+    # 3. HAM, DHD
+    if method in ["HAM", "DHD"]:
+        if seqs_dlens.shape[0] > 1:
+            raise ValueError(f"[x] {method} is not defined for sequences of different length.")
+    # ==============
+    # Configure Norm
+    # ==============
+    if norm == "auto":
+        if method in ["OM", "HAM", "DHD"]:
+            norm = "maxlength"
+        elif method in ["LCP", "RLCP", "LCPspell", "RLCPspell"]:
+            norm = "gmean"
+        elif method in ["OMspell"]:
+            norm = "YujianBo"
+        else:
+            raise ValueError(f"[!] No known normalization method to select automatically for {method}.")
+    # ======================
+    # Configure sm and indel
+    # ======================
+    if indel_type == "auto" and sm_type == "matrix":
+        indel = np.max(sm) / 2
+        indel_type = "number"
+    # OM, OMspell, HAM, DHD
+    if method in om_methods + ["HAM", "DHD"]:
+        if sm_type == "matrix":
+            if method in om_methods + ["TWED", "HAM"]:
+                # TODO : checkcost()
+                # Add a NaN column at the beginning and a NaN row at the top
+                # This ensures that indexing starts from 1
+                nan_col = np.full((sm.shape[0], 1), np.nan)
+                sm = np.hstack([nan_col, sm])
+                nan_row = np.full((1, sm.shape[1]), np.nan)
+                sm = np.vstack([nan_row, sm])
+            else:
+                raise ValueError(f"[x] No known 'sm' check for {method}.")
+        elif sm_type == "array":
+            if method == "DHD":
+                # TODO : checkcost()
+                pass
+            else:
+                raise ValueError(f"[x] 'sm' as an array is not relevant for {method}.")
+        elif sm_type == "method":
+            tv = False
+            cost = None
+            if sm in ["INDELS", "INDELSLOG"]:
+                if method == "DHD":
+                    tv = True
+            elif sm == "TRATE":
+                if method == "OM":
+                    cost = 2
+                elif method == "HAM":
+                    cost = 2
+                elif method == "DHD":
+                    cost = 4
+                    tv = True
+            elif sm == "CONSTANT":
+                if method == "HAM":
+                    cost = 1
+                else:
+                    cost = 2
+            sm = get_substitution_cost_matrix(seqdata,
+                                              method=sm,
+                                              cval=cost,
+                                              miss_cost=cost,
+                                              time_varying=tv,
+                                              weighted=weighted)
+            if indel_type == "auto":
+                indel = sm['indel']
+                indel_type = "vector" if getElementsNumber(indel) > 1 else "number"
+                print(f"[>] generated an indel of type {indel_type}\n")
+            sm = sm['sm']
+            del cost, tv
+        else:
+            if method == "HAM":
+                print("[>] Creating a 'sm' with a single substitution cost of 1.\n")
+                sm = get_substitution_cost_matrix(seqdata,
+                                                  method="CONSTANT",
+                                                  cval=1,
+                                                  miss_cost=1)
+                if indel_type == "auto":
+                    indel = sm['indel']
+                    indel_type = "vector" if getElementsNumber(indel) > 1 else "number"
+                sm = sm['sm']
+            elif method == "DHD":
+                print("[>] Creating a 'sm' with the costs derived from the transition rates.\n")
+                sm = get_substitution_cost_matrix(seqdata,
+                                                  method="TRATE",
+                                                  cval=4, miss_cost=4, time_varying=True,
+                                                  weighted=weighted)
+                if indel_type == "auto":
+                    indel = sm['indel']
+                    indel_type = "vector" if getElementsNumber(indel) > 1 else "number"
+                sm = sm['sm']
+            else:
+                raise ValueError("[x] 'sm' is missing.")
+    elif method not in ["CHI2", "EUCLID", "LCP", "RLCP", "LCPspell", "RLCPspell", "NMS", "NMSMST", "SVRspell"]:
+        raise ValueError(f"[x] No known 'sm' preparation for {method}.")
+    # ===========================
+    # Pre-Process Data (Part 1/2)
+    # ===========================
+    seqdata_num = seqdata.values   # it's numpy
+    if refseq_type == "sets":
+        dseqs_num1 = np.unique(seqdata_num[refseq[0], :], axis=0)
+        nunique1 = len(dseqs_num1)
+        dseqs_num2 = np.unique(seqdata_num[refseq[1], :], axis=0)
+        nunique2 = len(dseqs_num2)
+        dseqs_num = np.vstack((dseqs_num1, dseqs_num2))
+    else:
+        dseqs_num = np.unique(seqdata_num, axis=0)
+    # Check that dseqs_num does not exceed the max allowed number
+    # if check_max_size:
+    #     max_allowed_seq = np.floor(np.sqrt(np.iinfo(np.int32).max)) if refseq_type == "none" else np.iinfo(np.int32).max - 1
+    #
+    #     if refseq_type == "sets":
+    #         if (np.sqrt(nunique1) * np.sqrt(nunique2)) > max_allowed_seq:
+    #             raise ValueError(f"[!] Number of {nunique1} and {nunique2} unique sequences too large for max allowed distances {max_allowed_seq}.")
+    #     else:
+    #         if len(dseqs_num) > max_allowed_seq:
+    #             raise ValueError(f"[!] {len(dseqs_num)} unique sequences exceeds max allowed of {max_allowed_seq}.")
+    # =========================
+    # Handle Reference Sequence
+    # =========================
+    # For every original sequence, record the row of its unique representative.
+    if refseq_type == "sets":
+        conc1 = seqconc(data=seqdata_num[refseq[0], :])
+        conc2 = seqconc(data=dseqs_num1)
+        # Find the position of each element in conc1 within conc2
+        index_map = {value: idx for idx, value in enumerate(conc2)}
+        seqdata_didxs1 = np.array([index_map[element] for element in conc1])
+        conc3 = seqconc(data=seqdata_num[refseq[1], :])
+        conc4 = seqconc(data=dseqs_num2)
+        # Find the position of each element in conc3 within conc4
+        index_map = {value: idx for idx, value in enumerate(conc4)}
+        seqdata_didxs2 = np.array([index_map[element] for element in conc3])
+    else:
+        seqdata_series = seqconc(data=seqdata_num)
+        dseqs_series = seqconc(data=dseqs_num)
+        index_map = {value: idx for idx, value in enumerate(dseqs_series)}
+        seqdata_didxs = np.array([index_map[element] for element in seqdata_series])
+    if refseq_type != "none":
+        if refseq_type == "sets":
+            if method in ["OMstran"]:
+                refseq_id = refseq
+            else:
+                refseq_id = [nunique1, nunique1 + nunique2]
+        else:
+            raise ValueError(f"[!] Unknown refseq type: {refseq_type}.")
+        if refseq_type == "sets":
+            print(f"[>] Pairwise measures between two subsets of sequences of sizes {len(refseq[0])} and {len(refseq[1])}")
+    # ==============================
+    # Compute Method-Specific Values
+    # ==============================
+    if method in ["OMspell"]:
+        # NOTE(review): a scalar indel is expanded to nstates + 1 entries while a
+        # user-supplied vector is validated against nstates - confirm which
+        # length OMspellDistance expects.
+        if indel_type == "number":
+            indellist = np.repeat(indel, nstates + 1)
+            indel_type = "vector"
+        elif indel_type == "vector":
+            indellist = indel
+        # Normalise once to float64 so the refseq and the all-pairs code paths
+        # hand the same kind of array to the C++ constructor.
+        indellist = np.asarray(indellist, dtype=np.float64)
+        indel = np.max(indellist)
+    # OM method: convert vector indel to scalar if needed
+    # OMdistance C++ code only accepts scalar indel, not state-dependent
+    # Following TraMineR's behavior: when indel.type == "vector", use max(indel)
+    # See TraMineR seqdist.R line 696: params[["indel"]] <- max(indel)
+    elif method == "OM" and indel_type == "vector":
+        # np.asarray covers ndarray, list and any other array-like vector
+        indel = float(np.max(np.asarray(indel)))
+        indel_type = "number"
+    # OMspell
+    # Redefined dseqs.num
+    if method in ["OMspell", "LCPspell", "RLCPspell", "NMSMST", "SVRspell"]:
+        dseqs_dur = seqdur(seqdata) ** tpow  # Do not use dseqs.num
+        # The position of the first occurrence of the deduplicated data (conc1) in the original data (conc2)
+        conc1 = seqconc(data=dseqs_num)
+        conc2 = seqconc(data=seqdata_num)
+        index_map = {value: idx for idx, value in enumerate(conc2)}
+        dseqs_oidxs = np.array([index_map[element] for element in conc1])
+        # Can't sort! Otherwise, the actual sequence compared will not be the expected sequence
+        # Get duration
+        c = 1 if method == "OMspell" else 0
+        dseqs_dur = dseqs_dur[dseqs_oidxs, :] - c
+        # Get DSS
+        seqdata_dss = seqdss(seqdata)
+        dseqs_num = seqdata_dss[dseqs_oidxs, :]
+        if method in ["OMspell", "LCPspell", "RLCPspell"]:
+            _seqlength = seqlength(dseqs_num)
+        if method == "LCPspell":
+            sign = 1
+        elif method == "RLCPspell":
+            sign = -1
+        del dseqs_oidxs
+        del c
+        del seqdata_dss
+    # HAM, DHD
+    elif method in ["HAM", "DHD"]:
+        if method == "HAM":
+            # sm_type = "array"  # Not used. Should be here if it changes.
+            sm = adaptSmForHAM(sm, nstates, seqdata.seqdata.shape[1])
+        # Maximum possible cost of the Hamming distance
+        max_cost = 0
+        for i in range(np.max(seqs_dlens)):  # seqs_dlens has here only one value
+            # nanmax: a user-supplied HAM matrix was padded with a NaN row and
+            # column above, which would turn a plain max into NaN.
+            max_cost += np.nanmax(sm[i, :, :])
+    # LCP
+    elif method == "LCP":
+        sign = 1
+    # RLCP
+    elif method == "RLCP":
+        sign = -1
+    # LCPspell (spell-based LCP, forward)
+    elif method == "LCPspell":
+        sign = 1
+    # RLCPspell (spell-based LCP, reverse)
+    elif method == "RLCPspell":
+        sign = -1
+    del index_map
+    del seqdata_num
+    # ===========================
+    # Pre-Process Data (part 2/2)
+    # ===========================
+    # Modified dseqs.num for OMspell
+    ndn = dseqs_num.shape[0]
+    incl_refseq = " (including refseq)" if refseq_type == "sequence" else ""
+    seq_or_spell = "spell sequences" if method in ["OMspell", "LCPspell", "RLCPspell"] else "sequences"
+    print(f"[>] Identified {ndn} unique {seq_or_spell}{incl_refseq}.")
+    del ndn
+    del seq_or_spell
+    # =================
+    # Compute Distances
+    # =================
+    # Numeric normalisation code handed to the C++ classes ("none" -> 0, ...)
+    norm_num = norms[1:].index(norm)
+    if isinstance(sm, pd.DataFrame):
+        sm = sm.values
+    lengths = seqlength(dseqs_num)
+    # C++ already guarantees that invalid values will not be accessed
+    warnings.filterwarnings("ignore", category=RuntimeWarning, message="invalid value encountered in cast")
+    if refseq_type != "none":
+        if len(refseq_id) == 1:
+            refseq_id = [refseq_id, refseq_id]
+        refseq_id = np.array(refseq_id, dtype=int)
+        if method == "OM":
+            om = c_code.OMdistance(dseqs_num,
+                                    sm,
+                                    indel,
+                                    norm_num,
+                                    lengths,
+                                    refseq_id)
+            dist_matrix = om.compute_refseq_distances()
+        elif method == "OMspell":
+            om = c_code.OMspellDistance(dseqs_num,
+                                         sm,
+                                         indel,
+                                         norm_num,
+                                         refseq_id,
+                                         expcost,
+                                         dseqs_dur,
+                                         indellist,
+                                         _seqlength)
+            dist_matrix = om.compute_refseq_distances()
+        elif method == "HAM" or method == "DHD":
+            DHD = c_code.DHDdistance(dseqs_num,
+                                      sm,
+                                      norm_num,
+                                      max_cost,
+                                      refseq_id)
+            dist_matrix = DHD.compute_refseq_distances()
+        elif method == "LCP" or method == "RLCP":
+            LCP = c_code.LCPdistance(dseqs_num,
+                                     norm_num,
+                                     sign,
+                                     refseq_id)
+            # NOTE(review): every other method calls compute_refseq_distances()
+            # in this branch - confirm compute_all_distances() is intended here.
+            dist_matrix = LCP.compute_all_distances()
+        elif method == "LCPspell" or method == "RLCPspell":
+            LCPspell = c_code.LCPspellDistance(dseqs_num,
+                                                dseqs_dur,
+                                                _seqlength,
+                                                norm_num,
+                                                sign,
+                                                refseq_id,
+                                                expcost)
+            dist_matrix = LCPspell.compute_refseq_distances()
+        # Expand from unique sequences back to the requested rows of both sets
+        dist_matrix = dist_matrix[seqdata_didxs1[:, None], seqdata_didxs2[None, :]]
+        dist_matrix = pd.DataFrame(dist_matrix, index=seqdata.ids[refseq[0]], columns=seqdata.ids[refseq[1]])
+    else:
+        refseq_id = np.array([-1, -1])
+        if method == "OM":
+            om = c_code.OMdistance(dseqs_num,
+                                    sm,
+                                    indel,
+                                    norm_num,
+                                    lengths,
+                                    refseq_id)
+            dist_matrix = om.compute_all_distances()
+        elif method == "OMspell":
+            om = c_code.OMspellDistance(dseqs_num,
+                                         sm,
+                                         indel,
+                                         norm_num,
+                                         refseq_id,
+                                         expcost,
+                                         dseqs_dur,
+                                         indellist,
+                                         _seqlength)
+            dist_matrix = om.compute_all_distances()
+        elif method == "HAM" or method == "DHD":
+            DHD = c_code.DHDdistance(dseqs_num,
+                                      sm,
+                                      norm_num,
+                                      max_cost,
+                                      refseq_id)
+            dist_matrix = DHD.compute_all_distances()
+        elif method == "LCP" or method == "RLCP":
+            LCP = c_code.LCPdistance(dseqs_num,
+                                     norm_num,
+                                     sign,
+                                     refseq_id)
+            dist_matrix = LCP.compute_all_distances()
+        elif method == "LCPspell" or method == "RLCPspell":
+            LCPspell = c_code.LCPspellDistance(dseqs_num,
+                                               dseqs_dur,
+                                               _seqlength,
+                                               norm_num,
+                                               sign,
+                                               refseq_id,
+                                               expcost)
+            dist_matrix = LCPspell.compute_all_distances()
+        # Expand from unique sequences back to all nseqs sequences
+        _matrix = c_code.dist2matrix(nseqs, seqdata_didxs, dist_matrix)
+        _dist2matrix = _matrix.padding_matrix()
+    if refseq is None:
+        # Always return the expanded result, never the raw unique-sequence matrix.
+        if full_matrix:
+            dist_matrix = pd.DataFrame(_dist2matrix, index=seqdata.ids, columns=seqdata.ids)
+        else:
+            dist_matrix = squareform(_dist2matrix)
+    elif not full_matrix:
+        print("[!] Sequenzo returned a full distance matrix because 'refseq' is not None. This is same as TraMineR.")
+    print("[>] Computed Successfully.")
+    return dist_matrix
+def adaptSmForHAM(sm, nstates, ncols):
+    """
+    Repeat a substitution-cost matrix once per sequence position.
+
+    :param sm: substitution-cost matrix (array-like).
+    :param nstates: accepted for interface compatibility; not used.
+    :param ncols: number of repetitions along the new leading axis.
+    :return: ``numpy.tile(sm, (ncols, 1, 1))``; for a 2-D ``sm`` of shape
+        (r, c) the result has shape (ncols, r, c).
+    """
+    repetitions = (ncols, 1, 1)
+    return np.tile(sm, repetitions)
+def getElementsNumber(x):
+    """
+    Return how many elements ``x`` holds.
+
+    :param x: a pandas DataFrame or Series, a numpy array, a list, a tuple,
+        or any other object (treated as a single scalar).
+    :return: the number of columns for a DataFrame, ``size`` for a numpy array
+        or a Series, ``len`` for a list or tuple, and 1 for anything else.
+    """
+    if isinstance(x, pd.DataFrame):
+        return x.shape[1]
+    # A Series previously fell through to the scalar branch and was counted
+    # as one element regardless of its length.
+    if isinstance(x, (np.ndarray, pd.Series)):
+        return x.size
+    if isinstance(x, (list, tuple)):
+        return len(x)
+    return 1
+# Ad-hoc manual run: builds a SequenceData object from a local CSV file,
+# computes an OMspell distance matrix with TRATE substitution costs and prints
+# the result together with the elapsed wall-clock time.
+# The commented-out sections below are earlier experiments on other datasets.
+if __name__ == '__main__':
+    # NOTE(review): the wildcard import may rebind names used below (e.g. time,
+    # pd, SequenceData, get_distance_matrix) to whatever the sequenzo package
+    # exports - confirm before replacing it with explicit imports.
+    from sequenzo import *
+    start_time = time.time()
+    # tracemalloc.start()
+    # df = pd.read_csv("D:/college/research/QiQi/sequenzo/files/sampled_data_sets/broad_data/sampled_30000_data.csv")
+    # df = pd.read_csv("D:/college/research/QiQi/sequenzo/files/orignal data/detailed_sequence_10_work_years_df.csv")
+    # ===============================
+    #             Sohee
+    # ===============================
+    # df = pd.read_csv('D:/college/research/QiQi/sequenzo/data_and_output/orignal data/sohee/sequence_data.csv')
+    # time_list = list(df.columns)[1:133]
+    # states = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]
+    # # states = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
+    # labels = ['FT+WC', 'FT+BC', 'PT+WC', 'PT+BC', 'U', 'OLF']
+    # sequence_data = SequenceData(df, time=time_list, states=states, labels=labels, id_col="PID")
+    # om = get_distance_matrix(sequence_data, method="OM", sm="TRATE", indel="auto")
+    # om.to_csv("D:/college/research/QiQi/sequenzo/files/sequenzo_Sohee_string_OM_TRATE.csv", index=True)
+    # ===============================
+    #             kass
+    # ===============================
+    # df = pd.read_csv('D:/college/research/QiQi/sequenzo/files/orignal data/kass/wide_civil_final_df.csv')
+    # time_list = list(df.columns)[1:]
+    # states = ['Extensive Warfare', 'Limited Violence', 'No Violence', 'Pervasive Warfare', 'Prolonged Warfare',
+    #           'Serious Violence', 'Serious Warfare', 'Sporadic Violence', 'Technological Warfare', 'Total Warfare']
+    # sequence_data = SequenceData(df, time=time_list, time_type="year", states=states, id_col="COUNTRY")
+    # om = get_distance_matrix(sequence_data, method="RLCP", sm="TRATE", indel="auto")
+    # ===============================
+    #             CO2
+    # ===============================
+    # Active experiment: the input path is hard-coded to a local drive.
+    df = pd.read_csv("D:/country_co2_emissions_missing.csv")
+    # Every column after the first is used as a time point.
+    _time = list(df.columns)[1:]
+    states = ['Very Low', 'Low', 'Middle', 'High', 'Very High']
+    sequence_data = SequenceData(df, time=_time, id_col="country", states=states)
+    om = get_distance_matrix(sequence_data, method="OMspell", sm="TRATE", indel="auto")
+    # ===============================
+    #            detailed
+    # ===============================
+    # df = pd.read_csv("D:/college/research/QiQi/sequenzo/data_and_output/sampled_data_sets/detailed_data/sampled_1000_data.csv")
+    # _time = list(df.columns)[4:]
+    # states = ['data', 'data & intensive math', 'hardware', 'research', 'software', 'software & hardware', 'support & test']
+    # sequence_data = SequenceData(df[['worker_id', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10']],
+    #                              time_type="age", time=_time, id_col="worker_id", states=states)
+    # # refseq = [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9], [142, 85]]
+    # om = get_distance_matrix(sequence_data, method="OM", sm="TRATE", indel="auto")
+    # ===============================
+    #             broad
+    # ===============================
+    # df = pd.read_csv("D:/college/research/QiQi/sequenzo/data_and_output/sampled_data_sets/broad_data/sampled_1000_data.csv")
+    # _time = list(df.columns)[4:]
+    # states = ['Non-computing', 'Non-technical computing', 'Technical computing']
+    # sequence_data = SequenceData(df[['worker_id', 'C1', 'C2', 'C3', 'C4', 'C5']],
+    #                              time_type="age", time=_time, id_col="worker_id", states=states)
+    # om = get_distance_matrix(sequence_data, method="DHD", sm="TRATE", indel="auto")
+    # refseq = [[0, 1, 2], [99, 100]]
+    # print(om)
+    # snapshot = tracemalloc.take_snapshot()
+    # top_stats = snapshot.statistics('lineno')
+    # for stat in top_stats[:10]:
+    #     print(stat)
+    print("================")
+    end_time = time.time()
+    # The measured interval includes reading the CSV and building SequenceData.
+    print(f"[>] Total time: {end_time - start_time:.2f} seconds")
+    print(om)