PyPI - cocoatree - Versions diffs - 0.1.0rc0.dev2__py3-none-any.whl - Mend

cocoatree 0.1.0rc0.dev2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (39) hide show

cocoatree/__init__.py +8 -0
cocoatree/__params.py +80 -0
cocoatree/_pipeline.py +144 -0
cocoatree/_scraper.py +23 -0
cocoatree/_version.py +1 -0
cocoatree/datasets/__init__.py +3 -0
cocoatree/datasets/_base.py +188 -0
cocoatree/datasets/data/DHFR/3QL0.pdb +3507 -0
cocoatree/datasets/data/DHFR/DHFR_sectors.npz +0 -0
cocoatree/datasets/data/DHFR/alignment.faa.gz +0 -0
cocoatree/datasets/data/S1A_serine_proteases/3tgi.pdb +2844 -0
cocoatree/datasets/data/S1A_serine_proteases/halabi_alignment.fasta +20580 -0
cocoatree/datasets/data/S1A_serine_proteases/halabi_metadata.csv +1471 -0
cocoatree/datasets/data/S1A_serine_proteases/halabi_sectors.npz +0 -0
cocoatree/datasets/data/S1A_serine_proteases/rivoire_alignment.fasta +19460 -0
cocoatree/datasets/data/S1A_serine_proteases/rivoire_metadata.csv +1391 -0
cocoatree/datasets/data/S1A_serine_proteases/rivoire_sectors.npz +0 -0
cocoatree/datasets/data/rhomboid_proteases/2NRF.pdb +3300 -0
cocoatree/datasets/data/rhomboid_proteases/Data_S1_Rhomboid_MSA_short_names.fasta +5534 -0
cocoatree/datasets/data/rhomboid_proteases/rhomboid_metadata_clean.csv +2766 -0
cocoatree/datasets/data/rhomboid_proteases/rhomboid_sectors.npz +0 -0
cocoatree/datasets/tests/test_datasets.py +14 -0
cocoatree/decomposition.py +263 -0
cocoatree/io.py +185 -0
cocoatree/msa.py +579 -0
cocoatree/pysca.py +238 -0
cocoatree/randomize.py +30 -0
cocoatree/scripts/cocoatree-sca.py +6 -0
cocoatree/statistics/__init__.py +58 -0
cocoatree/statistics/pairwise.py +318 -0
cocoatree/statistics/position.py +258 -0
cocoatree/tests/test_init.py +24 -0
cocoatree/tests/test_msa.py +14 -0
cocoatree/visualization.py +440 -0
cocoatree-0.1.0rc0.dev2.dist-info/METADATA +66 -0
cocoatree-0.1.0rc0.dev2.dist-info/RECORD +39 -0
cocoatree-0.1.0rc0.dev2.dist-info/WHEEL +5 -0
cocoatree-0.1.0rc0.dev2.dist-info/licenses/LICENSE +28 -0
cocoatree-0.1.0rc0.dev2.dist-info/top_level.txt +1 -0

cocoatree/__init__.py ADDED Viewed

@@ -0,0 +1,8 @@
+from . import msa  # noqa: F401
+from . import datasets  # noqa: F401
+from . import statistics  # noqa: F401
+from . import io  # noqa: F401
+from . import decomposition  # noqa: F401
+from ._pipeline import perform_sca  # noqa: F401
+from ._version import __version__   # noqa: F401

cocoatree/__params.py ADDED Viewed

@@ -0,0 +1,80 @@
+# Parameters for COCOA-Tree
+import numpy as np
+# Reference value of the frequency regularization parameter.
+# In pySCA, the default value is 0.03 (the value used here);
+# in mean-field DCA, the default value is 0.5.
+__freq_regularization_ref = 0.03
+# Twenty values that sum to 1.0.
+# NOTE(review): presumably background amino-acid frequencies listed in the
+# same order as the non-gap entries of `lett2num` (A, C, D, ..., Y) --
+# confirm against the code that consumes `__freq0`.
+__freq0 = np.array(
+    [
+        0.073,
+        0.025,
+        0.050,
+        0.061,
+        0.042,
+        0.072,
+        0.023,
+        0.053,
+        0.064,
+        0.089,
+        0.023,
+        0.043,
+        0.052,
+        0.040,
+        0.052,
+        0.073,
+        0.056,
+        0.063,
+        0.013,
+        0.033
+    ]
+)
+# Mapping from one-letter symbol to integer code; the gap character '-' is
+# coded 0 and the 20 amino acids are coded 1..20 in alphabetical order of
+# their one-letter code.
+lett2num = {
+    '-': 0,
+    'A': 1,
+    'C': 2,
+    'D': 3,
+    'E': 4,
+    'F': 5,
+    'G': 6,
+    'H': 7,
+    'I': 8,
+    'K': 9,
+    'L': 10,
+    'M': 11,
+    'N': 12,
+    'P': 13,
+    'Q': 14,
+    'R': 15,
+    'S': 16,
+    'T': 17,
+    'V': 18,
+    'W': 19,
+    'Y': 20}
+# Number of symbols in `lett2num`: 21, i.e. the 20 amino acids plus the gap.
+__aa_count = len(lett2num)
+# Mapping from three-letter residue name (upper case) to one-letter code.
+aatable = {
+    "ALA": "A",
+    "ARG": "R",
+    "ASN": "N",
+    "ASP": "D",
+    "CYS": "C",
+    "GLN": "Q",
+    "GLU": "E",
+    "GLY": "G",
+    "HIS": "H",
+    "ILE": "I",
+    "LEU": "L",
+    "LYS": "K",
+    "MET": "M",
+    "PHE": "F",
+    "PRO": "P",
+    "SER": "S",
+    "THR": "T",
+    "TRP": "W",
+    "TYR": "Y",
+    "VAL": "V",
+}

cocoatree/_pipeline.py ADDED Viewed

@@ -0,0 +1,144 @@
+from . import msa
+from . import statistics
+from . import decomposition
+from . import __params
+import pandas as pd
+import numpy as np
+def perform_sca(sequences_id, sequences,
+                n_components=4,
+                freq_regul=__params.__freq_regularization_ref,
+                gap_threshold=0.4, seq_threshold=0.2,
+                coevolution_metric="SCA", correction=None):
+    """
+    Perform statistical coupling analysis (SCA)
+    Parameters
+    ----------
+    sequences : list of MSA sequences to filter
+    sequences_id : list of the MSA's sequence identifiers
+    n_components : int, default: 4
+    gap_threshold : float [0, 1], default: 0.4
+        max proportion of gaps tolerated
+    seq_threshold : maximum fraction of gaps per sequence (default 0.2)
+    coevolution_metric : str or callable, optional, default: 'SCA'
+        which coevolution metric to use:
+        - SCA: the coevolution matrix from Rivoire et al
+        - MI: the mutual information
+        - NMI: the normalized mutual information
+        - callable: a function that takes as arguments (1) sequences, (2)
+          `seq_weights`, and `freq_regul`
+    correction : {None, 'APC', 'entropy'}, default: None
+        which correction to use
+    Returns
+    -------
+    coevol_matrix : np.ndarray (n_filtered_pos, n_filtered_pos)
+        coevolution matrix
+    coevol_matrix_ngm : np.ndarray (n_filtered_pos, n_filtered_pos)
+        coevolution matrix without global mode (ngm = no global mode)
+    df : pd.DataFrame with the following columns
+        - original_msa_pos : the original MSA position
+        - filtered_msa_pos : the position in the filtered MSA
+        and  for each component:
+        - PCk: the projection of the residue onto the kth principal component
+        - ICk: the projeciton of the residue onto the kth independent
+          component
+        - xcor_k: wherether the residue is found to be part of xcor k
+    """
+    # Start by filtering sequences
+    seq_kept, seq_kept_id, pos_kept = msa.filter_sequences(
+        sequences, sequences_id, gap_threshold=gap_threshold,
+        seq_threshold=seq_threshold)
+    # Compute sequence weights. This is mostly to avoid recomputing it at
+    # several step in the pipeline and thus speed things up a bit
+    seq_weights, _ = msa.compute_seq_weights(seq_kept)
+    # Compute co-evolution matrix
+    if coevolution_metric == "SCA":
+        coevol_matrix = statistics.pairwise.compute_sca_matrix(
+            seq_kept,
+            seq_weights=seq_weights,
+            freq_regul=freq_regul)
+    elif coevolution_metric == "MI":
+        coevol_matrix = statistics.pairwise.compute_mutual_information_matrix(
+            seq_kept, seq_weights=seq_weights, freq_regul=freq_regul,
+            normalize=False)
+    elif coevolution_metric == "NMI":
+        coevol_matrix = statistics.pairwise.compute_mutual_information_matrix(
+            seq_kept, seq_weights=seq_weights, freq_regul=freq_regul)
+    elif callable(coevolution_metric):
+        coevol_matrix = coevolution_metric(
+            seq_kept, seq_weights=seq_weights,
+            freq_regul=freq_regul)
+    else:
+        raise ValueError(
+            "Unknown 'coevol_metric' value. User provided"
+            f"{coevolution_metric}. Options are 'SCA', 'MI', 'NMI'")
+    # Compute correction on coevolution matrix
+    if correction is not None:
+        if correction == "APC":
+            _, coevol_matrix = statistics.pairwise.compute_apc(coevol_matrix)
+        elif correction == "entropy":
+            entropy_aa = statistics.position.compute_conservation(
+                seq_kept,
+                seq_weights=seq_weights)
+            coevol_matrix = statistics.pairwise.compute_entropy_correction(
+                coevol_matrix, entropy_aa)
+        else:
+            raise ValueError(
+                "Unknown 'correction' value. User provided"
+                f"{correction}. Options are 'APC', 'entropy'")
+    # Now, compute deconvolution
+    principal_components = decomposition.extract_principal_components(
+        coevol_matrix)
+    independent_components = decomposition.extract_independent_components(
+        coevol_matrix, n_components=n_components)
+    xcors = decomposition.extract_xcors_from_ICs(
+        independent_components, coevol_matrix)
+    # Now, map everything into a nice pandas DataFrame
+    pos_mapping, _ = msa.map_msa_positions(len(sequences[0]), pos_kept)
+    df = pd.DataFrame(
+        {"original_msa_pos": np.arange(len(sequences[0]), dtype=int),
+         "filtered_msa_pos": pos_mapping.values()})
+    # make filtered_msa_pos stay integer with NaN support
+    df["filtered_msa_pos"] = df["filtered_msa_pos"].astype("Int64")
+    # Add PCA and ICA results
+    for k in range(n_components):
+        df.loc[~df["filtered_msa_pos"].isna(),
+               "PC%d" % (k+1)] = principal_components[k]
+        df.loc[~df["filtered_msa_pos"].isna(),
+               "IC%d" % (k+1)] = independent_components[k]
+        df["xcor_%d" % (k+1)] = np.isin(
+            df["filtered_msa_pos"], xcors[k])
+        df.loc[~df["filtered_msa_pos"].isna(),
+               "xcor_%d" % (k+1)] = np.isin(
+                   df.loc[~df["filtered_msa_pos"].isna(),
+                          "filtered_msa_pos"], xcors[k])
+    coevol_matrix_ngm = decomposition.remove_global_correlations(coevol_matrix)
+    return coevol_matrix, coevol_matrix_ngm, df

cocoatree/_scraper.py ADDED Viewed

@@ -0,0 +1,23 @@
+from glob import glob
+import shutil
+import os
+from sphinx_gallery.scrapers import figure_rst
+def png_scraper(block, block_vars, gallery_conf):
+    # Find all PNG files in the directory of this example.
+    path_current_example = os.path.dirname(block_vars['src_file'])
+    pngs = sorted(glob(os.path.join(path_current_example, '*.png')))
+    # Iterate through PNGs, copy them to the Sphinx-Gallery output directory
+    image_names = list()
+    image_path_iterator = block_vars['image_path_iterator']
+    seen = set()
+    for png in pngs:
+        if png not in seen:
+            seen |= set(png)
+            this_image_path = image_path_iterator.next()
+            image_names.append(this_image_path)
+            shutil.move(png, this_image_path)
+    # Use the `figure_rst` helper function to generate reST for image files
+    return figure_rst(image_names, gallery_conf['src_dir'])

cocoatree/_version.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ __version__ = "0.1.0.rc0.dev2"

cocoatree/datasets/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+from ._base import load_S1A_serine_proteases  # noqa: F401
+from ._base import load_rhomboid_proteases  # noqa: F401
+from ._base import load_DHFR  # noqa: F401

cocoatree/datasets/_base.py ADDED Viewed

@@ -0,0 +1,188 @@
+import os
+from ..io import load_MSA, load_pdb
+import numpy as np
+import pandas as pd
+import gzip
+def load_S1A_serine_proteases(paper='rivoire'):
+    """
+    Load the S1A serine protease dataset
+    Halabi dataset: 1470 sequences of length 832; 3 sectors identified
+    Rivoire dataset : 1390 sequences of length 832 (snake sequences were
+    removed for the paper's analysis); 6 sectors identified (including the
+    3 from Halabi et al, 2008)
+    Parameters
+    ----------
+    paper: str, either 'halabi' or 'rivoire'
+        whether to load the dataset from Halabi et al, Cell, 2008 or from
+        Rivoire et al, PLoS Comput Biol, 2016
+    Returns
+    -------
+    a dictionnary containing :
+        - `sequences_ids`: a list of strings corresponding to sequence names
+        - `alignment`: a list of strings corresponding to sequences. Because it
+          is an MSA, all the strings are of same length.
+        - `metadata`: a pandas dataframe containing the metadata associated
+          with the alignment.
+        - `sector_positions`: a dictionnary of arrays containing the residue
+          positions associated to each sector, either in Halabi et al, or in
+          Rivoire et al.
+        - `pdb_sequence`: sequence extracted from rat's trypsin PDB structure
+        - `pdb_positions`: positions extracted from rat's trypsin PDB structure
+    """
+    module_path = os.path.dirname(__file__)
+    if paper == 'halabi':
+        # Load the alignment used in Halabi et al, 2008
+        filename = os.path.join(
+            module_path,
+            "data/S1A_serine_proteases/halabi_alignment.fasta")
+        data = load_MSA(filename, format="fasta")
+        # Load the positions of the 3 sectors identified in Halabi et al, Cell,
+        # 2008
+        filename = os.path.join(
+            module_path,
+            "data/S1A_serine_proteases/halabi_sectors.npz")
+        sectors = np.load(filename)
+        # Load the metadata
+        filename = os.path.join(
+            module_path,
+            "data/S1A_serine_proteases/halabi_metadata.csv")
+        metadata = pd.read_csv(filename)
+    elif paper == 'rivoire':
+        # Load the alignment used in Rivoire et al, 2016
+        filename = os.path.join(
+            module_path,
+            "data/S1A_serine_proteases/rivoire_alignment.fasta")
+        data = load_MSA(filename, format="fasta")
+        # Load the positions of the 6 sectors identified in Rivoire et al, PLoS
+        # Comput Biol, 2016
+        filename = os.path.join(
+            module_path,
+            "data/S1A_serine_proteases/rivoire_sectors.npz")
+        sectors = np.load(filename)
+        # Load the metadata
+        filename = os.path.join(
+            module_path,
+            "data/S1A_serine_proteases/rivoire_metadata.csv")
+        metadata = pd.read_csv(filename)
+    else:
+        raise ValueError(f"invalid paper: {paper}. Options are 'halabi' or \
+                         'rivoire'")
+    # Load the PDB structure
+    filename = os.path.join(
+        module_path,
+        "data/S1A_serine_proteases/3tgi.pdb")
+    pdb_sequence, pdb_positions = load_pdb(filename, '3TGI', 'E')
+    data["sector_positions"] = sectors
+    data["metadata"] = metadata
+    data["pdb_sequence"] = pdb_sequence,
+    data["pdb_positions"] = pdb_positions
+    return data
+def load_rhomboid_proteases():
+    """
+    Load the rhomboid protease dataset
+    This dataset comes from Mihaljevic & Urban, Cell, 2020
+    (DOI: https://doi.org/10.1016/j.str.2020.07.015).
+    Returns
+    -------
+    a dictionnary containing :
+        - `sequence_ids`: a list of strings corresponding to sequence names
+        - `alignment`: a list of strings corresponding to sequences. Because it
+          is an MSA, all the strings are of same length.
+         - `sector_positions`: a dictionnary of arrays containing the residue
+          positions associated to each sector as published in the original
+          paper.
+        - `pdb_sequence`: sequence extracted from E. coli's PDB structure
+        - `pdb_positions`: positions extracted from E. coli's PDB structure
+    """
+    module_path = os.path.dirname(__file__)
+    filename = os.path.join(
+        module_path,
+        "data/rhomboid_proteases/Data_S1_Rhomboid_MSA_short_names.fasta")
+    data = load_MSA(filename, format="fasta")
+    filename = os.path.join(
+        module_path,
+        "data/rhomboid_proteases/rhomboid_sectors.npz")
+    sectors = np.load(filename)
+    # Load the metadata
+    filename = os.path.join(
+        module_path,
+        "data/rhomboid_proteases/rhomboid_metadata_clean.csv")
+    metadata = pd.read_csv(filename)
+    # Load the PDB structure
+    filename = os.path.join(
+        module_path,
+        "data/rhomboid_proteases/2NRF.pdb")
+    # Two chains: A or B
+    pdb_sequence, pdb_positions = load_pdb(filename, '2NRF', 'A')
+    data["sector_positions"] = sectors
+    data["metadata"] = metadata
+    data["pdb_sequence"] = pdb_sequence,
+    data["pdb_positions"] = pdb_positions
+    return data
+def load_DHFR():
+    """
+    load the DHFR dataset
+    This dataset comes from Kalmer et al, The Journal of Physical Chemistry B,
+    2024 (https://pubs.acs.org/doi/10.1021/acs.jpcb.4c04195)
+    Returns
+    -------
+    a dictionnary containing :
+        - `sequence_ids`: a list of strings corresponding to sequence names
+        - `alignment`: a list of strings corresponding to sequences. Because it
+          is an MSA, all the strings are of same length.
+         - `sector_positions`: a dictionnary of arrays containing the residue
+          positions associated to each sector as published in the original
+          paper.
+        - `pdb_sequence`: sequence extracted from E. coli's PDB structure
+        - `pdb_positions`: positions extracted from E. coli's PDB structure
+    """
+    module_path = os.path.dirname(__file__)
+    filename = os.path.join(
+        module_path,
+        "data/DHFR/alignment.faa.gz")
+    with gzip.open(filename, "rt") as f:
+        data = load_MSA(f, format="fasta")
+    filename = os.path.join(
+        module_path,
+        "data/DHFR/DHFR_sectors.npz")
+    sectors = np.load(filename)
+    # Load the PDB structure
+    filename = os.path.join(
+        module_path,
+        "data/DHFR/3QL0.pdb")
+    pdb_sequence, pdb_positions = load_pdb(filename, '3QL0', 'A')
+    data["sector_positions"] = sectors
+    data["pdb_sequence"] = pdb_sequence,
+    data["pdb_positions"] = pdb_positions
+    return data