PyPI - cocoatree - Versions diffs - 0.1.0rc0.dev2__py3-none-any.whl - Mend

cocoatree 0.1.0rc0.dev2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (39) hide show

cocoatree/__init__.py +8 -0
cocoatree/__params.py +80 -0
cocoatree/_pipeline.py +144 -0
cocoatree/_scraper.py +23 -0
cocoatree/_version.py +1 -0
cocoatree/datasets/__init__.py +3 -0
cocoatree/datasets/_base.py +188 -0
cocoatree/datasets/data/DHFR/3QL0.pdb +3507 -0
cocoatree/datasets/data/DHFR/DHFR_sectors.npz +0 -0
cocoatree/datasets/data/DHFR/alignment.faa.gz +0 -0
cocoatree/datasets/data/S1A_serine_proteases/3tgi.pdb +2844 -0
cocoatree/datasets/data/S1A_serine_proteases/halabi_alignment.fasta +20580 -0
cocoatree/datasets/data/S1A_serine_proteases/halabi_metadata.csv +1471 -0
cocoatree/datasets/data/S1A_serine_proteases/halabi_sectors.npz +0 -0
cocoatree/datasets/data/S1A_serine_proteases/rivoire_alignment.fasta +19460 -0
cocoatree/datasets/data/S1A_serine_proteases/rivoire_metadata.csv +1391 -0
cocoatree/datasets/data/S1A_serine_proteases/rivoire_sectors.npz +0 -0
cocoatree/datasets/data/rhomboid_proteases/2NRF.pdb +3300 -0
cocoatree/datasets/data/rhomboid_proteases/Data_S1_Rhomboid_MSA_short_names.fasta +5534 -0
cocoatree/datasets/data/rhomboid_proteases/rhomboid_metadata_clean.csv +2766 -0
cocoatree/datasets/data/rhomboid_proteases/rhomboid_sectors.npz +0 -0
cocoatree/datasets/tests/test_datasets.py +14 -0
cocoatree/decomposition.py +263 -0
cocoatree/io.py +185 -0
cocoatree/msa.py +579 -0
cocoatree/pysca.py +238 -0
cocoatree/randomize.py +30 -0
cocoatree/scripts/cocoatree-sca.py +6 -0
cocoatree/statistics/__init__.py +58 -0
cocoatree/statistics/pairwise.py +318 -0
cocoatree/statistics/position.py +258 -0
cocoatree/tests/test_init.py +24 -0
cocoatree/tests/test_msa.py +14 -0
cocoatree/visualization.py +440 -0
cocoatree-0.1.0rc0.dev2.dist-info/METADATA +66 -0
cocoatree-0.1.0rc0.dev2.dist-info/RECORD +39 -0
cocoatree-0.1.0rc0.dev2.dist-info/WHEEL +5 -0
cocoatree-0.1.0rc0.dev2.dist-info/licenses/LICENSE +28 -0
cocoatree-0.1.0rc0.dev2.dist-info/top_level.txt +1 -0

cocoatree/datasets/data/rhomboid_proteases/rhomboid_sectors.npz ADDED Viewed

Binary file

cocoatree/datasets/tests/test_datasets.py ADDED Viewed

@@ -0,0 +1,14 @@
+from cocoatree.datasets import load_S1A_serine_proteases, \
+    load_rhomboid_proteases
def test_load_S1A_serine_proteases():
    """Check that the S1A serine protease loader exposes the expected keys."""
    loaded = load_S1A_serine_proteases()
    available_keys = loaded.keys()
    for expected_key in ("sequence_ids", "alignment"):
        assert expected_key in available_keys
def test_load_rhomboid_proteases():
    """Check that the rhomboid protease loader exposes the expected keys."""
    loaded = load_rhomboid_proteases()
    available_keys = loaded.keys()
    for expected_key in ("sequence_ids", "alignment"):
        assert expected_key in available_keys

cocoatree/decomposition.py ADDED Viewed

@@ -0,0 +1,263 @@
+import numpy as np
+from .__params import __freq_regularization_ref
+from cocoatree.randomize import _randomize_seqs_conserving_col_compo
+from cocoatree.msa import compute_seq_weights
+from cocoatree.statistics.pairwise import compute_sca_matrix
+from cocoatree.pysca import _compute_ica, _icList
def extract_independent_components(coevo_matrix, method=None,
                                   n_components=3, nrandom_pySCA=10,
                                   sequences=None,
                                   learnrate_ICA=0.1, nb_iterations_ICA=100000,
                                   freq_regul=__freq_regularization_ref,
                                   verbose_random_iter=True):
    """
    Extract independent components from a coevolution matrix

    The current method is fully applicable to SCA analysis. For other metrics,
    we set n_components = 3 (to improve)

    Parameters
    ----------
    coevo_matrix : np.ndarray
        coevolution matrix

    method : {None, "pySCA"}, default=None
        Method used to estimate the number of components to extract. With
        None, the value of `n_components` given by the user is used as is.
        The name is matched case-insensitively ("pysca" is accepted too).

    n_components : int, default=3
        Number of independent components to extract. Overwritten by the
        estimated value when method='pySCA'.

    nrandom_pySCA : int, default=10
        Number of MSA randomizations to perform if method='pySCA'

    sequences : list of sequences, optional, default: None
        when using pySCA's strategy to estimate the number of components,
        sequences needs to be provided.

    learnrate_ICA : float, default=0.1
        Learning rate / relaxation parameter forwarded to the ICA routine

    nb_iterations_ICA : int, default=100000
        Number of iterations forwarded to the ICA routine

    freq_regul : regularization parameter (default=__freq_regularization_ref)
        Only used if method='pySCA'

    verbose_random_iter : boolean, default=True
        Print the advance of the randomization procedure
        (only used if method='pySCA')

    Returns
    -------
    idpt_components : ndarray of shape (n_components, n_pos)
        corresponding to a list of independent components

    Raises
    ------
    ValueError
        If `method` is neither None nor 'pySCA', or if method='pySCA' is
        requested without providing `sequences`.
    """
    if method is not None:
        # The documentation historically advertised "pysca" while the code
        # only accepted "pySCA": accept any casing so that both work.
        if not (isinstance(method, str) and method.lower() == 'pysca'):
            raise ValueError(
                f"{method} is not a valid method. Options are None, 'pySCA'")
        if sequences is None:
            raise ValueError(
                "Sequences need to be provided to estimate "
                "the number of components automatically")
        n_components = _compute_n_components_as_pySCA(
            sequences, coevo_matrix,
            nrandom=nrandom_pySCA, freq_regul=freq_regul,
            verbose_random_iter=verbose_random_iter)

    # Only the left singular vectors are handed to the ICA routine.
    left_singular_vectors, _, _ = np.linalg.svd(coevo_matrix)
    Vica, _ = _compute_ica(left_singular_vectors, n_components,
                           learnrate=learnrate_ICA,
                           iterations=nb_iterations_ICA)

    # One independent component per row.
    idpt_components = Vica.T
    return idpt_components
def _compute_n_components_as_pySCA(sequences, coevo_matrix,
                                   seq_weights=None,
                                   nrandom=10,
                                   freq_regul=__freq_regularization_ref,
                                   verbose_random_iter=True):
    """
    Compute the number of independent components as in pySCA

    Given the singular values of the coevolution matrix, and the second
    singular value of each randomized matrix, return the number of
    significant eigenmodes as those above the average second value plus
    2 standard deviations.

    Based on S1 text of Rivoire et al. (2016)

    Rem: it concerns only SCA metrics
    For other metrics (MI, adding corrections) this should be adapted

    Parameters
    ----------
    sequences : list of sequences

    coevo_matrix : np.ndarray of shape (n_pos, n_pos)
        coevolution matrix

    seq_weights : np.array (nseq, ) of each sequence weight, optional
        Only its sum (the effective number of sequences) is used; when
        None, the effective number is obtained from compute_seq_weights.

    nrandom : int
        Number of randomizations performed

    freq_regul : regularization parameter (default=__freq_regularization_ref)

    verbose_random_iter : boolean, default=True
        Print the advance of the randomization procedure

    Returns
    -------
    n_components : int
        Number of independent components to select
    """
    if seq_weights is None:
        # Only the effective number of sequences is needed below.
        _, m_eff = compute_seq_weights(sequences)
    else:
        m_eff = np.sum(seq_weights)

    second_singular_values_random = []
    for irand in range(nrandom):
        if verbose_random_iter:
            # Adjacent literals instead of a backslash continuation: the
            # latter embedded the source indentation inside the message.
            print('%d/%d randomized msa (to compute number of '
                  'significant components) '
                  % (irand + 1, nrandom), end='\r')

        rand_sequences = _randomize_seqs_conserving_col_compo(sequences)
        seq_weights_rand, m_eff_rand = compute_seq_weights(rand_sequences)
        # Rescale the weights so that they sum to the m_eff of the input MSA
        seq_weights_rand = seq_weights_rand / m_eff_rand * m_eff

        SCA_rand = compute_sca_matrix(rand_sequences, seq_weights_rand,
                                      freq_regul=freq_regul)
        # Singular vectors are not needed: skip their computation.
        singular_values = np.linalg.svd(SCA_rand, compute_uv=False)
        second_singular_values_random.append(singular_values[1])

    threshold = (np.mean(second_singular_values_random)
                 + 2 * np.std(second_singular_values_random))

    input_singular_values = np.linalg.svd(coevo_matrix, compute_uv=False)
    n_components = int(np.count_nonzero(input_singular_values > threshold))
    return n_components
def extract_principal_components(coevo_matrix):
    """
    Perform principal component decomposition of a coevolution matrix

    Parameters
    ----------
    coevo_matrix : np.ndarray
        coevolution matrix

    Returns
    -------
    principal_components : np.ndarray (n_pos, n_pos)
        Third factor (``Vh``) of the singular value decomposition of the
        coevolution matrix, as returned by ``np.linalg.svd``
    """
    # np.linalg.svd returns (U, S, Vh); only Vh is exposed to the caller.
    svd_factors = np.linalg.svd(coevo_matrix)
    return svd_factors[2]
def extract_xcors_from_ICs(idpt_components, coevo_matrix):
    """
    Extract residue positions of XCoRs from independent components

    Parameters
    ----------
    idpt_components : independent components obtained from an ICA
        (one component per row, it is transposed before being handed to
        the pySCA-derived helper)

    coevo_matrix : coevolution matrix

    Returns
    -------
    xcors : lists of residue positions on the filtered MSA for each of the
        n_components xcor. An empty list is returned when the helper
        reports no group at all.
    """
    Vica = idpt_components.T
    _, xcor_sizes, sorted_pos, _, _, _ = _icList(
        Vica, len(idpt_components), coevo_matrix)

    # `sorted_pos` is the concatenation of all groups and `xcor_sizes` gives
    # the length of each one: walk through it with a running offset. Every
    # group (the first one included) is handled the same way, so an empty
    # `xcor_sizes` no longer fails on `xcor_sizes[0]`.
    xcors = []
    start = 0
    for size in xcor_sizes:
        stop = start + size
        xcors.append([sorted_pos[i] for i in range(start, stop)])
        start = stop

    return xcors
def extract_xcors(coevo_matrix, n_xcors=3):
    """
    Extract residue positions of XCoRs directly from the coevo_matrix

    Convenience wrapper chaining extract_independent_components (with its
    default settings and `n_components=n_xcors`) and extract_xcors_from_ICs.

    Parameters
    ----------
    coevo_matrix : coevolution matrix

    n_xcors : int
        Number of XCoRs to return

    Returns
    -------
    xcors : lists of residue positions on the filtered MSA for each of the
        n_xcors XCoR
    """
    # extracting independent components
    idpt_components = extract_independent_components(coevo_matrix,
                                                     n_components=n_xcors)

    # Grouping positions is exactly what extract_xcors_from_ICs does: reuse
    # it rather than keeping a second copy of the slicing logic here.
    return extract_xcors_from_ICs(idpt_components, coevo_matrix)
def remove_global_correlations(coevo_matrix):
    """
    Remove global correlations from a coevolution matrix

    The matrix is decomposed with ``np.linalg.svd``, its first singular
    value is set to 0, the matrix is rebuilt from the modified
    decomposition and negative entries of the result are replaced by 0.

    In the sector literature (and data analysis), this corresponds
    to removing global correlations (from e.g. phylogenetic effects)

    Parameters
    ----------
    coevo_matrix : np.ndarray (n_pos, n_pos),
        coevolution matrix

    Returns
    -------
    coevo_matrix_sub : np.ndarray (n_pos, n_pos),
        coevolution matrix without global correlations
    """
    left_vectors, singular_values, right_vectors_t = np.linalg.svd(
        coevo_matrix)

    # Drop the leading mode before rebuilding the matrix.
    singular_values[0] = 0
    factors = [left_vectors, np.diag(singular_values), right_vectors_t]
    rebuilt = np.linalg.multi_dot(factors)

    # Negative entries are clipped to zero.
    return np.maximum(rebuilt, 0)

cocoatree/io.py ADDED Viewed

@@ -0,0 +1,185 @@
+import warnings
+from Bio import AlignIO
+from Bio.PDB import PDBParser
+from .msa import _clean_msa
+from .__params import aatable
+from ete3 import Tree
+import numpy as np
def load_MSA(file_path, format="fasta", clean=True, verbose=False):
    """Read in a multiple sequence alignment (MSA)

    Parameters
    ----------
    file_path : path to the alignment file

    format : string {"fasta", "phylip", …}, optional, default: "fasta"
        format of the alignment file, forwarded to Bio.AlignIO.read.
        All format supported by biopython's Bio.AlignIO.read are accepted.

    clean : boolean, default=True
        whether to pass the alignment through ``_clean_msa`` (meant to
        remove ambiguous amino acids, e.g. B, X etc.)

    verbose : boolean, default=False
        whether to print the number of sequences and the alignment length

    Returns
    -------
    a dictionary containing:

        - `sequence_ids`: list of sequence identifiers
        - `alignment`: list of sequences as strings
    """
    msa = AlignIO.read(file_path, format)
    if clean:
        msa = _clean_msa(msa)

    # Collect identifiers and residues side by side, one record at a time.
    identifiers = []
    residues = []
    for entry in msa:
        identifiers.append(entry.id)
        residues.append(str(entry.seq))

    if verbose:
        print('Number of sequences: %i' % len(msa))
        print('Alignment of length: %i' % len(msa[0]))

    return {"sequence_ids": identifiers, "alignment": residues}
def load_tree_ete3(file_path):
    """
    Build an ``ete3.Tree`` object from a Newick file

    The Newick file must be of the form: `(A:1,(B:1,(C:1,D:1):0.5):0.5);`
    or `(A:1,(B:1,(C:1,D:1)95:0.5)98:0.5);` if branch support values are
    available.

    Parameters
    ----------
    file_path : path to the Newick file

    Returns
    -------
    tree_ete3 : ``ete3.Tree`` object, built with ``format=0``
    """
    return Tree(file_path, format=0)
def export_fasta(sequences, sequences_id, outpath):
    """
    Write sequences to a file in FASTA format

    Each record is written as a ``>identifier`` line followed by the
    sequence on a single line.

    Parameters
    ----------
    sequences : list of sequences as strings (as imported by load_MSA)

    sequences_id : list of sequences identifiers (as imported by load_MSA)

    outpath : path to the output file (overwritten if it already exists)
    """
    # The number of records is driven by `sequences`.
    n_records = len(sequences)
    with open(outpath, 'w') as handle:
        for idx in range(n_records):
            header_line = '>' + str(sequences_id[idx]) + '\n'
            handle.write(header_line)
            sequence_line = str(sequences[idx]) + '\n'
            handle.write(sequence_line)
def load_pdb(path2pdb, pdb_id, chain):
    """
    Read in a PDB file.

    Import a PDB file and extract the associated sequence along with the
    amino acid positions

    Parameters
    ----------
    path2pdb : path to the PDB file

    pdb_id : str,
        the id that will be used for the structure

    chain : str,
        name of the chain to read

    Returns
    -------
    pdb_seq : str,
        amino acid sequence of the PDB file; residues whose name is not
        found in ``aatable`` are reported on stdout and encoded as "X"

    pdb_pos : list,
        PDB position of each amino acid, as strings (residue number
        followed by the stripped third field of the residue id)
    """
    # Parser warnings are silenced while reading the structure.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        parser = PDBParser(PERMISSIVE=1)
        structure = parser.get_structure(pdb_id, path2pdb)

    # Only the first model is read; residues whose first id field is not
    # blank are skipped.
    residues = [res for res in structure[0][chain] if res.get_id()[0] == " "]

    pdb_pos = []
    one_letter_codes = []
    for res in residues:
        res_id = res.get_id()
        pdb_pos.append(str(res_id[1]) + str(res_id[2]).strip())
        try:
            one_letter_codes.append(aatable[res.get_resname()])
        except KeyError as e:
            # Unknown residue name: only the failed lookup is handled here,
            # so that KeyboardInterrupt / SystemExit are never swallowed.
            print("Error: " + str(e))
            one_letter_codes.append("X")

    pdb_seq = "".join(one_letter_codes)
    return pdb_seq, pdb_pos
def export_xcor_for_pymol(mapping, independent_components, axis,
                          xcor_pos_in_loaded_msa,
                          xcor_pos_in_filtered_msa,
                          outpath):
    """
    Export XCoR information for mapping on 3D structure in PyMOL.

    Save, in a single numpy array, the PDB positions of an XCoR's residues
    and their contribution to one independent component, for coloring in
    PyMOL.

    Parameters
    ----------
    mapping : numpy.ndarray,
        mapping between the unfiltered MSA and the PDB structure, output of
        cocoatree.msa.map_to_pdb() function. Row 2 is searched for the
        string form of each MSA position and row 1 provides the value
        that is exported.

    independent_components : numpy.ndarray,
        independent components, indexed as ``[position, axis]``

    axis : int,
        rank of the independent component associated with the desired XCoR

    xcor_pos_in_loaded_msa : list,
        positions of the XCoR's residues in the unfiltered MSA

    xcor_pos_in_filtered_msa : numpy.ndarray,
        positions of the XCoR's residues in the filtered MSA

    outpath : str,
        path to the output file as a binary in .npy format

    Returns
    -------
    None. A binary file in .npy format is written, containing an array with
    the positions of the XCoR's residues and an array with their
    contribution to the independent component.
    """
    # NOTE(review): components are indexed [position, axis] below, whereas
    # extract_independent_components documents a (n_components, n_pos)
    # output - confirm which orientation callers are expected to pass.
    positions_row = mapping[2]
    pdb_row = mapping[1]

    # First match of each MSA position in the mapping gives its PDB position.
    xcor_pdb_pos = [
        pdb_row[np.where(positions_row == str(msa_pos))[0][0]]
        for msa_pos in xcor_pos_in_loaded_msa
    ]
    ic_contributions = [
        independent_components[filtered_pos, axis]
        for filtered_pos in xcor_pos_in_filtered_msa
    ]

    np.save(outpath, np.array([xcor_pdb_pos, ic_contributions]))