cocoatree 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. cocoatree/__init__.py +8 -0
  2. cocoatree/__params.py +80 -0
  3. cocoatree/_pipeline.py +144 -0
  4. cocoatree/_scraper.py +23 -0
  5. cocoatree/_version.py +1 -0
  6. cocoatree/datasets/__init__.py +3 -0
  7. cocoatree/datasets/_base.py +188 -0
  8. cocoatree/datasets/data/DHFR/3QL0.pdb +3507 -0
  9. cocoatree/datasets/data/DHFR/DHFR_sectors.npz +0 -0
  10. cocoatree/datasets/data/DHFR/alignment.faa.gz +0 -0
  11. cocoatree/datasets/data/S1A_serine_proteases/3tgi.pdb +2844 -0
  12. cocoatree/datasets/data/S1A_serine_proteases/halabi_alignment.fasta +20580 -0
  13. cocoatree/datasets/data/S1A_serine_proteases/halabi_metadata.csv +1471 -0
  14. cocoatree/datasets/data/S1A_serine_proteases/halabi_sectors.npz +0 -0
  15. cocoatree/datasets/data/S1A_serine_proteases/rivoire_alignment.fasta +19460 -0
  16. cocoatree/datasets/data/S1A_serine_proteases/rivoire_metadata.csv +1391 -0
  17. cocoatree/datasets/data/S1A_serine_proteases/rivoire_sectors.npz +0 -0
  18. cocoatree/datasets/data/rhomboid_proteases/2NRF.pdb +3300 -0
  19. cocoatree/datasets/data/rhomboid_proteases/Data_S1_Rhomboid_MSA_short_names.fasta +5534 -0
  20. cocoatree/datasets/data/rhomboid_proteases/rhomboid_metadata_clean.csv +2766 -0
  21. cocoatree/datasets/data/rhomboid_proteases/rhomboid_sectors.npz +0 -0
  22. cocoatree/datasets/tests/test_datasets.py +14 -0
  23. cocoatree/decomposition.py +263 -0
  24. cocoatree/io.py +185 -0
  25. cocoatree/msa.py +579 -0
  26. cocoatree/pysca.py +238 -0
  27. cocoatree/randomize.py +30 -0
  28. cocoatree/scripts/cocoatree-sca.py +6 -0
  29. cocoatree/statistics/__init__.py +58 -0
  30. cocoatree/statistics/pairwise.py +318 -0
  31. cocoatree/statistics/position.py +258 -0
  32. cocoatree/tests/test_init.py +24 -0
  33. cocoatree/tests/test_msa.py +14 -0
  34. cocoatree/visualization.py +440 -0
  35. cocoatree-0.1.0.dist-info/METADATA +66 -0
  36. cocoatree-0.1.0.dist-info/RECORD +39 -0
  37. cocoatree-0.1.0.dist-info/WHEEL +5 -0
  38. cocoatree-0.1.0.dist-info/licenses/LICENSE +28 -0
  39. cocoatree-0.1.0.dist-info/top_level.txt +1 -0
cocoatree/pysca.py ADDED
@@ -0,0 +1,238 @@
1
+ import numpy as np
2
+ import sys
3
+ from scipy.stats import t, scoreatpercentile
4
+
5
+
6
+ def _basicICA(x, r0, Niter, tolerance=1e-15):
7
+ """
8
+ Basic ICA algorithm, based on work by Bell & Sejnowski (infomax). The input
9
+ data should preferentially be sphered, i.e., x.T.dot(x) = 1
10
+ Source: https://github.com/ranganathanlab/pySCA/
11
+
12
+ Parameters
13
+ ----------
14
+ x : LxM input matrix where L = # features and M = # samples
15
+
16
+ r : learning rate / relaxation parameter (e.g. r=.0001)
17
+
18
+ Niter : number of iterations (e.g. 1000)
19
+
20
+ Returns
21
+ -------
22
+ w : unmixing matrix
23
+
24
+ change: record of incremental changes during the iterations.
25
+
26
+ **Note** r and Niter should be adjusted to achieve convergence, which
27
+ should be assessed by visualizing 'change' with plot(range(iter), change)
28
+ **Example**::
29
+ [w, change] = basicICA(x, r, Niter)
30
+ """
31
+
32
+ [L, M] = x.shape
33
+ w = np.eye(L)
34
+ change = list()
35
+ r = r0 / M
36
+ with np.errstate(over="raise"):
37
+ try:
38
+ for _ in range(Niter):
39
+ w_old = np.copy(w)
40
+ u = w.dot(x)
41
+ w += r * (
42
+ M * np.eye(L) + (1.0 - 2.0 / (1.0 + np.exp(-u))).dot(u.T)
43
+ ).dot(w)
44
+ delta = (w - w_old).ravel()
45
+ val = delta.dot(delta.T)
46
+ change.append(val)
47
+ if np.isclose(val, 0, atol=tolerance):
48
+ break
49
+ if _ == Niter - 1:
50
+ print("basicICA failed to converge: " + str(val))
51
+ except FloatingPointError as e:
52
+ sys.exit("Error: basicICA " + str(e))
53
+ return [w, change]
54
+
55
+
56
def _compute_ica(V, kmax=6, learnrate=0.1, iterations=10000):
    """
    ICA rotation of eigenvectors (via _basicICA) with output normalization.

    Based on the Bell & Sejnowski infomax algorithm; the input data should
    preferentially be sphered, i.e., x.T.dot(x) = 1.
    Source: https://github.com/ranganathanlab/pySCA/

    Parameters
    ----------
    V : ndarray,
        eigenvectors obtained after matrix decomposition

    kmax : integer,
        number of independent components to retrieve

    learnrate : learning rate / relaxation parameter passed to _basicICA

    iterations : integer,
        number of iterations

    **Note** learnrate and iterations should be adjusted to achieve
    convergence, assessed by plotting the 'change' record of _basicICA.

    Returns
    -------
    Vica : ndarray,
        contributions along each independent component

    W : ndarray of shape (kmax, kmax),
        unmixing matrix

    **Example**::
      Vica, W = rotICA(V, kmax=6, learnrate=.0001, iterations=10000)
    """

    # Keep the first kmax eigenvectors; samples run along columns for ICA.
    components = V[:, :kmax].T
    W, _ = _basicICA(components, learnrate, iterations)
    Vica = W.dot(components).T
    # Normalize each IC to unit norm and orient it so that its
    # largest-magnitude entry is positive.
    for idx in range(kmax):
        column = Vica[:, idx]
        dominant = abs(column).argmax()
        Vica[:, idx] = (
            np.sign(Vica[dominant, idx]) * column / np.linalg.norm(column)
        )
    return Vica, W
103
+
104
+
105
class Unit:
    """
    Container for a named group of items (sectors, sequence families, etc.)

    Attributes
    ----------

    name : string describing the unit (ex: 'firmicutes')
    items : set of member items (ex: indices for all firmicutes
       sequence in an alignment)
    col : color code associated to the unit (for plotting)
    vect : an additional vector describing the member items (ex: a list
       of sequence weights)
    """

    def __init__(self):
        # Start from an empty unit; the caller fills the attributes in.
        self.items = set()
        self.name = ""
        self.vect = 0
        self.col = 0
125
+
126
+
127
def _icList(Vica, n_component, Cij, p_cut=0.95):
    """
    Produces a list of positions contributing to each independent component
    (IC) above a defined statistical cutoff (p_cut, the cutoff on the CDF of
    the t-distribution fit to the histogram of each IC). Any position above the
    cutoff on more than one IC are assigned to one IC based on which group of
    positions to which it shows a higher degree of coevolution. Additionally
    returns the numeric value of the cutoff for each IC, and the pdf fit, which
    can be used for plotting/evaluation.

    Parameters
    ----------
    Vica : ndarray,
        independent components

    n_component : int,
        number of independent components chosen

    Cij : numpy.ndarray,
        coevolution matrix

    p_cut : float,
        cutoff on the CDF of the t-distribution fit to the histogram of each
        IC

    Returns
    -------
    selected_res : list of cocoatree.decomposition.Unit,
        positions of the selected residues for each independent component.
        Beware that if the alignment used for the analysis has been filtered,
        those are the positions on the filtered alignment and not on the
        original alignment, a mapping of the positions may be needed.

    ic_size : list,
        number of selected residues for each component.

    sorted_pos : list,
        positions of the residues sorted by decreasing contribution for each
        component.

    cutoff : list,
        numeric value of the cutoff for each component.

    scaled_pdf : list of np.ndarrays,
        scaled probability distribution function for each component.

    all_fits : list,
        t-distribution fits for each component.

    **Example**::
      selected_res, ic_size, sorted_pos, cutoff, scaled_pdf, all_fits = \
            icList(Vica, n_component, Cij, p_cut=0.95)
    """

    # do the PDF/CDF fit, and assign cutoffs
    Npos = len(Vica)
    cutoff = list()
    scaled_pdf = list()
    all_fits = list()
    for k in range(n_component):
        # Fit a t-distribution to the contributions of component k;
        # pd = (df, loc, scale).
        pd = t.fit(Vica[:, k])
        all_fits.append(pd)
        # Freedman-Diaconis-style bin width from the interquartile range.
        iqr = scoreatpercentile(Vica[:, k], 75) - scoreatpercentile(
            Vica[:, k], 25
        )
        binwidth = 2 * iqr * (len(Vica[:, k]) ** (-0.33))
        nbins = round((max(Vica[:, k]) - min(Vica[:, k])) / binwidth)
        h_params = np.histogram(Vica[:, k], int(nbins))
        # Evaluate the fitted pdf/cdf on a 100-point grid spanning the
        # histogram support.
        x_dist = np.linspace(min(h_params[1]), max(h_params[1]), num=100)
        # Scale the pdf so that its area matches the histogram's area.
        area_hist = Npos * (h_params[1][2] - h_params[1][1])
        scaled_pdf.append(area_hist * (t.pdf(x_dist, pd[0], pd[1], pd[2])))
        cd = t.cdf(x_dist, pd[0], pd[1], pd[2])
        # tmp: grid index of the distribution mode.
        tmp = scaled_pdf[k].argmax()
        # Choose the heavier tail of the component (positive or negative).
        if abs(max(Vica[:, k])) > abs(min(Vica[:, k])):
            tail = cd[tmp: len(cd)]
        else:
            cd = 1 - cd
            tail = cd[0:tmp]
        # Grid point where the tail CDF is closest to p_cut.
        diff = abs(tail - p_cut)
        x_pos = diff.argmin()
        # NOTE(review): 'x_pos + tmp' indexes from the mode; for the
        # negative-tail branch (tail = cd[0:tmp]) this offset looks
        # suspicious — matches upstream pySCA, kept as-is. TODO confirm.
        cutoff.append(x_dist[x_pos + tmp])

    # select the positions with significant contributions to each IC
    ic_init = list()
    for k in range(n_component):
        ic_init.append([i for i in range(Npos) if Vica[i, k] > cutoff[k]])

    # construct the sorted, non-redundant iclist
    sorted_pos = list()
    ic_size = list()
    selected_res = list()
    icpos_tmp = list()
    # Copy of Cij with a zeroed diagonal so self-coevolution does not
    # bias the assignment below.
    Cij_nodiag = Cij.copy()
    for i in range(Npos):
        Cij_nodiag[i, i] = 0
    for k in range(n_component):
        icpos_tmp = list(ic_init[k])
        # A position selected by several ICs is kept only in the IC whose
        # members it coevolves with most strongly (Frobenius norm of its
        # couplings to each candidate group).
        for kprime in [kp for kp in range(n_component) if kp != k]:
            tmp = [v for v in icpos_tmp if v in ic_init[kprime]]
            for i in tmp:
                remsec = np.linalg.norm(
                    Cij_nodiag[i, ic_init[k]]
                ) < np.linalg.norm(Cij_nodiag[i, ic_init[kprime]])
                if remsec:
                    icpos_tmp.remove(i)
        # Sort by decreasing contribution to component k.
        sorted_pos += sorted(icpos_tmp, key=lambda i: -Vica[i, k])
        ic_size.append(len(icpos_tmp))
        s = Unit()
        s.items = sorted(icpos_tmp, key=lambda i: -Vica[i, k])
        # Evenly spaced color code in [0, 1) for plotting.
        s.col = k / n_component
        s.vect = -Vica[s.items, k]
        selected_res.append(s)
    return selected_res, ic_size, sorted_pos, cutoff, scaled_pdf, all_fits
cocoatree/randomize.py ADDED
@@ -0,0 +1,30 @@
1
+ """Module to perform randomization of alignments"""
2
+
3
+ import numpy as np
4
+
5
+
6
+ def _randomize_seqs_conserving_col_compo(sequences=[], seed=None):
7
+ """
8
+ Randomize the list of sequenecs (MSA) so that the content of each
9
+ column is overall conserved (conservation of aa frequencies)
10
+
11
+ Parameters
12
+ ----------
13
+ sequences : list of sequences (MSA)
14
+
15
+ seed : int
16
+ to generate exact same list of random numbers
17
+ (mostly for testing )
18
+
19
+ Returns
20
+ -------
21
+ rand_seqs : list of sequences where the columns have been shuffled
22
+ """
23
+
24
+ seq_array = np.array([list(seq) for seq in sequences])
25
+ T = seq_array.T
26
+ rng = np.random.default_rng(seed)
27
+ rand_seq_array = np.array([rng.permutation(T[i]) for i in range(len(T))]).T
28
+ rand_seqs = [''.join(seq) for seq in rand_seq_array]
29
+
30
+ return rand_seqs
@@ -0,0 +1,6 @@
1
+ import cocoatree
2
+ import argparse
3
+
4
+
5
+ def main():
6
+
@@ -0,0 +1,58 @@
1
+ from . import position
2
+ from . import pairwise
3
+ from .. import msa
4
+ from ..__params import __freq_regularization_ref
5
+
6
+
7
def compute_all_frequencies(sequences,
                            seq_weights=None,
                            freq_regul=__freq_regularization_ref):
    """
    Compute first-order, background, and joint amino-acid frequencies.


    Parameters
    ----------
    sequences : list of sequences

    seq_weights : {None, np.ndarray (n_seq)}
        if None, will re-compute the sequence weights.

    freq_regul : regularization parameter (default=__freq_regularization_ref)

    Returns
    -------
    aa_freqs : np.ndarray (nseq, 21)
        A (nseq, 21) ndarray containing the amino acid frequencies at each
        positions.

    bkgd_freqs : np.ndarray (21, )
        A (21,) np.array containing the background amino acid frequencies
        at each position; it is computed from the mean frequency of amino acid
        a in all proteins in the NCBI non-redundant database
        (see Rivoire et al., https://dx.plos.org/10.1371/journal.pcbi.1004817)

    aa_joint_freqs : np.ndarray (nseq, nseq, 21, 21)
        An ndarray containing the pairwise joint frequencies of amino acids
        for each pair of positions in the list of provided sequences.
    """
    # Fall back to computed weights when the caller supplies none.
    if seq_weights is None:
        seq_weights, _ = msa.compute_seq_weights(sequences)

    # Per-position amino-acid frequencies, regularized by freq_regul.
    aa_freqs = position._compute_aa_freqs(
        sequences,
        freq_regul=freq_regul,
        seq_weights=seq_weights)

    # NOTE(review): this call passes the module-level reference
    # regularization instead of the user-supplied freq_regul — confirm
    # whether that is intentional.
    bkgd_freqs = position._compute_background_freqs(
        aa_freqs,
        sequences,
        seq_weights=seq_weights,
        freq_regul=__freq_regularization_ref)

    # Pairwise joint frequencies for every pair of positions.
    aa_joint_freqs = pairwise._compute_aa_joint_freqs(
        sequences,
        seq_weights=seq_weights,
        freq_regul=freq_regul)

    return aa_freqs, bkgd_freqs, aa_joint_freqs
@@ -0,0 +1,318 @@
1
+ import numpy as np
2
+ from ..__params import lett2num, __freq_regularization_ref, __aa_count
3
+ from ..msa import compute_seq_weights
4
+ from .position import _compute_first_order_freqs
5
+
6
+
7
+ def _compute_aa_joint_freqs(sequences, seq_weights=None,
8
+ freq_regul=__freq_regularization_ref):
9
+ """Computes the joint frequencies of each pair of amino acids in a MSA
10
+
11
+ .. math::
12
+
13
+ f_{ij}^{ab} = (\\sum_s w_s x_{si}^a x_{sj}^b +
14
+ \\lambda/(21)^2)/(M_{eff} + \\lambda)
15
+
16
+ where
17
+
18
+ .. math::
19
+
20
+ M_{eff} = \\sum_s w_s
21
+
22
+ represents the effective number of sequences in the alignment and *lambda*
23
+ is a regularization parameter (pseudocount).
24
+
25
+ Parameters
26
+ ----------
27
+ sequences : list of sequences as imported by load_MSA()
28
+
29
+ seq_weights : numpy 1D array, optional
30
+ Gives more or less importance to certain sequences. If
31
+ seq_weights=None, all sequences are attributed an equal weighti
32
+ of 1.
33
+
34
+ freq_regul : regularization parameter (default=__freq_regularization_ref)
35
+
36
+ Returns
37
+ -------
38
+ aa_joint_freqs : np.ndarray of shape (Npos, Npos, aa_count, aa_count)
39
+ joint frequency of amino acids `a` and `b`
40
+ at respective positions `i` and `j`
41
+ """
42
+
43
+ # Convert sequences to binary format
44
+ tmp = np.array([[char for char in row] for row in sequences])
45
+ binary_array = np.array([tmp == aa for aa in lett2num.keys()]).astype(int)
46
+
47
+ # Adding weights
48
+ if seq_weights is None:
49
+ seq_weights = np.ones(len(sequences))
50
+ weighted_binary_array = binary_array * \
51
+ seq_weights[np.newaxis, :, np.newaxis]
52
+ # number of effective sequences
53
+ m_eff = np.sum(seq_weights)
54
+
55
+ # Joint frequencies
56
+ aa_joint_freqs = np.tensordot(weighted_binary_array, binary_array,
57
+ axes=([1], [1])).transpose(1, 3, 0, 2)
58
+ aa_joint_freqs = (aa_joint_freqs + freq_regul * m_eff / __aa_count ** 2)\
59
+ / ((1 + freq_regul) * m_eff)
60
+ return aa_joint_freqs
61
+
62
+
63
+ def _compute_aa_product_freqs(aa_freqs_1, aa_freqs_2):
64
+ """Computes the product of frequencies
65
+
66
+ (joint frequencies if residues are independent)
67
+
68
+ Parameters
69
+ ----------
70
+ aa_freqs_1 : frequency of amino acid *a* at position *i* (set 1)
71
+
72
+ aa_freqs_2 : frequency of amino acid *a* at position *i* (set 2)
73
+
74
+ Returns
75
+ -------
76
+ aa_prod_freqs : np.ndarray of shape (Npos, Npos, aa_count, aa_count)
77
+ product of frequency of amino acids *a* and $b$
78
+ at respective positions *i* and *j*
79
+ """
80
+
81
+ aa_product_freqs = np.multiply.outer(aa_freqs_1, aa_freqs_2)
82
+ aa_product_freqs = np.moveaxis(aa_product_freqs,
83
+ [0, 1, 2, 3],
84
+ [0, 2, 1, 3])
85
+
86
+ return aa_product_freqs
87
+
88
+
89
def _compute_second_order_freqs(sequences, seq_weights=None,
                                freq_regul=__freq_regularization_ref):
    """
    Computes joint frequencies and the product of frequencies

    Parameters
    ----------
    sequences : list of sequences

    seq_weights : np.ndarray
        weight values for each sequence of the alignment

    freq_regul : regularization parameter (default=__freq_regularization_ref)

    Returns
    -------
    aa_joint_freqs : np.ndarray of shape (Npos, Npos, aa_count, aa_count)
        joint frequency of amino acids `a` and `b` at respective positions
        `i` and `j`

    aa_product_freqs : np.ndarray of shape (Npos, Npos, aa_count, aa_count)
        product of frequency of amino acids `a` and `b` at respective
        positions `i` and `j`
    """

    # Observed joint frequencies for every pair of positions.
    joint = _compute_aa_joint_freqs(sequences,
                                    seq_weights=seq_weights,
                                    freq_regul=freq_regul)

    # Per-position frequencies (background frequencies discarded here).
    single, _ = _compute_first_order_freqs(sequences,
                                           seq_weights=seq_weights,
                                           freq_regul=freq_regul)

    # Expected joint frequencies under independence.
    independent = _compute_aa_product_freqs(single, single)

    return joint, independent
126
+
127
+
128
def compute_sca_matrix(sequences, seq_weights=None, raw_correlation=False,
                       freq_regul=__freq_regularization_ref):
    """Compute the SCA coevolution matrix

    .. math::

        C_{ij}^{ab} = f_{ij}^{ab} - f_i^a f_j^b

    .. math::

        \\tilde{C_{ij}} = \\sqrt{\\sum_{a,b} (\\tilde{C}_{ij}^{ab})^2}

    Parameters
    ----------
    sequences : list of sequences

    seq_weights : ndarray (nseq), optional, default: None
        if None, will compute sequence weights

    raw_correlation : boolean, optional, default: False
        whether to return raw correlations

    freq_regul : regularization parameter (default=__freq_regularization_ref)

    Returns
    -------
    SCA_matrix : SCA coevolution matrix
    """

    # computing frequencies
    if seq_weights is None:
        seq_weights, _ = compute_seq_weights(sequences)
    aa_joint_freqs, aa_product_freqs = _compute_second_order_freqs(
        sequences, seq_weights=seq_weights, freq_regul=freq_regul)

    # Cijab: covariance tensor (joint minus independent expectation)
    Cijab = aa_joint_freqs - aa_product_freqs

    if not raw_correlation:

        # derivative of relative entropy — the SCA positional weighting
        # phi_i^a of Rivoire et al.; bkgd_freqs broadcast over positions
        aa_freqs, bkgd_freqs = _compute_first_order_freqs(
            sequences, seq_weights=seq_weights, freq_regul=freq_regul)
        # transpose to (aa, pos) so the background can be broadcast,
        # then back to (pos, aa) after the log
        aa_freqs = aa_freqs.transpose([1, 0])
        phi = np.log(
            aa_freqs * (1 - bkgd_freqs[:, np.newaxis]) / (
                (1 - aa_freqs) *
                bkgd_freqs[:, np.newaxis])).transpose([1, 0])
        # outer product over positions/aa, reordered to align with Cijab's
        # (pos_i, pos_j, aa_a, aa_b) axes
        phi = np.multiply.outer(phi, phi).transpose([0, 2, 1, 3])

        # applying sca positional weights
        Cijab = phi * Cijab

    # Frobenius norm over the amino-acid axes collapses the 4D tensor to
    # a (Npos, Npos) matrix
    SCA_matrix = np.sqrt(np.sum(Cijab ** 2, axis=(2, 3)))

    return SCA_matrix
185
+
186
+
187
+ def compute_mutual_information_matrix(sequences, seq_weights=None,
188
+ freq_regul=__freq_regularization_ref,
189
+ normalize=True):
190
+ """Compute the mutual information matrix
191
+
192
+ .. math::
193
+
194
+ I(X, Y) = \\sum_{x,y} p(x, y) \\log \\frac{p(x, y)}{p(x)p(y)}
195
+
196
+ Parameters
197
+ ----------
198
+ sequences : list of sequences
199
+
200
+ seq_weights : ndarray (nseq), optional, default: None
201
+ if None, will compute sequence weights
202
+
203
+ freq_regul : regularization parameter (default=__freq_regularization_ref)
204
+
205
+ normalize : boolean, default : True
206
+ Whether to normalize the mutual information by the entropy.
207
+
208
+ Returns
209
+ -------
210
+ mi_matrix : np.ndarray of shape (nseq, nseq)
211
+ the matrix of mutual information
212
+ """
213
+
214
+ # computing frequencies
215
+ if seq_weights is None:
216
+ seq_weights, _ = compute_seq_weights(sequences)
217
+ aa_joint_freqs, aa_product_freqs = _compute_second_order_freqs(
218
+ sequences, seq_weights=seq_weights,
219
+ freq_regul=freq_regul)
220
+
221
+ # mutual information
222
+ mi_matrix = np.sum(
223
+ aa_joint_freqs * np.log(aa_joint_freqs / aa_product_freqs),
224
+ axis=(2, 3))
225
+
226
+ if normalize:
227
+ joint_entropy = -np.sum(aa_joint_freqs * np.log(aa_joint_freqs),
228
+ axis=(2, 3))
229
+ mi_matrix /= joint_entropy
230
+
231
+ return mi_matrix
232
+
233
+
234
+ def compute_apc(MIij):
235
+ """
236
+ Computes the average product correction (APC) as described in Dunn et
237
+ al. (2008).
238
+
239
+ .. math::
240
+
241
+ APC(a, b) = \\frac{MI(a, \\bar{x}) MI(b, \\bar{x}){\\overline{MI}}
242
+
243
+ where :math:`MI(a, \\bar{x})` is the mean mutual information of column *a*
244
+ and :math:`\\overline{MI}` is the overall mean mutual information
245
+
246
+ The corrected mutual information is then:
247
+
248
+ .. math::
249
+
250
+ MIp(a, b) = MI(a, b) - APC(a, b)
251
+
252
+ Parameters
253
+ ----------
254
+ MIij : np.ndarray,
255
+ the mutual information matrix
256
+
257
+ Returns
258
+ -------
259
+ APC_ij : np.ndarray,
260
+ the average product correction (APC) matrix
261
+
262
+ MIp : np.ndarray,
263
+ the APC corrected mutual information matrix
264
+ """
265
+
266
+ n = MIij.shape[0]
267
+ m = n - 1
268
+ # Replace the matrix diagonal by 0
269
+ np.fill_diagonal(MIij, 0)
270
+
271
+ MI_colmean = (1/m) * np.sum(MIij, axis=0)
272
+ MI_colmean = np.multiply.outer(MI_colmean, MI_colmean)
273
+
274
+ MI_overmean = (2/(m*n)) * np.sum(np.tril(MIij))
275
+
276
+ APC_ij = MI_colmean / MI_overmean
277
+
278
+ MIp = MIij - APC_ij
279
+
280
+ return APC_ij, MIp
281
+
282
+
283
+ def compute_entropy_correction(coevolution_matrix, s):
284
+
285
+ """
286
+ Computes the entropy correction according to Vorberg et al. (2018)
287
+
288
+ .. math::
289
+
290
+ C_{ij}^{EC} = C_{ij} - \\alpha s_{i}^{\\frac{1}{2}} \
291
+ s_{j}^{\\frac{1}{2}}
292
+
293
+ where :math:`\\alpha` is a coefficient determining the strength of the
294
+ correction:
295
+
296
+ .. math::
297
+
298
+ \\alpha = \\frac{\\sum_{i \\neq j}^{L} c_ij \
299
+ s_{i}^{\\frac{1}{2}}}{\\sum_{i \\neq j}^{L} s_i s_j}
300
+
301
+ Parameters
302
+ ----------
303
+ coevolution_matrix : square matrix of shape (Nseq, Nseq)
304
+
305
+ s : entropy computed for every position of the MSA
306
+
307
+ Returns
308
+ -------
309
+ a square matrix of shape (Nseq, Nseq)
310
+ """
311
+
312
+ s_prod = np.multiply.outer(s, s)
313
+ no_diag_eye = (1 - np.eye(s_prod.shape[0]))
314
+ alpha = np.sum(
315
+ (no_diag_eye * np.sqrt(s_prod) * coevolution_matrix) / np.sum(
316
+ (no_diag_eye * s_prod)))
317
+
318
+ return coevolution_matrix - alpha * np.sqrt(s_prod)