cocoatree 0.1.0rc0.dev2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. cocoatree/__init__.py +8 -0
  2. cocoatree/__params.py +80 -0
  3. cocoatree/_pipeline.py +144 -0
  4. cocoatree/_scraper.py +23 -0
  5. cocoatree/_version.py +1 -0
  6. cocoatree/datasets/__init__.py +3 -0
  7. cocoatree/datasets/_base.py +188 -0
  8. cocoatree/datasets/data/DHFR/3QL0.pdb +3507 -0
  9. cocoatree/datasets/data/DHFR/DHFR_sectors.npz +0 -0
  10. cocoatree/datasets/data/DHFR/alignment.faa.gz +0 -0
  11. cocoatree/datasets/data/S1A_serine_proteases/3tgi.pdb +2844 -0
  12. cocoatree/datasets/data/S1A_serine_proteases/halabi_alignment.fasta +20580 -0
  13. cocoatree/datasets/data/S1A_serine_proteases/halabi_metadata.csv +1471 -0
  14. cocoatree/datasets/data/S1A_serine_proteases/halabi_sectors.npz +0 -0
  15. cocoatree/datasets/data/S1A_serine_proteases/rivoire_alignment.fasta +19460 -0
  16. cocoatree/datasets/data/S1A_serine_proteases/rivoire_metadata.csv +1391 -0
  17. cocoatree/datasets/data/S1A_serine_proteases/rivoire_sectors.npz +0 -0
  18. cocoatree/datasets/data/rhomboid_proteases/2NRF.pdb +3300 -0
  19. cocoatree/datasets/data/rhomboid_proteases/Data_S1_Rhomboid_MSA_short_names.fasta +5534 -0
  20. cocoatree/datasets/data/rhomboid_proteases/rhomboid_metadata_clean.csv +2766 -0
  21. cocoatree/datasets/data/rhomboid_proteases/rhomboid_sectors.npz +0 -0
  22. cocoatree/datasets/tests/test_datasets.py +14 -0
  23. cocoatree/decomposition.py +263 -0
  24. cocoatree/io.py +185 -0
  25. cocoatree/msa.py +579 -0
  26. cocoatree/pysca.py +238 -0
  27. cocoatree/randomize.py +30 -0
  28. cocoatree/scripts/cocoatree-sca.py +6 -0
  29. cocoatree/statistics/__init__.py +58 -0
  30. cocoatree/statistics/pairwise.py +318 -0
  31. cocoatree/statistics/position.py +258 -0
  32. cocoatree/tests/test_init.py +24 -0
  33. cocoatree/tests/test_msa.py +14 -0
  34. cocoatree/visualization.py +440 -0
  35. cocoatree-0.1.0rc0.dev2.dist-info/METADATA +66 -0
  36. cocoatree-0.1.0rc0.dev2.dist-info/RECORD +39 -0
  37. cocoatree-0.1.0rc0.dev2.dist-info/WHEEL +5 -0
  38. cocoatree-0.1.0rc0.dev2.dist-info/licenses/LICENSE +28 -0
  39. cocoatree-0.1.0rc0.dev2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,258 @@
1
+ import numpy as np
2
+ from ..__params import lett2num, __freq_regularization_ref, __aa_count, __freq0
3
+ from ..msa import compute_seq_weights
4
+
5
+
6
def _compute_aa_freqs(sequences, seq_weights=None,
                      freq_regul=__freq_regularization_ref):
    """Compute regularized amino acid frequencies at each alignment position.

    .. math::

        f_i^a = \\frac{\\sum_s w_s x_{si}^a + \\lambda M_{eff} / N_{aa}}
                      {(1 + \\lambda) M_{eff}}

    where

    .. math::

        M_{eff} = \\sum_s w_s

    is the effective number of sequences in the alignment, *lambda*
    (``freq_regul``) is the regularization parameter, expressed as a
    fraction of :math:`M_{eff}`, and :math:`N_{aa}` is ``__aa_count``.

    Parameters
    ----------
    sequences : list of sequences as imported by load_msa()
        All sequences must have the same length (aligned).

    seq_weights : 1D array-like, optional
        Gives more or less importance to certain sequences. If
        seq_weights=None, all sequences are attributed an equal weight
        of 1.

    freq_regul : regularization parameter (default=__freq_regularization_ref)

    Returns
    -------
    aa_freq : np.ndarray of shape (Npos, n_symbols)
        frequency of amino acid *a* at position *i*; one column per key of
        ``lett2num``, in the iteration order of that dictionary
    """

    # Character matrix of shape (Nseq, Npos): one row per aligned sequence.
    residues = np.array([list(seq) for seq in sequences])

    if seq_weights is None:
        seq_weights = np.ones(len(sequences))
    # Accept any 1D array-like (e.g. a plain list): the broadcasting below
    # relies on ndarray indexing, which a list does not support.
    seq_weights = np.asarray(seq_weights, dtype=float)
    m_eff = np.sum(seq_weights)

    # One indicator layer per symbol: shape (n_symbols, Nseq, Npos).
    one_hot = np.array([residues == aa for aa in lett2num]).astype(int)

    # Weighted counts per position and symbol: shape (Npos, n_symbols).
    weighted_counts = np.sum(
        one_hot * seq_weights[np.newaxis, :, np.newaxis], axis=1).T

    aa_freq = (weighted_counts + freq_regul * m_eff / __aa_count) \
        / ((1 + freq_regul) * m_eff)

    return aa_freq
51
+
52
+
53
def _compute_background_freqs(aa_freqs, sequences, seq_weights=None,
                              freq_regul=__freq_regularization_ref):
    """Compute the (regularized) background frequencies of amino acids.

    The gap frequency is estimated from the alignment as the mean of the
    first column of ``aa_freqs``; the reference frequencies ``__freq0`` are
    rescaled by the remaining (non-gap) fraction and the gap frequency is
    prepended. The same pseudocount regularization as for positional
    frequencies is then applied.

    Parameters
    ----------
    aa_freqs : np.ndarray of the positional amino acid frequencies
        Column 0 is read as the gap frequency.

    sequences : list of sequences for which seq_weights give weights
        Only its length is used, and only when seq_weights is None.

    seq_weights : numpy 1D array, optional
        Gives more or less importance to certain sequences.
        If seq_weights=None, all sequences are attributed an equal weight
        of 1. Note that the total weight appears in both the numerator and
        the denominator of the regularization, so it cancels algebraically.

    freq_regul : regularization parameter (default=__freq_regularization_ref)

    Returns
    -------
    bkgd_freqs : np.ndarray, 1D
        Background frequencies (gap first, then one entry per value of
        ``__freq0``). They do not depend on the position. According to the
        original documentation, ``__freq0`` is the mean frequency of each
        amino acid in all proteins of the NCBI non-redundant database
        (see Rivoire et al., https://dx.plos.org/10.1371/journal.pcbi.1004817)
    """

    # Fraction of gaps in the alignment.
    gap_fraction = np.mean(aa_freqs[:, 0])

    # Rescale the reference frequencies so that, together with the gap
    # fraction placed in front, they account for the gaps.
    raw_freqs = np.array([gap_fraction, *((1 - gap_fraction) * __freq0)])

    # Effective number of sequences.
    weights = np.ones(len(sequences)) if seq_weights is None else seq_weights
    total_weight = np.sum(weights)

    # Pseudocount regularization.
    pseudocount = freq_regul * total_weight / __aa_count
    normalization = (1 + freq_regul) * total_weight

    return (raw_freqs * total_weight + pseudocount) / normalization
98
+
99
+
100
def _compute_first_order_freqs(sequences, seq_weights=None,
                               freq_regul=__freq_regularization_ref):
    """
    Compute amino acid frequencies at each position and background frequencies

    Parameters
    ----------
    sequences : list of sequences for which seq_weights gives weights

    seq_weights : numpy 1D array, optional, default=None
        Gives more or less importance to certain sequences.
        If seq_weights=None, sequence weights are computed with
        ``compute_seq_weights(sequences)``.

    freq_regul : regularization parameter (default=__freq_regularization_ref)
        Applied to both the positional and the background frequencies.

    Returns
    -------
    aa_freqs : np.ndarray of the positional amino acid frequencies

    bkgd_freqs : np.ndarray, 1D
        Regularized background amino acid frequencies (gap first), as
        returned by ``_compute_background_freqs``.

        (see Rivoire et al., https://dx.plos.org/10.1371/journal.pcbi.1004817)
    """

    if seq_weights is None:
        # compute_seq_weights returns a pair; only the weights are needed.
        seq_weights, _ = compute_seq_weights(sequences)

    aa_freqs = _compute_aa_freqs(
        sequences,
        seq_weights=seq_weights,
        freq_regul=freq_regul)

    # Forward the caller's regularization instead of the module default so
    # that positional and background frequencies are regularized alike.
    bkgd_freqs = _compute_background_freqs(
        aa_freqs,
        sequences,
        seq_weights=seq_weights,
        freq_regul=freq_regul)

    return aa_freqs, bkgd_freqs
142
+
143
+
144
def compute_entropy(aa_freq):
    """Computes Shannon's entropy for each position in the alignment

    .. math::

        H_i = -\\sum_a f_i^a \\ln f_i^a

    where :math:`H_i` is the entropy of position *i* and :math:`f_i^a` is
    the frequency of amino acid *a* at position *i*. Terms with a zero
    frequency contribute 0 (convention :math:`0 \\ln 0 = 0`), so unobserved
    amino acids do not turn the entropy of a position into ``nan``.

    Parameters
    ----------
    aa_freq : np.ndarray of shape (N_pos, n_symbols)
        amino acid frequencies per position

    Returns
    -------
    s : np.ndarray of shape (N_pos,)
        entropy (natural logarithm) of each position
    """

    freqs = np.asarray(aa_freq, dtype=float)

    # Evaluate the logarithm only where the frequency is non-zero; the
    # remaining entries keep their initial value of 0, hence 0 * ln(0) -> 0.
    log_freqs = np.zeros_like(freqs)
    np.log(freqs, out=log_freqs, where=freqs != 0)

    s = -np.sum(freqs * log_freqs, axis=1)

    return s
167
+
168
+
169
def compute_conservation(sequences, seq_weights=None,
                         freq_regul=__freq_regularization_ref):
    """
    Compute the conservation of each position of the alignment.

    The conservation is computed as the relative entropy (i.e., the
    Kullback-Leibler divergence) of the observed amino acid frequencies
    with respect to the background frequencies, summed over amino acids:

    .. math::

        D_i = \\sum_a f_i^a \\ln \\frac{f_i^a}{q^a}

    where :math:`f_i^a` is the observed frequency of amino acid `a` at
    position `i` and :math:`q^a` is the background expectation.

    :math:`D_i` indicates how unlikely the observed frequencies at position
    `i` would be if amino acids occurred randomly with probabilities
    :math:`q^a`.

    Parameters
    ----------
    sequences : list of sequences

    seq_weights : ndarray (nseq), optional, default: None
        if None, will compute sequence weights

    freq_regul : regularization parameter (default=__freq_regularization_ref)

    Returns
    -------
    Di : np.ndarray (npos,)
        where each entry corresponds to the conservation at this position in
        the sequences.

    """

    positional_freqs, background_freqs = _compute_first_order_freqs(
        sequences, seq_weights, freq_regul)

    # Only the per-position summary is returned; the per-amino-acid term is
    # discarded.
    _per_amino_acid, per_position = _compute_rel_entropy(
        positional_freqs, background_freqs)

    return per_position
212
+
213
+
214
def _compute_rel_entropy(aa_freqs, bkgd_freqs):
    """Compute the relative entropy

    Also known as the Kullback-Leibler divergence

    .. math::

        D_i^a = f_i^a \\ln \\frac{f_i^a}{q^a}
                + (1 - f_i^a) \\ln \\frac{1 - f_i^a}{1 - q^a}

    where f_i^a is the observed frequency of amino acid *a* at position *i*,
    q^a is the background expectation

    D_i^a is known as the Kullback-Leibler relative entropy (Cover and Thomas,
    2012) and indicates how unlikely the observed frequencies of amino acid
    *a* at position *i* would be if *a* occurred randomly with probability q^a.

    The overall conservation of a position is

    .. math::

        D_i = \\sum_a f_i^a \\ln \\frac{f_i^a}{q^a}

    Terms of the form ``0 * ln(0 / q)`` are taken to be 0, so frequencies
    equal to exactly 0 or 1 (possible without regularization) do not produce
    ``nan``.

    Parameters
    ----------
    aa_freqs : np.ndarray of shape (N_pos, n_symbols)
        amino acid frequencies per position

    bkgd_freqs : np.ndarray of shape (n_symbols,)
        background frequencies of amino acids

    Returns
    -------
    Dia : np.ndarray of shape (N_pos, n_symbols)
        relative entropy of aa_freqs given the background distribution of
        amino acids. Indicates how unlikely the observed frequency of amino
        acid *a* at position *i* would be if *a* occurred randomly with
        probability bkgd_freqs

    Di : np.ndarray of shape (N_pos,)
        overall conservation of position *i* taking all amino acids into
        account
    """

    aa_freqs = np.asarray(aa_freqs, dtype=float)
    bkgd_freqs = np.asarray(bkgd_freqs, dtype=float)

    # f * ln(f / q): shared by the per-amino-acid and per-position results.
    observed_term = _plogp_over_q(aa_freqs, bkgd_freqs)

    Dia = observed_term + _plogp_over_q(1 - aa_freqs, 1 - bkgd_freqs)

    # sum on all amino acid at each position
    Di = np.sum(observed_term, axis=1)

    return Dia, Di


def _plogp_over_q(p, q):
    """Return ``p * ln(p / q)`` elementwise, with ``0 * ln(0 / q)`` set to 0."""
    p, q = np.broadcast_arrays(np.asarray(p, dtype=float),
                               np.asarray(q, dtype=float))
    log_ratio = np.zeros(p.shape)
    # Only evaluate the logarithm where p is non-zero; elsewhere the product
    # is defined as 0 (the limit of x * ln(x) when x goes to 0).
    support = p != 0
    log_ratio[support] = np.log(p[support] / q[support])
    return p * log_ratio
@@ -0,0 +1,24 @@
1
+ import cocoatree
2
+ import itertools
3
+
4
+
5
def test_ghost():
    """Smoke test: the package can be imported and reports its location."""
    package_location = cocoatree.__file__
    print(package_location)
    assert True
8
+
9
+
10
def test_perform_sca():
    """Run ``perform_sca`` on every metric/correction pair without error."""
    dataset = cocoatree.datasets.load_rhomboid_proteases()

    # Work on the first sequences only to keep the test fast.
    n_kept = 300
    kept_ids = dataset["sequence_ids"][:n_kept]
    kept_alignment = dataset["alignment"][:n_kept]

    metric_names = ["SCA", "MI", "NMI"]
    correction_names = [None, "entropy", "APC"]

    for combination in itertools.product(metric_names, correction_names):
        metric, correction = combination
        cocoatree.perform_sca(
            kept_ids,
            kept_alignment,
            n_components=2,
            coevolution_metric=metric,
            correction=correction,
        )

    # Reaching this point means that no combination raised.
    assert True
@@ -0,0 +1,14 @@
1
+ from cocoatree import msa
2
+ from cocoatree.datasets import load_S1A_serine_proteases
3
+
4
+
5
def test_filter_seq_id():
    """Filtering on the first 100 identifiers yields at most 100 items."""
    dataset = load_S1A_serine_proteases()
    all_ids = dataset["sequence_ids"]
    alignment = dataset["alignment"]

    wanted_ids = all_ids[:100]
    filtered_seq = msa.filter_seq_id(alignment, all_ids, wanted_ids)

    assert len(filtered_seq) <= 100