cocoatree 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cocoatree/__init__.py +8 -0
- cocoatree/__params.py +80 -0
- cocoatree/_pipeline.py +144 -0
- cocoatree/_scraper.py +23 -0
- cocoatree/_version.py +1 -0
- cocoatree/datasets/__init__.py +3 -0
- cocoatree/datasets/_base.py +188 -0
- cocoatree/datasets/data/DHFR/3QL0.pdb +3507 -0
- cocoatree/datasets/data/DHFR/DHFR_sectors.npz +0 -0
- cocoatree/datasets/data/DHFR/alignment.faa.gz +0 -0
- cocoatree/datasets/data/S1A_serine_proteases/3tgi.pdb +2844 -0
- cocoatree/datasets/data/S1A_serine_proteases/halabi_alignment.fasta +20580 -0
- cocoatree/datasets/data/S1A_serine_proteases/halabi_metadata.csv +1471 -0
- cocoatree/datasets/data/S1A_serine_proteases/halabi_sectors.npz +0 -0
- cocoatree/datasets/data/S1A_serine_proteases/rivoire_alignment.fasta +19460 -0
- cocoatree/datasets/data/S1A_serine_proteases/rivoire_metadata.csv +1391 -0
- cocoatree/datasets/data/S1A_serine_proteases/rivoire_sectors.npz +0 -0
- cocoatree/datasets/data/rhomboid_proteases/2NRF.pdb +3300 -0
- cocoatree/datasets/data/rhomboid_proteases/Data_S1_Rhomboid_MSA_short_names.fasta +5534 -0
- cocoatree/datasets/data/rhomboid_proteases/rhomboid_metadata_clean.csv +2766 -0
- cocoatree/datasets/data/rhomboid_proteases/rhomboid_sectors.npz +0 -0
- cocoatree/datasets/tests/test_datasets.py +14 -0
- cocoatree/decomposition.py +263 -0
- cocoatree/io.py +185 -0
- cocoatree/msa.py +579 -0
- cocoatree/pysca.py +238 -0
- cocoatree/randomize.py +30 -0
- cocoatree/scripts/cocoatree-sca.py +6 -0
- cocoatree/statistics/__init__.py +58 -0
- cocoatree/statistics/pairwise.py +318 -0
- cocoatree/statistics/position.py +258 -0
- cocoatree/tests/test_init.py +24 -0
- cocoatree/tests/test_msa.py +14 -0
- cocoatree/visualization.py +440 -0
- cocoatree-0.1.0.dist-info/METADATA +66 -0
- cocoatree-0.1.0.dist-info/RECORD +39 -0
- cocoatree-0.1.0.dist-info/WHEEL +5 -0
- cocoatree-0.1.0.dist-info/licenses/LICENSE +28 -0
- cocoatree-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,258 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
from ..__params import lett2num, __freq_regularization_ref, __aa_count, __freq0
|
|
3
|
+
from ..msa import compute_seq_weights
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def _compute_aa_freqs(sequences, seq_weights=None,
                      freq_regul=__freq_regularization_ref):
    """Compute regularized amino acid frequencies at each alignment position.

    .. math::

        f_i^a = \\frac{\\sum_s w_s x_{si}^a + \\lambda M_{eff} / N_{aa}}
        {(1 + \\lambda) M_{eff}}

    where

    .. math::

        M_{eff} = \\sum_s w_s

    represents the effective number of sequences in the alignment,
    :math:`N_{aa}` is the alphabet size (``__aa_count``) and *lambda* is a
    regularization parameter (pseudocount). Note that the pseudocount is
    scaled by :math:`M_{eff}`, i.e. *lambda* is expressed relative to the
    effective number of sequences.

    Parameters
    ----------
    sequences : list of sequences as imported by load_msa()
        All sequences must have the same length (aligned sequences).

    seq_weights : array-like of shape (Nseq,), optional
        Gives more or less importance to certain sequences. If
        seq_weights=None, all sequences are attributed an equal weight
        of 1.

    freq_regul : regularization parameter (default=__freq_regularization_ref)

    Returns
    -------
    aa_freq : np.ndarray of shape (Npos, number of symbols in lett2num)
        frequency of amino acid *a* at position *i*
    """

    # One row per sequence, one column per alignment position.
    msa = np.array([list(seq) for seq in sequences])

    if seq_weights is None:
        seq_weights = np.ones(len(sequences))
    # Accept any array-like (e.g. a plain list) of weights.
    seq_weights = np.asarray(seq_weights, dtype=float)
    m_eff = np.sum(seq_weights)

    # Weighted count of every symbol at every position. Processing one symbol
    # at a time keeps the peak memory at one (Nseq, Npos) mask instead of
    # materializing two (n_symbols, Nseq, Npos) arrays.
    weighted_counts = np.stack(
        [(msa == aa).T @ seq_weights for aa in lett2num], axis=1)

    aa_freq = (weighted_counts + freq_regul * m_eff / __aa_count) \
        / ((1 + freq_regul) * m_eff)

    return aa_freq
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _compute_background_freqs(aa_freqs, sequences, seq_weights=None,
                              freq_regul=__freq_regularization_ref):
    """Build the regularized background frequency vector.

    The reference frequencies ``__freq0`` are rescaled by ``1 - q0``, where
    ``q0`` is the mean of column 0 of `aa_freqs` (used as the fraction of
    gaps in the alignment), and ``q0`` itself is placed at index 0. The
    result is then regularized with the same pseudocount scheme as the
    positional frequencies.

    Parameters
    ----------
    aa_freqs : np.ndarray of the positional amino acid frequencies
        Two-dimensional; column 0 is averaged to obtain ``q0``.

    sequences : list of sequences for which seq_weights give weights
        Only its length is used, and only when `seq_weights` is None.

    seq_weights : numpy 1D array, optional
        Gives more or less importance to certain sequences.
        If seq_weights=None, all sequences are attributed an equal weight
        of 1.

    freq_regul : regularization parameter (default=__freq_regularization_ref)

    Returns
    -------
    bkgd_freqs : np.ndarray
        One background frequency per symbol (gap first). According to the
        package documentation, ``__freq0`` holds the mean frequency of each
        amino acid in the NCBI non-redundant database
        (see Rivoire et al., https://dx.plos.org/10.1371/journal.pcbi.1004817)
    """

    # Fraction of gaps in the alignment.
    gap_fraction = np.mean(aa_freqs[:, 0])

    # Gap frequency first, then the reference frequencies shrunk so that the
    # whole vector accounts for the proportion of gaps.
    raw_bkgd = np.array([gap_fraction, *((1 - gap_fraction) * __freq0)])

    # Effective number of sequences.
    weights = np.ones(len(sequences)) if seq_weights is None else seq_weights
    m_eff = np.sum(weights)

    # Regularization.
    numerator = raw_bkgd * m_eff + freq_regul * m_eff / __aa_count
    denominator = (1 + freq_regul) * m_eff

    return numerator / denominator
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def _compute_first_order_freqs(sequences, seq_weights=None,
                               freq_regul=__freq_regularization_ref):
    """
    Compute amino acid frequencies at each position and background frequencies

    Parameters
    ----------
    sequences : list of sequences for which seq_weights gives weights

    seq_weights : numpy 1D array, optional, default=None
        Gives more or less importance to certain sequences.
        If seq_weights=None, will compute sequence weights

    freq_regul : regularization parameter (default=__freq_regularization_ref)
        Applied to both the positional and the background frequencies.

    Returns
    -------
    aa_freqs : np.ndarray of the positional amino acid frequencies

    bkgd_freqs : np.ndarray
        Regularized background frequency of each symbol, as returned by
        `_compute_background_freqs`.

    (see Rivoire et al., https://dx.plos.org/10.1371/journal.pcbi.1004817)
    """

    if seq_weights is None:
        seq_weights, _ = compute_seq_weights(sequences)

    aa_freqs = _compute_aa_freqs(
        sequences,
        seq_weights=seq_weights,
        freq_regul=freq_regul)

    # Forward the caller's regularization so that positional and background
    # frequencies are regularized consistently (previously the module default
    # was always used here, silently ignoring `freq_regul`).
    bkgd_freqs = _compute_background_freqs(
        aa_freqs,
        sequences,
        seq_weights=seq_weights,
        freq_regul=freq_regul)

    return aa_freqs, bkgd_freqs
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def compute_entropy(aa_freq):
    """Computes Shannon's entropy for each position in the alignment

    .. math::

        H_i = -\\sum_a f_i^a \\ln f_i^a

    where :math:`H_i` is the entropy of position *i* and :math:`f_i^a` is
    the frequency of amino acid *a* at position *i*. Terms with a frequency
    of exactly zero contribute zero (the usual :math:`0 \\ln 0 = 0`
    convention) instead of producing NaN.

    Parameters
    ----------
    aa_freq : array-like of shape (N_pos, N_aa)
        amino acid frequencies per position

    Returns
    -------
    s: array of shape (N_pos)
        entropy (natural logarithm) of each position
    """

    aa_freq = np.asarray(aa_freq, dtype=float)

    # Replace exact zeros by 1 inside the logarithm only: log(1) = 0, so the
    # corresponding term is 0 * 0 = 0 rather than 0 * -inf = nan. Any other
    # value (including invalid negative ones) is passed through unchanged.
    log_freq = np.log(np.where(aa_freq == 0, 1.0, aa_freq))

    # Sum over axis 1, i.e. over the amino acids of each position.
    s = -np.sum(aa_freq * log_freq, axis=1)

    return s
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def compute_conservation(sequences, seq_weights=None,
                         freq_regul=__freq_regularization_ref):
    """
    Compute the conservation of each position of the alignment.

    The conservation of position `i` is the relative entropy
    (Kullback-Leibler divergence) of its amino acid frequencies with respect
    to the background frequencies

    .. math::

        D_i = \\sum_a f_i^a \\ln \\frac{f_i^a}{q^a}

    where :math:`f_i^a` is the observed frequency of amino acid `a` at
    position `i` and :math:`q^a` is the background expectation of `a`.

    Parameters
    ----------
    sequences : list of sequences

    seq_weights : ndarray (nseq), optional, default: None
        if None, will compute sequence weights

    freq_regul : regularization parameter (default=__freq_regularization_ref)

    Returns
    -------
    Di : np.ndarray (npos,)
        where each entry corresponds to the conservation at this position in
        the sequences.

    """

    # Positional frequencies and the matching background distribution.
    first_order = _compute_first_order_freqs(
        sequences, seq_weights, freq_regul)
    position_freqs, background = first_order

    # Only the per-position divergence is of interest here; the
    # per-amino-acid term is discarded.
    _per_aa, Di = _compute_rel_entropy(position_freqs, background)

    return Di
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
def _compute_rel_entropy(aa_freqs, bkgd_freqs):
    """Compute the relative entropy

    Also known as the Kullback-Leibler divergence

    .. math::

        D_i^a = f_i^a \\ln \\frac{f_i^a}{q^a}
        + (1 - f_i^a) \\ln \\frac{1 - f_i^a}{1 - q^a}

    where f_i^a is the observed frequency of amino acid *a* at position *i*,
    q^a is the background expectation

    D_i^a is known as the Kullback-Leibler relative entropy (Cover and Thomas,
    2012) and indicates how unlikely the observed frequencies of amino acid
    *a* at position *i* would be if *a* occurred randomly with probability q^a.

    Terms of the form ``0 * ln(0 / y)`` are taken to be 0 (their limit), so
    frequencies of exactly 0 or 1 no longer produce NaN.

    Parameters
    ----------
    aa_freqs: np.ndarray of shape (N_pos, N_aa),
        amino acid frequencies per position

    bkgd_freqs: np.ndarray, broadcastable against `aa_freqs`,
        background frequencies of amino acids

    Returns
    -------
    Dia: np.ndarray,
        relative entropy of aa_freqs given the background distribution of
        amino acids. Indicates how unlikely the observed frequency of amino
        acid *a* at position *i* would be if a occurred randomly with
        probability bkgd_freqs

    Di: np.ndarray,
        overall conservation of position *i* taking all amino acids into
        account, computed as the sum over amino acids of
        ``f_i^a * ln(f_i^a / q^a)``
    """

    aa_freqs = np.asarray(aa_freqs, dtype=float)
    bkgd_freqs = np.asarray(bkgd_freqs, dtype=float)

    def _x_log_ratio(x, y):
        # x * ln(x / y). The ratio is only evaluated where x != 0; elsewhere
        # it is left at 1 so that the term is exactly 0 * ln(1) = 0.
        ratio = np.ones(np.broadcast(x, y).shape)
        np.divide(x, y, out=ratio, where=(x != 0))
        return x * np.log(ratio)

    presence = _x_log_ratio(aa_freqs, bkgd_freqs)
    absence = _x_log_ratio(1 - aa_freqs, 1 - bkgd_freqs)

    Dia = presence + absence

    # sum on all amino acid at each position
    Di = np.sum(presence, axis=1)

    return Dia, Di
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
import cocoatree
|
|
2
|
+
import itertools
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def test_ghost():
    """Smoke test: print where the imported `cocoatree` package lives."""
    package_path = cocoatree.__file__
    print(package_path)
    assert True
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def test_perform_sca():
    """Smoke test: `perform_sca` runs for every metric/correction pair.

    Only the first 300 sequences of the rhomboid protease dataset are used.
    The results are not inspected; the test passes as long as no call raises.
    """
    n_seqs = 300
    dataset = cocoatree.datasets.load_rhomboid_proteases()
    ids_subset = dataset["sequence_ids"][:n_seqs]
    msa_subset = dataset["alignment"][:n_seqs]

    metrics = ["SCA", "MI", "NMI"]
    corrections = [None, "entropy", "APC"]

    for metric, correction in itertools.product(metrics, corrections):
        cocoatree.perform_sca(
            ids_subset,
            msa_subset,
            n_components=2,
            coevolution_metric=metric,
            correction=correction,
        )
    assert True
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
from cocoatree import msa
|
|
2
|
+
from cocoatree.datasets import load_S1A_serine_proteases
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def test_filter_seq_id():
    """Filter the S1A alignment down to its first 100 identifiers.

    Checks that the object returned by `msa.filter_seq_id` has a length of
    at most 100.
    """
    dataset = load_S1A_serine_proteases()
    all_ids = dataset["sequence_ids"]
    alignment = dataset["alignment"]
    wanted_ids = all_ids[:100]

    filtered_seq = msa.filter_seq_id(alignment, all_ids, wanted_ids)

    assert len(filtered_seq) <= 100
|