cocoatree 0.1.0rc0.dev2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. cocoatree/__init__.py +8 -0
  2. cocoatree/__params.py +80 -0
  3. cocoatree/_pipeline.py +144 -0
  4. cocoatree/_scraper.py +23 -0
  5. cocoatree/_version.py +1 -0
  6. cocoatree/datasets/__init__.py +3 -0
  7. cocoatree/datasets/_base.py +188 -0
  8. cocoatree/datasets/data/DHFR/3QL0.pdb +3507 -0
  9. cocoatree/datasets/data/DHFR/DHFR_sectors.npz +0 -0
  10. cocoatree/datasets/data/DHFR/alignment.faa.gz +0 -0
  11. cocoatree/datasets/data/S1A_serine_proteases/3tgi.pdb +2844 -0
  12. cocoatree/datasets/data/S1A_serine_proteases/halabi_alignment.fasta +20580 -0
  13. cocoatree/datasets/data/S1A_serine_proteases/halabi_metadata.csv +1471 -0
  14. cocoatree/datasets/data/S1A_serine_proteases/halabi_sectors.npz +0 -0
  15. cocoatree/datasets/data/S1A_serine_proteases/rivoire_alignment.fasta +19460 -0
  16. cocoatree/datasets/data/S1A_serine_proteases/rivoire_metadata.csv +1391 -0
  17. cocoatree/datasets/data/S1A_serine_proteases/rivoire_sectors.npz +0 -0
  18. cocoatree/datasets/data/rhomboid_proteases/2NRF.pdb +3300 -0
  19. cocoatree/datasets/data/rhomboid_proteases/Data_S1_Rhomboid_MSA_short_names.fasta +5534 -0
  20. cocoatree/datasets/data/rhomboid_proteases/rhomboid_metadata_clean.csv +2766 -0
  21. cocoatree/datasets/data/rhomboid_proteases/rhomboid_sectors.npz +0 -0
  22. cocoatree/datasets/tests/test_datasets.py +14 -0
  23. cocoatree/decomposition.py +263 -0
  24. cocoatree/io.py +185 -0
  25. cocoatree/msa.py +579 -0
  26. cocoatree/pysca.py +238 -0
  27. cocoatree/randomize.py +30 -0
  28. cocoatree/scripts/cocoatree-sca.py +6 -0
  29. cocoatree/statistics/__init__.py +58 -0
  30. cocoatree/statistics/pairwise.py +318 -0
  31. cocoatree/statistics/position.py +258 -0
  32. cocoatree/tests/test_init.py +24 -0
  33. cocoatree/tests/test_msa.py +14 -0
  34. cocoatree/visualization.py +440 -0
  35. cocoatree-0.1.0rc0.dev2.dist-info/METADATA +66 -0
  36. cocoatree-0.1.0rc0.dev2.dist-info/RECORD +39 -0
  37. cocoatree-0.1.0rc0.dev2.dist-info/WHEEL +5 -0
  38. cocoatree-0.1.0rc0.dev2.dist-info/licenses/LICENSE +28 -0
  39. cocoatree-0.1.0rc0.dev2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,14 @@
1
+ from cocoatree.datasets import load_S1A_serine_proteases, \
2
+ load_rhomboid_proteases
3
+
4
+
5
def test_load_S1A_serine_proteases():
    """Check that the S1A serine protease dataset exposes its core keys."""
    loaded = load_S1A_serine_proteases()
    available = loaded.keys()
    for expected in ("sequence_ids", "alignment"):
        assert expected in available
9
+
10
+
11
def test_load_rhomboid_proteases():
    """Check that the rhomboid protease dataset exposes its core keys."""
    loaded = load_rhomboid_proteases()
    available = loaded.keys()
    for expected in ("sequence_ids", "alignment"):
        assert expected in available
@@ -0,0 +1,263 @@
1
+ import numpy as np
2
+ from .__params import __freq_regularization_ref
3
+ from cocoatree.randomize import _randomize_seqs_conserving_col_compo
4
+ from cocoatree.msa import compute_seq_weights
5
+ from cocoatree.statistics.pairwise import compute_sca_matrix
6
+ from cocoatree.pysca import _compute_ica, _icList
7
+
8
+
9
def extract_independent_components(coevo_matrix, method=None,
                                   n_components=3, nrandom_pySCA=10,
                                   sequences=None,
                                   learnrate_ICA=0.1, nb_iterations_ICA=100000,
                                   freq_regul=__freq_regularization_ref,
                                   verbose_random_iter=True):
    """
    Extract independent components from a coevolution matrix

    The current method is fully applicable to SCA analysis. For other metrics,
    we set n_components = 3 (to improve)

    Parameters
    ----------
    coevo_matrix : np.ndarray
        coevolution matrix

    method : {None, "pySCA"}, default=None
        Method used to estimate the number of components to extract. With
        None, the value of `n_components` is used as provided. The name is
        matched case-insensitively ("pysca" is accepted as well).

    n_components : int, default=3
        Number of independent components to extract. Replaced by the
        estimated value when method="pySCA".

    nrandom_pySCA : int, default=10
        Number of MSA randomizations to perform if method="pySCA"

    sequences : list of sequences, optional, default: None
        Required when method="pySCA", where it is used to estimate the
        number of components.

    learnrate_ICA : float, default=0.1
        Learning rate / relaxation parameter forwarded to the ICA routine
        (used whatever the value of `method`)

    nb_iterations_ICA : int, default=100000
        Number of iterations forwarded to the ICA routine (used whatever the
        value of `method`)

    freq_regul : regularization parameter (default=__freq_regularization_ref)
        Only forwarded to the component-number estimation (method="pySCA")

    verbose_random_iter : boolean, default=True
        Only forwarded to the component-number estimation (method="pySCA")

    Returns
    -------
    idpt_components : ndarray of shape (n_components, n_pos)
        corresponding to a list of independent components

    Raises
    ------
    ValueError
        If `method` is neither None nor "pySCA", or if method="pySCA" and
        `sequences` is None.
    """

    if method is not None:
        # The documented spelling has varied ("pysca" / "pySCA"); accept both.
        if not isinstance(method, str) or method.lower() != 'pysca':
            raise ValueError(
                f"{method} is not a valid method. Options are None, 'pySCA'")

        if sequences is None:
            raise ValueError(
                "Sequences need to be provided to estimate "
                "the number of components automatically")

        n_components = _compute_n_components_as_pySCA(
            sequences, coevo_matrix,
            nrandom=nrandom_pySCA, freq_regul=freq_regul,
            verbose_random_iter=verbose_random_iter)

    # Only the left singular vectors feed the ICA.
    left_vectors, _, _ = np.linalg.svd(coevo_matrix)
    Vica, _ = _compute_ica(left_vectors, n_components,
                           learnrate=learnrate_ICA,
                           iterations=nb_iterations_ICA)

    # Transposed so that each row is one independent component.
    idpt_components = Vica.T

    return idpt_components
80
+
81
+
82
def _compute_n_components_as_pySCA(sequences, coevo_matrix,
                                   seq_weights=None,
                                   nrandom=10,
                                   freq_regul=__freq_regularization_ref,
                                   verbose_random_iter=True):
    """
    Compute the number of independent components as in pySCA

    Given the singular values of the coevolution matrix, and the second
    singular value of each randomized SCA matrix, return the number of
    significant eigenmodes as those above the average second
    eigenvalue plus 2 standard deviations.
    Based on S1 text of Rivoire et al. (2016)

    Rem: it concerns only SCA metrics
    For other metrics (MI, adding corrections) this should be adapted

    Parameters
    ----------
    sequences : list of sequences

    coevo_matrix : np.ndarray of shape (n_pos, n_pos)
        coevolution matrix

    seq_weights : np.array (nseq, ) of each sequence weight
        When None, weights are computed with `compute_seq_weights`.

    nrandom : int
        Number of randomizations performed

    freq_regul : regularization parameter (default=__freq_regularization_ref)
        Forwarded to `compute_sca_matrix` for each randomized alignment

    verbose_random_iter : boolean, default=True
        Print the advance of the randomization procedure

    Returns
    -------
    n_components : int
        Number of independent components to select
    """

    if seq_weights is None:
        seq_weights, m_eff = compute_seq_weights(sequences)
    else:
        m_eff = np.sum(seq_weights)

    second_eigen_values_random = []
    for irand in range(nrandom):
        if verbose_random_iter:
            # Adjacent literals (not a backslash inside the string) so the
            # source indentation does not leak into the printed message.
            print('%d/%d randomized msa (to compute number of '
                  'significant components) '
                  % (irand+1, nrandom), end='\r')
        rand_sequences = _randomize_seqs_conserving_col_compo(sequences)

        seq_weights_rand, m_eff_rand = compute_seq_weights(rand_sequences)

        # to get the correct m_eff: rescale the randomized weights by the
        # ratio between the reference m_eff and the randomized one
        seq_weights_rand = seq_weights_rand / m_eff_rand * m_eff

        SCA_rand = compute_sca_matrix(rand_sequences, seq_weights_rand,
                                      freq_regul=freq_regul)

        _, S, _ = np.linalg.svd(SCA_rand)
        second_eigen_values_random.append(S[1])

    mean_second_ev_rand = np.mean(second_eigen_values_random)
    std_second_ev_rand = np.std(second_eigen_values_random)
    threshold = mean_second_ev_rand + 2 * std_second_ev_rand

    _, S_input, _ = np.linalg.svd(coevo_matrix)

    # Count the modes of the input matrix standing out of the random noise.
    n_components = int(np.count_nonzero(S_input > threshold))

    return n_components
155
+
156
+
157
def extract_principal_components(coevo_matrix):
    """
    Decompose a coevolution matrix into its principal components

    The components are taken as the third factor returned by
    ``np.linalg.svd`` (the transposed right singular vectors).

    Parameters
    ----------
    coevo_matrix : np.ndarray
        coevolution matrix

    Returns
    -------
    principal_components : np.ndarray (n_pos, n_pos)
        Third SVD factor of the coevolution matrix
    """
    svd_factors = np.linalg.svd(coevo_matrix)
    principal_components = svd_factors[2]
    return principal_components
175
+
176
+
177
def extract_xcors_from_ICs(idpt_components, coevo_matrix):
    """
    Extract residue positions of XCoRs from independent components

    Parameters
    ----------
    idpt_components : independent components obtained from an ICA
        One component per row; the transpose is handed to ``_icList``.

    coevo_matrix : coevolution matrix

    Returns
    -------
    xcors : lists of residue positions on the filtered MSA for each of the
        n_components xcor. An empty list is returned when ``_icList``
        reports no XCoR.
    """
    Vica = idpt_components.T
    _, xcor_sizes, sorted_pos, _, _, _ = _icList(
        Vica, len(idpt_components), coevo_matrix)

    # sorted_pos is the concatenation of all XCoRs; xcor_sizes gives the
    # length of each consecutive chunk. A single uniform loop handles every
    # chunk, including the case where there is none at all.
    xcors = []
    start = 0
    for size in xcor_sizes:
        stop = start + size
        xcors.append([sorted_pos[i] for i in range(start, stop)])
        start = stop
    return xcors
204
+
205
+
206
def extract_xcors(coevo_matrix, n_xcors=3):
    """
    Extract residue positions of XCoRs directly from the coevo_matrix

    Convenience wrapper: runs ``extract_independent_components`` with
    ``n_components=n_xcors`` and hands the result to
    ``extract_xcors_from_ICs``.

    Parameters
    ----------
    coevo_matrix : coevolution matrix

    n_xcors : int
        Number of XCoRs to return

    Returns
    -------
    xcors : lists of residue positions on the filtered MSA for each of the
        n_xcors XCoR
    """

    # extracting independent components
    idpt_components = extract_independent_components(coevo_matrix,
                                                     n_components=n_xcors)

    # Reuse the shared chunking logic instead of duplicating it here.
    return extract_xcors_from_ICs(idpt_components, coevo_matrix)
239
+
240
+
241
def remove_global_correlations(coevo_matrix):
    """
    Strip the dominant mode from a coevolution matrix

    The matrix is decomposed by SVD, its first singular value is set to 0,
    the matrix is rebuilt from the modified decomposition, and negative
    entries of the result are clipped to 0.

    In the sector literature (and data analysis), this corresponds
    to removing global correlations (from e.g. phylogenetic effects)

    Parameters
    ----------
    coevo_matrix : np.ndarray (n_pos, n_pos),
        coevolution matrix

    Returns
    -------
    coevo_matrix_sub : np.ndarray (n_pos, n_pos),
        coevolution matrix without global correlations
    """
    left, singular_values, right_t = np.linalg.svd(coevo_matrix)

    # Silence the leading mode before rebuilding the matrix.
    singular_values[0] = 0
    rebuilt = np.linalg.multi_dot(
        [left, np.diag(singular_values), right_t])

    return np.maximum(rebuilt, 0)
cocoatree/io.py ADDED
@@ -0,0 +1,185 @@
1
+ import warnings
2
+
3
+ from Bio import AlignIO
4
+ from Bio.PDB import PDBParser
5
+ from .msa import _clean_msa
6
+ from .__params import aatable
7
+ from ete3 import Tree
8
+ import numpy as np
9
+
10
+
11
def load_MSA(file_path, format="fasta", clean=True, verbose=False):
    """Load a multiple sequence alignment (MSA) from a file

    Parameters
    ----------
    file_path : path to the alignment file

    format : string {"fasta", "phylip", …}, optional, default: "fasta"
        format of the alignment file, handed as is to biopython's
        Bio.AlignIO.read (any format it supports is accepted).

    clean : boolean, default=True
        whether to pass the alignment through ``_clean_msa`` (documented
        upstream as removing ambiguous amino acids, e.g. B, X etc.)

    verbose : boolean, default=False
        whether to print the number of sequences and the alignment length

    Returns
    -------
    a dictionary containing:
        - `sequence_ids`: list of sequence identifiers
        - `alignment`: list of sequences as strings
    """
    msa = AlignIO.read(file_path, format)
    if clean:
        msa = _clean_msa(msa)

    identifiers = [entry.id for entry in msa]
    residues = [str(entry.seq) for entry in msa]

    if verbose:
        print('Number of sequences: %i' % len(msa))
        print('Alignment of length: %i' % len(msa[0]))

    return {"sequence_ids": identifiers, "alignment": residues}
48
+
49
+
50
def load_tree_ete3(file_path):
    """
    Build an ``ete3.Tree`` object from a Newick file

    The Newick file must be of the form: `(A:1,(B:1,(C:1,D:1):0.5):0.5);`
    or `(A:1,(B:1,(C:1,D:1)95:0.5)98:0.5);` if branch support values are
    available. The tree is parsed with ete3's ``format=0``.

    Parameters
    ----------
    file_path : path to the Newick file

    Returns
    -------
    tree_ete3 : ``ete3.Tree`` object
    """
    return Tree(file_path, format=0)
69
+
70
+
71
def export_fasta(sequences, sequences_id, outpath):
    """
    Write sequences to a FASTA file

    Each record is written as a ``>identifier`` header line followed by the
    sequence on a single line.

    Parameters
    ----------
    sequences : list of sequences as strings (as imported by load_MSA)

    sequences_id : list of sequences identifiers (as imported by load_MSA)

    outpath : path to the output file
    """

    # Add checks to see if the path exists?
    indices = range(len(sequences))
    with open(outpath, 'w') as handle:
        for idx in indices:
            header = '>' + str(sequences_id[idx]) + '\n'
            handle.write(header)
            body = str(sequences[idx]) + '\n'
            handle.write(body)
90
+
91
+
92
def load_pdb(path2pdb, pdb_id, chain):
    """
    Read in a PDB file.

    Import a PDB file and extract the associated sequence along with the
    amino acid positions. Only residues whose hetero flag is blank are kept,
    and only the first model of the structure is read.

    Parameters
    ----------
    path2pdb : path to the PDB file

    pdb_id : str,
        the id that will be used for the structure

    chain : str,
        name of the chain to read

    Returns
    -------
    pdb_seq : str,
        amino acid sequence of the PDB file; residues whose name is not in
        ``aatable`` are reported on stdout and encoded as "X"

    pdb_pos : list,
        PDB position of each amino acid, as a string made of the residue
        number followed by its insertion code (if any)
    """
    # Parser warnings are silenced on purpose while reading the file.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        parser = PDBParser(PERMISSIVE=1)
        structure = parser.get_structure(pdb_id, path2pdb)

    # Fill up sequence and label information
    residues = [res for res in structure[0][chain] if res.get_id()[0] == " "]

    letters = []
    pdb_pos = []
    for res in residues:
        res_id = res.get_id()
        pdb_pos.append(str(res_id[1]) + str(res_id[2]).strip())
        try:
            letters.append(aatable[res.get_resname()])
        except KeyError as e:
            # Unknown residue name: keep going with a placeholder. Only the
            # failed lookup is caught so that interrupts and unrelated
            # errors are not silently swallowed.
            print("Error: " + str(e))
            letters.append("X")

    pdb_seq = "".join(letters)

    return pdb_seq, pdb_pos
135
+
136
+
137
def export_xcor_for_pymol(mapping, independent_components, axis,
                          xcor_pos_in_loaded_msa,
                          xcor_pos_in_filtered_msa,
                          outpath):
    """
    Save an XCoR's PDB positions and IC contributions for use in PyMOL.

    For every residue of the XCoR, the PDB position is looked up through
    `mapping` and the contribution is read from `independent_components`;
    both lists are stacked into one array and written with ``np.save``.

    Parameters
    ----------
    mapping : numpy.ndarray,
        mapping between the unfiltered MSA and the PDB structure, output of
        cocoatree.msa.map_to_pdb() function. ``mapping[2]`` is searched for
        the MSA position (as a string) and ``mapping[1]`` provides the
        matching PDB position.

    independent_components : numpy.ndarray,
        independent components, indexed here as ``[position, axis]``

    axis : int,
        rank of the independent component associated with the desired XCoR

    xcor_pos_in_loaded_msa : list,
        positions of the XCoR's residues in the unfiltered MSA

    xcor_pos_in_filtered_msa : numpy.ndarray,
        positions of the XCoR's residues in the filtered MSA

    outpath : str,
        path to the output file as a binary in .npy format

    Returns
    -------
    binary file in .npy format containing an array with the positions of the
    XCoR's residues and an array with their contribution to the independent
    component.
    """
    # NOTE(review): both lists go into a single np.array, so NumPy coerces
    # them to a common dtype — confirm the PyMOL-side reader expects that.
    xcor_pdb_pos = [
        mapping[1][np.where(mapping[2] == str(msa_pos))[0][0]]
        for msa_pos in xcor_pos_in_loaded_msa
    ]

    ic_contributions = [
        independent_components[filtered_pos, axis]
        for filtered_pos in xcor_pos_in_filtered_msa
    ]

    stacked = np.array([xcor_pdb_pos, ic_contributions])
    np.save(outpath, stacked)