cocoatree 0.1.0rc0.dev2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. cocoatree/__init__.py +8 -0
  2. cocoatree/__params.py +80 -0
  3. cocoatree/_pipeline.py +144 -0
  4. cocoatree/_scraper.py +23 -0
  5. cocoatree/_version.py +1 -0
  6. cocoatree/datasets/__init__.py +3 -0
  7. cocoatree/datasets/_base.py +188 -0
  8. cocoatree/datasets/data/DHFR/3QL0.pdb +3507 -0
  9. cocoatree/datasets/data/DHFR/DHFR_sectors.npz +0 -0
  10. cocoatree/datasets/data/DHFR/alignment.faa.gz +0 -0
  11. cocoatree/datasets/data/S1A_serine_proteases/3tgi.pdb +2844 -0
  12. cocoatree/datasets/data/S1A_serine_proteases/halabi_alignment.fasta +20580 -0
  13. cocoatree/datasets/data/S1A_serine_proteases/halabi_metadata.csv +1471 -0
  14. cocoatree/datasets/data/S1A_serine_proteases/halabi_sectors.npz +0 -0
  15. cocoatree/datasets/data/S1A_serine_proteases/rivoire_alignment.fasta +19460 -0
  16. cocoatree/datasets/data/S1A_serine_proteases/rivoire_metadata.csv +1391 -0
  17. cocoatree/datasets/data/S1A_serine_proteases/rivoire_sectors.npz +0 -0
  18. cocoatree/datasets/data/rhomboid_proteases/2NRF.pdb +3300 -0
  19. cocoatree/datasets/data/rhomboid_proteases/Data_S1_Rhomboid_MSA_short_names.fasta +5534 -0
  20. cocoatree/datasets/data/rhomboid_proteases/rhomboid_metadata_clean.csv +2766 -0
  21. cocoatree/datasets/data/rhomboid_proteases/rhomboid_sectors.npz +0 -0
  22. cocoatree/datasets/tests/test_datasets.py +14 -0
  23. cocoatree/decomposition.py +263 -0
  24. cocoatree/io.py +185 -0
  25. cocoatree/msa.py +579 -0
  26. cocoatree/pysca.py +238 -0
  27. cocoatree/randomize.py +30 -0
  28. cocoatree/scripts/cocoatree-sca.py +6 -0
  29. cocoatree/statistics/__init__.py +58 -0
  30. cocoatree/statistics/pairwise.py +318 -0
  31. cocoatree/statistics/position.py +258 -0
  32. cocoatree/tests/test_init.py +24 -0
  33. cocoatree/tests/test_msa.py +14 -0
  34. cocoatree/visualization.py +440 -0
  35. cocoatree-0.1.0rc0.dev2.dist-info/METADATA +66 -0
  36. cocoatree-0.1.0rc0.dev2.dist-info/RECORD +39 -0
  37. cocoatree-0.1.0rc0.dev2.dist-info/WHEEL +5 -0
  38. cocoatree-0.1.0rc0.dev2.dist-info/licenses/LICENSE +28 -0
  39. cocoatree-0.1.0rc0.dev2.dist-info/top_level.txt +1 -0
cocoatree/msa.py ADDED
@@ -0,0 +1,579 @@
+ from Bio.Seq import Seq
+ from Bio.SeqRecord import SeqRecord
+ from Bio.Align import MultipleSeqAlignment, substitution_matrices
+ import numpy as np
+ import sklearn.metrics as sn
+ from .__params import lett2num
+ from joblib import Parallel, delayed
+
+
+ def _clean_msa(msa):
+     """
+     Compare the amino acid codes in the sequence alignment with the ones in
+     lett2num and replace unknown amino acids (such as 'X' or 'B') with gaps
+     when importing the multiple sequence alignment.
+
+     Parameters
+     ----------
+     msa : Bio.Align.MultipleSeqAlignment object
+     """
+
+     for record in msa:
+         # Replace any character absent from lett2num with a gap
+         sequence = "".join(char if char in lett2num else '-'
+                            for char in record.seq)
+         record.seq = Seq(sequence)
+
+     return msa
+
+
+ def filter_sequences(sequences, sequences_id,
+                      gap_threshold=0.4, seq_threshold=0.2,
+                      verbose=False):
+     """
+     Filter sequences
+
+     Remove (1) overly gapped positions, then (2) overly gapped sequences.
+
+     Parameters
+     ----------
+     sequences : list of MSA sequences to filter
+
+     sequences_id : list of the MSA's sequence identifiers
+
+     gap_threshold : float,
+         maximum proportion of gaps tolerated per position (default=0.4)
+
+     seq_threshold : float,
+         maximum proportion of gaps tolerated per sequence (default=0.2)
+
+     Returns
+     -------
+     filtered_seqs : list of the remaining sequences (as strings) after
+         applying both filters
+
+     filtered_seqs_id : list of sequence identifiers that were kept after
+         applying both filters
+
+     remaining_pos : numpy.ndarray
+         remaining positions after filtering
+     """
+
+     updated_sequences, remaining_pos = _filter_gap_pos(
+         sequences, threshold=gap_threshold,
+         verbose=verbose)
+     filtered_seqs, filtered_seqs_id = _filter_gap_seq(
+         updated_sequences, sequences_id,
+         threshold=seq_threshold, verbose=verbose)
+
+     return filtered_seqs, filtered_seqs_id, remaining_pos
+
+
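For orientation, here is a minimal usage sketch of `filter_sequences` on a made-up three-sequence toy alignment (sequences, identifiers, and thresholds are illustrative only): the heavily gapped fourth column is dropped first, after which the still-gappy third sequence is removed.

```python
from cocoatree.msa import filter_sequences

sequences = ["MKV-A", "MKL-A", "M-LGA"]   # toy MSA, 5 columns
sequences_id = ["seq1", "seq2", "seq3"]

seqs, ids, kept_pos = filter_sequences(sequences, sequences_id,
                                       gap_threshold=0.4,
                                       seq_threshold=0.2)
# kept_pos -> array([0, 1, 2, 4]): column 3 is gapped in 2/3 of sequences
# ids -> ['seq1', 'seq2']: the filtered seq3 ('M-LA') is 25% gaps, above 0.2
```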
+ def _filter_gap_pos(sequences, threshold=0.4, verbose=False):
+     """Filter the sequences for overly gapped positions.
+
+     Parameters
+     ----------
+     sequences : list of the MSA sequences to filter
+
+     threshold : max proportion of gaps tolerated (default=0.4)
+
+     Returns
+     -------
+     updated_seqs : updated list of sequences with the overly gapped
+         positions removed
+
+     remaining_pos : numpy.ndarray
+         remaining positions after filtering
+     """
+
+     if verbose:
+         print("Filter MSA for overly gapped positions")
+
+     Nseq, Npos = len(sequences), len(sequences[0])
+
+     gaps = np.array([[int(sequences[seq][pos] == '-') for pos in range(Npos)]
+                      for seq in range(Nseq)])
+
+     freq_gap_per_pos = np.sum(gaps, axis=0) / Nseq
+
+     remaining_pos = np.where(freq_gap_per_pos <= threshold)[0]
+
+     if verbose:
+         print("Keeping %i out of %i positions" % (len(remaining_pos), Npos))
+
+     updated_seqs = ["".join([sequences[seq][pos] for pos in remaining_pos])
+                     for seq in range(Nseq)]
+
+     return updated_seqs, remaining_pos
+
+
+ def _filter_gap_seq(sequences, sequences_id, threshold=0.2, verbose=False):
+     """
+     Remove sequences with a fraction of gaps greater than a specified
+     value.
+
+     Parameters
+     ----------
+     sequences : list of MSA sequences
+
+     sequences_id : list of the MSA's sequence identifiers
+
+     threshold : maximum fraction of gaps per sequence (default 0.2)
+
+     Returns
+     -------
+     filt_seqs : filtered list of sequences
+
+     filt_seqs_id : corresponding list of sequence identifiers
+     """
+
+     if verbose:
+         print('Filter MSA for overly gapped sequences')
+
+     Nseq, Npos = len(sequences), len(sequences[0])
+
+     freq_gap_per_seq = np.array([sequences[seq].count('-') / Npos
+                                  for seq in range(Nseq)])
+
+     filt_seqs_ix = np.where(freq_gap_per_seq <= threshold)[0]
+     if verbose:
+         print('Keeping %i sequences out of %i sequences' %
+               (len(filt_seqs_ix), Nseq))
+
+     filt_seqs = [sequences[seq] for seq in filt_seqs_ix]
+     filt_seqs_id = [sequences_id[seq] for seq in filt_seqs_ix]
+
+     return filt_seqs, filt_seqs_id
+
+
+ def filter_ref_seq(sequences, sequences_id, delta=0.2, refseq_id=None,
+                    verbose=False):
+     '''
+     Filter the alignment based on identity with a reference sequence
+
+     Remove sequences *r* with Sr < delta, where Sr is the fractional identity
+     between the sequence *r* and a specified reference sequence.
+
+     Parameters
+     ----------
+     sequences : list of sequences in the MSA
+
+     sequences_id : list of sequence identifiers in the MSA
+
+     delta : identity threshold (default=0.2)
+
+     refseq_id : identifier of the reference sequence; if None, the reference
+         sequence is chosen as the sequence whose mean pairwise identity is
+         closest to that of the entire sequence alignment (default None)
+
+     Returns
+     -------
+     filt_seqs : filtered list of sequences
+
+     filt_seqs_id : corresponding list of sequence identifiers
+     '''
+
+     Nseq = len(sequences)
+
+     if refseq_id is None:
+         if verbose:
+             print('Choose a default reference sequence within the alignment')
+         refseq_idx = _choose_ref_seq(sequences)
+     else:
+         if verbose:
+             # refseq_id is an identifier (usually a string), hence %s
+             print('Reference sequence is: %s' % refseq_id)
+         refseq_idx = sequences_id.index(refseq_id)
+
+     sim_matrix = compute_seq_identity(sequences)
+     filt_seqs_ix = np.where(sim_matrix[refseq_idx] >= delta)[0]
+     filt_seqs = [sequences[seq] for seq in filt_seqs_ix]
+     filt_seqs_id = [sequences_id[seq] for seq in filt_seqs_ix]
+
+     if verbose:
+         print('Keeping %i out of %i sequences' % (len(filt_seqs), Nseq))
+
+     return filt_seqs, filt_seqs_id
+
+
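`filter_ref_seq` builds on `compute_seq_identity` defined further down; a minimal sketch with made-up sequences and identifiers (the delta value is illustrative only):

```python
from cocoatree.msa import filter_ref_seq

sequences = ["MKVLA", "MKVLG", "WWWWW"]
sequences_id = ["seq1", "seq2", "seq3"]

# Keep sequences sharing at least 40% identity with seq1; seq3 is dropped
filt_seqs, filt_ids = filter_ref_seq(sequences, sequences_id,
                                     delta=0.4, refseq_id="seq1")
```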
+ def _choose_ref_seq(msa):
+     """
+     Determine a reference sequence for the alignment
+
+     This function chooses a default reference sequence for the alignment by
+     taking the sequence whose mean pairwise sequence identity is closest
+     to that of the entire sequence alignment.
+
+     Parameters
+     ----------
+     msa : the multiple sequence alignment as a list of sequences
+
+     Returns
+     -------
+     The index of the reference sequence in the given alignment
+     """
+
+     sim_matrix = compute_seq_identity(msa)
+
+     mean_pairwise_seq_sim = np.mean(sim_matrix, axis=0)
+
+     # Pick the sequence whose mean pairwise identity is closest to the
+     # alignment-wide mean (a bare argmin would instead pick the least
+     # similar sequence)
+     ref_seq = np.argmin(
+         np.abs(mean_pairwise_seq_sim - np.mean(sim_matrix)))
+
+     return ref_seq
+
+
+ def filter_seq_id(sequences, sequences_id, list_id):
+     """
+     Filter sequences based on a list of identifiers
+
+     Filter a multiple sequence alignment to keep only sequences whose
+     identifiers are in a user-provided list.
+
+     Parameters
+     ----------
+     sequences : list of MSA sequences
+
+     sequences_id : list of the MSA's sequence identifiers
+
+     list_id : list of sequence identifiers the user wants to keep. The
+         identifiers must be in the same format as in the input MSA
+
+     Returns
+     -------
+     new_msa : Bio.Align.MultipleSeqAlignment object,
+         filtered MSA
+
+     id_list : list of sequence IDs in the filtered MSA
+
+     seq_list : list of sequences of the filtered MSA
+     """
+     new_msa = MultipleSeqAlignment([])
+     # Pair each identifier with its sequence to keep the lookup linear
+     for ident, sequence in zip(sequences_id, sequences):
+         if ident in list_id:
+             new_msa.append(SeqRecord(Seq(sequence), id=ident))
+
+     seq_list = []
+     id_list = []
+     for record in new_msa:
+         seq_list.append(str(record.seq))
+         id_list.append(record.id)
+     seq_list = np.array(seq_list)
+
+     return [new_msa, id_list, seq_list]
+
+
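A quick sketch of `filter_seq_id` on toy data (note that the function returns a three-item list, which unpacks like a tuple):

```python
from cocoatree.msa import filter_seq_id

new_msa, kept_ids, kept_seqs = filter_seq_id(
    sequences=["MKVLA", "MKVLG"],
    sequences_id=["seq1", "seq2"],
    list_id=["seq2"])
# kept_ids -> ['seq2']; kept_seqs -> array(['MKVLG'], dtype='<U5')
```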
+ def map_to_pdb(pdb_seq, pdb_pos, sequences, sequences_id, pdb_seq_id):
+     """
+     Map the unfiltered MSA positions onto a PDB structure.
+
+     Parameters
+     ----------
+     pdb_seq : str,
+         amino acid sequence of the reference PDB file
+
+     pdb_pos : list,
+         residue positions as found in the PDB file
+
+     sequences : list,
+         list of sequences of the unfiltered MSA
+
+     sequences_id : list,
+         list of sequence identifiers in the unfiltered MSA
+
+     pdb_seq_id : str,
+         identifier of the sequence the positions are mapped onto. Must be
+         included in sequences_id.
+
+     Returns
+     -------
+     mapping : numpy.ndarray of shape (3, len(pdb_seq)),
+         the first row holds the residues found in the PDB sequence, the
+         second row holds the PDB position of each amino acid, and the
+         third row holds the positions of those same amino acids in the
+         unfiltered MSA
+     """
+     msa_pos = []
+     pdb_seq_idx = sequences_id.index(pdb_seq_id)
+     # Record the MSA column index of every non-gap residue
+     for aa_index, aa in enumerate(sequences[pdb_seq_idx]):
+         if aa != '-':
+             msa_pos.append(aa_index)
+
+     mapping = np.array((list(pdb_seq), pdb_pos, msa_pos))
+
+     return mapping
+
+
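A sketch of the mapping layout with made-up coordinates: a 3-residue PDB sequence numbered 16-18, aligned to an MSA sequence that starts with one gap (numpy casts the mixed rows to strings):

```python
from cocoatree.msa import map_to_pdb

mapping = map_to_pdb(pdb_seq="IVG", pdb_pos=[16, 17, 18],
                     sequences=["-IVG", "AIVG"],
                     sequences_id=["pdb_seq", "other"],
                     pdb_seq_id="pdb_seq")
# Rows: residues, PDB numbering, MSA column indices (all cast to str)
# mapping[2] -> ['1', '2', '3']
```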
+ def compute_seq_identity(sequences):
+     """
+     Compute the pairwise identity between sequences in an MSA, as one
+     minus the normalized Hamming distance
+
+     Parameters
+     ----------
+     sequences : list of sequences
+
+     Returns
+     -------
+     sim_matrix : identity matrix of shape (Nseq, Nseq)
+     """
+
+     separated_aa = np.array([[lett2num[char] for char in row]
+                              for row in sequences])
+
+     sim_matrix = 1 - sn.DistanceMetric.get_metric(
+         "hamming").pairwise(separated_aa)
+
+     return sim_matrix
+
+
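The identity definition is easy to check by hand: two toy 5-residue sequences differing at one column have a normalized Hamming distance of 0.2, hence an identity of 0.8:

```python
from cocoatree.msa import compute_seq_identity

sim = compute_seq_identity(["MKVLA", "MKVLG"])
# sim[0, 1] -> 0.8 (4 of 5 columns match); the diagonal is 1.0
```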
+ def compute_seq_weights(sequences, threshold=0.8, verbose_every=0,
+                         n_jobs=1, verbose_parallel=5):
+     """
+     Compute sequence weights
+
+     Each sequence s is given a weight ws = 1/Ns, where Ns is the number of
+     sequences with an identity to s at or above a specified threshold.
+
+     Parameters
+     ----------
+     sequences : list of sequences
+
+     threshold : float, optional, default: 0.8
+         fractional identity at or above which two sequences are considered
+         identical
+
+     verbose_every : int
+         if > 0, print progress every {verbose_every} sequences
+
+     n_jobs : int, default=1 (no parallelization)
+         the maximum number of concurrently running jobs
+         (see joblib doc)
+
+     verbose_parallel : int
+         verbosity level for parallelization
+         (see joblib doc)
+
+     Returns
+     -------
+     weights : np.array (Nseq, ) of sequence weights
+
+     m_eff : float
+         number of effective sequences
+     """
+     if threshold < 0 or threshold > 1:
+         raise ValueError(
+             "The threshold needs to be between 0 and 1." +
+             f" Value provided {threshold}")
+
+     sequences_num = np.array([[lett2num[char] for char in row]
+                               for row in sequences])
+
+     if n_jobs == 1:
+         seq_weights = []
+         for iseq, seq in enumerate(sequences_num):
+             if verbose_every and iseq % verbose_every == 0:
+                 print('computing weight of seq %d/%d\t' %
+                       (iseq + 1, len(sequences_num)), end='\r')
+             sim = 1 - sn.DistanceMetric.get_metric(
+                 "hamming").pairwise([seq], sequences_num)
+             seq_weights.append(1 / np.sum(sim >= threshold))
+     else:
+         def _weight_f(sequence, sequence_list):
+             # Identity of `sequence` to every sequence in the alignment;
+             # the weight is 1 over the count of near-identical sequences
+             sim = 1 - sn.DistanceMetric.get_metric(
+                 "hamming").pairwise([sequence], sequence_list)
+             return 1 / np.sum(sim >= threshold)
+
+         seq_weights = Parallel(n_jobs=n_jobs, verbose=verbose_parallel)(
+             delayed(_weight_f)(seq, sequences_num) for seq in sequences_num)
+
+     seq_weights = np.array(seq_weights)
+     m_eff = sum(seq_weights)
+
+     return seq_weights, m_eff
+
+
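The weighting rule ws = 1/Ns can be verified on a toy alignment with two duplicate sequences and one distinct one: each duplicate counts for 1/2, the singleton for 1, and m_eff comes out to 2:

```python
from cocoatree.msa import compute_seq_weights

weights, m_eff = compute_seq_weights(["MKVLA", "MKVLA", "GWYFC"],
                                     threshold=0.8)
# weights -> array([0.5, 0.5, 1. ]); m_eff -> 2.0
```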
+ def map_msa_positions(n_loaded_pos, remaining_pos):
+     """
+     Map positions between the original and the filtered MSA
+
+     Parameters
+     ----------
+     n_loaded_pos : int,
+         number of positions in the original unfiltered MSA
+
+     remaining_pos : np.ndarray,
+         array containing the indices of positions that were kept after
+         filtering the MSA (output from cocoatree.msa.filter_sequences)
+
+     Returns
+     -------
+     original2filtered : dictionary,
+         the keys are the positions in the original MSA and the values are
+         the corresponding positions in the filtered MSA. When the original
+         position has been filtered out, the value is None.
+
+     filtered2original : dictionary,
+         the keys are the positions in the filtered MSA and the values are
+         the corresponding positions in the original MSA.
+     """
+     # For each original position, a cumulative count of kept positions
+     # gives its index in the filtered MSA; filtered-out positions map to
+     # None
+     kept = np.isin(np.arange(n_loaded_pos), remaining_pos)
+     mapping = [int(val) if f else None
+                for f, val in zip(kept, kept.cumsum() - 1)]
+     original2filtered = {i: t for i, t in enumerate(mapping)}
+
+     filtered2original = dict()
+
+     for pos in range(len(remaining_pos)):
+         filtered2original[pos] = int(remaining_pos[pos])
+
+     return original2filtered, filtered2original
+
+
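For five original positions of which [0, 2, 4] survive, the two dictionaries look as follows (toy values):

```python
import numpy as np
from cocoatree.msa import map_msa_positions

o2f, f2o = map_msa_positions(5, np.array([0, 2, 4]))
# o2f -> {0: 0, 1: None, 2: 1, 3: None, 4: 2}
# f2o -> {0: 0, 1: 2, 2: 4}
```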
+ def compute_seq_similarity(sequences, subst_matrix='BLOSUM62', gap_penalty=-4,
+                            n_jobs=1, verbose_parallel=0):
+     """
+     Compute a similarity matrix using a precalculated substitution matrix.
+
+     The similarity score for a pair of sequences is obtained as the sum of
+     the substitution scores at each position of the sequence pair.
+
+     Parameters
+     ----------
+     sequences : list of str,
+         list of Nseq MSA sequences.
+     subst_matrix : str, default='BLOSUM62'
+         name of the substitution matrix.
+         Call `Bio.Align.substitution_matrices.load()` to obtain a list of
+         available substitution matrices.
+     gap_penalty : int, default=-4
+         penalty score for gaps. You can adjust this parameter to reflect
+         biological assumptions (e.g., -1 for mild, -10 for harsh).
+     n_jobs : int, default=1 (no parallelization)
+         the maximum number of concurrently running jobs (-1 uses all
+         available cores)
+     verbose_parallel : int, default=0
+         verbosity level for parallelization (see joblib doc)
+
+     Returns
+     -------
+     similarity_matrix : np.ndarray,
+         a (Nseq, Nseq) array of similarity scores.
+     """
+     matrix = substitution_matrices.load(subst_matrix)
+     n = len(sequences)
+     seq_length = len(sequences[0])
+
+     if not all(len(seq) == seq_length for seq in sequences):
+         raise ValueError("All sequences must be of equal length.")
+
+     seq_array = np.array([list(seq) for seq in sequences])
+
+     def score_pair(i, j):
+         a_seq = seq_array[i]
+         b_seq = seq_array[j]
+         # Gap-gap pairs score 0, single gaps score the gap penalty,
+         # residue pairs score their substitution-matrix entry
+         score = sum(
+             0 if a == '-' and b == '-'
+             else gap_penalty if a == '-' or b == '-'
+             else matrix.get((a, b))
+             for a, b in zip(a_seq, b_seq)
+         )
+         return i, j, score
+
+     # Score each unordered pair once, then mirror into a symmetric matrix
+     results = Parallel(n_jobs=n_jobs, verbose=verbose_parallel)(
+         delayed(score_pair)(i, j) for i in range(n) for j in range(i, n)
+     )
+
+     similarity_matrix = np.zeros((n, n), dtype=int)
+     for i, j, score in results:
+         similarity_matrix[i, j] = score
+         similarity_matrix[j, i] = score
+
+     return similarity_matrix
+
+
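The scoring rule can be checked by hand on a two-column toy pair: BLOSUM62 scores A/A as 4, and the gapped second column contributes the gap penalty:

```python
from cocoatree.msa import compute_seq_similarity

sim = compute_seq_similarity(["AC", "A-"], gap_penalty=-4)
# sim[0, 1] -> 4 + (-4) = 0; sim[0, 0] -> 4 + 9 = 13 (BLOSUM62 C/C is 9)
```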
+ def compute_normalized_seq_similarity(sequences, subst_matrix='BLOSUM62',
+                                       gap_penalty=-4, n_jobs=1,
+                                       verbose_parallel=0):
+     """
+     Compute a normalized similarity matrix using a precalculated
+     substitution matrix.
+
+     Each pairwise similarity score is normalized by the maximum possible
+     score for the pair of sequences (i.e., the larger of the two scores
+     obtained by comparing each sequence to itself).
+
+     Parameters
+     ----------
+     sequences : list of str,
+         list of Nseq MSA sequences.
+     subst_matrix : str, default='BLOSUM62'
+         name of the substitution matrix.
+         Call `Bio.Align.substitution_matrices.load()` to obtain a list of
+         available substitution matrices.
+     gap_penalty : int, default=-4
+         penalty score for gaps. You can adjust this parameter to reflect
+         biological assumptions (e.g., -1 for mild, -10 for harsh).
+     n_jobs : int, default=1 (no parallelization)
+         the maximum number of concurrently running jobs (-1 uses all
+         available cores)
+     verbose_parallel : int, default=0
+         verbosity level for parallelization (see joblib doc)
+
+     Returns
+     -------
+     similarity_matrix : np.ndarray,
+         a (Nseq, Nseq) array of normalized similarity scores (0.0 to 1.0).
+     """
+     matrix = substitution_matrices.load(subst_matrix)
+     n_seq = len(sequences)
+     seq_length = len(sequences[0])
+
+     if not all(len(seq) == seq_length for seq in sequences):
+         raise ValueError("All sequences must be of equal length.")
+
+     seq_array = np.array([list(seq) for seq in sequences])
+
+     def score_pair(i, j):
+         a_seq = seq_array[i]
+         b_seq = seq_array[j]
+         score = sum(
+             0 if a == '-' and b == '-'
+             else gap_penalty if a == '-' or b == '-'
+             else matrix.get((a, b))
+             for a, b in zip(a_seq, b_seq)
+         )
+         return i, j, score
+
+     def max_score(i):
+         # Score of sequence i against itself, ignoring gapped positions
+         seq = seq_array[i]
+         return sum(
+             0 if a == '-' else matrix.get((a, a), -1)
+             for a in seq
+         )
+
+     # Compute maximum scores for normalization
+     max_scores = Parallel(n_jobs=n_jobs, verbose=verbose_parallel)(
+         delayed(max_score)(i) for i in range(n_seq)
+     )
+
+     # Compute pairwise scores
+     results = Parallel(n_jobs=n_jobs, verbose=verbose_parallel)(
+         delayed(score_pair)(i, j) for i in range(n_seq)
+         for j in range(i, n_seq)
+     )
+
+     similarity_matrix = np.zeros((n_seq, n_seq), dtype=float)
+     for i, j, score in results:
+         max_possible = max(max_scores[i], max_scores[j])
+         normalized = score / max_possible if max_possible != 0 else 0.0
+         similarity_matrix[i, j] = normalized
+         similarity_matrix[j, i] = normalized
+
+     return similarity_matrix
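Continuing the toy example above, the normalization divides each pair score by the larger self-score, so identical sequences score exactly 1.0:

```python
from cocoatree.msa import compute_normalized_seq_similarity

sim = compute_normalized_seq_similarity(["AC", "AC"])
# Pair score 13 (4 + 9 under BLOSUM62) over self-score 13 -> sim[0, 1] == 1.0
```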