biotite 1.0.1__cp311-cp311-macosx_11_0_arm64.whl → 1.1.0__cp311-cp311-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biotite might be problematic. Click here for more details.

Files changed (90) hide show
  1. biotite/application/dssp/app.py +13 -3
  2. biotite/application/localapp.py +34 -0
  3. biotite/application/muscle/app3.py +2 -15
  4. biotite/application/muscle/app5.py +2 -2
  5. biotite/application/util.py +1 -1
  6. biotite/application/viennarna/rnaplot.py +6 -2
  7. biotite/database/rcsb/query.py +6 -6
  8. biotite/database/uniprot/check.py +20 -15
  9. biotite/database/uniprot/download.py +1 -1
  10. biotite/database/uniprot/query.py +1 -1
  11. biotite/sequence/align/alignment.py +16 -3
  12. biotite/sequence/align/banded.cpython-311-darwin.so +0 -0
  13. biotite/sequence/align/banded.pyx +5 -5
  14. biotite/sequence/align/kmeralphabet.cpython-311-darwin.so +0 -0
  15. biotite/sequence/align/kmeralphabet.pyx +17 -0
  16. biotite/sequence/align/kmersimilarity.cpython-311-darwin.so +0 -0
  17. biotite/sequence/align/kmertable.cpython-311-darwin.so +0 -0
  18. biotite/sequence/align/kmertable.pyx +52 -42
  19. biotite/sequence/align/localgapped.cpython-311-darwin.so +0 -0
  20. biotite/sequence/align/localungapped.cpython-311-darwin.so +0 -0
  21. biotite/sequence/align/matrix.py +273 -55
  22. biotite/sequence/align/matrix_data/3Di.mat +24 -0
  23. biotite/sequence/align/matrix_data/PB.license +21 -0
  24. biotite/sequence/align/matrix_data/PB.mat +18 -0
  25. biotite/sequence/align/multiple.cpython-311-darwin.so +0 -0
  26. biotite/sequence/align/pairwise.cpython-311-darwin.so +0 -0
  27. biotite/sequence/align/permutation.cpython-311-darwin.so +0 -0
  28. biotite/sequence/align/selector.cpython-311-darwin.so +0 -0
  29. biotite/sequence/align/tracetable.cpython-311-darwin.so +0 -0
  30. biotite/sequence/alphabet.py +3 -0
  31. biotite/sequence/codec.cpython-311-darwin.so +0 -0
  32. biotite/sequence/graphics/color_schemes/3di_flower.json +48 -0
  33. biotite/sequence/graphics/color_schemes/pb_flower.json +2 -1
  34. biotite/sequence/graphics/colorschemes.py +44 -11
  35. biotite/sequence/phylo/nj.cpython-311-darwin.so +0 -0
  36. biotite/sequence/phylo/tree.cpython-311-darwin.so +0 -0
  37. biotite/sequence/phylo/upgma.cpython-311-darwin.so +0 -0
  38. biotite/sequence/profile.py +86 -4
  39. biotite/sequence/seqtypes.py +124 -3
  40. biotite/setup_ccd.py +197 -0
  41. biotite/structure/__init__.py +4 -3
  42. biotite/structure/alphabet/__init__.py +25 -0
  43. biotite/structure/alphabet/encoder.py +332 -0
  44. biotite/structure/alphabet/encoder_weights_3di.kerasify +0 -0
  45. biotite/structure/alphabet/i3d.py +110 -0
  46. biotite/structure/alphabet/layers.py +86 -0
  47. biotite/structure/alphabet/pb.license +21 -0
  48. biotite/structure/alphabet/pb.py +171 -0
  49. biotite/structure/alphabet/unkerasify.py +122 -0
  50. biotite/structure/atoms.py +129 -40
  51. biotite/structure/bonds.cpython-311-darwin.so +0 -0
  52. biotite/structure/bonds.pyx +72 -21
  53. biotite/structure/celllist.cpython-311-darwin.so +0 -0
  54. biotite/structure/charges.cpython-311-darwin.so +0 -0
  55. biotite/structure/geometry.py +60 -113
  56. biotite/structure/info/__init__.py +1 -0
  57. biotite/structure/info/atoms.py +13 -13
  58. biotite/structure/info/bonds.py +12 -6
  59. biotite/structure/info/ccd.py +125 -32
  60. biotite/structure/info/{ccd/components.bcif → components.bcif} +0 -0
  61. biotite/structure/info/groups.py +63 -17
  62. biotite/structure/info/masses.py +9 -6
  63. biotite/structure/info/misc.py +15 -21
  64. biotite/structure/info/standardize.py +3 -2
  65. biotite/structure/io/mol/sdf.py +41 -40
  66. biotite/structure/io/pdb/convert.py +2 -0
  67. biotite/structure/io/pdb/file.py +74 -3
  68. biotite/structure/io/pdb/hybrid36.cpython-311-darwin.so +0 -0
  69. biotite/structure/io/pdbqt/file.py +32 -32
  70. biotite/structure/io/pdbx/__init__.py +1 -0
  71. biotite/structure/io/pdbx/bcif.py +32 -8
  72. biotite/structure/io/pdbx/cif.py +72 -59
  73. biotite/structure/io/pdbx/component.py +9 -4
  74. biotite/structure/io/pdbx/compress.py +321 -0
  75. biotite/structure/io/pdbx/convert.py +194 -48
  76. biotite/structure/io/pdbx/encoding.cpython-311-darwin.so +0 -0
  77. biotite/structure/io/pdbx/encoding.pyx +98 -17
  78. biotite/structure/molecules.py +141 -141
  79. biotite/structure/sasa.cpython-311-darwin.so +0 -0
  80. biotite/structure/segments.py +1 -2
  81. biotite/structure/util.py +73 -1
  82. biotite/version.py +2 -2
  83. {biotite-1.0.1.dist-info → biotite-1.1.0.dist-info}/METADATA +3 -1
  84. {biotite-1.0.1.dist-info → biotite-1.1.0.dist-info}/RECORD +86 -76
  85. biotite/structure/info/ccd/README.rst +0 -8
  86. biotite/structure/info/ccd/amino_acids.txt +0 -1663
  87. biotite/structure/info/ccd/carbohydrates.txt +0 -1135
  88. biotite/structure/info/ccd/nucleotides.txt +0 -798
  89. {biotite-1.0.1.dist-info → biotite-1.1.0.dist-info}/WHEEL +0 -0
  90. {biotite-1.0.1.dist-info → biotite-1.1.0.dist-info}/licenses/LICENSE.rst +0 -0
@@ -2,14 +2,21 @@
2
2
  # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
3
3
  # information.
4
4
 
5
+ __all__ = ["SubstitutionMatrix"]
5
6
  __name__ = "biotite.sequence.align"
6
7
  __author__ = "Patrick Kunzmann"
7
8
 
8
- import os
9
+ import functools
10
+ from pathlib import Path
9
11
  import numpy as np
10
- from biotite.sequence.seqtypes import NucleotideSequence, ProteinSequence
12
+ from biotite.sequence.seqtypes import (
13
+ NucleotideSequence,
14
+ PositionalSequence,
15
+ ProteinSequence,
16
+ )
11
17
 
12
- __all__ = ["SubstitutionMatrix"]
18
+ # Directory of matrix files
19
+ _DB_DIR = Path(__file__).parent / "matrix_data"
13
20
 
14
21
 
15
22
  class SubstitutionMatrix(object):
@@ -59,6 +66,11 @@ class SubstitutionMatrix(object):
59
66
  - **RBLOSUM<n>_<BLOCKS>**
60
67
  - **CorBLOSUM<n>_<BLOCKS>**
61
68
 
69
+ - Structural alphabet substitution matrices
70
+
71
+ - **3Di** - For 3Di alphabet from ``foldseek`` :footcite:`VanKempen2024`
72
+ - **PB** - For Protein Blocks alphabet from *PBxplore* :footcite:`Barnoud2017`
73
+
62
74
  A list of all available matrix names is returned by
63
75
  :meth:`list_db()`.
64
76
 
@@ -78,6 +90,11 @@ class SubstitutionMatrix(object):
78
90
  or a dictionary mapping the symbol pairing to scores,
79
91
  or a string referencing a matrix in the internal database.
80
92
 
93
+ Attributes
94
+ ----------
95
+ shape : tuple
96
+ The shape of the substitution matrix.
97
+
81
98
  Raises
82
99
  ------
83
100
  KeyError
@@ -110,7 +127,7 @@ class SubstitutionMatrix(object):
110
127
  Creating an identity substitution matrix via the score matrix:
111
128
 
112
129
  >>> alph = NucleotideSequence.alphabet_unamb
113
- >>> matrix = SubstitutionMatrix(alph, alph, np.identity(len(alph)))
130
+ >>> matrix = SubstitutionMatrix(alph, alph, np.identity(len(alph), dtype=int))
114
131
  >>> print(matrix)
115
132
  A C G T
116
133
  A 1 0 0 0
@@ -124,9 +141,6 @@ class SubstitutionMatrix(object):
124
141
  >>> matrix = SubstitutionMatrix(alph, alph, "BLOSUM50")
125
142
  """
126
143
 
127
- # Directory of matrix files
128
- _db_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "matrix_data")
129
-
130
144
  def __init__(self, alphabet1, alphabet2, score_matrix):
131
145
  self._alph1 = alphabet1
132
146
  self._alph2 = alphabet2
@@ -139,7 +153,21 @@ class SubstitutionMatrix(object):
139
153
  f"Matrix has shape {score_matrix.shape}, "
140
154
  f"but {alph_shape} is required"
141
155
  )
156
+ if not np.issubdtype(score_matrix.dtype, np.integer):
157
+ raise TypeError("Score matrix must be an integer ndarray")
142
158
  self._matrix = score_matrix.astype(np.int32)
159
+ # If the score matrix was converted from a float matrix,
160
+ # inf values would be converted to 2**31,
161
+ # which is probably undesired and gives overflow issues in the alignment
162
+ # functions
163
+ if (
164
+ np.any(self._matrix == np.iinfo(np.int32).max) or
165
+ np.any(self._matrix == np.iinfo(np.int32).min)
166
+ ): # fmt: skip
167
+ raise ValueError(
168
+ "Score values are too large. "
169
+ "Maybe it was converted from a float matrix containing inf values?"
170
+ )
143
171
  elif isinstance(score_matrix, str):
144
172
  matrix_dict = SubstitutionMatrix.dict_from_db(score_matrix)
145
173
  self._fill_with_matrix_dict(matrix_dict)
@@ -151,34 +179,18 @@ class SubstitutionMatrix(object):
151
179
  # score matrix -> make the score matrix read-only
152
180
  self._matrix.setflags(write=False)
153
181
 
154
- def __repr__(self):
155
- """Represent SubstitutionMatrix as a string for debugging."""
156
- return (
157
- f"SubstitutionMatrix({self._alph1.__repr__()}, {self._alph2.__repr__()}, "
158
- f"np.{np.array_repr(self._matrix)})"
159
- )
160
-
161
- def __eq__(self, item):
162
- if not isinstance(item, SubstitutionMatrix):
163
- return False
164
- if self._alph1 != item.get_alphabet1():
165
- return False
166
- if self._alph2 != item.get_alphabet2():
167
- return False
168
- if not np.array_equal(self.score_matrix(), item.score_matrix()):
169
- return False
170
- return True
171
-
172
- def __ne__(self, item):
173
- return not self == item
182
+ @property
183
+ def shape(self):
184
+ """
185
+ Get the shape (i.e. the length of both alphabets)
186
+ of the substitution matrix.
174
187
 
175
- def _fill_with_matrix_dict(self, matrix_dict):
176
- self._matrix = np.zeros((len(self._alph1), len(self._alph2)), dtype=np.int32)
177
- for i in range(len(self._alph1)):
178
- for j in range(len(self._alph2)):
179
- sym1 = self._alph1.decode(i)
180
- sym2 = self._alph2.decode(j)
181
- self._matrix[i, j] = int(matrix_dict[sym1, sym2])
188
+ Returns
189
+ -------
190
+ shape : tuple
191
+ Matrix shape.
192
+ """
193
+ return (len(self._alph1), len(self._alph2))
182
194
 
183
195
  def get_alphabet1(self):
184
196
  """
@@ -280,26 +292,155 @@ class SubstitutionMatrix(object):
280
292
  code2 = self._alph2.encode(symbol2)
281
293
  return self._matrix[code1, code2]
282
294
 
283
- def shape(self):
295
+ def as_positional(self, sequence1, sequence2):
284
296
  """
285
- Get the shape (i.e. the length of both alphabets)
286
- of the subsitution matrix.
297
+ Transform this substitution matrix and two sequences into positional
298
+ equivalents.
299
+
300
+ This means the new substitution matrix is position-specific: It has the lengths
301
+ of the sequences instead of the lengths of their alphabets.
302
+ Its scores represent the same scores as the original matrix, but now mapped
303
+ onto the positions of the sequences.
304
+
305
+ Parameters
306
+ ----------
307
+ sequence1, sequence2 : seq.Sequence, length=n
308
+ The sequences to create the positional equivalents from.
287
309
 
288
310
  Returns
289
311
  -------
290
- shape : tuple
291
- Matrix shape.
312
+ pos_matrix : align.SubstitutionMatrix, shape=(n, n)
313
+ The position-specific substitution matrix.
314
+ pos_sequence1, pos_sequence2 : PositionalSequence, length=n
315
+ The positional sequences.
316
+
317
+ Notes
318
+ -----
319
+ After the transformation the substitution scores remain the same, i.e.
320
+ `substitution_matrix.get_score(sequence1[i], sequence2[j])` is equal to
321
+ `pos_matrix.get_score(pos_sequence1[i], pos_sequence2[j])`.
322
+
323
+ Examples
324
+ --------
325
+
326
+ Run an alignment with the usual substitution matrix:
327
+
328
+ >>> seq1 = ProteinSequence("BIQTITE")
329
+ >>> seq2 = ProteinSequence("IQLITE")
330
+ >>> matrix = SubstitutionMatrix.std_protein_matrix()
331
+ >>> print(matrix)
332
+ A C D E F G H I K L M N P Q R S T V W Y B Z X *
333
+ A 4 0 -2 -1 -2 0 -2 -1 -1 -1 -1 -2 -1 -1 -1 1 0 0 -3 -2 -2 -1 0 -4
334
+ C 0 9 -3 -4 -2 -3 -3 -1 -3 -1 -1 -3 -3 -3 -3 -1 -1 -1 -2 -2 -3 -3 -2 -4
335
+ D -2 -3 6 2 -3 -1 -1 -3 -1 -4 -3 1 -1 0 -2 0 -1 -3 -4 -3 4 1 -1 -4
336
+ E -1 -4 2 5 -3 -2 0 -3 1 -3 -2 0 -1 2 0 0 -1 -2 -3 -2 1 4 -1 -4
337
+ F -2 -2 -3 -3 6 -3 -1 0 -3 0 0 -3 -4 -3 -3 -2 -2 -1 1 3 -3 -3 -1 -4
338
+ G 0 -3 -1 -2 -3 6 -2 -4 -2 -4 -3 0 -2 -2 -2 0 -2 -3 -2 -3 -1 -2 -1 -4
339
+ H -2 -3 -1 0 -1 -2 8 -3 -1 -3 -2 1 -2 0 0 -1 -2 -3 -2 2 0 0 -1 -4
340
+ I -1 -1 -3 -3 0 -4 -3 4 -3 2 1 -3 -3 -3 -3 -2 -1 3 -3 -1 -3 -3 -1 -4
341
+ K -1 -3 -1 1 -3 -2 -1 -3 5 -2 -1 0 -1 1 2 0 -1 -2 -3 -2 0 1 -1 -4
342
+ L -1 -1 -4 -3 0 -4 -3 2 -2 4 2 -3 -3 -2 -2 -2 -1 1 -2 -1 -4 -3 -1 -4
343
+ M -1 -1 -3 -2 0 -3 -2 1 -1 2 5 -2 -2 0 -1 -1 -1 1 -1 -1 -3 -1 -1 -4
344
+ N -2 -3 1 0 -3 0 1 -3 0 -3 -2 6 -2 0 0 1 0 -3 -4 -2 3 0 -1 -4
345
+ P -1 -3 -1 -1 -4 -2 -2 -3 -1 -3 -2 -2 7 -1 -2 -1 -1 -2 -4 -3 -2 -1 -2 -4
346
+ Q -1 -3 0 2 -3 -2 0 -3 1 -2 0 0 -1 5 1 0 -1 -2 -2 -1 0 3 -1 -4
347
+ R -1 -3 -2 0 -3 -2 0 -3 2 -2 -1 0 -2 1 5 -1 -1 -3 -3 -2 -1 0 -1 -4
348
+ S 1 -1 0 0 -2 0 -1 -2 0 -2 -1 1 -1 0 -1 4 1 -2 -3 -2 0 0 0 -4
349
+ T 0 -1 -1 -1 -2 -2 -2 -1 -1 -1 -1 0 -1 -1 -1 1 5 0 -2 -2 -1 -1 0 -4
350
+ V 0 -1 -3 -2 -1 -3 -3 3 -2 1 1 -3 -2 -2 -3 -2 0 4 -3 -1 -3 -2 -1 -4
351
+ W -3 -2 -4 -3 1 -2 -2 -3 -3 -2 -1 -4 -4 -2 -3 -3 -2 -3 11 2 -4 -3 -2 -4
352
+ Y -2 -2 -3 -2 3 -3 2 -1 -2 -1 -1 -2 -3 -1 -2 -2 -2 -1 2 7 -3 -2 -1 -4
353
+ B -2 -3 4 1 -3 -1 0 -3 0 -4 -3 3 -2 0 -1 0 -1 -3 -4 -3 4 1 -1 -4
354
+ Z -1 -3 1 4 -3 -2 0 -3 1 -3 -1 0 -1 3 0 0 -1 -2 -3 -2 1 4 -1 -4
355
+ X 0 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2 -1 -1 0 0 -1 -2 -1 -1 -1 -1 -4
356
+ * -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 1
357
+ >>> alignment = align_optimal(seq1, seq2, matrix, gap_penalty=-10)[0]
358
+ >>> print(alignment)
359
+ BIQTITE
360
+ -IQLITE
361
+
362
+ Running the alignment with positional equivalents gives the same result:
363
+
364
+ >>> pos_matrix, pos_seq1, pos_seq2 = matrix.as_positional(seq1, seq2)
365
+ >>> print(pos_matrix)
366
+ I Q L I T E
367
+ B -3 0 -4 -3 -1 1
368
+ I 4 -3 2 4 -1 -3
369
+ Q -3 5 -2 -3 -1 2
370
+ T -1 -1 -1 -1 5 -1
371
+ I 4 -3 2 4 -1 -3
372
+ T -1 -1 -1 -1 5 -1
373
+ E -3 2 -3 -3 -1 5
374
+ >>> pos_alignment = align_optimal(pos_seq1, pos_seq2, pos_matrix, gap_penalty=-10)[0]
375
+ >>> print(pos_alignment)
376
+ BIQTITE
377
+ -IQLITE
378
+
379
+ Increase the substitution score for the first symbols in both sequences to align
380
+ to each other:
381
+
382
+ >>> score_matrix = pos_matrix.score_matrix().copy()
383
+ >>> score_matrix[0, 0] = 100
384
+ >>> biased_matrix = SubstitutionMatrix(
385
+ ... pos_matrix.get_alphabet1(), pos_matrix.get_alphabet2(), score_matrix
386
+ ... )
387
+ >>> print(biased_matrix)
388
+ I Q L I T E
389
+ B 100 0 -4 -3 -1 1
390
+ I 4 -3 2 4 -1 -3
391
+ Q -3 5 -2 -3 -1 2
392
+ T -1 -1 -1 -1 5 -1
393
+ I 4 -3 2 4 -1 -3
394
+ T -1 -1 -1 -1 5 -1
395
+ E -3 2 -3 -3 -1 5
396
+ >>> biased_alignment = align_optimal(pos_seq1, pos_seq2, biased_matrix, gap_penalty=-10)[0]
397
+ >>> print(biased_alignment)
398
+ BIQTITE
399
+ I-QLITE
292
400
  """
293
- return (len(self._alph1), len(self._alph2))
401
+ pos_sequence1 = PositionalSequence(sequence1)
402
+ pos_sequence2 = PositionalSequence(sequence2)
403
+
404
+ pos_score_matrix = self._matrix[
405
+ tuple(_cartesian_product(sequence1.code, sequence2.code).T)
406
+ ].reshape(len(sequence1), len(sequence2))
407
+ pos_matrix = SubstitutionMatrix(
408
+ pos_sequence1.get_alphabet(),
409
+ pos_sequence2.get_alphabet(),
410
+ pos_score_matrix,
411
+ )
412
+
413
+ return pos_matrix, pos_sequence1, pos_sequence2
414
+
415
+ def __repr__(self):
416
+ """Represent SubstitutionMatrix as a string for debugging."""
417
+ return (
418
+ f"SubstitutionMatrix({self._alph1.__repr__()}, {self._alph2.__repr__()}, "
419
+ f"np.{np.array_repr(self._matrix)})"
420
+ )
421
+
422
+ def __eq__(self, item):
423
+ if not isinstance(item, SubstitutionMatrix):
424
+ return False
425
+ if self._alph1 != item.get_alphabet1():
426
+ return False
427
+ if self._alph2 != item.get_alphabet2():
428
+ return False
429
+ if not np.array_equal(self.score_matrix(), item.score_matrix()):
430
+ return False
431
+ return True
432
+
433
+ def __ne__(self, item):
434
+ return not self == item
294
435
 
295
436
  def __str__(self):
296
437
  # Create matrix in NCBI format
297
438
  string = " "
298
439
  for symbol in self._alph2:
299
- string += f" {symbol:>3}"
440
+ string += f" {str(symbol):>3}"
300
441
  string += "\n"
301
442
  for i, symbol in enumerate(self._alph1):
302
- string += f"{symbol:>1}"
443
+ string += f"{str(symbol):>1}"
303
444
  for j in range(len(self._alph2)):
304
445
  string += f" {int(self._matrix[i,j]):>3d}"
305
446
  string += "\n"
@@ -350,7 +491,7 @@ class SubstitutionMatrix(object):
350
491
  matrix_dict : dict
351
492
  A dictionary representing the substitution matrix.
352
493
  """
353
- filename = SubstitutionMatrix._db_dir + os.sep + matrix_name + ".mat"
494
+ filename = _DB_DIR / f"{matrix_name}.mat"
354
495
  with open(filename, "r") as f:
355
496
  return SubstitutionMatrix.dict_from_str(f.read())
356
497
 
@@ -364,11 +505,10 @@ class SubstitutionMatrix(object):
364
505
  db_list : list
365
506
  List of matrix names in the internal database.
366
507
  """
367
- files = os.listdir(SubstitutionMatrix._db_dir)
368
- # Remove '.mat' from files
369
- return [file[:-4] for file in sorted(files)]
508
+ return [path.stem for path in _DB_DIR.glob("*.mat")]
370
509
 
371
510
  @staticmethod
511
+ @functools.cache
372
512
  def std_protein_matrix():
373
513
  """
374
514
  Get the default :class:`SubstitutionMatrix` for protein sequence
@@ -379,9 +519,12 @@ class SubstitutionMatrix(object):
379
519
  matrix : SubstitutionMatrix
380
520
  Default matrix.
381
521
  """
382
- return _matrix_blosum62
522
+ return SubstitutionMatrix(
523
+ ProteinSequence.alphabet, ProteinSequence.alphabet, "BLOSUM62"
524
+ )
383
525
 
384
526
  @staticmethod
527
+ @functools.cache
385
528
  def std_nucleotide_matrix():
386
529
  """
387
530
  Get the default :class:`SubstitutionMatrix` for DNA sequence
@@ -392,13 +535,88 @@ class SubstitutionMatrix(object):
392
535
  matrix : SubstitutionMatrix
393
536
  Default matrix.
394
537
  """
395
- return _matrix_nuc
538
+ return SubstitutionMatrix(
539
+ NucleotideSequence.alphabet_amb, NucleotideSequence.alphabet_amb, "NUC"
540
+ )
396
541
 
542
+ @staticmethod
543
+ @functools.cache
544
+ def std_3di_matrix():
545
+ """
546
+ Get the default :class:`SubstitutionMatrix` for 3Di sequence
547
+ alignments.
548
+ :footcite:`VanKempen2024`
397
549
 
398
- # Preformatted BLOSUM62 and NUC substitution matrix from NCBI
399
- _matrix_blosum62 = SubstitutionMatrix(
400
- ProteinSequence.alphabet, ProteinSequence.alphabet, "BLOSUM62"
401
- )
402
- _matrix_nuc = SubstitutionMatrix(
403
- NucleotideSequence.alphabet_amb, NucleotideSequence.alphabet_amb, "NUC"
404
- )
550
+ Returns
551
+ -------
552
+ matrix : SubstitutionMatrix
553
+ Default matrix.
554
+ """
555
+ # Import inside function to avoid circular import
556
+ from biotite.structure.alphabet.i3d import I3DSequence
557
+
558
+ return SubstitutionMatrix(I3DSequence.alphabet, I3DSequence.alphabet, "3Di")
559
+
560
+ @staticmethod
561
+ @functools.cache
562
+ def std_protein_blocks_matrix(undefined_match=200, undefined_mismatch=-200):
563
+ """
564
+ Get the default :class:`SubstitutionMatrix` for Protein Blocks sequences.
565
+
566
+ The matrix is adapted from *PBxplore* :footcite:`Barnoud2017`.
567
+
568
+ Parameters
569
+ ----------
570
+ undefined_match, undefined_mismatch : int, optional
571
+ The match and mismatch score for undefined symbols.
572
+ The default values were chosen arbitrarily, but are in the order of
573
+ magnitude of the other score values.
574
+
575
+ Returns
576
+ -------
577
+ matrix : SubstitutionMatrix
578
+ Default matrix.
579
+
580
+ References
581
+ ----------
582
+
583
+ .. footbibliography::
584
+
585
+ """
586
+ from biotite.structure.alphabet.pb import ProteinBlocksSequence
587
+
588
+ alphabet = ProteinBlocksSequence.alphabet
589
+ undefined_symbol = ProteinBlocksSequence.undefined_symbol
590
+ matrix_dict = SubstitutionMatrix.dict_from_db("PB")
591
+ # Add match/mismatch scores for the undefined symbol
592
+ for symbol in alphabet:
593
+ if symbol == undefined_symbol:
594
+ continue
595
+ matrix_dict[symbol, undefined_symbol] = undefined_mismatch
596
+ matrix_dict[undefined_symbol, symbol] = undefined_mismatch
597
+ matrix_dict[undefined_symbol, undefined_symbol] = undefined_match
598
+ return SubstitutionMatrix(
599
+ alphabet,
600
+ alphabet,
601
+ matrix_dict,
602
+ )
603
+
604
+ def _fill_with_matrix_dict(self, matrix_dict):
605
+ self._matrix = np.zeros((len(self._alph1), len(self._alph2)), dtype=np.int32)
606
+ for i in range(len(self._alph1)):
607
+ for j in range(len(self._alph2)):
608
+ sym1 = self._alph1.decode(i)
609
+ sym2 = self._alph2.decode(j)
610
+ self._matrix[i, j] = int(matrix_dict[sym1, sym2])
611
+
612
+
613
+ def _cartesian_product(array1, array2):
614
+ """
615
+ Create all combinations of elements from two arrays.
616
+ """
617
+ return np.transpose(
618
+ [
619
+ np.repeat(array1, len(array2)),
620
+ np.tile(array2, len(array1)),
621
+ ]
622
+ )
@@ -0,0 +1,24 @@
1
+ # 3Di bit/2
2
+ # Background (precomputed optional): 0.0489372 0.0306991 0.101049 0.0329671 0.0276149 0.0416262 0.0452521 0.030876 0.0297251 0.0607036 0.0150238 0.0215826 0.0783843 0.0512926 0.0264886 0.0610702 0.0201311 0.215998 0.0310265 0.0295417 0.00001
3
+ # Lambda (precomputed optional): 0.351568
4
+ a c d e f g h i k l m n p q r s t v w y
5
+ a 6 -3 1 2 3 -2 -2 -7 -3 -3 -10 -5 -1 1 -4 -7 -5 -6 0 -2
6
+ c -3 6 -2 -8 -5 -4 -4 -12 -13 1 -14 0 0 1 -1 0 -8 1 -7 -9
7
+ d 1 -2 4 -3 0 1 1 -3 -5 -4 -5 -2 1 -1 -1 -4 -2 -3 -2 -2
8
+ e 2 -8 -3 9 -2 -7 -4 -12 -10 -7 -17 -8 -6 -3 -8 -10 -10 -13 -6 -3
9
+ f 3 -5 0 -2 7 -3 -3 -5 1 -3 -9 -5 -2 2 -5 -8 -3 -7 4 -4
10
+ g -2 -4 1 -7 -3 6 3 0 -7 -7 -1 -2 -2 -4 3 -3 4 -6 -4 -2
11
+ h -2 -4 1 -4 -3 3 6 -4 -7 -6 -6 0 -1 -3 1 -3 -1 -5 -5 3
12
+ i -7 -12 -3 -12 -5 0 -4 8 -5 -11 7 -7 -6 -6 -3 -9 6 -12 -5 -8
13
+ k -3 -13 -5 -10 1 -7 -7 -5 9 -11 -8 -12 -6 -5 -9 -14 -5 -15 5 -8
14
+ l -3 1 -4 -7 -3 -7 -6 -11 -11 6 -16 -3 -2 2 -4 -4 -9 0 -8 -9
15
+ m -10 -14 -5 -17 -9 -1 -6 7 -8 -16 10 -9 -9 -10 -5 -10 3 -16 -6 -9
16
+ n -5 0 -2 -8 -5 -2 0 -7 -12 -3 -9 7 0 -2 2 3 -4 0 -8 -5
17
+ p -1 0 1 -6 -2 -2 -1 -6 -6 -2 -9 0 4 0 0 -2 -4 0 -4 -5
18
+ q 1 1 -1 -3 2 -4 -3 -6 -5 2 -10 -2 0 5 -2 -4 -5 -1 -2 -5
19
+ r -4 -1 -1 -8 -5 3 1 -3 -9 -4 -5 2 0 -2 6 2 0 -1 -6 -3
20
+ s -7 0 -4 -10 -8 -3 -3 -9 -14 -4 -10 3 -2 -4 2 6 -6 0 -11 -9
21
+ t -5 -8 -2 -10 -3 4 -1 6 -5 -9 3 -4 -4 -5 0 -6 8 -9 -5 -5
22
+ v -6 1 -3 -13 -7 -6 -5 -12 -15 0 -16 0 0 -1 -1 0 -9 3 -10 -11
23
+ w 0 -7 -2 -6 4 -4 -5 -5 5 -8 -6 -8 -4 -2 -6 -11 -5 -10 8 -6
24
+ y -2 -9 -2 -3 -4 -2 3 -8 -8 -9 -9 -5 -5 -5 -3 -9 -5 -11 -6 9
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2013 Poulain, A. G. de Brevern
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,18 @@
1
+ # PB substitution matrix, adapted from PBxplore
2
+ a b c d e f g h i j k l m n o p
3
+ a 516 -59 113 -105 -411 -177 -27 -361 47 -103 -644 -259 -599 -372 -124 -83
4
+ b -59 541 -146 -210 -155 -310 -97 90 182 -128 -30 29 -745 -242 -165 22
5
+ c 113 -146 360 -14 -333 -240 49 -438 -269 -282 -688 -682 -608 -455 -147 6
6
+ d -105 -210 -14 221 5 -131 -349 -278 -253 -173 -585 -670 -1573 -1048 -691 -497
7
+ e -411 -155 -333 5 520 185 186 138 -378 -70 -112 -514 -1136 -469 -617 -632
8
+ f -177 -310 -240 -131 185 459 -99 -45 -445 83 -214 -88 -547 -629 -406 -552
9
+ g -27 -97 49 -349 186 -99 665 -99 -89 -118 -409 -138 -124 172 128 254
10
+ h -361 90 -438 -278 138 -45 -99 632 -205 316 192 -108 -712 -359 95 -399
11
+ i 47 182 -269 -253 -378 -445 -89 -205 696 186 8 15 -709 -269 -169 226
12
+ j -103 -128 -282 -173 -70 83 -118 316 186 768 196 5 -398 -340 -117 -104
13
+ k -644 -30 -688 -585 -112 -214 -409 192 8 196 568 -65 -270 -231 -471 -382
14
+ l -259 29 -682 -670 -514 -88 -138 -108 15 5 -65 533 -131 8 -11 -316
15
+ m -599 -745 -608 -1573 -1136 -547 -124 -712 -709 -398 -270 -131 241 -4 -190 -155
16
+ n -372 -242 -455 -1048 -469 -629 172 -359 -269 -340 -231 8 -4 703 88 146
17
+ o -124 -165 -147 -691 -617 -406 128 95 -169 -117 -471 -11 -190 88 716 58
18
+ p -83 22 6 -497 -632 -552 254 -399 226 -104 -382 -316 -155 146 58 609
@@ -410,6 +410,9 @@ class LetterAlphabet(Alphabet):
410
410
  symbols = symbols.astype("U1")
411
411
  return symbols
412
412
 
413
+ def is_letter_alphabet(self):
414
+ return True
415
+
413
416
  def __contains__(self, symbol):
414
417
  if not isinstance(symbol, (str, bytes)):
415
418
  return False
@@ -0,0 +1,48 @@
1
+ {
2
+ "comment": "Generated with 'gecos --matrix 3Di --name flower --lmin 60 --lmax 80 -f 3di_flower.json'",
3
+ "name": "flower",
4
+ "alphabet": [
5
+ "a",
6
+ "c",
7
+ "d",
8
+ "e",
9
+ "f",
10
+ "g",
11
+ "h",
12
+ "i",
13
+ "k",
14
+ "l",
15
+ "m",
16
+ "n",
17
+ "p",
18
+ "q",
19
+ "r",
20
+ "s",
21
+ "t",
22
+ "v",
23
+ "w",
24
+ "y"
25
+ ],
26
+ "colors": {
27
+ "a": "#a189a1",
28
+ "c": "#ff5806",
29
+ "d": "#ab9a93",
30
+ "e": "#e754d5",
31
+ "f": "#8191b5",
32
+ "g": "#cbc7ae",
33
+ "h": "#dac1bc",
34
+ "i": "#5eaf6e",
35
+ "k": "#04c1fd",
36
+ "l": "#ff544b",
37
+ "m": "#07e560",
38
+ "n": "#f28d05",
39
+ "p": "#b68767",
40
+ "q": "#bc8277",
41
+ "r": "#eebe86",
42
+ "s": "#ffa103",
43
+ "t": "#a4c49a",
44
+ "v": "#ed6903",
45
+ "w": "#3a97d8",
46
+ "y": "#f7adfd"
47
+ }
48
+ }
@@ -16,7 +16,8 @@
16
16
  "m",
17
17
  "n",
18
18
  "o",
19
- "p"
19
+ "p",
20
+ "z"
20
21
  ],
21
22
  "colors": {
22
23
  "a": "#31b5fc",
@@ -94,27 +94,32 @@ def get_color_scheme(name, alphabet, default="#FFFFFF"):
94
94
  >>> print(color_scheme)
95
95
  ['#3737f5', '#37f537', '#f5f537', '#f53737']
96
96
  """
97
+ # Try exact alphabet match first
98
+ for scheme in _color_schemes:
99
+ if scheme["name"] == name and scheme["alphabet"] == alphabet:
100
+ return _fit_color_scheme(alphabet, scheme, default)
101
+ # If no exact match was found, try to find a scheme for an alphabet
102
+ # that extends the given alphabet
97
103
  for scheme in _color_schemes:
98
104
  if scheme["name"] == name and scheme["alphabet"].extends(alphabet):
99
- colors = scheme["colors"]
100
- # Replace None values with default color
101
- colors = [color if color is not None else default for color in colors]
102
- # Only return colors that are in scope of this alphabet
103
- # and not the extended alphabet
104
- return colors[: len(alphabet)]
105
+ return _fit_color_scheme(alphabet, scheme, default)
106
+
105
107
  raise ValueError(f"Unkown scheme '{name}' for given alphabet")
106
108
 
107
109
 
108
- def list_color_scheme_names(alphabet):
110
+ def list_color_scheme_names(alphabet, strict=False):
109
111
  """
110
112
  Get a list of available color scheme names for a given alphabet.
111
113
 
112
114
  Parameters
113
115
  ----------
114
116
  alphabet : Alphabet
115
- The alphbet to get the color scheme names for.
116
- The alphabet of the scheme must equal or extend this parameter,
117
- to be included in the list.
117
+ The alphabet to get the color scheme names for.
118
+ strict : bool, optional
119
+ If set to true, only schemes with an exact match to the given
120
+ alphabet are included in the list.
121
+ If set to false, schemes with an alphabet that extends the given
122
+ alphabet are also included.
118
123
 
119
124
  Returns
120
125
  -------
@@ -123,7 +128,9 @@ def list_color_scheme_names(alphabet):
123
128
  """
124
129
  scheme_list = []
125
130
  for scheme in _color_schemes:
126
- if scheme["alphabet"].extends(alphabet):
131
+ if strict and scheme["alphabet"] == alphabet:
132
+ scheme_list.append(scheme["name"])
133
+ if not strict and scheme["alphabet"].extends(alphabet):
127
134
  scheme_list.append(scheme["name"])
128
135
  return scheme_list
129
136
 
@@ -135,3 +142,29 @@ _color_schemes = []
135
142
  for file_name in glob.glob(_scheme_dir + os.sep + "*.json"):
136
143
  scheme = load_color_scheme(file_name)
137
144
  _color_schemes.append(scheme)
145
+
146
+
147
+ def _fit_color_scheme(alphabet, color_scheme, default_color):
148
+ """
149
+ Fit a color scheme to the given alphabet.
150
+
151
+ Parameters
152
+ ----------
153
+ alphabet : Alphabet
154
+ The alphabet to get the color scheme for.
155
+ color_scheme : dict
156
+ The color scheme.
157
+ default_color : str or tuple
158
+ The default color.
159
+
160
+ Returns
161
+ -------
162
+ scheme : list of str
163
+ The colors from the scheme.
164
+ """
165
+ colors = color_scheme["colors"]
166
+ # Replace None values with default color
167
+ colors = [color if color is not None else default_color for color in colors]
168
+ # Only return colors that are in scope of this alphabet
169
+ # and not the extended alphabet
170
+ return colors[: len(alphabet)]