biotite 1.0.1__cp312-cp312-win_amd64.whl → 1.2.0__cp312-cp312-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biotite might be problematic. Click here for more details.

Files changed (177)
  1. biotite/application/application.py +3 -3
  2. biotite/application/autodock/app.py +1 -1
  3. biotite/application/blast/webapp.py +1 -1
  4. biotite/application/clustalo/app.py +1 -1
  5. biotite/application/dssp/app.py +13 -3
  6. biotite/application/localapp.py +36 -2
  7. biotite/application/msaapp.py +10 -10
  8. biotite/application/muscle/app3.py +5 -18
  9. biotite/application/muscle/app5.py +5 -5
  10. biotite/application/sra/app.py +0 -5
  11. biotite/application/util.py +22 -2
  12. biotite/application/viennarna/rnaalifold.py +8 -8
  13. biotite/application/viennarna/rnaplot.py +9 -3
  14. biotite/application/viennarna/util.py +1 -1
  15. biotite/application/webapp.py +1 -1
  16. biotite/database/afdb/__init__.py +12 -0
  17. biotite/database/afdb/download.py +191 -0
  18. biotite/database/entrez/dbnames.py +10 -0
  19. biotite/database/entrez/download.py +9 -10
  20. biotite/database/entrez/key.py +1 -1
  21. biotite/database/entrez/query.py +5 -4
  22. biotite/database/pubchem/download.py +6 -6
  23. biotite/database/pubchem/error.py +10 -0
  24. biotite/database/pubchem/query.py +12 -23
  25. biotite/database/rcsb/download.py +3 -2
  26. biotite/database/rcsb/query.py +8 -9
  27. biotite/database/uniprot/check.py +22 -17
  28. biotite/database/uniprot/download.py +3 -6
  29. biotite/database/uniprot/query.py +4 -5
  30. biotite/file.py +14 -2
  31. biotite/interface/__init__.py +19 -0
  32. biotite/interface/openmm/__init__.py +16 -0
  33. biotite/interface/openmm/state.py +93 -0
  34. biotite/interface/openmm/system.py +227 -0
  35. biotite/interface/pymol/__init__.py +198 -0
  36. biotite/interface/pymol/cgo.py +346 -0
  37. biotite/interface/pymol/convert.py +185 -0
  38. biotite/interface/pymol/display.py +267 -0
  39. biotite/interface/pymol/object.py +1226 -0
  40. biotite/interface/pymol/shapes.py +178 -0
  41. biotite/interface/pymol/startup.py +169 -0
  42. biotite/interface/rdkit/__init__.py +15 -0
  43. biotite/interface/rdkit/mol.py +490 -0
  44. biotite/interface/version.py +71 -0
  45. biotite/interface/warning.py +19 -0
  46. biotite/sequence/align/__init__.py +0 -4
  47. biotite/sequence/align/alignment.py +49 -14
  48. biotite/sequence/align/banded.cp312-win_amd64.pyd +0 -0
  49. biotite/sequence/align/banded.pyx +26 -26
  50. biotite/sequence/align/cigar.py +2 -2
  51. biotite/sequence/align/kmeralphabet.cp312-win_amd64.pyd +0 -0
  52. biotite/sequence/align/kmeralphabet.pyx +19 -2
  53. biotite/sequence/align/kmersimilarity.cp312-win_amd64.pyd +0 -0
  54. biotite/sequence/align/kmertable.cp312-win_amd64.pyd +0 -0
  55. biotite/sequence/align/kmertable.pyx +58 -48
  56. biotite/sequence/align/localgapped.cp312-win_amd64.pyd +0 -0
  57. biotite/sequence/align/localgapped.pyx +47 -47
  58. biotite/sequence/align/localungapped.cp312-win_amd64.pyd +0 -0
  59. biotite/sequence/align/localungapped.pyx +10 -10
  60. biotite/sequence/align/matrix.py +284 -57
  61. biotite/sequence/align/matrix_data/3Di.mat +24 -0
  62. biotite/sequence/align/matrix_data/PB.license +21 -0
  63. biotite/sequence/align/matrix_data/PB.mat +18 -0
  64. biotite/sequence/align/multiple.cp312-win_amd64.pyd +0 -0
  65. biotite/sequence/align/pairwise.cp312-win_amd64.pyd +0 -0
  66. biotite/sequence/align/pairwise.pyx +35 -35
  67. biotite/sequence/align/permutation.cp312-win_amd64.pyd +0 -0
  68. biotite/sequence/align/selector.cp312-win_amd64.pyd +0 -0
  69. biotite/sequence/align/selector.pyx +2 -2
  70. biotite/sequence/align/statistics.py +1 -1
  71. biotite/sequence/align/tracetable.cp312-win_amd64.pyd +0 -0
  72. biotite/sequence/alphabet.py +5 -2
  73. biotite/sequence/annotation.py +19 -13
  74. biotite/sequence/codec.cp312-win_amd64.pyd +0 -0
  75. biotite/sequence/codon.py +1 -2
  76. biotite/sequence/graphics/alignment.py +25 -39
  77. biotite/sequence/graphics/color_schemes/3di_flower.json +48 -0
  78. biotite/sequence/graphics/color_schemes/pb_flower.json +2 -1
  79. biotite/sequence/graphics/colorschemes.py +44 -11
  80. biotite/sequence/graphics/dendrogram.py +4 -2
  81. biotite/sequence/graphics/features.py +2 -2
  82. biotite/sequence/graphics/logo.py +10 -12
  83. biotite/sequence/io/fasta/convert.py +1 -2
  84. biotite/sequence/io/fasta/file.py +1 -1
  85. biotite/sequence/io/fastq/file.py +3 -3
  86. biotite/sequence/io/genbank/file.py +3 -3
  87. biotite/sequence/io/genbank/sequence.py +2 -0
  88. biotite/sequence/io/gff/convert.py +1 -1
  89. biotite/sequence/io/gff/file.py +1 -2
  90. biotite/sequence/phylo/nj.cp312-win_amd64.pyd +0 -0
  91. biotite/sequence/phylo/tree.cp312-win_amd64.pyd +0 -0
  92. biotite/sequence/phylo/upgma.cp312-win_amd64.pyd +0 -0
  93. biotite/sequence/profile.py +105 -29
  94. biotite/sequence/search.py +0 -1
  95. biotite/sequence/seqtypes.py +136 -8
  96. biotite/sequence/sequence.py +1 -2
  97. biotite/setup_ccd.py +197 -0
  98. biotite/structure/__init__.py +6 -3
  99. biotite/structure/alphabet/__init__.py +25 -0
  100. biotite/structure/alphabet/encoder.py +332 -0
  101. biotite/structure/alphabet/encoder_weights_3di.kerasify +0 -0
  102. biotite/structure/alphabet/i3d.py +109 -0
  103. biotite/structure/alphabet/layers.py +86 -0
  104. biotite/structure/alphabet/pb.license +21 -0
  105. biotite/structure/alphabet/pb.py +170 -0
  106. biotite/structure/alphabet/unkerasify.py +128 -0
  107. biotite/structure/atoms.py +163 -66
  108. biotite/structure/basepairs.py +26 -26
  109. biotite/structure/bonds.cp312-win_amd64.pyd +0 -0
  110. biotite/structure/bonds.pyx +79 -25
  111. biotite/structure/box.py +19 -21
  112. biotite/structure/celllist.cp312-win_amd64.pyd +0 -0
  113. biotite/structure/celllist.pyx +83 -67
  114. biotite/structure/chains.py +5 -37
  115. biotite/structure/charges.cp312-win_amd64.pyd +0 -0
  116. biotite/structure/compare.py +420 -13
  117. biotite/structure/density.py +1 -1
  118. biotite/structure/dotbracket.py +27 -28
  119. biotite/structure/filter.py +8 -8
  120. biotite/structure/geometry.py +74 -127
  121. biotite/structure/hbond.py +17 -19
  122. biotite/structure/info/__init__.py +1 -0
  123. biotite/structure/info/atoms.py +24 -15
  124. biotite/structure/info/bonds.py +12 -6
  125. biotite/structure/info/ccd.py +125 -34
  126. biotite/structure/info/{ccd/components.bcif → components.bcif} +0 -0
  127. biotite/structure/info/groups.py +62 -19
  128. biotite/structure/info/masses.py +9 -6
  129. biotite/structure/info/misc.py +15 -22
  130. biotite/structure/info/radii.py +92 -22
  131. biotite/structure/info/standardize.py +4 -4
  132. biotite/structure/integrity.py +4 -6
  133. biotite/structure/io/general.py +2 -2
  134. biotite/structure/io/gro/file.py +8 -9
  135. biotite/structure/io/mol/convert.py +1 -1
  136. biotite/structure/io/mol/ctab.py +33 -28
  137. biotite/structure/io/mol/mol.py +1 -1
  138. biotite/structure/io/mol/sdf.py +80 -53
  139. biotite/structure/io/pdb/convert.py +4 -3
  140. biotite/structure/io/pdb/file.py +85 -25
  141. biotite/structure/io/pdb/hybrid36.cp312-win_amd64.pyd +0 -0
  142. biotite/structure/io/pdbqt/file.py +36 -36
  143. biotite/structure/io/pdbx/__init__.py +1 -0
  144. biotite/structure/io/pdbx/bcif.py +54 -15
  145. biotite/structure/io/pdbx/cif.py +92 -66
  146. biotite/structure/io/pdbx/component.py +15 -4
  147. biotite/structure/io/pdbx/compress.py +321 -0
  148. biotite/structure/io/pdbx/convert.py +410 -75
  149. biotite/structure/io/pdbx/encoding.cp312-win_amd64.pyd +0 -0
  150. biotite/structure/io/pdbx/encoding.pyx +98 -17
  151. biotite/structure/io/trajfile.py +9 -6
  152. biotite/structure/io/util.py +38 -0
  153. biotite/structure/mechanics.py +0 -1
  154. biotite/structure/molecules.py +141 -156
  155. biotite/structure/pseudoknots.py +7 -13
  156. biotite/structure/repair.py +2 -4
  157. biotite/structure/residues.py +13 -24
  158. biotite/structure/rings.py +335 -0
  159. biotite/structure/sasa.cp312-win_amd64.pyd +0 -0
  160. biotite/structure/sasa.pyx +2 -1
  161. biotite/structure/segments.py +69 -11
  162. biotite/structure/sequence.py +0 -1
  163. biotite/structure/sse.py +0 -2
  164. biotite/structure/superimpose.py +74 -62
  165. biotite/structure/tm.py +581 -0
  166. biotite/structure/transform.py +12 -25
  167. biotite/structure/util.py +76 -4
  168. biotite/version.py +9 -4
  169. biotite/visualize.py +111 -1
  170. {biotite-1.0.1.dist-info → biotite-1.2.0.dist-info}/METADATA +6 -2
  171. {biotite-1.0.1.dist-info → biotite-1.2.0.dist-info}/RECORD +173 -143
  172. biotite/structure/info/ccd/README.rst +0 -8
  173. biotite/structure/info/ccd/amino_acids.txt +0 -1663
  174. biotite/structure/info/ccd/carbohydrates.txt +0 -1135
  175. biotite/structure/info/ccd/nucleotides.txt +0 -798
  176. {biotite-1.0.1.dist-info → biotite-1.2.0.dist-info}/WHEEL +0 -0
  177. {biotite-1.0.1.dist-info → biotite-1.2.0.dist-info}/licenses/LICENSE.rst +0 -0
@@ -2,14 +2,21 @@
2
2
  # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
3
3
  # information.
4
4
 
5
+ __all__ = ["SubstitutionMatrix"]
5
6
  __name__ = "biotite.sequence.align"
6
7
  __author__ = "Patrick Kunzmann"
7
8
 
8
- import os
9
+ import functools
10
+ from pathlib import Path
9
11
  import numpy as np
10
- from biotite.sequence.seqtypes import NucleotideSequence, ProteinSequence
12
+ from biotite.sequence.seqtypes import (
13
+ NucleotideSequence,
14
+ PositionalSequence,
15
+ ProteinSequence,
16
+ )
11
17
 
12
- __all__ = ["SubstitutionMatrix"]
18
+ # Directory of matrix files
19
+ _DB_DIR = Path(__file__).parent / "matrix_data"
13
20
 
14
21
 
15
22
  class SubstitutionMatrix(object):
@@ -59,6 +66,11 @@ class SubstitutionMatrix(object):
59
66
  - **RBLOSUM<n>_<BLOCKS>**
60
67
  - **CorBLOSUM<n>_<BLOCKS>**
61
68
 
69
+ - Structural alphabet substitution matrices
70
+
71
+ - **3Di** - For 3Di alphabet from ``foldseek`` :footcite:`VanKempen2024`
72
+ - **PB** - For Protein Blocks alphabet from *PBxplore* :footcite:`Barnoud2017`
73
+
62
74
  A list of all available matrix names is returned by
63
75
  :meth:`list_db()`.
64
76
 
@@ -78,6 +90,11 @@ class SubstitutionMatrix(object):
78
90
  or a dictionary mapping the symbol pairing to scores,
79
91
  or a string referencing a matrix in the internal database.
80
92
 
93
+ Attributes
94
+ ----------
95
+ shape : tuple
96
+ The shape of the substitution matrix.
97
+
81
98
  Raises
82
99
  ------
83
100
  KeyError
@@ -110,7 +127,7 @@ class SubstitutionMatrix(object):
110
127
  Creating an identity substitution matrix via the score matrix:
111
128
 
112
129
  >>> alph = NucleotideSequence.alphabet_unamb
113
- >>> matrix = SubstitutionMatrix(alph, alph, np.identity(len(alph)))
130
+ >>> matrix = SubstitutionMatrix(alph, alph, np.identity(len(alph), dtype=int))
114
131
  >>> print(matrix)
115
132
  A C G T
116
133
  A 1 0 0 0
@@ -124,9 +141,6 @@ class SubstitutionMatrix(object):
124
141
  >>> matrix = SubstitutionMatrix(alph, alph, "BLOSUM50")
125
142
  """
126
143
 
127
- # Directory of matrix files
128
- _db_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "matrix_data")
129
-
130
144
  def __init__(self, alphabet1, alphabet2, score_matrix):
131
145
  self._alph1 = alphabet1
132
146
  self._alph2 = alphabet2
@@ -139,46 +153,44 @@ class SubstitutionMatrix(object):
139
153
  f"Matrix has shape {score_matrix.shape}, "
140
154
  f"but {alph_shape} is required"
141
155
  )
156
+ if not np.issubdtype(score_matrix.dtype, np.integer):
157
+ raise TypeError("Score matrix must be an integer ndarray")
142
158
  self._matrix = score_matrix.astype(np.int32)
159
+ # If the score matrix was converted from a float matrix,
160
+ # inf values would be converted to 2**31,
161
+ # which is probably undesired and gives overflow issues in the alignment
162
+ # functions
163
+ if (
164
+ np.any(self._matrix == np.iinfo(np.int32).max) or
165
+ np.any(self._matrix == np.iinfo(np.int32).min)
166
+ ): # fmt: skip
167
+ raise ValueError(
168
+ "Score values are too large. "
169
+ "Maybe it was converted from a float matrix containing inf values?"
170
+ )
143
171
  elif isinstance(score_matrix, str):
144
172
  matrix_dict = SubstitutionMatrix.dict_from_db(score_matrix)
145
173
  self._fill_with_matrix_dict(matrix_dict)
146
174
  else:
147
175
  raise TypeError(
148
- "Matrix must be either a dictionary, " "an 2-D ndarray or a string"
176
+ "Matrix must be either a dictionary, an 2-D ndarray or a string"
149
177
  )
150
178
  # This class is immutable and has a getter function for the
151
179
  # score matrix -> make the score matrix read-only
152
180
  self._matrix.setflags(write=False)
153
181
 
154
- def __repr__(self):
155
- """Represent SubstitutionMatrix as a string for debugging."""
156
- return (
157
- f"SubstitutionMatrix({self._alph1.__repr__()}, {self._alph2.__repr__()}, "
158
- f"np.{np.array_repr(self._matrix)})"
159
- )
160
-
161
- def __eq__(self, item):
162
- if not isinstance(item, SubstitutionMatrix):
163
- return False
164
- if self._alph1 != item.get_alphabet1():
165
- return False
166
- if self._alph2 != item.get_alphabet2():
167
- return False
168
- if not np.array_equal(self.score_matrix(), item.score_matrix()):
169
- return False
170
- return True
171
-
172
- def __ne__(self, item):
173
- return not self == item
182
+ @property
183
+ def shape(self):
184
+ """
185
+ Get the shape (i.e. the length of both alphabets)
186
+ of the substitution matrix.
174
187
 
175
- def _fill_with_matrix_dict(self, matrix_dict):
176
- self._matrix = np.zeros((len(self._alph1), len(self._alph2)), dtype=np.int32)
177
- for i in range(len(self._alph1)):
178
- for j in range(len(self._alph2)):
179
- sym1 = self._alph1.decode(i)
180
- sym2 = self._alph2.decode(j)
181
- self._matrix[i, j] = int(matrix_dict[sym1, sym2])
188
+ Returns
189
+ -------
190
+ shape : tuple
191
+ Matrix shape.
192
+ """
193
+ return (len(self._alph1), len(self._alph2))
182
194
 
183
195
  def get_alphabet1(self):
184
196
  """
@@ -280,28 +292,157 @@ class SubstitutionMatrix(object):
280
292
  code2 = self._alph2.encode(symbol2)
281
293
  return self._matrix[code1, code2]
282
294
 
283
- def shape(self):
295
+ def as_positional(self, sequence1, sequence2):
284
296
  """
285
- Get the shape (i.e. the length of both alphabets)
286
- of the subsitution matrix.
297
+ Transform this substitution matrix and two sequences into positional
298
+ equivalents.
299
+
300
+ This means the new substitution matrix is position-specific: It has the lengths
301
+ of the sequences instead of the lengths of their alphabets.
302
+ Its scores represent the same scores as the original matrix, but now mapped
303
+ onto the positions of the sequences.
304
+
305
+ Parameters
306
+ ----------
307
+ sequence1, sequence2 : seq.Sequence, length=n
308
+ The sequences to create the positional equivalents from.
287
309
 
288
310
  Returns
289
311
  -------
290
- shape : tuple
291
- Matrix shape.
312
+ pos_matrix : align.SubstitutionMatrix, shape=(n, n)
313
+ The position-specific substitution matrix.
314
+ pos_sequence1, pos_sequence2 : PositionalSequence, length=n
315
+ The positional sequences.
316
+
317
+ Notes
318
+ -----
319
+ After the transformation the substitution scores remain the same, i.e.
320
+ ``substitution_matrix.get_score(sequence1[i], sequence2[j])`` is equal to
321
+ ``pos_matrix.get_score(pos_sequence1[i], pos_sequence2[j])``.
322
+
323
+ Examples
324
+ --------
325
+
326
+ Run an alignment with the usual substitution matrix:
327
+
328
+ >>> seq1 = ProteinSequence("BIQTITE")
329
+ >>> seq2 = ProteinSequence("IQLITE")
330
+ >>> matrix = SubstitutionMatrix.std_protein_matrix()
331
+ >>> print(matrix)
332
+ A C D E F G H I K L M N P Q R S T V W Y B Z X *
333
+ A 4 0 -2 -1 -2 0 -2 -1 -1 -1 -1 -2 -1 -1 -1 1 0 0 -3 -2 -2 -1 0 -4
334
+ C 0 9 -3 -4 -2 -3 -3 -1 -3 -1 -1 -3 -3 -3 -3 -1 -1 -1 -2 -2 -3 -3 -2 -4
335
+ D -2 -3 6 2 -3 -1 -1 -3 -1 -4 -3 1 -1 0 -2 0 -1 -3 -4 -3 4 1 -1 -4
336
+ E -1 -4 2 5 -3 -2 0 -3 1 -3 -2 0 -1 2 0 0 -1 -2 -3 -2 1 4 -1 -4
337
+ F -2 -2 -3 -3 6 -3 -1 0 -3 0 0 -3 -4 -3 -3 -2 -2 -1 1 3 -3 -3 -1 -4
338
+ G 0 -3 -1 -2 -3 6 -2 -4 -2 -4 -3 0 -2 -2 -2 0 -2 -3 -2 -3 -1 -2 -1 -4
339
+ H -2 -3 -1 0 -1 -2 8 -3 -1 -3 -2 1 -2 0 0 -1 -2 -3 -2 2 0 0 -1 -4
340
+ I -1 -1 -3 -3 0 -4 -3 4 -3 2 1 -3 -3 -3 -3 -2 -1 3 -3 -1 -3 -3 -1 -4
341
+ K -1 -3 -1 1 -3 -2 -1 -3 5 -2 -1 0 -1 1 2 0 -1 -2 -3 -2 0 1 -1 -4
342
+ L -1 -1 -4 -3 0 -4 -3 2 -2 4 2 -3 -3 -2 -2 -2 -1 1 -2 -1 -4 -3 -1 -4
343
+ M -1 -1 -3 -2 0 -3 -2 1 -1 2 5 -2 -2 0 -1 -1 -1 1 -1 -1 -3 -1 -1 -4
344
+ N -2 -3 1 0 -3 0 1 -3 0 -3 -2 6 -2 0 0 1 0 -3 -4 -2 3 0 -1 -4
345
+ P -1 -3 -1 -1 -4 -2 -2 -3 -1 -3 -2 -2 7 -1 -2 -1 -1 -2 -4 -3 -2 -1 -2 -4
346
+ Q -1 -3 0 2 -3 -2 0 -3 1 -2 0 0 -1 5 1 0 -1 -2 -2 -1 0 3 -1 -4
347
+ R -1 -3 -2 0 -3 -2 0 -3 2 -2 -1 0 -2 1 5 -1 -1 -3 -3 -2 -1 0 -1 -4
348
+ S 1 -1 0 0 -2 0 -1 -2 0 -2 -1 1 -1 0 -1 4 1 -2 -3 -2 0 0 0 -4
349
+ T 0 -1 -1 -1 -2 -2 -2 -1 -1 -1 -1 0 -1 -1 -1 1 5 0 -2 -2 -1 -1 0 -4
350
+ V 0 -1 -3 -2 -1 -3 -3 3 -2 1 1 -3 -2 -2 -3 -2 0 4 -3 -1 -3 -2 -1 -4
351
+ W -3 -2 -4 -3 1 -2 -2 -3 -3 -2 -1 -4 -4 -2 -3 -3 -2 -3 11 2 -4 -3 -2 -4
352
+ Y -2 -2 -3 -2 3 -3 2 -1 -2 -1 -1 -2 -3 -1 -2 -2 -2 -1 2 7 -3 -2 -1 -4
353
+ B -2 -3 4 1 -3 -1 0 -3 0 -4 -3 3 -2 0 -1 0 -1 -3 -4 -3 4 1 -1 -4
354
+ Z -1 -3 1 4 -3 -2 0 -3 1 -3 -1 0 -1 3 0 0 -1 -2 -3 -2 1 4 -1 -4
355
+ X 0 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2 -1 -1 0 0 -1 -2 -1 -1 -1 -1 -4
356
+ * -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 1
357
+ >>> alignment = align_optimal(seq1, seq2, matrix, gap_penalty=-10)[0]
358
+ >>> print(alignment)
359
+ BIQTITE
360
+ -IQLITE
361
+
362
+ Running the alignment with positional equivalents gives the same result:
363
+
364
+ >>> pos_matrix, pos_seq1, pos_seq2 = matrix.as_positional(seq1, seq2)
365
+ >>> print(pos_matrix)
366
+ I Q L I T E
367
+ B -3 0 -4 -3 -1 1
368
+ I 4 -3 2 4 -1 -3
369
+ Q -3 5 -2 -3 -1 2
370
+ T -1 -1 -1 -1 5 -1
371
+ I 4 -3 2 4 -1 -3
372
+ T -1 -1 -1 -1 5 -1
373
+ E -3 2 -3 -3 -1 5
374
+ >>> pos_alignment = align_optimal(pos_seq1, pos_seq2, pos_matrix, gap_penalty=-10)[0]
375
+ >>> print(pos_alignment)
376
+ BIQTITE
377
+ -IQLITE
378
+
379
+ Increase the substitution score for the first symbols in both sequences to align
380
+ to each other:
381
+
382
+ >>> score_matrix = pos_matrix.score_matrix().copy()
383
+ >>> score_matrix[0, 0] = 100
384
+ >>> biased_matrix = SubstitutionMatrix(
385
+ ... pos_matrix.get_alphabet1(), pos_matrix.get_alphabet2(), score_matrix
386
+ ... )
387
+ >>> print(biased_matrix)
388
+ I Q L I T E
389
+ B 100 0 -4 -3 -1 1
390
+ I 4 -3 2 4 -1 -3
391
+ Q -3 5 -2 -3 -1 2
392
+ T -1 -1 -1 -1 5 -1
393
+ I 4 -3 2 4 -1 -3
394
+ T -1 -1 -1 -1 5 -1
395
+ E -3 2 -3 -3 -1 5
396
+ >>> biased_alignment = align_optimal(pos_seq1, pos_seq2, biased_matrix, gap_penalty=-10)[0]
397
+ >>> print(biased_alignment)
398
+ BIQTITE
399
+ I-QLITE
292
400
  """
293
- return (len(self._alph1), len(self._alph2))
401
+ pos_sequence1 = PositionalSequence(sequence1)
402
+ pos_sequence2 = PositionalSequence(sequence2)
403
+
404
+ pos_score_matrix = self._matrix[
405
+ tuple(_cartesian_product(sequence1.code, sequence2.code).T)
406
+ ].reshape(len(sequence1), len(sequence2))
407
+ pos_matrix = SubstitutionMatrix(
408
+ pos_sequence1.get_alphabet(),
409
+ pos_sequence2.get_alphabet(),
410
+ pos_score_matrix,
411
+ )
412
+
413
+ return pos_matrix, pos_sequence1, pos_sequence2
414
+
415
+ def __repr__(self):
416
+ """Represent SubstitutionMatrix as a string for debugging."""
417
+ return (
418
+ f"SubstitutionMatrix({self._alph1.__repr__()}, {self._alph2.__repr__()}, "
419
+ f"np.{np.array_repr(self._matrix)})"
420
+ )
421
+
422
+ def __eq__(self, item):
423
+ if not isinstance(item, SubstitutionMatrix):
424
+ return False
425
+ if self._alph1 != item.get_alphabet1():
426
+ return False
427
+ if self._alph2 != item.get_alphabet2():
428
+ return False
429
+ if not np.array_equal(self.score_matrix(), item.score_matrix()):
430
+ return False
431
+ return True
432
+
433
+ def __ne__(self, item):
434
+ return not self == item
294
435
 
295
436
  def __str__(self):
296
437
  # Create matrix in NCBI format
297
438
  string = " "
298
439
  for symbol in self._alph2:
299
- string += f" {symbol:>3}"
440
+ string += f" {str(symbol):>3}"
300
441
  string += "\n"
301
442
  for i, symbol in enumerate(self._alph1):
302
- string += f"{symbol:>1}"
443
+ string += f"{str(symbol):>1}"
303
444
  for j in range(len(self._alph2)):
304
- string += f" {int(self._matrix[i,j]):>3d}"
445
+ string += f" {int(self._matrix[i, j]):>3d}"
305
446
  string += "\n"
306
447
  # Remove terminal line break
307
448
  string = string[:-1]
@@ -318,6 +459,11 @@ class SubstitutionMatrix(object):
318
459
  The keys of the dictionary consist of tuples containing the
319
460
  aligned symbols and the values are the corresponding scores.
320
461
 
462
+ Parameters
463
+ ----------
464
+ string : str
465
+ The string containing the substitution matrix in NCBI format.
466
+
321
467
  Returns
322
468
  -------
323
469
  matrix_dict : dict
@@ -345,12 +491,17 @@ class SubstitutionMatrix(object):
345
491
  The keys of the dictionary consist of tuples containing the
346
492
  aligned symbols and the values are the corresponding scores.
347
493
 
494
+ Parameters
495
+ ----------
496
+ matrix_name : str
497
+ The name of the matrix in the internal database.
498
+
348
499
  Returns
349
500
  -------
350
501
  matrix_dict : dict
351
502
  A dictionary representing the substitution matrix.
352
503
  """
353
- filename = SubstitutionMatrix._db_dir + os.sep + matrix_name + ".mat"
504
+ filename = _DB_DIR / f"{matrix_name}.mat"
354
505
  with open(filename, "r") as f:
355
506
  return SubstitutionMatrix.dict_from_str(f.read())
356
507
 
@@ -364,11 +515,10 @@ class SubstitutionMatrix(object):
364
515
  db_list : list
365
516
  List of matrix names in the internal database.
366
517
  """
367
- files = os.listdir(SubstitutionMatrix._db_dir)
368
- # Remove '.mat' from files
369
- return [file[:-4] for file in sorted(files)]
518
+ return [path.stem for path in _DB_DIR.glob("*.mat")]
370
519
 
371
520
  @staticmethod
521
+ @functools.cache
372
522
  def std_protein_matrix():
373
523
  """
374
524
  Get the default :class:`SubstitutionMatrix` for protein sequence
@@ -379,9 +529,12 @@ class SubstitutionMatrix(object):
379
529
  matrix : SubstitutionMatrix
380
530
  Default matrix.
381
531
  """
382
- return _matrix_blosum62
532
+ return SubstitutionMatrix(
533
+ ProteinSequence.alphabet, ProteinSequence.alphabet, "BLOSUM62"
534
+ )
383
535
 
384
536
  @staticmethod
537
+ @functools.cache
385
538
  def std_nucleotide_matrix():
386
539
  """
387
540
  Get the default :class:`SubstitutionMatrix` for DNA sequence
@@ -392,13 +545,87 @@ class SubstitutionMatrix(object):
392
545
  matrix : SubstitutionMatrix
393
546
  Default matrix.
394
547
  """
395
- return _matrix_nuc
548
+ return SubstitutionMatrix(
549
+ NucleotideSequence.alphabet_amb, NucleotideSequence.alphabet_amb, "NUC"
550
+ )
396
551
 
552
+ @staticmethod
553
+ @functools.cache
554
+ def std_3di_matrix():
555
+ """
556
+ Get the default :class:`SubstitutionMatrix` for 3Di sequence
557
+ alignments.
558
+ :footcite:`VanKempen2024`
397
559
 
398
- # Preformatted BLOSUM62 and NUC substitution matrix from NCBI
399
- _matrix_blosum62 = SubstitutionMatrix(
400
- ProteinSequence.alphabet, ProteinSequence.alphabet, "BLOSUM62"
401
- )
402
- _matrix_nuc = SubstitutionMatrix(
403
- NucleotideSequence.alphabet_amb, NucleotideSequence.alphabet_amb, "NUC"
404
- )
560
+ Returns
561
+ -------
562
+ matrix : SubstitutionMatrix
563
+ Default matrix.
564
+ """
565
+ # Import inside function to avoid circular import
566
+ from biotite.structure.alphabet.i3d import I3DSequence
567
+
568
+ return SubstitutionMatrix(I3DSequence.alphabet, I3DSequence.alphabet, "3Di")
569
+
570
+ @staticmethod
571
+ @functools.cache
572
+ def std_protein_blocks_matrix(undefined_match=200, undefined_mismatch=-200):
573
+ """
574
+ Get the default :class:`SubstitutionMatrix` for Protein Blocks sequences.
575
+
576
+ The matrix is adapted from *PBxplore* :footcite:`Barnoud2017`.
577
+
578
+ Parameters
579
+ ----------
580
+ undefined_match, undefined_mismatch : int, optional
581
+ The match and mismatch score for undefined symbols.
582
+ The default values were chosen arbitrarily, but are in the order of
583
+ magnitude of the other score values.
584
+
585
+ Returns
586
+ -------
587
+ matrix : SubstitutionMatrix
588
+ Default matrix.
589
+
590
+ References
591
+ ----------
592
+
593
+ .. footbibliography::
594
+ """
595
+ from biotite.structure.alphabet.pb import ProteinBlocksSequence
596
+
597
+ alphabet = ProteinBlocksSequence.alphabet
598
+ undefined_symbol = ProteinBlocksSequence.undefined_symbol
599
+ matrix_dict = SubstitutionMatrix.dict_from_db("PB")
600
+ # Add match/mismatch scores for the undefined symbol
601
+ for symbol in alphabet:
602
+ if symbol == undefined_symbol:
603
+ continue
604
+ matrix_dict[symbol, undefined_symbol] = undefined_mismatch
605
+ matrix_dict[undefined_symbol, symbol] = undefined_mismatch
606
+ matrix_dict[undefined_symbol, undefined_symbol] = undefined_match
607
+ return SubstitutionMatrix(
608
+ alphabet,
609
+ alphabet,
610
+ matrix_dict,
611
+ )
612
+
613
+ def _fill_with_matrix_dict(self, matrix_dict):
614
+ self._matrix = np.zeros((len(self._alph1), len(self._alph2)), dtype=np.int32)
615
+ for i in range(len(self._alph1)):
616
+ for j in range(len(self._alph2)):
617
+ sym1 = self._alph1.decode(i)
618
+ sym2 = self._alph2.decode(j)
619
+ self._matrix[i, j] = int(matrix_dict[sym1, sym2])
620
+
621
+
622
+ def _cartesian_product(array1, array2):
623
+ """
624
+ Create all combinations of elements from two arrays.
625
+ """
626
+ return np.transpose(
627
+ [
628
+ np.repeat(array1, len(array2)),
629
+ np.tile(array2, len(array1)),
630
+ ]
631
+ )
@@ -0,0 +1,24 @@
1
+ # 3Di bit/2
2
+ # Background (precomputed optional): 0.0489372 0.0306991 0.101049 0.0329671 0.0276149 0.0416262 0.0452521 0.030876 0.0297251 0.0607036 0.0150238 0.0215826 0.0783843 0.0512926 0.0264886 0.0610702 0.0201311 0.215998 0.0310265 0.0295417 0.00001
3
+ # Lambda (precomputed optional): 0.351568
4
+ a c d e f g h i k l m n p q r s t v w y
5
+ a 6 -3 1 2 3 -2 -2 -7 -3 -3 -10 -5 -1 1 -4 -7 -5 -6 0 -2
6
+ c -3 6 -2 -8 -5 -4 -4 -12 -13 1 -14 0 0 1 -1 0 -8 1 -7 -9
7
+ d 1 -2 4 -3 0 1 1 -3 -5 -4 -5 -2 1 -1 -1 -4 -2 -3 -2 -2
8
+ e 2 -8 -3 9 -2 -7 -4 -12 -10 -7 -17 -8 -6 -3 -8 -10 -10 -13 -6 -3
9
+ f 3 -5 0 -2 7 -3 -3 -5 1 -3 -9 -5 -2 2 -5 -8 -3 -7 4 -4
10
+ g -2 -4 1 -7 -3 6 3 0 -7 -7 -1 -2 -2 -4 3 -3 4 -6 -4 -2
11
+ h -2 -4 1 -4 -3 3 6 -4 -7 -6 -6 0 -1 -3 1 -3 -1 -5 -5 3
12
+ i -7 -12 -3 -12 -5 0 -4 8 -5 -11 7 -7 -6 -6 -3 -9 6 -12 -5 -8
13
+ k -3 -13 -5 -10 1 -7 -7 -5 9 -11 -8 -12 -6 -5 -9 -14 -5 -15 5 -8
14
+ l -3 1 -4 -7 -3 -7 -6 -11 -11 6 -16 -3 -2 2 -4 -4 -9 0 -8 -9
15
+ m -10 -14 -5 -17 -9 -1 -6 7 -8 -16 10 -9 -9 -10 -5 -10 3 -16 -6 -9
16
+ n -5 0 -2 -8 -5 -2 0 -7 -12 -3 -9 7 0 -2 2 3 -4 0 -8 -5
17
+ p -1 0 1 -6 -2 -2 -1 -6 -6 -2 -9 0 4 0 0 -2 -4 0 -4 -5
18
+ q 1 1 -1 -3 2 -4 -3 -6 -5 2 -10 -2 0 5 -2 -4 -5 -1 -2 -5
19
+ r -4 -1 -1 -8 -5 3 1 -3 -9 -4 -5 2 0 -2 6 2 0 -1 -6 -3
20
+ s -7 0 -4 -10 -8 -3 -3 -9 -14 -4 -10 3 -2 -4 2 6 -6 0 -11 -9
21
+ t -5 -8 -2 -10 -3 4 -1 6 -5 -9 3 -4 -4 -5 0 -6 8 -9 -5 -5
22
+ v -6 1 -3 -13 -7 -6 -5 -12 -15 0 -16 0 0 -1 -1 0 -9 3 -10 -11
23
+ w 0 -7 -2 -6 4 -4 -5 -5 5 -8 -6 -8 -4 -2 -6 -11 -5 -10 8 -6
24
+ y -2 -9 -2 -3 -4 -2 3 -8 -8 -9 -9 -5 -5 -5 -3 -9 -5 -11 -6 9
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2013 Poulain, A. G. de Brevern
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,18 @@
1
+ # PB substitution matrix, adapted from PBxplore
2
+ a b c d e f g h i j k l m n o p
3
+ a 516 -59 113 -105 -411 -177 -27 -361 47 -103 -644 -259 -599 -372 -124 -83
4
+ b -59 541 -146 -210 -155 -310 -97 90 182 -128 -30 29 -745 -242 -165 22
5
+ c 113 -146 360 -14 -333 -240 49 -438 -269 -282 -688 -682 -608 -455 -147 6
6
+ d -105 -210 -14 221 5 -131 -349 -278 -253 -173 -585 -670 -1573 -1048 -691 -497
7
+ e -411 -155 -333 5 520 185 186 138 -378 -70 -112 -514 -1136 -469 -617 -632
8
+ f -177 -310 -240 -131 185 459 -99 -45 -445 83 -214 -88 -547 -629 -406 -552
9
+ g -27 -97 49 -349 186 -99 665 -99 -89 -118 -409 -138 -124 172 128 254
10
+ h -361 90 -438 -278 138 -45 -99 632 -205 316 192 -108 -712 -359 95 -399
11
+ i 47 182 -269 -253 -378 -445 -89 -205 696 186 8 15 -709 -269 -169 226
12
+ j -103 -128 -282 -173 -70 83 -118 316 186 768 196 5 -398 -340 -117 -104
13
+ k -644 -30 -688 -585 -112 -214 -409 192 8 196 568 -65 -270 -231 -471 -382
14
+ l -259 29 -682 -670 -514 -88 -138 -108 15 5 -65 533 -131 8 -11 -316
15
+ m -599 -745 -608 -1573 -1136 -547 -124 -712 -709 -398 -270 -131 241 -4 -190 -155
16
+ n -372 -242 -455 -1048 -469 -629 172 -359 -269 -340 -231 8 -4 703 88 146
17
+ o -124 -165 -147 -691 -617 -406 128 95 -169 -117 -471 -11 -190 88 716 58
18
+ p -83 22 6 -497 -632 -552 254 -399 226 -104 -382 -316 -155 146 58 609