biotite 0.41.1__cp312-cp312-macosx_10_16_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biotite might be problematic. Click here for more details.

Files changed (340) hide show
  1. biotite/__init__.py +19 -0
  2. biotite/application/__init__.py +43 -0
  3. biotite/application/application.py +265 -0
  4. biotite/application/autodock/__init__.py +12 -0
  5. biotite/application/autodock/app.py +505 -0
  6. biotite/application/blast/__init__.py +14 -0
  7. biotite/application/blast/alignment.py +83 -0
  8. biotite/application/blast/webapp.py +421 -0
  9. biotite/application/clustalo/__init__.py +12 -0
  10. biotite/application/clustalo/app.py +238 -0
  11. biotite/application/dssp/__init__.py +12 -0
  12. biotite/application/dssp/app.py +152 -0
  13. biotite/application/localapp.py +306 -0
  14. biotite/application/mafft/__init__.py +12 -0
  15. biotite/application/mafft/app.py +122 -0
  16. biotite/application/msaapp.py +374 -0
  17. biotite/application/muscle/__init__.py +13 -0
  18. biotite/application/muscle/app3.py +254 -0
  19. biotite/application/muscle/app5.py +171 -0
  20. biotite/application/sra/__init__.py +18 -0
  21. biotite/application/sra/app.py +456 -0
  22. biotite/application/tantan/__init__.py +12 -0
  23. biotite/application/tantan/app.py +222 -0
  24. biotite/application/util.py +59 -0
  25. biotite/application/viennarna/__init__.py +18 -0
  26. biotite/application/viennarna/rnaalifold.py +304 -0
  27. biotite/application/viennarna/rnafold.py +269 -0
  28. biotite/application/viennarna/rnaplot.py +187 -0
  29. biotite/application/viennarna/util.py +72 -0
  30. biotite/application/webapp.py +77 -0
  31. biotite/copyable.py +71 -0
  32. biotite/database/__init__.py +23 -0
  33. biotite/database/entrez/__init__.py +15 -0
  34. biotite/database/entrez/check.py +61 -0
  35. biotite/database/entrez/dbnames.py +89 -0
  36. biotite/database/entrez/download.py +223 -0
  37. biotite/database/entrez/key.py +44 -0
  38. biotite/database/entrez/query.py +223 -0
  39. biotite/database/error.py +15 -0
  40. biotite/database/pubchem/__init__.py +21 -0
  41. biotite/database/pubchem/download.py +260 -0
  42. biotite/database/pubchem/error.py +20 -0
  43. biotite/database/pubchem/query.py +827 -0
  44. biotite/database/pubchem/throttle.py +99 -0
  45. biotite/database/rcsb/__init__.py +13 -0
  46. biotite/database/rcsb/download.py +167 -0
  47. biotite/database/rcsb/query.py +959 -0
  48. biotite/database/uniprot/__init__.py +13 -0
  49. biotite/database/uniprot/check.py +32 -0
  50. biotite/database/uniprot/download.py +134 -0
  51. biotite/database/uniprot/query.py +209 -0
  52. biotite/file.py +251 -0
  53. biotite/sequence/__init__.py +73 -0
  54. biotite/sequence/align/__init__.py +49 -0
  55. biotite/sequence/align/alignment.py +658 -0
  56. biotite/sequence/align/banded.cpython-312-darwin.so +0 -0
  57. biotite/sequence/align/banded.pyx +652 -0
  58. biotite/sequence/align/buckets.py +69 -0
  59. biotite/sequence/align/cigar.py +434 -0
  60. biotite/sequence/align/kmeralphabet.cpython-312-darwin.so +0 -0
  61. biotite/sequence/align/kmeralphabet.pyx +574 -0
  62. biotite/sequence/align/kmersimilarity.cpython-312-darwin.so +0 -0
  63. biotite/sequence/align/kmersimilarity.pyx +233 -0
  64. biotite/sequence/align/kmertable.cpython-312-darwin.so +0 -0
  65. biotite/sequence/align/kmertable.pyx +3400 -0
  66. biotite/sequence/align/localgapped.cpython-312-darwin.so +0 -0
  67. biotite/sequence/align/localgapped.pyx +892 -0
  68. biotite/sequence/align/localungapped.cpython-312-darwin.so +0 -0
  69. biotite/sequence/align/localungapped.pyx +279 -0
  70. biotite/sequence/align/matrix.py +405 -0
  71. biotite/sequence/align/matrix_data/BLOSUM100.mat +31 -0
  72. biotite/sequence/align/matrix_data/BLOSUM30.mat +31 -0
  73. biotite/sequence/align/matrix_data/BLOSUM35.mat +31 -0
  74. biotite/sequence/align/matrix_data/BLOSUM40.mat +31 -0
  75. biotite/sequence/align/matrix_data/BLOSUM45.mat +31 -0
  76. biotite/sequence/align/matrix_data/BLOSUM50.mat +31 -0
  77. biotite/sequence/align/matrix_data/BLOSUM50_13p.mat +25 -0
  78. biotite/sequence/align/matrix_data/BLOSUM50_14.3.mat +25 -0
  79. biotite/sequence/align/matrix_data/BLOSUM50_5.0.mat +25 -0
  80. biotite/sequence/align/matrix_data/BLOSUM55.mat +31 -0
  81. biotite/sequence/align/matrix_data/BLOSUM60.mat +31 -0
  82. biotite/sequence/align/matrix_data/BLOSUM62.mat +31 -0
  83. biotite/sequence/align/matrix_data/BLOSUM62_13p.mat +25 -0
  84. biotite/sequence/align/matrix_data/BLOSUM62_14.3.mat +25 -0
  85. biotite/sequence/align/matrix_data/BLOSUM62_5.0.mat +25 -0
  86. biotite/sequence/align/matrix_data/BLOSUM65.mat +31 -0
  87. biotite/sequence/align/matrix_data/BLOSUM70.mat +31 -0
  88. biotite/sequence/align/matrix_data/BLOSUM75.mat +31 -0
  89. biotite/sequence/align/matrix_data/BLOSUM80.mat +31 -0
  90. biotite/sequence/align/matrix_data/BLOSUM85.mat +31 -0
  91. biotite/sequence/align/matrix_data/BLOSUM90.mat +31 -0
  92. biotite/sequence/align/matrix_data/BLOSUMN.mat +31 -0
  93. biotite/sequence/align/matrix_data/CorBLOSUM49_5.0.mat +25 -0
  94. biotite/sequence/align/matrix_data/CorBLOSUM57_13p.mat +25 -0
  95. biotite/sequence/align/matrix_data/CorBLOSUM57_14.3.mat +25 -0
  96. biotite/sequence/align/matrix_data/CorBLOSUM61_5.0.mat +25 -0
  97. biotite/sequence/align/matrix_data/CorBLOSUM66_13p.mat +25 -0
  98. biotite/sequence/align/matrix_data/CorBLOSUM67_14.3.mat +25 -0
  99. biotite/sequence/align/matrix_data/DAYHOFF.mat +32 -0
  100. biotite/sequence/align/matrix_data/GONNET.mat +26 -0
  101. biotite/sequence/align/matrix_data/IDENTITY.mat +25 -0
  102. biotite/sequence/align/matrix_data/MATCH.mat +25 -0
  103. biotite/sequence/align/matrix_data/NUC.mat +25 -0
  104. biotite/sequence/align/matrix_data/PAM10.mat +34 -0
  105. biotite/sequence/align/matrix_data/PAM100.mat +34 -0
  106. biotite/sequence/align/matrix_data/PAM110.mat +34 -0
  107. biotite/sequence/align/matrix_data/PAM120.mat +34 -0
  108. biotite/sequence/align/matrix_data/PAM130.mat +34 -0
  109. biotite/sequence/align/matrix_data/PAM140.mat +34 -0
  110. biotite/sequence/align/matrix_data/PAM150.mat +34 -0
  111. biotite/sequence/align/matrix_data/PAM160.mat +34 -0
  112. biotite/sequence/align/matrix_data/PAM170.mat +34 -0
  113. biotite/sequence/align/matrix_data/PAM180.mat +34 -0
  114. biotite/sequence/align/matrix_data/PAM190.mat +34 -0
  115. biotite/sequence/align/matrix_data/PAM20.mat +34 -0
  116. biotite/sequence/align/matrix_data/PAM200.mat +34 -0
  117. biotite/sequence/align/matrix_data/PAM210.mat +34 -0
  118. biotite/sequence/align/matrix_data/PAM220.mat +34 -0
  119. biotite/sequence/align/matrix_data/PAM230.mat +34 -0
  120. biotite/sequence/align/matrix_data/PAM240.mat +34 -0
  121. biotite/sequence/align/matrix_data/PAM250.mat +34 -0
  122. biotite/sequence/align/matrix_data/PAM260.mat +34 -0
  123. biotite/sequence/align/matrix_data/PAM270.mat +34 -0
  124. biotite/sequence/align/matrix_data/PAM280.mat +34 -0
  125. biotite/sequence/align/matrix_data/PAM290.mat +34 -0
  126. biotite/sequence/align/matrix_data/PAM30.mat +34 -0
  127. biotite/sequence/align/matrix_data/PAM300.mat +34 -0
  128. biotite/sequence/align/matrix_data/PAM310.mat +34 -0
  129. biotite/sequence/align/matrix_data/PAM320.mat +34 -0
  130. biotite/sequence/align/matrix_data/PAM330.mat +34 -0
  131. biotite/sequence/align/matrix_data/PAM340.mat +34 -0
  132. biotite/sequence/align/matrix_data/PAM350.mat +34 -0
  133. biotite/sequence/align/matrix_data/PAM360.mat +34 -0
  134. biotite/sequence/align/matrix_data/PAM370.mat +34 -0
  135. biotite/sequence/align/matrix_data/PAM380.mat +34 -0
  136. biotite/sequence/align/matrix_data/PAM390.mat +34 -0
  137. biotite/sequence/align/matrix_data/PAM40.mat +34 -0
  138. biotite/sequence/align/matrix_data/PAM400.mat +34 -0
  139. biotite/sequence/align/matrix_data/PAM410.mat +34 -0
  140. biotite/sequence/align/matrix_data/PAM420.mat +34 -0
  141. biotite/sequence/align/matrix_data/PAM430.mat +34 -0
  142. biotite/sequence/align/matrix_data/PAM440.mat +34 -0
  143. biotite/sequence/align/matrix_data/PAM450.mat +34 -0
  144. biotite/sequence/align/matrix_data/PAM460.mat +34 -0
  145. biotite/sequence/align/matrix_data/PAM470.mat +34 -0
  146. biotite/sequence/align/matrix_data/PAM480.mat +34 -0
  147. biotite/sequence/align/matrix_data/PAM490.mat +34 -0
  148. biotite/sequence/align/matrix_data/PAM50.mat +34 -0
  149. biotite/sequence/align/matrix_data/PAM500.mat +34 -0
  150. biotite/sequence/align/matrix_data/PAM60.mat +34 -0
  151. biotite/sequence/align/matrix_data/PAM70.mat +34 -0
  152. biotite/sequence/align/matrix_data/PAM80.mat +34 -0
  153. biotite/sequence/align/matrix_data/PAM90.mat +34 -0
  154. biotite/sequence/align/matrix_data/RBLOSUM52_5.0.mat +25 -0
  155. biotite/sequence/align/matrix_data/RBLOSUM59_13p.mat +25 -0
  156. biotite/sequence/align/matrix_data/RBLOSUM59_14.3.mat +25 -0
  157. biotite/sequence/align/matrix_data/RBLOSUM64_5.0.mat +25 -0
  158. biotite/sequence/align/matrix_data/RBLOSUM69_13p.mat +25 -0
  159. biotite/sequence/align/matrix_data/RBLOSUM69_14.3.mat +25 -0
  160. biotite/sequence/align/multiple.cpython-312-darwin.so +0 -0
  161. biotite/sequence/align/multiple.pyx +620 -0
  162. biotite/sequence/align/pairwise.cpython-312-darwin.so +0 -0
  163. biotite/sequence/align/pairwise.pyx +587 -0
  164. biotite/sequence/align/permutation.cpython-312-darwin.so +0 -0
  165. biotite/sequence/align/permutation.pyx +305 -0
  166. biotite/sequence/align/primes.txt +821 -0
  167. biotite/sequence/align/selector.cpython-312-darwin.so +0 -0
  168. biotite/sequence/align/selector.pyx +956 -0
  169. biotite/sequence/align/statistics.py +265 -0
  170. biotite/sequence/align/tracetable.cpython-312-darwin.so +0 -0
  171. biotite/sequence/align/tracetable.pxd +64 -0
  172. biotite/sequence/align/tracetable.pyx +370 -0
  173. biotite/sequence/alphabet.py +566 -0
  174. biotite/sequence/annotation.py +829 -0
  175. biotite/sequence/codec.cpython-312-darwin.so +0 -0
  176. biotite/sequence/codec.pyx +155 -0
  177. biotite/sequence/codon.py +466 -0
  178. biotite/sequence/codon_tables.txt +202 -0
  179. biotite/sequence/graphics/__init__.py +33 -0
  180. biotite/sequence/graphics/alignment.py +1034 -0
  181. biotite/sequence/graphics/color_schemes/autumn.json +51 -0
  182. biotite/sequence/graphics/color_schemes/blossom.json +51 -0
  183. biotite/sequence/graphics/color_schemes/clustalx_dna.json +11 -0
  184. biotite/sequence/graphics/color_schemes/clustalx_protein.json +28 -0
  185. biotite/sequence/graphics/color_schemes/flower.json +51 -0
  186. biotite/sequence/graphics/color_schemes/jalview_buried.json +31 -0
  187. biotite/sequence/graphics/color_schemes/jalview_hydrophobicity.json +31 -0
  188. biotite/sequence/graphics/color_schemes/jalview_prop_helix.json +31 -0
  189. biotite/sequence/graphics/color_schemes/jalview_prop_strand.json +31 -0
  190. biotite/sequence/graphics/color_schemes/jalview_prop_turn.json +31 -0
  191. biotite/sequence/graphics/color_schemes/jalview_taylor.json +28 -0
  192. biotite/sequence/graphics/color_schemes/jalview_zappo.json +28 -0
  193. biotite/sequence/graphics/color_schemes/ocean.json +51 -0
  194. biotite/sequence/graphics/color_schemes/pb_flower.json +39 -0
  195. biotite/sequence/graphics/color_schemes/rainbow_dna.json +11 -0
  196. biotite/sequence/graphics/color_schemes/rainbow_protein.json +30 -0
  197. biotite/sequence/graphics/color_schemes/spring.json +51 -0
  198. biotite/sequence/graphics/color_schemes/sunset.json +51 -0
  199. biotite/sequence/graphics/color_schemes/wither.json +51 -0
  200. biotite/sequence/graphics/colorschemes.py +139 -0
  201. biotite/sequence/graphics/dendrogram.py +184 -0
  202. biotite/sequence/graphics/features.py +510 -0
  203. biotite/sequence/graphics/logo.py +110 -0
  204. biotite/sequence/graphics/plasmid.py +661 -0
  205. biotite/sequence/io/__init__.py +12 -0
  206. biotite/sequence/io/fasta/__init__.py +22 -0
  207. biotite/sequence/io/fasta/convert.py +273 -0
  208. biotite/sequence/io/fasta/file.py +278 -0
  209. biotite/sequence/io/fastq/__init__.py +19 -0
  210. biotite/sequence/io/fastq/convert.py +120 -0
  211. biotite/sequence/io/fastq/file.py +551 -0
  212. biotite/sequence/io/genbank/__init__.py +17 -0
  213. biotite/sequence/io/genbank/annotation.py +277 -0
  214. biotite/sequence/io/genbank/file.py +575 -0
  215. biotite/sequence/io/genbank/metadata.py +324 -0
  216. biotite/sequence/io/genbank/sequence.py +172 -0
  217. biotite/sequence/io/general.py +192 -0
  218. biotite/sequence/io/gff/__init__.py +26 -0
  219. biotite/sequence/io/gff/convert.py +133 -0
  220. biotite/sequence/io/gff/file.py +434 -0
  221. biotite/sequence/phylo/__init__.py +36 -0
  222. biotite/sequence/phylo/nj.cpython-312-darwin.so +0 -0
  223. biotite/sequence/phylo/nj.pyx +221 -0
  224. biotite/sequence/phylo/tree.cpython-312-darwin.so +0 -0
  225. biotite/sequence/phylo/tree.pyx +1169 -0
  226. biotite/sequence/phylo/upgma.cpython-312-darwin.so +0 -0
  227. biotite/sequence/phylo/upgma.pyx +164 -0
  228. biotite/sequence/profile.py +456 -0
  229. biotite/sequence/search.py +116 -0
  230. biotite/sequence/seqtypes.py +556 -0
  231. biotite/sequence/sequence.py +374 -0
  232. biotite/structure/__init__.py +132 -0
  233. biotite/structure/atoms.py +1455 -0
  234. biotite/structure/basepairs.py +1415 -0
  235. biotite/structure/bonds.cpython-312-darwin.so +0 -0
  236. biotite/structure/bonds.pyx +1933 -0
  237. biotite/structure/box.py +592 -0
  238. biotite/structure/celllist.cpython-312-darwin.so +0 -0
  239. biotite/structure/celllist.pyx +849 -0
  240. biotite/structure/chains.py +298 -0
  241. biotite/structure/charges.cpython-312-darwin.so +0 -0
  242. biotite/structure/charges.pyx +520 -0
  243. biotite/structure/compare.py +274 -0
  244. biotite/structure/density.py +114 -0
  245. biotite/structure/dotbracket.py +216 -0
  246. biotite/structure/error.py +31 -0
  247. biotite/structure/filter.py +585 -0
  248. biotite/structure/geometry.py +697 -0
  249. biotite/structure/graphics/__init__.py +13 -0
  250. biotite/structure/graphics/atoms.py +226 -0
  251. biotite/structure/graphics/rna.py +282 -0
  252. biotite/structure/hbond.py +409 -0
  253. biotite/structure/info/__init__.py +25 -0
  254. biotite/structure/info/atom_masses.json +121 -0
  255. biotite/structure/info/atoms.py +82 -0
  256. biotite/structure/info/bonds.py +145 -0
  257. biotite/structure/info/ccd/README.rst +8 -0
  258. biotite/structure/info/ccd/amino_acids.txt +1663 -0
  259. biotite/structure/info/ccd/carbohydrates.txt +1135 -0
  260. biotite/structure/info/ccd/components.bcif +0 -0
  261. biotite/structure/info/ccd/nucleotides.txt +798 -0
  262. biotite/structure/info/ccd.py +95 -0
  263. biotite/structure/info/groups.py +90 -0
  264. biotite/structure/info/masses.py +123 -0
  265. biotite/structure/info/misc.py +144 -0
  266. biotite/structure/info/radii.py +197 -0
  267. biotite/structure/info/standardize.py +196 -0
  268. biotite/structure/integrity.py +268 -0
  269. biotite/structure/io/__init__.py +30 -0
  270. biotite/structure/io/ctab.py +72 -0
  271. biotite/structure/io/dcd/__init__.py +13 -0
  272. biotite/structure/io/dcd/file.py +65 -0
  273. biotite/structure/io/general.py +257 -0
  274. biotite/structure/io/gro/__init__.py +14 -0
  275. biotite/structure/io/gro/file.py +343 -0
  276. biotite/structure/io/mmtf/__init__.py +21 -0
  277. biotite/structure/io/mmtf/assembly.py +214 -0
  278. biotite/structure/io/mmtf/convertarray.cpython-312-darwin.so +0 -0
  279. biotite/structure/io/mmtf/convertarray.pyx +341 -0
  280. biotite/structure/io/mmtf/convertfile.cpython-312-darwin.so +0 -0
  281. biotite/structure/io/mmtf/convertfile.pyx +501 -0
  282. biotite/structure/io/mmtf/decode.cpython-312-darwin.so +0 -0
  283. biotite/structure/io/mmtf/decode.pyx +152 -0
  284. biotite/structure/io/mmtf/encode.cpython-312-darwin.so +0 -0
  285. biotite/structure/io/mmtf/encode.pyx +183 -0
  286. biotite/structure/io/mmtf/file.py +233 -0
  287. biotite/structure/io/mol/__init__.py +20 -0
  288. biotite/structure/io/mol/convert.py +115 -0
  289. biotite/structure/io/mol/ctab.py +414 -0
  290. biotite/structure/io/mol/header.py +116 -0
  291. biotite/structure/io/mol/mol.py +193 -0
  292. biotite/structure/io/mol/sdf.py +916 -0
  293. biotite/structure/io/netcdf/__init__.py +13 -0
  294. biotite/structure/io/netcdf/file.py +63 -0
  295. biotite/structure/io/npz/__init__.py +20 -0
  296. biotite/structure/io/npz/file.py +152 -0
  297. biotite/structure/io/pdb/__init__.py +20 -0
  298. biotite/structure/io/pdb/convert.py +293 -0
  299. biotite/structure/io/pdb/file.py +1240 -0
  300. biotite/structure/io/pdb/hybrid36.cpython-312-darwin.so +0 -0
  301. biotite/structure/io/pdb/hybrid36.pyx +242 -0
  302. biotite/structure/io/pdbqt/__init__.py +15 -0
  303. biotite/structure/io/pdbqt/convert.py +107 -0
  304. biotite/structure/io/pdbqt/file.py +640 -0
  305. biotite/structure/io/pdbx/__init__.py +23 -0
  306. biotite/structure/io/pdbx/bcif.py +648 -0
  307. biotite/structure/io/pdbx/cif.py +1032 -0
  308. biotite/structure/io/pdbx/component.py +246 -0
  309. biotite/structure/io/pdbx/convert.py +1597 -0
  310. biotite/structure/io/pdbx/encoding.cpython-312-darwin.so +0 -0
  311. biotite/structure/io/pdbx/encoding.pyx +950 -0
  312. biotite/structure/io/pdbx/legacy.py +267 -0
  313. biotite/structure/io/tng/__init__.py +13 -0
  314. biotite/structure/io/tng/file.py +46 -0
  315. biotite/structure/io/trajfile.py +710 -0
  316. biotite/structure/io/trr/__init__.py +13 -0
  317. biotite/structure/io/trr/file.py +46 -0
  318. biotite/structure/io/xtc/__init__.py +13 -0
  319. biotite/structure/io/xtc/file.py +46 -0
  320. biotite/structure/mechanics.py +75 -0
  321. biotite/structure/molecules.py +353 -0
  322. biotite/structure/pseudoknots.py +642 -0
  323. biotite/structure/rdf.py +243 -0
  324. biotite/structure/repair.py +253 -0
  325. biotite/structure/residues.py +562 -0
  326. biotite/structure/resutil.py +178 -0
  327. biotite/structure/sasa.cpython-312-darwin.so +0 -0
  328. biotite/structure/sasa.pyx +322 -0
  329. biotite/structure/sequence.py +112 -0
  330. biotite/structure/sse.py +327 -0
  331. biotite/structure/superimpose.py +727 -0
  332. biotite/structure/transform.py +504 -0
  333. biotite/structure/util.py +98 -0
  334. biotite/temp.py +86 -0
  335. biotite/version.py +16 -0
  336. biotite/visualize.py +251 -0
  337. biotite-0.41.1.dist-info/METADATA +187 -0
  338. biotite-0.41.1.dist-info/RECORD +340 -0
  339. biotite-0.41.1.dist-info/WHEEL +4 -0
  340. biotite-0.41.1.dist-info/licenses/LICENSE.rst +30 -0
@@ -0,0 +1,279 @@
1
+ # This source code is part of the Biotite package and is distributed
2
+ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
3
+ # information.
4
+
5
+ __name__ = "biotite.sequence.align"
6
+ __author__ = "Patrick Kunzmann"
7
+ __all__ = ["align_local_ungapped"]
8
+
9
+ cimport cython
10
+ cimport numpy as np
11
+
12
+ import numpy as np
13
+ from .alignment import Alignment
14
+
15
+
16
+ ctypedef np.int32_t int32
17
+ ctypedef np.int64_t int64
18
+ ctypedef np.uint8_t uint8
19
+ ctypedef np.uint16_t uint16
20
+ ctypedef np.uint32_t uint32
21
+ ctypedef np.uint64_t uint64
22
+
23
+ ctypedef fused CodeType1:
24
+ uint8
25
+ uint16
26
+ uint32
27
+ uint64
28
+ ctypedef fused CodeType2:
29
+ uint8
30
+ uint16
31
+ uint32
32
+ uint64
33
+
34
+
35
+ def align_local_ungapped(seq1, seq2, matrix, seed, int32 threshold,
36
+ str direction="both", bint score_only=False,
37
+ bint check_matrix=True):
38
+ """
39
+ align_local_ungapped(seq1, seq2, matrix, seed, threshold,
40
+ direction="both", score_only=False, check_matrix=True)
41
+
42
+ Perform a local alignment extending from given `seed` position
43
+ without inserting gaps.
44
+
45
+ The alignment extends into one or both directions (controlled by
46
+ `direction`) until the total alignment score falls more than
47
+ `threshold` below the maximum score found (*X-Drop*).
48
+ The returned alignment contains the range that yielded the maximum
49
+ score.
50
+
51
+ Parameters
52
+ ----------
53
+ seq1, seq2 : Sequence
54
+ The sequences to be aligned.
55
+ The sequences do not need to have the same alphabets, as long as
56
+ the two alphabets of `matrix` extend the alphabets of the two
57
+ sequences.
58
+ matrix : SubstitutionMatrix
59
+ The substitution matrix used for scoring.
60
+ seed : tuple(int, int)
61
+ The indices in `seq1` and `seq2` where the local alignment
62
+ starts.
63
+ The indices must be non-negative.
64
+ threshold : int
65
+ If the current score falls more than this value below the maximum score
66
+ found, the alignment terminates.
67
+ direction : {'both', 'upstream', 'downstream'}, optional
68
+ Controls in which direction the alignment extends starting
69
+ from the seed.
70
+ If ``'upstream'``, the alignment starts before the `seed` and
71
+ ends at the `seed`.
72
+ If ``'downstream'``, the alignment starts at the `seed` and
73
+ ends behind the `seed`.
74
+ If ``'both'`` (default) the alignment starts before the `seed`
75
+ and ends behind the `seed`.
76
+ The `seed` position itself is always included in the alignment.
77
+ score_only : bool, optional
78
+ If set to ``True``, only the similarity score is returned
79
+ instead of the :class:`Alignment`, decreasing the runtime
80
+ substantially.
81
+ check_matrix : bool, optional
82
+ If set to False, the `matrix` is not checked for compatibility
83
+ with the alphabets of the sequences.
84
+ Due to the small overall runtime of the function, this can increase
85
+ performance substantially.
86
+ However, unexpected results or crashes may occur, if an
87
+ incompatible `matrix` is given.
88
+
89
+
90
+ Returns
91
+ -------
92
+ alignment : Alignment
93
+ The resulting ungapped alignment.
94
+ Only returned, if `score_only` is ``False``.
95
+ score : int
96
+ The alignment similarity score.
97
+ Only returned, if `score_only` is ``True``.
98
+
99
+ See also
100
+ --------
101
+ align_local_gapped
102
+ For gapped local alignments with the same *X-Drop* technique.
103
+
104
+ Examples
105
+ --------
106
+
107
+ >>> seq1 = ProteinSequence("BIQTITE")
108
+ >>> seq2 = ProteinSequence("PYRRHQTITE")
109
+ >>> matrix = SubstitutionMatrix.std_protein_matrix()
110
+ >>> alignment = align_local_ungapped(seq1, seq2, matrix, seed=(4,7), threshold=10)
111
+ >>> print(alignment)
112
+ QTITE
113
+ QTITE
114
+ >>> alignment = align_local_ungapped(seq1, seq2, matrix, (4,7), 10, direction="upstream")
115
+ >>> print(alignment)
116
+ QTI
117
+ QTI
118
+ >>> alignment = align_local_ungapped(seq1, seq2, matrix, (4,7), 10, direction="downstream")
119
+ >>> print(alignment)
120
+ ITE
121
+ ITE
122
+ >>> score = align_local_ungapped(seq1, seq2, matrix, (4,7), 10, score_only=True)
123
+ >>> print(score)
124
+ 24
125
+ """
126
+ if check_matrix:
127
+ if not matrix.get_alphabet1().extends(seq1.get_alphabet()) \
128
+ or not matrix.get_alphabet2().extends(seq2.get_alphabet()):
129
+ raise ValueError(
130
+ "The sequences' alphabets do not fit the matrix"
131
+ )
132
+ cdef const int32[:,:] score_matrix = matrix.score_matrix()
133
+
134
+ cdef bint upstream
135
+ cdef bint downstream
136
+ if direction == "both":
137
+ upstream = True
138
+ downstream = True
139
+ elif direction == "upstream":
140
+ upstream = True
141
+ downstream = False
142
+ elif direction == "downstream":
143
+ upstream = False
144
+ downstream = True
145
+ else:
146
+ raise ValueError(f"Direction '{direction}' is invalid")
147
+
148
+ if threshold < 0:
149
+ raise ValueError("The threshold value must be a non-negative integer")
150
+
151
+ cdef int seq1_start, seq2_start
152
+ seq1_start, seq2_start = seed
153
+ if seq1_start < 0 or seq2_start < 0:
154
+ raise IndexError("Seed must contain positive indices")
155
+
156
+ cdef np.ndarray code1 = seq1.code
157
+ cdef np.ndarray code2 = seq2.code
158
+ # For the C-function call of the '_seed_extend_uint8()' function
159
+ # for the common case
160
+ # This gives significant performance increase since the
161
+ # seed extend itself runs fast
162
+ cdef bint both_uint8 = (code1.dtype == np.uint8) \
163
+ & (code2.dtype == np.uint8)
164
+
165
+ cdef int32 length
166
+ cdef int start_offset = 0
167
+ cdef int stop_offset = 1
168
+ cdef int32 score = 0
169
+ cdef int32 total_score = 0
170
+
171
+ # Separate alignment into two parts:
172
+ # the regions upstream and downstream from the seed position
173
+ # Range check to avoid negative indices
174
+ if upstream and seq1_start > 0 and seq2_start > 0:
175
+ # For the upstream region the respective part of the sequence
176
+ # must be reversed
177
+ if both_uint8:
178
+ length = _seed_extend_uint8(
179
+ code1[seq1_start-1::-1], code2[seq2_start-1::-1],
180
+ score_matrix, threshold, &score
181
+ )
182
+ else:
183
+ score, length = _seed_extend_generic(
184
+ code1[seq1_start-1::-1], code2[seq2_start-1::-1],
185
+ score_matrix, threshold
186
+ )
187
+ total_score += score
188
+ start_offset -= length
189
+ if downstream:
190
+ if both_uint8:
191
+ length = _seed_extend_uint8(
192
+ code1[seq1_start+1:], code2[seq2_start+1:],
193
+ score_matrix, threshold, &score
194
+ )
195
+ else:
196
+ score, length = _seed_extend_generic(
197
+ code1[seq1_start+1:], code2[seq2_start+1:],
198
+ score_matrix, threshold
199
+ )
200
+ total_score += score
201
+ stop_offset += length
202
+ total_score += score_matrix[code1[seq1_start], code2[seq2_start]]
203
+
204
+ if score_only:
205
+ return total_score
206
+ else:
207
+ trace = np.stack([
208
+ np.arange(seq1_start + start_offset, seq1_start + stop_offset),
209
+ np.arange(seq2_start + start_offset, seq2_start + stop_offset)
210
+ ], axis=-1)
211
+ return Alignment([seq1, seq2], trace, total_score)
212
+
213
+
214
+ @cython.boundscheck(False)
215
+ @cython.wraparound(False)
216
+ def _seed_extend_generic(CodeType1[:] code1 not None,
217
+ CodeType2[:] code2 not None,
218
+ const int32[:,:] matrix not None,
219
+ int32 threshold):
220
+ """
221
+ Align two sequences without insertion of gaps beginning from
222
+ start of the given sequences.
223
+ If the score drops too low, terminate the alignment.
224
+ Return the similarity score and the number of aligned symbols.
225
+ """
226
+ cdef int i
227
+ cdef int32 total_score = 0, max_score = 0
228
+ cdef int i_max_score = -1
229
+
230
+ # Iterate over the symbols in both sequences
231
+ # The alignment automatically terminates,
232
+ # if the end of either sequence is reached
233
+ for i in range(_min(code1.shape[0], code2.shape[0])):
234
+ total_score += matrix[code1[i], code2[i]]
235
+ if total_score >= max_score:
236
+ max_score = total_score
237
+ i_max_score = i
238
+ elif max_score - total_score > threshold:
239
+ # Score drops too low -> terminate alignment
240
+ break
241
+
242
+ # Return the total score and the number of aligned symbols at the
243
+ # point with maximum total score
244
+ return max_score, i_max_score + 1
245
+
246
+ @cython.boundscheck(False)
247
+ @cython.wraparound(False)
248
+ cdef int _seed_extend_uint8(uint8[:] code1, uint8[:] code2,
249
+ const int32[:,:] matrix,
250
+ int32 threshold, int32* score):
251
+ """
252
+ The same functionality as :func:`_seed_extend_generic()` but as
253
+ C-function tailored for the common ``uint8`` sequence code *dtype*.
254
+ This increases the performance for this common case.
255
+ """
256
+ cdef int i
257
+ cdef int32 total_score = 0, max_score = 0
258
+ cdef int i_max_score = -1
259
+
260
+ # Iterate over the symbols in both sequences
261
+ # The alignment automatically terminates,
262
+ # if the end of either sequence is reached
263
+ for i in range(_min(code1.shape[0], code2.shape[0])):
264
+ total_score += matrix[code1[i], code2[i]]
265
+ if total_score >= max_score:
266
+ max_score = total_score
267
+ i_max_score = i
268
+ elif max_score - total_score > threshold:
269
+ # Score drops too low -> terminate alignment
270
+ break
271
+
272
+ # Return the total score and the number of aligned symbols at the
273
+ # point with maximum total score
274
+ score[0] = max_score
275
+ return i_max_score + 1
276
+
277
+
278
+ cdef inline int _min(int a, int b):
279
+ return a if a < b else b
@@ -0,0 +1,405 @@
1
+ # This source code is part of the Biotite package and is distributed
2
+ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
3
+ # information.
4
+
5
+ __name__ = "biotite.sequence.align"
6
+ __author__ = "Patrick Kunzmann"
7
+
8
+ from ..sequence import Sequence
9
+ from ..seqtypes import NucleotideSequence, ProteinSequence
10
+ from ..alphabet import Alphabet
11
+ import numpy as np
12
+ import os
13
+
14
+ __all__ = ["SubstitutionMatrix"]
15
+
16
+
17
class SubstitutionMatrix(object):
    """
    A :class:`SubstitutionMatrix` is the foundation for scoring in
    sequence alignments.
    A :class:`SubstitutionMatrix` maps each possible pairing of a symbol
    of a first alphabet with a symbol of a second alphabet to a score
    (integer).

    The class uses a 2-D (m x n) :class:`ndarray`
    (dtype=:attr:`numpy.int32`),
    where each element stores the score for a symbol pairing, indexed
    by the symbol codes of the respective symbols in an *m*-length
    alphabet 1 and an *n*-length alphabet 2.

    There are 3 ways to create instances:

    At first a 2-D :class:`ndarray` containing the scores can be
    directly provided.

    Secondly a dictionary can be provided, where the keys are pairing
    tuples and values are the corresponding scores.
    The pairing tuples consist of a symbol of alphabet 1 as first
    element and a symbol of alphabet 2 as second element. Pairings have
    to be provided for each possible combination.

    At last a valid matrix name can be given, which is loaded from the
    internal matrix database. The following matrices are available:

    - Nucleotide substitution matrices from NCBI database
        - **NUC** - Also usable with ambiguous alphabet

    - Protein substitution matrices from NCBI database

        - **PAM<n>**
        - **BLOSUM<n>**
        - **MATCH** - Only differentiates between match and mismatch
        - **IDENTITY** - Strongly penalizes mismatches
        - **GONNET** - Not usable with default protein alphabet
        - **DAYHOFF**

    - Corrected protein substitution matrices :footcite:`Hess2016`,
      **<BLOCKS>** is the BLOCKS version, the matrix is based on

        - **BLOSUM<n>_<BLOCKS>**
        - **RBLOSUM<n>_<BLOCKS>**
        - **CorBLOSUM<n>_<BLOCKS>**

    A list of all available matrix names is returned by
    :meth:`list_db()`.

    Since this class can handle two different alphabets, it is possible
    to align two different types of sequences.

    Objects of this class are immutable.

    Parameters
    ----------
    alphabet1 : Alphabet, length=m
        The first alphabet of the substitution matrix.
    alphabet2 : Alphabet, length=n
        The second alphabet of the substitution matrix.
    score_matrix : ndarray, shape=(m,n) or dict or str
        Either a symbol code indexed :class:`ndarray` containing the scores,
        or a dictionary mapping the symbol pairing to scores,
        or a string referencing a matrix in the internal database.

    Raises
    ------
    KeyError
        If the matrix dictionary misses a symbol given in the alphabet.
    ValueError
        If an :class:`ndarray` is given, whose shape does not match the
        lengths of the alphabets.
    TypeError
        If `score_matrix` is neither a dictionary, an :class:`ndarray`
        nor a string.

    References
    ----------

    .. footbibliography::

    Examples
    --------

    Creating a matrix for two different (nonsense) alphabets
    via a matrix dictionary:

    >>> alph1 = Alphabet(["foo","bar"])
    >>> alph2 = Alphabet([1,2,3])
    >>> matrix_dict = {("foo",1):5,  ("foo",2):10, ("foo",3):15,
    ...                ("bar",1):42, ("bar",2):42, ("bar",3):42}
    >>> matrix = SubstitutionMatrix(alph1, alph2, matrix_dict)
    >>> print(matrix.score_matrix())
    [[ 5 10 15]
     [42 42 42]]
    >>> print(matrix.get_score("foo", 2))
    10
    >>> print(matrix.get_score_by_code(0, 1))
    10

    Creating an identity substitution matrix via the score matrix:

    >>> alph = NucleotideSequence.alphabet_unamb
    >>> matrix = SubstitutionMatrix(alph, alph, np.identity(len(alph)))
    >>> print(matrix)
        A   C   G   T
    A   1   0   0   0
    C   0   1   0   0
    G   0   0   1   0
    T   0   0   0   1

    Creating a matrix via database name:

    >>> alph = ProteinSequence.alphabet
    >>> matrix = SubstitutionMatrix(alph, alph, "BLOSUM50")
    """

    # Directory of matrix files
    _db_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                           "matrix_data")

    def __init__(self, alphabet1, alphabet2, score_matrix):
        self._alph1 = alphabet1
        self._alph2 = alphabet2
        if isinstance(score_matrix, dict):
            self._fill_with_matrix_dict(score_matrix)
        elif isinstance(score_matrix, np.ndarray):
            alph_shape = (len(alphabet1), len(alphabet2))
            if score_matrix.shape != alph_shape:
                raise ValueError(
                    f"Matrix has shape {score_matrix.shape}, "
                    f"but {alph_shape} is required"
                )
            # 'astype()' creates a copy, so making the internal matrix
            # read-only below does not affect the array of the caller
            self._matrix = score_matrix.astype(np.int32)
        elif isinstance(score_matrix, str):
            matrix_dict = SubstitutionMatrix.dict_from_db(score_matrix)
            self._fill_with_matrix_dict(matrix_dict)
        else:
            raise TypeError("Matrix must be either a dictionary, "
                            "an 2-D ndarray or a string")
        # This class is immutable and has a getter function for the
        # score matrix -> make the score matrix read-only
        self._matrix.setflags(write=False)

    def __repr__(self):
        """Represent SubstitutionMatrix as a string for debugging."""
        return f"SubstitutionMatrix({repr(self._alph1)}, {repr(self._alph2)}, " \
               f"np.{np.array_repr(self._matrix)})"

    def __eq__(self, item):
        if not isinstance(item, SubstitutionMatrix):
            return False
        if self._alph1 != item.get_alphabet1():
            return False
        if self._alph2 != item.get_alphabet2():
            return False
        if not np.array_equal(self.score_matrix(), item.score_matrix()):
            return False
        return True

    def __ne__(self, item):
        return not self == item

    def _fill_with_matrix_dict(self, matrix_dict):
        """
        Create the internal score matrix from a dictionary that maps
        ``(symbol1, symbol2)`` tuples to scores.

        A :class:`KeyError` is raised, if the dictionary lacks a pairing
        of the two alphabets.
        """
        matrix = np.zeros(
            (len(self._alph1), len(self._alph2)), dtype=np.int32
        )
        # The symbols of the second alphabet are the same for each row
        # -> decode them only once
        symbols2 = [self._alph2.decode(j) for j in range(len(self._alph2))]
        for i in range(len(self._alph1)):
            sym1 = self._alph1.decode(i)
            for j, sym2 in enumerate(symbols2):
                matrix[i, j] = int(matrix_dict[sym1, sym2])
        self._matrix = matrix

    def get_alphabet1(self):
        """
        Get the first alphabet.

        Returns
        -------
        alphabet : Alphabet
            The first alphabet.
        """
        return self._alph1

    def get_alphabet2(self):
        """
        Get the second alphabet.

        Returns
        -------
        alphabet : Alphabet
            The second alphabet.
        """
        return self._alph2

    def score_matrix(self):
        """
        Get the 2-D :class:`ndarray` containing the score values.

        Returns
        -------
        matrix : ndarray, shape=(m,n), dtype=np.int32
            The symbol code indexed score matrix.
            The array is read-only.
        """
        return self._matrix

    def transpose(self):
        """
        Get a copy of this instance, where the alphabets are
        interchanged.

        Returns
        -------
        transposed : SubstitutionMatrix
            The transposed substitution matrix.
        """
        new_alph1 = self._alph2
        new_alph2 = self._alph1
        new_matrix = np.transpose(self._matrix)
        return SubstitutionMatrix(new_alph1, new_alph2, new_matrix)

    def is_symmetric(self):
        """
        Check whether the substitution matrix is symmetric,
        i.e. both alphabets are identical
        and the score matrix is symmetric.

        Returns
        -------
        is_symmetric : bool
            True, if both alphabets are identical and the score matrix
            is symmetric, false otherwise.
        """
        return self._alph1 == self._alph2 \
               and np.array_equal(self._matrix, np.transpose(self._matrix))

    def get_score_by_code(self, code1, code2):
        """
        Get the substitution score of two symbols,
        represented by their code.

        Parameters
        ----------
        code1, code2 : int
            Symbol codes of the two symbols to be aligned.

        Returns
        -------
        score : int
            The substitution / alignment score.
        """
        return self._matrix[code1, code2]

    def get_score(self, symbol1, symbol2):
        """
        Get the substitution score of two symbols.

        Parameters
        ----------
        symbol1, symbol2 : object
            Symbols to be aligned.

        Returns
        -------
        score : int
            The substitution / alignment score.
        """
        code1 = self._alph1.encode(symbol1)
        code2 = self._alph2.encode(symbol2)
        return self._matrix[code1, code2]

    def shape(self):
        """
        Get the shape (i.e. the length of both alphabets)
        of the substitution matrix.

        Returns
        -------
        shape : tuple
            Matrix shape.
        """
        return (len(self._alph1), len(self._alph2))

    def __str__(self):
        # Create matrix in NCBI format:
        # The header row lists the symbols of the second alphabet,
        # each following row starts with a symbol of the first alphabet
        header = " " + "".join(f" {symbol:>3}" for symbol in self._alph2)
        rows = [header]
        n_columns = len(self._alph2)
        for i, symbol in enumerate(self._alph1):
            scores = "".join(
                f" {int(self._matrix[i,j]):>3d}" for j in range(n_columns)
            )
            rows.append(f"{symbol:>1}" + scores)
        # Joining avoids a terminal line break
        return "\n".join(rows)

    @staticmethod
    def dict_from_str(string):
        """
        Create a matrix dictionary from a string in NCBI matrix format.

        Symbols of the first alphabet are taken from the left column,
        symbols of the second alphabet are taken from the top row.

        The keys of the dictionary consist of tuples containing the
        aligned symbols and the values are the corresponding scores.

        Parameters
        ----------
        string : str
            The matrix in NCBI format.
            Empty lines and lines starting with ``#`` are ignored.

        Returns
        -------
        matrix_dict : dict
            A dictionary representing the substitution matrix.
        """
        lines = [line.strip() for line in string.split("\n")]
        lines = [line for line in lines if len(line) != 0 and line[0] != "#"]
        symbols1 = [line.split()[0] for line in lines[1:]]
        symbols2 = lines[0].split()
        # Row 'i' of the table belongs to 'symbols1[i]' and column 'j'
        # belongs to 'symbols2[j]'
        # -> the table must NOT be transposed, otherwise asymmetric
        # matrices are read with swapped symbols and non-square
        # matrices cannot be read at all
        scores = np.array([line.split()[1:] for line in lines[1:]]).astype(int)

        matrix_dict = {}
        for i, sym1 in enumerate(symbols1):
            for j, sym2 in enumerate(symbols2):
                matrix_dict[(sym1, sym2)] = scores[i, j]
        return matrix_dict

    @staticmethod
    def dict_from_db(matrix_name):
        """
        Create a matrix dictionary from a valid matrix name in the
        internal matrix database.

        The keys of the dictionary consist of tuples containing the
        aligned symbols and the values are the corresponding scores.

        Parameters
        ----------
        matrix_name : str
            The name of the matrix, i.e. the name of the matrix file
            in the database directory without the ``.mat`` suffix.

        Returns
        -------
        matrix_dict : dict
            A dictionary representing the substitution matrix.
        """
        filename = os.path.join(
            SubstitutionMatrix._db_dir, matrix_name + ".mat"
        )
        with open(filename, "r") as f:
            return SubstitutionMatrix.dict_from_str(f.read())

    @staticmethod
    def list_db():
        """
        List all matrix names in the internal database.

        Returns
        -------
        db_list : list
            List of matrix names in the internal database.
        """
        # Only '.mat' files are matrices: Other directory entries
        # (e.g. cache directories) would give mangled names
        return [
            os.path.splitext(filename)[0]
            for filename in sorted(os.listdir(SubstitutionMatrix._db_dir))
            if filename.endswith(".mat")
        ]


    @staticmethod
    def std_protein_matrix():
        """
        Get the default :class:`SubstitutionMatrix` for protein sequence
        alignments, which is BLOSUM62.

        Returns
        -------
        matrix : SubstitutionMatrix
            Default matrix.
        """
        return _matrix_blosum62

    @staticmethod
    def std_nucleotide_matrix():
        """
        Get the default :class:`SubstitutionMatrix` for DNA sequence
        alignments.

        Returns
        -------
        matrix : SubstitutionMatrix
            Default matrix.
        """
        return _matrix_nuc
397
+
398
# Preformatted BLOSUM62 and NUC substitution matrix from NCBI
# Both matrices are loaded by name from the internal matrix database
# when this module is imported.
# They are the objects returned by
# 'SubstitutionMatrix.std_protein_matrix()' and
# 'SubstitutionMatrix.std_nucleotide_matrix()', respectively.
_matrix_blosum62 = SubstitutionMatrix(ProteinSequence.alphabet,
                                      ProteinSequence.alphabet,
                                      "BLOSUM62")
_matrix_nuc = SubstitutionMatrix(NucleotideSequence.alphabet_amb,
                                 NucleotideSequence.alphabet_amb,
                                 "NUC")
405
+