biotite 0.41.1__cp311-cp311-macosx_10_16_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biotite might be problematic. Click here for more details.

Files changed (340) hide show
  1. biotite/__init__.py +19 -0
  2. biotite/application/__init__.py +43 -0
  3. biotite/application/application.py +265 -0
  4. biotite/application/autodock/__init__.py +12 -0
  5. biotite/application/autodock/app.py +505 -0
  6. biotite/application/blast/__init__.py +14 -0
  7. biotite/application/blast/alignment.py +83 -0
  8. biotite/application/blast/webapp.py +421 -0
  9. biotite/application/clustalo/__init__.py +12 -0
  10. biotite/application/clustalo/app.py +238 -0
  11. biotite/application/dssp/__init__.py +12 -0
  12. biotite/application/dssp/app.py +152 -0
  13. biotite/application/localapp.py +306 -0
  14. biotite/application/mafft/__init__.py +12 -0
  15. biotite/application/mafft/app.py +122 -0
  16. biotite/application/msaapp.py +374 -0
  17. biotite/application/muscle/__init__.py +13 -0
  18. biotite/application/muscle/app3.py +254 -0
  19. biotite/application/muscle/app5.py +171 -0
  20. biotite/application/sra/__init__.py +18 -0
  21. biotite/application/sra/app.py +456 -0
  22. biotite/application/tantan/__init__.py +12 -0
  23. biotite/application/tantan/app.py +222 -0
  24. biotite/application/util.py +59 -0
  25. biotite/application/viennarna/__init__.py +18 -0
  26. biotite/application/viennarna/rnaalifold.py +304 -0
  27. biotite/application/viennarna/rnafold.py +269 -0
  28. biotite/application/viennarna/rnaplot.py +187 -0
  29. biotite/application/viennarna/util.py +72 -0
  30. biotite/application/webapp.py +77 -0
  31. biotite/copyable.py +71 -0
  32. biotite/database/__init__.py +23 -0
  33. biotite/database/entrez/__init__.py +15 -0
  34. biotite/database/entrez/check.py +61 -0
  35. biotite/database/entrez/dbnames.py +89 -0
  36. biotite/database/entrez/download.py +223 -0
  37. biotite/database/entrez/key.py +44 -0
  38. biotite/database/entrez/query.py +223 -0
  39. biotite/database/error.py +15 -0
  40. biotite/database/pubchem/__init__.py +21 -0
  41. biotite/database/pubchem/download.py +260 -0
  42. biotite/database/pubchem/error.py +20 -0
  43. biotite/database/pubchem/query.py +827 -0
  44. biotite/database/pubchem/throttle.py +99 -0
  45. biotite/database/rcsb/__init__.py +13 -0
  46. biotite/database/rcsb/download.py +167 -0
  47. biotite/database/rcsb/query.py +959 -0
  48. biotite/database/uniprot/__init__.py +13 -0
  49. biotite/database/uniprot/check.py +32 -0
  50. biotite/database/uniprot/download.py +134 -0
  51. biotite/database/uniprot/query.py +209 -0
  52. biotite/file.py +251 -0
  53. biotite/sequence/__init__.py +73 -0
  54. biotite/sequence/align/__init__.py +49 -0
  55. biotite/sequence/align/alignment.py +658 -0
  56. biotite/sequence/align/banded.cpython-311-darwin.so +0 -0
  57. biotite/sequence/align/banded.pyx +652 -0
  58. biotite/sequence/align/buckets.py +69 -0
  59. biotite/sequence/align/cigar.py +434 -0
  60. biotite/sequence/align/kmeralphabet.cpython-311-darwin.so +0 -0
  61. biotite/sequence/align/kmeralphabet.pyx +574 -0
  62. biotite/sequence/align/kmersimilarity.cpython-311-darwin.so +0 -0
  63. biotite/sequence/align/kmersimilarity.pyx +233 -0
  64. biotite/sequence/align/kmertable.cpython-311-darwin.so +0 -0
  65. biotite/sequence/align/kmertable.pyx +3400 -0
  66. biotite/sequence/align/localgapped.cpython-311-darwin.so +0 -0
  67. biotite/sequence/align/localgapped.pyx +892 -0
  68. biotite/sequence/align/localungapped.cpython-311-darwin.so +0 -0
  69. biotite/sequence/align/localungapped.pyx +279 -0
  70. biotite/sequence/align/matrix.py +405 -0
  71. biotite/sequence/align/matrix_data/BLOSUM100.mat +31 -0
  72. biotite/sequence/align/matrix_data/BLOSUM30.mat +31 -0
  73. biotite/sequence/align/matrix_data/BLOSUM35.mat +31 -0
  74. biotite/sequence/align/matrix_data/BLOSUM40.mat +31 -0
  75. biotite/sequence/align/matrix_data/BLOSUM45.mat +31 -0
  76. biotite/sequence/align/matrix_data/BLOSUM50.mat +31 -0
  77. biotite/sequence/align/matrix_data/BLOSUM50_13p.mat +25 -0
  78. biotite/sequence/align/matrix_data/BLOSUM50_14.3.mat +25 -0
  79. biotite/sequence/align/matrix_data/BLOSUM50_5.0.mat +25 -0
  80. biotite/sequence/align/matrix_data/BLOSUM55.mat +31 -0
  81. biotite/sequence/align/matrix_data/BLOSUM60.mat +31 -0
  82. biotite/sequence/align/matrix_data/BLOSUM62.mat +31 -0
  83. biotite/sequence/align/matrix_data/BLOSUM62_13p.mat +25 -0
  84. biotite/sequence/align/matrix_data/BLOSUM62_14.3.mat +25 -0
  85. biotite/sequence/align/matrix_data/BLOSUM62_5.0.mat +25 -0
  86. biotite/sequence/align/matrix_data/BLOSUM65.mat +31 -0
  87. biotite/sequence/align/matrix_data/BLOSUM70.mat +31 -0
  88. biotite/sequence/align/matrix_data/BLOSUM75.mat +31 -0
  89. biotite/sequence/align/matrix_data/BLOSUM80.mat +31 -0
  90. biotite/sequence/align/matrix_data/BLOSUM85.mat +31 -0
  91. biotite/sequence/align/matrix_data/BLOSUM90.mat +31 -0
  92. biotite/sequence/align/matrix_data/BLOSUMN.mat +31 -0
  93. biotite/sequence/align/matrix_data/CorBLOSUM49_5.0.mat +25 -0
  94. biotite/sequence/align/matrix_data/CorBLOSUM57_13p.mat +25 -0
  95. biotite/sequence/align/matrix_data/CorBLOSUM57_14.3.mat +25 -0
  96. biotite/sequence/align/matrix_data/CorBLOSUM61_5.0.mat +25 -0
  97. biotite/sequence/align/matrix_data/CorBLOSUM66_13p.mat +25 -0
  98. biotite/sequence/align/matrix_data/CorBLOSUM67_14.3.mat +25 -0
  99. biotite/sequence/align/matrix_data/DAYHOFF.mat +32 -0
  100. biotite/sequence/align/matrix_data/GONNET.mat +26 -0
  101. biotite/sequence/align/matrix_data/IDENTITY.mat +25 -0
  102. biotite/sequence/align/matrix_data/MATCH.mat +25 -0
  103. biotite/sequence/align/matrix_data/NUC.mat +25 -0
  104. biotite/sequence/align/matrix_data/PAM10.mat +34 -0
  105. biotite/sequence/align/matrix_data/PAM100.mat +34 -0
  106. biotite/sequence/align/matrix_data/PAM110.mat +34 -0
  107. biotite/sequence/align/matrix_data/PAM120.mat +34 -0
  108. biotite/sequence/align/matrix_data/PAM130.mat +34 -0
  109. biotite/sequence/align/matrix_data/PAM140.mat +34 -0
  110. biotite/sequence/align/matrix_data/PAM150.mat +34 -0
  111. biotite/sequence/align/matrix_data/PAM160.mat +34 -0
  112. biotite/sequence/align/matrix_data/PAM170.mat +34 -0
  113. biotite/sequence/align/matrix_data/PAM180.mat +34 -0
  114. biotite/sequence/align/matrix_data/PAM190.mat +34 -0
  115. biotite/sequence/align/matrix_data/PAM20.mat +34 -0
  116. biotite/sequence/align/matrix_data/PAM200.mat +34 -0
  117. biotite/sequence/align/matrix_data/PAM210.mat +34 -0
  118. biotite/sequence/align/matrix_data/PAM220.mat +34 -0
  119. biotite/sequence/align/matrix_data/PAM230.mat +34 -0
  120. biotite/sequence/align/matrix_data/PAM240.mat +34 -0
  121. biotite/sequence/align/matrix_data/PAM250.mat +34 -0
  122. biotite/sequence/align/matrix_data/PAM260.mat +34 -0
  123. biotite/sequence/align/matrix_data/PAM270.mat +34 -0
  124. biotite/sequence/align/matrix_data/PAM280.mat +34 -0
  125. biotite/sequence/align/matrix_data/PAM290.mat +34 -0
  126. biotite/sequence/align/matrix_data/PAM30.mat +34 -0
  127. biotite/sequence/align/matrix_data/PAM300.mat +34 -0
  128. biotite/sequence/align/matrix_data/PAM310.mat +34 -0
  129. biotite/sequence/align/matrix_data/PAM320.mat +34 -0
  130. biotite/sequence/align/matrix_data/PAM330.mat +34 -0
  131. biotite/sequence/align/matrix_data/PAM340.mat +34 -0
  132. biotite/sequence/align/matrix_data/PAM350.mat +34 -0
  133. biotite/sequence/align/matrix_data/PAM360.mat +34 -0
  134. biotite/sequence/align/matrix_data/PAM370.mat +34 -0
  135. biotite/sequence/align/matrix_data/PAM380.mat +34 -0
  136. biotite/sequence/align/matrix_data/PAM390.mat +34 -0
  137. biotite/sequence/align/matrix_data/PAM40.mat +34 -0
  138. biotite/sequence/align/matrix_data/PAM400.mat +34 -0
  139. biotite/sequence/align/matrix_data/PAM410.mat +34 -0
  140. biotite/sequence/align/matrix_data/PAM420.mat +34 -0
  141. biotite/sequence/align/matrix_data/PAM430.mat +34 -0
  142. biotite/sequence/align/matrix_data/PAM440.mat +34 -0
  143. biotite/sequence/align/matrix_data/PAM450.mat +34 -0
  144. biotite/sequence/align/matrix_data/PAM460.mat +34 -0
  145. biotite/sequence/align/matrix_data/PAM470.mat +34 -0
  146. biotite/sequence/align/matrix_data/PAM480.mat +34 -0
  147. biotite/sequence/align/matrix_data/PAM490.mat +34 -0
  148. biotite/sequence/align/matrix_data/PAM50.mat +34 -0
  149. biotite/sequence/align/matrix_data/PAM500.mat +34 -0
  150. biotite/sequence/align/matrix_data/PAM60.mat +34 -0
  151. biotite/sequence/align/matrix_data/PAM70.mat +34 -0
  152. biotite/sequence/align/matrix_data/PAM80.mat +34 -0
  153. biotite/sequence/align/matrix_data/PAM90.mat +34 -0
  154. biotite/sequence/align/matrix_data/RBLOSUM52_5.0.mat +25 -0
  155. biotite/sequence/align/matrix_data/RBLOSUM59_13p.mat +25 -0
  156. biotite/sequence/align/matrix_data/RBLOSUM59_14.3.mat +25 -0
  157. biotite/sequence/align/matrix_data/RBLOSUM64_5.0.mat +25 -0
  158. biotite/sequence/align/matrix_data/RBLOSUM69_13p.mat +25 -0
  159. biotite/sequence/align/matrix_data/RBLOSUM69_14.3.mat +25 -0
  160. biotite/sequence/align/multiple.cpython-311-darwin.so +0 -0
  161. biotite/sequence/align/multiple.pyx +620 -0
  162. biotite/sequence/align/pairwise.cpython-311-darwin.so +0 -0
  163. biotite/sequence/align/pairwise.pyx +587 -0
  164. biotite/sequence/align/permutation.cpython-311-darwin.so +0 -0
  165. biotite/sequence/align/permutation.pyx +305 -0
  166. biotite/sequence/align/primes.txt +821 -0
  167. biotite/sequence/align/selector.cpython-311-darwin.so +0 -0
  168. biotite/sequence/align/selector.pyx +956 -0
  169. biotite/sequence/align/statistics.py +265 -0
  170. biotite/sequence/align/tracetable.cpython-311-darwin.so +0 -0
  171. biotite/sequence/align/tracetable.pxd +64 -0
  172. biotite/sequence/align/tracetable.pyx +370 -0
  173. biotite/sequence/alphabet.py +566 -0
  174. biotite/sequence/annotation.py +829 -0
  175. biotite/sequence/codec.cpython-311-darwin.so +0 -0
  176. biotite/sequence/codec.pyx +155 -0
  177. biotite/sequence/codon.py +466 -0
  178. biotite/sequence/codon_tables.txt +202 -0
  179. biotite/sequence/graphics/__init__.py +33 -0
  180. biotite/sequence/graphics/alignment.py +1034 -0
  181. biotite/sequence/graphics/color_schemes/autumn.json +51 -0
  182. biotite/sequence/graphics/color_schemes/blossom.json +51 -0
  183. biotite/sequence/graphics/color_schemes/clustalx_dna.json +11 -0
  184. biotite/sequence/graphics/color_schemes/clustalx_protein.json +28 -0
  185. biotite/sequence/graphics/color_schemes/flower.json +51 -0
  186. biotite/sequence/graphics/color_schemes/jalview_buried.json +31 -0
  187. biotite/sequence/graphics/color_schemes/jalview_hydrophobicity.json +31 -0
  188. biotite/sequence/graphics/color_schemes/jalview_prop_helix.json +31 -0
  189. biotite/sequence/graphics/color_schemes/jalview_prop_strand.json +31 -0
  190. biotite/sequence/graphics/color_schemes/jalview_prop_turn.json +31 -0
  191. biotite/sequence/graphics/color_schemes/jalview_taylor.json +28 -0
  192. biotite/sequence/graphics/color_schemes/jalview_zappo.json +28 -0
  193. biotite/sequence/graphics/color_schemes/ocean.json +51 -0
  194. biotite/sequence/graphics/color_schemes/pb_flower.json +39 -0
  195. biotite/sequence/graphics/color_schemes/rainbow_dna.json +11 -0
  196. biotite/sequence/graphics/color_schemes/rainbow_protein.json +30 -0
  197. biotite/sequence/graphics/color_schemes/spring.json +51 -0
  198. biotite/sequence/graphics/color_schemes/sunset.json +51 -0
  199. biotite/sequence/graphics/color_schemes/wither.json +51 -0
  200. biotite/sequence/graphics/colorschemes.py +139 -0
  201. biotite/sequence/graphics/dendrogram.py +184 -0
  202. biotite/sequence/graphics/features.py +510 -0
  203. biotite/sequence/graphics/logo.py +110 -0
  204. biotite/sequence/graphics/plasmid.py +661 -0
  205. biotite/sequence/io/__init__.py +12 -0
  206. biotite/sequence/io/fasta/__init__.py +22 -0
  207. biotite/sequence/io/fasta/convert.py +273 -0
  208. biotite/sequence/io/fasta/file.py +278 -0
  209. biotite/sequence/io/fastq/__init__.py +19 -0
  210. biotite/sequence/io/fastq/convert.py +120 -0
  211. biotite/sequence/io/fastq/file.py +551 -0
  212. biotite/sequence/io/genbank/__init__.py +17 -0
  213. biotite/sequence/io/genbank/annotation.py +277 -0
  214. biotite/sequence/io/genbank/file.py +575 -0
  215. biotite/sequence/io/genbank/metadata.py +324 -0
  216. biotite/sequence/io/genbank/sequence.py +172 -0
  217. biotite/sequence/io/general.py +192 -0
  218. biotite/sequence/io/gff/__init__.py +26 -0
  219. biotite/sequence/io/gff/convert.py +133 -0
  220. biotite/sequence/io/gff/file.py +434 -0
  221. biotite/sequence/phylo/__init__.py +36 -0
  222. biotite/sequence/phylo/nj.cpython-311-darwin.so +0 -0
  223. biotite/sequence/phylo/nj.pyx +221 -0
  224. biotite/sequence/phylo/tree.cpython-311-darwin.so +0 -0
  225. biotite/sequence/phylo/tree.pyx +1169 -0
  226. biotite/sequence/phylo/upgma.cpython-311-darwin.so +0 -0
  227. biotite/sequence/phylo/upgma.pyx +164 -0
  228. biotite/sequence/profile.py +456 -0
  229. biotite/sequence/search.py +116 -0
  230. biotite/sequence/seqtypes.py +556 -0
  231. biotite/sequence/sequence.py +374 -0
  232. biotite/structure/__init__.py +132 -0
  233. biotite/structure/atoms.py +1455 -0
  234. biotite/structure/basepairs.py +1415 -0
  235. biotite/structure/bonds.cpython-311-darwin.so +0 -0
  236. biotite/structure/bonds.pyx +1933 -0
  237. biotite/structure/box.py +592 -0
  238. biotite/structure/celllist.cpython-311-darwin.so +0 -0
  239. biotite/structure/celllist.pyx +849 -0
  240. biotite/structure/chains.py +298 -0
  241. biotite/structure/charges.cpython-311-darwin.so +0 -0
  242. biotite/structure/charges.pyx +520 -0
  243. biotite/structure/compare.py +274 -0
  244. biotite/structure/density.py +114 -0
  245. biotite/structure/dotbracket.py +216 -0
  246. biotite/structure/error.py +31 -0
  247. biotite/structure/filter.py +585 -0
  248. biotite/structure/geometry.py +697 -0
  249. biotite/structure/graphics/__init__.py +13 -0
  250. biotite/structure/graphics/atoms.py +226 -0
  251. biotite/structure/graphics/rna.py +282 -0
  252. biotite/structure/hbond.py +409 -0
  253. biotite/structure/info/__init__.py +25 -0
  254. biotite/structure/info/atom_masses.json +121 -0
  255. biotite/structure/info/atoms.py +82 -0
  256. biotite/structure/info/bonds.py +145 -0
  257. biotite/structure/info/ccd/README.rst +8 -0
  258. biotite/structure/info/ccd/amino_acids.txt +1663 -0
  259. biotite/structure/info/ccd/carbohydrates.txt +1135 -0
  260. biotite/structure/info/ccd/components.bcif +0 -0
  261. biotite/structure/info/ccd/nucleotides.txt +798 -0
  262. biotite/structure/info/ccd.py +95 -0
  263. biotite/structure/info/groups.py +90 -0
  264. biotite/structure/info/masses.py +123 -0
  265. biotite/structure/info/misc.py +144 -0
  266. biotite/structure/info/radii.py +197 -0
  267. biotite/structure/info/standardize.py +196 -0
  268. biotite/structure/integrity.py +268 -0
  269. biotite/structure/io/__init__.py +30 -0
  270. biotite/structure/io/ctab.py +72 -0
  271. biotite/structure/io/dcd/__init__.py +13 -0
  272. biotite/structure/io/dcd/file.py +65 -0
  273. biotite/structure/io/general.py +257 -0
  274. biotite/structure/io/gro/__init__.py +14 -0
  275. biotite/structure/io/gro/file.py +343 -0
  276. biotite/structure/io/mmtf/__init__.py +21 -0
  277. biotite/structure/io/mmtf/assembly.py +214 -0
  278. biotite/structure/io/mmtf/convertarray.cpython-311-darwin.so +0 -0
  279. biotite/structure/io/mmtf/convertarray.pyx +341 -0
  280. biotite/structure/io/mmtf/convertfile.cpython-311-darwin.so +0 -0
  281. biotite/structure/io/mmtf/convertfile.pyx +501 -0
  282. biotite/structure/io/mmtf/decode.cpython-311-darwin.so +0 -0
  283. biotite/structure/io/mmtf/decode.pyx +152 -0
  284. biotite/structure/io/mmtf/encode.cpython-311-darwin.so +0 -0
  285. biotite/structure/io/mmtf/encode.pyx +183 -0
  286. biotite/structure/io/mmtf/file.py +233 -0
  287. biotite/structure/io/mol/__init__.py +20 -0
  288. biotite/structure/io/mol/convert.py +115 -0
  289. biotite/structure/io/mol/ctab.py +414 -0
  290. biotite/structure/io/mol/header.py +116 -0
  291. biotite/structure/io/mol/mol.py +193 -0
  292. biotite/structure/io/mol/sdf.py +916 -0
  293. biotite/structure/io/netcdf/__init__.py +13 -0
  294. biotite/structure/io/netcdf/file.py +63 -0
  295. biotite/structure/io/npz/__init__.py +20 -0
  296. biotite/structure/io/npz/file.py +152 -0
  297. biotite/structure/io/pdb/__init__.py +20 -0
  298. biotite/structure/io/pdb/convert.py +293 -0
  299. biotite/structure/io/pdb/file.py +1240 -0
  300. biotite/structure/io/pdb/hybrid36.cpython-311-darwin.so +0 -0
  301. biotite/structure/io/pdb/hybrid36.pyx +242 -0
  302. biotite/structure/io/pdbqt/__init__.py +15 -0
  303. biotite/structure/io/pdbqt/convert.py +107 -0
  304. biotite/structure/io/pdbqt/file.py +640 -0
  305. biotite/structure/io/pdbx/__init__.py +23 -0
  306. biotite/structure/io/pdbx/bcif.py +648 -0
  307. biotite/structure/io/pdbx/cif.py +1032 -0
  308. biotite/structure/io/pdbx/component.py +246 -0
  309. biotite/structure/io/pdbx/convert.py +1597 -0
  310. biotite/structure/io/pdbx/encoding.cpython-311-darwin.so +0 -0
  311. biotite/structure/io/pdbx/encoding.pyx +950 -0
  312. biotite/structure/io/pdbx/legacy.py +267 -0
  313. biotite/structure/io/tng/__init__.py +13 -0
  314. biotite/structure/io/tng/file.py +46 -0
  315. biotite/structure/io/trajfile.py +710 -0
  316. biotite/structure/io/trr/__init__.py +13 -0
  317. biotite/structure/io/trr/file.py +46 -0
  318. biotite/structure/io/xtc/__init__.py +13 -0
  319. biotite/structure/io/xtc/file.py +46 -0
  320. biotite/structure/mechanics.py +75 -0
  321. biotite/structure/molecules.py +353 -0
  322. biotite/structure/pseudoknots.py +642 -0
  323. biotite/structure/rdf.py +243 -0
  324. biotite/structure/repair.py +253 -0
  325. biotite/structure/residues.py +562 -0
  326. biotite/structure/resutil.py +178 -0
  327. biotite/structure/sasa.cpython-311-darwin.so +0 -0
  328. biotite/structure/sasa.pyx +322 -0
  329. biotite/structure/sequence.py +112 -0
  330. biotite/structure/sse.py +327 -0
  331. biotite/structure/superimpose.py +727 -0
  332. biotite/structure/transform.py +504 -0
  333. biotite/structure/util.py +98 -0
  334. biotite/temp.py +86 -0
  335. biotite/version.py +16 -0
  336. biotite/visualize.py +251 -0
  337. biotite-0.41.1.dist-info/METADATA +187 -0
  338. biotite-0.41.1.dist-info/RECORD +340 -0
  339. biotite-0.41.1.dist-info/WHEEL +4 -0
  340. biotite-0.41.1.dist-info/licenses/LICENSE.rst +30 -0
@@ -0,0 +1,116 @@
1
+ # This source code is part of the Biotite package and is distributed
2
+ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
3
+ # information.
4
+
5
+ __name__ = "biotite.sequence"
6
+ __author__ = "Patrick Kunzmann"
7
+ __all__ = ["find_subsequence", "find_symbol", "find_symbol_first",
8
+ "find_symbol_last"]
9
+
10
+ import numpy as np
11
+
12
+
13
def find_subsequence(sequence, query):
    """
    Find a subsequence in a sequence.

    Parameters
    ----------
    sequence : Sequence
        The sequence to find the subsequence in.
        Its alphabet must equal or extend the `query` alphabet.
    query : Sequence
        The potential subsequence.

    Returns
    -------
    match_indices : ndarray, dtype=int
        The starting indices in `sequence`, where `query` has been
        found. The array is empty if no match has been found.

    Raises
    ------
    ValueError
        If the `sequence` alphabet does not extend the `query`
        alphabet.

    Examples
    --------

    >>> main_seq = NucleotideSequence("ACTGAATGA")
    >>> sub_seq = NucleotideSequence("TGA")
    >>> print(find_subsequence(main_seq, sub_seq))
    [2 6]

    """
    if not sequence.get_alphabet().extends(query.get_alphabet()):
        raise ValueError(
            "The alphabet of the sequence does not extend "
            "the alphabet of the query"
        )
    query_code = query.code
    frame_size = len(query)
    # Slide a window of the query length over the sequence code and
    # keep every start position where the window equals the query code
    match_indices = [
        i for i in range(len(sequence) - frame_size + 1)
        if np.array_equal(query_code, sequence.code[i : i + frame_size])
    ]
    # Explicit integer dtype: an empty list would otherwise be
    # converted into a float array, although the values are indices
    return np.array(match_indices, dtype=int)
54
+
55
def find_symbol(sequence, symbol):
    """
    Locate all positions of a symbol in a sequence.

    Parameters
    ----------
    sequence : Sequence
        The sequence to be searched.
    symbol : object
        The symbol to look for in `sequence`.

    Returns
    -------
    match_indices : ndarray
        The indices in `sequence`, where `symbol` occurs.
    """
    # Compare on the level of symbol codes instead of symbols
    symbol_code = sequence.get_alphabet().encode(symbol)
    is_match = sequence.code == symbol_code
    return np.nonzero(is_match)[0]
73
+
74
def find_symbol_first(sequence, symbol):
    """
    Find the first occurrence of a symbol in a sequence.

    Parameters
    ----------
    sequence : Sequence
        The sequence to be searched.
    symbol : object
        The symbol to look for in `sequence`.

    Returns
    -------
    first_index : int
        The lowest index at which `symbol` occurs in `sequence`.
        If `symbol` does not occur in `sequence`, -1 is returned.
    """
    hits = find_symbol(sequence, symbol)
    # -1 signals that the symbol is absent
    return np.min(hits) if len(hits) > 0 else -1
95
+
96
def find_symbol_last(sequence, symbol):
    """
    Find the last occurrence of a symbol in a sequence.

    Parameters
    ----------
    sequence : Sequence
        The sequence to be searched.
    symbol : object
        The symbol to look for in `sequence`.

    Returns
    -------
    last_index : int
        The highest index at which `symbol` occurs in `sequence`.
        If `symbol` does not occur in `sequence`, -1 is returned.
    """
    hits = find_symbol(sequence, symbol)
    # -1 signals that the symbol is absent
    return np.max(hits) if len(hits) > 0 else -1
@@ -0,0 +1,556 @@
1
+ # This source code is part of the Biotite package and is distributed
2
+ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
3
+ # information.
4
+
5
+ __name__ = "biotite.sequence"
6
+ __author__ = "Patrick Kunzmann", "Thomas Nevolianis"
7
+ __all__ = ["GeneralSequence", "NucleotideSequence", "ProteinSequence"]
8
+
9
+ from .sequence import Sequence
10
+ from .alphabet import LetterAlphabet, AlphabetError, AlphabetMapper
11
+ import numpy as np
12
+ import copy
13
+
14
+
15
class GeneralSequence(Sequence):
    """
    A :class:`Sequence` whose :class:`Alphabet` is supplied at
    construction time, so that a custom alphabet can be used without
    the need to subclass :class:`Sequence`.

    Parameters
    ----------
    alphabet : Alphabet
        The alphabet of this sequence.
    sequence : iterable object, optional
        The symbols, the :class:`Sequence` is initialized with.
        For alphabets containing single letter strings, this parameter
        may also be a :class:`str` object.
        By default the sequence is empty.
    """

    def __init__(self, alphabet, sequence=()):
        # Kept ahead of the base class initializer; presumably the
        # initializer needs the alphabet to encode `sequence`
        # (confirm in Sequence.__init__)
        self._alphabet = alphabet
        super().__init__(sequence)

    def __repr__(self):
        """Represent GeneralSequence as a string for debugging."""
        symbol_reprs = ", ".join(map(repr, self.symbols))
        return (
            f"GeneralSequence(Alphabet({self._alphabet}), "
            f"[{symbol_reprs}])"
        )

    def __copy_create__(self):
        # A new, empty sequence that shares the alphabet of this one
        return GeneralSequence(self._alphabet)

    def get_alphabet(self):
        return self._alphabet

    def as_type(self, sequence):
        """
        Convert the :class:`GeneralSequence` into a sequence of another
        :class:`Sequence` type.

        The sequence code of the given sequence is simply overwritten
        with the sequence code of this object.

        Parameters
        ----------
        sequence : Sequence
            The `Sequence` that receives the sequence code of this
            object.
            Its alphabet must equal or extend the alphabet of this
            object.

        Returns
        -------
        sequence : Sequence
            The input `sequence` with replaced sequence code.

        Raises
        ------
        AlphabetError
            If the :class:`Alphabet` of the input `sequence` does
            not extend the :class:`Alphabet` of this sequence.
        """
        target_alphabet = sequence.get_alphabet()
        if target_alphabet.extends(self._alphabet):
            # The input object itself is modified and handed back
            sequence.code = self.code
            return sequence
        raise AlphabetError(
            f"The alphabet of '{type(sequence).__name__}' "
            f"is not compatible with the alphabet of this sequence"
        )
80
+
81
+ class NucleotideSequence(Sequence):
82
+ """
83
+ Representation of a nucleotide sequence (DNA or RNA).
84
+
85
+ This class may have one of two different alphabets:
86
+ :attr:`unambiguous_alphabet()` contains only the unambiguous DNA
87
+ letters 'A', 'C', 'G' and 'T'.
88
+ :attr:`ambiguous_alphabet()` uses an extended alphabet for ambiguous
89
+ letters.
90
+
91
+ Parameters
92
+ ----------
93
+ sequence : iterable object, optional
94
+ The initial DNA sequence. This may either be a list or a string.
95
+ May take upper or lower case letters.
96
+ By default the sequence is empty.
97
+ ambiguous : bool, optional
98
+ If true, the ambiguous alphabet is used. By default the
99
+ object tries to use the unambiguous alphabet. If this fails due
100
+ ambiguous letters in the sequence, the ambiguous alphabet
101
+ is used.
102
+ """
103
+
104
+ alphabet_unamb = LetterAlphabet(["A","C","G","T"])
105
+ alphabet_amb = LetterAlphabet(
106
+ ["A","C","G","T","R","Y","W","S",
107
+ "M","K","H","B","V","D","N"]
108
+ )
109
+
110
+ compl_symbol_dict = {"A" : "T",
111
+ "C" : "G",
112
+ "G" : "C",
113
+ "T" : "A",
114
+ "M" : "K",
115
+ "R" : "Y",
116
+ "W" : "W",
117
+ "S" : "S",
118
+ "Y" : "R",
119
+ "K" : "M",
120
+ "V" : "B",
121
+ "H" : "D",
122
+ "D" : "H",
123
+ "B" : "V",
124
+ "N" : "N"}
125
+ # List comprehension does not work in this scope
126
+ _compl_symbols = []
127
+ for _symbol in alphabet_amb.get_symbols():
128
+ _compl_symbols.append(compl_symbol_dict[_symbol])
129
+ _compl_alphabet_unamb = LetterAlphabet(_compl_symbols)
130
+ _compl_mapper = AlphabetMapper(_compl_alphabet_unamb, alphabet_amb)
131
+
132
+ def __init__(self, sequence=[], ambiguous=None):
133
+ if isinstance(sequence, str):
134
+ sequence = sequence.upper()
135
+ else:
136
+ sequence = [symbol.upper() for symbol in sequence]
137
+ if ambiguous is None:
138
+ try:
139
+ self._alphabet = NucleotideSequence.alphabet_unamb
140
+ seq_code = self._alphabet.encode_multiple(sequence)
141
+ except AlphabetError:
142
+ self._alphabet = NucleotideSequence.alphabet_amb
143
+ seq_code = self._alphabet.encode_multiple(sequence)
144
+ elif not ambiguous:
145
+ self._alphabet = NucleotideSequence.alphabet_unamb
146
+ seq_code = self._alphabet.encode_multiple(sequence)
147
+ else:
148
+ self._alphabet = NucleotideSequence.alphabet_amb
149
+ seq_code = self._alphabet.encode_multiple(sequence)
150
+ super().__init__()
151
+ self.code = seq_code
152
+
153
+ def __repr__(self):
154
+ """Represent NucleotideSequence as a string for debugging."""
155
+ if self._alphabet == NucleotideSequence.alphabet_amb:
156
+ ambiguous = True
157
+ else:
158
+ ambiguous = False
159
+ return f'NucleotideSequence("{"".join(self.symbols)}", ambiguous={ambiguous})'
160
+
161
+ def __copy_create__(self):
162
+ if self._alphabet == NucleotideSequence.alphabet_amb:
163
+ seq_copy = NucleotideSequence(ambiguous=True)
164
+ else:
165
+ seq_copy = NucleotideSequence(ambiguous=False)
166
+ return seq_copy
167
+
168
+ def get_alphabet(self):
169
+ return self._alphabet
170
+
171
+ def complement(self):
172
+ """
173
+ Get the complement nucleotide sequence.
174
+
175
+ Returns
176
+ -------
177
+ complement : NucleotideSequence
178
+ The complement sequence.
179
+
180
+ Examples
181
+ --------
182
+
183
+ >>> dna_seq = NucleotideSequence("ACGCTT")
184
+ >>> print(dna_seq.complement())
185
+ TGCGAA
186
+ >>> print(dna_seq.reverse().complement())
187
+ AAGCGT
188
+
189
+ """
190
+ # Interpreting the sequence code of this object in the
191
+ # complementary alphabet gives the complementary symbols
192
+ # In order to get the complementary symbols in the original
193
+ # alphabet, the sequence code is mapped from the complementary
194
+ # alphabet into the original alphabet
195
+ compl_code = NucleotideSequence._compl_mapper[self.code]
196
+ return self.copy(compl_code)
197
+
198
+ def translate(self, complete=False, codon_table=None, met_start=False):
199
+ """
200
+ Translate the nucleotide sequence into a protein sequence.
201
+
202
+ If `complete` is true, the entire sequence is translated,
203
+ beginning with the first codon and ending with the last codon,
204
+ even if stop codons occur during the translation.
205
+
206
+ Otherwise this method returns possible ORFs in the
207
+ sequence, even if no stop codon occurs in an ORF.
208
+
209
+ Parameters
210
+ ----------
211
+ complete : bool, optional
212
+ If true, the complete sequence is translated. In this case
213
+ the sequence length must be a multiple of 3.
214
+ Otherwise all ORFs are translated. (Default: False)
215
+ codon_table : CodonTable, optional
216
+ The codon table to be used. By default the default table
217
+ will be used
218
+ (NCBI "Standard" table with "ATG" as single start codon).
219
+ met_start : bool, optional
220
+ If true, the translation starts always with a 'methionine',
221
+ even if the start codon codes for another amino acid.
222
+ Otherwise the translation starts with the amino acid
223
+ the codon codes for. Only applies, if `complete` is false.
224
+ (Default: False)
225
+
226
+ Returns
227
+ -------
228
+ protein : ProteinSequence or list of ProteinSequence
229
+ The translated protein sequence. If `complete` is true,
230
+ only a single :class:`ProteinSequence` is returned. Otherwise
231
+ a list of :class:`ProteinSequence` is returned, which contains
232
+ every ORF.
233
+ pos : list of tuple (int, int)
234
+ Is only returned if `complete` is false. The list contains
235
+ a tuple for each ORF.
236
+ The first element of the tuple is the index of the
237
+ :class:`NucleotideSequence`, where the translation starts.
238
+ The second element is the exclusive stop index, it
239
+ represents the first nucleotide in the
240
+ :class:`NucleotideSequence` after a stop codon.
241
+
242
+ Examples
243
+ --------
244
+
245
+ >>> dna_seq = NucleotideSequence("AATGATGCTATAGAT")
246
+ >>> prot_seq = dna_seq.translate(complete=True)
247
+ >>> print(prot_seq)
248
+ NDAID
249
+ >>> prot_seqs, pos = dna_seq.translate(complete=False)
250
+ >>> for seq in prot_seqs:
251
+ ... print(seq)
252
+ MML*
253
+ ML*
254
+
255
+ """
256
+ if self._alphabet != NucleotideSequence.alphabet_unamb:
257
+ raise AlphabetError("Translation requires unambiguous alphabet")
258
+ # Determine codon_table
259
+ if codon_table is None:
260
+ # Import at this position to avoid circular import
261
+ from .codon import CodonTable
262
+ codon_table = CodonTable.default_table()
263
+
264
+ if complete:
265
+ if len(self) % 3 != 0:
266
+ raise ValueError("Sequence length needs to be a multiple of 3 "
267
+ "for complete translation")
268
+ # Reshape code into (n,3), with n being the amount of codons
269
+ codons = self.code.reshape(-1, 3)
270
+ protein_seq = ProteinSequence()
271
+ protein_seq.code = codon_table.map_codon_codes(codons)
272
+ return protein_seq
273
+
274
+ else:
275
+ stop_code = ProteinSequence.alphabet.encode("*")
276
+ met_code = ProteinSequence.alphabet.encode("M")
277
+ protein_seqs = []
278
+ pos = []
279
+ code = self.code
280
+ # Create all three frames
281
+ for shift in range(3):
282
+ # The frame length is always a multiple of 3
283
+ # If there is a trailing partial codon, remove it
284
+ frame_length = ((len(code) - shift) // 3) * 3
285
+ frame = code[shift : shift+frame_length]
286
+ # Reshape frame into (n,3), with n being the amount of codons
287
+ frame_codons = frame.reshape(-1, 3)
288
+ # At first, translate frame completely
289
+ protein_code = codon_table.map_codon_codes(frame_codons)
290
+ # Iterate over all start codons in this frame
291
+ starts = np.where(codon_table.is_start_codon(frame_codons))[0]
292
+ for start_i in starts:
293
+ # Protein sequence beginning from start codon
294
+ code_from_start = protein_code[start_i:]
295
+ # Get all stop codon positions
296
+ # relative to 'code_from_start'
297
+ stops = np.where(code_from_start == stop_code)[0]
298
+ # Find first stop codon after start codon
299
+ # Include stop -> stops[0] + 1
300
+ stop_i = stops[0] + 1 if len(stops) > 0 \
301
+ else len(code_from_start)
302
+ code_from_start_to_stop = code_from_start[:stop_i]
303
+ prot_seq = ProteinSequence()
304
+ if met_start:
305
+ # Copy as the slice is edited
306
+ prot_seq.code = code_from_start_to_stop.copy()
307
+ prot_seq.code[0] = met_code
308
+ else:
309
+ prot_seq.code = code_from_start_to_stop
310
+ protein_seqs.append(prot_seq)
311
+ # Codon indices are transformed
312
+ # to nucleotide sequence indices
313
+ pos.append((shift + start_i*3, shift + (start_i+stop_i)*3))
314
+ # Sort by start position
315
+ order = np.argsort([start for start, stop in pos])
316
+ pos = [pos[i] for i in order]
317
+ protein_seqs = [protein_seqs[i] for i in order]
318
+ return protein_seqs, pos
319
+
320
@staticmethod
def unambiguous_alphabet():
    """
    Provide the nucleotide alphabet that is limited to the
    unambiguous symbols ``A``, ``C``, ``G`` and ``T``.

    Returns
    -------
    alphabet : LetterAlphabet
        The unambiguous nucleotide alphabet.
    """
    unambiguous = NucleotideSequence.alphabet_unamb
    return unambiguous
332
+
333
@staticmethod
def ambiguous_alphabet():
    """
    Provide the nucleotide alphabet that comprises the symbols
    ``A``, ``C``, ``G`` and ``T`` as well as the symbols that
    describe ambiguous combinations of these.

    Returns
    -------
    alphabet : LetterAlphabet
        The ambiguous nucleotide alphabet.
    """
    ambiguous = NucleotideSequence.alphabet_amb
    return ambiguous
346
+
347
+
348
class ProteinSequence(Sequence):
    """
    Representation of a protein sequence.

    Furthermore this class offers a conversion of amino acids from
    3-letter code into 1-letter code and vice versa.

    Parameters
    ----------
    sequence : iterable object, optional
        The initial protein sequence. This may either be a list or a
        string. May take upper or lower case letters. If a list is
        given, the list elements can be 1-letter or 3-letter amino acid
        representations. By default the sequence is empty.

    Notes
    -----
    The :class:`Alphabet` of this :class:`Sequence` class does not
    support selenocysteine.
    Please convert selenocysteine (``U``) into cysteine (``C``)
    or use a custom :class:`Sequence` class, if the differentiation is
    necessary.
    """

    _codon_table = None

    alphabet = LetterAlphabet(["A","C","D","E","F","G","H","I","K","L",
                               "M","N","P","Q","R","S","T","V","W","Y",
                               "B","Z","X","*"])

    # Masses are taken from
    # https://web.expasy.org/findmod/findmod_masses.html#AA
    # Both tables list the residue masses in the same order as the
    # symbols of `alphabet`; symbols without a defined mass
    # (``B``, ``Z``, ``X`` and ``*``) are represented by NaN

    _mol_weight_average = np.array([
        71.0788,  # A
        103.1388, # C
        115.0886, # D
        129.1155, # E
        147.1766, # F
        57.0519,  # G
        137.1411, # H
        113.1594, # I
        128.1741, # K
        113.1594, # L
        131.1926, # M
        114.1038, # N
        97.1167,  # P
        128.1307, # Q
        156.1875, # R
        87.0782,  # S
        101.1051, # T
        99.1326,  # V
        186.2132, # W
        163.1760, # Y
        np.nan,   # B
        np.nan,   # Z
        np.nan,   # X
        np.nan,   # *
    ])

    _mol_weight_monoisotopic = np.array([
        71.03711,  # A
        103.00919, # C
        115.02694, # D
        129.04259, # E
        147.06841, # F
        57.02146,  # G
        137.05891, # H
        113.08406, # I
        128.09496, # K
        113.08406, # L
        131.04049, # M
        114.04293, # N
        97.05276,  # P
        128.05858, # Q
        156.10111, # R
        87.03203,  # S
        101.04768, # T
        99.06841,  # V
        186.07931, # W
        163.06333, # Y
        np.nan,    # B
        np.nan,    # Z
        np.nan,    # X
        np.nan,    # *
    ])

    # Mass of one water molecule in Da, which is added once to the
    # summed residue masses.
    # The monoisotopic value must be paired with the monoisotopic
    # residue masses, otherwise the result mixes both mass scales
    _water_weight_average = 18.015
    _water_weight_monoisotopic = 18.01056

    _dict_1to3 = {"A" : "ALA",
                  "C" : "CYS",
                  "D" : "ASP",
                  "E" : "GLU",
                  "F" : "PHE",
                  "G" : "GLY",
                  "H" : "HIS",
                  "I" : "ILE",
                  "K" : "LYS",
                  "L" : "LEU",
                  "M" : "MET",
                  "N" : "ASN",
                  "P" : "PRO",
                  "Q" : "GLN",
                  "R" : "ARG",
                  "S" : "SER",
                  "T" : "THR",
                  "V" : "VAL",
                  "W" : "TRP",
                  "Y" : "TYR",
                  "B" : "ASX",
                  "Z" : "GLX",
                  "X" : "UNK",
                  "*" : " * "}

    # The reverse mapping is derived from the forward mapping
    _dict_3to1 = {}
    for _key, _value in _dict_1to3.items():
        _dict_3to1[_value] = _key
    # Residues without an own 1-letter symbol in the alphabet are
    # mapped to a supported symbol
    _dict_3to1["SEC"] = "C"
    _dict_3to1["MSE"] = "M"

    def __init__(self, sequence=()):
        """
        Create the sequence from 1-letter and/or 3-letter symbols.

        Parameters
        ----------
        sequence : iterable object, optional
            The symbols of the sequence, in upper or lower case.
            Symbols of length 3 are interpreted as 3-letter codes.

        Raises
        ------
        KeyError
            If a symbol of length 3 is not a known 3-letter code.
        """
        dict_3to1 = ProteinSequence._dict_3to1
        # Convert 3-letter codes to single letter codes,
        # if list contains 3-letter codes
        sequence = [dict_3to1[symbol.upper()] if len(symbol) == 3
                    else symbol.upper() for symbol in sequence]
        super().__init__(sequence)

    def __repr__(self):
        """Represent ProteinSequence as a string for debugging."""
        return f'ProteinSequence("{"".join(self.symbols)}")'

    def get_alphabet(self):
        """
        Get the alphabet shared by all protein sequences.

        Returns
        -------
        alphabet : LetterAlphabet
            The class-level :attr:`ProteinSequence.alphabet`.
        """
        return ProteinSequence.alphabet

    def remove_stops(self):
        """
        Remove *stop signals* from the sequence.

        Returns
        -------
        no_stop : ProteinSequence
            A copy of this sequence without stop signals.
        """
        stop_code = ProteinSequence.alphabet.encode("*")
        no_stop = self.copy()
        seq_code = no_stop.code
        # Boolean mask keeps every position that is not a stop symbol
        no_stop.code = seq_code[seq_code != stop_code]
        return no_stop

    @staticmethod
    def convert_letter_3to1(symbol):
        """
        Convert a 3-letter to a 1-letter amino acid representation.

        Parameters
        ----------
        symbol : string
            3-letter amino acid representation.
            The case of the letters is ignored.

        Returns
        -------
        convert : string
            1-letter amino acid representation.

        Raises
        ------
        KeyError
            If `symbol` is not a known 3-letter code.
        """
        return ProteinSequence._dict_3to1[symbol.upper()]

    @staticmethod
    def convert_letter_1to3(symbol):
        """
        Convert a 1-letter to a 3-letter amino acid representation.

        Parameters
        ----------
        symbol : string
            1-letter amino acid representation.
            The case of the letter is ignored.

        Returns
        -------
        convert : string
            3-letter amino acid representation.

        Raises
        ------
        KeyError
            If `symbol` is not a known 1-letter code.
        """
        return ProteinSequence._dict_1to3[symbol.upper()]

    def get_molecular_weight(self, monoisotopic=False):
        """
        Calculate the molecular weight of this protein.

        The molecular weight is calculated by the addition of the
        masses of the amino acid residues in the protein and the mass
        of one water molecule.
        Either average isotopic masses or monoisotopic masses are
        used consistently for both, the residues and the water
        molecule.

        Parameters
        ----------
        monoisotopic : bool, optional
            If true, monoisotopic masses are used.
            By default, average isotopic masses are used.

        Returns
        -------
        weight : float
            Molecular weight of the protein represented by the sequence.
            Molecular weight values are given in Dalton (Da).

        Raises
        ------
        ValueError
            If the sequence contains a symbol without a defined mass
            (``B``, ``Z``, ``X`` or ``*``).
        """
        if monoisotopic:
            residue_weights = self._mol_weight_monoisotopic
            water_weight = ProteinSequence._water_weight_monoisotopic
        else:
            residue_weights = self._mol_weight_average
            water_weight = ProteinSequence._water_weight_average
        weight = np.sum(residue_weights[self.code]) + water_weight

        # A single NaN entry propagates through the sum
        if np.isnan(weight):
            raise ValueError(
                "Sequence contains ambiguous amino acids, "
                "cannot calculate weight"
            )
        return weight