biotite 0.41.1__cp312-cp312-macosx_10_16_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biotite might be problematic. Click here for more details.

Files changed (340) hide show
  1. biotite/__init__.py +19 -0
  2. biotite/application/__init__.py +43 -0
  3. biotite/application/application.py +265 -0
  4. biotite/application/autodock/__init__.py +12 -0
  5. biotite/application/autodock/app.py +505 -0
  6. biotite/application/blast/__init__.py +14 -0
  7. biotite/application/blast/alignment.py +83 -0
  8. biotite/application/blast/webapp.py +421 -0
  9. biotite/application/clustalo/__init__.py +12 -0
  10. biotite/application/clustalo/app.py +238 -0
  11. biotite/application/dssp/__init__.py +12 -0
  12. biotite/application/dssp/app.py +152 -0
  13. biotite/application/localapp.py +306 -0
  14. biotite/application/mafft/__init__.py +12 -0
  15. biotite/application/mafft/app.py +122 -0
  16. biotite/application/msaapp.py +374 -0
  17. biotite/application/muscle/__init__.py +13 -0
  18. biotite/application/muscle/app3.py +254 -0
  19. biotite/application/muscle/app5.py +171 -0
  20. biotite/application/sra/__init__.py +18 -0
  21. biotite/application/sra/app.py +456 -0
  22. biotite/application/tantan/__init__.py +12 -0
  23. biotite/application/tantan/app.py +222 -0
  24. biotite/application/util.py +59 -0
  25. biotite/application/viennarna/__init__.py +18 -0
  26. biotite/application/viennarna/rnaalifold.py +304 -0
  27. biotite/application/viennarna/rnafold.py +269 -0
  28. biotite/application/viennarna/rnaplot.py +187 -0
  29. biotite/application/viennarna/util.py +72 -0
  30. biotite/application/webapp.py +77 -0
  31. biotite/copyable.py +71 -0
  32. biotite/database/__init__.py +23 -0
  33. biotite/database/entrez/__init__.py +15 -0
  34. biotite/database/entrez/check.py +61 -0
  35. biotite/database/entrez/dbnames.py +89 -0
  36. biotite/database/entrez/download.py +223 -0
  37. biotite/database/entrez/key.py +44 -0
  38. biotite/database/entrez/query.py +223 -0
  39. biotite/database/error.py +15 -0
  40. biotite/database/pubchem/__init__.py +21 -0
  41. biotite/database/pubchem/download.py +260 -0
  42. biotite/database/pubchem/error.py +20 -0
  43. biotite/database/pubchem/query.py +827 -0
  44. biotite/database/pubchem/throttle.py +99 -0
  45. biotite/database/rcsb/__init__.py +13 -0
  46. biotite/database/rcsb/download.py +167 -0
  47. biotite/database/rcsb/query.py +959 -0
  48. biotite/database/uniprot/__init__.py +13 -0
  49. biotite/database/uniprot/check.py +32 -0
  50. biotite/database/uniprot/download.py +134 -0
  51. biotite/database/uniprot/query.py +209 -0
  52. biotite/file.py +251 -0
  53. biotite/sequence/__init__.py +73 -0
  54. biotite/sequence/align/__init__.py +49 -0
  55. biotite/sequence/align/alignment.py +658 -0
  56. biotite/sequence/align/banded.cpython-312-darwin.so +0 -0
  57. biotite/sequence/align/banded.pyx +652 -0
  58. biotite/sequence/align/buckets.py +69 -0
  59. biotite/sequence/align/cigar.py +434 -0
  60. biotite/sequence/align/kmeralphabet.cpython-312-darwin.so +0 -0
  61. biotite/sequence/align/kmeralphabet.pyx +574 -0
  62. biotite/sequence/align/kmersimilarity.cpython-312-darwin.so +0 -0
  63. biotite/sequence/align/kmersimilarity.pyx +233 -0
  64. biotite/sequence/align/kmertable.cpython-312-darwin.so +0 -0
  65. biotite/sequence/align/kmertable.pyx +3400 -0
  66. biotite/sequence/align/localgapped.cpython-312-darwin.so +0 -0
  67. biotite/sequence/align/localgapped.pyx +892 -0
  68. biotite/sequence/align/localungapped.cpython-312-darwin.so +0 -0
  69. biotite/sequence/align/localungapped.pyx +279 -0
  70. biotite/sequence/align/matrix.py +405 -0
  71. biotite/sequence/align/matrix_data/BLOSUM100.mat +31 -0
  72. biotite/sequence/align/matrix_data/BLOSUM30.mat +31 -0
  73. biotite/sequence/align/matrix_data/BLOSUM35.mat +31 -0
  74. biotite/sequence/align/matrix_data/BLOSUM40.mat +31 -0
  75. biotite/sequence/align/matrix_data/BLOSUM45.mat +31 -0
  76. biotite/sequence/align/matrix_data/BLOSUM50.mat +31 -0
  77. biotite/sequence/align/matrix_data/BLOSUM50_13p.mat +25 -0
  78. biotite/sequence/align/matrix_data/BLOSUM50_14.3.mat +25 -0
  79. biotite/sequence/align/matrix_data/BLOSUM50_5.0.mat +25 -0
  80. biotite/sequence/align/matrix_data/BLOSUM55.mat +31 -0
  81. biotite/sequence/align/matrix_data/BLOSUM60.mat +31 -0
  82. biotite/sequence/align/matrix_data/BLOSUM62.mat +31 -0
  83. biotite/sequence/align/matrix_data/BLOSUM62_13p.mat +25 -0
  84. biotite/sequence/align/matrix_data/BLOSUM62_14.3.mat +25 -0
  85. biotite/sequence/align/matrix_data/BLOSUM62_5.0.mat +25 -0
  86. biotite/sequence/align/matrix_data/BLOSUM65.mat +31 -0
  87. biotite/sequence/align/matrix_data/BLOSUM70.mat +31 -0
  88. biotite/sequence/align/matrix_data/BLOSUM75.mat +31 -0
  89. biotite/sequence/align/matrix_data/BLOSUM80.mat +31 -0
  90. biotite/sequence/align/matrix_data/BLOSUM85.mat +31 -0
  91. biotite/sequence/align/matrix_data/BLOSUM90.mat +31 -0
  92. biotite/sequence/align/matrix_data/BLOSUMN.mat +31 -0
  93. biotite/sequence/align/matrix_data/CorBLOSUM49_5.0.mat +25 -0
  94. biotite/sequence/align/matrix_data/CorBLOSUM57_13p.mat +25 -0
  95. biotite/sequence/align/matrix_data/CorBLOSUM57_14.3.mat +25 -0
  96. biotite/sequence/align/matrix_data/CorBLOSUM61_5.0.mat +25 -0
  97. biotite/sequence/align/matrix_data/CorBLOSUM66_13p.mat +25 -0
  98. biotite/sequence/align/matrix_data/CorBLOSUM67_14.3.mat +25 -0
  99. biotite/sequence/align/matrix_data/DAYHOFF.mat +32 -0
  100. biotite/sequence/align/matrix_data/GONNET.mat +26 -0
  101. biotite/sequence/align/matrix_data/IDENTITY.mat +25 -0
  102. biotite/sequence/align/matrix_data/MATCH.mat +25 -0
  103. biotite/sequence/align/matrix_data/NUC.mat +25 -0
  104. biotite/sequence/align/matrix_data/PAM10.mat +34 -0
  105. biotite/sequence/align/matrix_data/PAM100.mat +34 -0
  106. biotite/sequence/align/matrix_data/PAM110.mat +34 -0
  107. biotite/sequence/align/matrix_data/PAM120.mat +34 -0
  108. biotite/sequence/align/matrix_data/PAM130.mat +34 -0
  109. biotite/sequence/align/matrix_data/PAM140.mat +34 -0
  110. biotite/sequence/align/matrix_data/PAM150.mat +34 -0
  111. biotite/sequence/align/matrix_data/PAM160.mat +34 -0
  112. biotite/sequence/align/matrix_data/PAM170.mat +34 -0
  113. biotite/sequence/align/matrix_data/PAM180.mat +34 -0
  114. biotite/sequence/align/matrix_data/PAM190.mat +34 -0
  115. biotite/sequence/align/matrix_data/PAM20.mat +34 -0
  116. biotite/sequence/align/matrix_data/PAM200.mat +34 -0
  117. biotite/sequence/align/matrix_data/PAM210.mat +34 -0
  118. biotite/sequence/align/matrix_data/PAM220.mat +34 -0
  119. biotite/sequence/align/matrix_data/PAM230.mat +34 -0
  120. biotite/sequence/align/matrix_data/PAM240.mat +34 -0
  121. biotite/sequence/align/matrix_data/PAM250.mat +34 -0
  122. biotite/sequence/align/matrix_data/PAM260.mat +34 -0
  123. biotite/sequence/align/matrix_data/PAM270.mat +34 -0
  124. biotite/sequence/align/matrix_data/PAM280.mat +34 -0
  125. biotite/sequence/align/matrix_data/PAM290.mat +34 -0
  126. biotite/sequence/align/matrix_data/PAM30.mat +34 -0
  127. biotite/sequence/align/matrix_data/PAM300.mat +34 -0
  128. biotite/sequence/align/matrix_data/PAM310.mat +34 -0
  129. biotite/sequence/align/matrix_data/PAM320.mat +34 -0
  130. biotite/sequence/align/matrix_data/PAM330.mat +34 -0
  131. biotite/sequence/align/matrix_data/PAM340.mat +34 -0
  132. biotite/sequence/align/matrix_data/PAM350.mat +34 -0
  133. biotite/sequence/align/matrix_data/PAM360.mat +34 -0
  134. biotite/sequence/align/matrix_data/PAM370.mat +34 -0
  135. biotite/sequence/align/matrix_data/PAM380.mat +34 -0
  136. biotite/sequence/align/matrix_data/PAM390.mat +34 -0
  137. biotite/sequence/align/matrix_data/PAM40.mat +34 -0
  138. biotite/sequence/align/matrix_data/PAM400.mat +34 -0
  139. biotite/sequence/align/matrix_data/PAM410.mat +34 -0
  140. biotite/sequence/align/matrix_data/PAM420.mat +34 -0
  141. biotite/sequence/align/matrix_data/PAM430.mat +34 -0
  142. biotite/sequence/align/matrix_data/PAM440.mat +34 -0
  143. biotite/sequence/align/matrix_data/PAM450.mat +34 -0
  144. biotite/sequence/align/matrix_data/PAM460.mat +34 -0
  145. biotite/sequence/align/matrix_data/PAM470.mat +34 -0
  146. biotite/sequence/align/matrix_data/PAM480.mat +34 -0
  147. biotite/sequence/align/matrix_data/PAM490.mat +34 -0
  148. biotite/sequence/align/matrix_data/PAM50.mat +34 -0
  149. biotite/sequence/align/matrix_data/PAM500.mat +34 -0
  150. biotite/sequence/align/matrix_data/PAM60.mat +34 -0
  151. biotite/sequence/align/matrix_data/PAM70.mat +34 -0
  152. biotite/sequence/align/matrix_data/PAM80.mat +34 -0
  153. biotite/sequence/align/matrix_data/PAM90.mat +34 -0
  154. biotite/sequence/align/matrix_data/RBLOSUM52_5.0.mat +25 -0
  155. biotite/sequence/align/matrix_data/RBLOSUM59_13p.mat +25 -0
  156. biotite/sequence/align/matrix_data/RBLOSUM59_14.3.mat +25 -0
  157. biotite/sequence/align/matrix_data/RBLOSUM64_5.0.mat +25 -0
  158. biotite/sequence/align/matrix_data/RBLOSUM69_13p.mat +25 -0
  159. biotite/sequence/align/matrix_data/RBLOSUM69_14.3.mat +25 -0
  160. biotite/sequence/align/multiple.cpython-312-darwin.so +0 -0
  161. biotite/sequence/align/multiple.pyx +620 -0
  162. biotite/sequence/align/pairwise.cpython-312-darwin.so +0 -0
  163. biotite/sequence/align/pairwise.pyx +587 -0
  164. biotite/sequence/align/permutation.cpython-312-darwin.so +0 -0
  165. biotite/sequence/align/permutation.pyx +305 -0
  166. biotite/sequence/align/primes.txt +821 -0
  167. biotite/sequence/align/selector.cpython-312-darwin.so +0 -0
  168. biotite/sequence/align/selector.pyx +956 -0
  169. biotite/sequence/align/statistics.py +265 -0
  170. biotite/sequence/align/tracetable.cpython-312-darwin.so +0 -0
  171. biotite/sequence/align/tracetable.pxd +64 -0
  172. biotite/sequence/align/tracetable.pyx +370 -0
  173. biotite/sequence/alphabet.py +566 -0
  174. biotite/sequence/annotation.py +829 -0
  175. biotite/sequence/codec.cpython-312-darwin.so +0 -0
  176. biotite/sequence/codec.pyx +155 -0
  177. biotite/sequence/codon.py +466 -0
  178. biotite/sequence/codon_tables.txt +202 -0
  179. biotite/sequence/graphics/__init__.py +33 -0
  180. biotite/sequence/graphics/alignment.py +1034 -0
  181. biotite/sequence/graphics/color_schemes/autumn.json +51 -0
  182. biotite/sequence/graphics/color_schemes/blossom.json +51 -0
  183. biotite/sequence/graphics/color_schemes/clustalx_dna.json +11 -0
  184. biotite/sequence/graphics/color_schemes/clustalx_protein.json +28 -0
  185. biotite/sequence/graphics/color_schemes/flower.json +51 -0
  186. biotite/sequence/graphics/color_schemes/jalview_buried.json +31 -0
  187. biotite/sequence/graphics/color_schemes/jalview_hydrophobicity.json +31 -0
  188. biotite/sequence/graphics/color_schemes/jalview_prop_helix.json +31 -0
  189. biotite/sequence/graphics/color_schemes/jalview_prop_strand.json +31 -0
  190. biotite/sequence/graphics/color_schemes/jalview_prop_turn.json +31 -0
  191. biotite/sequence/graphics/color_schemes/jalview_taylor.json +28 -0
  192. biotite/sequence/graphics/color_schemes/jalview_zappo.json +28 -0
  193. biotite/sequence/graphics/color_schemes/ocean.json +51 -0
  194. biotite/sequence/graphics/color_schemes/pb_flower.json +39 -0
  195. biotite/sequence/graphics/color_schemes/rainbow_dna.json +11 -0
  196. biotite/sequence/graphics/color_schemes/rainbow_protein.json +30 -0
  197. biotite/sequence/graphics/color_schemes/spring.json +51 -0
  198. biotite/sequence/graphics/color_schemes/sunset.json +51 -0
  199. biotite/sequence/graphics/color_schemes/wither.json +51 -0
  200. biotite/sequence/graphics/colorschemes.py +139 -0
  201. biotite/sequence/graphics/dendrogram.py +184 -0
  202. biotite/sequence/graphics/features.py +510 -0
  203. biotite/sequence/graphics/logo.py +110 -0
  204. biotite/sequence/graphics/plasmid.py +661 -0
  205. biotite/sequence/io/__init__.py +12 -0
  206. biotite/sequence/io/fasta/__init__.py +22 -0
  207. biotite/sequence/io/fasta/convert.py +273 -0
  208. biotite/sequence/io/fasta/file.py +278 -0
  209. biotite/sequence/io/fastq/__init__.py +19 -0
  210. biotite/sequence/io/fastq/convert.py +120 -0
  211. biotite/sequence/io/fastq/file.py +551 -0
  212. biotite/sequence/io/genbank/__init__.py +17 -0
  213. biotite/sequence/io/genbank/annotation.py +277 -0
  214. biotite/sequence/io/genbank/file.py +575 -0
  215. biotite/sequence/io/genbank/metadata.py +324 -0
  216. biotite/sequence/io/genbank/sequence.py +172 -0
  217. biotite/sequence/io/general.py +192 -0
  218. biotite/sequence/io/gff/__init__.py +26 -0
  219. biotite/sequence/io/gff/convert.py +133 -0
  220. biotite/sequence/io/gff/file.py +434 -0
  221. biotite/sequence/phylo/__init__.py +36 -0
  222. biotite/sequence/phylo/nj.cpython-312-darwin.so +0 -0
  223. biotite/sequence/phylo/nj.pyx +221 -0
  224. biotite/sequence/phylo/tree.cpython-312-darwin.so +0 -0
  225. biotite/sequence/phylo/tree.pyx +1169 -0
  226. biotite/sequence/phylo/upgma.cpython-312-darwin.so +0 -0
  227. biotite/sequence/phylo/upgma.pyx +164 -0
  228. biotite/sequence/profile.py +456 -0
  229. biotite/sequence/search.py +116 -0
  230. biotite/sequence/seqtypes.py +556 -0
  231. biotite/sequence/sequence.py +374 -0
  232. biotite/structure/__init__.py +132 -0
  233. biotite/structure/atoms.py +1455 -0
  234. biotite/structure/basepairs.py +1415 -0
  235. biotite/structure/bonds.cpython-312-darwin.so +0 -0
  236. biotite/structure/bonds.pyx +1933 -0
  237. biotite/structure/box.py +592 -0
  238. biotite/structure/celllist.cpython-312-darwin.so +0 -0
  239. biotite/structure/celllist.pyx +849 -0
  240. biotite/structure/chains.py +298 -0
  241. biotite/structure/charges.cpython-312-darwin.so +0 -0
  242. biotite/structure/charges.pyx +520 -0
  243. biotite/structure/compare.py +274 -0
  244. biotite/structure/density.py +114 -0
  245. biotite/structure/dotbracket.py +216 -0
  246. biotite/structure/error.py +31 -0
  247. biotite/structure/filter.py +585 -0
  248. biotite/structure/geometry.py +697 -0
  249. biotite/structure/graphics/__init__.py +13 -0
  250. biotite/structure/graphics/atoms.py +226 -0
  251. biotite/structure/graphics/rna.py +282 -0
  252. biotite/structure/hbond.py +409 -0
  253. biotite/structure/info/__init__.py +25 -0
  254. biotite/structure/info/atom_masses.json +121 -0
  255. biotite/structure/info/atoms.py +82 -0
  256. biotite/structure/info/bonds.py +145 -0
  257. biotite/structure/info/ccd/README.rst +8 -0
  258. biotite/structure/info/ccd/amino_acids.txt +1663 -0
  259. biotite/structure/info/ccd/carbohydrates.txt +1135 -0
  260. biotite/structure/info/ccd/components.bcif +0 -0
  261. biotite/structure/info/ccd/nucleotides.txt +798 -0
  262. biotite/structure/info/ccd.py +95 -0
  263. biotite/structure/info/groups.py +90 -0
  264. biotite/structure/info/masses.py +123 -0
  265. biotite/structure/info/misc.py +144 -0
  266. biotite/structure/info/radii.py +197 -0
  267. biotite/structure/info/standardize.py +196 -0
  268. biotite/structure/integrity.py +268 -0
  269. biotite/structure/io/__init__.py +30 -0
  270. biotite/structure/io/ctab.py +72 -0
  271. biotite/structure/io/dcd/__init__.py +13 -0
  272. biotite/structure/io/dcd/file.py +65 -0
  273. biotite/structure/io/general.py +257 -0
  274. biotite/structure/io/gro/__init__.py +14 -0
  275. biotite/structure/io/gro/file.py +343 -0
  276. biotite/structure/io/mmtf/__init__.py +21 -0
  277. biotite/structure/io/mmtf/assembly.py +214 -0
  278. biotite/structure/io/mmtf/convertarray.cpython-312-darwin.so +0 -0
  279. biotite/structure/io/mmtf/convertarray.pyx +341 -0
  280. biotite/structure/io/mmtf/convertfile.cpython-312-darwin.so +0 -0
  281. biotite/structure/io/mmtf/convertfile.pyx +501 -0
  282. biotite/structure/io/mmtf/decode.cpython-312-darwin.so +0 -0
  283. biotite/structure/io/mmtf/decode.pyx +152 -0
  284. biotite/structure/io/mmtf/encode.cpython-312-darwin.so +0 -0
  285. biotite/structure/io/mmtf/encode.pyx +183 -0
  286. biotite/structure/io/mmtf/file.py +233 -0
  287. biotite/structure/io/mol/__init__.py +20 -0
  288. biotite/structure/io/mol/convert.py +115 -0
  289. biotite/structure/io/mol/ctab.py +414 -0
  290. biotite/structure/io/mol/header.py +116 -0
  291. biotite/structure/io/mol/mol.py +193 -0
  292. biotite/structure/io/mol/sdf.py +916 -0
  293. biotite/structure/io/netcdf/__init__.py +13 -0
  294. biotite/structure/io/netcdf/file.py +63 -0
  295. biotite/structure/io/npz/__init__.py +20 -0
  296. biotite/structure/io/npz/file.py +152 -0
  297. biotite/structure/io/pdb/__init__.py +20 -0
  298. biotite/structure/io/pdb/convert.py +293 -0
  299. biotite/structure/io/pdb/file.py +1240 -0
  300. biotite/structure/io/pdb/hybrid36.cpython-312-darwin.so +0 -0
  301. biotite/structure/io/pdb/hybrid36.pyx +242 -0
  302. biotite/structure/io/pdbqt/__init__.py +15 -0
  303. biotite/structure/io/pdbqt/convert.py +107 -0
  304. biotite/structure/io/pdbqt/file.py +640 -0
  305. biotite/structure/io/pdbx/__init__.py +23 -0
  306. biotite/structure/io/pdbx/bcif.py +648 -0
  307. biotite/structure/io/pdbx/cif.py +1032 -0
  308. biotite/structure/io/pdbx/component.py +246 -0
  309. biotite/structure/io/pdbx/convert.py +1597 -0
  310. biotite/structure/io/pdbx/encoding.cpython-312-darwin.so +0 -0
  311. biotite/structure/io/pdbx/encoding.pyx +950 -0
  312. biotite/structure/io/pdbx/legacy.py +267 -0
  313. biotite/structure/io/tng/__init__.py +13 -0
  314. biotite/structure/io/tng/file.py +46 -0
  315. biotite/structure/io/trajfile.py +710 -0
  316. biotite/structure/io/trr/__init__.py +13 -0
  317. biotite/structure/io/trr/file.py +46 -0
  318. biotite/structure/io/xtc/__init__.py +13 -0
  319. biotite/structure/io/xtc/file.py +46 -0
  320. biotite/structure/mechanics.py +75 -0
  321. biotite/structure/molecules.py +353 -0
  322. biotite/structure/pseudoknots.py +642 -0
  323. biotite/structure/rdf.py +243 -0
  324. biotite/structure/repair.py +253 -0
  325. biotite/structure/residues.py +562 -0
  326. biotite/structure/resutil.py +178 -0
  327. biotite/structure/sasa.cpython-312-darwin.so +0 -0
  328. biotite/structure/sasa.pyx +322 -0
  329. biotite/structure/sequence.py +112 -0
  330. biotite/structure/sse.py +327 -0
  331. biotite/structure/superimpose.py +727 -0
  332. biotite/structure/transform.py +504 -0
  333. biotite/structure/util.py +98 -0
  334. biotite/temp.py +86 -0
  335. biotite/version.py +16 -0
  336. biotite/visualize.py +251 -0
  337. biotite-0.41.1.dist-info/METADATA +187 -0
  338. biotite-0.41.1.dist-info/RECORD +340 -0
  339. biotite-0.41.1.dist-info/WHEEL +4 -0
  340. biotite-0.41.1.dist-info/licenses/LICENSE.rst +30 -0
@@ -0,0 +1,324 @@
1
+ # This source code is part of the Biotite package and is distributed
2
+ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
3
+ # information.
4
+
5
+ """
6
+ Functions for obtaining metadata fields of a GenBank file.
7
+ """
8
+
9
+ __name__ = "biotite.sequence.io.genbank"
10
+ __author__ = "Patrick Kunzmann, Natasha Jaffe"
11
+ __all__ = ["get_locus", "get_definition", "get_accession", "get_version",
12
+ "get_gi", "get_db_link", "get_source",
13
+ "set_locus"]
14
+
15
+ from ....file import InvalidFileError
16
+ from .file import GenBankFile
17
+
18
def get_locus(gb_file):
    """
    Parse the *LOCUS* field of a GenBank or GenPept file.

    Parameters
    ----------
    gb_file : GenBankFile
        The GenBank file to read the *LOCUS* field from.

    Returns
    -------
    name : str
        The locus name.
    length : int
        Sequence length.
    mol_type : str or None
        The molecule type.
        Usually one of ``'DNA'``, ``'RNA'``, ``'Protein'`` or ``''``.
        ``None``, if the field does not state a molecule type.
    is_circular : bool
        True, if the sequence is circular, false otherwise.
        Also false, if the field does not state the topology.
    division : str or None
        The GenBank division to which the file belongs.
        ``None``, if the field does not state a division.
    date : str or None
        The date of last modification.
        ``None``, if the field does not state a date.

    Examples
    --------

    >>> import os.path
    >>> file = GenBankFile.read(os.path.join(path_to_sequences, "ec_bl21.gb"))
    >>> name, length, mol_type, is_circular, division, date = get_locus(file)
    >>> print(name)
    CP001509
    >>> print(length)
    4558953
    >>> print(mol_type)
    DNA
    >>> print(is_circular)
    True
    >>> print(division)
    BCT
    >>> print(date)
    16-FEB-2017
    """
    lines, _ = _expect_single_field(gb_file, "LOCUS")
    # 'LOCUS' field has only one line
    fields = str(lines[0]).split()
    n_fields = len(fields)

    # The first field will always be the ID
    name = fields[0]

    # The second field will always be the length followed
    # by units (eg 1224 aa)
    length = int(fields[1])

    # The third field *should* be the molecular type
    # but sometimes this is missing. This gets tricky
    # because sometimes the next field, circular/linear,
    # is missing, too. The field after that, division,
    # is a 3 letter all caps token. Unfortunately, mol_type
    # is also often a 3 letter all caps token (eg DNA)!
    # Fortunately, GenBank publishes the set list of divisions
    # here: https://www.ncbi.nlm.nih.gov/genbank/samplerecord ,
    # so we can check against that set when determining whether
    # the current token represents the molecular type.
    divisions = (
        'PRI', # primate sequences
        'ROD', # rodent sequences
        'MAM', # other mammalian sequences
        'VRT', # other vertebrate sequences
        'INV', # invertebrate sequences
        'PLN', # plant, fungal, and algal sequences
        'BCT', # bacterial sequences
        'VRL', # viral sequences
        'PHG', # bacteriophage sequences
        'SYN', # synthetic sequences
        'UNA', # unannotated sequences
        'EST', # EST sequences (expressed sequence tags)
        'PAT', # patent sequences
        'STS', # STS sequences (sequence tagged sites)
        'GSS', # GSS sequences (genome survey sequences)
        'HTG', # HTG sequences (high-throughput genomic sequences)
        'HTC', # unfinished high-throughput cDNA sequencing
        'ENV', # environmental sampling sequences
        'CON',
    )
    topologies = ('linear', 'circular')

    # NOTE: Remember that fields[2] is the unit for length,
    # eg bp or aa, so the optional tokens start at fields[3].
    # Every optional token is bounds-checked, so that a line
    # lacking trailing tokens yields defaults instead of an
    # IndexError
    next_idx = 3

    mol_type = None
    if next_idx < n_fields \
       and fields[next_idx] not in topologies \
       and fields[next_idx] not in divisions:
        mol_type = fields[next_idx]
        next_idx += 1

    # The next field should be the token 'linear' or 'circular',
    # but sometimes this is missing
    is_circular = False
    if next_idx < n_fields and fields[next_idx] in topologies:
        is_circular = (fields[next_idx] == 'circular')
        next_idx += 1

    # The next field should be the division;
    # the default prevents an unbound variable, if it is missing
    division = None
    if next_idx < n_fields and fields[next_idx] in divisions:
        division = fields[next_idx]
        next_idx += 1

    # The last field is a date in the format DD-MMM-YYYY
    date = fields[next_idx] if next_idx < n_fields else None

    return name, length, mol_type, is_circular, division, date
138
+
139
def get_definition(gb_file):
    """
    Read the *DEFINITION* field of a GenBank or GenPept file as a
    single string.

    Parameters
    ----------
    gb_file : GenBankFile
        The GenBank file to read the *DEFINITION* field from.

    Returns
    -------
    definition : str
        The lines of the *DEFINITION* field, each stripped of
        surrounding whitespace and joined by a single space.

    Examples
    --------

    >>> import os.path
    >>> file = GenBankFile.read(os.path.join(path_to_sequences, "ec_bl21.gb"))
    >>> print(get_definition(file))
    Escherichia coli BL21(DE3), complete genome.
    """
    content, _ = _expect_single_field(gb_file, "DEFINITION")
    # A definition may span multiple lines -> merge them
    stripped_lines = (text.strip() for text in content)
    return " ".join(stripped_lines)
163
+
164
def get_accession(gb_file):
    """
    Read the *ACCESSION* field of a GenBank or GenPept file.

    Parameters
    ----------
    gb_file : GenBankFile
        The GenBank file to read the *ACCESSION* field from.

    Returns
    -------
    accession : str
        The first line of the *ACCESSION* field,
        i.e. the accession ID of the file.

    Examples
    --------

    >>> import os.path
    >>> file = GenBankFile.read(os.path.join(path_to_sequences, "ec_bl21.gb"))
    >>> print(get_accession(file))
    CP001509
    """
    content, _ = _expect_single_field(gb_file, "ACCESSION")
    # Only the first line of the field is relevant
    accession = content[0]
    return accession
189
+
190
def get_version(gb_file):
    """
    Read the version from the *VERSION* field of a GenBank or GenPept
    file.

    Parameters
    ----------
    gb_file : GenBankFile
        The GenBank file to read the *VERSION* field from.

    Returns
    -------
    version : str
        The first whitespace-separated token of the *VERSION* field.
        The GI is not included.
    """
    content, _ = _expect_single_field(gb_file, "VERSION")
    # Only the first line of the field is relevant;
    # its first token is the version, an optional second one the GI
    tokens = content[0].split()
    return tokens[0]
208
+
209
def get_gi(gb_file):
    """
    Parse the GI from the *VERSION* field of a GenBank or GenPept
    file.

    Parameters
    ----------
    gb_file : GenBankFile
        The GenBank file to read the *VERSION* field from.

    Returns
    -------
    gi : int
        The GI of the file.

    Raises
    ------
    InvalidFileError
        If the *VERSION* field has no second token or the second
        token does not start with ``GI:``.
    """
    gi_prefix = "GI:"
    lines, _ = _expect_single_field(gb_file, "VERSION")
    # 'VERSION' field has only one line
    version_info = lines[0].split()
    # The GI is the optional second token in the form 'GI:<number>';
    # checking the actual prefix (instead of a mere 'GI' substring)
    # guarantees that the slice below removes exactly the prefix
    if len(version_info) < 2 or not version_info[1].startswith(gi_prefix):
        raise InvalidFileError("File does not contain GI")
    # Truncate the 'GI:' prefix
    return int(version_info[1][len(gi_prefix):])
231
+
232
def get_db_link(gb_file):
    """
    Parse the *DBLINK* field of a GenBank or GenPept file.

    Parameters
    ----------
    gb_file : GenBankFile
        The GenBank file to read the *DBLINK* field from.

    Returns
    -------
    link_dict : dict
        A dictionary storing the database links, with the database
        name as key, and the corresponding ID as value.

    Examples
    --------

    >>> import os.path
    >>> file = GenBankFile.read(os.path.join(path_to_sequences, "ec_bl21.gb"))
    >>> for key, val in get_db_link(file).items():
    ...     print(key, ":", val)
    BioProject : PRJNA20713
    BioSample : SAMN02603478
    """
    lines, _ = _expect_single_field(gb_file, "DBLINK")
    link_dict = {}
    for line in lines:
        # Split only at the first colon:
        # the database name ends there, while the ID itself
        # is allowed to contain further colons
        key, value = line.split(":", 1)
        link_dict[key.strip()] = value.strip()
    return link_dict
263
+
264
+
265
def get_source(gb_file):
    """
    Read the *SOURCE* field of a GenBank or GenPept file.

    Parameters
    ----------
    gb_file : GenBankFile
        The GenBank file to read the *SOURCE* field from.

    Returns
    -------
    source : str
        The first line of the *SOURCE* field,
        i.e. the name of the source organism.
    """
    content, _ = _expect_single_field(gb_file, "SOURCE")
    # Only the first line of the field is relevant
    source = content[0]
    return source
282
+
283
+
284
+ def _expect_single_field(gb_file, name):
285
+ fields = gb_file.get_fields(name)
286
+ if len(fields) == 0:
287
+ raise InvalidFileError(f"File has no '{name}' field")
288
+ if len(fields) > 1:
289
+ raise InvalidFileError(f"File has multiple '{name}' fields")
290
+ return fields[0]
291
+
292
+
293
+
294
def set_locus(gb_file, name, length, mol_type=None, is_circular=False,
              division=None, date=None):
    """
    Write the *LOCUS* field of a GenBank file.

    Parameters
    ----------
    gb_file : GenBankFile
        The GenBank file to be edited.
    name : str
        The locus name.
    length : int
        Sequence length.
    mol_type : str, optional
        The molecule type.
        Usually one of ``'DNA'``, ``'RNA'``, ``'Protein'`` or ``''``.
        An empty or ``'Protein'`` molecule type selects ``aa`` as
        length unit, any other value selects ``bp``.
    is_circular : bool, optional
        True, if the sequence is circular, false otherwise.
    division : str, optional
        The GenBank division to which the file belongs.
    date : str, optional
        The date of last modification.
    """
    # Omitted optional values are written as blank columns
    if mol_type is None:
        mol_type = ""
    if division is None:
        division = ""
    if date is None:
        date = ""

    if mol_type in ("", "Protein"):
        restype_abbr = "aa"
    else:
        restype_abbr = "bp"

    if is_circular:
        circularity = "circular"
    else:
        circularity = "linear"

    line = f"{name:18} {length:>9} {restype_abbr} {mol_type:^10} " \
           f"{circularity:8} {division:3} {date:11}"
    gb_file.set_field("LOCUS", [line])
@@ -0,0 +1,172 @@
1
+ # This source code is part of the Biotite package and is distributed
2
+ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
3
+ # information.
4
+
5
+ """
6
+ Functions for converting a sequence from/to a GenBank file.
7
+ """
8
+
9
+ __name__ = "biotite.sequence.io.genbank"
10
+ __author__ = "Patrick Kunzmann"
11
+ __all__ = ["get_raw_sequence", "get_sequence", "get_annotated_sequence",
12
+ "set_sequence", "set_annotated_sequence"]
13
+
14
+ import re
15
+ from ....file import InvalidFileError
16
+ from ...seqtypes import ProteinSequence, NucleotideSequence
17
+ from ...annotation import AnnotatedSequence
18
+ from .file import GenBankFile
19
+ from .annotation import get_annotation, set_annotation
20
+
21
+
22
+ _SYMBOLS_PER_CHUNK = 10
23
+ _SEQ_CHUNKS_PER_LINE = 6
24
+ _SYMBOLS_PER_LINE = _SYMBOLS_PER_CHUNK * _SEQ_CHUNKS_PER_LINE
25
+
26
+
27
def get_raw_sequence(gb_file):
    """
    Read the *ORIGIN* field of a GenBank file as plain sequence
    string.

    Parameters
    ----------
    gb_file : GenBankFile
        The GenBank file to read the *ORIGIN* field from.

    Returns
    -------
    seq_str : str
        The unaltered sequence as string.
        Sequence positions (digits) and space characters are removed.

    Raises
    ------
    InvalidFileError
        If the file has no or more than one *ORIGIN* field.
    """
    origin_fields = gb_file.get_fields("ORIGIN")
    n_origins = len(origin_fields)
    if n_origins == 0:
        raise InvalidFileError("File has no 'ORIGIN' field")
    if n_origins > 1:
        raise InvalidFileError("File has multiple 'ORIGIN' fields")
    # Each field is a pair, whose first element holds the content lines
    content, _ = origin_fields[0]
    return _field_to_seq_string(content)
50
+
51
+
52
def get_sequence(gb_file, format="gb"):
    """
    Read the *ORIGIN* field of a GenBank file as sequence object.

    Parameters
    ----------
    gb_file : GenBankFile
        The GenBank file to read the *ORIGIN* field from.
    format : {'gb', 'gp'}
        Indicates whether the file is a GenBank or a GenPept file.
        Depending on this parameter a :class:`NucleotideSequence` or a
        :class:`ProteinSequence` is returned.

    Returns
    -------
    sequence : NucleotideSequence or ProteinSequence
        The reference sequence in the file.
    """
    raw_sequence = get_raw_sequence(gb_file)
    return _convert_seq_str(raw_sequence, format)
71
+
72
+
73
def get_annotated_sequence(gb_file, format="gb", include_only=None):
    """
    Create an annotated sequence from the annotation and the *ORIGIN*
    field of a GenBank file.

    Parameters
    ----------
    gb_file : GenBankFile
        The GenBank file to read the fields from.
    format : {'gb', 'gp'}
        Indicates whether the file is a GenBank or a GenPept file,
        i.e. whether the *ORIGIN* field is converted into a
        :class:`NucleotideSequence` or a :class:`ProteinSequence`.
    include_only : iterable object of str, optional
        List of names of feature keys, which should be included
        in the annotation. By default all features are included.

    Returns
    -------
    annot_seq : AnnotatedSequence
        The annotated sequence.

    Raises
    ------
    InvalidFileError
        If the file has no or more than one *ORIGIN* field.
    """
    origin_fields = gb_file.get_fields("ORIGIN")
    n_origins = len(origin_fields)
    if n_origins == 0:
        raise InvalidFileError("File has no 'ORIGIN' field")
    if n_origins > 1:
        raise InvalidFileError("File has multiple 'ORIGIN' fields")
    content, _ = origin_fields[0]

    raw_sequence = _field_to_seq_string(content)
    sequence = _convert_seq_str(raw_sequence, format)
    # The position number leading the first line
    # is the position of the first symbol
    first_position = _get_seq_start(content)
    annotation = get_annotation(gb_file, include_only)
    return AnnotatedSequence(
        annotation, sequence, sequence_start=first_position
    )
101
+
102
+
103
+ def _field_to_seq_string(origin_content):
104
+ seq_str = "".join(origin_content)
105
+ # Remove numbers and emtpy spaces
106
+ regex = re.compile("[0-9]| ")
107
+ seq_str = regex.sub("", seq_str)
108
+ return seq_str
109
+
110
+
111
+ def _convert_seq_str(seq_str, format):
112
+ if len(seq_str) == 0:
113
+ raise InvalidFileError("The file's 'ORIGIN' field is empty")
114
+ if format == "gb":
115
+ return NucleotideSequence(seq_str.replace("U","T").replace("X","N"))
116
+ elif format == "gp":
117
+ return ProteinSequence(seq_str.replace("U", "C").replace("O", "K"))
118
+ else:
119
+ raise ValueError(f"Unknown format '{format}'")
120
+
121
+
122
+ def _get_seq_start(origin_content):
123
+ # Start of sequence is the sequence position indicator
124
+ # at the beginning of the first line
125
+ return int(origin_content[0].split()[0])
126
+
127
+
128
+
129
+
130
def set_sequence(gb_file, sequence, sequence_start=1):
    """
    Write a sequence into the *ORIGIN* field of a GenBank file.

    Parameters
    ----------
    gb_file : GenBankFile
        The GenBank file to be edited.
    sequence : str or NucleotideSequence or ProteinSequence
        The sequence that is put into the GenBank file.
        It is written in lower case.
    sequence_start : int, optional
        The number of the first base of the sequence.
    """
    symbols = str(sequence).lower()
    origin_lines = []
    # Each line begins with the right-aligned position of its first symbol
    current_line = "{:>9d}".format(sequence_start)
    for offset in range(0, len(sequence), _SYMBOLS_PER_CHUNK):
        line_is_full = (offset != 0 and offset % _SYMBOLS_PER_LINE == 0)
        if line_is_full:
            # The current line holds '_SYMBOLS_PER_LINE' symbols
            # -> store it and begin the next one
            origin_lines.append(current_line)
            current_line = "{:>9d}".format(sequence_start + offset)
        chunk = symbols[offset : offset + _SYMBOLS_PER_CHUNK]
        current_line += " " + chunk
    # The last line is usually only partially filled
    origin_lines.append(current_line)
    gb_file.set_field("ORIGIN", origin_lines)
155
+
156
+
157
def set_annotated_sequence(gb_file, annot_sequence):
    """
    Write the annotation and the sequence of an annotated sequence
    into the *FEATURES* and *ORIGIN* fields of a GenBank file.

    Parameters
    ----------
    gb_file : GenBankFile
        The GenBank file to be edited.
    annot_sequence : AnnotatedSequence
        The annotated sequence that is put into the GenBank file.
    """
    # The annotation is written before the sequence
    set_annotation(gb_file, annot_sequence.annotation)
    sequence = annot_sequence.sequence
    first_position = annot_sequence.sequence_start
    set_sequence(gb_file, sequence, first_position)
@@ -0,0 +1,192 @@
1
+ # This source code is part of the Biotite package and is distributed
2
+ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
3
+ # information.
4
+
5
+ """
6
+ This module contains a convenience function for loading sequences from
7
+ general sequence files.
8
+ """
9
+
10
+ __name__ = "biotite.sequence.io"
11
+ __author__ = "Patrick Kunzmann"
12
+ __all__ = ["load_sequence", "save_sequence",
13
+ "load_sequences", "save_sequences"]
14
+
15
+ import itertools
16
+ import os.path
17
+ import io
18
+ from collections import OrderedDict
19
+ import numpy as np
20
+ from ..seqtypes import NucleotideSequence, ProteinSequence
21
+ from ..alphabet import Alphabet
22
+
23
+
24
def load_sequence(file_path):
    """
    Load a sequence from a sequence file without the need
    to manually instantiate a :class:`File` object.

    Internally this function uses a :class:`File` object, based on the
    file extension.

    Parameters
    ----------
    file_path : str
        The path to the sequence file.

    Returns
    -------
    sequence : Sequence
        The first sequence in the file.

    Raises
    ------
    ValueError
        If the file extension is not recognized or if a FASTQ file
        contains no entries.
    """
    # Only the suffix is needed to select the file class
    suffix = os.path.splitext(file_path)[1]
    if suffix in [".fasta", ".fa", ".mpfa", ".fna", ".fsa"]:
        from .fasta import FastaFile, get_sequence
        file = FastaFile.read(file_path)
        return get_sequence(file)
    elif suffix in [".fastq", ".fq"]:
        from .fastq import FastqFile
        # Quality scores are irrelevant for this function
        # -> Offset is irrelevant
        file = FastqFile.read(file_path, offset="Sanger")
        # Get first entry; an empty file must give a clear error
        # instead of failing on an unbound local variable
        first_entry = next(iter(file.values()), None)
        if first_entry is None:
            raise ValueError("File does not contain any sequences")
        seq_str, _ = first_entry
        return NucleotideSequence(seq_str)
    elif suffix in [".gb", ".gbk", ".gp"]:
        from .genbank import GenBankFile, get_sequence
        # '.gp' files are read in GenPept format, the other suffixes
        # in GenBank format
        gb_format = "gp" if suffix == ".gp" else "gb"
        file = GenBankFile.read(file_path)
        return get_sequence(file, gb_format)
    else:
        raise ValueError(f"Unknown file format '{suffix}'")
65
+
66
+
67
def save_sequence(file_path, sequence):
    """
    Write a single sequence into a sequence file, so that no
    :class:`File` object needs to be created by hand.

    The :class:`File` subclass used internally is selected by the
    extension of the given path.

    Parameters
    ----------
    file_path : str
        The path to the sequence file.
    sequence : Sequence
        The sequence to be saved.

    Raises
    ------
    ValueError
        If the file extension is not recognized.
    """
    # The extension alone selects the file class
    suffix = os.path.splitext(file_path)[1]

    if suffix in (".fasta", ".fa", ".mpfa", ".fna", ".fsa"):
        from .fasta import FastaFile, set_sequence
        fasta_file = FastaFile()
        set_sequence(fasta_file, sequence)
        fasta_file.write(file_path)
        return

    if suffix in (".fastq", ".fq"):
        from .fastq import FastqFile
        # No quality information is supplied, hence the offset does
        # not matter and all scores are set to 0
        fastq_file = FastqFile(offset="Sanger")
        zero_scores = np.zeros(len(sequence))
        fastq_file["sequence"] = str(sequence), zero_scores
        fastq_file.write(file_path)
        return

    if suffix in (".gb", ".gbk", ".gp"):
        from .genbank import GenBankFile, set_locus, set_sequence
        gb_file = GenBankFile()
        set_locus(gb_file, "sequence", len(sequence))
        set_sequence(gb_file, sequence)
        gb_file.write(file_path)
        return

    raise ValueError(f"Unknown file format '{suffix}'")
106
+
107
+
108
def load_sequences(file_path):
    """
    Read all sequences of a sequence file, so that no
    :class:`File` object needs to be created by hand.

    The :class:`File` subclass used internally is selected by the
    extension of the given path.

    Parameters
    ----------
    file_path : str
        The path to the sequence file.

    Returns
    -------
    sequences : dict of (str, Sequence)
        The sequences in the file.
        This dictionary maps each header name to
        the respective sequence.

    Raises
    ------
    ValueError
        If the file extension is not recognized.
    """
    # The extension alone selects the file class
    suffix = os.path.splitext(file_path)[1]

    if suffix in (".fasta", ".fa", ".mpfa", ".fna", ".fsa"):
        from .fasta import FastaFile, get_sequences
        return get_sequences(FastaFile.read(file_path))

    if suffix in (".fastq", ".fq"):
        from .fastq import FastqFile
        # Only the sequence strings are used, the quality scores are
        # discarded -> the chosen offset does not matter
        fastq_file = FastqFile.read(file_path, offset="Sanger")
        result = {}
        for identifier, (seq_str, _scores) in fastq_file.items():
            result[identifier] = NucleotideSequence(seq_str)
        return result

    if suffix in (".gb", ".gbk", ".gp"):
        from .genbank import MultiFile, get_definition, get_sequence
        multi_file = MultiFile.read(file_path)
        gb_format = "gb" if suffix != ".gp" else "gp"
        result = OrderedDict()
        for record in multi_file:
            # Each record is stored under its definition
            record_sequence = get_sequence(record, gb_format)
            result[get_definition(record)] = record_sequence
        return result

    raise ValueError(f"Unknown file format '{suffix}'")
151
+
152
+
153
def save_sequences(file_path, sequences):
    """
    Save multiple sequences into a sequence file without the need
    to manually instantiate a :class:`File` object.

    Internally this function uses a :class:`File` object, based on the
    given file extension.

    Parameters
    ----------
    file_path : str
        The path to the sequence file.
    sequences : dict of (str, Sequence)
        The sequences to be saved. The dictionary maps a header name
        to a sequence.

    Raises
    ------
    NotImplementedError
        If a GenBank file extension is given, as writing multiple
        GenBank records is not supported.
    ValueError
        If the file extension is not recognized.
    """
    # Only the suffix is needed to select the file class
    suffix = os.path.splitext(file_path)[1]
    if suffix in [".fasta", ".fa", ".mpfa", ".fna", ".fsa"]:
        from .fasta import FastaFile, set_sequences
        file = FastaFile()
        set_sequences(file, sequences)
        file.write(file_path)
    elif suffix in [".fastq", ".fq"]:
        from .fastq import FastqFile
        # Quality scores are irrelevant for this function
        # -> Offset is irrelevant
        file = FastqFile(offset="Sanger")
        for identifier, sequence in sequences.items():
            # Scores are set to 0 since no score information is supplied
            scores = np.zeros(len(sequence))
            # Each sequence is stored under its own header name;
            # a constant key would overwrite all previous entries
            file[identifier] = str(sequence), scores
        file.write(file_path)
    elif suffix in [".gb", ".gbk", ".gp"]:
        raise NotImplementedError(
            "Writing GenBank files containing multiple records is currently "
            "not supported"
        )
    else:
        raise ValueError(f"Unknown file format '{suffix}'")