biotite 0.41.1__cp312-cp312-macosx_10_16_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biotite might be problematic. Click here for more details.

Files changed (340) hide show
  1. biotite/__init__.py +19 -0
  2. biotite/application/__init__.py +43 -0
  3. biotite/application/application.py +265 -0
  4. biotite/application/autodock/__init__.py +12 -0
  5. biotite/application/autodock/app.py +505 -0
  6. biotite/application/blast/__init__.py +14 -0
  7. biotite/application/blast/alignment.py +83 -0
  8. biotite/application/blast/webapp.py +421 -0
  9. biotite/application/clustalo/__init__.py +12 -0
  10. biotite/application/clustalo/app.py +238 -0
  11. biotite/application/dssp/__init__.py +12 -0
  12. biotite/application/dssp/app.py +152 -0
  13. biotite/application/localapp.py +306 -0
  14. biotite/application/mafft/__init__.py +12 -0
  15. biotite/application/mafft/app.py +122 -0
  16. biotite/application/msaapp.py +374 -0
  17. biotite/application/muscle/__init__.py +13 -0
  18. biotite/application/muscle/app3.py +254 -0
  19. biotite/application/muscle/app5.py +171 -0
  20. biotite/application/sra/__init__.py +18 -0
  21. biotite/application/sra/app.py +456 -0
  22. biotite/application/tantan/__init__.py +12 -0
  23. biotite/application/tantan/app.py +222 -0
  24. biotite/application/util.py +59 -0
  25. biotite/application/viennarna/__init__.py +18 -0
  26. biotite/application/viennarna/rnaalifold.py +304 -0
  27. biotite/application/viennarna/rnafold.py +269 -0
  28. biotite/application/viennarna/rnaplot.py +187 -0
  29. biotite/application/viennarna/util.py +72 -0
  30. biotite/application/webapp.py +77 -0
  31. biotite/copyable.py +71 -0
  32. biotite/database/__init__.py +23 -0
  33. biotite/database/entrez/__init__.py +15 -0
  34. biotite/database/entrez/check.py +61 -0
  35. biotite/database/entrez/dbnames.py +89 -0
  36. biotite/database/entrez/download.py +223 -0
  37. biotite/database/entrez/key.py +44 -0
  38. biotite/database/entrez/query.py +223 -0
  39. biotite/database/error.py +15 -0
  40. biotite/database/pubchem/__init__.py +21 -0
  41. biotite/database/pubchem/download.py +260 -0
  42. biotite/database/pubchem/error.py +20 -0
  43. biotite/database/pubchem/query.py +827 -0
  44. biotite/database/pubchem/throttle.py +99 -0
  45. biotite/database/rcsb/__init__.py +13 -0
  46. biotite/database/rcsb/download.py +167 -0
  47. biotite/database/rcsb/query.py +959 -0
  48. biotite/database/uniprot/__init__.py +13 -0
  49. biotite/database/uniprot/check.py +32 -0
  50. biotite/database/uniprot/download.py +134 -0
  51. biotite/database/uniprot/query.py +209 -0
  52. biotite/file.py +251 -0
  53. biotite/sequence/__init__.py +73 -0
  54. biotite/sequence/align/__init__.py +49 -0
  55. biotite/sequence/align/alignment.py +658 -0
  56. biotite/sequence/align/banded.cpython-312-darwin.so +0 -0
  57. biotite/sequence/align/banded.pyx +652 -0
  58. biotite/sequence/align/buckets.py +69 -0
  59. biotite/sequence/align/cigar.py +434 -0
  60. biotite/sequence/align/kmeralphabet.cpython-312-darwin.so +0 -0
  61. biotite/sequence/align/kmeralphabet.pyx +574 -0
  62. biotite/sequence/align/kmersimilarity.cpython-312-darwin.so +0 -0
  63. biotite/sequence/align/kmersimilarity.pyx +233 -0
  64. biotite/sequence/align/kmertable.cpython-312-darwin.so +0 -0
  65. biotite/sequence/align/kmertable.pyx +3400 -0
  66. biotite/sequence/align/localgapped.cpython-312-darwin.so +0 -0
  67. biotite/sequence/align/localgapped.pyx +892 -0
  68. biotite/sequence/align/localungapped.cpython-312-darwin.so +0 -0
  69. biotite/sequence/align/localungapped.pyx +279 -0
  70. biotite/sequence/align/matrix.py +405 -0
  71. biotite/sequence/align/matrix_data/BLOSUM100.mat +31 -0
  72. biotite/sequence/align/matrix_data/BLOSUM30.mat +31 -0
  73. biotite/sequence/align/matrix_data/BLOSUM35.mat +31 -0
  74. biotite/sequence/align/matrix_data/BLOSUM40.mat +31 -0
  75. biotite/sequence/align/matrix_data/BLOSUM45.mat +31 -0
  76. biotite/sequence/align/matrix_data/BLOSUM50.mat +31 -0
  77. biotite/sequence/align/matrix_data/BLOSUM50_13p.mat +25 -0
  78. biotite/sequence/align/matrix_data/BLOSUM50_14.3.mat +25 -0
  79. biotite/sequence/align/matrix_data/BLOSUM50_5.0.mat +25 -0
  80. biotite/sequence/align/matrix_data/BLOSUM55.mat +31 -0
  81. biotite/sequence/align/matrix_data/BLOSUM60.mat +31 -0
  82. biotite/sequence/align/matrix_data/BLOSUM62.mat +31 -0
  83. biotite/sequence/align/matrix_data/BLOSUM62_13p.mat +25 -0
  84. biotite/sequence/align/matrix_data/BLOSUM62_14.3.mat +25 -0
  85. biotite/sequence/align/matrix_data/BLOSUM62_5.0.mat +25 -0
  86. biotite/sequence/align/matrix_data/BLOSUM65.mat +31 -0
  87. biotite/sequence/align/matrix_data/BLOSUM70.mat +31 -0
  88. biotite/sequence/align/matrix_data/BLOSUM75.mat +31 -0
  89. biotite/sequence/align/matrix_data/BLOSUM80.mat +31 -0
  90. biotite/sequence/align/matrix_data/BLOSUM85.mat +31 -0
  91. biotite/sequence/align/matrix_data/BLOSUM90.mat +31 -0
  92. biotite/sequence/align/matrix_data/BLOSUMN.mat +31 -0
  93. biotite/sequence/align/matrix_data/CorBLOSUM49_5.0.mat +25 -0
  94. biotite/sequence/align/matrix_data/CorBLOSUM57_13p.mat +25 -0
  95. biotite/sequence/align/matrix_data/CorBLOSUM57_14.3.mat +25 -0
  96. biotite/sequence/align/matrix_data/CorBLOSUM61_5.0.mat +25 -0
  97. biotite/sequence/align/matrix_data/CorBLOSUM66_13p.mat +25 -0
  98. biotite/sequence/align/matrix_data/CorBLOSUM67_14.3.mat +25 -0
  99. biotite/sequence/align/matrix_data/DAYHOFF.mat +32 -0
  100. biotite/sequence/align/matrix_data/GONNET.mat +26 -0
  101. biotite/sequence/align/matrix_data/IDENTITY.mat +25 -0
  102. biotite/sequence/align/matrix_data/MATCH.mat +25 -0
  103. biotite/sequence/align/matrix_data/NUC.mat +25 -0
  104. biotite/sequence/align/matrix_data/PAM10.mat +34 -0
  105. biotite/sequence/align/matrix_data/PAM100.mat +34 -0
  106. biotite/sequence/align/matrix_data/PAM110.mat +34 -0
  107. biotite/sequence/align/matrix_data/PAM120.mat +34 -0
  108. biotite/sequence/align/matrix_data/PAM130.mat +34 -0
  109. biotite/sequence/align/matrix_data/PAM140.mat +34 -0
  110. biotite/sequence/align/matrix_data/PAM150.mat +34 -0
  111. biotite/sequence/align/matrix_data/PAM160.mat +34 -0
  112. biotite/sequence/align/matrix_data/PAM170.mat +34 -0
  113. biotite/sequence/align/matrix_data/PAM180.mat +34 -0
  114. biotite/sequence/align/matrix_data/PAM190.mat +34 -0
  115. biotite/sequence/align/matrix_data/PAM20.mat +34 -0
  116. biotite/sequence/align/matrix_data/PAM200.mat +34 -0
  117. biotite/sequence/align/matrix_data/PAM210.mat +34 -0
  118. biotite/sequence/align/matrix_data/PAM220.mat +34 -0
  119. biotite/sequence/align/matrix_data/PAM230.mat +34 -0
  120. biotite/sequence/align/matrix_data/PAM240.mat +34 -0
  121. biotite/sequence/align/matrix_data/PAM250.mat +34 -0
  122. biotite/sequence/align/matrix_data/PAM260.mat +34 -0
  123. biotite/sequence/align/matrix_data/PAM270.mat +34 -0
  124. biotite/sequence/align/matrix_data/PAM280.mat +34 -0
  125. biotite/sequence/align/matrix_data/PAM290.mat +34 -0
  126. biotite/sequence/align/matrix_data/PAM30.mat +34 -0
  127. biotite/sequence/align/matrix_data/PAM300.mat +34 -0
  128. biotite/sequence/align/matrix_data/PAM310.mat +34 -0
  129. biotite/sequence/align/matrix_data/PAM320.mat +34 -0
  130. biotite/sequence/align/matrix_data/PAM330.mat +34 -0
  131. biotite/sequence/align/matrix_data/PAM340.mat +34 -0
  132. biotite/sequence/align/matrix_data/PAM350.mat +34 -0
  133. biotite/sequence/align/matrix_data/PAM360.mat +34 -0
  134. biotite/sequence/align/matrix_data/PAM370.mat +34 -0
  135. biotite/sequence/align/matrix_data/PAM380.mat +34 -0
  136. biotite/sequence/align/matrix_data/PAM390.mat +34 -0
  137. biotite/sequence/align/matrix_data/PAM40.mat +34 -0
  138. biotite/sequence/align/matrix_data/PAM400.mat +34 -0
  139. biotite/sequence/align/matrix_data/PAM410.mat +34 -0
  140. biotite/sequence/align/matrix_data/PAM420.mat +34 -0
  141. biotite/sequence/align/matrix_data/PAM430.mat +34 -0
  142. biotite/sequence/align/matrix_data/PAM440.mat +34 -0
  143. biotite/sequence/align/matrix_data/PAM450.mat +34 -0
  144. biotite/sequence/align/matrix_data/PAM460.mat +34 -0
  145. biotite/sequence/align/matrix_data/PAM470.mat +34 -0
  146. biotite/sequence/align/matrix_data/PAM480.mat +34 -0
  147. biotite/sequence/align/matrix_data/PAM490.mat +34 -0
  148. biotite/sequence/align/matrix_data/PAM50.mat +34 -0
  149. biotite/sequence/align/matrix_data/PAM500.mat +34 -0
  150. biotite/sequence/align/matrix_data/PAM60.mat +34 -0
  151. biotite/sequence/align/matrix_data/PAM70.mat +34 -0
  152. biotite/sequence/align/matrix_data/PAM80.mat +34 -0
  153. biotite/sequence/align/matrix_data/PAM90.mat +34 -0
  154. biotite/sequence/align/matrix_data/RBLOSUM52_5.0.mat +25 -0
  155. biotite/sequence/align/matrix_data/RBLOSUM59_13p.mat +25 -0
  156. biotite/sequence/align/matrix_data/RBLOSUM59_14.3.mat +25 -0
  157. biotite/sequence/align/matrix_data/RBLOSUM64_5.0.mat +25 -0
  158. biotite/sequence/align/matrix_data/RBLOSUM69_13p.mat +25 -0
  159. biotite/sequence/align/matrix_data/RBLOSUM69_14.3.mat +25 -0
  160. biotite/sequence/align/multiple.cpython-312-darwin.so +0 -0
  161. biotite/sequence/align/multiple.pyx +620 -0
  162. biotite/sequence/align/pairwise.cpython-312-darwin.so +0 -0
  163. biotite/sequence/align/pairwise.pyx +587 -0
  164. biotite/sequence/align/permutation.cpython-312-darwin.so +0 -0
  165. biotite/sequence/align/permutation.pyx +305 -0
  166. biotite/sequence/align/primes.txt +821 -0
  167. biotite/sequence/align/selector.cpython-312-darwin.so +0 -0
  168. biotite/sequence/align/selector.pyx +956 -0
  169. biotite/sequence/align/statistics.py +265 -0
  170. biotite/sequence/align/tracetable.cpython-312-darwin.so +0 -0
  171. biotite/sequence/align/tracetable.pxd +64 -0
  172. biotite/sequence/align/tracetable.pyx +370 -0
  173. biotite/sequence/alphabet.py +566 -0
  174. biotite/sequence/annotation.py +829 -0
  175. biotite/sequence/codec.cpython-312-darwin.so +0 -0
  176. biotite/sequence/codec.pyx +155 -0
  177. biotite/sequence/codon.py +466 -0
  178. biotite/sequence/codon_tables.txt +202 -0
  179. biotite/sequence/graphics/__init__.py +33 -0
  180. biotite/sequence/graphics/alignment.py +1034 -0
  181. biotite/sequence/graphics/color_schemes/autumn.json +51 -0
  182. biotite/sequence/graphics/color_schemes/blossom.json +51 -0
  183. biotite/sequence/graphics/color_schemes/clustalx_dna.json +11 -0
  184. biotite/sequence/graphics/color_schemes/clustalx_protein.json +28 -0
  185. biotite/sequence/graphics/color_schemes/flower.json +51 -0
  186. biotite/sequence/graphics/color_schemes/jalview_buried.json +31 -0
  187. biotite/sequence/graphics/color_schemes/jalview_hydrophobicity.json +31 -0
  188. biotite/sequence/graphics/color_schemes/jalview_prop_helix.json +31 -0
  189. biotite/sequence/graphics/color_schemes/jalview_prop_strand.json +31 -0
  190. biotite/sequence/graphics/color_schemes/jalview_prop_turn.json +31 -0
  191. biotite/sequence/graphics/color_schemes/jalview_taylor.json +28 -0
  192. biotite/sequence/graphics/color_schemes/jalview_zappo.json +28 -0
  193. biotite/sequence/graphics/color_schemes/ocean.json +51 -0
  194. biotite/sequence/graphics/color_schemes/pb_flower.json +39 -0
  195. biotite/sequence/graphics/color_schemes/rainbow_dna.json +11 -0
  196. biotite/sequence/graphics/color_schemes/rainbow_protein.json +30 -0
  197. biotite/sequence/graphics/color_schemes/spring.json +51 -0
  198. biotite/sequence/graphics/color_schemes/sunset.json +51 -0
  199. biotite/sequence/graphics/color_schemes/wither.json +51 -0
  200. biotite/sequence/graphics/colorschemes.py +139 -0
  201. biotite/sequence/graphics/dendrogram.py +184 -0
  202. biotite/sequence/graphics/features.py +510 -0
  203. biotite/sequence/graphics/logo.py +110 -0
  204. biotite/sequence/graphics/plasmid.py +661 -0
  205. biotite/sequence/io/__init__.py +12 -0
  206. biotite/sequence/io/fasta/__init__.py +22 -0
  207. biotite/sequence/io/fasta/convert.py +273 -0
  208. biotite/sequence/io/fasta/file.py +278 -0
  209. biotite/sequence/io/fastq/__init__.py +19 -0
  210. biotite/sequence/io/fastq/convert.py +120 -0
  211. biotite/sequence/io/fastq/file.py +551 -0
  212. biotite/sequence/io/genbank/__init__.py +17 -0
  213. biotite/sequence/io/genbank/annotation.py +277 -0
  214. biotite/sequence/io/genbank/file.py +575 -0
  215. biotite/sequence/io/genbank/metadata.py +324 -0
  216. biotite/sequence/io/genbank/sequence.py +172 -0
  217. biotite/sequence/io/general.py +192 -0
  218. biotite/sequence/io/gff/__init__.py +26 -0
  219. biotite/sequence/io/gff/convert.py +133 -0
  220. biotite/sequence/io/gff/file.py +434 -0
  221. biotite/sequence/phylo/__init__.py +36 -0
  222. biotite/sequence/phylo/nj.cpython-312-darwin.so +0 -0
  223. biotite/sequence/phylo/nj.pyx +221 -0
  224. biotite/sequence/phylo/tree.cpython-312-darwin.so +0 -0
  225. biotite/sequence/phylo/tree.pyx +1169 -0
  226. biotite/sequence/phylo/upgma.cpython-312-darwin.so +0 -0
  227. biotite/sequence/phylo/upgma.pyx +164 -0
  228. biotite/sequence/profile.py +456 -0
  229. biotite/sequence/search.py +116 -0
  230. biotite/sequence/seqtypes.py +556 -0
  231. biotite/sequence/sequence.py +374 -0
  232. biotite/structure/__init__.py +132 -0
  233. biotite/structure/atoms.py +1455 -0
  234. biotite/structure/basepairs.py +1415 -0
  235. biotite/structure/bonds.cpython-312-darwin.so +0 -0
  236. biotite/structure/bonds.pyx +1933 -0
  237. biotite/structure/box.py +592 -0
  238. biotite/structure/celllist.cpython-312-darwin.so +0 -0
  239. biotite/structure/celllist.pyx +849 -0
  240. biotite/structure/chains.py +298 -0
  241. biotite/structure/charges.cpython-312-darwin.so +0 -0
  242. biotite/structure/charges.pyx +520 -0
  243. biotite/structure/compare.py +274 -0
  244. biotite/structure/density.py +114 -0
  245. biotite/structure/dotbracket.py +216 -0
  246. biotite/structure/error.py +31 -0
  247. biotite/structure/filter.py +585 -0
  248. biotite/structure/geometry.py +697 -0
  249. biotite/structure/graphics/__init__.py +13 -0
  250. biotite/structure/graphics/atoms.py +226 -0
  251. biotite/structure/graphics/rna.py +282 -0
  252. biotite/structure/hbond.py +409 -0
  253. biotite/structure/info/__init__.py +25 -0
  254. biotite/structure/info/atom_masses.json +121 -0
  255. biotite/structure/info/atoms.py +82 -0
  256. biotite/structure/info/bonds.py +145 -0
  257. biotite/structure/info/ccd/README.rst +8 -0
  258. biotite/structure/info/ccd/amino_acids.txt +1663 -0
  259. biotite/structure/info/ccd/carbohydrates.txt +1135 -0
  260. biotite/structure/info/ccd/components.bcif +0 -0
  261. biotite/structure/info/ccd/nucleotides.txt +798 -0
  262. biotite/structure/info/ccd.py +95 -0
  263. biotite/structure/info/groups.py +90 -0
  264. biotite/structure/info/masses.py +123 -0
  265. biotite/structure/info/misc.py +144 -0
  266. biotite/structure/info/radii.py +197 -0
  267. biotite/structure/info/standardize.py +196 -0
  268. biotite/structure/integrity.py +268 -0
  269. biotite/structure/io/__init__.py +30 -0
  270. biotite/structure/io/ctab.py +72 -0
  271. biotite/structure/io/dcd/__init__.py +13 -0
  272. biotite/structure/io/dcd/file.py +65 -0
  273. biotite/structure/io/general.py +257 -0
  274. biotite/structure/io/gro/__init__.py +14 -0
  275. biotite/structure/io/gro/file.py +343 -0
  276. biotite/structure/io/mmtf/__init__.py +21 -0
  277. biotite/structure/io/mmtf/assembly.py +214 -0
  278. biotite/structure/io/mmtf/convertarray.cpython-312-darwin.so +0 -0
  279. biotite/structure/io/mmtf/convertarray.pyx +341 -0
  280. biotite/structure/io/mmtf/convertfile.cpython-312-darwin.so +0 -0
  281. biotite/structure/io/mmtf/convertfile.pyx +501 -0
  282. biotite/structure/io/mmtf/decode.cpython-312-darwin.so +0 -0
  283. biotite/structure/io/mmtf/decode.pyx +152 -0
  284. biotite/structure/io/mmtf/encode.cpython-312-darwin.so +0 -0
  285. biotite/structure/io/mmtf/encode.pyx +183 -0
  286. biotite/structure/io/mmtf/file.py +233 -0
  287. biotite/structure/io/mol/__init__.py +20 -0
  288. biotite/structure/io/mol/convert.py +115 -0
  289. biotite/structure/io/mol/ctab.py +414 -0
  290. biotite/structure/io/mol/header.py +116 -0
  291. biotite/structure/io/mol/mol.py +193 -0
  292. biotite/structure/io/mol/sdf.py +916 -0
  293. biotite/structure/io/netcdf/__init__.py +13 -0
  294. biotite/structure/io/netcdf/file.py +63 -0
  295. biotite/structure/io/npz/__init__.py +20 -0
  296. biotite/structure/io/npz/file.py +152 -0
  297. biotite/structure/io/pdb/__init__.py +20 -0
  298. biotite/structure/io/pdb/convert.py +293 -0
  299. biotite/structure/io/pdb/file.py +1240 -0
  300. biotite/structure/io/pdb/hybrid36.cpython-312-darwin.so +0 -0
  301. biotite/structure/io/pdb/hybrid36.pyx +242 -0
  302. biotite/structure/io/pdbqt/__init__.py +15 -0
  303. biotite/structure/io/pdbqt/convert.py +107 -0
  304. biotite/structure/io/pdbqt/file.py +640 -0
  305. biotite/structure/io/pdbx/__init__.py +23 -0
  306. biotite/structure/io/pdbx/bcif.py +648 -0
  307. biotite/structure/io/pdbx/cif.py +1032 -0
  308. biotite/structure/io/pdbx/component.py +246 -0
  309. biotite/structure/io/pdbx/convert.py +1597 -0
  310. biotite/structure/io/pdbx/encoding.cpython-312-darwin.so +0 -0
  311. biotite/structure/io/pdbx/encoding.pyx +950 -0
  312. biotite/structure/io/pdbx/legacy.py +267 -0
  313. biotite/structure/io/tng/__init__.py +13 -0
  314. biotite/structure/io/tng/file.py +46 -0
  315. biotite/structure/io/trajfile.py +710 -0
  316. biotite/structure/io/trr/__init__.py +13 -0
  317. biotite/structure/io/trr/file.py +46 -0
  318. biotite/structure/io/xtc/__init__.py +13 -0
  319. biotite/structure/io/xtc/file.py +46 -0
  320. biotite/structure/mechanics.py +75 -0
  321. biotite/structure/molecules.py +353 -0
  322. biotite/structure/pseudoknots.py +642 -0
  323. biotite/structure/rdf.py +243 -0
  324. biotite/structure/repair.py +253 -0
  325. biotite/structure/residues.py +562 -0
  326. biotite/structure/resutil.py +178 -0
  327. biotite/structure/sasa.cpython-312-darwin.so +0 -0
  328. biotite/structure/sasa.pyx +322 -0
  329. biotite/structure/sequence.py +112 -0
  330. biotite/structure/sse.py +327 -0
  331. biotite/structure/superimpose.py +727 -0
  332. biotite/structure/transform.py +504 -0
  333. biotite/structure/util.py +98 -0
  334. biotite/temp.py +86 -0
  335. biotite/version.py +16 -0
  336. biotite/visualize.py +251 -0
  337. biotite-0.41.1.dist-info/METADATA +187 -0
  338. biotite-0.41.1.dist-info/RECORD +340 -0
  339. biotite-0.41.1.dist-info/WHEEL +4 -0
  340. biotite-0.41.1.dist-info/licenses/LICENSE.rst +30 -0
@@ -0,0 +1,575 @@
1
+ # This source code is part of the Biotite package and is distributed
2
+ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
3
+ # information.
4
+
5
+ __name__ = "biotite.sequence.io.genbank"
6
+ __author__ = "Patrick Kunzmann"
7
+ __all__ = ["GenBankFile", "MultiFile"]
8
+
9
+ #import textwrap
10
+ import copy
11
+ #import re
12
+ import io
13
+ from ....file import TextFile, InvalidFileError
14
+ from collections import OrderedDict
15
+ #from ...annotation import Location, Feature, Annotation, AnnotatedSequence
16
+ #from ...seqtypes import NucleotideSequence, ProteinSequence
17
+
18
+
19
+ class GenBankFile(TextFile):
20
+ """
21
+ This class represents a file in GenBank format (including GenPept).
22
+
23
+ A GenBank file annotates a reference sequence with features such as
24
+ positions of genes, promoters, etc.
25
+ Additionally, it provides metadata further describing the file.
26
+
27
+ A file is divided into separate fields, e.g. the *DEFINITION*
28
+ field contains a description of the file.
29
+ The field name starts at the beginning of a line,
30
+ followed by the content.
31
+ A field may contain subfields, whose name is indented.
32
+ For example, the *SOURCE* field contains the *ORGANISM* subfield.
33
+ Some fields may occur multiple times, e.g. the *REFERENCE* field.
34
+ A sample GenBank file can be viewed at
35
+ `<https://www.ncbi.nlm.nih.gov/Sitemap/samplerecord.html>`_.
36
+
37
+ This class provides a low-level interface for parsing, editing and
38
+ writing GenBank files.
39
+ It works like a list of field entries, where a field consists of the
40
+ field name, the field content and the subfields.
41
+ The field content is separated into the lines belonging to the
42
+ content.
43
+ While the content of metadata fields starts at the standard
44
+ GenBank indentation of 12, the content of the *FEATURES*
45
+ (contains the annotation) and *ORIGIN* (contains the sequence)
46
+ fields starts without indentation.
47
+ The subfields are represented by a dictionary, with subfield names
48
+ being keys and the corresponding lines being values.
49
+ The *FEATURES* and *ORIGIN* fields have no subfields.
50
+
51
+ Every entry can be obtained, set and deleted via the index operator.
52
+
53
+ Notes
54
+ -----
55
+ This class does not support location identifiers with references
56
+ to other Entrez database entries, e.g.
57
+ ``join(1..100,J00194.1:100..202)``.
58
+
59
+ Examples
60
+ --------
61
+ Create a GenBank file from scratch:
62
+
63
+ >>> file = GenBankFile()
64
+ >>> file.append(
65
+ ... "SOMEFIELD", ["One line", "A second line"],
66
+ ... subfields={"SUBFIELD1": ["Single Line"], "SUBFIELD2": ["Two", "lines"]}
67
+ ... )
68
+ >>> print(file)
69
+ SOMEFIELD One line
70
+ A second line
71
+ SUBFIELD1 Single Line
72
+ SUBFIELD2 Two
73
+ lines
74
+ //
75
+ >>> name, content, subfields = file[0]
76
+ >>> print(name)
77
+ SOMEFIELD
78
+ >>> print(content)
79
+ ['One line', 'A second line']
80
+ >>> print(subfields)
81
+ OrderedDict([('SUBFIELD1', ['Single Line']), ('SUBFIELD2', ['Two', 'lines'])])
82
+
83
+ Adding an additional field:
84
+
85
+ >>> file.insert(0, "OTHERFIELD", ["Another line"])
86
+ >>> print(len(file))
87
+ 2
88
+ >>> print(file)
89
+ OTHERFIELD Another line
90
+ SOMEFIELD One line
91
+ A second line
92
+ SUBFIELD1 Single Line
93
+ SUBFIELD2 Two
94
+ lines
95
+ //
96
+
97
+ Overwriting and deleting an existing field:
98
+
99
+ >>> file[1] = "NEWFIELD", ["Yet another line"]
100
+ >>> print(file)
101
+ OTHERFIELD Another line
102
+ NEWFIELD Yet another line
103
+ //
104
+ >>> file[1] = "NEWFIELD", ["Yet another line"], {"NEWSUB": ["Subfield line"]}
105
+ >>> print(file)
106
+ OTHERFIELD Another line
107
+ NEWFIELD Yet another line
108
+ NEWSUB Subfield line
109
+ //
110
+ >>> del file[1]
111
+ >>> print(file)
112
+ OTHERFIELD Another line
113
+ //
114
+
115
+ Parsing fields from a real GenBank file:
116
+
117
+ >>> import os.path
118
+ >>> file = GenBankFile.read(os.path.join(path_to_sequences, "gg_avidin.gb"))
119
+ >>> print(file)
120
+ LOCUS AJ311647 1224 bp DNA linear VRT 14-NOV-2006
121
+ DEFINITION Gallus gallus AVD gene for avidin, exons 1-4.
122
+ ACCESSION AJ311647
123
+ VERSION AJ311647.1 GI:13397825
124
+ KEYWORDS AVD gene; avidin.
125
+ SOURCE Gallus gallus (chicken)
126
+ ORGANISM Gallus gallus
127
+ Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi;
128
+ Archelosauria; Archosauria; Dinosauria; Saurischia; Theropoda;
129
+ Coelurosauria; Aves; Neognathae; Galloanserae; Galliformes;
130
+ Phasianidae; Phasianinae; Gallus.
131
+ REFERENCE 1
132
+ AUTHORS Wallen,M.J., Laukkanen,M.O. and Kulomaa,M.S.
133
+ TITLE Cloning and sequencing of the chicken egg-white avidin-encoding
134
+ gene and its relationship with the avidin-related genes Avr1-Avr5
135
+ JOURNAL Gene 161 (2), 205-209 (1995)
136
+ PUBMED 7665080
137
+ REFERENCE 2
138
+ AUTHORS Ahlroth,M.K., Kola,E.H., Ewald,D., Masabanda,J., Sazanov,A.,
139
+ Fries,R. and Kulomaa,M.S.
140
+ TITLE Characterization and chromosomal localization of the chicken avidin
141
+ gene family
142
+ JOURNAL Anim. Genet. 31 (6), 367-375 (2000)
143
+ PUBMED 11167523
144
+ REFERENCE 3 (bases 1 to 1224)
145
+ AUTHORS Ahlroth,M.K.
146
+ TITLE Direct Submission
147
+ JOURNAL Submitted (09-MAR-2001) Ahlroth M.K., Department of Biological and
148
+ Environmental Science, University of Jyvaskyla, PO Box 35,
149
+ FIN-40351 Jyvaskyla, FINLAND
150
+ FEATURES Location/Qualifiers
151
+ source 1..1224
152
+ /organism="Gallus gallus"
153
+ /mol_type="genomic DNA"
154
+ ...
155
+ >>> name, content, _ = file[3]
156
+ >>> print(name)
157
+ VERSION
158
+ >>> print(content)
159
+ ['AJ311647.1 GI:13397825']
160
+ >>> name, content, subfields = file[5]
161
+ >>> print(name)
162
+ SOURCE
163
+ >>> print(content)
164
+ ['Gallus gallus (chicken)']
165
+ >>> print(dict(subfields))
166
+ {'ORGANISM': ['Gallus gallus', 'Eukaryota; Metazoa; Chordata; ...', ...]}
167
+ """
168
+
169
def __init__(self):
    """
    Create a new GenBank file object without any parsed content.

    The line list is initialized with the ``//`` terminator only.
    """
    super().__init__()
    # Every GenBank file ends with a '//' line, so this is the
    # minimal content
    self.lines = ["//"]
    # One (start, exclusive stop, name) tuple per field;
    # start and stop refer to positions in the list of lines
    self._field_pos = []
    self._find_field_indices()
177
+
178
@classmethod
def read(cls, file):
    """
    Parse a GenBank file.

    Parameters
    ----------
    file : file-like object or str
        The file to be read.
        A path to the file may be given instead.

    Returns
    -------
    file_object : GenBankFile
        The parsed file.
    """
    genbank_file = super().read(file)
    # The field positions must be determined for the lines
    # that have just been read
    genbank_file._find_field_indices()
    return genbank_file
197
+
198
def get_fields(self, name):
    """
    Collect every *GenBank* field that has the given field name.

    Parameters
    ----------
    name : str
        The field name.

    Returns
    -------
    fields : list of (list of str, OrderedDict of str -> list of str)
        One tuple for each matching field.
        The first tuple element holds the content lines,
        the second one holds the subfields as dictionary, which is
        empty if the field has no subfields.
        Usually only a single field matches, but some fields
        (e.g. *REFERENCE*) may appear multiple times.
    """
    fields = []
    for field_index in self.get_indices(name):
        entry = self[field_index]
        # The first element is the field name, which is already known
        fields.append(entry[1:])
    return fields
221
+
222
def get_indices(self, name):
    """
    Find the indices of every *GenBank* field that has the given
    field name.

    The given name is converted to upper case before it is compared.

    Parameters
    ----------
    name : str
        The field name.

    Returns
    -------
    indices : list of int
        The indices of the matching fields.
        Usually only a single field matches, but some fields
        (e.g. *REFERENCE*) may appear multiple times.
    """
    wanted = name.upper()
    return [
        position
        for position, (_start, _stop, field_name)
        in enumerate(self._field_pos)
        if field_name == wanted
    ]
245
+
246
def set_field(self, name, content, subfield_dict=None):
    """
    Write a *GenBank* field with the given content into the file.

    An already existing field with this name is replaced.
    If there is no such field yet, the field is added at the end of
    the file.

    Parameters
    ----------
    name : str
        The field name.
    content : list of str
        The content lines.
    subfield_dict : dict of str -> list of str, optional
        The subfields of the field.
        Each subfield name is mapped to the content lines of that
        subfield.

    Raises
    ------
    InvalidFileError
        If the file contains more than one field with this name,
        as it would be ambiguous which one should be replaced.
    """
    name = name.upper()
    matches = self.get_indices(name)
    if len(matches) > 1:
        raise InvalidFileError(f"File contains multiple '{name}' fields")
    if not matches:
        # The field does not exist yet
        self.append(name, content, subfield_dict)
        return
    # Exactly one field has this name -> overwrite it
    self[matches[0]] = name, content, subfield_dict
282
+
283
def __getitem__(self, index):
    """
    Get the field at the given index.

    Parameters
    ----------
    index : int
        The index of the field.

    Returns
    -------
    name : str
        The field name.
    content : list of str
        The content lines of the field.
        Lines belonging to a subfield are not included.
    subfield_dict : OrderedDict
        Maps each subfield name to the content of that subfield.
        Always empty for the *FEATURES* and *ORIGIN* fields.
    """
    index = self._translate_idx(index)
    start, stop, name = self._field_pos[index]
    subfield_dict = OrderedDict()

    if name in ("FEATURES", "ORIGIN"):
        # These two fields have no subfields:
        # All lines following the field name line are taken as
        # content, requested without indentation
        content = self._get_field_content(start + 1, stop, indent=0)
        return name, content, subfield_dict

    # Metadata field: The standard GenBank indentation (=12) is used.
    # A line with non-whitespace characters within the first
    # 12 columns is the header of a new subfield
    subfield_headers = []
    for line_index in range(start + 1, stop):
        header = self.lines[line_index][:12].strip()
        if header:
            subfield_headers.append((line_index, header))

    # A subfield ends where the next subfield starts;
    # the last subfield ends at the end of the field
    subfield_stops = [pos for pos, _ in subfield_headers[1:]] + [stop]
    for (sub_start, header), sub_stop in zip(
        subfield_headers, subfield_stops
    ):
        subfield_dict[header] = self._get_field_content(
            sub_start, sub_stop, indent=12
        )

    # The content of the field itself ends at the first subfield
    content_stop = subfield_headers[0][0] if subfield_headers else stop
    content = self._get_field_content(start, content_stop, indent=12)
    return name, content, subfield_dict
328
+
329
def __setitem__(self, index, item):
    """
    Replace the field at the given index.

    Parameters
    ----------
    index : int
        The index of the field to be replaced.
    item : tuple
        Either ``(name, content)`` or ``(name, content, subfields)``.
        If the subfields are omitted, ``None`` is used for them.

    Raises
    ------
    TypeError
        If `item` is not a tuple with two or three elements.
    """
    index = self._translate_idx(index)
    if not isinstance(item, tuple) or len(item) not in (2, 3):
        raise TypeError(
            "Expected a tuple of name, content and optionally subfields"
        )
    name, content, *optional = item
    subfields = optional[0] if optional else None
    inserted_lines = self._to_lines(name, content, subfields)

    # The new field starts where the replaced field started
    start, old_stop, _ = self._field_pos[index]
    # The lines following the replaced field are kept.
    # No special case is required for the end of the file, as a slice
    # starting at the list length is simply empty
    # (the integers must not be compared by identity via 'is')
    self.lines = self.lines[:start] + inserted_lines + self.lines[old_stop:]
    # Shift the start/stop indices of the following fields
    # by the difference in the number of lines
    shift = len(inserted_lines) - (old_stop - start)
    for i in range(index+1, len(self._field_pos)):
        next_start, next_stop, fname = self._field_pos[i]
        self._field_pos[i] = next_start+shift, next_stop+shift, fname
    # Overwrite the entry of the replaced field
    self._field_pos[index] = start, start+len(inserted_lines), name.upper()
363
+
364
def __delitem__(self, index):
    """
    Remove the field at the given index together with its lines.

    Parameters
    ----------
    index : int
        The index of the field to be removed.
    """
    index = self._translate_idx(index)
    start, stop, _ = self._field_pos[index]
    removed_line_count = stop - start
    del self.lines[start:stop]
    del self._field_pos[index]
    # All fields after the removed one move up
    # by the number of removed lines
    for position in range(index, len(self._field_pos)):
        field_start, field_stop, field_name = self._field_pos[position]
        self._field_pos[position] = (
            field_start - removed_line_count,
            field_stop - removed_line_count,
            field_name,
        )
375
+
376
+ def __len__(self):
377
+ return len(self._field_pos)
378
+
379
def insert(self, index, name, content, subfields=None):
    """
    Insert a *GenBank* field at the given position.

    Parameters
    ----------
    index : int
        The new field is inserted before the current field at this
        index.
        If the index is after the last field, the new field
        is appended to the end of the file.
    name : str
        The field name.
    content : list of str
        The content lines.
    subfields : dict of str -> list of str, optional
        The subfields of the field.
        The dictionary maps subfield names to the content lines of
        the respective subfield.
    """
    index = self._translate_idx(index, length_exclusive=False)
    inserted_lines = self._to_lines(name, content, subfields)

    # Stop of previous field is start of new field
    if index == 0:
        start = 0
    else:
        _, start, _ = self._field_pos[index-1]
    # The lines from the insertion point onwards follow the new lines.
    # No special case is required for inserting at the very end, as a
    # slice starting at the list length is simply empty
    # (the integers must not be compared by identity via 'is')
    self.lines = self.lines[:start] + inserted_lines + self.lines[start:]
    # Shift the start/stop indices of the following fields
    # by the number of inserted lines
    shift = len(inserted_lines)
    for i in range(index, len(self._field_pos)):
        old_start, old_stop, fname = self._field_pos[i]
        self._field_pos[i] = old_start+shift, old_stop+shift, fname
    # Add new entry
    self._field_pos.insert(index, (start, start+shift, name.upper()))
425
+
426
def append(self, name, content, subfields=None):
    """
    Add a new *GenBank* field behind all fields that already exist.

    This is equivalent to calling :meth:`insert()` with the current
    number of fields as index.

    Parameters
    ----------
    name : str
        The field name.
    content : list of str
        The content lines.
    subfields : dict of str -> list of str, optional
        The subfields of the field.
        The dictionary maps subfield names to the content lines of
        the respective subfield.
    """
    end_index = len(self)
    self.insert(end_index, name, content, subfields)
442
+
443
+
444
+ def _find_field_indices(self):
445
+ """
446
+ Identify the start and exclusive stop indices of lines
447
+ corresponding to a field name for all fields in the file.
448
+ """
449
+ start = None
450
+ name = ""
451
+ self._field_pos = []
452
+ for i, line in enumerate(self.lines):
453
+ # Check if line contains a new major field
454
+ # (Header beginning from first column)
455
+ if len(line) != 0 and line[0] != " ":
456
+ if line[:2] != "//":
457
+ stop = i
458
+ if start is not None:
459
+ # Store previous field
460
+ self._field_pos.append((start, stop, name))
461
+ start = i
462
+ name = line[0:12].strip()
463
+ else:
464
+ # '//' means end of file
465
+ # -> Store last field
466
+ if start is not None:
467
+ stop = i
468
+ self._field_pos.append((start, stop, name))
469
+
470
+ def _get_field_content(self, start, stop, indent):
471
+ if indent == 0:
472
+ return self.lines[start : stop]
473
+ else:
474
+ return [line[12:] for line in self.lines[start : stop]]
475
+
476
+ def _to_lines(self, name, content, subfields):
477
+ """
478
+ Convert the field name, field content und subfield dictionary
479
+ into text lines
480
+ """
481
+ if subfields is None:
482
+ subfields = {}
483
+
484
+ name = name.strip().upper()
485
+ if len(name) == 0:
486
+ raise ValueError(f"Must give a non emtpy name")
487
+ subfields = OrderedDict({
488
+ subfield_name.upper().strip() : subfield_lines
489
+ for subfield_name, subfield_lines in subfields.items()
490
+ })
491
+
492
+ # Create lines for new field
493
+ if name == "FEATURES":
494
+ # Header line plus all actual feature lines
495
+ lines = copy.copy(content)
496
+ lines.insert(
497
+ 0, "FEATURES" + " "*13 + "Location/Qualifiers"
498
+ )
499
+ elif name == "ORIGIN":
500
+ # Header line plus all actual sequence lines
501
+ lines = copy.copy(content)
502
+ lines.insert(0, "ORIGIN")
503
+ else:
504
+ name_column = []
505
+ content_column = []
506
+ # Create a line for the field name and empty lines
507
+ # for each additional line required by the content
508
+ name_column += [name] + [""] * (len(content)-1)
509
+ content_column += content
510
+ for subfield_name, subfield_lines in subfields.items():
511
+ name_column += [" " + subfield_name] \
512
+ + [""] * (len(subfield_lines)-1)
513
+ content_column += subfield_lines
514
+ lines = [f"{n_col:12}{c_col}" for n_col, c_col
515
+ in zip(name_column, content_column)]
516
+
517
+ return lines
518
+
519
+
520
+ def _translate_idx(self, index, length_exclusive=True):
521
+ """
522
+ Check index boundaries and convert negative index to positive
523
+ index.
524
+ """
525
+ if index < 0:
526
+ new_index = len(self) + index
527
+ else:
528
+ new_index = index
529
+ if length_exclusive:
530
+ if new_index >= len(self):
531
+ raise IndexError(f"Index {index} is out of range")
532
+ else:
533
+ if new_index > len(self):
534
+ raise IndexError(f"Index {index} is out of range")
535
+ return new_index
536
+
537
+
538
class MultiFile(TextFile):
    """
    This class represents a file in *GenBank* or *GenPept* format,
    that contains multiple entries, for more than one UID.

    The information for each UID are appended to each other in such a
    file.
    Objects of this class can be iterated to obtain a
    :class:`GenBankFile` for each entry in the file.

    Examples
    --------

    >>> import os.path
    >>> file_name = fetch_single_file(
    ...     ["1L2Y_A", "3O5R_A", "5UGO_A"],
    ...     os.path.join(path_to_directory, "multifile.gp"),
    ...     "protein", "gp"
    ... )
    >>> multi_file = MultiFile.read(file_name)
    >>> for gp_file in multi_file:
    ...     print(get_accession(gp_file))
    1L2Y_A
    3O5R_A
    5UGO_A
    """

    def __iter__(self):
        """
        Iterate over the entries of this file.

        Each line consisting only of ``//`` (ignoring surrounding
        whitespace) ends an entry.
        Lines after the last ``//`` line are not part of any entry.

        Yields
        ------
        file : GenBankFile
            The file read from the lines of the respective entry,
            including its ``//`` line.
        """
        start_i = 0
        for i, line in enumerate(self.lines):
            if line.strip() != "//":
                continue
            # Create file with lines corresponding to that entry
            file_content = "\n".join(self.lines[start_i : i+1])
            entry_file = GenBankFile.read(io.StringIO(file_content))
            # The next entry starts behind the '//' line;
            # otherwise it would begin with the '//' of this entry
            start_i = i + 1
            yield entry_file