biotite 1.5.0__cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biotite might be problematic. Click here for more details.
- biotite/__init__.py +18 -0
- biotite/application/__init__.py +69 -0
- biotite/application/application.py +276 -0
- biotite/application/autodock/__init__.py +12 -0
- biotite/application/autodock/app.py +500 -0
- biotite/application/blast/__init__.py +14 -0
- biotite/application/blast/alignment.py +92 -0
- biotite/application/blast/webapp.py +428 -0
- biotite/application/clustalo/__init__.py +12 -0
- biotite/application/clustalo/app.py +223 -0
- biotite/application/dssp/__init__.py +12 -0
- biotite/application/dssp/app.py +216 -0
- biotite/application/localapp.py +342 -0
- biotite/application/mafft/__init__.py +12 -0
- biotite/application/mafft/app.py +116 -0
- biotite/application/msaapp.py +363 -0
- biotite/application/muscle/__init__.py +13 -0
- biotite/application/muscle/app3.py +227 -0
- biotite/application/muscle/app5.py +163 -0
- biotite/application/sra/__init__.py +18 -0
- biotite/application/sra/app.py +447 -0
- biotite/application/tantan/__init__.py +12 -0
- biotite/application/tantan/app.py +199 -0
- biotite/application/util.py +77 -0
- biotite/application/viennarna/__init__.py +18 -0
- biotite/application/viennarna/rnaalifold.py +310 -0
- biotite/application/viennarna/rnafold.py +254 -0
- biotite/application/viennarna/rnaplot.py +208 -0
- biotite/application/viennarna/util.py +77 -0
- biotite/application/webapp.py +76 -0
- biotite/copyable.py +71 -0
- biotite/database/__init__.py +23 -0
- biotite/database/afdb/__init__.py +12 -0
- biotite/database/afdb/download.py +197 -0
- biotite/database/entrez/__init__.py +15 -0
- biotite/database/entrez/check.py +60 -0
- biotite/database/entrez/dbnames.py +101 -0
- biotite/database/entrez/download.py +228 -0
- biotite/database/entrez/key.py +44 -0
- biotite/database/entrez/query.py +263 -0
- biotite/database/error.py +16 -0
- biotite/database/pubchem/__init__.py +21 -0
- biotite/database/pubchem/download.py +258 -0
- biotite/database/pubchem/error.py +30 -0
- biotite/database/pubchem/query.py +819 -0
- biotite/database/pubchem/throttle.py +98 -0
- biotite/database/rcsb/__init__.py +13 -0
- biotite/database/rcsb/download.py +161 -0
- biotite/database/rcsb/query.py +963 -0
- biotite/database/uniprot/__init__.py +13 -0
- biotite/database/uniprot/check.py +40 -0
- biotite/database/uniprot/download.py +126 -0
- biotite/database/uniprot/query.py +292 -0
- biotite/file.py +244 -0
- biotite/interface/__init__.py +19 -0
- biotite/interface/openmm/__init__.py +20 -0
- biotite/interface/openmm/state.py +93 -0
- biotite/interface/openmm/system.py +227 -0
- biotite/interface/pymol/__init__.py +201 -0
- biotite/interface/pymol/cgo.py +346 -0
- biotite/interface/pymol/convert.py +185 -0
- biotite/interface/pymol/display.py +267 -0
- biotite/interface/pymol/object.py +1228 -0
- biotite/interface/pymol/shapes.py +178 -0
- biotite/interface/pymol/startup.py +169 -0
- biotite/interface/rdkit/__init__.py +19 -0
- biotite/interface/rdkit/mol.py +490 -0
- biotite/interface/version.py +94 -0
- biotite/interface/warning.py +19 -0
- biotite/sequence/__init__.py +84 -0
- biotite/sequence/align/__init__.py +199 -0
- biotite/sequence/align/alignment.py +702 -0
- biotite/sequence/align/banded.cpython-313-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/banded.pyx +652 -0
- biotite/sequence/align/buckets.py +71 -0
- biotite/sequence/align/cigar.py +425 -0
- biotite/sequence/align/kmeralphabet.cpython-313-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/kmeralphabet.pyx +595 -0
- biotite/sequence/align/kmersimilarity.cpython-313-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/kmersimilarity.pyx +233 -0
- biotite/sequence/align/kmertable.cpython-313-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/kmertable.pyx +3411 -0
- biotite/sequence/align/localgapped.cpython-313-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/localgapped.pyx +892 -0
- biotite/sequence/align/localungapped.cpython-313-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/localungapped.pyx +279 -0
- biotite/sequence/align/matrix.py +631 -0
- biotite/sequence/align/matrix_data/3Di.mat +24 -0
- biotite/sequence/align/matrix_data/BLOSUM100.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM30.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM35.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM40.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM45.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM50.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM50_13p.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM50_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM50_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM55.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM60.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM62.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM62_13p.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM62_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM62_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM65.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM70.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM75.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM80.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM85.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM90.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUMN.mat +31 -0
- biotite/sequence/align/matrix_data/CorBLOSUM49_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM57_13p.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM57_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM61_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM66_13p.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM67_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/DAYHOFF.mat +32 -0
- biotite/sequence/align/matrix_data/GONNET.mat +26 -0
- biotite/sequence/align/matrix_data/IDENTITY.mat +25 -0
- biotite/sequence/align/matrix_data/MATCH.mat +25 -0
- biotite/sequence/align/matrix_data/NUC.mat +25 -0
- biotite/sequence/align/matrix_data/PAM10.mat +34 -0
- biotite/sequence/align/matrix_data/PAM100.mat +34 -0
- biotite/sequence/align/matrix_data/PAM110.mat +34 -0
- biotite/sequence/align/matrix_data/PAM120.mat +34 -0
- biotite/sequence/align/matrix_data/PAM130.mat +34 -0
- biotite/sequence/align/matrix_data/PAM140.mat +34 -0
- biotite/sequence/align/matrix_data/PAM150.mat +34 -0
- biotite/sequence/align/matrix_data/PAM160.mat +34 -0
- biotite/sequence/align/matrix_data/PAM170.mat +34 -0
- biotite/sequence/align/matrix_data/PAM180.mat +34 -0
- biotite/sequence/align/matrix_data/PAM190.mat +34 -0
- biotite/sequence/align/matrix_data/PAM20.mat +34 -0
- biotite/sequence/align/matrix_data/PAM200.mat +34 -0
- biotite/sequence/align/matrix_data/PAM210.mat +34 -0
- biotite/sequence/align/matrix_data/PAM220.mat +34 -0
- biotite/sequence/align/matrix_data/PAM230.mat +34 -0
- biotite/sequence/align/matrix_data/PAM240.mat +34 -0
- biotite/sequence/align/matrix_data/PAM250.mat +34 -0
- biotite/sequence/align/matrix_data/PAM260.mat +34 -0
- biotite/sequence/align/matrix_data/PAM270.mat +34 -0
- biotite/sequence/align/matrix_data/PAM280.mat +34 -0
- biotite/sequence/align/matrix_data/PAM290.mat +34 -0
- biotite/sequence/align/matrix_data/PAM30.mat +34 -0
- biotite/sequence/align/matrix_data/PAM300.mat +34 -0
- biotite/sequence/align/matrix_data/PAM310.mat +34 -0
- biotite/sequence/align/matrix_data/PAM320.mat +34 -0
- biotite/sequence/align/matrix_data/PAM330.mat +34 -0
- biotite/sequence/align/matrix_data/PAM340.mat +34 -0
- biotite/sequence/align/matrix_data/PAM350.mat +34 -0
- biotite/sequence/align/matrix_data/PAM360.mat +34 -0
- biotite/sequence/align/matrix_data/PAM370.mat +34 -0
- biotite/sequence/align/matrix_data/PAM380.mat +34 -0
- biotite/sequence/align/matrix_data/PAM390.mat +34 -0
- biotite/sequence/align/matrix_data/PAM40.mat +34 -0
- biotite/sequence/align/matrix_data/PAM400.mat +34 -0
- biotite/sequence/align/matrix_data/PAM410.mat +34 -0
- biotite/sequence/align/matrix_data/PAM420.mat +34 -0
- biotite/sequence/align/matrix_data/PAM430.mat +34 -0
- biotite/sequence/align/matrix_data/PAM440.mat +34 -0
- biotite/sequence/align/matrix_data/PAM450.mat +34 -0
- biotite/sequence/align/matrix_data/PAM460.mat +34 -0
- biotite/sequence/align/matrix_data/PAM470.mat +34 -0
- biotite/sequence/align/matrix_data/PAM480.mat +34 -0
- biotite/sequence/align/matrix_data/PAM490.mat +34 -0
- biotite/sequence/align/matrix_data/PAM50.mat +34 -0
- biotite/sequence/align/matrix_data/PAM500.mat +34 -0
- biotite/sequence/align/matrix_data/PAM60.mat +34 -0
- biotite/sequence/align/matrix_data/PAM70.mat +34 -0
- biotite/sequence/align/matrix_data/PAM80.mat +34 -0
- biotite/sequence/align/matrix_data/PAM90.mat +34 -0
- biotite/sequence/align/matrix_data/PB.license +21 -0
- biotite/sequence/align/matrix_data/PB.mat +18 -0
- biotite/sequence/align/matrix_data/RBLOSUM52_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM59_13p.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM59_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM64_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM69_13p.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM69_14.3.mat +25 -0
- biotite/sequence/align/multiple.cpython-313-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/multiple.pyx +619 -0
- biotite/sequence/align/pairwise.cpython-313-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/pairwise.pyx +585 -0
- biotite/sequence/align/permutation.cpython-313-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/permutation.pyx +313 -0
- biotite/sequence/align/primes.txt +821 -0
- biotite/sequence/align/selector.cpython-313-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/selector.pyx +954 -0
- biotite/sequence/align/statistics.py +264 -0
- biotite/sequence/align/tracetable.cpython-313-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/tracetable.pxd +64 -0
- biotite/sequence/align/tracetable.pyx +370 -0
- biotite/sequence/alphabet.py +555 -0
- biotite/sequence/annotation.py +836 -0
- biotite/sequence/codec.cpython-313-x86_64-linux-gnu.so +0 -0
- biotite/sequence/codec.pyx +155 -0
- biotite/sequence/codon.py +476 -0
- biotite/sequence/codon_tables.txt +202 -0
- biotite/sequence/graphics/__init__.py +33 -0
- biotite/sequence/graphics/alignment.py +1101 -0
- biotite/sequence/graphics/color_schemes/3di_flower.json +48 -0
- biotite/sequence/graphics/color_schemes/autumn.json +51 -0
- biotite/sequence/graphics/color_schemes/blossom.json +51 -0
- biotite/sequence/graphics/color_schemes/clustalx_dna.json +11 -0
- biotite/sequence/graphics/color_schemes/clustalx_protein.json +28 -0
- biotite/sequence/graphics/color_schemes/flower.json +51 -0
- biotite/sequence/graphics/color_schemes/jalview_buried.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_hydrophobicity.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_prop_helix.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_prop_strand.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_prop_turn.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_taylor.json +28 -0
- biotite/sequence/graphics/color_schemes/jalview_zappo.json +28 -0
- biotite/sequence/graphics/color_schemes/ocean.json +51 -0
- biotite/sequence/graphics/color_schemes/pb_flower.json +40 -0
- biotite/sequence/graphics/color_schemes/rainbow_dna.json +11 -0
- biotite/sequence/graphics/color_schemes/rainbow_protein.json +30 -0
- biotite/sequence/graphics/color_schemes/spring.json +51 -0
- biotite/sequence/graphics/color_schemes/sunset.json +51 -0
- biotite/sequence/graphics/color_schemes/wither.json +51 -0
- biotite/sequence/graphics/colorschemes.py +170 -0
- biotite/sequence/graphics/dendrogram.py +231 -0
- biotite/sequence/graphics/features.py +544 -0
- biotite/sequence/graphics/logo.py +102 -0
- biotite/sequence/graphics/plasmid.py +712 -0
- biotite/sequence/io/__init__.py +12 -0
- biotite/sequence/io/fasta/__init__.py +22 -0
- biotite/sequence/io/fasta/convert.py +283 -0
- biotite/sequence/io/fasta/file.py +265 -0
- biotite/sequence/io/fastq/__init__.py +19 -0
- biotite/sequence/io/fastq/convert.py +117 -0
- biotite/sequence/io/fastq/file.py +507 -0
- biotite/sequence/io/genbank/__init__.py +17 -0
- biotite/sequence/io/genbank/annotation.py +269 -0
- biotite/sequence/io/genbank/file.py +573 -0
- biotite/sequence/io/genbank/metadata.py +336 -0
- biotite/sequence/io/genbank/sequence.py +173 -0
- biotite/sequence/io/general.py +201 -0
- biotite/sequence/io/gff/__init__.py +26 -0
- biotite/sequence/io/gff/convert.py +128 -0
- biotite/sequence/io/gff/file.py +449 -0
- biotite/sequence/phylo/__init__.py +36 -0
- biotite/sequence/phylo/nj.cpython-313-x86_64-linux-gnu.so +0 -0
- biotite/sequence/phylo/nj.pyx +221 -0
- biotite/sequence/phylo/tree.cpython-313-x86_64-linux-gnu.so +0 -0
- biotite/sequence/phylo/tree.pyx +1169 -0
- biotite/sequence/phylo/upgma.cpython-313-x86_64-linux-gnu.so +0 -0
- biotite/sequence/phylo/upgma.pyx +164 -0
- biotite/sequence/profile.py +561 -0
- biotite/sequence/search.py +117 -0
- biotite/sequence/seqtypes.py +720 -0
- biotite/sequence/sequence.py +373 -0
- biotite/setup_ccd.py +197 -0
- biotite/structure/__init__.py +135 -0
- biotite/structure/alphabet/__init__.py +25 -0
- biotite/structure/alphabet/encoder.py +332 -0
- biotite/structure/alphabet/encoder_weights_3di.kerasify +0 -0
- biotite/structure/alphabet/i3d.py +109 -0
- biotite/structure/alphabet/layers.py +86 -0
- biotite/structure/alphabet/pb.license +21 -0
- biotite/structure/alphabet/pb.py +170 -0
- biotite/structure/alphabet/unkerasify.py +128 -0
- biotite/structure/atoms.py +1562 -0
- biotite/structure/basepairs.py +1403 -0
- biotite/structure/bonds.cpython-313-x86_64-linux-gnu.so +0 -0
- biotite/structure/bonds.pyx +2036 -0
- biotite/structure/box.py +724 -0
- biotite/structure/celllist.cpython-313-x86_64-linux-gnu.so +0 -0
- biotite/structure/celllist.pyx +864 -0
- biotite/structure/chains.py +310 -0
- biotite/structure/charges.cpython-313-x86_64-linux-gnu.so +0 -0
- biotite/structure/charges.pyx +520 -0
- biotite/structure/compare.py +683 -0
- biotite/structure/density.py +109 -0
- biotite/structure/dotbracket.py +213 -0
- biotite/structure/error.py +39 -0
- biotite/structure/filter.py +591 -0
- biotite/structure/geometry.py +817 -0
- biotite/structure/graphics/__init__.py +13 -0
- biotite/structure/graphics/atoms.py +243 -0
- biotite/structure/graphics/rna.py +298 -0
- biotite/structure/hbond.py +425 -0
- biotite/structure/info/__init__.py +24 -0
- biotite/structure/info/atom_masses.json +121 -0
- biotite/structure/info/atoms.py +98 -0
- biotite/structure/info/bonds.py +149 -0
- biotite/structure/info/ccd.py +200 -0
- biotite/structure/info/components.bcif +0 -0
- biotite/structure/info/groups.py +128 -0
- biotite/structure/info/masses.py +121 -0
- biotite/structure/info/misc.py +137 -0
- biotite/structure/info/radii.py +267 -0
- biotite/structure/info/standardize.py +185 -0
- biotite/structure/integrity.py +213 -0
- biotite/structure/io/__init__.py +29 -0
- biotite/structure/io/dcd/__init__.py +13 -0
- biotite/structure/io/dcd/file.py +67 -0
- biotite/structure/io/general.py +243 -0
- biotite/structure/io/gro/__init__.py +14 -0
- biotite/structure/io/gro/file.py +343 -0
- biotite/structure/io/mol/__init__.py +20 -0
- biotite/structure/io/mol/convert.py +112 -0
- biotite/structure/io/mol/ctab.py +420 -0
- biotite/structure/io/mol/header.py +120 -0
- biotite/structure/io/mol/mol.py +149 -0
- biotite/structure/io/mol/sdf.py +940 -0
- biotite/structure/io/netcdf/__init__.py +13 -0
- biotite/structure/io/netcdf/file.py +64 -0
- biotite/structure/io/pdb/__init__.py +20 -0
- biotite/structure/io/pdb/convert.py +389 -0
- biotite/structure/io/pdb/file.py +1380 -0
- biotite/structure/io/pdb/hybrid36.cpython-313-x86_64-linux-gnu.so +0 -0
- biotite/structure/io/pdb/hybrid36.pyx +242 -0
- biotite/structure/io/pdbqt/__init__.py +15 -0
- biotite/structure/io/pdbqt/convert.py +113 -0
- biotite/structure/io/pdbqt/file.py +688 -0
- biotite/structure/io/pdbx/__init__.py +23 -0
- biotite/structure/io/pdbx/bcif.py +674 -0
- biotite/structure/io/pdbx/cif.py +1091 -0
- biotite/structure/io/pdbx/component.py +251 -0
- biotite/structure/io/pdbx/compress.py +362 -0
- biotite/structure/io/pdbx/convert.py +2113 -0
- biotite/structure/io/pdbx/encoding.cpython-313-x86_64-linux-gnu.so +0 -0
- biotite/structure/io/pdbx/encoding.pyx +1078 -0
- biotite/structure/io/trajfile.py +696 -0
- biotite/structure/io/trr/__init__.py +13 -0
- biotite/structure/io/trr/file.py +43 -0
- biotite/structure/io/util.py +38 -0
- biotite/structure/io/xtc/__init__.py +13 -0
- biotite/structure/io/xtc/file.py +43 -0
- biotite/structure/mechanics.py +72 -0
- biotite/structure/molecules.py +337 -0
- biotite/structure/pseudoknots.py +622 -0
- biotite/structure/rdf.py +245 -0
- biotite/structure/repair.py +302 -0
- biotite/structure/residues.py +716 -0
- biotite/structure/rings.py +451 -0
- biotite/structure/sasa.cpython-313-x86_64-linux-gnu.so +0 -0
- biotite/structure/sasa.pyx +322 -0
- biotite/structure/segments.py +328 -0
- biotite/structure/sequence.py +110 -0
- biotite/structure/spacegroups.json +1567 -0
- biotite/structure/spacegroups.license +26 -0
- biotite/structure/sse.py +306 -0
- biotite/structure/superimpose.py +511 -0
- biotite/structure/tm.py +581 -0
- biotite/structure/transform.py +736 -0
- biotite/structure/util.py +160 -0
- biotite/version.py +34 -0
- biotite/visualize.py +375 -0
- biotite-1.5.0.dist-info/METADATA +162 -0
- biotite-1.5.0.dist-info/RECORD +354 -0
- biotite-1.5.0.dist-info/WHEEL +6 -0
- biotite-1.5.0.dist-info/licenses/LICENSE.rst +30 -0
|
@@ -0,0 +1,573 @@
|
|
|
1
|
+
# This source code is part of the Biotite package and is distributed
|
|
2
|
+
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
|
|
3
|
+
# information.
|
|
4
|
+
|
|
5
|
+
__name__ = "biotite.sequence.io.genbank"
|
|
6
|
+
__author__ = "Patrick Kunzmann"
|
|
7
|
+
__all__ = ["GenBankFile", "MultiFile"]
|
|
8
|
+
|
|
9
|
+
# import textwrap
|
|
10
|
+
import copy
|
|
11
|
+
|
|
12
|
+
# import re
|
|
13
|
+
import io
|
|
14
|
+
from collections import OrderedDict
|
|
15
|
+
from biotite.file import InvalidFileError, TextFile
|
|
16
|
+
|
|
17
|
+
# from ...annotation import Location, Feature, Annotation, AnnotatedSequence
|
|
18
|
+
# from ...seqtypes import NucleotideSequence, ProteinSequence
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class GenBankFile(TextFile):
|
|
22
|
+
"""
|
|
23
|
+
This class represents a file in GenBank format (including GenPept).
|
|
24
|
+
|
|
25
|
+
A GenBank file annotates a reference sequence with features such as
|
|
26
|
+
positions of genes, promoters, etc.
|
|
27
|
+
Additionally, it provides metadata further describing the file.
|
|
28
|
+
|
|
29
|
+
A file is divided into separate fields, e.g. the *DEFINITION*
|
|
30
|
+
field contains a description of the file.
|
|
31
|
+
The field name starts at the beginning of a line,
|
|
32
|
+
followed by the content.
|
|
33
|
+
A field may contain subfields, whose name is indented.
|
|
34
|
+
For example, the *SOURCE* field contains the *ORGANISM* subfield.
|
|
35
|
+
Some fields may occur multiple times, e.g. the *REFERENCE* field.
|
|
36
|
+
A sample GenBank file can be viewed at
|
|
37
|
+
`<https://www.ncbi.nlm.nih.gov/Sitemap/samplerecord.html>`_.
|
|
38
|
+
|
|
39
|
+
This class provides a low-level interface for parsing, editing and
|
|
40
|
+
writing GenBank files.
|
|
41
|
+
It works like a list of field entries, where a field consists of the
|
|
42
|
+
field name, the field content and the subfields.
|
|
43
|
+
The field content is separated into the lines belonging to the
|
|
44
|
+
content.
|
|
45
|
+
While the content of metadata fields starts at the standard
|
|
46
|
+
GenBank indentation of 12, the content of the *FEATURES*
|
|
47
|
+
(contains the annotation) and *ORIGIN* (contains the sequence)
|
|
48
|
+
fields starts without indentation.
|
|
49
|
+
The subfields are represented by a dictionary, with subfield names
|
|
50
|
+
being keys and the corresponding lines being values.
|
|
51
|
+
The *FEATURES* and *ORIGIN* fields have no subfields.
|
|
52
|
+
|
|
53
|
+
Every entry can be obtained, set and deleted via the index operator.
|
|
54
|
+
|
|
55
|
+
Notes
|
|
56
|
+
-----
|
|
57
|
+
This class does not support location identifiers with references
|
|
58
|
+
to other Entrez database entries, e.g.
|
|
59
|
+
``join(1..100,J00194.1:100..202)``.
|
|
60
|
+
|
|
61
|
+
Examples
|
|
62
|
+
--------
|
|
63
|
+
Create a GenBank file from scratch:
|
|
64
|
+
|
|
65
|
+
>>> file = GenBankFile()
|
|
66
|
+
>>> file.append(
|
|
67
|
+
... "SOMEFIELD", ["One line", "A second line"],
|
|
68
|
+
... subfields={"SUBFIELD1": ["Single Line"], "SUBFIELD2": ["Two", "lines"]}
|
|
69
|
+
... )
|
|
70
|
+
>>> print(file)
|
|
71
|
+
SOMEFIELD One line
|
|
72
|
+
A second line
|
|
73
|
+
SUBFIELD1 Single Line
|
|
74
|
+
SUBFIELD2 Two
|
|
75
|
+
lines
|
|
76
|
+
//
|
|
77
|
+
>>> name, content, subfields = file[0]
|
|
78
|
+
>>> print(name)
|
|
79
|
+
SOMEFIELD
|
|
80
|
+
>>> print(content)
|
|
81
|
+
['One line', 'A second line']
|
|
82
|
+
>>> print(subfields)
|
|
83
|
+
OrderedDict({'SUBFIELD1': ['Single Line'], 'SUBFIELD2': ['Two', 'lines']})
|
|
84
|
+
|
|
85
|
+
Adding an additional field:
|
|
86
|
+
|
|
87
|
+
>>> file.insert(0, "OTHERFIELD", ["Another line"])
|
|
88
|
+
>>> print(len(file))
|
|
89
|
+
2
|
|
90
|
+
>>> print(file)
|
|
91
|
+
OTHERFIELD Another line
|
|
92
|
+
SOMEFIELD One line
|
|
93
|
+
A second line
|
|
94
|
+
SUBFIELD1 Single Line
|
|
95
|
+
SUBFIELD2 Two
|
|
96
|
+
lines
|
|
97
|
+
//
|
|
98
|
+
|
|
99
|
+
Overwriting and deleting an existing field:
|
|
100
|
+
|
|
101
|
+
>>> file[1] = "NEWFIELD", ["Yet another line"]
|
|
102
|
+
>>> print(file)
|
|
103
|
+
OTHERFIELD Another line
|
|
104
|
+
NEWFIELD Yet another line
|
|
105
|
+
//
|
|
106
|
+
>>> file[1] = "NEWFIELD", ["Yet another line"], {"NEWSUB": ["Subfield line"]}
|
|
107
|
+
>>> print(file)
|
|
108
|
+
OTHERFIELD Another line
|
|
109
|
+
NEWFIELD Yet another line
|
|
110
|
+
NEWSUB Subfield line
|
|
111
|
+
//
|
|
112
|
+
>>> del file[1]
|
|
113
|
+
>>> print(file)
|
|
114
|
+
OTHERFIELD Another line
|
|
115
|
+
//
|
|
116
|
+
|
|
117
|
+
Parsing fields from a real GenBank file:
|
|
118
|
+
|
|
119
|
+
>>> import os.path
|
|
120
|
+
>>> file = GenBankFile.read(os.path.join(path_to_sequences, "gg_avidin.gb"))
|
|
121
|
+
>>> print(file)
|
|
122
|
+
LOCUS AJ311647 1224 bp DNA linear VRT 14-NOV-2006
|
|
123
|
+
DEFINITION Gallus gallus AVD gene for avidin, exons 1-4.
|
|
124
|
+
ACCESSION AJ311647
|
|
125
|
+
VERSION AJ311647.1 GI:13397825
|
|
126
|
+
KEYWORDS AVD gene; avidin.
|
|
127
|
+
SOURCE Gallus gallus (chicken)
|
|
128
|
+
ORGANISM Gallus gallus
|
|
129
|
+
Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi;
|
|
130
|
+
Archelosauria; Archosauria; Dinosauria; Saurischia; Theropoda;
|
|
131
|
+
Coelurosauria; Aves; Neognathae; Galloanserae; Galliformes;
|
|
132
|
+
Phasianidae; Phasianinae; Gallus.
|
|
133
|
+
REFERENCE 1
|
|
134
|
+
AUTHORS Wallen,M.J., Laukkanen,M.O. and Kulomaa,M.S.
|
|
135
|
+
TITLE Cloning and sequencing of the chicken egg-white avidin-encoding
|
|
136
|
+
gene and its relationship with the avidin-related genes Avr1-Avr5
|
|
137
|
+
JOURNAL Gene 161 (2), 205-209 (1995)
|
|
138
|
+
PUBMED 7665080
|
|
139
|
+
REFERENCE 2
|
|
140
|
+
AUTHORS Ahlroth,M.K., Kola,E.H., Ewald,D., Masabanda,J., Sazanov,A.,
|
|
141
|
+
Fries,R. and Kulomaa,M.S.
|
|
142
|
+
TITLE Characterization and chromosomal localization of the chicken avidin
|
|
143
|
+
gene family
|
|
144
|
+
JOURNAL Anim. Genet. 31 (6), 367-375 (2000)
|
|
145
|
+
PUBMED 11167523
|
|
146
|
+
REFERENCE 3 (bases 1 to 1224)
|
|
147
|
+
AUTHORS Ahlroth,M.K.
|
|
148
|
+
TITLE Direct Submission
|
|
149
|
+
JOURNAL Submitted (09-MAR-2001) Ahlroth M.K., Department of Biological and
|
|
150
|
+
Environmental Science, University of Jyvaskyla, PO Box 35,
|
|
151
|
+
FIN-40351 Jyvaskyla, FINLAND
|
|
152
|
+
FEATURES Location/Qualifiers
|
|
153
|
+
source 1..1224
|
|
154
|
+
/organism="Gallus gallus"
|
|
155
|
+
/mol_type="genomic DNA"
|
|
156
|
+
...
|
|
157
|
+
>>> name, content, _ = file[3]
|
|
158
|
+
>>> print(name)
|
|
159
|
+
VERSION
|
|
160
|
+
>>> print(content)
|
|
161
|
+
['AJ311647.1 GI:13397825']
|
|
162
|
+
>>> name, content, subfields = file[5]
|
|
163
|
+
>>> print(name)
|
|
164
|
+
SOURCE
|
|
165
|
+
>>> print(content)
|
|
166
|
+
['Gallus gallus (chicken)']
|
|
167
|
+
>>> print(dict(subfields))
|
|
168
|
+
{'ORGANISM': ['Gallus gallus', 'Eukaryota; Metazoa; Chordata; ...', ...]}
|
|
169
|
+
"""
|
|
170
|
+
|
|
171
|
+
def __init__(self):
    """
    Create a GenBank file whose only line is the ``//`` terminator.
    """
    super().__init__()
    # A GenBank file is always closed by a '//' line,
    # hence even a file without any fields contains this line
    self.lines = ["//"]
    # One entry per field:
    # the start line index, the stop line index and the field name
    self._field_pos = []
    self._find_field_indices()
|
|
179
|
+
|
|
180
|
+
@classmethod
def read(cls, file):
    """
    Read a GenBank file.

    Parameters
    ----------
    file : file-like object or str
        The file to be read.
        Alternatively a file path can be supplied.

    Returns
    -------
    file_object : GenBankFile
        The parsed file.
    """
    genbank_file = super().read(file)
    # The lines were just loaded by the parent class,
    # so the field positions need to be determined for them
    genbank_file._find_field_indices()
    return genbank_file
|
|
199
|
+
|
|
200
|
+
def get_fields(self, name):
    """
    Get all *GenBank* fields associated with a given field name.

    Parameters
    ----------
    name : str
        The field name.

    Returns
    -------
    fields : list of (list of str, OrderedDict of str -> list of str)
        One tuple for each field with the given name, in file order.
        The first tuple element contains the content lines of the
        field, the second element maps the subfield names to the
        content lines of the respective subfield.
        The dictionary is empty, if the field has no subfields.
        Most field names occur only once in a file, but e.g.
        *REFERENCE* may occur multiple times.
    """
    fields = []
    for field_index in self.get_indices(name):
        entry = self[field_index]
        # The first element of an entry is the field name,
        # which is already known by the caller
        fields.append(entry[1:])
    return fields
|
|
223
|
+
|
|
224
|
+
def get_indices(self, name):
    """
    Get the indices to all *GenBank* fields associated with a given
    field name.

    Parameters
    ----------
    name : str
        The field name.
        It is converted to upper case before the comparison.

    Returns
    -------
    fields : list of int
        The indices of all fields with the given name, in ascending
        order.
        Most field names occur only once in a file, but e.g.
        *REFERENCE* may occur multiple times.
    """
    searched_name = name.upper()
    return [
        position
        for position, (_, _, field_name) in enumerate(self._field_pos)
        if field_name == searched_name
    ]
|
|
247
|
+
|
|
248
|
+
def set_field(self, name, content, subfield_dict=None):
    """
    Set a *GenBank* field with the given content.

    An already existing field with this name is overwritten.
    If no such field exists, a new field is appended to the file.

    Parameters
    ----------
    name : str
        The field name.
        It is converted to upper case.
    content : list of str
        The content lines.
    subfield_dict : dict of str -> list of str, optional
        The subfields of the field.
        The dictionary maps subfield names to the content lines of
        the respective subfield.

    Raises
    ------
    InvalidFileError
        If the field occurs multiple times in the file.
        In this case it is ambiguous which field to overwrite.
    """
    name = name.upper()
    indices = self.get_indices(name)
    if len(indices) > 1:
        raise InvalidFileError(f"File contains multiple '{name}' fields")
    if indices:
        # Exactly one field with this name exists -> overwrite it
        self[indices[0]] = name, content, subfield_dict
    else:
        # The field does not exist yet -> create it
        self.append(name, content, subfield_dict)
|
|
284
|
+
|
|
285
|
+
def __getitem__(self, index):
    """
    Get the field at the given index.

    Parameters
    ----------
    index : int
        The index of the field.

    Returns
    -------
    name : str
        The field name.
    content : list of str
        The content lines of the field, excluding the lines that
        belong to a subfield.
    subfield_dict : OrderedDict of str -> list of str
        Maps the subfield names to the content lines of the respective
        subfield.
        Always empty for the *FEATURES* and *ORIGIN* fields.
    """
    index = self._translate_idx(index)
    start, stop, name = self._field_pos[index]
    subfield_dict = OrderedDict()

    if name in ("FEATURES", "ORIGIN"):
        # These two fields have no subfields and their content is
        # not indented:
        # Return the complete lines following the field name line
        content = self._get_field_content(start + 1, stop, indent=0)
        return name, content, subfield_dict

    # Metadata fields use the standard GenBank indentation (=12):
    # Each line after the first one, that has text within the first
    # 12 columns, is the header line of a new subfield
    subfield_starts = [
        i for i in range(start + 1, stop) if self.lines[i][:12].strip() != ""
    ]
    # A subfield ends where the next subfield or the next field begins
    subfield_stops = subfield_starts[1:] + [stop]
    for sub_start, sub_stop in zip(subfield_starts, subfield_stops):
        header = self.lines[sub_start][:12].strip()
        subfield_dict[header] = self._get_field_content(
            sub_start, sub_stop, indent=12
        )

    # The content of the field itself ends at the first subfield
    content_stop = subfield_starts[0] if subfield_starts else stop
    content = self._get_field_content(start, content_stop, indent=12)
    return name, content, subfield_dict
|
|
328
|
+
|
|
329
|
+
def __setitem__(self, index, item):
    """
    Replace the field at the given index.

    Parameters
    ----------
    index : int
        The index of the field to be replaced.
    item : tuple
        Either ``(name, content)`` or ``(name, content, subfields)``.

    Raises
    ------
    TypeError
        If `item` is not a tuple containing two or three elements.
    """
    index = self._translate_idx(index)
    if not isinstance(item, tuple) or len(item) not in (2, 3):
        raise TypeError(
            "Expected a tuple of name, content and optionally subfields"
        )
    if len(item) == 2:
        name, content = item
        subfields = None
    else:
        name, content, subfields = item
    inserted_lines = self._to_lines(name, content, subfields)

    start, old_stop, _ = self._field_pos[index]
    # Keep all lines after the replaced field:
    # If the replaced field reaches to the end of the lines, the slice
    # is simply empty, so no special case is required
    # (the integers must not be compared by identity for this purpose)
    self.lines = self.lines[:start] + inserted_lines + self.lines[old_stop:]
    # Shift the start/stop indices of the following fields
    # by the difference in the number of lines
    shift = len(inserted_lines) - (old_stop - start)
    for i in range(index + 1, len(self._field_pos)):
        field_start, field_stop, field_name = self._field_pos[i]
        self._field_pos[i] = field_start + shift, field_stop + shift, field_name
    # Register the new field in place of the replaced one
    self._field_pos[index] = start, start + len(inserted_lines), name.upper()
|
|
363
|
+
|
|
364
|
+
def __delitem__(self, index):
    """
    Remove the field at the given index including all of its lines.

    Parameters
    ----------
    index : int
        Index of the field to be removed.
        Negative indices are translated by ``_translate_idx()``.
    """
    index = self._translate_idx(index)
    first_line, end_line, _ = self._field_pos[index]
    removed = end_line - first_line
    # Drop the lines and the position entry of the field itself
    del self.lines[first_line:end_line]
    del self._field_pos[index]
    # All fields behind the removed one move up by the number of
    # removed lines
    self._field_pos[index:] = [
        (begin - removed, end - removed, field_name)
        for begin, end, field_name in self._field_pos[index:]
    ]
|
|
375
|
+
|
|
376
|
+
def __len__(self):
|
|
377
|
+
return len(self._field_pos)
|
|
378
|
+
|
|
379
|
+
def insert(self, index, name, content, subfields=None):
    """
    Insert a *GenBank* field at the given position.

    Parameters
    ----------
    index : int
        The new field is inserted before the current field at this
        index.
        If the index is after the last field, the new field
        is appended to the end of the file.
    name : str
        The field name.
    content : list of str
        The content lines.
    subfields : dict of str -> list of str, optional
        The subfields of the field.
        The dictionary maps subfield names to the content lines of
        the respective subfield.

    Raises
    ------
    IndexError
        If the index is out of range.
    """
    index = self._translate_idx(index, length_exclusive=False)
    inserted_lines = self._to_lines(name, content, subfields)

    # Stop of previous field is start of new field
    start = 0 if index == 0 else self._field_pos[index - 1][1]
    # Slicing from 'start' yields an empty list if the new lines are
    # inserted at the very end -> no special case required
    self.lines = self.lines[:start] + inserted_lines + self.lines[start:]
    # Shift the start/stop indices of the following fields
    # by the number of created lines
    shift = len(inserted_lines)
    for i in range(index, len(self._field_pos)):
        next_start, next_stop, next_name = self._field_pos[i]
        self._field_pos[i] = next_start + shift, next_stop + shift, next_name
    # Add new entry; the name is stored in the same normalized form
    # that is written into the header line by '_to_lines()'
    self._field_pos.insert(
        index, (start, start + shift, name.strip().upper())
    )
|
|
424
|
+
|
|
425
|
+
def append(self, name, content, subfields=None):
    """
    Add a new *GenBank* field behind all existing fields.

    This delegates to :meth:`insert()` using the current number of
    fields as insertion index.

    Parameters
    ----------
    name : str
        The field name.
    content : list of str
        The content lines.
    subfields : dict of str -> list of str, optional
        The subfields of the field.
        The dictionary maps subfield names to the content lines of
        the respective subfield.
    """
    end_index = len(self)
    self.insert(end_index, name, content, subfields)
|
|
441
|
+
|
|
442
|
+
def _find_field_indices(self):
|
|
443
|
+
"""
|
|
444
|
+
Identify the start and exclusive stop indices of lines
|
|
445
|
+
corresponding to a field name for all fields in the file.
|
|
446
|
+
"""
|
|
447
|
+
start = None
|
|
448
|
+
name = ""
|
|
449
|
+
self._field_pos = []
|
|
450
|
+
for i, line in enumerate(self.lines):
|
|
451
|
+
# Check if line contains a new major field
|
|
452
|
+
# (Header beginning from first column)
|
|
453
|
+
if len(line) != 0 and line[0] != " ":
|
|
454
|
+
if line[:2] != "//":
|
|
455
|
+
stop = i
|
|
456
|
+
if start is not None:
|
|
457
|
+
# Store previous field
|
|
458
|
+
self._field_pos.append((start, stop, name))
|
|
459
|
+
start = i
|
|
460
|
+
name = line[0:12].strip()
|
|
461
|
+
else:
|
|
462
|
+
# '//' means end of file
|
|
463
|
+
# -> Store last field
|
|
464
|
+
if start is not None:
|
|
465
|
+
stop = i
|
|
466
|
+
self._field_pos.append((start, stop, name))
|
|
467
|
+
|
|
468
|
+
def _get_field_content(self, start, stop, indent):
|
|
469
|
+
if indent == 0:
|
|
470
|
+
return self.lines[start:stop]
|
|
471
|
+
else:
|
|
472
|
+
return [line[12:] for line in self.lines[start:stop]]
|
|
473
|
+
|
|
474
|
+
def _to_lines(self, name, content, subfields):
|
|
475
|
+
"""
|
|
476
|
+
Convert the field name, field content und subfield dictionary
|
|
477
|
+
into text lines
|
|
478
|
+
"""
|
|
479
|
+
if subfields is None:
|
|
480
|
+
subfields = {}
|
|
481
|
+
|
|
482
|
+
name = name.strip().upper()
|
|
483
|
+
if len(name) == 0:
|
|
484
|
+
raise ValueError("Must give a non emtpy name")
|
|
485
|
+
subfields = OrderedDict(
|
|
486
|
+
{
|
|
487
|
+
subfield_name.upper().strip(): subfield_lines
|
|
488
|
+
for subfield_name, subfield_lines in subfields.items()
|
|
489
|
+
}
|
|
490
|
+
)
|
|
491
|
+
|
|
492
|
+
# Create lines for new field
|
|
493
|
+
if name == "FEATURES":
|
|
494
|
+
# Header line plus all actual feature lines
|
|
495
|
+
lines = copy.copy(content)
|
|
496
|
+
lines.insert(0, "FEATURES" + " " * 13 + "Location/Qualifiers")
|
|
497
|
+
elif name == "ORIGIN":
|
|
498
|
+
# Header line plus all actual sequence lines
|
|
499
|
+
lines = copy.copy(content)
|
|
500
|
+
lines.insert(0, "ORIGIN")
|
|
501
|
+
else:
|
|
502
|
+
name_column = []
|
|
503
|
+
content_column = []
|
|
504
|
+
# Create a line for the field name and empty lines
|
|
505
|
+
# for each additional line required by the content
|
|
506
|
+
name_column += [name] + [""] * (len(content) - 1)
|
|
507
|
+
content_column += content
|
|
508
|
+
for subfield_name, subfield_lines in subfields.items():
|
|
509
|
+
name_column += [" " + subfield_name] + [""] * (len(subfield_lines) - 1)
|
|
510
|
+
content_column += subfield_lines
|
|
511
|
+
lines = [
|
|
512
|
+
f"{n_col:12}{c_col}"
|
|
513
|
+
for n_col, c_col in zip(name_column, content_column)
|
|
514
|
+
]
|
|
515
|
+
|
|
516
|
+
return lines
|
|
517
|
+
|
|
518
|
+
def _translate_idx(self, index, length_exclusive=True):
|
|
519
|
+
"""
|
|
520
|
+
Check index boundaries and convert negative index to positive
|
|
521
|
+
index.
|
|
522
|
+
"""
|
|
523
|
+
if index < 0:
|
|
524
|
+
new_index = len(self) + index
|
|
525
|
+
else:
|
|
526
|
+
new_index = index
|
|
527
|
+
if length_exclusive:
|
|
528
|
+
if new_index >= len(self):
|
|
529
|
+
raise IndexError(f"Index {index} is out of range")
|
|
530
|
+
else:
|
|
531
|
+
if new_index > len(self):
|
|
532
|
+
raise IndexError(f"Index {index} is out of range")
|
|
533
|
+
return new_index
|
|
534
|
+
|
|
535
|
+
|
|
536
|
+
class MultiFile(TextFile):
    """
    This class represents a file in *GenBank* or *GenPept* format,
    that contains multiple entries, for more than one UID.

    The information for each UID are appended to each other in such a
    file.
    Objects of this class can be iterated to obtain a
    :class:`GenBankFile` for each entry in the file.

    Examples
    --------

    >>> import os.path
    >>> file_name = fetch_single_file(
    ...     ["1L2Y_A", "3O5R_A", "5UGO_A"],
    ...     os.path.join(path_to_directory, "multifile.gp"),
    ...     "protein", "gp"
    ... )
    >>> multi_file = MultiFile.read(file_name)
    >>> for gp_file in multi_file:
    ...     print(get_accession(gp_file))
    1L2Y_A
    3O5R_A
    5UGO_A
    """

    def __iter__(self):
        """
        Iterate over the entries of the file.

        Yields
        ------
        file : GenBankFile
            A file created from the lines of one entry, i.e. the lines
            up to and including a ``//`` terminator line.
            Lines behind the last terminator line are not yielded.
        """
        start_i = 0
        for i, line in enumerate(self.lines):
            if line.strip() != "//":
                continue
            # Create file with lines corresponding to that entry
            file_content = "\n".join(self.lines[start_i : i + 1])
            # The next entry starts behind the terminator line,
            # so it does not begin with the '//' of this entry
            start_i = i + 1
            yield GenBankFile.read(io.StringIO(file_content))
|