biotite 1.5.0__cp314-cp314-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biotite might be problematic. Click here for more details.
- biotite/__init__.py +18 -0
- biotite/application/__init__.py +69 -0
- biotite/application/application.py +276 -0
- biotite/application/autodock/__init__.py +12 -0
- biotite/application/autodock/app.py +500 -0
- biotite/application/blast/__init__.py +14 -0
- biotite/application/blast/alignment.py +92 -0
- biotite/application/blast/webapp.py +428 -0
- biotite/application/clustalo/__init__.py +12 -0
- biotite/application/clustalo/app.py +223 -0
- biotite/application/dssp/__init__.py +12 -0
- biotite/application/dssp/app.py +216 -0
- biotite/application/localapp.py +342 -0
- biotite/application/mafft/__init__.py +12 -0
- biotite/application/mafft/app.py +116 -0
- biotite/application/msaapp.py +363 -0
- biotite/application/muscle/__init__.py +13 -0
- biotite/application/muscle/app3.py +227 -0
- biotite/application/muscle/app5.py +163 -0
- biotite/application/sra/__init__.py +18 -0
- biotite/application/sra/app.py +447 -0
- biotite/application/tantan/__init__.py +12 -0
- biotite/application/tantan/app.py +199 -0
- biotite/application/util.py +77 -0
- biotite/application/viennarna/__init__.py +18 -0
- biotite/application/viennarna/rnaalifold.py +310 -0
- biotite/application/viennarna/rnafold.py +254 -0
- biotite/application/viennarna/rnaplot.py +208 -0
- biotite/application/viennarna/util.py +77 -0
- biotite/application/webapp.py +76 -0
- biotite/copyable.py +71 -0
- biotite/database/__init__.py +23 -0
- biotite/database/afdb/__init__.py +12 -0
- biotite/database/afdb/download.py +197 -0
- biotite/database/entrez/__init__.py +15 -0
- biotite/database/entrez/check.py +60 -0
- biotite/database/entrez/dbnames.py +101 -0
- biotite/database/entrez/download.py +228 -0
- biotite/database/entrez/key.py +44 -0
- biotite/database/entrez/query.py +263 -0
- biotite/database/error.py +16 -0
- biotite/database/pubchem/__init__.py +21 -0
- biotite/database/pubchem/download.py +258 -0
- biotite/database/pubchem/error.py +30 -0
- biotite/database/pubchem/query.py +819 -0
- biotite/database/pubchem/throttle.py +98 -0
- biotite/database/rcsb/__init__.py +13 -0
- biotite/database/rcsb/download.py +161 -0
- biotite/database/rcsb/query.py +963 -0
- biotite/database/uniprot/__init__.py +13 -0
- biotite/database/uniprot/check.py +40 -0
- biotite/database/uniprot/download.py +126 -0
- biotite/database/uniprot/query.py +292 -0
- biotite/file.py +244 -0
- biotite/interface/__init__.py +19 -0
- biotite/interface/openmm/__init__.py +20 -0
- biotite/interface/openmm/state.py +93 -0
- biotite/interface/openmm/system.py +227 -0
- biotite/interface/pymol/__init__.py +201 -0
- biotite/interface/pymol/cgo.py +346 -0
- biotite/interface/pymol/convert.py +185 -0
- biotite/interface/pymol/display.py +267 -0
- biotite/interface/pymol/object.py +1228 -0
- biotite/interface/pymol/shapes.py +178 -0
- biotite/interface/pymol/startup.py +169 -0
- biotite/interface/rdkit/__init__.py +19 -0
- biotite/interface/rdkit/mol.py +490 -0
- biotite/interface/version.py +94 -0
- biotite/interface/warning.py +19 -0
- biotite/sequence/__init__.py +84 -0
- biotite/sequence/align/__init__.py +199 -0
- biotite/sequence/align/alignment.py +702 -0
- biotite/sequence/align/banded.cp314-win_amd64.pyd +0 -0
- biotite/sequence/align/banded.pyx +652 -0
- biotite/sequence/align/buckets.py +71 -0
- biotite/sequence/align/cigar.py +425 -0
- biotite/sequence/align/kmeralphabet.cp314-win_amd64.pyd +0 -0
- biotite/sequence/align/kmeralphabet.pyx +595 -0
- biotite/sequence/align/kmersimilarity.cp314-win_amd64.pyd +0 -0
- biotite/sequence/align/kmersimilarity.pyx +233 -0
- biotite/sequence/align/kmertable.cp314-win_amd64.pyd +0 -0
- biotite/sequence/align/kmertable.pyx +3411 -0
- biotite/sequence/align/localgapped.cp314-win_amd64.pyd +0 -0
- biotite/sequence/align/localgapped.pyx +892 -0
- biotite/sequence/align/localungapped.cp314-win_amd64.pyd +0 -0
- biotite/sequence/align/localungapped.pyx +279 -0
- biotite/sequence/align/matrix.py +631 -0
- biotite/sequence/align/matrix_data/3Di.mat +24 -0
- biotite/sequence/align/matrix_data/BLOSUM100.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM30.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM35.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM40.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM45.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM50.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM50_13p.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM50_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM50_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM55.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM60.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM62.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM62_13p.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM62_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM62_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM65.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM70.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM75.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM80.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM85.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM90.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUMN.mat +31 -0
- biotite/sequence/align/matrix_data/CorBLOSUM49_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM57_13p.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM57_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM61_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM66_13p.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM67_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/DAYHOFF.mat +32 -0
- biotite/sequence/align/matrix_data/GONNET.mat +26 -0
- biotite/sequence/align/matrix_data/IDENTITY.mat +25 -0
- biotite/sequence/align/matrix_data/MATCH.mat +25 -0
- biotite/sequence/align/matrix_data/NUC.mat +25 -0
- biotite/sequence/align/matrix_data/PAM10.mat +34 -0
- biotite/sequence/align/matrix_data/PAM100.mat +34 -0
- biotite/sequence/align/matrix_data/PAM110.mat +34 -0
- biotite/sequence/align/matrix_data/PAM120.mat +34 -0
- biotite/sequence/align/matrix_data/PAM130.mat +34 -0
- biotite/sequence/align/matrix_data/PAM140.mat +34 -0
- biotite/sequence/align/matrix_data/PAM150.mat +34 -0
- biotite/sequence/align/matrix_data/PAM160.mat +34 -0
- biotite/sequence/align/matrix_data/PAM170.mat +34 -0
- biotite/sequence/align/matrix_data/PAM180.mat +34 -0
- biotite/sequence/align/matrix_data/PAM190.mat +34 -0
- biotite/sequence/align/matrix_data/PAM20.mat +34 -0
- biotite/sequence/align/matrix_data/PAM200.mat +34 -0
- biotite/sequence/align/matrix_data/PAM210.mat +34 -0
- biotite/sequence/align/matrix_data/PAM220.mat +34 -0
- biotite/sequence/align/matrix_data/PAM230.mat +34 -0
- biotite/sequence/align/matrix_data/PAM240.mat +34 -0
- biotite/sequence/align/matrix_data/PAM250.mat +34 -0
- biotite/sequence/align/matrix_data/PAM260.mat +34 -0
- biotite/sequence/align/matrix_data/PAM270.mat +34 -0
- biotite/sequence/align/matrix_data/PAM280.mat +34 -0
- biotite/sequence/align/matrix_data/PAM290.mat +34 -0
- biotite/sequence/align/matrix_data/PAM30.mat +34 -0
- biotite/sequence/align/matrix_data/PAM300.mat +34 -0
- biotite/sequence/align/matrix_data/PAM310.mat +34 -0
- biotite/sequence/align/matrix_data/PAM320.mat +34 -0
- biotite/sequence/align/matrix_data/PAM330.mat +34 -0
- biotite/sequence/align/matrix_data/PAM340.mat +34 -0
- biotite/sequence/align/matrix_data/PAM350.mat +34 -0
- biotite/sequence/align/matrix_data/PAM360.mat +34 -0
- biotite/sequence/align/matrix_data/PAM370.mat +34 -0
- biotite/sequence/align/matrix_data/PAM380.mat +34 -0
- biotite/sequence/align/matrix_data/PAM390.mat +34 -0
- biotite/sequence/align/matrix_data/PAM40.mat +34 -0
- biotite/sequence/align/matrix_data/PAM400.mat +34 -0
- biotite/sequence/align/matrix_data/PAM410.mat +34 -0
- biotite/sequence/align/matrix_data/PAM420.mat +34 -0
- biotite/sequence/align/matrix_data/PAM430.mat +34 -0
- biotite/sequence/align/matrix_data/PAM440.mat +34 -0
- biotite/sequence/align/matrix_data/PAM450.mat +34 -0
- biotite/sequence/align/matrix_data/PAM460.mat +34 -0
- biotite/sequence/align/matrix_data/PAM470.mat +34 -0
- biotite/sequence/align/matrix_data/PAM480.mat +34 -0
- biotite/sequence/align/matrix_data/PAM490.mat +34 -0
- biotite/sequence/align/matrix_data/PAM50.mat +34 -0
- biotite/sequence/align/matrix_data/PAM500.mat +34 -0
- biotite/sequence/align/matrix_data/PAM60.mat +34 -0
- biotite/sequence/align/matrix_data/PAM70.mat +34 -0
- biotite/sequence/align/matrix_data/PAM80.mat +34 -0
- biotite/sequence/align/matrix_data/PAM90.mat +34 -0
- biotite/sequence/align/matrix_data/PB.license +21 -0
- biotite/sequence/align/matrix_data/PB.mat +18 -0
- biotite/sequence/align/matrix_data/RBLOSUM52_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM59_13p.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM59_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM64_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM69_13p.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM69_14.3.mat +25 -0
- biotite/sequence/align/multiple.cp314-win_amd64.pyd +0 -0
- biotite/sequence/align/multiple.pyx +619 -0
- biotite/sequence/align/pairwise.cp314-win_amd64.pyd +0 -0
- biotite/sequence/align/pairwise.pyx +585 -0
- biotite/sequence/align/permutation.cp314-win_amd64.pyd +0 -0
- biotite/sequence/align/permutation.pyx +313 -0
- biotite/sequence/align/primes.txt +821 -0
- biotite/sequence/align/selector.cp314-win_amd64.pyd +0 -0
- biotite/sequence/align/selector.pyx +954 -0
- biotite/sequence/align/statistics.py +264 -0
- biotite/sequence/align/tracetable.cp314-win_amd64.pyd +0 -0
- biotite/sequence/align/tracetable.pxd +64 -0
- biotite/sequence/align/tracetable.pyx +370 -0
- biotite/sequence/alphabet.py +555 -0
- biotite/sequence/annotation.py +836 -0
- biotite/sequence/codec.cp314-win_amd64.pyd +0 -0
- biotite/sequence/codec.pyx +155 -0
- biotite/sequence/codon.py +476 -0
- biotite/sequence/codon_tables.txt +202 -0
- biotite/sequence/graphics/__init__.py +33 -0
- biotite/sequence/graphics/alignment.py +1101 -0
- biotite/sequence/graphics/color_schemes/3di_flower.json +48 -0
- biotite/sequence/graphics/color_schemes/autumn.json +51 -0
- biotite/sequence/graphics/color_schemes/blossom.json +51 -0
- biotite/sequence/graphics/color_schemes/clustalx_dna.json +11 -0
- biotite/sequence/graphics/color_schemes/clustalx_protein.json +28 -0
- biotite/sequence/graphics/color_schemes/flower.json +51 -0
- biotite/sequence/graphics/color_schemes/jalview_buried.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_hydrophobicity.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_prop_helix.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_prop_strand.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_prop_turn.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_taylor.json +28 -0
- biotite/sequence/graphics/color_schemes/jalview_zappo.json +28 -0
- biotite/sequence/graphics/color_schemes/ocean.json +51 -0
- biotite/sequence/graphics/color_schemes/pb_flower.json +40 -0
- biotite/sequence/graphics/color_schemes/rainbow_dna.json +11 -0
- biotite/sequence/graphics/color_schemes/rainbow_protein.json +30 -0
- biotite/sequence/graphics/color_schemes/spring.json +51 -0
- biotite/sequence/graphics/color_schemes/sunset.json +51 -0
- biotite/sequence/graphics/color_schemes/wither.json +51 -0
- biotite/sequence/graphics/colorschemes.py +170 -0
- biotite/sequence/graphics/dendrogram.py +231 -0
- biotite/sequence/graphics/features.py +544 -0
- biotite/sequence/graphics/logo.py +102 -0
- biotite/sequence/graphics/plasmid.py +712 -0
- biotite/sequence/io/__init__.py +12 -0
- biotite/sequence/io/fasta/__init__.py +22 -0
- biotite/sequence/io/fasta/convert.py +283 -0
- biotite/sequence/io/fasta/file.py +265 -0
- biotite/sequence/io/fastq/__init__.py +19 -0
- biotite/sequence/io/fastq/convert.py +117 -0
- biotite/sequence/io/fastq/file.py +507 -0
- biotite/sequence/io/genbank/__init__.py +17 -0
- biotite/sequence/io/genbank/annotation.py +269 -0
- biotite/sequence/io/genbank/file.py +573 -0
- biotite/sequence/io/genbank/metadata.py +336 -0
- biotite/sequence/io/genbank/sequence.py +173 -0
- biotite/sequence/io/general.py +201 -0
- biotite/sequence/io/gff/__init__.py +26 -0
- biotite/sequence/io/gff/convert.py +128 -0
- biotite/sequence/io/gff/file.py +449 -0
- biotite/sequence/phylo/__init__.py +36 -0
- biotite/sequence/phylo/nj.cp314-win_amd64.pyd +0 -0
- biotite/sequence/phylo/nj.pyx +221 -0
- biotite/sequence/phylo/tree.cp314-win_amd64.pyd +0 -0
- biotite/sequence/phylo/tree.pyx +1169 -0
- biotite/sequence/phylo/upgma.cp314-win_amd64.pyd +0 -0
- biotite/sequence/phylo/upgma.pyx +164 -0
- biotite/sequence/profile.py +561 -0
- biotite/sequence/search.py +117 -0
- biotite/sequence/seqtypes.py +720 -0
- biotite/sequence/sequence.py +373 -0
- biotite/setup_ccd.py +197 -0
- biotite/structure/__init__.py +135 -0
- biotite/structure/alphabet/__init__.py +25 -0
- biotite/structure/alphabet/encoder.py +332 -0
- biotite/structure/alphabet/encoder_weights_3di.kerasify +0 -0
- biotite/structure/alphabet/i3d.py +109 -0
- biotite/structure/alphabet/layers.py +86 -0
- biotite/structure/alphabet/pb.license +21 -0
- biotite/structure/alphabet/pb.py +170 -0
- biotite/structure/alphabet/unkerasify.py +128 -0
- biotite/structure/atoms.py +1562 -0
- biotite/structure/basepairs.py +1403 -0
- biotite/structure/bonds.cp314-win_amd64.pyd +0 -0
- biotite/structure/bonds.pyx +2036 -0
- biotite/structure/box.py +724 -0
- biotite/structure/celllist.cp314-win_amd64.pyd +0 -0
- biotite/structure/celllist.pyx +864 -0
- biotite/structure/chains.py +310 -0
- biotite/structure/charges.cp314-win_amd64.pyd +0 -0
- biotite/structure/charges.pyx +520 -0
- biotite/structure/compare.py +683 -0
- biotite/structure/density.py +109 -0
- biotite/structure/dotbracket.py +213 -0
- biotite/structure/error.py +39 -0
- biotite/structure/filter.py +591 -0
- biotite/structure/geometry.py +817 -0
- biotite/structure/graphics/__init__.py +13 -0
- biotite/structure/graphics/atoms.py +243 -0
- biotite/structure/graphics/rna.py +298 -0
- biotite/structure/hbond.py +425 -0
- biotite/structure/info/__init__.py +24 -0
- biotite/structure/info/atom_masses.json +121 -0
- biotite/structure/info/atoms.py +98 -0
- biotite/structure/info/bonds.py +149 -0
- biotite/structure/info/ccd.py +200 -0
- biotite/structure/info/components.bcif +0 -0
- biotite/structure/info/groups.py +128 -0
- biotite/structure/info/masses.py +121 -0
- biotite/structure/info/misc.py +137 -0
- biotite/structure/info/radii.py +267 -0
- biotite/structure/info/standardize.py +185 -0
- biotite/structure/integrity.py +213 -0
- biotite/structure/io/__init__.py +29 -0
- biotite/structure/io/dcd/__init__.py +13 -0
- biotite/structure/io/dcd/file.py +67 -0
- biotite/structure/io/general.py +243 -0
- biotite/structure/io/gro/__init__.py +14 -0
- biotite/structure/io/gro/file.py +343 -0
- biotite/structure/io/mol/__init__.py +20 -0
- biotite/structure/io/mol/convert.py +112 -0
- biotite/structure/io/mol/ctab.py +420 -0
- biotite/structure/io/mol/header.py +120 -0
- biotite/structure/io/mol/mol.py +149 -0
- biotite/structure/io/mol/sdf.py +940 -0
- biotite/structure/io/netcdf/__init__.py +13 -0
- biotite/structure/io/netcdf/file.py +64 -0
- biotite/structure/io/pdb/__init__.py +20 -0
- biotite/structure/io/pdb/convert.py +389 -0
- biotite/structure/io/pdb/file.py +1380 -0
- biotite/structure/io/pdb/hybrid36.cp314-win_amd64.pyd +0 -0
- biotite/structure/io/pdb/hybrid36.pyx +242 -0
- biotite/structure/io/pdbqt/__init__.py +15 -0
- biotite/structure/io/pdbqt/convert.py +113 -0
- biotite/structure/io/pdbqt/file.py +688 -0
- biotite/structure/io/pdbx/__init__.py +23 -0
- biotite/structure/io/pdbx/bcif.py +674 -0
- biotite/structure/io/pdbx/cif.py +1091 -0
- biotite/structure/io/pdbx/component.py +251 -0
- biotite/structure/io/pdbx/compress.py +362 -0
- biotite/structure/io/pdbx/convert.py +2113 -0
- biotite/structure/io/pdbx/encoding.cp314-win_amd64.pyd +0 -0
- biotite/structure/io/pdbx/encoding.pyx +1078 -0
- biotite/structure/io/trajfile.py +696 -0
- biotite/structure/io/trr/__init__.py +13 -0
- biotite/structure/io/trr/file.py +43 -0
- biotite/structure/io/util.py +38 -0
- biotite/structure/io/xtc/__init__.py +13 -0
- biotite/structure/io/xtc/file.py +43 -0
- biotite/structure/mechanics.py +72 -0
- biotite/structure/molecules.py +337 -0
- biotite/structure/pseudoknots.py +622 -0
- biotite/structure/rdf.py +245 -0
- biotite/structure/repair.py +302 -0
- biotite/structure/residues.py +716 -0
- biotite/structure/rings.py +451 -0
- biotite/structure/sasa.cp314-win_amd64.pyd +0 -0
- biotite/structure/sasa.pyx +322 -0
- biotite/structure/segments.py +328 -0
- biotite/structure/sequence.py +110 -0
- biotite/structure/spacegroups.json +1567 -0
- biotite/structure/spacegroups.license +26 -0
- biotite/structure/sse.py +306 -0
- biotite/structure/superimpose.py +511 -0
- biotite/structure/tm.py +581 -0
- biotite/structure/transform.py +736 -0
- biotite/structure/util.py +160 -0
- biotite/version.py +34 -0
- biotite/visualize.py +375 -0
- biotite-1.5.0.dist-info/METADATA +162 -0
- biotite-1.5.0.dist-info/RECORD +354 -0
- biotite-1.5.0.dist-info/WHEEL +4 -0
- biotite-1.5.0.dist-info/licenses/LICENSE.rst +30 -0
|
@@ -0,0 +1,373 @@
|
|
|
1
|
+
# This source code is part of the Biotite package and is distributed
|
|
2
|
+
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
|
|
3
|
+
# information.
|
|
4
|
+
|
|
5
|
+
"""
|
|
6
|
+
The module contains the :class:`Sequence` superclass and :class:`GeneralSequence`.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
__name__ = "biotite.sequence"
|
|
10
|
+
__author__ = "Patrick Kunzmann"
|
|
11
|
+
__all__ = ["Sequence"]
|
|
12
|
+
|
|
13
|
+
import abc
|
|
14
|
+
import numbers
|
|
15
|
+
import numpy as np
|
|
16
|
+
from biotite.copyable import Copyable
|
|
17
|
+
from biotite.sequence.alphabet import LetterAlphabet
|
|
18
|
+
|
|
19
|
+
_size_uint8 = np.iinfo(np.uint8).max + 1
|
|
20
|
+
_size_uint16 = np.iinfo(np.uint16).max + 1
|
|
21
|
+
_size_uint32 = np.iinfo(np.uint32).max + 1
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class Sequence(Copyable, metaclass=abc.ABCMeta):
|
|
25
|
+
"""
|
|
26
|
+
The abstract base class for all sequence types.
|
|
27
|
+
|
|
28
|
+
A :class:`Sequence` can be seen as a succession of symbols, that are
|
|
29
|
+
elements in the allowed set of symbols, the :class:`Alphabet`.
|
|
30
|
+
Internally, a :class:`Sequence` object uses a *NumPy*
|
|
31
|
+
:class:`ndarray` of integers, where each integer represents a
|
|
32
|
+
symbol.
|
|
33
|
+
The :class:`Alphabet` of a :class:`Sequence` object is used to
|
|
34
|
+
encode each symbol, that is used to create the
|
|
35
|
+
:class:`Sequence`, into an integer. These integer values are called
|
|
36
|
+
symbol code, the encoding of an entire sequence of symbols is
|
|
37
|
+
called sequence code.
|
|
38
|
+
|
|
39
|
+
The size of the symbol code type in the array is determined by the
|
|
40
|
+
size of the :class:`Alphabet`:
|
|
41
|
+
If the :class:`Alphabet` contains 256 symbols or less, one byte is
|
|
42
|
+
used per array element; if the :class:`Alphabet` contains
|
|
43
|
+
between 257 and 65536 symbols, two bytes are used, and so on.
|
|
44
|
+
|
|
45
|
+
Two :class:`Sequence` objects are equal if they are instances of the
|
|
46
|
+
same class, have the same :class:`Alphabet` and have equal sequence
|
|
47
|
+
codes.
|
|
48
|
+
Comparison with a string or list of symbols evaluates always to
|
|
49
|
+
false.
|
|
50
|
+
|
|
51
|
+
A :class:`Sequence` can be indexed by any 1-D index a
|
|
52
|
+
:class:`ndarray` accepts.
|
|
53
|
+
If the index is a single integer, the decoded symbol at that
|
|
54
|
+
position is returned, otherwise a subsequence is returned.
|
|
55
|
+
|
|
56
|
+
Individual symbols of the sequence can also be exchanged in indexed
|
|
57
|
+
form: If an integer is used as index, the item is treated as a
|
|
58
|
+
symbol. Any other index (slice, index list, boolean mask) expects
|
|
59
|
+
multiple symbols, either as list of symbols, as :class:`ndarray`
|
|
60
|
+
containing a sequence code or another :class:`Sequence` instance.
|
|
61
|
+
Concatenation of two sequences is achieved with the '+' operator.
|
|
62
|
+
|
|
63
|
+
Each subclass of :class:`Sequence` needs to overwrite the abstract
|
|
64
|
+
method :func:`get_alphabet()`, which specifies the alphabet the
|
|
65
|
+
:class:`Sequence` uses.
|
|
66
|
+
|
|
67
|
+
Parameters
|
|
68
|
+
----------
|
|
69
|
+
sequence : iterable object, optional
|
|
70
|
+
The symbol sequence, the :class:`Sequence` is initialized with.
|
|
71
|
+
For alphabets containing single letter strings, this parameter
|
|
72
|
+
may also be a :class:`str` object.
|
|
73
|
+
By default the sequence is empty.
|
|
74
|
+
|
|
75
|
+
Attributes
|
|
76
|
+
----------
|
|
77
|
+
code : ndarray
|
|
78
|
+
The sequence code.
|
|
79
|
+
symbols : list
|
|
80
|
+
The list of symbols, represented by the sequence.
|
|
81
|
+
The list is generated by decoding the sequence code, when
|
|
82
|
+
this attribute is accessed. When this attribute is modified,
|
|
83
|
+
the new list of symbols is encoded into the sequence code.
|
|
84
|
+
alphabet : Alphabet
|
|
85
|
+
The alphabet of this sequence. Cannot be set.
|
|
86
|
+
Equal to `get_alphabet()`.
|
|
87
|
+
|
|
88
|
+
Examples
|
|
89
|
+
--------
|
|
90
|
+
Creating a DNA sequence from string and print the symbols and the
|
|
91
|
+
code:
|
|
92
|
+
|
|
93
|
+
>>> dna_seq = NucleotideSequence("ACGTA")
|
|
94
|
+
>>> print(dna_seq)
|
|
95
|
+
ACGTA
|
|
96
|
+
>>> print(dna_seq.code)
|
|
97
|
+
[0 1 2 3 0]
|
|
98
|
+
>>> print(dna_seq.symbols)
|
|
99
|
+
['A' 'C' 'G' 'T' 'A']
|
|
100
|
+
>>> print(list(dna_seq))
|
|
101
|
+
['A', 'C', 'G', 'T', 'A']
|
|
102
|
+
|
|
103
|
+
Sequence indexing:
|
|
104
|
+
|
|
105
|
+
>>> print(dna_seq[1:3])
|
|
106
|
+
CG
|
|
107
|
+
>>> print(dna_seq[[0,2,4]])
|
|
108
|
+
AGA
|
|
109
|
+
>>> print(dna_seq[np.array([False,False,True,True,True])])
|
|
110
|
+
GTA
|
|
111
|
+
|
|
112
|
+
Sequence manipulation:
|
|
113
|
+
|
|
114
|
+
>>> dna_copy = dna_seq.copy()
|
|
115
|
+
>>> dna_copy[2] = "C"
|
|
116
|
+
>>> print(dna_copy)
|
|
117
|
+
ACCTA
|
|
118
|
+
>>> dna_copy = dna_seq.copy()
|
|
119
|
+
>>> dna_copy[0:2] = dna_copy[3:5]
|
|
120
|
+
>>> print(dna_copy)
|
|
121
|
+
TAGTA
|
|
122
|
+
>>> dna_copy = dna_seq.copy()
|
|
123
|
+
>>> dna_copy[np.array([True,False,False,False,True])] = "T"
|
|
124
|
+
>>> print(dna_copy)
|
|
125
|
+
TCGTT
|
|
126
|
+
>>> dna_copy = dna_seq.copy()
|
|
127
|
+
>>> dna_copy[1:4] = np.array([0,1,2])
|
|
128
|
+
>>> print(dna_copy)
|
|
129
|
+
AACGA
|
|
130
|
+
|
|
131
|
+
Reverse sequence:
|
|
132
|
+
|
|
133
|
+
>>> dna_seq_rev = dna_seq.reverse()
|
|
134
|
+
>>> print(dna_seq_rev)
|
|
135
|
+
ATGCA
|
|
136
|
+
|
|
137
|
+
Concatenate the two sequences:
|
|
138
|
+
|
|
139
|
+
>>> dna_seq_concat = dna_seq + dna_seq_rev
|
|
140
|
+
>>> print(dna_seq_concat)
|
|
141
|
+
ACGTAATGCA
|
|
142
|
+
"""
|
|
143
|
+
|
|
144
|
+
def __init__(self, sequence=()):
|
|
145
|
+
self.symbols = sequence
|
|
146
|
+
|
|
147
|
+
def copy(self, new_seq_code=None):
    """
    Create a copy of this sequence.

    Parameters
    ----------
    new_seq_code : ndarray, optional
        If given, the copy is equipped with this sequence code
        instead of a copy of the sequence code of this object.

    Returns
    -------
    copy
        A copy of this object.
    """
    # This override avoids copying the entire sequence code,
    # when the caller already supplies the (sub)sequence code
    # the copy should carry
    duplicate = self.__copy_create__()
    duplicate.code = np.copy(self.code) if new_seq_code is None else new_seq_code
    self.__copy_fill__(duplicate)
    return duplicate
|
|
172
|
+
|
|
173
|
+
@property
|
|
174
|
+
def symbols(self):
|
|
175
|
+
return self.get_alphabet().decode_multiple(self.code)
|
|
176
|
+
|
|
177
|
+
@symbols.setter
|
|
178
|
+
def symbols(self, value):
|
|
179
|
+
alph = self.get_alphabet()
|
|
180
|
+
dtype = Sequence.dtype(len(alph))
|
|
181
|
+
self._seq_code = alph.encode_multiple(value, dtype)
|
|
182
|
+
|
|
183
|
+
@property
|
|
184
|
+
def code(self):
|
|
185
|
+
return self._seq_code
|
|
186
|
+
|
|
187
|
+
@code.setter
|
|
188
|
+
def code(self, value):
|
|
189
|
+
dtype = Sequence.dtype(len(self.get_alphabet()))
|
|
190
|
+
if not isinstance(value, np.ndarray):
|
|
191
|
+
raise TypeError("Sequence code must be an integer ndarray")
|
|
192
|
+
self._seq_code = value.astype(dtype, copy=False)
|
|
193
|
+
|
|
194
|
+
@property
|
|
195
|
+
def alphabet(self):
|
|
196
|
+
return self.get_alphabet()
|
|
197
|
+
|
|
198
|
+
@abc.abstractmethod
def get_alphabet(self):
    """
    Return the :class:`Alphabet` this :class:`Sequence` is based on.

    Every subclass of :class:`Sequence` is required to override
    this abstract method.

    Returns
    -------
    alphabet : Alphabet
        The alphabet of the :class:`Sequence`.
    """
|
|
212
|
+
|
|
213
|
+
def reverse(self, copy=True):
    """
    Get this :class:`Sequence` in reversed order.

    Parameters
    ----------
    copy : bool, optional
        If set to False, the code :class:`ndarray` of the returned
        sequence is an array view to the sequence code of this
        object.
        In this case, manipulations on the returned sequence would
        also affect this object.
        Otherwise, the sequence code is copied.

    Returns
    -------
    reversed : Sequence
        The reversed :class:`Sequence`.

    Examples
    --------

    >>> dna_seq = NucleotideSequence("ACGTA")
    >>> dna_seq_rev = dna_seq.reverse()
    >>> print(dna_seq_rev)
    ATGCA
    """
    # `np.flip()` only creates a view on the original sequence code
    flipped_view = np.flip(self._seq_code, axis=0)
    return self.copy(np.copy(flipped_view) if copy else flipped_view)
|
|
244
|
+
|
|
245
|
+
def is_valid(self):
    """
    Check, if the sequence contains a valid sequence code.

    A sequence code is valid, if at each sequence position the
    code is smaller than the size of the alphabet.

    Invalid code means that the code cannot be decoded into
    symbols. Furthermore invalid code can lead to serious
    errors in alignments, since the substitution matrix
    is indexed with an invalid index.

    Returns
    -------
    valid : bool
        True, if the sequence is valid, false otherwise.
        An empty sequence is valid.
    """
    # `ndarray.all()` gives a NumPy boolean scalar:
    # Convert it, so that a built-in `bool` is returned as documented
    return bool((self.code < len(self.get_alphabet())).all())
|
|
263
|
+
|
|
264
|
+
def get_symbol_frequency(self):
    """
    Count how often each symbol of the alphabet occurs in the
    sequence.

    Symbols that are part of the alphabet, but do not appear in the
    sequence, are reported with 0 occurrences.

    Returns
    -------
    frequency : dict
        A dictionary mapping each symbol of the alphabet to its
        number of occurrences in the sequence.
    """
    alphabet = self.get_alphabet()
    # `minlength` makes sure that there is a count for each symbol
    # of the alphabet, even if the last symbols do not occur
    occurrences = np.bincount(self._seq_code, minlength=len(alphabet))
    return dict(zip(alphabet.get_symbols(), occurrences))
|
|
283
|
+
|
|
284
|
+
def __getitem__(self, index):
|
|
285
|
+
alph = self.get_alphabet()
|
|
286
|
+
sub_seq = self._seq_code.__getitem__(index)
|
|
287
|
+
if isinstance(sub_seq, np.ndarray):
|
|
288
|
+
return self.copy(sub_seq)
|
|
289
|
+
else:
|
|
290
|
+
return alph.decode(sub_seq)
|
|
291
|
+
|
|
292
|
+
def __setitem__(self, index, item):
|
|
293
|
+
alph = self.get_alphabet()
|
|
294
|
+
if isinstance(index, numbers.Integral):
|
|
295
|
+
# Expect a single symbol
|
|
296
|
+
code = alph.encode(item)
|
|
297
|
+
else:
|
|
298
|
+
# Expect multiple symbols
|
|
299
|
+
if isinstance(item, Sequence):
|
|
300
|
+
code = item.code
|
|
301
|
+
elif isinstance(item, np.ndarray):
|
|
302
|
+
code = item
|
|
303
|
+
else:
|
|
304
|
+
# Default: item is iterable object of symbols
|
|
305
|
+
code = alph.encode_multiple(item)
|
|
306
|
+
self._seq_code.__setitem__(index, code)
|
|
307
|
+
|
|
308
|
+
def __len__(self):
|
|
309
|
+
return len(self._seq_code)
|
|
310
|
+
|
|
311
|
+
def __iter__(self):
|
|
312
|
+
alph = self.get_alphabet()
|
|
313
|
+
i = 0
|
|
314
|
+
while i < len(self):
|
|
315
|
+
yield alph.decode(self._seq_code[i])
|
|
316
|
+
i += 1
|
|
317
|
+
|
|
318
|
+
def __eq__(self, item):
|
|
319
|
+
if not isinstance(item, type(self)):
|
|
320
|
+
return False
|
|
321
|
+
if self.get_alphabet() != item.get_alphabet():
|
|
322
|
+
return False
|
|
323
|
+
return np.array_equal(self._seq_code, item._seq_code)
|
|
324
|
+
|
|
325
|
+
def __str__(self):
    alphabet = self.get_alphabet()
    if not isinstance(alphabet, LetterAlphabet):
        # General case: comma separated string representations
        # of the individual symbols
        decoded = alphabet.decode_multiple(self._seq_code)
        return ", ".join(str(symbol) for symbol in decoded)
    # Letter alphabets are decoded into a byte array,
    # whose buffer is interpreted as ASCII text
    letters = alphabet.decode_multiple(self._seq_code, as_bytes=True)
    return letters.tobytes().decode("ASCII")
|
|
335
|
+
|
|
336
|
+
def __add__(self, sequence):
|
|
337
|
+
if self.get_alphabet().extends(sequence.get_alphabet()):
|
|
338
|
+
new_code = np.concatenate((self._seq_code, sequence._seq_code))
|
|
339
|
+
new_seq = self.copy(new_code)
|
|
340
|
+
return new_seq
|
|
341
|
+
elif sequence.get_alphabet().extends(self.get_alphabet()):
|
|
342
|
+
new_code = np.concatenate((self._seq_code, sequence._seq_code))
|
|
343
|
+
new_seq = sequence.copy(new_code)
|
|
344
|
+
return new_seq
|
|
345
|
+
else:
|
|
346
|
+
raise ValueError("The sequences alphabets are not compatible")
|
|
347
|
+
|
|
348
|
+
@staticmethod
def dtype(alphabet_size):
    """
    Determine the sequence code dtype that is required for an
    alphabet of the given size.

    Parameters
    ----------
    alphabet_size : int
        The size of the alphabet.

    Returns
    -------
    dtype
        The :class:`dtype`, that is large enough to store symbol
        codes, that are encoded by an :class:`Alphabet` of the given
        size.
    """
    # Candidates in ascending order: the first type, whose number of
    # representable values covers the alphabet size, is chosen
    candidates = (
        (_size_uint8, np.uint8),
        (_size_uint16, np.uint16),
        (_size_uint32, np.uint32),
    )
    for capacity, code_type in candidates:
        if alphabet_size <= capacity:
            return code_type
    return np.uint64
|
biotite/setup_ccd.py
ADDED
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
__author__ = "Patrick Kunzmann"
|
|
2
|
+
__all__ = []
|
|
3
|
+
|
|
4
|
+
import gzip
|
|
5
|
+
import logging
|
|
6
|
+
from collections import defaultdict
|
|
7
|
+
from io import StringIO
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
import numpy as np
|
|
10
|
+
import requests
|
|
11
|
+
from biotite.structure.io.pdbx import *
|
|
12
|
+
|
|
13
|
+
OUTPUT_CCD = Path(__file__).parent / "structure" / "info" / "components.bcif"
|
|
14
|
+
CCD_URL = "https://files.wwpdb.org/pub/pdb/data/monomers/components.cif.gz"
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def concatenate_ccd(categories=None):
    """
    Create the CCD in BinaryCIF format, where each category contains the
    data of all blocks.

    Parameters
    ----------
    categories : list of str, optional
        The names of the categories to include.
        By default, all categories from the CCD are included.

    Returns
    -------
    compressed_file : BinaryCIFFile
        The compressed CCD in BinaryCIF format.

    Raises
    ------
    requests.HTTPError
        If the server answers the download request with an HTTP error
        status.
    """

    logging.info("Download and read CCD...")
    response = requests.get(CCD_URL)
    # Fail with a clear HTTP error instead of passing the body of an
    # error response to the gzip decompression
    response.raise_for_status()
    ccd_cif_text = gzip.decompress(response.content).decode()
    ccd_file = CIFFile.read(StringIO(ccd_cif_text))

    compressed_block = BinaryCIFBlock()
    if categories is None:
        categories = _list_all_category_names(ccd_file)
    for category_name in categories:
        logging.info(f"Concatenate and compress '{category_name}' category...")
        compressed_block[category_name] = compress(
            _concatenate_blocks_into_category(ccd_file, category_name)
        )

    logging.info("Write concatenated CCD into BinaryCIF...")
    compressed_file = BinaryCIFFile()
    # The concatenated data is stored in a single block
    compressed_file["components"] = compressed_block
    return compressed_file
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _concatenate_blocks_into_category(pdbx_file, category_name):
    """
    Merge the rows of a category from every block of a file into one
    single category.

    Blocks that do not contain the category are skipped.
    If a block contains the category, but lacks one of the columns
    found in other blocks, the rows of this block are marked as
    missing in that column.

    Parameters
    ----------
    pdbx_file : PDBxFile
        The PDBx file, whose blocks should be concatenated.
    category_name : str
        The name of the category to concatenate.

    Returns
    -------
    category : BinaryCIFCategory
        The concatenated category.
    """
    columns_names = _list_all_column_names(pdbx_file, category_name)
    data_parts = {name: [] for name in columns_names}
    mask_parts = {name: [] for name in columns_names}

    for block in pdbx_file.values():
        if category_name not in block:
            continue
        category = block[category_name]
        for name in columns_names:
            if name not in category:
                # The block does not have this column
                # -> fill its rows with placeholders masked as 'missing'
                # The placeholders are strings, as all other arrays
                # are strings as well for now, since they are read
                # from a CIF file
                data_parts[name].append(
                    np.full(category.row_count, "", dtype="U1")
                )
                mask_parts[name].append(
                    np.full(category.row_count, MaskValue.MISSING, dtype=np.uint8)
                )
                continue

            column = category[name]
            data_parts[name].append(column.data.array)
            if column.mask is None:
                # No mask means that every value is present
                mask_part = np.full(
                    category.row_count, MaskValue.PRESENT, dtype=np.uint8
                )
            else:
                mask_part = column.mask.array
            mask_parts[name].append(mask_part)

    bcif_columns = {}
    for name in columns_names:
        full_data = np.concatenate(data_parts[name])
        full_mask = np.concatenate(mask_parts[name])
        full_data = _into_fitting_type(full_data, full_mask)
        # A mask is only stored, if at least one value is not present
        all_present = np.all(full_mask == MaskValue.PRESENT)
        bcif_columns[name] = BinaryCIFColumn(
            full_data, None if all_present else full_mask
        )
    return BinaryCIFCategory(bcif_columns)
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def _list_all_column_names(pdbx_file, category_name):
    """
    Collect the names of all columns a category has in at least one
    block.

    Parameters
    ----------
    pdbx_file : PDBxFile
        The PDBx file to search in for the columns.
    category_name : str
        The name of the category to search in.

    Returns
    -------
    columns_names : list of str
        The sorted, duplicate-free names of the columns.
    """
    unique_names = {
        column_name
        for block in pdbx_file.values()
        if category_name in block
        for column_name in block[category_name].keys()
    }
    return sorted(unique_names)
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def _list_all_category_names(pdbx_file):
    """
    Collect the names of all categories that appear in at least one
    block.

    Parameters
    ----------
    pdbx_file : PDBxFile
        The PDBx file to search in for the categories.

    Returns
    -------
    category_names : list of str
        The sorted, duplicate-free names of the categories.
    """
    unique_names = {
        category_name
        for block in pdbx_file.values()
        for category_name in block.keys()
    }
    return sorted(unique_names)
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def _into_fitting_type(string_array, mask):
    """
    Convert a string ndarray into a numeric ndarray, if its values allow
    it.

    An integer conversion is attempted first, followed by a float
    conversion.
    If both attempts fail, the string dtype is kept.

    Parameters
    ----------
    string_array : ndarray, dtype=string
        The array to convert.
    mask : ndarray, dtype=uint8
        Only values in `string_array` where the mask is ``MaskValue.PRESENT`` are
        considered for type conversion.

    Returns
    -------
    array : ndarray
        The array with the chosen dtype and the shape of `string_array`.
        Positions that are not ``MaskValue.PRESENT`` keep the zero
        value of that dtype.
    """
    is_present = mask == MaskValue.PRESENT
    # Values that are not present must not influence the dtype choice
    present_values = string_array[is_present]

    # Candidates ordered from the most to the least restrictive type
    for numeric_type in (int, float):
        try:
            present_values = present_values.astype(numeric_type)
        except ValueError:
            # Not convertible -> try the next candidate,
            # or keep the string type if there is none left
            continue
        break

    converted = np.zeros(string_array.shape, dtype=present_values.dtype)
    converted[is_present] = present_values
    return converted
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def main():
    """
    Build the concatenated CCD from the chemical component, atom and
    bond categories and write it to ``OUTPUT_CCD``.
    """
    logging.basicConfig(level=logging.INFO, format="%(levelname)s:%(message)s")
    # The output directory may not exist yet
    OUTPUT_CCD.parent.mkdir(parents=True, exist_ok=True)

    included_categories = ["chem_comp", "chem_comp_atom", "chem_comp_bond"]
    concatenate_ccd(included_categories).write(OUTPUT_CCD)


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
# This source code is part of the Biotite package and is distributed
|
|
2
|
+
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
|
|
3
|
+
# information.
|
|
4
|
+
|
|
5
|
+
"""
|
|
6
|
+
A subpackage for handling molecular structures.
|
|
7
|
+
|
|
8
|
+
In this context an atom is described by two kinds of attributes: the
|
|
9
|
+
coordinates and the annotations. The annotations include information
|
|
10
|
+
about polypeptide chain id, residue id, residue name, hetero atom
|
|
11
|
+
information, atom name and optionally more. The coordinates are a
|
|
12
|
+
`NumPy` float :class:`ndarray` of length 3, containing the x, y and z
|
|
13
|
+
coordinates.
|
|
14
|
+
|
|
15
|
+
An :class:`Atom` contains data for a single atom, it stores the
|
|
16
|
+
annotations as scalar values and the coordinates as length 3
|
|
17
|
+
:class:`ndarray`.
|
|
18
|
+
|
|
19
|
+
An :class:`AtomArray` stores data for an entire structure model
|
|
20
|
+
containing *n* atoms.
|
|
21
|
+
Therefore the annotations are represented as :class:`ndarray` objects of
|
|
22
|
+
length *n*, the so called annotation arrays.
|
|
23
|
+
The coordinates are a *(n x 3)* :class:`ndarray`.
|
|
24
|
+
|
|
25
|
+
An :class:`AtomArrayStack` stores data for *m* models, where each model
|
|
26
|
+
contains the same atoms at different positions.
|
|
27
|
+
Hence, the annotation arrays are represented as :class:`ndarray` objects
|
|
28
|
+
of length *n* like the :class:`AtomArray`, while the coordinates are a
|
|
29
|
+
*(m x n x 3)* :class:`ndarray`.
|
|
30
|
+
|
|
31
|
+
Like an :class:`AtomArray` can be iterated to obtain :class:`Atom`
|
|
32
|
+
objects, an :class:`AtomArrayStack` yields :class:`AtomArray` objects.
|
|
33
|
+
All three types must not be subclassed.
|
|
34
|
+
|
|
35
|
+
The following annotation categories are mandatory:
|
|
36
|
+
|
|
37
|
+
========= =========== ================= =======================================
|
|
38
|
+
Category Type Examples Description
|
|
39
|
+
========= =========== ================= =======================================
|
|
40
|
+
chain_id string (U4) 'A','S','AB', ... Polypeptide chain
|
|
41
|
+
res_id int 1,2,3, ... Sequence position of residue
|
|
42
|
+
ins_code string (U1) '', 'A','B',.. PDB insertion code (iCode)
|
|
43
|
+
res_name string (U5) 'GLY','ALA', ... Residue name
|
|
44
|
+
hetero bool True, False False for ``ATOM``, true for ``HETATM``
|
|
45
|
+
atom_name string (U6) 'CA','N', ... Atom name
|
|
46
|
+
element string (U2) 'C','O','SE', ... Chemical Element
|
|
47
|
+
========= =========== ================= =======================================
|
|
48
|
+
|
|
49
|
+
For all :class:`Atom`, :class:`AtomArray` and :class:`AtomArrayStack`
|
|
50
|
+
objects these annotations are initially set with default values.
|
|
51
|
+
In addition to these annotations, an arbitrary number of annotation
|
|
52
|
+
categories can be added via :func:`add_annotation()` or
|
|
53
|
+
:func:`set_annotation()`.
|
|
54
|
+
The annotation arrays can be accessed either via the method
|
|
55
|
+
:func:`get_annotation()` or directly (e.g. ``array.res_id``).
|
|
56
|
+
|
|
57
|
+
The following annotation categories are optionally used by some
|
|
58
|
+
functions:
|
|
59
|
+
|
|
60
|
+
========= =========== ================= =========================================
|
|
61
|
+
Category Type Examples Description
|
|
62
|
+
========= =========== ================= =========================================
|
|
63
|
+
atom_id int 1,2,3, ... Atom serial number
|
|
64
|
+
b_factor float 0.9, 12.3, ... Temperature factor
|
|
65
|
+
occupancy float .1, .3, .9, ... Occupancy
|
|
66
|
+
charge int -2,-1,0,1,2, ... Electric charge of the atom
|
|
67
|
+
sym_id string '1','2','3', ... Symmetry ID for assemblies/symmetry mates
|
|
68
|
+
========= =========== ================= =========================================
|
|
69
|
+
|
|
70
|
+
For each type, the attributes can be accessed directly.
|
|
71
|
+
Both :class:`AtomArray` and :class:`AtomArrayStack` support
|
|
72
|
+
*NumPy* style indexing.
|
|
73
|
+
The index is propagated to each attribute.
|
|
74
|
+
If a single integer is used as index,
|
|
75
|
+
an object with one dimension less is returned
|
|
76
|
+
(:class:`AtomArrayStack` -> :class:`AtomArray`,
|
|
77
|
+
:class:`AtomArray` -> :class:`Atom`).
|
|
78
|
+
If a slice, index array or a boolean mask is given, a substructure is
|
|
79
|
+
returned
|
|
80
|
+
(:class:`AtomArrayStack` -> :class:`AtomArrayStack`,
|
|
81
|
+
:class:`AtomArray` -> :class:`AtomArray`)
|
|
82
|
+
As in *NumPy*, these are not necessarily deep copies of the originals:
|
|
83
|
+
The attributes of the sliced object may still point to the original
|
|
84
|
+
:class:`ndarray`.
|
|
85
|
+
Use the :func:`copy()` method if a deep copy is required.
|
|
86
|
+
|
|
87
|
+
Bond information can be associated to an :class:`AtomArray` or
|
|
88
|
+
:class:`AtomArrayStack` by setting the ``bonds`` attribute with a
|
|
89
|
+
:class:`BondList`.
|
|
90
|
+
A :class:`BondList` specifies the indices of atoms that form chemical
|
|
91
|
+
bonds.
|
|
92
|
+
Some functionalities require that the input structure has an associated
|
|
93
|
+
:class:`BondList`.
|
|
94
|
+
If no :class:`BondList` is associated, the ``bonds`` attribute is
|
|
95
|
+
``None``.
|
|
96
|
+
|
|
97
|
+
Based on the implementation in *NumPy* arrays, this package furthermore
|
|
98
|
+
contains a comprehensive set of functions for structure analysis,
|
|
99
|
+
manipulation and visualization.
|
|
100
|
+
|
|
101
|
+
The universal length unit in this package is Å.
|
|
102
|
+
"""
|
|
103
|
+
|
|
104
|
+
__name__ = "biotite.structure"
|
|
105
|
+
__author__ = "Patrick Kunzmann"
|
|
106
|
+
|
|
107
|
+
from .atoms import *
|
|
108
|
+
from .basepairs import *
|
|
109
|
+
from .bonds import *
|
|
110
|
+
from .box import *
|
|
111
|
+
from .celllist import *
|
|
112
|
+
from .chains import *
|
|
113
|
+
from .charges import *
|
|
114
|
+
from .compare import *
|
|
115
|
+
from .density import *
|
|
116
|
+
from .dotbracket import *
|
|
117
|
+
from .error import *
|
|
118
|
+
from .filter import *
|
|
119
|
+
from .geometry import *
|
|
120
|
+
from .hbond import *
|
|
121
|
+
from .integrity import *
|
|
122
|
+
from .mechanics import *
|
|
123
|
+
from .molecules import *
|
|
124
|
+
from .pseudoknots import *
|
|
125
|
+
from .rdf import *
|
|
126
|
+
from .repair import *
|
|
127
|
+
from .residues import *
|
|
128
|
+
from .rings import *
|
|
129
|
+
from .sasa import *
|
|
130
|
+
from .sequence import *
|
|
131
|
+
from .sse import *
|
|
132
|
+
from .superimpose import *
|
|
133
|
+
from .tm import *
|
|
134
|
+
from .transform import *
|
|
135
|
+
# util and segments are used internally
|