biotite 1.5.0__cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biotite might be problematic. Click here for more details.
- biotite/__init__.py +18 -0
- biotite/application/__init__.py +69 -0
- biotite/application/application.py +276 -0
- biotite/application/autodock/__init__.py +12 -0
- biotite/application/autodock/app.py +500 -0
- biotite/application/blast/__init__.py +14 -0
- biotite/application/blast/alignment.py +92 -0
- biotite/application/blast/webapp.py +428 -0
- biotite/application/clustalo/__init__.py +12 -0
- biotite/application/clustalo/app.py +223 -0
- biotite/application/dssp/__init__.py +12 -0
- biotite/application/dssp/app.py +216 -0
- biotite/application/localapp.py +342 -0
- biotite/application/mafft/__init__.py +12 -0
- biotite/application/mafft/app.py +116 -0
- biotite/application/msaapp.py +363 -0
- biotite/application/muscle/__init__.py +13 -0
- biotite/application/muscle/app3.py +227 -0
- biotite/application/muscle/app5.py +163 -0
- biotite/application/sra/__init__.py +18 -0
- biotite/application/sra/app.py +447 -0
- biotite/application/tantan/__init__.py +12 -0
- biotite/application/tantan/app.py +199 -0
- biotite/application/util.py +77 -0
- biotite/application/viennarna/__init__.py +18 -0
- biotite/application/viennarna/rnaalifold.py +310 -0
- biotite/application/viennarna/rnafold.py +254 -0
- biotite/application/viennarna/rnaplot.py +208 -0
- biotite/application/viennarna/util.py +77 -0
- biotite/application/webapp.py +76 -0
- biotite/copyable.py +71 -0
- biotite/database/__init__.py +23 -0
- biotite/database/afdb/__init__.py +12 -0
- biotite/database/afdb/download.py +197 -0
- biotite/database/entrez/__init__.py +15 -0
- biotite/database/entrez/check.py +60 -0
- biotite/database/entrez/dbnames.py +101 -0
- biotite/database/entrez/download.py +228 -0
- biotite/database/entrez/key.py +44 -0
- biotite/database/entrez/query.py +263 -0
- biotite/database/error.py +16 -0
- biotite/database/pubchem/__init__.py +21 -0
- biotite/database/pubchem/download.py +258 -0
- biotite/database/pubchem/error.py +30 -0
- biotite/database/pubchem/query.py +819 -0
- biotite/database/pubchem/throttle.py +98 -0
- biotite/database/rcsb/__init__.py +13 -0
- biotite/database/rcsb/download.py +161 -0
- biotite/database/rcsb/query.py +963 -0
- biotite/database/uniprot/__init__.py +13 -0
- biotite/database/uniprot/check.py +40 -0
- biotite/database/uniprot/download.py +126 -0
- biotite/database/uniprot/query.py +292 -0
- biotite/file.py +244 -0
- biotite/interface/__init__.py +19 -0
- biotite/interface/openmm/__init__.py +20 -0
- biotite/interface/openmm/state.py +93 -0
- biotite/interface/openmm/system.py +227 -0
- biotite/interface/pymol/__init__.py +201 -0
- biotite/interface/pymol/cgo.py +346 -0
- biotite/interface/pymol/convert.py +185 -0
- biotite/interface/pymol/display.py +267 -0
- biotite/interface/pymol/object.py +1228 -0
- biotite/interface/pymol/shapes.py +178 -0
- biotite/interface/pymol/startup.py +169 -0
- biotite/interface/rdkit/__init__.py +19 -0
- biotite/interface/rdkit/mol.py +490 -0
- biotite/interface/version.py +94 -0
- biotite/interface/warning.py +19 -0
- biotite/sequence/__init__.py +84 -0
- biotite/sequence/align/__init__.py +199 -0
- biotite/sequence/align/alignment.py +702 -0
- biotite/sequence/align/banded.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/banded.pyx +652 -0
- biotite/sequence/align/buckets.py +71 -0
- biotite/sequence/align/cigar.py +425 -0
- biotite/sequence/align/kmeralphabet.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/kmeralphabet.pyx +595 -0
- biotite/sequence/align/kmersimilarity.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/kmersimilarity.pyx +233 -0
- biotite/sequence/align/kmertable.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/kmertable.pyx +3411 -0
- biotite/sequence/align/localgapped.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/localgapped.pyx +892 -0
- biotite/sequence/align/localungapped.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/localungapped.pyx +279 -0
- biotite/sequence/align/matrix.py +631 -0
- biotite/sequence/align/matrix_data/3Di.mat +24 -0
- biotite/sequence/align/matrix_data/BLOSUM100.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM30.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM35.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM40.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM45.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM50.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM50_13p.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM50_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM50_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM55.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM60.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM62.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM62_13p.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM62_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM62_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM65.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM70.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM75.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM80.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM85.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM90.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUMN.mat +31 -0
- biotite/sequence/align/matrix_data/CorBLOSUM49_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM57_13p.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM57_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM61_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM66_13p.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM67_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/DAYHOFF.mat +32 -0
- biotite/sequence/align/matrix_data/GONNET.mat +26 -0
- biotite/sequence/align/matrix_data/IDENTITY.mat +25 -0
- biotite/sequence/align/matrix_data/MATCH.mat +25 -0
- biotite/sequence/align/matrix_data/NUC.mat +25 -0
- biotite/sequence/align/matrix_data/PAM10.mat +34 -0
- biotite/sequence/align/matrix_data/PAM100.mat +34 -0
- biotite/sequence/align/matrix_data/PAM110.mat +34 -0
- biotite/sequence/align/matrix_data/PAM120.mat +34 -0
- biotite/sequence/align/matrix_data/PAM130.mat +34 -0
- biotite/sequence/align/matrix_data/PAM140.mat +34 -0
- biotite/sequence/align/matrix_data/PAM150.mat +34 -0
- biotite/sequence/align/matrix_data/PAM160.mat +34 -0
- biotite/sequence/align/matrix_data/PAM170.mat +34 -0
- biotite/sequence/align/matrix_data/PAM180.mat +34 -0
- biotite/sequence/align/matrix_data/PAM190.mat +34 -0
- biotite/sequence/align/matrix_data/PAM20.mat +34 -0
- biotite/sequence/align/matrix_data/PAM200.mat +34 -0
- biotite/sequence/align/matrix_data/PAM210.mat +34 -0
- biotite/sequence/align/matrix_data/PAM220.mat +34 -0
- biotite/sequence/align/matrix_data/PAM230.mat +34 -0
- biotite/sequence/align/matrix_data/PAM240.mat +34 -0
- biotite/sequence/align/matrix_data/PAM250.mat +34 -0
- biotite/sequence/align/matrix_data/PAM260.mat +34 -0
- biotite/sequence/align/matrix_data/PAM270.mat +34 -0
- biotite/sequence/align/matrix_data/PAM280.mat +34 -0
- biotite/sequence/align/matrix_data/PAM290.mat +34 -0
- biotite/sequence/align/matrix_data/PAM30.mat +34 -0
- biotite/sequence/align/matrix_data/PAM300.mat +34 -0
- biotite/sequence/align/matrix_data/PAM310.mat +34 -0
- biotite/sequence/align/matrix_data/PAM320.mat +34 -0
- biotite/sequence/align/matrix_data/PAM330.mat +34 -0
- biotite/sequence/align/matrix_data/PAM340.mat +34 -0
- biotite/sequence/align/matrix_data/PAM350.mat +34 -0
- biotite/sequence/align/matrix_data/PAM360.mat +34 -0
- biotite/sequence/align/matrix_data/PAM370.mat +34 -0
- biotite/sequence/align/matrix_data/PAM380.mat +34 -0
- biotite/sequence/align/matrix_data/PAM390.mat +34 -0
- biotite/sequence/align/matrix_data/PAM40.mat +34 -0
- biotite/sequence/align/matrix_data/PAM400.mat +34 -0
- biotite/sequence/align/matrix_data/PAM410.mat +34 -0
- biotite/sequence/align/matrix_data/PAM420.mat +34 -0
- biotite/sequence/align/matrix_data/PAM430.mat +34 -0
- biotite/sequence/align/matrix_data/PAM440.mat +34 -0
- biotite/sequence/align/matrix_data/PAM450.mat +34 -0
- biotite/sequence/align/matrix_data/PAM460.mat +34 -0
- biotite/sequence/align/matrix_data/PAM470.mat +34 -0
- biotite/sequence/align/matrix_data/PAM480.mat +34 -0
- biotite/sequence/align/matrix_data/PAM490.mat +34 -0
- biotite/sequence/align/matrix_data/PAM50.mat +34 -0
- biotite/sequence/align/matrix_data/PAM500.mat +34 -0
- biotite/sequence/align/matrix_data/PAM60.mat +34 -0
- biotite/sequence/align/matrix_data/PAM70.mat +34 -0
- biotite/sequence/align/matrix_data/PAM80.mat +34 -0
- biotite/sequence/align/matrix_data/PAM90.mat +34 -0
- biotite/sequence/align/matrix_data/PB.license +21 -0
- biotite/sequence/align/matrix_data/PB.mat +18 -0
- biotite/sequence/align/matrix_data/RBLOSUM52_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM59_13p.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM59_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM64_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM69_13p.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM69_14.3.mat +25 -0
- biotite/sequence/align/multiple.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/multiple.pyx +619 -0
- biotite/sequence/align/pairwise.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/pairwise.pyx +585 -0
- biotite/sequence/align/permutation.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/permutation.pyx +313 -0
- biotite/sequence/align/primes.txt +821 -0
- biotite/sequence/align/selector.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/selector.pyx +954 -0
- biotite/sequence/align/statistics.py +264 -0
- biotite/sequence/align/tracetable.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/tracetable.pxd +64 -0
- biotite/sequence/align/tracetable.pyx +370 -0
- biotite/sequence/alphabet.py +555 -0
- biotite/sequence/annotation.py +836 -0
- biotite/sequence/codec.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/codec.pyx +155 -0
- biotite/sequence/codon.py +476 -0
- biotite/sequence/codon_tables.txt +202 -0
- biotite/sequence/graphics/__init__.py +33 -0
- biotite/sequence/graphics/alignment.py +1101 -0
- biotite/sequence/graphics/color_schemes/3di_flower.json +48 -0
- biotite/sequence/graphics/color_schemes/autumn.json +51 -0
- biotite/sequence/graphics/color_schemes/blossom.json +51 -0
- biotite/sequence/graphics/color_schemes/clustalx_dna.json +11 -0
- biotite/sequence/graphics/color_schemes/clustalx_protein.json +28 -0
- biotite/sequence/graphics/color_schemes/flower.json +51 -0
- biotite/sequence/graphics/color_schemes/jalview_buried.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_hydrophobicity.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_prop_helix.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_prop_strand.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_prop_turn.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_taylor.json +28 -0
- biotite/sequence/graphics/color_schemes/jalview_zappo.json +28 -0
- biotite/sequence/graphics/color_schemes/ocean.json +51 -0
- biotite/sequence/graphics/color_schemes/pb_flower.json +40 -0
- biotite/sequence/graphics/color_schemes/rainbow_dna.json +11 -0
- biotite/sequence/graphics/color_schemes/rainbow_protein.json +30 -0
- biotite/sequence/graphics/color_schemes/spring.json +51 -0
- biotite/sequence/graphics/color_schemes/sunset.json +51 -0
- biotite/sequence/graphics/color_schemes/wither.json +51 -0
- biotite/sequence/graphics/colorschemes.py +170 -0
- biotite/sequence/graphics/dendrogram.py +231 -0
- biotite/sequence/graphics/features.py +544 -0
- biotite/sequence/graphics/logo.py +102 -0
- biotite/sequence/graphics/plasmid.py +712 -0
- biotite/sequence/io/__init__.py +12 -0
- biotite/sequence/io/fasta/__init__.py +22 -0
- biotite/sequence/io/fasta/convert.py +283 -0
- biotite/sequence/io/fasta/file.py +265 -0
- biotite/sequence/io/fastq/__init__.py +19 -0
- biotite/sequence/io/fastq/convert.py +117 -0
- biotite/sequence/io/fastq/file.py +507 -0
- biotite/sequence/io/genbank/__init__.py +17 -0
- biotite/sequence/io/genbank/annotation.py +269 -0
- biotite/sequence/io/genbank/file.py +573 -0
- biotite/sequence/io/genbank/metadata.py +336 -0
- biotite/sequence/io/genbank/sequence.py +173 -0
- biotite/sequence/io/general.py +201 -0
- biotite/sequence/io/gff/__init__.py +26 -0
- biotite/sequence/io/gff/convert.py +128 -0
- biotite/sequence/io/gff/file.py +449 -0
- biotite/sequence/phylo/__init__.py +36 -0
- biotite/sequence/phylo/nj.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/phylo/nj.pyx +221 -0
- biotite/sequence/phylo/tree.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/phylo/tree.pyx +1169 -0
- biotite/sequence/phylo/upgma.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/phylo/upgma.pyx +164 -0
- biotite/sequence/profile.py +561 -0
- biotite/sequence/search.py +117 -0
- biotite/sequence/seqtypes.py +720 -0
- biotite/sequence/sequence.py +373 -0
- biotite/setup_ccd.py +197 -0
- biotite/structure/__init__.py +135 -0
- biotite/structure/alphabet/__init__.py +25 -0
- biotite/structure/alphabet/encoder.py +332 -0
- biotite/structure/alphabet/encoder_weights_3di.kerasify +0 -0
- biotite/structure/alphabet/i3d.py +109 -0
- biotite/structure/alphabet/layers.py +86 -0
- biotite/structure/alphabet/pb.license +21 -0
- biotite/structure/alphabet/pb.py +170 -0
- biotite/structure/alphabet/unkerasify.py +128 -0
- biotite/structure/atoms.py +1562 -0
- biotite/structure/basepairs.py +1403 -0
- biotite/structure/bonds.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/structure/bonds.pyx +2036 -0
- biotite/structure/box.py +724 -0
- biotite/structure/celllist.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/structure/celllist.pyx +864 -0
- biotite/structure/chains.py +310 -0
- biotite/structure/charges.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/structure/charges.pyx +520 -0
- biotite/structure/compare.py +683 -0
- biotite/structure/density.py +109 -0
- biotite/structure/dotbracket.py +213 -0
- biotite/structure/error.py +39 -0
- biotite/structure/filter.py +591 -0
- biotite/structure/geometry.py +817 -0
- biotite/structure/graphics/__init__.py +13 -0
- biotite/structure/graphics/atoms.py +243 -0
- biotite/structure/graphics/rna.py +298 -0
- biotite/structure/hbond.py +425 -0
- biotite/structure/info/__init__.py +24 -0
- biotite/structure/info/atom_masses.json +121 -0
- biotite/structure/info/atoms.py +98 -0
- biotite/structure/info/bonds.py +149 -0
- biotite/structure/info/ccd.py +200 -0
- biotite/structure/info/components.bcif +0 -0
- biotite/structure/info/groups.py +128 -0
- biotite/structure/info/masses.py +121 -0
- biotite/structure/info/misc.py +137 -0
- biotite/structure/info/radii.py +267 -0
- biotite/structure/info/standardize.py +185 -0
- biotite/structure/integrity.py +213 -0
- biotite/structure/io/__init__.py +29 -0
- biotite/structure/io/dcd/__init__.py +13 -0
- biotite/structure/io/dcd/file.py +67 -0
- biotite/structure/io/general.py +243 -0
- biotite/structure/io/gro/__init__.py +14 -0
- biotite/structure/io/gro/file.py +343 -0
- biotite/structure/io/mol/__init__.py +20 -0
- biotite/structure/io/mol/convert.py +112 -0
- biotite/structure/io/mol/ctab.py +420 -0
- biotite/structure/io/mol/header.py +120 -0
- biotite/structure/io/mol/mol.py +149 -0
- biotite/structure/io/mol/sdf.py +940 -0
- biotite/structure/io/netcdf/__init__.py +13 -0
- biotite/structure/io/netcdf/file.py +64 -0
- biotite/structure/io/pdb/__init__.py +20 -0
- biotite/structure/io/pdb/convert.py +389 -0
- biotite/structure/io/pdb/file.py +1380 -0
- biotite/structure/io/pdb/hybrid36.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/structure/io/pdb/hybrid36.pyx +242 -0
- biotite/structure/io/pdbqt/__init__.py +15 -0
- biotite/structure/io/pdbqt/convert.py +113 -0
- biotite/structure/io/pdbqt/file.py +688 -0
- biotite/structure/io/pdbx/__init__.py +23 -0
- biotite/structure/io/pdbx/bcif.py +674 -0
- biotite/structure/io/pdbx/cif.py +1091 -0
- biotite/structure/io/pdbx/component.py +251 -0
- biotite/structure/io/pdbx/compress.py +362 -0
- biotite/structure/io/pdbx/convert.py +2113 -0
- biotite/structure/io/pdbx/encoding.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/structure/io/pdbx/encoding.pyx +1078 -0
- biotite/structure/io/trajfile.py +696 -0
- biotite/structure/io/trr/__init__.py +13 -0
- biotite/structure/io/trr/file.py +43 -0
- biotite/structure/io/util.py +38 -0
- biotite/structure/io/xtc/__init__.py +13 -0
- biotite/structure/io/xtc/file.py +43 -0
- biotite/structure/mechanics.py +72 -0
- biotite/structure/molecules.py +337 -0
- biotite/structure/pseudoknots.py +622 -0
- biotite/structure/rdf.py +245 -0
- biotite/structure/repair.py +302 -0
- biotite/structure/residues.py +716 -0
- biotite/structure/rings.py +451 -0
- biotite/structure/sasa.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/structure/sasa.pyx +322 -0
- biotite/structure/segments.py +328 -0
- biotite/structure/sequence.py +110 -0
- biotite/structure/spacegroups.json +1567 -0
- biotite/structure/spacegroups.license +26 -0
- biotite/structure/sse.py +306 -0
- biotite/structure/superimpose.py +511 -0
- biotite/structure/tm.py +581 -0
- biotite/structure/transform.py +736 -0
- biotite/structure/util.py +160 -0
- biotite/version.py +34 -0
- biotite/visualize.py +375 -0
- biotite-1.5.0.dist-info/METADATA +162 -0
- biotite-1.5.0.dist-info/RECORD +354 -0
- biotite-1.5.0.dist-info/WHEEL +6 -0
- biotite-1.5.0.dist-info/licenses/LICENSE.rst +30 -0
|
@@ -0,0 +1,720 @@
|
|
|
1
|
+
# This source code is part of the Biotite package and is distributed
|
|
2
|
+
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
|
|
3
|
+
# information.
|
|
4
|
+
|
|
5
|
+
__name__ = "biotite.sequence"
|
|
6
|
+
__author__ = "Patrick Kunzmann", "Thomas Nevolianis"
|
|
7
|
+
__all__ = [
|
|
8
|
+
"GeneralSequence",
|
|
9
|
+
"NucleotideSequence",
|
|
10
|
+
"ProteinSequence",
|
|
11
|
+
"PositionalSequence",
|
|
12
|
+
"PurePositionalSequence",
|
|
13
|
+
]
|
|
14
|
+
|
|
15
|
+
from dataclasses import dataclass, field
|
|
16
|
+
import numpy as np
|
|
17
|
+
from biotite.sequence.alphabet import (
|
|
18
|
+
Alphabet,
|
|
19
|
+
AlphabetError,
|
|
20
|
+
AlphabetMapper,
|
|
21
|
+
LetterAlphabet,
|
|
22
|
+
)
|
|
23
|
+
from biotite.sequence.sequence import Sequence
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class GeneralSequence(Sequence):
    """
    A :class:`Sequence` that carries a caller-supplied :class:`Alphabet`,
    so that no dedicated :class:`Sequence` subclass has to be written
    for a custom alphabet.

    Parameters
    ----------
    alphabet : Alphabet
        The alphabet of this sequence.
    sequence : iterable object, optional
        The symbols the :class:`Sequence` is initialized with.
        If the alphabet consists of single letter strings, a
        :class:`str` object may be given as well.
        By default the sequence is empty.
    """

    def __init__(self, alphabet, sequence=()):
        # The alphabet is stored before the base class initializer is
        # invoked with the symbols
        # NOTE(review): presumably the base initializer encodes the
        # symbols via 'get_alphabet()' - confirm in 'Sequence'
        self._alphabet = alphabet
        super().__init__(sequence)

    def __repr__(self):
        """Represent GeneralSequence as a string for debugging."""
        symbol_list = ", ".join(repr(symbol) for symbol in self.symbols)
        return f"GeneralSequence(Alphabet({self._alphabet}), [{symbol_list}])"

    def __copy_create__(self):
        # Only the alphabet is handed over, the new object starts with
        # the default (empty) symbol sequence
        return GeneralSequence(self._alphabet)

    def get_alphabet(self):
        return self._alphabet

    def as_type(self, sequence):
        """
        Transfer the content of this :class:`GeneralSequence` into a
        sequence object of another :class:`Sequence` type.

        The sequence code of the given `sequence` is overwritten with
        the sequence code of this object, i.e. the input object is
        modified and returned.

        Parameters
        ----------
        sequence : Sequence
            The `Sequence` that receives the sequence code of this
            object.
            Its alphabet must equal or extend the alphabet of this
            object.

        Returns
        -------
        sequence : Sequence
            The input `sequence` with replaced sequence code.

        Raises
        ------
        AlphabetError
            If the :class:`Alphabet` of the input `sequence` does
            not extend the :class:`Alphabet` of this sequence.
        """
        target_alphabet = sequence.get_alphabet()
        if target_alphabet.extends(self._alphabet):
            sequence.code = self.code
            return sequence
        raise AlphabetError(
            f"The alphabet of '{type(sequence).__name__}' "
            f"is not compatible with the alphabet of this sequence"
        )
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
class NucleotideSequence(Sequence):
|
|
96
|
+
"""
|
|
97
|
+
Representation of a nucleotide sequence (DNA or RNA).
|
|
98
|
+
|
|
99
|
+
This class may have one of two different alphabets:
|
|
100
|
+
:attr:`unambiguous_alphabet()` contains only the unambiguous DNA
|
|
101
|
+
letters 'A', 'C', 'G' and 'T'.
|
|
102
|
+
:attr:`ambiguous_alphabet()` uses an extended alphabet for ambiguous
|
|
103
|
+
letters.
|
|
104
|
+
|
|
105
|
+
Parameters
|
|
106
|
+
----------
|
|
107
|
+
sequence : iterable object, optional
|
|
108
|
+
The initial DNA sequence. This may either be a list or a string.
|
|
109
|
+
May take upper or lower case letters.
|
|
110
|
+
By default the sequence is empty.
|
|
111
|
+
ambiguous : bool, optional
|
|
112
|
+
If true, the ambiguous alphabet is used. By default the
|
|
113
|
+
object tries to use the unambiguous alphabet. If this fails due
|
|
114
|
+
ambiguous letters in the sequence, the ambiguous alphabet
|
|
115
|
+
is used.
|
|
116
|
+
"""
|
|
117
|
+
|
|
118
|
+
alphabet_unamb = LetterAlphabet(["A", "C", "G", "T"])
|
|
119
|
+
alphabet_amb = LetterAlphabet(
|
|
120
|
+
["A", "C", "G", "T", "R", "Y", "W", "S", "M", "K", "H", "B", "V", "D", "N"]
|
|
121
|
+
)
|
|
122
|
+
|
|
123
|
+
compl_symbol_dict = {
|
|
124
|
+
"A": "T",
|
|
125
|
+
"C": "G",
|
|
126
|
+
"G": "C",
|
|
127
|
+
"T": "A",
|
|
128
|
+
"M": "K",
|
|
129
|
+
"R": "Y",
|
|
130
|
+
"W": "W",
|
|
131
|
+
"S": "S",
|
|
132
|
+
"Y": "R",
|
|
133
|
+
"K": "M",
|
|
134
|
+
"V": "B",
|
|
135
|
+
"H": "D",
|
|
136
|
+
"D": "H",
|
|
137
|
+
"B": "V",
|
|
138
|
+
"N": "N",
|
|
139
|
+
}
|
|
140
|
+
# List comprehension does not work in this scope
|
|
141
|
+
_compl_symbols = []
|
|
142
|
+
for _symbol in alphabet_amb.get_symbols():
|
|
143
|
+
_compl_symbols.append(compl_symbol_dict[_symbol])
|
|
144
|
+
_compl_alphabet_unamb = LetterAlphabet(_compl_symbols)
|
|
145
|
+
_compl_mapper = AlphabetMapper(_compl_alphabet_unamb, alphabet_amb)
|
|
146
|
+
|
|
147
|
+
def __init__(self, sequence=[], ambiguous=None):
|
|
148
|
+
if isinstance(sequence, str):
|
|
149
|
+
sequence = sequence.upper()
|
|
150
|
+
else:
|
|
151
|
+
sequence = [symbol.upper() for symbol in sequence]
|
|
152
|
+
if ambiguous is None:
|
|
153
|
+
try:
|
|
154
|
+
self._alphabet = NucleotideSequence.alphabet_unamb
|
|
155
|
+
seq_code = self._alphabet.encode_multiple(sequence)
|
|
156
|
+
except AlphabetError:
|
|
157
|
+
self._alphabet = NucleotideSequence.alphabet_amb
|
|
158
|
+
seq_code = self._alphabet.encode_multiple(sequence)
|
|
159
|
+
elif not ambiguous:
|
|
160
|
+
self._alphabet = NucleotideSequence.alphabet_unamb
|
|
161
|
+
seq_code = self._alphabet.encode_multiple(sequence)
|
|
162
|
+
else:
|
|
163
|
+
self._alphabet = NucleotideSequence.alphabet_amb
|
|
164
|
+
seq_code = self._alphabet.encode_multiple(sequence)
|
|
165
|
+
super().__init__()
|
|
166
|
+
self.code = seq_code
|
|
167
|
+
|
|
168
|
+
def __repr__(self):
|
|
169
|
+
"""Represent NucleotideSequence as a string for debugging."""
|
|
170
|
+
if self._alphabet == NucleotideSequence.alphabet_amb:
|
|
171
|
+
ambiguous = True
|
|
172
|
+
else:
|
|
173
|
+
ambiguous = False
|
|
174
|
+
return f'NucleotideSequence("{"".join(self.symbols)}", ambiguous={ambiguous})'
|
|
175
|
+
|
|
176
|
+
def __copy_create__(self):
|
|
177
|
+
if self._alphabet == NucleotideSequence.alphabet_amb:
|
|
178
|
+
seq_copy = NucleotideSequence(ambiguous=True)
|
|
179
|
+
else:
|
|
180
|
+
seq_copy = NucleotideSequence(ambiguous=False)
|
|
181
|
+
return seq_copy
|
|
182
|
+
|
|
183
|
+
def get_alphabet(self):
|
|
184
|
+
return self._alphabet
|
|
185
|
+
|
|
186
|
+
def complement(self):
|
|
187
|
+
"""
|
|
188
|
+
Get the complement nucleotide sequence.
|
|
189
|
+
|
|
190
|
+
Returns
|
|
191
|
+
-------
|
|
192
|
+
complement : NucleotideSequence
|
|
193
|
+
The complement sequence.
|
|
194
|
+
|
|
195
|
+
Examples
|
|
196
|
+
--------
|
|
197
|
+
|
|
198
|
+
>>> dna_seq = NucleotideSequence("ACGCTT")
|
|
199
|
+
>>> print(dna_seq.complement())
|
|
200
|
+
TGCGAA
|
|
201
|
+
>>> print(dna_seq.reverse().complement())
|
|
202
|
+
AAGCGT
|
|
203
|
+
"""
|
|
204
|
+
# Interpreting the sequence code of this object in the
|
|
205
|
+
# complementary alphabet gives the complementary symbols
|
|
206
|
+
# In order to get the complementary symbols in the original
|
|
207
|
+
# alphabet, the sequence code is mapped from the complementary
|
|
208
|
+
# alphabet into the original alphabet
|
|
209
|
+
compl_code = NucleotideSequence._compl_mapper[self.code]
|
|
210
|
+
return self.copy(compl_code)
|
|
211
|
+
|
|
212
|
+
def translate(self, complete=False, codon_table=None, met_start=False):
|
|
213
|
+
"""
|
|
214
|
+
Translate the nucleotide sequence into a protein sequence.
|
|
215
|
+
|
|
216
|
+
If `complete` is true, the entire sequence is translated,
|
|
217
|
+
beginning with the first codon and ending with the last codon,
|
|
218
|
+
even if stop codons occur during the translation.
|
|
219
|
+
|
|
220
|
+
Otherwise this method returns possible ORFs in the
|
|
221
|
+
sequence, even if not stop codon occurs in an ORF.
|
|
222
|
+
|
|
223
|
+
Parameters
|
|
224
|
+
----------
|
|
225
|
+
complete : bool, optional
|
|
226
|
+
If true, the complete sequence is translated. In this case
|
|
227
|
+
the sequence length must be a multiple of 3.
|
|
228
|
+
Otherwise all ORFs are translated.
|
|
229
|
+
codon_table : CodonTable, optional
|
|
230
|
+
The codon table to be used. By default the default table
|
|
231
|
+
will be used
|
|
232
|
+
(NCBI "Standard" table with "ATG" as single start codon).
|
|
233
|
+
met_start : bool, optional
|
|
234
|
+
If true, the translation starts always with a 'methionine',
|
|
235
|
+
even if the start codon codes for another amino acid.
|
|
236
|
+
Otherwise the translation starts with the amino acid
|
|
237
|
+
the codon codes for. Only applies, if `complete` is false.
|
|
238
|
+
|
|
239
|
+
Returns
|
|
240
|
+
-------
|
|
241
|
+
protein : ProteinSequence or list of ProteinSequence
|
|
242
|
+
The translated protein sequence. If `complete` is true,
|
|
243
|
+
only a single :class:`ProteinSequence` is returned. Otherwise
|
|
244
|
+
a list of :class:`ProteinSequence` is returned, which contains
|
|
245
|
+
every ORF.
|
|
246
|
+
pos : list of tuple (int, int)
|
|
247
|
+
Is only returned if `complete` is false. The list contains
|
|
248
|
+
a tuple for each ORF.
|
|
249
|
+
The first element of the tuple is the index of the
|
|
250
|
+
:class:`NucleotideSequence`, where the translation starts.
|
|
251
|
+
The second element is the exclusive stop index, it
|
|
252
|
+
represents the first nucleotide in the
|
|
253
|
+
:class:`NucleotideSequence` after a stop codon.
|
|
254
|
+
|
|
255
|
+
Examples
|
|
256
|
+
--------
|
|
257
|
+
|
|
258
|
+
>>> dna_seq = NucleotideSequence("AATGATGCTATAGAT")
|
|
259
|
+
>>> prot_seq = dna_seq.translate(complete=True)
|
|
260
|
+
>>> print(prot_seq)
|
|
261
|
+
NDAID
|
|
262
|
+
>>> prot_seqs, pos = dna_seq.translate(complete=False)
|
|
263
|
+
>>> for seq in prot_seqs:
|
|
264
|
+
... print(seq)
|
|
265
|
+
MML*
|
|
266
|
+
ML*
|
|
267
|
+
"""
|
|
268
|
+
if self._alphabet != NucleotideSequence.alphabet_unamb:
|
|
269
|
+
raise AlphabetError("Translation requires unambiguous alphabet")
|
|
270
|
+
# Determine codon_table
|
|
271
|
+
if codon_table is None:
|
|
272
|
+
# Import at this position to avoid circular import
|
|
273
|
+
from biotite.sequence.codon import CodonTable
|
|
274
|
+
|
|
275
|
+
codon_table = CodonTable.default_table()
|
|
276
|
+
|
|
277
|
+
if complete:
|
|
278
|
+
if len(self) % 3 != 0:
|
|
279
|
+
raise ValueError(
|
|
280
|
+
"Sequence length needs to be a multiple of 3 "
|
|
281
|
+
"for complete translation"
|
|
282
|
+
)
|
|
283
|
+
# Reshape code into (n,3), with n being the amount of codons
|
|
284
|
+
codons = self.code.reshape(-1, 3)
|
|
285
|
+
protein_seq = ProteinSequence()
|
|
286
|
+
protein_seq.code = codon_table.map_codon_codes(codons)
|
|
287
|
+
return protein_seq
|
|
288
|
+
|
|
289
|
+
else:
|
|
290
|
+
stop_code = ProteinSequence.alphabet.encode("*")
|
|
291
|
+
met_code = ProteinSequence.alphabet.encode("M")
|
|
292
|
+
protein_seqs = []
|
|
293
|
+
pos = []
|
|
294
|
+
code = self.code
|
|
295
|
+
# Create all three frames
|
|
296
|
+
for shift in range(3):
|
|
297
|
+
# The frame length is always a multiple of 3
|
|
298
|
+
# If there is a trailing partial codon, remove it
|
|
299
|
+
frame_length = ((len(code) - shift) // 3) * 3
|
|
300
|
+
frame = code[shift : shift + frame_length]
|
|
301
|
+
# Reshape frame into (n,3), with n being the amount of codons
|
|
302
|
+
frame_codons = frame.reshape(-1, 3)
|
|
303
|
+
# At first, translate frame completely
|
|
304
|
+
protein_code = codon_table.map_codon_codes(frame_codons)
|
|
305
|
+
# Iterate over all start codons in this frame
|
|
306
|
+
starts = np.where(codon_table.is_start_codon(frame_codons))[0]
|
|
307
|
+
for start_i in starts:
|
|
308
|
+
# Protein sequence beginning from start codon
|
|
309
|
+
code_from_start = protein_code[start_i:]
|
|
310
|
+
# Get all stop codon positions
|
|
311
|
+
# relative to 'code_from_start'
|
|
312
|
+
stops = np.where(code_from_start == stop_code)[0]
|
|
313
|
+
# Find first stop codon after start codon
|
|
314
|
+
# Include stop -> stops[0] + 1
|
|
315
|
+
stop_i = stops[0] + 1 if len(stops) > 0 else len(code_from_start)
|
|
316
|
+
code_from_start_to_stop = code_from_start[:stop_i]
|
|
317
|
+
prot_seq = ProteinSequence()
|
|
318
|
+
if met_start:
|
|
319
|
+
# Copy as the slice is edited
|
|
320
|
+
prot_seq.code = code_from_start_to_stop.copy()
|
|
321
|
+
prot_seq.code[0] = met_code
|
|
322
|
+
else:
|
|
323
|
+
prot_seq.code = code_from_start_to_stop
|
|
324
|
+
protein_seqs.append(prot_seq)
|
|
325
|
+
# Codon indices are transformed
|
|
326
|
+
# to nucleotide sequence indices
|
|
327
|
+
pos.append((shift + start_i * 3, shift + (start_i + stop_i) * 3))
|
|
328
|
+
# Sort by start position
|
|
329
|
+
order = np.argsort([start for start, stop in pos])
|
|
330
|
+
pos = [pos[i] for i in order]
|
|
331
|
+
protein_seqs = [protein_seqs[i] for i in order]
|
|
332
|
+
return protein_seqs, pos
|
|
333
|
+
|
|
334
|
+
@staticmethod
|
|
335
|
+
def unambiguous_alphabet():
|
|
336
|
+
"""
|
|
337
|
+
Get the unambiguous nucleotide alphabet containing the symbols
|
|
338
|
+
``A``, ``C``, ``G`` and ``T``.
|
|
339
|
+
|
|
340
|
+
Returns
|
|
341
|
+
-------
|
|
342
|
+
alphabet : LetterAlphabet
|
|
343
|
+
The unambiguous nucleotide alphabet.
|
|
344
|
+
"""
|
|
345
|
+
return NucleotideSequence.alphabet_unamb
|
|
346
|
+
|
|
347
|
+
@staticmethod
def ambiguous_alphabet():
    """
    Provide the nucleotide alphabet that comprises ``A``, ``C``,
    ``G`` and ``T`` as well as the symbols that describe ambiguous
    combinations of these.

    Returns
    -------
    alphabet : LetterAlphabet
        The ambiguous nucleotide alphabet.
    """
    # The alphabet is stored once as a class attribute and shared by all callers
    amb_alphabet = NucleotideSequence.alphabet_amb
    return amb_alphabet
|
|
360
|
+
|
|
361
|
+
|
|
362
|
+
class ProteinSequence(Sequence):
    """
    Representation of a protein sequence.

    Furthermore this class offers a conversion of amino acids from
    3-letter code into 1-letter code and vice versa.

    Parameters
    ----------
    sequence : iterable object, optional
        The initial protein sequence. This may either be a list or a
        string. May take upper or lower case letters. If a list is
        given, the list elements can be 1-letter or 3-letter amino acid
        representations. By default the sequence is empty.

    Notes
    -----
    The :class:`Alphabet` of this :class:`Sequence` class does not
    support selenocysteine.
    Please convert selenocysteine (``U``) into cysteine (``C``)
    or use a custom :class:`Sequence` class, if the differentiation is
    necessary.
    The 3-letter codes ``SEC`` and ``MSE`` are accepted and are mapped
    to ``C`` and ``M``, respectively.
    """

    _codon_table = None

    alphabet = LetterAlphabet(
        [
            "A",
            "C",
            "D",
            "E",
            "F",
            "G",
            "H",
            "I",
            "K",
            "L",
            "M",
            "N",
            "P",
            "Q",
            "R",
            "S",
            "T",
            "V",
            "W",
            "Y",
            "B",
            "Z",
            "X",
            "*",
        ]
    )

    # Masses are taken from
    # https://web.expasy.org/findmod/findmod_masses.html#AA

    # Mass of the water molecule that is added to the sum of the residue masses.
    # The monoisotopic mode needs the monoisotopic mass of water,
    # otherwise the result is off by ~0.0044 Da.
    _water_weight_average = 18.015
    _water_weight_monoisotopic = 18.01056

    # The order of the entries follows the order of symbols in `alphabet`.
    # Symbols without a defined mass are NaN, so that their presence
    # propagates into the sum and can be detected afterwards.
    _mol_weight_average = np.array(
        [
            71.0788,  # A
            103.1388,  # C
            115.0886,  # D
            129.1155,  # E
            147.1766,  # F
            57.0519,  # G
            137.1411,  # H
            113.1594,  # I
            128.1741,  # K
            113.1594,  # L
            131.1926,  # M
            114.1038,  # N
            97.1167,  # P
            128.1307,  # Q
            156.1875,  # R
            87.0782,  # S
            101.1051,  # T
            99.1326,  # V
            186.2132,  # W
            163.1760,  # Y
            np.nan,  # B
            np.nan,  # Z
            np.nan,  # X
            np.nan,  # *
        ]
    )

    _mol_weight_monoisotopic = np.array(
        [
            71.03711,  # A
            103.00919,  # C
            115.02694,  # D
            129.04259,  # E
            147.06841,  # F
            57.02146,  # G
            137.05891,  # H
            113.08406,  # I
            128.09496,  # K
            113.08406,  # L
            131.04049,  # M
            114.04293,  # N
            97.05276,  # P
            128.05858,  # Q
            156.10111,  # R
            87.03203,  # S
            101.04768,  # T
            99.06841,  # V
            186.07931,  # W
            163.06333,  # Y
            np.nan,  # B
            np.nan,  # Z
            np.nan,  # X
            np.nan,  # *
        ]
    )

    _dict_1to3 = {
        "A": "ALA",
        "C": "CYS",
        "D": "ASP",
        "E": "GLU",
        "F": "PHE",
        "G": "GLY",
        "H": "HIS",
        "I": "ILE",
        "K": "LYS",
        "L": "LEU",
        "M": "MET",
        "N": "ASN",
        "P": "PRO",
        "Q": "GLN",
        "R": "ARG",
        "S": "SER",
        "T": "THR",
        "V": "VAL",
        "W": "TRP",
        "Y": "TYR",
        "B": "ASX",
        "Z": "GLX",
        "X": "UNK",
        "*": " * ",
    }

    # Reverse mapping; a comprehension is used instead of a loop,
    # so that no loop variables remain as class attributes
    _dict_3to1 = {three: one for one, three in _dict_1to3.items()}
    # Selenocysteine and selenomethionine are mapped
    # to their sulfur-containing counterparts
    _dict_3to1["SEC"] = "C"
    _dict_3to1["MSE"] = "M"

    def __init__(self, sequence=()):
        dict_3to1 = ProteinSequence._dict_3to1
        # Convert 3-letter codes to single letter codes,
        # if list contains 3-letter codes
        sequence = [
            dict_3to1[symbol.upper()] if len(symbol) == 3 else symbol.upper()
            for symbol in sequence
        ]
        super().__init__(sequence)

    def __repr__(self):
        """Represent ProteinSequence as a string for debugging."""
        return f'ProteinSequence("{"".join(self.symbols)}")'

    def get_alphabet(self):
        return ProteinSequence.alphabet

    def remove_stops(self):
        """
        Remove *stop signals* from the sequence.

        Returns
        -------
        no_stop : ProteinSequence
            A copy of this sequence without stop signals.
        """
        stop_code = ProteinSequence.alphabet.encode("*")
        no_stop = self.copy()
        seq_code = no_stop.code
        no_stop.code = seq_code[seq_code != stop_code]
        return no_stop

    @staticmethod
    def convert_letter_3to1(symbol):
        """
        Convert a 3-letter to a 1-letter amino acid representation.

        Parameters
        ----------
        symbol : string
            3-letter amino acid representation.
            Upper and lower case letters are accepted.

        Returns
        -------
        convert : string
            1-letter amino acid representation.

        Raises
        ------
        KeyError
            If the given 3-letter code is unknown.
        """
        return ProteinSequence._dict_3to1[symbol.upper()]

    @staticmethod
    def convert_letter_1to3(symbol):
        """
        Convert a 1-letter to a 3-letter amino acid representation.

        Parameters
        ----------
        symbol : string
            1-letter amino acid representation.
            Upper and lower case letters are accepted.

        Returns
        -------
        convert : string
            3-letter amino acid representation.

        Raises
        ------
        KeyError
            If the given 1-letter code is unknown.
        """
        return ProteinSequence._dict_1to3[symbol.upper()]

    def get_molecular_weight(self, monoisotopic=False):
        """
        Calculate the molecular weight of this protein.

        Average protein molecular weight is calculated by the addition
        of average isotopic masses of the amino acids
        in the protein and the average isotopic mass of one water
        molecule.
        If `monoisotopic` is set, the monoisotopic masses of the
        amino acids and of the water molecule are used instead.

        Parameters
        ----------
        monoisotopic : bool
            Use the mass of the most common isotope.

        Returns
        -------
        weight : float
            Molecular weight of the protein represented by the sequence.
            Molecular weight values are given in Dalton (Da).

        Raises
        ------
        ValueError
            If the sequence contains a symbol without a defined mass,
            i.e. ``B``, ``Z``, ``X`` or ``*``.
        """
        if monoisotopic:
            residue_weights = self._mol_weight_monoisotopic
            water_weight = self._water_weight_monoisotopic
        else:
            residue_weights = self._mol_weight_average
            water_weight = self._water_weight_average
        weight = np.sum(residue_weights[self.code]) + water_weight

        # Symbols without a defined mass are NaN in the mass tables,
        # hence a single one of them turns the entire sum into NaN
        if np.isnan(weight):
            raise ValueError(
                "Sequence contains ambiguous amino acids, cannot calculate weight"
            )
        return weight
|
|
607
|
+
|
|
608
|
+
|
|
609
|
+
class PositionalSequence(Sequence):
    """
    A sequence whose symbols are bound to their position.

    Every position of the sequence is represented by its own
    :class:`PositionalSequence.Symbol` and these symbols form an alphabet that is
    created specifically for this sequence.
    As a result the code of a symbol equals its position in the sequence.
    This comes in handy when sequences are aligned based on a position-specific
    substitution matrix.

    Parameters
    ----------
    original_sequence : seq.Sequence
        The sequence this positional sequence is derived from.
    """

    @dataclass(frozen=True)
    class Symbol:
        """
        A symbol paired with its position in a sequence.

        Attributes
        ----------
        original_alphabet : Alphabet
            The alphabet the original symbol is taken from.
        original_code : int
            The code of the original symbol in the original alphabet.
        position : int
            The 0-based position of the symbol in the sequence.
        symbol : object
            The symbol from the original alphabet.

        See Also
        --------
        PositionalSequence
            The sequence type containing :class:`PositionalSequence.Symbol` objects.
        """

        original_alphabet: ...
        original_code: ...
        position: ...
        symbol: ... = field(init=False)

        def __post_init__(self):
            # The dataclass is frozen, so the derived field is set
            # via the base class implementation of '__setattr__'
            decoded = self.original_alphabet.decode(self.original_code)
            super().__setattr__("symbol", decoded)

        def __str__(self):
            return str(self.symbol)

    def __init__(self, original_sequence):
        orig_alphabet = original_sequence.get_alphabet()
        self._orig_alphabet = orig_alphabet
        # One dedicated symbol for each position of the original sequence
        positional_symbols = []
        for position, orig_code in enumerate(original_sequence.code):
            positional_symbols.append(
                PositionalSequence.Symbol(orig_alphabet, orig_code, position)
            )
        self._alphabet = Alphabet(positional_symbols)
        # The code of each symbol is simply its position
        seq_length = len(original_sequence)
        code_dtype = Sequence.dtype(len(self._alphabet))
        self.code = np.arange(seq_length, dtype=code_dtype)

    def reconstruct(self):
        """
        Restore the original sequence from this positional sequence.

        Returns
        -------
        original_sequence : GeneralSequence
            The original sequence.
            The returned object is always a :class:`GeneralSequence`,
            irrespective of the type of the original sequence.
            However, its alphabet and symbols are equal to those of the
            original sequence.
        """
        restored = GeneralSequence(self._orig_alphabet)
        # Each symbol of the custom alphabet remembers its original code
        original_codes = [sym.original_code for sym in self._alphabet]
        restored.code = np.array(original_codes)
        return restored

    def get_alphabet(self):
        return self._alphabet

    def __str__(self) -> str:
        return "".join(str(sym) for sym in self.symbols)

    def __repr__(self):
        return "PositionalSequence(" + repr(self.reconstruct()) + ")"
|
|
695
|
+
|
|
696
|
+
|
|
697
|
+
class PurePositionalSequence(Sequence):
    """
    A 'placeholder' sequence, in which every symbol is nothing but its own
    position in the sequence.

    In contrast to the similar :class:`PositionalSequence`, the symbols do not
    originate from an original sequence, but are the pure position.
    Hence, there is no meaningful string representation of the sequence and its symbols.

    Parameters
    ----------
    length : int
        The length of the sequence.
    """

    def __init__(self, length):
        # The positions themselves serve as symbols of the alphabet
        positions = range(length)
        self._alphabet = Alphabet(positions)
        code_dtype = Sequence.dtype(length)
        self.code = np.arange(length, dtype=code_dtype)

    def get_alphabet(self):
        return self._alphabet

    def __repr__(self):
        return "PurePositionalSequence({})".format(len(self))
|