biotite 0.41.1__cp312-cp312-macosx_10_16_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biotite might be problematic. Click here for more details.
- biotite/__init__.py +19 -0
- biotite/application/__init__.py +43 -0
- biotite/application/application.py +265 -0
- biotite/application/autodock/__init__.py +12 -0
- biotite/application/autodock/app.py +505 -0
- biotite/application/blast/__init__.py +14 -0
- biotite/application/blast/alignment.py +83 -0
- biotite/application/blast/webapp.py +421 -0
- biotite/application/clustalo/__init__.py +12 -0
- biotite/application/clustalo/app.py +238 -0
- biotite/application/dssp/__init__.py +12 -0
- biotite/application/dssp/app.py +152 -0
- biotite/application/localapp.py +306 -0
- biotite/application/mafft/__init__.py +12 -0
- biotite/application/mafft/app.py +122 -0
- biotite/application/msaapp.py +374 -0
- biotite/application/muscle/__init__.py +13 -0
- biotite/application/muscle/app3.py +254 -0
- biotite/application/muscle/app5.py +171 -0
- biotite/application/sra/__init__.py +18 -0
- biotite/application/sra/app.py +456 -0
- biotite/application/tantan/__init__.py +12 -0
- biotite/application/tantan/app.py +222 -0
- biotite/application/util.py +59 -0
- biotite/application/viennarna/__init__.py +18 -0
- biotite/application/viennarna/rnaalifold.py +304 -0
- biotite/application/viennarna/rnafold.py +269 -0
- biotite/application/viennarna/rnaplot.py +187 -0
- biotite/application/viennarna/util.py +72 -0
- biotite/application/webapp.py +77 -0
- biotite/copyable.py +71 -0
- biotite/database/__init__.py +23 -0
- biotite/database/entrez/__init__.py +15 -0
- biotite/database/entrez/check.py +61 -0
- biotite/database/entrez/dbnames.py +89 -0
- biotite/database/entrez/download.py +223 -0
- biotite/database/entrez/key.py +44 -0
- biotite/database/entrez/query.py +223 -0
- biotite/database/error.py +15 -0
- biotite/database/pubchem/__init__.py +21 -0
- biotite/database/pubchem/download.py +260 -0
- biotite/database/pubchem/error.py +20 -0
- biotite/database/pubchem/query.py +827 -0
- biotite/database/pubchem/throttle.py +99 -0
- biotite/database/rcsb/__init__.py +13 -0
- biotite/database/rcsb/download.py +167 -0
- biotite/database/rcsb/query.py +959 -0
- biotite/database/uniprot/__init__.py +13 -0
- biotite/database/uniprot/check.py +32 -0
- biotite/database/uniprot/download.py +134 -0
- biotite/database/uniprot/query.py +209 -0
- biotite/file.py +251 -0
- biotite/sequence/__init__.py +73 -0
- biotite/sequence/align/__init__.py +49 -0
- biotite/sequence/align/alignment.py +658 -0
- biotite/sequence/align/banded.cpython-312-darwin.so +0 -0
- biotite/sequence/align/banded.pyx +652 -0
- biotite/sequence/align/buckets.py +69 -0
- biotite/sequence/align/cigar.py +434 -0
- biotite/sequence/align/kmeralphabet.cpython-312-darwin.so +0 -0
- biotite/sequence/align/kmeralphabet.pyx +574 -0
- biotite/sequence/align/kmersimilarity.cpython-312-darwin.so +0 -0
- biotite/sequence/align/kmersimilarity.pyx +233 -0
- biotite/sequence/align/kmertable.cpython-312-darwin.so +0 -0
- biotite/sequence/align/kmertable.pyx +3400 -0
- biotite/sequence/align/localgapped.cpython-312-darwin.so +0 -0
- biotite/sequence/align/localgapped.pyx +892 -0
- biotite/sequence/align/localungapped.cpython-312-darwin.so +0 -0
- biotite/sequence/align/localungapped.pyx +279 -0
- biotite/sequence/align/matrix.py +405 -0
- biotite/sequence/align/matrix_data/BLOSUM100.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM30.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM35.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM40.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM45.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM50.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM50_13p.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM50_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM50_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM55.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM60.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM62.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM62_13p.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM62_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM62_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM65.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM70.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM75.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM80.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM85.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM90.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUMN.mat +31 -0
- biotite/sequence/align/matrix_data/CorBLOSUM49_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM57_13p.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM57_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM61_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM66_13p.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM67_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/DAYHOFF.mat +32 -0
- biotite/sequence/align/matrix_data/GONNET.mat +26 -0
- biotite/sequence/align/matrix_data/IDENTITY.mat +25 -0
- biotite/sequence/align/matrix_data/MATCH.mat +25 -0
- biotite/sequence/align/matrix_data/NUC.mat +25 -0
- biotite/sequence/align/matrix_data/PAM10.mat +34 -0
- biotite/sequence/align/matrix_data/PAM100.mat +34 -0
- biotite/sequence/align/matrix_data/PAM110.mat +34 -0
- biotite/sequence/align/matrix_data/PAM120.mat +34 -0
- biotite/sequence/align/matrix_data/PAM130.mat +34 -0
- biotite/sequence/align/matrix_data/PAM140.mat +34 -0
- biotite/sequence/align/matrix_data/PAM150.mat +34 -0
- biotite/sequence/align/matrix_data/PAM160.mat +34 -0
- biotite/sequence/align/matrix_data/PAM170.mat +34 -0
- biotite/sequence/align/matrix_data/PAM180.mat +34 -0
- biotite/sequence/align/matrix_data/PAM190.mat +34 -0
- biotite/sequence/align/matrix_data/PAM20.mat +34 -0
- biotite/sequence/align/matrix_data/PAM200.mat +34 -0
- biotite/sequence/align/matrix_data/PAM210.mat +34 -0
- biotite/sequence/align/matrix_data/PAM220.mat +34 -0
- biotite/sequence/align/matrix_data/PAM230.mat +34 -0
- biotite/sequence/align/matrix_data/PAM240.mat +34 -0
- biotite/sequence/align/matrix_data/PAM250.mat +34 -0
- biotite/sequence/align/matrix_data/PAM260.mat +34 -0
- biotite/sequence/align/matrix_data/PAM270.mat +34 -0
- biotite/sequence/align/matrix_data/PAM280.mat +34 -0
- biotite/sequence/align/matrix_data/PAM290.mat +34 -0
- biotite/sequence/align/matrix_data/PAM30.mat +34 -0
- biotite/sequence/align/matrix_data/PAM300.mat +34 -0
- biotite/sequence/align/matrix_data/PAM310.mat +34 -0
- biotite/sequence/align/matrix_data/PAM320.mat +34 -0
- biotite/sequence/align/matrix_data/PAM330.mat +34 -0
- biotite/sequence/align/matrix_data/PAM340.mat +34 -0
- biotite/sequence/align/matrix_data/PAM350.mat +34 -0
- biotite/sequence/align/matrix_data/PAM360.mat +34 -0
- biotite/sequence/align/matrix_data/PAM370.mat +34 -0
- biotite/sequence/align/matrix_data/PAM380.mat +34 -0
- biotite/sequence/align/matrix_data/PAM390.mat +34 -0
- biotite/sequence/align/matrix_data/PAM40.mat +34 -0
- biotite/sequence/align/matrix_data/PAM400.mat +34 -0
- biotite/sequence/align/matrix_data/PAM410.mat +34 -0
- biotite/sequence/align/matrix_data/PAM420.mat +34 -0
- biotite/sequence/align/matrix_data/PAM430.mat +34 -0
- biotite/sequence/align/matrix_data/PAM440.mat +34 -0
- biotite/sequence/align/matrix_data/PAM450.mat +34 -0
- biotite/sequence/align/matrix_data/PAM460.mat +34 -0
- biotite/sequence/align/matrix_data/PAM470.mat +34 -0
- biotite/sequence/align/matrix_data/PAM480.mat +34 -0
- biotite/sequence/align/matrix_data/PAM490.mat +34 -0
- biotite/sequence/align/matrix_data/PAM50.mat +34 -0
- biotite/sequence/align/matrix_data/PAM500.mat +34 -0
- biotite/sequence/align/matrix_data/PAM60.mat +34 -0
- biotite/sequence/align/matrix_data/PAM70.mat +34 -0
- biotite/sequence/align/matrix_data/PAM80.mat +34 -0
- biotite/sequence/align/matrix_data/PAM90.mat +34 -0
- biotite/sequence/align/matrix_data/RBLOSUM52_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM59_13p.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM59_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM64_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM69_13p.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM69_14.3.mat +25 -0
- biotite/sequence/align/multiple.cpython-312-darwin.so +0 -0
- biotite/sequence/align/multiple.pyx +620 -0
- biotite/sequence/align/pairwise.cpython-312-darwin.so +0 -0
- biotite/sequence/align/pairwise.pyx +587 -0
- biotite/sequence/align/permutation.cpython-312-darwin.so +0 -0
- biotite/sequence/align/permutation.pyx +305 -0
- biotite/sequence/align/primes.txt +821 -0
- biotite/sequence/align/selector.cpython-312-darwin.so +0 -0
- biotite/sequence/align/selector.pyx +956 -0
- biotite/sequence/align/statistics.py +265 -0
- biotite/sequence/align/tracetable.cpython-312-darwin.so +0 -0
- biotite/sequence/align/tracetable.pxd +64 -0
- biotite/sequence/align/tracetable.pyx +370 -0
- biotite/sequence/alphabet.py +566 -0
- biotite/sequence/annotation.py +829 -0
- biotite/sequence/codec.cpython-312-darwin.so +0 -0
- biotite/sequence/codec.pyx +155 -0
- biotite/sequence/codon.py +466 -0
- biotite/sequence/codon_tables.txt +202 -0
- biotite/sequence/graphics/__init__.py +33 -0
- biotite/sequence/graphics/alignment.py +1034 -0
- biotite/sequence/graphics/color_schemes/autumn.json +51 -0
- biotite/sequence/graphics/color_schemes/blossom.json +51 -0
- biotite/sequence/graphics/color_schemes/clustalx_dna.json +11 -0
- biotite/sequence/graphics/color_schemes/clustalx_protein.json +28 -0
- biotite/sequence/graphics/color_schemes/flower.json +51 -0
- biotite/sequence/graphics/color_schemes/jalview_buried.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_hydrophobicity.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_prop_helix.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_prop_strand.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_prop_turn.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_taylor.json +28 -0
- biotite/sequence/graphics/color_schemes/jalview_zappo.json +28 -0
- biotite/sequence/graphics/color_schemes/ocean.json +51 -0
- biotite/sequence/graphics/color_schemes/pb_flower.json +39 -0
- biotite/sequence/graphics/color_schemes/rainbow_dna.json +11 -0
- biotite/sequence/graphics/color_schemes/rainbow_protein.json +30 -0
- biotite/sequence/graphics/color_schemes/spring.json +51 -0
- biotite/sequence/graphics/color_schemes/sunset.json +51 -0
- biotite/sequence/graphics/color_schemes/wither.json +51 -0
- biotite/sequence/graphics/colorschemes.py +139 -0
- biotite/sequence/graphics/dendrogram.py +184 -0
- biotite/sequence/graphics/features.py +510 -0
- biotite/sequence/graphics/logo.py +110 -0
- biotite/sequence/graphics/plasmid.py +661 -0
- biotite/sequence/io/__init__.py +12 -0
- biotite/sequence/io/fasta/__init__.py +22 -0
- biotite/sequence/io/fasta/convert.py +273 -0
- biotite/sequence/io/fasta/file.py +278 -0
- biotite/sequence/io/fastq/__init__.py +19 -0
- biotite/sequence/io/fastq/convert.py +120 -0
- biotite/sequence/io/fastq/file.py +551 -0
- biotite/sequence/io/genbank/__init__.py +17 -0
- biotite/sequence/io/genbank/annotation.py +277 -0
- biotite/sequence/io/genbank/file.py +575 -0
- biotite/sequence/io/genbank/metadata.py +324 -0
- biotite/sequence/io/genbank/sequence.py +172 -0
- biotite/sequence/io/general.py +192 -0
- biotite/sequence/io/gff/__init__.py +26 -0
- biotite/sequence/io/gff/convert.py +133 -0
- biotite/sequence/io/gff/file.py +434 -0
- biotite/sequence/phylo/__init__.py +36 -0
- biotite/sequence/phylo/nj.cpython-312-darwin.so +0 -0
- biotite/sequence/phylo/nj.pyx +221 -0
- biotite/sequence/phylo/tree.cpython-312-darwin.so +0 -0
- biotite/sequence/phylo/tree.pyx +1169 -0
- biotite/sequence/phylo/upgma.cpython-312-darwin.so +0 -0
- biotite/sequence/phylo/upgma.pyx +164 -0
- biotite/sequence/profile.py +456 -0
- biotite/sequence/search.py +116 -0
- biotite/sequence/seqtypes.py +556 -0
- biotite/sequence/sequence.py +374 -0
- biotite/structure/__init__.py +132 -0
- biotite/structure/atoms.py +1455 -0
- biotite/structure/basepairs.py +1415 -0
- biotite/structure/bonds.cpython-312-darwin.so +0 -0
- biotite/structure/bonds.pyx +1933 -0
- biotite/structure/box.py +592 -0
- biotite/structure/celllist.cpython-312-darwin.so +0 -0
- biotite/structure/celllist.pyx +849 -0
- biotite/structure/chains.py +298 -0
- biotite/structure/charges.cpython-312-darwin.so +0 -0
- biotite/structure/charges.pyx +520 -0
- biotite/structure/compare.py +274 -0
- biotite/structure/density.py +114 -0
- biotite/structure/dotbracket.py +216 -0
- biotite/structure/error.py +31 -0
- biotite/structure/filter.py +585 -0
- biotite/structure/geometry.py +697 -0
- biotite/structure/graphics/__init__.py +13 -0
- biotite/structure/graphics/atoms.py +226 -0
- biotite/structure/graphics/rna.py +282 -0
- biotite/structure/hbond.py +409 -0
- biotite/structure/info/__init__.py +25 -0
- biotite/structure/info/atom_masses.json +121 -0
- biotite/structure/info/atoms.py +82 -0
- biotite/structure/info/bonds.py +145 -0
- biotite/structure/info/ccd/README.rst +8 -0
- biotite/structure/info/ccd/amino_acids.txt +1663 -0
- biotite/structure/info/ccd/carbohydrates.txt +1135 -0
- biotite/structure/info/ccd/components.bcif +0 -0
- biotite/structure/info/ccd/nucleotides.txt +798 -0
- biotite/structure/info/ccd.py +95 -0
- biotite/structure/info/groups.py +90 -0
- biotite/structure/info/masses.py +123 -0
- biotite/structure/info/misc.py +144 -0
- biotite/structure/info/radii.py +197 -0
- biotite/structure/info/standardize.py +196 -0
- biotite/structure/integrity.py +268 -0
- biotite/structure/io/__init__.py +30 -0
- biotite/structure/io/ctab.py +72 -0
- biotite/structure/io/dcd/__init__.py +13 -0
- biotite/structure/io/dcd/file.py +65 -0
- biotite/structure/io/general.py +257 -0
- biotite/structure/io/gro/__init__.py +14 -0
- biotite/structure/io/gro/file.py +343 -0
- biotite/structure/io/mmtf/__init__.py +21 -0
- biotite/structure/io/mmtf/assembly.py +214 -0
- biotite/structure/io/mmtf/convertarray.cpython-312-darwin.so +0 -0
- biotite/structure/io/mmtf/convertarray.pyx +341 -0
- biotite/structure/io/mmtf/convertfile.cpython-312-darwin.so +0 -0
- biotite/structure/io/mmtf/convertfile.pyx +501 -0
- biotite/structure/io/mmtf/decode.cpython-312-darwin.so +0 -0
- biotite/structure/io/mmtf/decode.pyx +152 -0
- biotite/structure/io/mmtf/encode.cpython-312-darwin.so +0 -0
- biotite/structure/io/mmtf/encode.pyx +183 -0
- biotite/structure/io/mmtf/file.py +233 -0
- biotite/structure/io/mol/__init__.py +20 -0
- biotite/structure/io/mol/convert.py +115 -0
- biotite/structure/io/mol/ctab.py +414 -0
- biotite/structure/io/mol/header.py +116 -0
- biotite/structure/io/mol/mol.py +193 -0
- biotite/structure/io/mol/sdf.py +916 -0
- biotite/structure/io/netcdf/__init__.py +13 -0
- biotite/structure/io/netcdf/file.py +63 -0
- biotite/structure/io/npz/__init__.py +20 -0
- biotite/structure/io/npz/file.py +152 -0
- biotite/structure/io/pdb/__init__.py +20 -0
- biotite/structure/io/pdb/convert.py +293 -0
- biotite/structure/io/pdb/file.py +1240 -0
- biotite/structure/io/pdb/hybrid36.cpython-312-darwin.so +0 -0
- biotite/structure/io/pdb/hybrid36.pyx +242 -0
- biotite/structure/io/pdbqt/__init__.py +15 -0
- biotite/structure/io/pdbqt/convert.py +107 -0
- biotite/structure/io/pdbqt/file.py +640 -0
- biotite/structure/io/pdbx/__init__.py +23 -0
- biotite/structure/io/pdbx/bcif.py +648 -0
- biotite/structure/io/pdbx/cif.py +1032 -0
- biotite/structure/io/pdbx/component.py +246 -0
- biotite/structure/io/pdbx/convert.py +1597 -0
- biotite/structure/io/pdbx/encoding.cpython-312-darwin.so +0 -0
- biotite/structure/io/pdbx/encoding.pyx +950 -0
- biotite/structure/io/pdbx/legacy.py +267 -0
- biotite/structure/io/tng/__init__.py +13 -0
- biotite/structure/io/tng/file.py +46 -0
- biotite/structure/io/trajfile.py +710 -0
- biotite/structure/io/trr/__init__.py +13 -0
- biotite/structure/io/trr/file.py +46 -0
- biotite/structure/io/xtc/__init__.py +13 -0
- biotite/structure/io/xtc/file.py +46 -0
- biotite/structure/mechanics.py +75 -0
- biotite/structure/molecules.py +353 -0
- biotite/structure/pseudoknots.py +642 -0
- biotite/structure/rdf.py +243 -0
- biotite/structure/repair.py +253 -0
- biotite/structure/residues.py +562 -0
- biotite/structure/resutil.py +178 -0
- biotite/structure/sasa.cpython-312-darwin.so +0 -0
- biotite/structure/sasa.pyx +322 -0
- biotite/structure/sequence.py +112 -0
- biotite/structure/sse.py +327 -0
- biotite/structure/superimpose.py +727 -0
- biotite/structure/transform.py +504 -0
- biotite/structure/util.py +98 -0
- biotite/temp.py +86 -0
- biotite/version.py +16 -0
- biotite/visualize.py +251 -0
- biotite-0.41.1.dist-info/METADATA +187 -0
- biotite-0.41.1.dist-info/RECORD +340 -0
- biotite-0.41.1.dist-info/WHEEL +4 -0
- biotite-0.41.1.dist-info/licenses/LICENSE.rst +30 -0
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
# This source code is part of the Biotite package and is distributed
|
|
2
|
+
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
|
|
3
|
+
# information.
|
|
4
|
+
|
|
5
|
+
__name__ = "biotite.sequence"
|
|
6
|
+
__author__ = "Patrick Kunzmann"
|
|
7
|
+
__all__ = ["find_subsequence", "find_symbol", "find_symbol_first",
|
|
8
|
+
"find_symbol_last"]
|
|
9
|
+
|
|
10
|
+
import numpy as np
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def find_subsequence(sequence, query):
    """
    Find a subsequence in a sequence.

    Parameters
    ----------
    sequence : Sequence
        The sequence to find the subsequence in.
        Its alphabet must equal or extend the `query` alphabet.
    query : Sequence
        The potential subsequence.

    Returns
    -------
    match_indices : ndarray, dtype=int
        The starting indices in `sequence`, where `query` has been
        found. The array is empty if no match has been found.

    Raises
    ------
    ValueError
        If the `sequence` alphabet does not extend the `query` alphabet.

    Examples
    --------

    >>> main_seq = NucleotideSequence("ACTGAATGA")
    >>> sub_seq = NucleotideSequence("TGA")
    >>> print(find_subsequence(main_seq, sub_seq))
    [2 6]

    """
    if not sequence.get_alphabet().extends(query.get_alphabet()):
        raise ValueError(
            "The alphabet of the sequence does not extend "
            "the alphabet of the query"
        )
    # Look the code arrays up once instead of once per frame
    seq_code = sequence.code
    query_code = query.code
    frame_size = len(query)
    # Slide a window of the query length over the sequence code;
    # the range is empty if the query is longer than the sequence
    match_indices = [
        i for i in range(len(sequence) - frame_size + 1)
        if np.array_equal(query_code, seq_code[i : i + frame_size])
    ]
    # Explicit integer dtype: an empty list would otherwise be converted
    # into a float array, which cannot be used as index array
    return np.array(match_indices, dtype=int)
|
|
54
|
+
|
|
55
|
+
def find_symbol(sequence, symbol):
    """
    Locate all occurrences of a symbol in a sequence.

    Parameters
    ----------
    sequence : Sequence
        The sequence to be searched.
    symbol : object
        The symbol to look for. It is encoded with the alphabet of
        `sequence` before the comparison.

    Returns
    -------
    match_indices : ndarray
        The indices in `sequence`, where `symbol` has been found.
    """
    symbol_code = sequence.get_alphabet().encode(symbol)
    # Boolean mask marking every position that holds the symbol code
    is_match = sequence.code == symbol_code
    return np.nonzero(is_match)[0]
|
|
73
|
+
|
|
74
|
+
def find_symbol_first(sequence, symbol):
    """
    Find the first occurrence of a symbol in a sequence.

    Parameters
    ----------
    sequence : Sequence
        The sequence to be searched.
    symbol : object
        The symbol to look for in `sequence`.

    Returns
    -------
    first_index : int
        The lowest index at which `symbol` appears in `sequence`.
        If `symbol` is not in `sequence`, -1 is returned.
    """
    positions = find_symbol(sequence, symbol)
    if positions.size == 0:
        # No occurrence at all
        return -1
    return positions.min()
|
|
95
|
+
|
|
96
|
+
def find_symbol_last(sequence, symbol):
    """
    Find the last occurrence of a symbol in a sequence.

    Parameters
    ----------
    sequence : Sequence
        The sequence to be searched.
    symbol : object
        The symbol to look for in `sequence`.

    Returns
    -------
    last_index : int
        The highest index at which `symbol` appears in `sequence`.
        If `symbol` is not in `sequence`, -1 is returned.
    """
    positions = find_symbol(sequence, symbol)
    if positions.size == 0:
        # No occurrence at all
        return -1
    return positions.max()
|
|
@@ -0,0 +1,556 @@
|
|
|
1
|
+
# This source code is part of the Biotite package and is distributed
|
|
2
|
+
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
|
|
3
|
+
# information.
|
|
4
|
+
|
|
5
|
+
__name__ = "biotite.sequence"
|
|
6
|
+
__author__ = "Patrick Kunzmann", "Thomas Nevolianis"
|
|
7
|
+
__all__ = ["GeneralSequence", "NucleotideSequence", "ProteinSequence"]
|
|
8
|
+
|
|
9
|
+
from .sequence import Sequence
|
|
10
|
+
from .alphabet import LetterAlphabet, AlphabetError, AlphabetMapper
|
|
11
|
+
import numpy as np
|
|
12
|
+
import copy
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class GeneralSequence(Sequence):
    """
    A sequence type that takes its :class:`Alphabet` as constructor
    argument, so that a custom alphabet can be used without writing a
    dedicated :class:`Sequence` subclass.

    Parameters
    ----------
    alphabet : Alphabet
        The alphabet of this sequence.
    sequence : iterable object, optional
        The symbols the sequence is initialized with.
        For alphabets containing single letter strings, this parameter
        may also be a :class:`str` object.
        By default the sequence is empty.
    """

    def __init__(self, alphabet, sequence=()):
        # The alphabet is stored before the base class constructor runs
        # (presumably the base class needs it to encode the symbols)
        self._alphabet = alphabet
        super().__init__(sequence)

    def __repr__(self):
        """Represent GeneralSequence as a string for debugging."""
        symbol_reprs = ", ".join(repr(symbol) for symbol in self.symbols)
        return f"GeneralSequence(Alphabet({self._alphabet}), [{symbol_reprs}])"

    def __copy_create__(self):
        return GeneralSequence(self._alphabet)

    def get_alphabet(self):
        return self._alphabet

    def as_type(self, sequence):
        """
        Transfer the sequence code of this :class:`GeneralSequence` into
        a sequence of another :class:`Sequence` type.

        The given sequence object is modified in place: its sequence
        code is replaced by the sequence code of this object.

        Parameters
        ----------
        sequence : Sequence
            The `Sequence` whose sequence code is replaced with the one
            of this object.
            The alphabet must equal or extend the alphabet of this
            object.

        Returns
        -------
        sequence : Sequence
            The input `sequence` with replaced sequence code.

        Raises
        ------
        AlphabetError
            If the :class:`Alphabet` of the input `sequence` does
            not extend the :class:`Alphabet` of this sequence.
        """
        target_alphabet = sequence.get_alphabet()
        if target_alphabet.extends(self._alphabet):
            sequence.code = self.code
            return sequence
        raise AlphabetError(
            f"The alphabet of '{type(sequence).__name__}' "
            f"is not compatible with the alphabet of this sequence"
        )
|
|
80
|
+
|
|
81
|
+
class NucleotideSequence(Sequence):
|
|
82
|
+
"""
|
|
83
|
+
Representation of a nucleotide sequence (DNA or RNA).
|
|
84
|
+
|
|
85
|
+
This class may have one of two different alphabets:
|
|
86
|
+
:attr:`unambiguous_alphabet()` contains only the unambiguous DNA
|
|
87
|
+
letters 'A', 'C', 'G' and 'T'.
|
|
88
|
+
:attr:`ambiguous_alphabet()` uses an extended alphabet for ambiguous
|
|
89
|
+
letters.
|
|
90
|
+
|
|
91
|
+
Parameters
|
|
92
|
+
----------
|
|
93
|
+
sequence : iterable object, optional
|
|
94
|
+
The initial DNA sequence. This may either be a list or a string.
|
|
95
|
+
May take upper or lower case letters.
|
|
96
|
+
By default the sequence is empty.
|
|
97
|
+
ambiguous : bool, optional
|
|
98
|
+
If true, the ambiguous alphabet is used. By default the
|
|
99
|
+
object tries to use the unambiguous alphabet. If this fails due to
|
|
100
|
+
ambiguous letters in the sequence, the ambiguous alphabet
|
|
101
|
+
is used.
|
|
102
|
+
"""
|
|
103
|
+
|
|
104
|
+
# Alphabet holding only the four unambiguous DNA letters
alphabet_unamb = LetterAlphabet(["A","C","G","T"])
# Alphabet that additionally holds the ambiguity letters;
# its first four symbols are the symbols of `alphabet_unamb`
alphabet_amb = LetterAlphabet(
    ["A","C","G","T","R","Y","W","S",
     "M","K","H","B","V","D","N"]
)

# Maps each symbol of the ambiguous alphabet to its complement symbol
compl_symbol_dict = {"A" : "T",
                     "C" : "G",
                     "G" : "C",
                     "T" : "A",
                     "M" : "K",
                     "R" : "Y",
                     "W" : "W",
                     "S" : "S",
                     "Y" : "R",
                     "K" : "M",
                     "V" : "B",
                     "H" : "D",
                     "D" : "H",
                     "B" : "V",
                     "N" : "N"}
# List comprehension does not work in this scope
# (names assigned in a class body are not visible from within the
# separate scope of a comprehension)
_compl_symbols = []
for _symbol in alphabet_amb.get_symbols():
    _compl_symbols.append(compl_symbol_dict[_symbol])
# Alphabet whose symbol at each position is the complement of the
# symbol at the same position in `alphabet_amb`
# NOTE: despite the '_unamb' suffix this alphabet is built from all
# symbols of the ambiguous alphabet
_compl_alphabet_unamb = LetterAlphabet(_compl_symbols)
# Used by `complement()` to map a sequence code from the complementary
# alphabet into the original alphabet
_compl_mapper = AlphabetMapper(_compl_alphabet_unamb, alphabet_amb)
|
|
131
|
+
|
|
132
|
+
def __init__(self, sequence=(), ambiguous=None):
    """
    Create a nucleotide sequence from the given symbols.

    Parameters
    ----------
    sequence : iterable object, optional
        The initial symbols, either as string or as iterable of
        single symbols. Lower case letters are converted into upper
        case. By default the sequence is empty.
    ambiguous : bool, optional
        If true, the ambiguous alphabet is used, if false the
        unambiguous alphabet is used.
        By default the unambiguous alphabet is tried first and the
        ambiguous alphabet is used, if encoding with the unambiguous
        alphabet raises an :class:`AlphabetError`.
    """
    # Normalize to upper case; a non-string iterable is materialized
    # into a list, so it can be encoded a second time in the fallback
    if isinstance(sequence, str):
        sequence = sequence.upper()
    else:
        sequence = [symbol.upper() for symbol in sequence]

    if ambiguous is None:
        # Prefer the smaller alphabet and fall back to the ambiguous
        # one only if the symbols cannot be encoded with it
        try:
            self._alphabet = NucleotideSequence.alphabet_unamb
            seq_code = self._alphabet.encode_multiple(sequence)
        except AlphabetError:
            self._alphabet = NucleotideSequence.alphabet_amb
            seq_code = self._alphabet.encode_multiple(sequence)
    else:
        # Explicit choice by the caller: no fallback, an encoding error
        # is propagated
        if ambiguous:
            self._alphabet = NucleotideSequence.alphabet_amb
        else:
            self._alphabet = NucleotideSequence.alphabet_unamb
        seq_code = self._alphabet.encode_multiple(sequence)

    # The base class is initialized empty, as the symbols are already
    # encoded; the finished code is assigned directly afterwards
    super().__init__()
    self.code = seq_code
|
|
152
|
+
|
|
153
|
+
def __repr__(self):
    """Represent NucleotideSequence as a string for debugging."""
    # The flag is true exactly if the ambiguous alphabet is in use
    ambiguous = bool(self._alphabet == NucleotideSequence.alphabet_amb)
    letters = "".join(self.symbols)
    return f'NucleotideSequence("{letters}", ambiguous={ambiguous})'
|
|
160
|
+
|
|
161
|
+
def __copy_create__(self):
    """
    Create an empty :class:`NucleotideSequence` that uses the same
    alphabet (ambiguous or unambiguous) as this object.
    """
    uses_ambiguous = bool(
        self._alphabet == NucleotideSequence.alphabet_amb
    )
    return NucleotideSequence(ambiguous=uses_ambiguous)
|
|
167
|
+
|
|
168
|
+
def get_alphabet(self):
    """
    Get the alphabet that was selected for this sequence on
    construction.

    Returns
    -------
    alphabet : LetterAlphabet
        Either the unambiguous or the ambiguous nucleotide alphabet
        of this class.
    """
    return self._alphabet
|
|
170
|
+
|
|
171
|
+
def complement(self):
    """
    Create the complementary nucleotide sequence.

    The order of the symbols is kept, i.e. the sequence is not
    reversed.

    Returns
    -------
    complement : NucleotideSequence
        The complement sequence.

    Examples
    --------

    >>> dna_seq = NucleotideSequence("ACGCTT")
    >>> print(dna_seq.complement())
    TGCGAA
    >>> print(dna_seq.reverse().complement())
    AAGCGT

    """
    # Read in the complementary alphabet, the sequence code of this
    # object spells the complementary symbols.
    # The mapper translates this code from the complementary alphabet
    # back into the original alphabet, so that the returned copy
    # carries the complementary symbols in the original alphabet.
    return self.copy(NucleotideSequence._compl_mapper[self.code])
|
|
197
|
+
|
|
198
|
+
def translate(self, complete=False, codon_table=None, met_start=False):
|
|
199
|
+
"""
|
|
200
|
+
Translate the nucleotide sequence into a protein sequence.
|
|
201
|
+
|
|
202
|
+
If `complete` is true, the entire sequence is translated,
|
|
203
|
+
beginning with the first codon and ending with the last codon,
|
|
204
|
+
even if stop codons occur during the translation.
|
|
205
|
+
|
|
206
|
+
Otherwise this method returns possible ORFs in the
|
|
207
|
+
sequence, even if no stop codon occurs in an ORF.
|
|
208
|
+
|
|
209
|
+
Parameters
|
|
210
|
+
----------
|
|
211
|
+
complete : bool, optional
|
|
212
|
+
If true, the complete sequence is translated. In this case
|
|
213
|
+
the sequence length must be a multiple of 3.
|
|
214
|
+
Otherwise all ORFs are translated. (Default: False)
|
|
215
|
+
codon_table : CodonTable, optional
|
|
216
|
+
The codon table to be used. By default the default table
|
|
217
|
+
will be used
|
|
218
|
+
(NCBI "Standard" table with "ATG" as single start codon).
|
|
219
|
+
met_start : bool, optional
|
|
220
|
+
If true, the translation starts always with a 'methionine',
|
|
221
|
+
even if the start codon codes for another amino acid.
|
|
222
|
+
Otherwise the translation starts with the amino acid
|
|
223
|
+
the codon codes for. Only applies, if `complete` is false.
|
|
224
|
+
(Default: False)
|
|
225
|
+
|
|
226
|
+
Returns
|
|
227
|
+
-------
|
|
228
|
+
protein : ProteinSequence or list of ProteinSequence
|
|
229
|
+
The translated protein sequence. If `complete` is true,
|
|
230
|
+
only a single :class:`ProteinSequence` is returned. Otherwise
|
|
231
|
+
a list of :class:`ProteinSequence` is returned, which contains
|
|
232
|
+
every ORF.
|
|
233
|
+
pos : list of tuple (int, int)
|
|
234
|
+
Is only returned if `complete` is false. The list contains
|
|
235
|
+
a tuple for each ORF.
|
|
236
|
+
The first element of the tuple is the index of the
|
|
237
|
+
:class:`NucleotideSequence`, where the translation starts.
|
|
238
|
+
The second element is the exclusive stop index, it
|
|
239
|
+
represents the first nucleotide in the
|
|
240
|
+
:class:`NucleotideSequence` after a stop codon.
|
|
241
|
+
|
|
242
|
+
Examples
|
|
243
|
+
--------
|
|
244
|
+
|
|
245
|
+
>>> dna_seq = NucleotideSequence("AATGATGCTATAGAT")
|
|
246
|
+
>>> prot_seq = dna_seq.translate(complete=True)
|
|
247
|
+
>>> print(prot_seq)
|
|
248
|
+
NDAID
|
|
249
|
+
>>> prot_seqs, pos = dna_seq.translate(complete=False)
|
|
250
|
+
>>> for seq in prot_seqs:
|
|
251
|
+
... print(seq)
|
|
252
|
+
MML*
|
|
253
|
+
ML*
|
|
254
|
+
|
|
255
|
+
"""
|
|
256
|
+
if self._alphabet != NucleotideSequence.alphabet_unamb:
|
|
257
|
+
raise AlphabetError("Translation requires unambiguous alphabet")
|
|
258
|
+
# Determine codon_table
|
|
259
|
+
if codon_table is None:
|
|
260
|
+
# Import at this position to avoid circular import
|
|
261
|
+
from .codon import CodonTable
|
|
262
|
+
codon_table = CodonTable.default_table()
|
|
263
|
+
|
|
264
|
+
if complete:
|
|
265
|
+
if len(self) % 3 != 0:
|
|
266
|
+
raise ValueError("Sequence length needs to be a multiple of 3 "
|
|
267
|
+
"for complete translation")
|
|
268
|
+
# Reshape code into (n,3), with n being the amount of codons
|
|
269
|
+
codons = self.code.reshape(-1, 3)
|
|
270
|
+
protein_seq = ProteinSequence()
|
|
271
|
+
protein_seq.code = codon_table.map_codon_codes(codons)
|
|
272
|
+
return protein_seq
|
|
273
|
+
|
|
274
|
+
else:
|
|
275
|
+
stop_code = ProteinSequence.alphabet.encode("*")
|
|
276
|
+
met_code = ProteinSequence.alphabet.encode("M")
|
|
277
|
+
protein_seqs = []
|
|
278
|
+
pos = []
|
|
279
|
+
code = self.code
|
|
280
|
+
# Create all three frames
|
|
281
|
+
for shift in range(3):
|
|
282
|
+
# The frame length is always a multiple of 3
|
|
283
|
+
# If there is a trailing partial codon, remove it
|
|
284
|
+
frame_length = ((len(code) - shift) // 3) * 3
|
|
285
|
+
frame = code[shift : shift+frame_length]
|
|
286
|
+
# Reshape frame into (n,3), with n being the amount of codons
|
|
287
|
+
frame_codons = frame.reshape(-1, 3)
|
|
288
|
+
# At first, translate frame completely
|
|
289
|
+
protein_code = codon_table.map_codon_codes(frame_codons)
|
|
290
|
+
# Iterate over all start codons in this frame
|
|
291
|
+
starts = np.where(codon_table.is_start_codon(frame_codons))[0]
|
|
292
|
+
for start_i in starts:
|
|
293
|
+
# Protein sequence beginning from start codon
|
|
294
|
+
code_from_start = protein_code[start_i:]
|
|
295
|
+
# Get all stop codon positions
|
|
296
|
+
# relative to 'code_from_start'
|
|
297
|
+
stops = np.where(code_from_start == stop_code)[0]
|
|
298
|
+
# Find first stop codon after start codon
|
|
299
|
+
# Include stop -> stops[0] + 1
|
|
300
|
+
stop_i = stops[0] + 1 if len(stops) > 0 \
|
|
301
|
+
else len(code_from_start)
|
|
302
|
+
code_from_start_to_stop = code_from_start[:stop_i]
|
|
303
|
+
prot_seq = ProteinSequence()
|
|
304
|
+
if met_start:
|
|
305
|
+
# Copy as the slice is edited
|
|
306
|
+
prot_seq.code = code_from_start_to_stop.copy()
|
|
307
|
+
prot_seq.code[0] = met_code
|
|
308
|
+
else:
|
|
309
|
+
prot_seq.code = code_from_start_to_stop
|
|
310
|
+
protein_seqs.append(prot_seq)
|
|
311
|
+
# Codon indices are transformed
|
|
312
|
+
# to nucleotide sequence indices
|
|
313
|
+
pos.append((shift + start_i*3, shift + (start_i+stop_i)*3))
|
|
314
|
+
# Sort by start position
|
|
315
|
+
order = np.argsort([start for start, stop in pos])
|
|
316
|
+
pos = [pos[i] for i in order]
|
|
317
|
+
protein_seqs = [protein_seqs[i] for i in order]
|
|
318
|
+
return protein_seqs, pos
|
|
319
|
+
|
|
320
|
+
@staticmethod
def unambiguous_alphabet():
    """
    Provide the nucleotide alphabet that is limited to the
    unambiguous symbols ``A``, ``C``, ``G`` and ``T``.

    Returns
    -------
    alphabet : LetterAlphabet
        The alphabet stored as ``NucleotideSequence.alphabet_unamb``.
    """
    alphabet = NucleotideSequence.alphabet_unamb
    return alphabet
|
|
332
|
+
|
|
333
|
+
@staticmethod
def ambiguous_alphabet():
    """
    Provide the nucleotide alphabet that contains ``A``, ``C``,
    ``G`` and ``T`` as well as the symbols for ambiguous
    combinations of these.

    Returns
    -------
    alphabet : LetterAlphabet
        The alphabet stored as ``NucleotideSequence.alphabet_amb``.
    """
    alphabet = NucleotideSequence.alphabet_amb
    return alphabet
|
|
346
|
+
|
|
347
|
+
|
|
348
|
+
class ProteinSequence(Sequence):
    """
    Representation of a protein sequence.

    In addition, this class offers conversion of amino acids from
    3-letter code into 1-letter code and vice versa, and the
    calculation of the molecular weight of the protein.

    Parameters
    ----------
    sequence : iterable object, optional
        The initial protein sequence. This may either be a list or a
        string. May take upper or lower case letters. If a list is
        given, the list elements can be 1-letter or 3-letter amino acid
        representations. By default the sequence is empty.

    Notes
    -----
    The :class:`Alphabet` of this :class:`Sequence` class does not
    support selenocysteine.
    Please convert selenocysteine (``U``) into cysteine (``C``)
    or use a custom :class:`Sequence` class, if the differentiation is
    necessary.
    """

    # NOTE(review): not referenced by any method visible in this part
    # of the file -- confirm whether it is still needed
    _codon_table = None

    # The 20 standard amino acids, followed by the ambiguous symbols
    # B, Z and X and the stop signal '*'.
    # The mass tables below list one entry per symbol in this same order
    alphabet = LetterAlphabet(["A","C","D","E","F","G","H","I","K","L",
                               "M","N","P","Q","R","S","T","V","W","Y",
                               "B","Z","X","*"])

    # Masses are taken from
    # https://web.expasy.org/findmod/findmod_masses.html#AA

    # Average isotopic residue masses, indexed by symbol code.
    # Symbols without a defined mass are NaN, so that a sum over a
    # sequence containing them becomes NaN as well
    _mol_weight_average = np.array([
        71.0788,  # A
        103.1388, # C
        115.0886, # D
        129.1155, # E
        147.1766, # F
        57.0519,  # G
        137.1411, # H
        113.1594, # I
        128.1741, # K
        113.1594, # L
        131.1926, # M
        114.1038, # N
        97.1167,  # P
        128.1307, # Q
        156.1875, # R
        87.0782,  # S
        101.1051, # T
        99.1326,  # V
        186.2132, # W
        163.1760, # Y
        np.nan,   # B
        np.nan,   # Z
        np.nan,   # X
        np.nan,   # *
    ])

    # Monoisotopic residue masses, same layout as the average table
    _mol_weight_monoisotopic = np.array([
        71.03711,  # A
        103.00919, # C
        115.02694, # D
        129.04259, # E
        147.06841, # F
        57.02146,  # G
        137.05891, # H
        113.08406, # I
        128.09496, # K
        113.08406, # L
        131.04049, # M
        114.04293, # N
        97.05276,  # P
        128.05858, # Q
        156.10111, # R
        87.03203,  # S
        101.04768, # T
        99.06841,  # V
        186.07931, # W
        163.06333, # Y
        np.nan,    # B
        np.nan,    # Z
        np.nan,    # X
        np.nan,    # *
    ])
|
|
434
|
+
|
|
435
|
+
# Mapping from the 1-letter symbol to the upper case 3-letter code
_dict_1to3 = {
    "A": "ALA",
    "C": "CYS",
    "D": "ASP",
    "E": "GLU",
    "F": "PHE",
    "G": "GLY",
    "H": "HIS",
    "I": "ILE",
    "K": "LYS",
    "L": "LEU",
    "M": "MET",
    "N": "ASN",
    "P": "PRO",
    "Q": "GLN",
    "R": "ARG",
    "S": "SER",
    "T": "THR",
    "V": "VAL",
    "W": "TRP",
    "Y": "TYR",
    "B": "ASX",
    "Z": "GLX",
    "X": "UNK",
    "*": " * ",
}

# Reverse mapping from the 3-letter code to the 1-letter symbol.
# A comprehension is used instead of a 'for' loop, so that no loop
# variables are left behind as stray attributes of the namespace.
# Only the outermost iterable is evaluated in the enclosing scope,
# hence this also works inside a class body
_dict_3to1 = {
    three_letter: one_letter
    for one_letter, three_letter in _dict_1to3.items()
}
# Selenocysteine and selenomethionine have no symbol of their own:
# they are mapped to cysteine and methionine, respectively
_dict_3to1["SEC"] = "C"
_dict_3to1["MSE"] = "M"
|
|
465
|
+
|
|
466
|
+
def __init__(self, sequence=()):
    """
    Create the sequence from 1-letter and/or 3-letter symbols.

    Parameters
    ----------
    sequence : iterable object, optional
        The symbols of the protein sequence in upper or lower case.
        Each symbol of length 3 is interpreted as 3-letter code and
        converted into its 1-letter symbol, all other symbols are
        only converted to upper case.
        By default the sequence is empty.

    Raises
    ------
    KeyError
        If a symbol of length 3 is not a known 3-letter code.
    """
    dict_3to1 = ProteinSequence._dict_3to1
    # Convert 3-letter codes to single letter codes,
    # if list contains 3-letter codes
    sequence = [
        dict_3to1[symbol.upper()] if len(symbol) == 3 else symbol.upper()
        for symbol in sequence
    ]
    super().__init__(sequence)
|
|
474
|
+
|
|
475
|
+
def __repr__(self):
    """Build a debugging representation that shows all symbols as one string."""
    joined_symbols = "".join(self.symbols)
    return f'ProteinSequence("{joined_symbols}")'
|
|
478
|
+
|
|
479
|
+
def get_alphabet(self):
    """
    Get the alphabet of this sequence.

    Returns
    -------
    alphabet : LetterAlphabet
        The class-level ``ProteinSequence.alphabet``.
    """
    protein_alphabet = ProteinSequence.alphabet
    return protein_alphabet
|
|
481
|
+
|
|
482
|
+
def remove_stops(self):
    """
    Create a copy of this sequence that contains no *stop signals*
    (``*``).

    Returns
    -------
    no_stop : ProteinSequence
        A copy of this sequence without stop signals.
    """
    stop_symbol_code = ProteinSequence.alphabet.encode("*")
    stripped = self.copy()
    copied_code = stripped.code
    # Keep every position whose code differs from the stop code
    keep_mask = copied_code != stop_symbol_code
    stripped.code = copied_code[keep_mask]
    return stripped
|
|
496
|
+
|
|
497
|
+
@staticmethod
def convert_letter_3to1(symbol):
    """
    Translate a 3-letter amino acid representation into the
    corresponding 1-letter representation.

    Parameters
    ----------
    symbol : string
        3-letter amino acid representation in upper or lower case.

    Returns
    -------
    convert : string
        1-letter amino acid representation.

    Raises
    ------
    KeyError
        If the upper case `symbol` is not a known 3-letter code.
    """
    three_letter_code = symbol.upper()
    return ProteinSequence._dict_3to1[three_letter_code]
|
|
513
|
+
|
|
514
|
+
@staticmethod
def convert_letter_1to3(symbol):
    """
    Translate a 1-letter amino acid representation into the
    corresponding 3-letter representation.

    Parameters
    ----------
    symbol : string
        1-letter amino acid representation in upper or lower case.

    Returns
    -------
    convert : string
        3-letter amino acid representation.

    Raises
    ------
    KeyError
        If the upper case `symbol` is not a known 1-letter symbol.
    """
    one_letter_symbol = symbol.upper()
    return ProteinSequence._dict_1to3[one_letter_symbol]
|
|
530
|
+
|
|
531
|
+
def get_molecular_weight(self, monoisotopic=False):
    """
    Calculate the molecular weight of this protein.

    The weight is the sum of the residue masses of all amino acids
    in the sequence plus the mass of one water molecule.

    Parameters
    ----------
    monoisotopic : bool, optional
        If true, the monoisotopic masses of the residues and of
        water are used.
        Otherwise the average isotopic masses are used.
        (Default: False)

    Returns
    -------
    weight : float
        Molecular weight of the protein represented by the sequence.
        Molecular weight values are given in Dalton (Da).

    Raises
    ------
    ValueError
        If the sequence contains a symbol without a defined mass
        (``B``, ``Z``, ``X`` or ``*``).
    """
    if monoisotopic:
        residue_masses = self._mol_weight_monoisotopic
        # Monoisotopic mass of water: the average mass must not be
        # mixed into an otherwise monoisotopic sum
        water_mass = 18.01056
    else:
        residue_masses = self._mol_weight_average
        water_mass = 18.015
    weight = np.sum(residue_masses[self.code]) + water_mass

    # Symbols without a defined mass are stored as NaN,
    # which propagates through the sum
    if np.isnan(weight):
        raise ValueError(
            "Sequence contains ambiguous amino acids, "
            "cannot calculate weight"
        )
    return weight
|