biotite 0.39.0__cp311-cp311-macosx_11_0_arm64.whl → 0.41.0__cp311-cp311-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biotite might be problematic. Click here for more details.
- biotite/__init__.py +3 -3
- biotite/application/dssp/app.py +18 -18
- biotite/database/pubchem/download.py +23 -23
- biotite/database/pubchem/query.py +7 -7
- biotite/database/rcsb/download.py +19 -14
- biotite/file.py +17 -9
- biotite/sequence/align/banded.c +258 -237
- biotite/sequence/align/banded.cpython-311-darwin.so +0 -0
- biotite/sequence/align/cigar.py +60 -15
- biotite/sequence/align/kmeralphabet.c +243 -222
- biotite/sequence/align/kmeralphabet.cpython-311-darwin.so +0 -0
- biotite/sequence/align/kmersimilarity.c +215 -196
- biotite/sequence/align/kmersimilarity.cpython-311-darwin.so +0 -0
- biotite/sequence/align/kmertable.cpp +233 -205
- biotite/sequence/align/kmertable.cpython-311-darwin.so +0 -0
- biotite/sequence/align/localgapped.c +258 -237
- biotite/sequence/align/localgapped.cpython-311-darwin.so +0 -0
- biotite/sequence/align/localungapped.c +235 -214
- biotite/sequence/align/localungapped.cpython-311-darwin.so +0 -0
- biotite/sequence/align/multiple.c +255 -234
- biotite/sequence/align/multiple.cpython-311-darwin.so +0 -0
- biotite/sequence/align/pairwise.c +274 -253
- biotite/sequence/align/pairwise.cpython-311-darwin.so +0 -0
- biotite/sequence/align/permutation.c +215 -196
- biotite/sequence/align/permutation.cpython-311-darwin.so +0 -0
- biotite/sequence/align/selector.c +217 -197
- biotite/sequence/align/selector.cpython-311-darwin.so +0 -0
- biotite/sequence/align/tracetable.c +215 -195
- biotite/sequence/align/tracetable.cpython-311-darwin.so +0 -0
- biotite/sequence/annotation.py +2 -2
- biotite/sequence/codec.c +235 -214
- biotite/sequence/codec.cpython-311-darwin.so +0 -0
- biotite/sequence/io/fasta/convert.py +27 -24
- biotite/sequence/phylo/nj.c +215 -196
- biotite/sequence/phylo/nj.cpython-311-darwin.so +0 -0
- biotite/sequence/phylo/tree.c +227 -202
- biotite/sequence/phylo/tree.cpython-311-darwin.so +0 -0
- biotite/sequence/phylo/upgma.c +215 -196
- biotite/sequence/phylo/upgma.cpython-311-darwin.so +0 -0
- biotite/structure/__init__.py +2 -0
- biotite/structure/basepairs.py +7 -12
- biotite/structure/bonds.c +1437 -1279
- biotite/structure/bonds.cpython-311-darwin.so +0 -0
- biotite/structure/celllist.c +217 -197
- biotite/structure/celllist.cpython-311-darwin.so +0 -0
- biotite/structure/charges.c +1052 -1101
- biotite/structure/charges.cpython-311-darwin.so +0 -0
- biotite/structure/dotbracket.py +2 -0
- biotite/structure/filter.py +30 -37
- biotite/structure/info/__init__.py +5 -8
- biotite/structure/info/atoms.py +31 -68
- biotite/structure/info/bonds.py +47 -101
- biotite/structure/info/ccd/README.rst +8 -0
- biotite/structure/info/ccd/amino_acids.txt +1663 -0
- biotite/structure/info/ccd/carbohydrates.txt +1135 -0
- biotite/structure/info/ccd/components.bcif +0 -0
- biotite/structure/info/ccd/nucleotides.txt +798 -0
- biotite/structure/info/ccd.py +95 -0
- biotite/structure/info/groups.py +90 -0
- biotite/structure/info/masses.py +21 -20
- biotite/structure/info/misc.py +78 -25
- biotite/structure/info/standardize.py +17 -12
- biotite/structure/integrity.py +19 -70
- biotite/structure/io/__init__.py +2 -4
- biotite/structure/io/ctab.py +12 -106
- biotite/structure/io/general.py +167 -181
- biotite/structure/io/gro/file.py +16 -16
- biotite/structure/io/mmtf/__init__.py +3 -0
- biotite/structure/io/mmtf/convertarray.c +219 -198
- biotite/structure/io/mmtf/convertarray.cpython-311-darwin.so +0 -0
- biotite/structure/io/mmtf/convertfile.c +217 -197
- biotite/structure/io/mmtf/convertfile.cpython-311-darwin.so +0 -0
- biotite/structure/io/mmtf/decode.c +225 -204
- biotite/structure/io/mmtf/decode.cpython-311-darwin.so +0 -0
- biotite/structure/io/mmtf/encode.c +215 -196
- biotite/structure/io/mmtf/encode.cpython-311-darwin.so +0 -0
- biotite/structure/io/mmtf/file.py +34 -26
- biotite/structure/io/mol/__init__.py +4 -2
- biotite/structure/io/mol/convert.py +71 -7
- biotite/structure/io/mol/ctab.py +414 -0
- biotite/structure/io/mol/header.py +116 -0
- biotite/structure/io/mol/{file.py → mol.py} +69 -82
- biotite/structure/io/mol/sdf.py +909 -0
- biotite/structure/io/npz/__init__.py +3 -0
- biotite/structure/io/npz/file.py +21 -18
- biotite/structure/io/pdb/__init__.py +3 -3
- biotite/structure/io/pdb/file.py +89 -34
- biotite/structure/io/pdb/hybrid36.c +63 -43
- biotite/structure/io/pdb/hybrid36.cpython-311-darwin.so +0 -0
- biotite/structure/io/pdbqt/file.py +32 -32
- biotite/structure/io/pdbx/__init__.py +12 -6
- biotite/structure/io/pdbx/bcif.py +648 -0
- biotite/structure/io/pdbx/cif.py +1032 -0
- biotite/structure/io/pdbx/component.py +246 -0
- biotite/structure/io/pdbx/convert.py +858 -386
- biotite/structure/io/pdbx/encoding.c +112813 -0
- biotite/structure/io/pdbx/encoding.cpython-311-darwin.so +0 -0
- biotite/structure/io/pdbx/legacy.py +267 -0
- biotite/structure/molecules.py +151 -151
- biotite/structure/repair.py +253 -0
- biotite/structure/sasa.c +215 -196
- biotite/structure/sasa.cpython-311-darwin.so +0 -0
- biotite/structure/sequence.py +112 -0
- biotite/structure/superimpose.py +618 -116
- {biotite-0.39.0.dist-info → biotite-0.41.0.dist-info}/METADATA +3 -3
- {biotite-0.39.0.dist-info → biotite-0.41.0.dist-info}/RECORD +109 -103
- {biotite-0.39.0.dist-info → biotite-0.41.0.dist-info}/WHEEL +1 -1
- biotite/structure/info/amino_acids.json +0 -1556
- biotite/structure/info/amino_acids.py +0 -42
- biotite/structure/info/carbohydrates.json +0 -1122
- biotite/structure/info/carbohydrates.py +0 -39
- biotite/structure/info/intra_bonds.msgpack +0 -0
- biotite/structure/info/link_types.msgpack +0 -1
- biotite/structure/info/nucleotides.json +0 -772
- biotite/structure/info/nucleotides.py +0 -39
- biotite/structure/info/residue_masses.msgpack +0 -0
- biotite/structure/info/residue_names.msgpack +0 -3
- biotite/structure/info/residues.msgpack +0 -0
- biotite/structure/io/pdbx/file.py +0 -652
- {biotite-0.39.0.dist-info → biotite-0.41.0.dist-info}/LICENSE.rst +0 -0
- {biotite-0.39.0.dist-info → biotite-0.41.0.dist-info}/top_level.txt +0 -0
|
Binary file
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
# This source code is part of the Biotite package and is distributed
|
|
2
|
+
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
|
|
3
|
+
# information.
|
|
4
|
+
|
|
5
|
+
"""
|
|
6
|
+
Function for converting a structure into a sequence.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
# Report the parent subpackage as this module's name
# (presumably so the exported function appears as part of
# ``biotite.structure`` -- confirm against the package's conventions)
__name__ = "biotite.structure"
|
|
10
|
+
__author__ = "Patrick Kunzmann"
|
|
11
|
+
__all__ = ["to_sequence"]
|
|
12
|
+
|
|
13
|
+
import numpy as np
|
|
14
|
+
from .info.misc import one_letter_code
|
|
15
|
+
from .info.groups import amino_acid_names, nucleotide_names
|
|
16
|
+
from .residues import get_residues
|
|
17
|
+
from .chains import get_chain_starts
|
|
18
|
+
from .error import BadStructureError
|
|
19
|
+
from ..sequence.seqtypes import ProteinSequence, NucleotideSequence
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
# Temporary symbol for residues without a one-letter code;
# to_sequence() either replaces it with "X"/"N" or raises an error
HETERO_PLACEHOLDER = "."
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def to_sequence(atoms, allow_hetero=False):
    """
    Convert each chain in a structure into a sequence.

    Parameters
    ----------
    atoms : AtomArray or AtomArrayStack
        The structure.
        May contain multiple chains.
        Each chain must be either a peptide or a nucleic acid.
    allow_hetero : bool, optional
        If true, residues inside an amino acid or nucleotide chain,
        that have no one-letter code, are replaced by the respective
        '*any*' symbol (`"X"` or `"N"`, respectively).
        The same is true for amino acids in nucleotide chains and vice
        versa.
        By default, an exception is raised.

    Returns
    -------
    sequences : list of Sequence, length=n
        The sequence for each chain in the structure.
    chain_start_indices : ndarray, shape=(n,), dtype=int
        The atom index where each chain starts.

    Raises
    ------
    BadStructureError
        If a chain contains neither amino acids nor nucleotides, or if
        `allow_hetero` is false and a chain contains a residue that
        does not fit the type of the chain.

    Notes
    -----
    Residues are considered amino acids or nucleotides based on their
    appearance in :func:`info.amino_acid_names()` or
    :func:`info.nucleotide_names()`, respectively.
    A chain is treated as peptide, if it contains more amino acids than
    nucleotides, otherwise it is treated as nucleic acid.
    The symbols ``U`` and ``O`` in peptides are replaced by ``C`` and
    ``K``, the symbol ``U`` in nucleic acids is replaced by ``T``.

    Examples
    --------

    >>> sequences, chain_starts = to_sequence(atom_array)
    >>> print(sequences)
    [ProteinSequence("NLYIQWLKDGGPSSGRPPPS")]

    """
    sequences = []
    chain_start_indices = get_chain_starts(atoms, add_exclusive_stop=True)
    # The last element is the exclusive stop and hence not a chain start
    chain_starts = chain_start_indices[:-1]
    chain_stops = chain_start_indices[1:]
    for start, stop in zip(chain_starts, chain_stops):
        # The ellipsis lets the slice act on the atom dimension for both,
        # AtomArray and AtomArrayStack:
        # A plain slice would select models of a stack instead of atoms
        chain = atoms[..., start:stop]
        _, residues = get_residues(chain)
        one_letter_symbols = np.array(
            [one_letter_code(res) or HETERO_PLACEHOLDER for res in residues]
        )
        hetero_mask = one_letter_symbols == HETERO_PLACEHOLDER

        # Compute the membership masks once and reuse them below
        aa_mask = np.isin(residues, amino_acid_names())
        nuc_mask = np.isin(residues, nucleotide_names())
        aa_count = np.count_nonzero(aa_mask)
        nuc_count = np.count_nonzero(nuc_mask)
        if aa_count == 0 and nuc_count == 0:
            raise BadStructureError(
                f"Chain {chain.chain_id[0]} contains neither amino acids "
                "nor nucleotides"
            )
        elif aa_count > nuc_count:
            # Chain is a peptide
            hetero_mask |= ~aa_mask
            if not allow_hetero and np.any(hetero_mask):
                hetero_indices = np.where(hetero_mask)[0]
                raise BadStructureError(
                    f"Hetero residue(s) "
                    f"{', '.join(residues[hetero_indices])} in peptide"
                )
            one_letter_symbols[hetero_mask] = "X"
            # Replace selenocysteine and pyrrolysine
            one_letter_symbols[one_letter_symbols == "U"] = "C"
            one_letter_symbols[one_letter_symbols == "O"] = "K"
            sequences.append(ProteinSequence("".join(one_letter_symbols)))
        else:
            # Chain is a nucleic acid
            hetero_mask |= ~nuc_mask
            if not allow_hetero and np.any(hetero_mask):
                hetero_indices = np.where(hetero_mask)[0]
                raise BadStructureError(
                    f"Hetero residue(s) "
                    f"{', '.join(residues[hetero_indices])} in nucleic acid"
                )
            one_letter_symbols[hetero_mask] = "N"
            # Replace uracil
            one_letter_symbols[one_letter_symbols == "U"] = "T"
            sequences.append(NucleotideSequence("".join(one_letter_symbols)))

    return sequences, chain_starts
|