biotite 1.0.1__cp311-cp311-win_amd64.whl → 1.2.0__cp311-cp311-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biotite might be problematic. Click here for more details.
- biotite/application/application.py +3 -3
- biotite/application/autodock/app.py +1 -1
- biotite/application/blast/webapp.py +1 -1
- biotite/application/clustalo/app.py +1 -1
- biotite/application/dssp/app.py +13 -3
- biotite/application/localapp.py +36 -2
- biotite/application/msaapp.py +10 -10
- biotite/application/muscle/app3.py +5 -18
- biotite/application/muscle/app5.py +5 -5
- biotite/application/sra/app.py +0 -5
- biotite/application/util.py +22 -2
- biotite/application/viennarna/rnaalifold.py +8 -8
- biotite/application/viennarna/rnaplot.py +9 -3
- biotite/application/viennarna/util.py +1 -1
- biotite/application/webapp.py +1 -1
- biotite/database/afdb/__init__.py +12 -0
- biotite/database/afdb/download.py +191 -0
- biotite/database/entrez/dbnames.py +10 -0
- biotite/database/entrez/download.py +9 -10
- biotite/database/entrez/key.py +1 -1
- biotite/database/entrez/query.py +5 -4
- biotite/database/pubchem/download.py +6 -6
- biotite/database/pubchem/error.py +10 -0
- biotite/database/pubchem/query.py +12 -23
- biotite/database/rcsb/download.py +3 -2
- biotite/database/rcsb/query.py +8 -9
- biotite/database/uniprot/check.py +22 -17
- biotite/database/uniprot/download.py +3 -6
- biotite/database/uniprot/query.py +4 -5
- biotite/file.py +14 -2
- biotite/interface/__init__.py +19 -0
- biotite/interface/openmm/__init__.py +16 -0
- biotite/interface/openmm/state.py +93 -0
- biotite/interface/openmm/system.py +227 -0
- biotite/interface/pymol/__init__.py +198 -0
- biotite/interface/pymol/cgo.py +346 -0
- biotite/interface/pymol/convert.py +185 -0
- biotite/interface/pymol/display.py +267 -0
- biotite/interface/pymol/object.py +1226 -0
- biotite/interface/pymol/shapes.py +178 -0
- biotite/interface/pymol/startup.py +169 -0
- biotite/interface/rdkit/__init__.py +15 -0
- biotite/interface/rdkit/mol.py +490 -0
- biotite/interface/version.py +71 -0
- biotite/interface/warning.py +19 -0
- biotite/sequence/align/__init__.py +0 -4
- biotite/sequence/align/alignment.py +49 -14
- biotite/sequence/align/banded.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/banded.pyx +26 -26
- biotite/sequence/align/cigar.py +2 -2
- biotite/sequence/align/kmeralphabet.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/kmeralphabet.pyx +19 -2
- biotite/sequence/align/kmersimilarity.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/kmertable.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/kmertable.pyx +58 -48
- biotite/sequence/align/localgapped.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/localgapped.pyx +47 -47
- biotite/sequence/align/localungapped.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/localungapped.pyx +10 -10
- biotite/sequence/align/matrix.py +284 -57
- biotite/sequence/align/matrix_data/3Di.mat +24 -0
- biotite/sequence/align/matrix_data/PB.license +21 -0
- biotite/sequence/align/matrix_data/PB.mat +18 -0
- biotite/sequence/align/multiple.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/pairwise.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/pairwise.pyx +35 -35
- biotite/sequence/align/permutation.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/selector.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/selector.pyx +2 -2
- biotite/sequence/align/statistics.py +1 -1
- biotite/sequence/align/tracetable.cp311-win_amd64.pyd +0 -0
- biotite/sequence/alphabet.py +5 -2
- biotite/sequence/annotation.py +19 -13
- biotite/sequence/codec.cp311-win_amd64.pyd +0 -0
- biotite/sequence/codon.py +1 -2
- biotite/sequence/graphics/alignment.py +25 -39
- biotite/sequence/graphics/color_schemes/3di_flower.json +48 -0
- biotite/sequence/graphics/color_schemes/pb_flower.json +2 -1
- biotite/sequence/graphics/colorschemes.py +44 -11
- biotite/sequence/graphics/dendrogram.py +4 -2
- biotite/sequence/graphics/features.py +2 -2
- biotite/sequence/graphics/logo.py +10 -12
- biotite/sequence/io/fasta/convert.py +1 -2
- biotite/sequence/io/fasta/file.py +1 -1
- biotite/sequence/io/fastq/file.py +3 -3
- biotite/sequence/io/genbank/file.py +3 -3
- biotite/sequence/io/genbank/sequence.py +2 -0
- biotite/sequence/io/gff/convert.py +1 -1
- biotite/sequence/io/gff/file.py +1 -2
- biotite/sequence/phylo/nj.cp311-win_amd64.pyd +0 -0
- biotite/sequence/phylo/tree.cp311-win_amd64.pyd +0 -0
- biotite/sequence/phylo/upgma.cp311-win_amd64.pyd +0 -0
- biotite/sequence/profile.py +105 -29
- biotite/sequence/search.py +0 -1
- biotite/sequence/seqtypes.py +136 -8
- biotite/sequence/sequence.py +1 -2
- biotite/setup_ccd.py +197 -0
- biotite/structure/__init__.py +6 -3
- biotite/structure/alphabet/__init__.py +25 -0
- biotite/structure/alphabet/encoder.py +332 -0
- biotite/structure/alphabet/encoder_weights_3di.kerasify +0 -0
- biotite/structure/alphabet/i3d.py +109 -0
- biotite/structure/alphabet/layers.py +86 -0
- biotite/structure/alphabet/pb.license +21 -0
- biotite/structure/alphabet/pb.py +170 -0
- biotite/structure/alphabet/unkerasify.py +128 -0
- biotite/structure/atoms.py +163 -66
- biotite/structure/basepairs.py +26 -26
- biotite/structure/bonds.cp311-win_amd64.pyd +0 -0
- biotite/structure/bonds.pyx +79 -25
- biotite/structure/box.py +19 -21
- biotite/structure/celllist.cp311-win_amd64.pyd +0 -0
- biotite/structure/celllist.pyx +83 -67
- biotite/structure/chains.py +5 -37
- biotite/structure/charges.cp311-win_amd64.pyd +0 -0
- biotite/structure/compare.py +420 -13
- biotite/structure/density.py +1 -1
- biotite/structure/dotbracket.py +27 -28
- biotite/structure/filter.py +8 -8
- biotite/structure/geometry.py +74 -127
- biotite/structure/hbond.py +17 -19
- biotite/structure/info/__init__.py +1 -0
- biotite/structure/info/atoms.py +24 -15
- biotite/structure/info/bonds.py +12 -6
- biotite/structure/info/ccd.py +125 -34
- biotite/structure/info/{ccd/components.bcif → components.bcif} +0 -0
- biotite/structure/info/groups.py +62 -19
- biotite/structure/info/masses.py +9 -6
- biotite/structure/info/misc.py +15 -22
- biotite/structure/info/radii.py +92 -22
- biotite/structure/info/standardize.py +4 -4
- biotite/structure/integrity.py +4 -6
- biotite/structure/io/general.py +2 -2
- biotite/structure/io/gro/file.py +8 -9
- biotite/structure/io/mol/convert.py +1 -1
- biotite/structure/io/mol/ctab.py +33 -28
- biotite/structure/io/mol/mol.py +1 -1
- biotite/structure/io/mol/sdf.py +80 -53
- biotite/structure/io/pdb/convert.py +4 -3
- biotite/structure/io/pdb/file.py +85 -25
- biotite/structure/io/pdb/hybrid36.cp311-win_amd64.pyd +0 -0
- biotite/structure/io/pdbqt/file.py +36 -36
- biotite/structure/io/pdbx/__init__.py +1 -0
- biotite/structure/io/pdbx/bcif.py +54 -15
- biotite/structure/io/pdbx/cif.py +92 -66
- biotite/structure/io/pdbx/component.py +15 -4
- biotite/structure/io/pdbx/compress.py +321 -0
- biotite/structure/io/pdbx/convert.py +410 -75
- biotite/structure/io/pdbx/encoding.cp311-win_amd64.pyd +0 -0
- biotite/structure/io/pdbx/encoding.pyx +98 -17
- biotite/structure/io/trajfile.py +9 -6
- biotite/structure/io/util.py +38 -0
- biotite/structure/mechanics.py +0 -1
- biotite/structure/molecules.py +141 -156
- biotite/structure/pseudoknots.py +7 -13
- biotite/structure/repair.py +2 -4
- biotite/structure/residues.py +13 -24
- biotite/structure/rings.py +335 -0
- biotite/structure/sasa.cp311-win_amd64.pyd +0 -0
- biotite/structure/sasa.pyx +2 -1
- biotite/structure/segments.py +69 -11
- biotite/structure/sequence.py +0 -1
- biotite/structure/sse.py +0 -2
- biotite/structure/superimpose.py +74 -62
- biotite/structure/tm.py +581 -0
- biotite/structure/transform.py +12 -25
- biotite/structure/util.py +76 -4
- biotite/version.py +9 -4
- biotite/visualize.py +111 -1
- {biotite-1.0.1.dist-info → biotite-1.2.0.dist-info}/METADATA +6 -2
- {biotite-1.0.1.dist-info → biotite-1.2.0.dist-info}/RECORD +173 -143
- biotite/structure/info/ccd/README.rst +0 -8
- biotite/structure/info/ccd/amino_acids.txt +0 -1663
- biotite/structure/info/ccd/carbohydrates.txt +0 -1135
- biotite/structure/info/ccd/nucleotides.txt +0 -798
- {biotite-1.0.1.dist-info → biotite-1.2.0.dist-info}/WHEEL +0 -0
- {biotite-1.0.1.dist-info → biotite-1.2.0.dist-info}/licenses/LICENSE.rst +0 -0
|
@@ -94,27 +94,32 @@ def get_color_scheme(name, alphabet, default="#FFFFFF"):
|
|
|
94
94
|
>>> print(color_scheme)
|
|
95
95
|
['#3737f5', '#37f537', '#f5f537', '#f53737']
|
|
96
96
|
"""
|
|
97
|
+
# Try exact alphabet match first
|
|
98
|
+
for scheme in _color_schemes:
|
|
99
|
+
if scheme["name"] == name and scheme["alphabet"] == alphabet:
|
|
100
|
+
return _fit_color_scheme(alphabet, scheme, default)
|
|
101
|
+
# If no exact match was found, try to find a scheme for an alphabet
|
|
102
|
+
# that extends the given alphabet
|
|
97
103
|
for scheme in _color_schemes:
|
|
98
104
|
if scheme["name"] == name and scheme["alphabet"].extends(alphabet):
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
colors = [color if color is not None else default for color in colors]
|
|
102
|
-
# Only return colors that are in scope of this alphabet
|
|
103
|
-
# and not the extended alphabet
|
|
104
|
-
return colors[: len(alphabet)]
|
|
105
|
+
return _fit_color_scheme(alphabet, scheme, default)
|
|
106
|
+
|
|
105
107
|
raise ValueError(f"Unkown scheme '{name}' for given alphabet")
|
|
106
108
|
|
|
107
109
|
|
|
108
|
-
def list_color_scheme_names(alphabet):
|
|
110
|
+
def list_color_scheme_names(alphabet, strict=False):
|
|
109
111
|
"""
|
|
110
112
|
Get a list of available color scheme names for a given alphabet.
|
|
111
113
|
|
|
112
114
|
Parameters
|
|
113
115
|
----------
|
|
114
116
|
alphabet : Alphabet
|
|
115
|
-
The
|
|
116
|
-
|
|
117
|
-
to
|
|
117
|
+
The alphabet to get the color scheme names for.
|
|
118
|
+
strict : bool, optional
|
|
119
|
+
If set to true, only schemes with an exact match to the given
|
|
120
|
+
alphabet are included in the list.
|
|
121
|
+
If set to false, schemes with an alphabet that extends the given
|
|
122
|
+
alphabet are also included.
|
|
118
123
|
|
|
119
124
|
Returns
|
|
120
125
|
-------
|
|
@@ -123,7 +128,9 @@ def list_color_scheme_names(alphabet):
|
|
|
123
128
|
"""
|
|
124
129
|
scheme_list = []
|
|
125
130
|
for scheme in _color_schemes:
|
|
126
|
-
if scheme["alphabet"]
|
|
131
|
+
if strict and scheme["alphabet"] == alphabet:
|
|
132
|
+
scheme_list.append(scheme["name"])
|
|
133
|
+
if not strict and scheme["alphabet"].extends(alphabet):
|
|
127
134
|
scheme_list.append(scheme["name"])
|
|
128
135
|
return scheme_list
|
|
129
136
|
|
|
@@ -135,3 +142,29 @@ _color_schemes = []
|
|
|
135
142
|
for file_name in glob.glob(_scheme_dir + os.sep + "*.json"):
|
|
136
143
|
scheme = load_color_scheme(file_name)
|
|
137
144
|
_color_schemes.append(scheme)
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def _fit_color_scheme(alphabet, color_scheme, default_color):
|
|
148
|
+
"""
|
|
149
|
+
Fit a color scheme to the given alphabet.
|
|
150
|
+
|
|
151
|
+
Parameters
|
|
152
|
+
----------
|
|
153
|
+
alphabet : Alphabet
|
|
154
|
+
The alphabet to get the color scheme for.
|
|
155
|
+
color_scheme : dict
|
|
156
|
+
The color scheme.
|
|
157
|
+
default_color : str or tuple
|
|
158
|
+
The default color.
|
|
159
|
+
|
|
160
|
+
Returns
|
|
161
|
+
-------
|
|
162
|
+
scheme : list of str
|
|
163
|
+
The colors from the scheme.
|
|
164
|
+
"""
|
|
165
|
+
colors = color_scheme["colors"]
|
|
166
|
+
# Replace None values with default color
|
|
167
|
+
colors = [color if color is not None else default_color for color in colors]
|
|
168
|
+
# Only return colors that are in scope of this alphabet
|
|
169
|
+
# and not the extended alphabet
|
|
170
|
+
return colors[: len(alphabet)]
|
|
@@ -25,8 +25,10 @@ def plot_dendrogram(
|
|
|
25
25
|
|
|
26
26
|
Parameters
|
|
27
27
|
----------
|
|
28
|
+
axes : Axes
|
|
29
|
+
A *Matplotlib* axes, that is used as plotting area.
|
|
28
30
|
tree : Tree
|
|
29
|
-
The tree to be visualized
|
|
31
|
+
The tree to be visualized.
|
|
30
32
|
orientation : {'left', 'right', 'bottom', 'top'}, optional
|
|
31
33
|
The position of the root node in the plot
|
|
32
34
|
use_distances : bool, optional
|
|
@@ -38,7 +40,7 @@ def plot_dendrogram(
|
|
|
38
40
|
The label of a leaf node is the entry at the position of its
|
|
39
41
|
`index` attribute.
|
|
40
42
|
label_size : float, optional
|
|
41
|
-
The font size of the labels
|
|
43
|
+
The font size of the labels.
|
|
42
44
|
color : tuple or str, optional
|
|
43
45
|
A *Matplotlib* compatible color, that is used to draw the lines
|
|
44
46
|
of the dendrogram.
|
|
@@ -71,7 +71,7 @@ def plot_feature_map(
|
|
|
71
71
|
If true, the sequence position of the base/residue of a line is
|
|
72
72
|
shown on the right side of the plot.
|
|
73
73
|
number_size : float, optional
|
|
74
|
-
The font size of the position numbers
|
|
74
|
+
The font size of the position numbers.
|
|
75
75
|
line_width : float, optional
|
|
76
76
|
The size of the continuous line as fraction of the height of
|
|
77
77
|
the drawn features.
|
|
@@ -416,7 +416,7 @@ class PromoterPlotter(FeaturePlotter):
|
|
|
416
416
|
line_width : float, optional
|
|
417
417
|
The width of the curved arrow tail.
|
|
418
418
|
head_width : float, optional
|
|
419
|
-
The width of the arrow head
|
|
419
|
+
The width of the arrow head.
|
|
420
420
|
head_length : float, optional
|
|
421
421
|
The length of the arrow.
|
|
422
422
|
head_height : float, optional
|
|
@@ -9,7 +9,7 @@ __all__ = ["plot_sequence_logo"]
|
|
|
9
9
|
import numpy as np
|
|
10
10
|
from biotite.sequence.alphabet import LetterAlphabet
|
|
11
11
|
from biotite.sequence.graphics.colorschemes import get_color_scheme
|
|
12
|
-
from biotite.visualize import
|
|
12
|
+
from biotite.visualize import plot_scaled_text
|
|
13
13
|
|
|
14
14
|
|
|
15
15
|
def plot_sequence_logo(axes, profile, scheme=None, **kwargs):
|
|
@@ -29,7 +29,7 @@ def plot_sequence_logo(axes, profile, scheme=None, **kwargs):
|
|
|
29
29
|
----------
|
|
30
30
|
axes : Axes
|
|
31
31
|
The axes to draw the logo on.
|
|
32
|
-
profile: SequenceProfile
|
|
32
|
+
profile : SequenceProfile
|
|
33
33
|
The logo is created based on this profile.
|
|
34
34
|
scheme : str or list of (tuple or str)
|
|
35
35
|
Either a valid color scheme name
|
|
@@ -38,7 +38,8 @@ def plot_sequence_logo(axes, profile, scheme=None, **kwargs):
|
|
|
38
38
|
The list length must be at least as long as the
|
|
39
39
|
length of the alphabet used by the `profile`.
|
|
40
40
|
**kwargs
|
|
41
|
-
Additional
|
|
41
|
+
Additional parameters for the :class:`matplotlib.font_manager.FontProperties`
|
|
42
|
+
of the text or the created :class:`matplotlib.patches.PathPatch`.
|
|
42
43
|
|
|
43
44
|
References
|
|
44
45
|
----------
|
|
@@ -69,23 +70,20 @@ def plot_sequence_logo(axes, profile, scheme=None, **kwargs):
|
|
|
69
70
|
index_order = np.argsort(symbols_heights)
|
|
70
71
|
start_height = 0
|
|
71
72
|
for j in index_order[i]:
|
|
72
|
-
# Stack the symbols at position on top of the
|
|
73
|
+
# Stack the symbols at position on top of the preceding one
|
|
73
74
|
height = symbols_heights[i, j]
|
|
74
75
|
if height > 0:
|
|
75
76
|
symbol = alphabet.decode(j)
|
|
76
|
-
|
|
77
|
+
plot_scaled_text(
|
|
78
|
+
axes,
|
|
79
|
+
symbol,
|
|
77
80
|
i + 0.5,
|
|
78
81
|
start_height,
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
va="bottom",
|
|
82
|
+
width=1,
|
|
83
|
+
height=height,
|
|
82
84
|
color=colors[j],
|
|
83
|
-
# Best results are obtained with this font size
|
|
84
|
-
size=1,
|
|
85
85
|
**kwargs,
|
|
86
86
|
)
|
|
87
|
-
text.set_clip_on(True)
|
|
88
|
-
set_font_size_in_coord(text, width=1, height=height)
|
|
89
87
|
start_height += height
|
|
90
88
|
|
|
91
89
|
axes.set_xlim(0.5, len(profile.symbols) + 0.5)
|
|
@@ -275,8 +275,7 @@ def _process_nucleotide_sequence(x):
|
|
|
275
275
|
def _convert_to_string(sequence, as_rna):
|
|
276
276
|
if not isinstance(sequence.get_alphabet(), LetterAlphabet):
|
|
277
277
|
raise ValueError(
|
|
278
|
-
"Only sequences using single letter alphabets "
|
|
279
|
-
"can be stored in a FASTA file"
|
|
278
|
+
"Only sequences using single letter alphabets can be stored in a FASTA file"
|
|
280
279
|
)
|
|
281
280
|
if isinstance(sequence, NucleotideSequence) and as_rna:
|
|
282
281
|
return str(sequence).replace("T", "U")
|
|
@@ -102,7 +102,7 @@ class FastaFile(TextFile, MutableMapping):
|
|
|
102
102
|
if not isinstance(header, str):
|
|
103
103
|
raise IndexError("'FastaFile' only supports header strings as keys")
|
|
104
104
|
if not isinstance(seq_str, str):
|
|
105
|
-
raise TypeError("'FastaFile' only supports sequence strings
|
|
105
|
+
raise TypeError("'FastaFile' only supports sequence strings as values")
|
|
106
106
|
# Create lines for new header and sequence (with line breaks)
|
|
107
107
|
new_lines = [">" + header.replace("\n", "").strip()] + wrap_string(
|
|
108
108
|
seq_str, width=self._chars_per_line
|
|
@@ -302,10 +302,10 @@ class FastqFile(TextFile, MutableMapping):
|
|
|
302
302
|
else: # score_len > seq_len
|
|
303
303
|
raise InvalidFileError(
|
|
304
304
|
f"The amount of scores is not equal to the sequence "
|
|
305
|
-
f"length for the sequence in line {seq_start_i+1} "
|
|
305
|
+
f"length for the sequence in line {seq_start_i + 1} "
|
|
306
306
|
)
|
|
307
307
|
else:
|
|
308
|
-
raise InvalidFileError(f"Line {i+1} in FASTQ file is invalid")
|
|
308
|
+
raise InvalidFileError(f"Line {i + 1} in FASTQ file is invalid")
|
|
309
309
|
# At the end of the file, the last sequence or score block
|
|
310
310
|
# must have properly ended
|
|
311
311
|
if in_sequence or in_scores:
|
|
@@ -392,7 +392,7 @@ class FastqFile(TextFile, MutableMapping):
|
|
|
392
392
|
yield identifier, ("".join(seq_str_list), scores)
|
|
393
393
|
else: # score_len > seq_len
|
|
394
394
|
raise InvalidFileError(
|
|
395
|
-
"The amount of scores is not equal to the sequence
|
|
395
|
+
"The amount of scores is not equal to the sequence length"
|
|
396
396
|
)
|
|
397
397
|
|
|
398
398
|
else:
|
|
@@ -80,7 +80,7 @@ class GenBankFile(TextFile):
|
|
|
80
80
|
>>> print(content)
|
|
81
81
|
['One line', 'A second line']
|
|
82
82
|
>>> print(subfields)
|
|
83
|
-
OrderedDict(
|
|
83
|
+
OrderedDict({'SUBFIELD1': ['Single Line'], 'SUBFIELD2': ['Two', 'lines']})
|
|
84
84
|
|
|
85
85
|
Adding an additional field:
|
|
86
86
|
|
|
@@ -391,7 +391,7 @@ class GenBankFile(TextFile):
|
|
|
391
391
|
The field name.
|
|
392
392
|
content : list of str
|
|
393
393
|
The content lines.
|
|
394
|
-
|
|
394
|
+
subfields : dict of str -> str, optional
|
|
395
395
|
The subfields of the field.
|
|
396
396
|
The dictionary maps subfield names to the content lines of
|
|
397
397
|
the respective subfield.
|
|
@@ -432,7 +432,7 @@ class GenBankFile(TextFile):
|
|
|
432
432
|
The field name.
|
|
433
433
|
content : list of str
|
|
434
434
|
The content lines.
|
|
435
|
-
|
|
435
|
+
subfields : dict of str -> str, optional
|
|
436
436
|
The subfields of the field.
|
|
437
437
|
The dictionary maps subfield names to the content lines of
|
|
438
438
|
the respective subfield.
|
|
@@ -82,6 +82,8 @@ def get_annotated_sequence(gb_file, format="gb", include_only=None):
|
|
|
82
82
|
----------
|
|
83
83
|
gb_file : GenBankFile
|
|
84
84
|
The GenBank file to read the fields from.
|
|
85
|
+
format : {'gb', 'gp'}
|
|
86
|
+
Whether the file is a *GenBank* or *GenPept* file.
|
|
85
87
|
include_only : iterable object of str, optional
|
|
86
88
|
List of names of feature keys, which should be included
|
|
87
89
|
in the annotation. By default all features are included.
|
|
@@ -84,7 +84,7 @@ def set_annotation(gff_file, annotation, seqid=None, source=None, is_stranded=Tr
|
|
|
84
84
|
for feature in sorted(annotation):
|
|
85
85
|
if len(feature.locs) > 1 and "ID" not in feature.qual:
|
|
86
86
|
raise ValueError(
|
|
87
|
-
"The 'Id' qualifier is required
|
|
87
|
+
"The 'Id' qualifier is required for features with multiple locations"
|
|
88
88
|
)
|
|
89
89
|
## seqid ##
|
|
90
90
|
if seqid is not None and " " in seqid:
|
biotite/sequence/io/gff/file.py
CHANGED
|
@@ -303,8 +303,7 @@ class GFFFile(TextFile):
|
|
|
303
303
|
def __getitem__(self, index):
|
|
304
304
|
if (index >= 0 and index >= len(self)) or (index < 0 and -index > len(self)):
|
|
305
305
|
raise IndexError(
|
|
306
|
-
f"Index {index} is out of range for GFFFile with "
|
|
307
|
-
f"{len(self)} entries"
|
|
306
|
+
f"Index {index} is out of range for GFFFile with {len(self)} entries"
|
|
308
307
|
)
|
|
309
308
|
|
|
310
309
|
line_index = self._entries[index]
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
biotite/sequence/profile.py
CHANGED
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
# information.
|
|
4
4
|
|
|
5
5
|
import warnings
|
|
6
|
+
from numbers import Integral
|
|
6
7
|
import numpy as np
|
|
7
8
|
from biotite.sequence.align.alignment import get_codes
|
|
8
9
|
from biotite.sequence.alphabet import LetterAlphabet
|
|
@@ -66,6 +67,9 @@ class SequenceProfile(object):
|
|
|
66
67
|
It also saves the number of gaps at each position in the array
|
|
67
68
|
'gaps'.
|
|
68
69
|
|
|
70
|
+
With :meth:`from_alignment()` a :class:`SequenceProfile` object can
|
|
71
|
+
be created from an indefinite number of aligned sequences.
|
|
72
|
+
|
|
69
73
|
With :meth:`probability_matrix()` the position probability matrix
|
|
70
74
|
can be created based on 'symbols' and a pseudocount.
|
|
71
75
|
|
|
@@ -73,9 +77,6 @@ class SequenceProfile(object):
|
|
|
73
77
|
be created based on the before calculated position probability
|
|
74
78
|
matrix and the background frequencies.
|
|
75
79
|
|
|
76
|
-
With :meth:`from_alignment()` a :class:`SequenceProfile` object can
|
|
77
|
-
be created from an indefinite number of aligned sequences.
|
|
78
|
-
|
|
79
80
|
With :meth:`sequence_probability_from_matrix()` the probability of a
|
|
80
81
|
sequence can be calculated based on the before calculated position
|
|
81
82
|
probability matrix of this instance of object SequenceProfile.
|
|
@@ -94,7 +95,7 @@ class SequenceProfile(object):
|
|
|
94
95
|
gaps : ndarray, dtype=int, shape=n
|
|
95
96
|
Array which indicates the number of gaps at each position.
|
|
96
97
|
alphabet : Alphabet, length=k
|
|
97
|
-
Alphabet of sequences of sequence profile
|
|
98
|
+
Alphabet of sequences of sequence profile.
|
|
98
99
|
|
|
99
100
|
Attributes
|
|
100
101
|
----------
|
|
@@ -105,6 +106,63 @@ class SequenceProfile(object):
|
|
|
105
106
|
Array which indicates the number of gaps at each position.
|
|
106
107
|
alphabet : Alphabet, length=k
|
|
107
108
|
Alphabet of sequences of sequence profile
|
|
109
|
+
|
|
110
|
+
Examples
|
|
111
|
+
--------
|
|
112
|
+
|
|
113
|
+
Create a profile from a multiple sequence alignment:
|
|
114
|
+
|
|
115
|
+
>>> sequences = [
|
|
116
|
+
... NucleotideSequence("CGCTCATTC"),
|
|
117
|
+
... NucleotideSequence("CGCTATTC"),
|
|
118
|
+
... NucleotideSequence("CCCTCAATC"),
|
|
119
|
+
... ]
|
|
120
|
+
>>> msa, _, _, _ = align_multiple(
|
|
121
|
+
... sequences, SubstitutionMatrix.std_nucleotide_matrix(), gap_penalty=-5
|
|
122
|
+
... )
|
|
123
|
+
>>> print(msa)
|
|
124
|
+
CGCTCATTC
|
|
125
|
+
CGCT-ATTC
|
|
126
|
+
CCCTCAATC
|
|
127
|
+
>>> profile = SequenceProfile.from_alignment(msa)
|
|
128
|
+
>>> print(profile)
|
|
129
|
+
A C G T
|
|
130
|
+
0 0 3 0 0
|
|
131
|
+
1 0 1 2 0
|
|
132
|
+
2 0 3 0 0
|
|
133
|
+
3 0 0 0 3
|
|
134
|
+
4 0 2 0 0
|
|
135
|
+
5 3 0 0 0
|
|
136
|
+
6 1 0 0 2
|
|
137
|
+
7 0 0 0 3
|
|
138
|
+
8 0 3 0 0
|
|
139
|
+
>>> print(profile.gaps)
|
|
140
|
+
[0 0 0 0 1 0 0 0 0]
|
|
141
|
+
|
|
142
|
+
Slice the profile (masks and index arrays are also supported):
|
|
143
|
+
|
|
144
|
+
>>> print(profile[2:])
|
|
145
|
+
A C G T
|
|
146
|
+
0 0 3 0 0
|
|
147
|
+
1 0 0 0 3
|
|
148
|
+
2 0 2 0 0
|
|
149
|
+
3 3 0 0 0
|
|
150
|
+
4 1 0 0 2
|
|
151
|
+
5 0 0 0 3
|
|
152
|
+
6 0 3 0 0
|
|
153
|
+
|
|
154
|
+
Use the profile to compute the position probability matrix:
|
|
155
|
+
|
|
156
|
+
>>> print(profile.probability_matrix())
|
|
157
|
+
[[0.000 1.000 0.000 0.000]
|
|
158
|
+
[0.000 0.333 0.667 0.000]
|
|
159
|
+
[0.000 1.000 0.000 0.000]
|
|
160
|
+
[0.000 0.000 0.000 1.000]
|
|
161
|
+
[0.000 1.000 0.000 0.000]
|
|
162
|
+
[1.000 0.000 0.000 0.000]
|
|
163
|
+
[0.333 0.000 0.000 0.667]
|
|
164
|
+
[0.000 0.000 0.000 1.000]
|
|
165
|
+
[0.000 1.000 0.000 0.000]]
|
|
108
166
|
"""
|
|
109
167
|
|
|
110
168
|
def __init__(self, symbols, gaps, alphabet):
|
|
@@ -156,8 +214,23 @@ class SequenceProfile(object):
|
|
|
156
214
|
)
|
|
157
215
|
self._gaps = new_gaps
|
|
158
216
|
|
|
217
|
+
def __str__(self):
|
|
218
|
+
# Add an additional row and column for the position and symbol indicators
|
|
219
|
+
print_matrix = np.full(
|
|
220
|
+
(self.symbols.shape[0] + 1, self.symbols.shape[1] + 1), "", dtype=object
|
|
221
|
+
)
|
|
222
|
+
print_matrix[1:, 1:] = self.symbols.astype(str)
|
|
223
|
+
print_matrix[0, 1:] = [str(sym) for sym in self.alphabet]
|
|
224
|
+
print_matrix[1:, 0] = [str(i) for i in range(self.symbols.shape[0])]
|
|
225
|
+
max_len = len(max(print_matrix.flatten(), key=len))
|
|
226
|
+
return "\n".join(
|
|
227
|
+
[
|
|
228
|
+
" ".join([str(cell).rjust(max_len) for cell in row])
|
|
229
|
+
for row in print_matrix
|
|
230
|
+
]
|
|
231
|
+
)
|
|
232
|
+
|
|
159
233
|
def __repr__(self):
|
|
160
|
-
"""Represent SequenceProfile as a string for debugging."""
|
|
161
234
|
return (
|
|
162
235
|
f"SequenceProfile(np.{np.array_repr(self.symbols)}, "
|
|
163
236
|
f"np.{np.array_repr(self.gaps)}, Alphabet({self.alphabet}))"
|
|
@@ -191,15 +264,14 @@ class SequenceProfile(object):
|
|
|
191
264
|
alphabet : Alphabet, optional
|
|
192
265
|
This alphabet will be used when creating the SequenceProfile
|
|
193
266
|
object. If no alphabet is selected, the alphabet for this
|
|
194
|
-
SequenceProfile
|
|
267
|
+
:class:`SequenceProfile`
|
|
195
268
|
object will be calculated from the sequences of object
|
|
196
269
|
Alignment.
|
|
197
|
-
(Default: None).
|
|
198
270
|
|
|
199
271
|
Returns
|
|
200
272
|
-------
|
|
201
273
|
profile: SequenceProfile
|
|
202
|
-
The created SequenceProfile object
|
|
274
|
+
The created :class:`SequenceProfile` object.
|
|
203
275
|
"""
|
|
204
276
|
sequences = get_codes(alignment)
|
|
205
277
|
if alphabet is None:
|
|
@@ -233,13 +305,12 @@ class SequenceProfile(object):
|
|
|
233
305
|
If true, returns consensus sequence as GeneralSequence
|
|
234
306
|
object.
|
|
235
307
|
Otherwise, the consensus sequence object type is chosen
|
|
236
|
-
based on the alphabet of this SequenceProfile object
|
|
237
|
-
(Default: False).
|
|
308
|
+
based on the alphabet of this SequenceProfile object.
|
|
238
309
|
|
|
239
310
|
Returns
|
|
240
311
|
-------
|
|
241
312
|
consensus: Sequence
|
|
242
|
-
The calculated consensus sequence
|
|
313
|
+
The calculated consensus sequence.
|
|
243
314
|
"""
|
|
244
315
|
# https://en.wikipedia.org/wiki/International_Union_of_Pure_and_Applied_Chemistry#Amino_acid_and_nucleotide_base_codes
|
|
245
316
|
if as_general:
|
|
@@ -347,14 +418,13 @@ class SequenceProfile(object):
|
|
|
347
418
|
|
|
348
419
|
Parameters
|
|
349
420
|
----------
|
|
350
|
-
pseudocount: int, optional
|
|
421
|
+
pseudocount : int, optional
|
|
351
422
|
Amount added to the number of observed cases in order to
|
|
352
423
|
change the expected probability of the PPM.
|
|
353
|
-
(Default: 0)
|
|
354
424
|
|
|
355
425
|
Returns
|
|
356
426
|
-------
|
|
357
|
-
ppm: ndarray, dtype=float, shape=(n,k)
|
|
427
|
+
ppm : ndarray, dtype=float, shape=(n,k)
|
|
358
428
|
The calculated position probability matrix.
|
|
359
429
|
"""
|
|
360
430
|
if pseudocount < 0:
|
|
@@ -383,17 +453,16 @@ class SequenceProfile(object):
|
|
|
383
453
|
|
|
384
454
|
Parameters
|
|
385
455
|
----------
|
|
386
|
-
|
|
387
|
-
Amount added to the number of observed cases in order to change
|
|
388
|
-
the expected probability of the PPM.
|
|
389
|
-
(Default: 0)
|
|
390
|
-
background_frequencies: ndarray, shape=(k,), dtype=float, optional
|
|
456
|
+
background_frequencies : ndarray, shape=(k,), dtype=float, optional
|
|
391
457
|
The background frequencies for each symbol in the alphabet.
|
|
392
458
|
By default, a uniform distribution is assumed.
|
|
459
|
+
pseudocount : int, optional
|
|
460
|
+
Amount added to the number of observed cases in order to change
|
|
461
|
+
the expected probability of the PPM.
|
|
393
462
|
|
|
394
463
|
Returns
|
|
395
464
|
-------
|
|
396
|
-
pwm: ndarray, dtype=float, shape=(n,k)
|
|
465
|
+
pwm : ndarray, dtype=float, shape=(n,k)
|
|
397
466
|
The calculated position weight matrix.
|
|
398
467
|
"""
|
|
399
468
|
if background_frequencies is None:
|
|
@@ -417,14 +486,13 @@ class SequenceProfile(object):
|
|
|
417
486
|
----------
|
|
418
487
|
sequence : Sequence
|
|
419
488
|
The input sequence.
|
|
420
|
-
pseudocount: int, optional
|
|
489
|
+
pseudocount : int, optional
|
|
421
490
|
Amount added to the number of observed cases in order to change
|
|
422
491
|
the expected probability of the PPM.
|
|
423
|
-
(Default: 0)
|
|
424
492
|
|
|
425
493
|
Returns
|
|
426
494
|
-------
|
|
427
|
-
probability: float
|
|
495
|
+
probability : float
|
|
428
496
|
The calculated probability for the input sequence based on
|
|
429
497
|
the PPM.
|
|
430
498
|
"""
|
|
@@ -453,17 +521,16 @@ class SequenceProfile(object):
|
|
|
453
521
|
----------
|
|
454
522
|
sequence : Sequence
|
|
455
523
|
The input sequence.
|
|
456
|
-
|
|
457
|
-
Amount added to the number of observed cases in order to change
|
|
458
|
-
the expected probability of the PPM.
|
|
459
|
-
(Default: 0)
|
|
460
|
-
background_frequencies: ndarray, shape=(k,), dtype=float, optional
|
|
524
|
+
background_frequencies : ndarray, shape=(k,), dtype=float, optional
|
|
461
525
|
The background frequencies for each symbol in the alphabet.
|
|
462
526
|
By default a uniform distribution is assumed.
|
|
527
|
+
pseudocount : int, optional
|
|
528
|
+
Amount added to the number of observed cases in order to change
|
|
529
|
+
the expected probability of the PPM.
|
|
463
530
|
|
|
464
531
|
Returns
|
|
465
532
|
-------
|
|
466
|
-
score: float
|
|
533
|
+
score : float
|
|
467
534
|
The calculated score for the input sequence based on
|
|
468
535
|
the PWM.
|
|
469
536
|
"""
|
|
@@ -483,3 +550,12 @@ class SequenceProfile(object):
|
|
|
483
550
|
f"as 'symbols' {self.symbols.shape}"
|
|
484
551
|
)
|
|
485
552
|
return np.sum(pwm[np.arange(len(sequence)), sequence.code])
|
|
553
|
+
|
|
554
|
+
def __getitem__(self, index):
|
|
555
|
+
if isinstance(index, Integral):
|
|
556
|
+
# Do not allow to collapse dimensions
|
|
557
|
+
index = slice(index, index + 1)
|
|
558
|
+
return SequenceProfile(self.symbols[index], self.gaps[index], self.alphabet)
|
|
559
|
+
|
|
560
|
+
def __len__(self):
|
|
561
|
+
return len(self.symbols)
|
biotite/sequence/search.py
CHANGED
|
@@ -39,7 +39,6 @@ def find_subsequence(sequence, query):
|
|
|
39
39
|
>>> sub_seq = NucleotideSequence("TGA")
|
|
40
40
|
>>> print(find_subsequence(main_seq, sub_seq))
|
|
41
41
|
[2 6]
|
|
42
|
-
|
|
43
42
|
"""
|
|
44
43
|
if not sequence.get_alphabet().extends(query.get_alphabet()):
|
|
45
44
|
raise ValueError("The sequences alphabets are not equal")
|