biotite 0.41.1__cp311-cp311-win_amd64.whl → 0.41.2__cp311-cp311-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biotite might be problematic. Click here for more details.
- biotite/application/__init__.py +35 -9
- biotite/application/application.py +2 -1
- biotite/sequence/__init__.py +13 -2
- biotite/sequence/align/__init__.py +158 -4
- biotite/sequence/align/banded.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/kmeralphabet.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/kmersimilarity.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/kmertable.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/localgapped.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/localungapped.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/multiple.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/multiple.pyx +34 -34
- biotite/sequence/align/pairwise.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/permutation.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/selector.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/tracetable.cp311-win_amd64.pyd +0 -0
- biotite/sequence/alphabet.py +63 -63
- biotite/sequence/codec.cp311-win_amd64.pyd +0 -0
- biotite/sequence/phylo/nj.cp311-win_amd64.pyd +0 -0
- biotite/sequence/phylo/tree.cp311-win_amd64.pyd +0 -0
- biotite/sequence/phylo/upgma.cp311-win_amd64.pyd +0 -0
- biotite/sequence/sequence.py +52 -50
- biotite/structure/atoms.py +8 -8
- biotite/structure/bonds.cp311-win_amd64.pyd +0 -0
- biotite/structure/bonds.pyx +59 -68
- biotite/structure/celllist.cp311-win_amd64.pyd +0 -0
- biotite/structure/charges.cp311-win_amd64.pyd +0 -0
- biotite/structure/info/ccd.py +17 -2
- biotite/structure/info/groups.py +9 -12
- biotite/structure/io/mmtf/convertarray.cp311-win_amd64.pyd +0 -0
- biotite/structure/io/mmtf/convertfile.cp311-win_amd64.pyd +0 -0
- biotite/structure/io/mmtf/decode.cp311-win_amd64.pyd +0 -0
- biotite/structure/io/mmtf/encode.cp311-win_amd64.pyd +0 -0
- biotite/structure/io/pdb/hybrid36.cp311-win_amd64.pyd +0 -0
- biotite/structure/io/pdbx/bcif.py +0 -8
- biotite/structure/io/pdbx/encoding.cp311-win_amd64.pyd +0 -0
- biotite/structure/sasa.cp311-win_amd64.pyd +0 -0
- biotite/version.py +2 -2
- {biotite-0.41.1.dist-info → biotite-0.41.2.dist-info}/METADATA +2 -2
- {biotite-0.41.1.dist-info → biotite-0.41.2.dist-info}/RECORD +42 -42
- {biotite-0.41.1.dist-info → biotite-0.41.2.dist-info}/WHEEL +1 -1
- {biotite-0.41.1.dist-info → biotite-0.41.2.dist-info}/licenses/LICENSE.rst +0 -0
biotite/application/__init__.py
CHANGED
|
@@ -19,17 +19,43 @@ These programs are not shipped with the *Biotite* package.
|
|
|
19
19
|
|
|
20
20
|
Each application is represented by its respective :class:`Application`
|
|
21
21
|
class.
|
|
22
|
-
:class:`Application`
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
:class:`AppState
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
22
|
+
Each :class:`Application` instance has a life cycle, starting with its
|
|
23
|
+
creation and ending with the result extraction.
|
|
24
|
+
Each state in this life cycle is described by the value of the
|
|
25
|
+
*enum* :class:`AppState`, that each :class:`Application` contains:
|
|
26
|
+
Directly after its instantiation the app is in the ``CREATED`` state.
|
|
27
|
+
In this state further parameters can be set for the application run.
|
|
28
|
+
After the user calls the :func:`Application.start()` method, the app
|
|
29
|
+
state is set to ``RUNNING`` and the app performs the calculations.
|
|
30
|
+
When the application finishes the AppState
|
|
31
|
+
changes to ``FINISHED``.
|
|
32
|
+
The user can now call the :func:`Application.join()` method, concluding
|
|
33
|
+
the application in the ``JOINED`` state and making the results of the
|
|
34
|
+
application accessible.
|
|
35
|
+
Furthermore, this may trigger cleanup actions in some applications.
|
|
36
|
+
:func:`Application.join()` can even be called in the ``RUNNING`` state:
|
|
37
|
+
This will constantly check if the application has finished and will
|
|
38
|
+
directly go into the ``JOINED`` state as soon as the application reaches
|
|
39
|
+
the ``FINISHED`` state.
|
|
40
|
+
Calling the :func:`Application.cancel()` method while the application is
|
|
41
|
+
``RUNNING`` or ``FINISHED`` leaves the application in the ``CANCELLED``
|
|
42
|
+
state.
|
|
43
|
+
This triggers cleanup, too, but there are no accessible results.
|
|
44
|
+
If a method is called in an unsuitable app state, an
|
|
45
|
+
:class:`AppStateError` is raised.
|
|
46
|
+
At each state in the life cycle, :class:`Application` type specific
|
|
47
|
+
methods are called, as shown in the following diagram.
|
|
48
|
+
|
|
49
|
+
.. figure:: /static/assets/figures/app_lifecycle.png
|
|
50
|
+
:alt: Application life cycle
|
|
51
|
+
:scale: 50%
|
|
52
|
+
|
|
53
|
+
Taken from
|
|
54
|
+
`Kunzmann & Hamacher 2018 <https://doi.org/10.1186/s12859-018-2367-z>`_
|
|
55
|
+
licensed under `CC BY 4.0 <https://creativecommons.org/licenses/by/4.0/>`_.
|
|
30
56
|
|
|
31
57
|
The execution of an :class:`Application` can run in parallel:
|
|
32
|
-
|
|
58
|
+
The time between starting the run and collecting the results can be
|
|
33
59
|
used to run other code, similar to the *Python* :class:`Thread` or
|
|
34
60
|
:class:`Process` classes.
|
|
35
61
|
"""
|
|
@@ -74,7 +74,8 @@ class Application(metaclass=abc.ABCMeta):
|
|
|
74
74
|
|
|
75
75
|
Every :class:`Application` runs through different app states
|
|
76
76
|
(instances of enum :class:`AppState`) from its creation until its
|
|
77
|
-
termination
|
|
77
|
+
termination.
|
|
78
|
+
|
|
78
79
|
Directly after its instantiation the app is in the *CREATED* state.
|
|
79
80
|
In this state further parameters can be set for the application run.
|
|
80
81
|
After the user calls the :func:`start()` method, the app state is
|
biotite/sequence/__init__.py
CHANGED
|
@@ -24,7 +24,15 @@ For example, ``'A'``, ``'C'``, ``'G'`` and ``'T'`` would be encoded into
|
|
|
24
24
|
These integer values are called *symbol code*, the encoding of an entire
|
|
25
25
|
sequence of symbols is called *sequence code*.
|
|
26
26
|
|
|
27
|
-
|
|
27
|
+
.. figure:: /static/assets/figures/symbol_encoding.png
|
|
28
|
+
:alt: Symbol encoding in Biotite
|
|
29
|
+
:scale: 50%
|
|
30
|
+
|
|
31
|
+
Taken from
|
|
32
|
+
`Kunzmann & Hamacher 2018 <https://doi.org/10.1186/s12859-018-2367-z>`_
|
|
33
|
+
licensed under `CC BY 4.0 <https://creativecommons.org/licenses/by/4.0/>`_.
|
|
34
|
+
|
|
35
|
+
The size of the symbol code type in the array is determined by the
|
|
28
36
|
size of the :class:`Alphabet`:
|
|
29
37
|
If the :class:`Alphabet` contains 256 symbols or less, one byte is used
|
|
30
38
|
per array element, between 257 and 65536 symbols, two bytes are used,
|
|
@@ -41,6 +49,7 @@ This approach has multiple advantages:
|
|
|
41
49
|
indifferent to the actual type of sequence.
|
|
42
50
|
- Symbol codes are directly indices for substitution matrices in
|
|
43
51
|
alignments
|
|
52
|
+
- *k-mers* can be computed fast
|
|
44
53
|
|
|
45
54
|
The abstract :class:`Sequence` superclass cannot be instantiated
|
|
46
55
|
directly, as it does not define an :class:`Alphabet` by itself.
|
|
@@ -55,10 +64,12 @@ The class :class:`GeneralSequence` allows the usage of a custom
|
|
|
55
64
|
Additionally, this subpackage provides support for sequence features,
|
|
56
65
|
as used in e.g. GenBank or GFF files.
|
|
57
66
|
A :class:`Feature` stores its key name, its qualifiers and locations.
|
|
58
|
-
An :class:`Annotation` is a group of multiple :class:`
|
|
67
|
+
An :class:`Annotation` is a group of multiple :class:`Feature` objects
|
|
59
68
|
and offers convenient location based indexing.
|
|
60
69
|
An :class:`AnnotatedSequence` combines an :class:`Annotation` and a
|
|
61
70
|
:class:`Sequence`.
|
|
71
|
+
|
|
72
|
+
Sequence profiles can be created with the :class:`SequenceProfile` class.
|
|
62
73
|
"""
|
|
63
74
|
|
|
64
75
|
__name__ = "biotite.sequence"
|
|
@@ -22,11 +22,165 @@ These objects contain the original sequences and a trace, that describe
|
|
|
22
22
|
which positions (indices) in the sequences are aligned.
|
|
23
23
|
Optionally they also contain the similarity score.
|
|
24
24
|
|
|
25
|
-
The aligning functions
|
|
26
|
-
|
|
25
|
+
The aligning functions :func:`align_optimal()` and
|
|
26
|
+
:func:`align_multiple()` cover most use cases for pairwise and multiple
|
|
27
|
+
sequence alignments respectively.
|
|
28
|
+
|
|
29
|
+
However, *Biotite* also provides a modular system to build performant
|
|
30
|
+
heuristic alignment search methods, e.g. for finding homologies in a sequence
|
|
31
|
+
database or mapping reads to a genome.
|
|
32
|
+
The table below summarizes those provided functionalities.
|
|
33
|
+
The typical stages in alignment search, where those functionalities are used,
|
|
34
|
+
are arranged from top to bottom.
|
|
35
|
+
|
|
36
|
+
.. grid::
|
|
37
|
+
:gutter: 0
|
|
38
|
+
:class-container: sd-text-center
|
|
39
|
+
|
|
40
|
+
.. grid-item::
|
|
41
|
+
:padding: 2
|
|
42
|
+
:outline:
|
|
43
|
+
:columns: 3
|
|
44
|
+
|
|
45
|
+
**Entire k-mer set**
|
|
46
|
+
|
|
47
|
+
.. grid-item::
|
|
48
|
+
:padding: 2
|
|
49
|
+
:outline:
|
|
50
|
+
:columns: 9
|
|
51
|
+
|
|
52
|
+
.. grid::
|
|
53
|
+
:margin: 0
|
|
54
|
+
|
|
55
|
+
.. grid-item::
|
|
56
|
+
:padding: 2
|
|
57
|
+
:columns: 12
|
|
58
|
+
|
|
59
|
+
**k-mer subset selection**
|
|
60
|
+
|
|
61
|
+
.. grid-item::
|
|
62
|
+
:padding: 2
|
|
63
|
+
:columns: 4
|
|
64
|
+
|
|
65
|
+
Minimizers
|
|
66
|
+
|
|
67
|
+
:class:`MinimizerSelector`
|
|
68
|
+
|
|
69
|
+
.. grid-item::
|
|
70
|
+
:padding: 2
|
|
71
|
+
:columns: 4
|
|
72
|
+
|
|
73
|
+
Syncmers
|
|
74
|
+
|
|
75
|
+
:class:`SyncmerSelector`
|
|
76
|
+
|
|
77
|
+
:class:`CachedSyncmerSelector`
|
|
78
|
+
|
|
79
|
+
.. grid-item::
|
|
80
|
+
:padding: 2
|
|
81
|
+
:columns: 4
|
|
82
|
+
|
|
83
|
+
Mincode
|
|
84
|
+
|
|
85
|
+
:class:`MincodeSelector`
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
.. grid-item::
|
|
89
|
+
:padding: 2
|
|
90
|
+
:outline:
|
|
91
|
+
:columns: 12
|
|
92
|
+
|
|
93
|
+
.. grid::
|
|
94
|
+
:margin: 0
|
|
95
|
+
|
|
96
|
+
.. grid-item::
|
|
97
|
+
:padding: 2
|
|
98
|
+
:columns: 12
|
|
99
|
+
|
|
100
|
+
**k-mer indexing and matching**
|
|
101
|
+
|
|
102
|
+
.. grid-item::
|
|
103
|
+
:padding: 2
|
|
104
|
+
:columns: 6
|
|
105
|
+
|
|
106
|
+
Perfect hashing
|
|
107
|
+
|
|
108
|
+
:class:`KmerTable`
|
|
109
|
+
|
|
110
|
+
.. grid-item::
|
|
111
|
+
:padding: 2
|
|
112
|
+
:columns: 6
|
|
113
|
+
|
|
114
|
+
Space-efficient hashing
|
|
115
|
+
|
|
116
|
+
:class:`BucketKmerTable`
|
|
117
|
+
|
|
118
|
+
:func:`bucket_number()`
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
.. grid-item::
|
|
122
|
+
:padding: 2
|
|
123
|
+
:outline:
|
|
124
|
+
:columns: 12
|
|
125
|
+
|
|
126
|
+
.. grid::
|
|
127
|
+
:margin: 0
|
|
128
|
+
|
|
129
|
+
.. grid-item::
|
|
130
|
+
:padding: 2
|
|
131
|
+
:columns: 12
|
|
132
|
+
|
|
133
|
+
**Ungapped seed extension**
|
|
134
|
+
|
|
135
|
+
:func:`align_local_ungapped()`
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
.. grid-item::
|
|
139
|
+
:padding: 2
|
|
140
|
+
:outline:
|
|
141
|
+
:columns: 12
|
|
142
|
+
|
|
143
|
+
.. grid::
|
|
144
|
+
:margin: 0
|
|
145
|
+
|
|
146
|
+
.. grid-item::
|
|
147
|
+
:padding: 2
|
|
148
|
+
:columns: 12
|
|
149
|
+
|
|
150
|
+
**Gapped alignment**
|
|
151
|
+
|
|
152
|
+
.. grid-item::
|
|
153
|
+
:padding: 2
|
|
154
|
+
:columns: 6
|
|
155
|
+
|
|
156
|
+
Banded local/semiglobal alignment
|
|
157
|
+
|
|
158
|
+
:func:`align_banded()`
|
|
159
|
+
|
|
160
|
+
.. grid-item::
|
|
161
|
+
:padding: 2
|
|
162
|
+
:columns: 6
|
|
163
|
+
|
|
164
|
+
Local alignment (*X-drop*)
|
|
165
|
+
|
|
166
|
+
:func:`align_local_gapped()`
|
|
167
|
+
|
|
168
|
+
.. grid-item::
|
|
169
|
+
:padding: 2
|
|
170
|
+
:outline:
|
|
171
|
+
:columns: 12
|
|
172
|
+
|
|
173
|
+
.. grid::
|
|
174
|
+
:margin: 0
|
|
175
|
+
|
|
176
|
+
.. grid-item::
|
|
177
|
+
:padding: 2
|
|
178
|
+
:columns: 12
|
|
179
|
+
|
|
180
|
+
**Significance evaluation**
|
|
181
|
+
|
|
182
|
+
:class:`EValueEstimator`
|
|
27
183
|
|
|
28
|
-
This subpackage also contains functionality for finding *k-mer* matches
|
|
29
|
-
between two sequences, allowing fast heuristic pairwise alignments.
|
|
30
184
|
"""
|
|
31
185
|
|
|
32
186
|
__name__ = "biotite.sequence.align"
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
@@ -39,9 +39,9 @@ cdef float32 MAX_FLOAT = np.finfo(np.float32).max
|
|
|
39
39
|
|
|
40
40
|
|
|
41
41
|
class GapSymbol:
|
|
42
|
-
|
|
42
|
+
|
|
43
43
|
_instance = None
|
|
44
|
-
|
|
44
|
+
|
|
45
45
|
def __init__(self):
|
|
46
46
|
if GapSymbol._instance is not None:
|
|
47
47
|
raise ValueError(
|
|
@@ -49,16 +49,16 @@ class GapSymbol:
|
|
|
49
49
|
)
|
|
50
50
|
else:
|
|
51
51
|
GapSymbol._instance = self
|
|
52
|
-
|
|
52
|
+
|
|
53
53
|
@staticmethod
|
|
54
54
|
def instance():
|
|
55
55
|
if GapSymbol._instance is None:
|
|
56
56
|
GapSymbol._instance = GapSymbol()
|
|
57
57
|
return GapSymbol._instance
|
|
58
|
-
|
|
58
|
+
|
|
59
59
|
def __str__(self):
|
|
60
60
|
return "-"
|
|
61
|
-
|
|
61
|
+
|
|
62
62
|
def __hash__(self):
|
|
63
63
|
return 0
|
|
64
64
|
|
|
@@ -69,13 +69,13 @@ def align_multiple(sequences, matrix, gap_penalty=-10, terminal_penalty=True,
|
|
|
69
69
|
align_multiple(sequences, matrix, gap_penalty=-10,
|
|
70
70
|
terminal_penalty=True, distances=None,
|
|
71
71
|
guide_tree=None)
|
|
72
|
-
|
|
72
|
+
|
|
73
73
|
Perform a multiple sequence alignment using a progressive
|
|
74
74
|
alignment algorithm. :footcite:`Feng1987`
|
|
75
75
|
|
|
76
76
|
Based on pairwise sequence distances a guide tree is constructed.
|
|
77
77
|
The sequences are progressively aligned according to the tree,
|
|
78
|
-
following the rule 'Once a gap, always a gap'.
|
|
78
|
+
following the rule 'Once a gap, always a gap'.
|
|
79
79
|
|
|
80
80
|
Parameters
|
|
81
81
|
----------
|
|
@@ -124,7 +124,7 @@ def align_multiple(sequences, matrix, gap_penalty=-10, terminal_penalty=True,
|
|
|
124
124
|
distance_matrix : ndarray, shape=(n,n), dtype=float32
|
|
125
125
|
The pairwise distance matrix used to construct the guide tree.
|
|
126
126
|
Equal to `distances` if provided.
|
|
127
|
-
|
|
127
|
+
|
|
128
128
|
Notes
|
|
129
129
|
-----
|
|
130
130
|
The similarity to distance conversion is performed according to the
|
|
@@ -137,14 +137,14 @@ def align_multiple(sequences, matrix, gap_penalty=-10, terminal_penalty=True,
|
|
|
137
137
|
\right)
|
|
138
138
|
|
|
139
139
|
.. math:: S_{a,b}^{max} = \frac{ S_{a,a} + S_{b,b} }{ 2 }
|
|
140
|
-
|
|
140
|
+
|
|
141
141
|
.. math:: S_{a,b}^{rand} = \frac{1}{L_{a,b}}
|
|
142
142
|
\left(
|
|
143
143
|
\sum_{x \in \Omega} \sum_{y \in \Omega}
|
|
144
144
|
s_{x,y} \cdot N_a(x) \cdot N_b(y)
|
|
145
145
|
\right)
|
|
146
146
|
+ N_{a,b}^{open} \cdot p^{open} + N_{a,b}^{ext} \cdot p^{ext}
|
|
147
|
-
|
|
147
|
+
|
|
148
148
|
:math:`D_{a,b}` - The distance between the sequences *a* and *b*.
|
|
149
149
|
|
|
150
150
|
:math:`S_{a,b}` - The similarity score between the sequences *a* and *b*.
|
|
@@ -164,17 +164,17 @@ def align_multiple(sequences, matrix, gap_penalty=-10, terminal_penalty=True,
|
|
|
164
164
|
|
|
165
165
|
In rare cases of extremely unrelated sequences, :math:`S_{a,b}`
|
|
166
166
|
can be lower than :math:`S_{a,b}^{rand}`.
|
|
167
|
-
In this case the
|
|
167
|
+
In this case the logarithm cannot be calculated and a
|
|
168
168
|
:class:`ValueError` is raised.
|
|
169
169
|
|
|
170
170
|
References
|
|
171
171
|
----------
|
|
172
|
-
|
|
172
|
+
|
|
173
173
|
.. footbibliography::
|
|
174
174
|
|
|
175
175
|
Examples
|
|
176
176
|
--------
|
|
177
|
-
|
|
177
|
+
|
|
178
178
|
>>> seq1 = ProteinSequence("BIQTITE")
|
|
179
179
|
>>> seq2 = ProteinSequence("TITANITE")
|
|
180
180
|
>>> seq3 = ProteinSequence("BISMITE")
|
|
@@ -232,7 +232,7 @@ def align_multiple(sequences, matrix, gap_penalty=-10, terminal_penalty=True,
|
|
|
232
232
|
else:
|
|
233
233
|
# Assure that every node in the guide tree is binary
|
|
234
234
|
guide_tree = as_binary(guide_tree)
|
|
235
|
-
|
|
235
|
+
|
|
236
236
|
# Create new matrix with neutral gap symbol
|
|
237
237
|
gap_symbol = GapSymbol.instance()
|
|
238
238
|
new_alphabet = Alphabet(
|
|
@@ -275,7 +275,7 @@ def align_multiple(sequences, matrix, gap_penalty=-10, terminal_penalty=True,
|
|
|
275
275
|
]
|
|
276
276
|
for i in range(len(aligned_seqs)):
|
|
277
277
|
aligned_seqs[i].code = aligned_seq_codes[i]
|
|
278
|
-
|
|
278
|
+
|
|
279
279
|
# Reorder alignments into original alignment
|
|
280
280
|
new_order = np.argsort(order)
|
|
281
281
|
aligned_seqs = [aligned_seqs[pos] for pos in new_order]
|
|
@@ -290,7 +290,7 @@ def _get_distance_matrix(CodeType[:] _T, sequences, matrix,
|
|
|
290
290
|
Create all pairwise alignments for the given sequences and use the
|
|
291
291
|
method proposed by Feng & Doolittle to calculate the pairwise
|
|
292
292
|
distance matrix
|
|
293
|
-
|
|
293
|
+
|
|
294
294
|
Parameters
|
|
295
295
|
----------
|
|
296
296
|
_T : ndarray, dtype=VARIABLE
|
|
@@ -306,7 +306,7 @@ def _get_distance_matrix(CodeType[:] _T, sequences, matrix,
|
|
|
306
306
|
terminal_penalty : bool
|
|
307
307
|
Whether or not to count terminal gap penalties for the
|
|
308
308
|
alignments.
|
|
309
|
-
|
|
309
|
+
|
|
310
310
|
Returns
|
|
311
311
|
-------
|
|
312
312
|
distances : ndarray, shape=(n,n), dtype=float32
|
|
@@ -332,7 +332,7 @@ def _get_distance_matrix(CodeType[:] _T, sequences, matrix,
|
|
|
332
332
|
)[0]
|
|
333
333
|
scores[i,j] = alignment.score
|
|
334
334
|
alignments[i,j] = alignment
|
|
335
|
-
|
|
335
|
+
|
|
336
336
|
### Distance calculation from similarity scores ###
|
|
337
337
|
# Calculate the occurrences of each symbol code in each sequence
|
|
338
338
|
# This is used later for the random score
|
|
@@ -364,7 +364,7 @@ def _get_distance_matrix(CodeType[:] _T, sequences, matrix,
|
|
|
364
364
|
cdef CodeType[:] seq_code1, seq_code2
|
|
365
365
|
cdef CodeType code1, code2
|
|
366
366
|
cdef float32 score_rand, score_max
|
|
367
|
-
|
|
367
|
+
|
|
368
368
|
# Calculate distance
|
|
369
369
|
# i and j are indicating the alignment between the sequences i and j
|
|
370
370
|
for i in range(scores_v.shape[0]):
|
|
@@ -405,14 +405,14 @@ def _count_gaps(int64[:,:] trace_v, bint terminal_penalty):
|
|
|
405
405
|
"""
|
|
406
406
|
Count the number of gap openings and gap extensions in an alignment
|
|
407
407
|
trace.
|
|
408
|
-
|
|
408
|
+
|
|
409
409
|
Parameters
|
|
410
410
|
----------
|
|
411
411
|
trace_v : ndarray, shape=(n,2), dtype=int
|
|
412
412
|
The alignment trace.
|
|
413
413
|
terminal_penalty : bool
|
|
414
414
|
Whether or not to count terminal gap penalties.
|
|
415
|
-
|
|
415
|
+
|
|
416
416
|
Returns
|
|
417
417
|
-------
|
|
418
418
|
gap_open_count, gap_ext_count: int
|
|
@@ -440,7 +440,7 @@ def _count_gaps(int64[:,:] trace_v, bint terminal_penalty):
|
|
|
440
440
|
if start_index == -1 or stop_index == -1:
|
|
441
441
|
return 0, 0
|
|
442
442
|
trace_v = trace_v[start_index : stop_index]
|
|
443
|
-
|
|
443
|
+
|
|
444
444
|
if trace_v[0,0] == -1:
|
|
445
445
|
gap_open_count += 1
|
|
446
446
|
if trace_v[0,1] == -1:
|
|
@@ -471,7 +471,7 @@ def _progressive_align(CodeType[:] _T, sequences, tree_node,
|
|
|
471
471
|
The gaps inserted in this pairwise alignment are also inserted
|
|
472
472
|
into all other sequences in the respective sub-MSA at the same
|
|
473
473
|
position.
|
|
474
|
-
|
|
474
|
+
|
|
475
475
|
Parameters
|
|
476
476
|
----------
|
|
477
477
|
_T : ndarray, dtype=VARIABLE
|
|
@@ -490,13 +490,13 @@ def _progressive_align(CodeType[:] _T, sequences, tree_node,
|
|
|
490
490
|
matrix : SubstitutionMatrix
|
|
491
491
|
The substitution matrix used for the alignments.
|
|
492
492
|
gap_symbol_code : int
|
|
493
|
-
The symbol code for the gap symbol.
|
|
493
|
+
The symbol code for the gap symbol.
|
|
494
494
|
gap_penalty : int or tuple(int, int)
|
|
495
495
|
A linear or affine gap penalty for the alignments.
|
|
496
496
|
terminal_penalty : bool
|
|
497
497
|
Whether or not to count terminal gap penalties for the
|
|
498
498
|
alignments.
|
|
499
|
-
|
|
499
|
+
|
|
500
500
|
Returns
|
|
501
501
|
-------
|
|
502
502
|
order : ndarray, shape=(m,), dtype=int
|
|
@@ -515,7 +515,7 @@ def _progressive_align(CodeType[:] _T, sequences, tree_node,
|
|
|
515
515
|
cdef int32[:] indices1_v, indices2_v
|
|
516
516
|
cdef np.ndarray incides1, incides2
|
|
517
517
|
cdef list aligned_seqs1, aligned_seqs2
|
|
518
|
-
|
|
518
|
+
|
|
519
519
|
if tree_node.is_leaf():
|
|
520
520
|
# Child node -> Cannot do an alignment
|
|
521
521
|
# -> Just return the sequence corresponding to the leaf node
|
|
@@ -523,7 +523,7 @@ def _progressive_align(CodeType[:] _T, sequences, tree_node,
|
|
|
523
523
|
# when neutral gap character is inserted
|
|
524
524
|
return np.array([tree_node.index], dtype=np.int32), \
|
|
525
525
|
[sequences[tree_node.index].copy()]
|
|
526
|
-
|
|
526
|
+
|
|
527
527
|
else:
|
|
528
528
|
# Multiple alignment of sequences corresponding to both child nodes
|
|
529
529
|
child1, child2 = tree_node.children
|
|
@@ -537,7 +537,7 @@ def _progressive_align(CodeType[:] _T, sequences, tree_node,
|
|
|
537
537
|
gap_symbol_code, gap_penalty, terminal_penalty
|
|
538
538
|
)
|
|
539
539
|
indices2_v = incides2
|
|
540
|
-
|
|
540
|
+
|
|
541
541
|
# Find sequence pair with lowest distance
|
|
542
542
|
dist_min = MAX_FLOAT
|
|
543
543
|
for i in range(indices1_v.shape[0]):
|
|
@@ -554,7 +554,7 @@ def _progressive_align(CodeType[:] _T, sequences, tree_node,
|
|
|
554
554
|
gap_penalty, terminal_penalty, max_number=1
|
|
555
555
|
)[0]
|
|
556
556
|
# Place neutral gap symbol for position of new gaps
|
|
557
|
-
# in both sequence groups
|
|
557
|
+
# in both sequence groups
|
|
558
558
|
for i in range(len(aligned_seqs1)):
|
|
559
559
|
seq = aligned_seqs1[i]
|
|
560
560
|
seq.code = _replace_gaps(
|
|
@@ -580,7 +580,7 @@ def _replace_gaps(CodeType[:] _T,
|
|
|
580
580
|
|
|
581
581
|
The replacement is required by the progressive alignment algorithm
|
|
582
582
|
to be able to align gapped sequences with each other.
|
|
583
|
-
|
|
583
|
+
|
|
584
584
|
Parameters
|
|
585
585
|
----------
|
|
586
586
|
_T : ndarray, dtype=VARIABLE
|
|
@@ -592,8 +592,8 @@ def _replace_gaps(CodeType[:] _T,
|
|
|
592
592
|
seq_code : ndarray, shape=(n,)
|
|
593
593
|
The sequence code representing the given sequence.
|
|
594
594
|
gap_symbol_code : int
|
|
595
|
-
The symbol code for the gap symbol.
|
|
596
|
-
|
|
595
|
+
The symbol code for the gap symbol.
|
|
596
|
+
|
|
597
597
|
Returns
|
|
598
598
|
-------
|
|
599
599
|
new_seq_code : ndarray, shape=(m,)
|
|
@@ -609,12 +609,12 @@ def _replace_gaps(CodeType[:] _T,
|
|
|
609
609
|
partial_trace_v.shape[0], dtype=seq_code.dtype
|
|
610
610
|
)
|
|
611
611
|
cdef CodeType[:] new_seq_code_v = new_seq_code
|
|
612
|
-
|
|
612
|
+
|
|
613
613
|
for i in range(partial_trace_v.shape[0]):
|
|
614
614
|
index = partial_trace_v[i]
|
|
615
615
|
if index == -1:
|
|
616
616
|
new_seq_code_v[i] = gap_symbol_code
|
|
617
617
|
else:
|
|
618
618
|
new_seq_code_v[i] = seq_code[index]
|
|
619
|
-
|
|
619
|
+
|
|
620
620
|
return new_seq_code
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|