pyseqalignment 0.1.3__tar.gz → 0.1.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pyseqalignment-0.1.3/src/pyseqalignment.egg-info → pyseqalignment-0.1.4}/PKG-INFO +31 -2
- {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/README.md +30 -1
- {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/pyproject.toml +6 -1
- {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/__init__.py +9 -2
- {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/core/__init__.py +8 -1
- {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/core/alignment.py +13 -0
- pyseqalignment-0.1.4/src/pyseqalign/core/nw_affine.py +202 -0
- pyseqalignment-0.1.4/src/pyseqalign/logo/__init__.py +17 -0
- pyseqalignment-0.1.4/src/pyseqalign/logo/probability.py +192 -0
- pyseqalignment-0.1.4/src/pyseqalign/logo/profile.py +249 -0
- pyseqalignment-0.1.4/src/pyseqalign/logo/render.py +254 -0
- pyseqalignment-0.1.4/src/pyseqalign/msa/__init__.py +17 -0
- pyseqalignment-0.1.4/src/pyseqalign/msa/consensus.py +72 -0
- pyseqalignment-0.1.4/src/pyseqalign/msa/distance_matrix.py +118 -0
- pyseqalignment-0.1.4/src/pyseqalign/msa/guide_tree.py +191 -0
- pyseqalignment-0.1.4/src/pyseqalign/msa/progressive.py +221 -0
- pyseqalignment-0.1.4/src/pyseqalign/scoring/protocols.py +28 -0
- {pyseqalignment-0.1.3 → pyseqalignment-0.1.4/src/pyseqalignment.egg-info}/PKG-INFO +31 -2
- {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalignment.egg-info/SOURCES.txt +12 -0
- pyseqalignment-0.1.4/tests/test_msa_logo.py +62 -0
- {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/LICENSE +0 -0
- {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/setup.cfg +0 -0
- {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/setup.py +0 -0
- {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/accel.py +0 -0
- {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/core/needleman_wunsch.py +0 -0
- {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/core/smith_waterman.py +0 -0
- {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/cpp/build_cpp_aligner.sh +0 -0
- {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/cpp/cpp_aligner.cpp +0 -0
- {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/learning/__init__.py +0 -0
- {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/learning/aleph.py +0 -0
- {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/learning/aleph_files/__init__.py +0 -0
- {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/learning/aleph_files/aleph_swi_ak.pl +0 -0
- {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/learning/base.py +0 -0
- {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/learning/popper.py +0 -0
- {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/learning/task_builder.py +0 -0
- {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/prolog/__init__.py +0 -0
- {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/prolog/engine.py +0 -0
- {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/prolog/knowledge/__init__.py +0 -0
- {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/prolog/knowledge/amino_acids.pl +0 -0
- {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/prolog/knowledge/blosum50.pl +0 -0
- {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/prolog/knowledge/defaults.pl +0 -0
- {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/prolog/knowledge/distances.pl +0 -0
- {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/scoring/__init__.py +0 -0
- {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/scoring/distance.py +0 -0
- {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/scoring/matrices.py +0 -0
- {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/scoring/matrix_data/BLOSUM100 +0 -0
- {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/scoring/matrix_data/BLOSUM50 +0 -0
- {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/scoring/matrix_data/BLOSUM60 +0 -0
- {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/scoring/matrix_data/BLOSUM62 +0 -0
- {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/scoring/matrix_data/BLOSUM70 +0 -0
- {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/scoring/matrix_data/BLOSUM80 +0 -0
- {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/scoring/matrix_data/BLOSUM90 +0 -0
- {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/scoring/matrix_data/PAM150 +0 -0
- {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/scoring/matrix_data/PAM200 +0 -0
- {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/scoring/matrix_data/PAM250 +0 -0
- {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/scoring/matrix_data/PAM50 +0 -0
- {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/scoring/matrix_data/__init__.py +0 -0
- {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/utils/__init__.py +0 -0
- {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/utils/helpers.py +0 -0
- {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalignment.egg-info/dependency_links.txt +0 -0
- {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalignment.egg-info/requires.txt +0 -0
- {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalignment.egg-info/top_level.txt +0 -0
- {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/tests/test_learning.py +0 -0
- {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/tests/test_needleman_wunsch.py +0 -0
- {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/tests/test_scoring.py +0 -0
- {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/tests/test_smith_waterman.py +0 -0
- {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/tests/test_utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: pyseqalignment
|
|
3
|
-
Version: 0.1.3
|
|
3
|
+
Version: 0.1.4
|
|
4
4
|
Summary: pySeqAlign -- sequence alignment with Prolog-style distance functions and ILP learning
|
|
5
5
|
Author-email: Andreas Karwath <a.karwath@bham.ac.uk>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -56,7 +56,9 @@ pySeqAlign provides Smith-Waterman (local) and Needleman-Wunsch (global) sequenc
|
|
|
56
56
|
## Features
|
|
57
57
|
|
|
58
58
|
- **Smith-Waterman** local alignment with k-best non-overlapping results
|
|
59
|
-
- **Needleman-Wunsch** global alignment
|
|
59
|
+
- **Needleman-Wunsch** global alignment (linear and **affine** gap costs)
|
|
60
|
+
- **Multiple sequence alignment** -- progressive MSA over a neighbor-joining guide tree, parameterised by any scoring function (`pyseqalign.msa`)
|
|
61
|
+
- **Relational sequence logos** -- information-content logos of aligned logical atoms, with per-column least-general generalisation, after Karwath & Kersting (ILP 2006) (`pyseqalign.logo`)
|
|
60
62
|
- **Prolog-based distance functions** via SWI-Prolog integration (optional)
|
|
61
63
|
- **Substitution matrices** -- BLOSUM (50, 60, 62, 70, 80, 90, 100) and PAM (50, 150, 200, 250) bundled; any NCBI-format matrix loadable from file or downloaded at runtime
|
|
62
64
|
- **Nienhuys-Cheng distance** for recursive structural comparison of logical atoms
|
|
@@ -309,6 +311,33 @@ For reference, other notable systems in the field include:
|
|
|
309
311
|
- [Metagol](https://github.com/metagol/metagol) -- Meta-Interpretive Learning
|
|
310
312
|
- [DeepStochLog](https://github.com/ML-KULeuven/deepstochlog) -- Neural-symbolic ILP combining logic and neural networks
|
|
311
313
|
|
|
314
|
+
## Multiple alignment & relational logos
|
|
315
|
+
|
|
316
|
+
pySeqAlign can align *and* summarise sequences of structured logical atoms,
|
|
317
|
+
reproducing Karwath & Kersting, *Relational Sequence Alignments and Logos*
|
|
318
|
+
(ILP 2006) — with no learning involved:
|
|
319
|
+
|
|
320
|
+
```python
|
|
321
|
+
from pyseqalign.msa import progressive_msa
|
|
322
|
+
from pyseqalign.logo import relational_logo
|
|
323
|
+
from pyseqalign.scoring.distance import AtomDistance
|
|
324
|
+
|
|
325
|
+
# atoms as structured tuples: id -> (predicate, *args); 0 = gap
|
|
326
|
+
atom_store = {1: ('h', 'a', 'r', 'm'), 2: ('h', 'a', 'r', 'l'), 3: ('s', 'p', 'm')}
|
|
327
|
+
seqs = {'d1': [1, 2, 3], 'd2': [1, 3, 2], 'd3': [2, 3]}
|
|
328
|
+
|
|
329
|
+
scoring = AtomDistance(atom_store=atom_store, gap_score=-0.5) # Nienhuys-Cheng
|
|
330
|
+
msa = progressive_msa(seqs, scoring, gap_open=-1.0, gap_extend=-0.1)
|
|
331
|
+
rows = list(msa.aligned_sequences.values())
|
|
332
|
+
|
|
333
|
+
relational_logo(rows, atom_store, 'logo.png', title='example fold')
|
|
334
|
+
```
|
|
335
|
+
|
|
336
|
+
`progressive_msa` accepts **any** scoring function, so a reward matrix learned
|
|
337
|
+
by [pyREAL](https://github.com/athro/pyREAL)'s boosting can drive the alignment
|
|
338
|
+
in place of the fixed distance. Runnable reproductions of the paper's SCOP and
|
|
339
|
+
balloon logos are in [`examples/`](examples/).
|
|
340
|
+
|
|
312
341
|
## Fast C++ aligner (optional)
|
|
313
342
|
|
|
314
343
|
The pure-Python aligners are fine for typical use. For heavy workloads (e.g.
|
|
@@ -18,7 +18,9 @@ pySeqAlign provides Smith-Waterman (local) and Needleman-Wunsch (global) sequenc
|
|
|
18
18
|
## Features
|
|
19
19
|
|
|
20
20
|
- **Smith-Waterman** local alignment with k-best non-overlapping results
|
|
21
|
-
- **Needleman-Wunsch** global alignment
|
|
21
|
+
- **Needleman-Wunsch** global alignment (linear and **affine** gap costs)
|
|
22
|
+
- **Multiple sequence alignment** -- progressive MSA over a neighbor-joining guide tree, parameterised by any scoring function (`pyseqalign.msa`)
|
|
23
|
+
- **Relational sequence logos** -- information-content logos of aligned logical atoms, with per-column least-general generalisation, after Karwath & Kersting (ILP 2006) (`pyseqalign.logo`)
|
|
22
24
|
- **Prolog-based distance functions** via SWI-Prolog integration (optional)
|
|
23
25
|
- **Substitution matrices** -- BLOSUM (50, 60, 62, 70, 80, 90, 100) and PAM (50, 150, 200, 250) bundled; any NCBI-format matrix loadable from file or downloaded at runtime
|
|
24
26
|
- **Nienhuys-Cheng distance** for recursive structural comparison of logical atoms
|
|
@@ -271,6 +273,33 @@ For reference, other notable systems in the field include:
|
|
|
271
273
|
- [Metagol](https://github.com/metagol/metagol) -- Meta-Interpretive Learning
|
|
272
274
|
- [DeepStochLog](https://github.com/ML-KULeuven/deepstochlog) -- Neural-symbolic ILP combining logic and neural networks
|
|
273
275
|
|
|
276
|
+
## Multiple alignment & relational logos
|
|
277
|
+
|
|
278
|
+
pySeqAlign can align *and* summarise sequences of structured logical atoms,
|
|
279
|
+
reproducing Karwath & Kersting, *Relational Sequence Alignments and Logos*
|
|
280
|
+
(ILP 2006) — with no learning involved:
|
|
281
|
+
|
|
282
|
+
```python
|
|
283
|
+
from pyseqalign.msa import progressive_msa
|
|
284
|
+
from pyseqalign.logo import relational_logo
|
|
285
|
+
from pyseqalign.scoring.distance import AtomDistance
|
|
286
|
+
|
|
287
|
+
# atoms as structured tuples: id -> (predicate, *args); 0 = gap
|
|
288
|
+
atom_store = {1: ('h', 'a', 'r', 'm'), 2: ('h', 'a', 'r', 'l'), 3: ('s', 'p', 'm')}
|
|
289
|
+
seqs = {'d1': [1, 2, 3], 'd2': [1, 3, 2], 'd3': [2, 3]}
|
|
290
|
+
|
|
291
|
+
scoring = AtomDistance(atom_store=atom_store, gap_score=-0.5) # Nienhuys-Cheng
|
|
292
|
+
msa = progressive_msa(seqs, scoring, gap_open=-1.0, gap_extend=-0.1)
|
|
293
|
+
rows = list(msa.aligned_sequences.values())
|
|
294
|
+
|
|
295
|
+
relational_logo(rows, atom_store, 'logo.png', title='example fold')
|
|
296
|
+
```
|
|
297
|
+
|
|
298
|
+
`progressive_msa` accepts **any** scoring function, so a reward matrix learned
|
|
299
|
+
by [pyREAL](https://github.com/athro/pyREAL)'s boosting can drive the alignment
|
|
300
|
+
in place of the fixed distance. Runnable reproductions of the paper's SCOP and
|
|
301
|
+
balloon logos are in [`examples/`](examples/).
|
|
302
|
+
|
|
274
303
|
## Fast C++ aligner (optional)
|
|
275
304
|
|
|
276
305
|
The pure-Python aligners are fine for typical use. For heavy workloads (e.g.
|
|
@@ -8,7 +8,7 @@ build-backend = "setuptools.build_meta"
|
|
|
8
8
|
# PyPI distribution name (the import package is `pyseqalign`; the name
|
|
9
9
|
# `pyseqalign` was blocked by PyPI's similarity guard vs. an existing project).
|
|
10
10
|
name = "pyseqalignment"
|
|
11
|
-
version = "0.1.3"
|
|
11
|
+
version = "0.1.4"
|
|
12
12
|
description = "pySeqAlign -- sequence alignment with Prolog-style distance functions and ILP learning"
|
|
13
13
|
readme = "README.md"
|
|
14
14
|
license = "MIT"
|
|
@@ -80,6 +80,11 @@ line-length = 100
|
|
|
80
80
|
|
|
81
81
|
[tool.ruff.lint]
|
|
82
82
|
select = ["E", "F", "W", "I", "N", "UP"]
|
|
83
|
+
ignore = [
|
|
84
|
+
"N803", # uppercase argument names (matrix math convention: M, Ix, Iy)
|
|
85
|
+
"N806", # uppercase local variables in functions (same reason)
|
|
86
|
+
"E741", # ambiguous variable name 'l' (used in tree (m, l, r) unpacking)
|
|
87
|
+
]
|
|
83
88
|
|
|
84
89
|
[tool.mypy]
|
|
85
90
|
python_version = "3.10"
|
|
@@ -1,14 +1,21 @@
|
|
|
1
1
|
"""pySeqAlign -- Sequence alignment with Prolog-style distance functions and ILP learning."""
|
|
2
2
|
|
|
3
|
-
from pyseqalign.core.alignment import AlignmentResult, LocalAlignmentResult
|
|
3
|
+
from pyseqalign.core.alignment import (
|
|
4
|
+
AffineAlignmentResult,
|
|
5
|
+
AlignmentResult,
|
|
6
|
+
LocalAlignmentResult,
|
|
7
|
+
)
|
|
4
8
|
from pyseqalign.core.needleman_wunsch import NeedlemanWunsch
|
|
9
|
+
from pyseqalign.core.nw_affine import NeedlemanWunschAffine
|
|
5
10
|
from pyseqalign.core.smith_waterman import SmithWaterman
|
|
6
11
|
|
|
7
|
-
__version__ = "0.1.3"
|
|
12
|
+
__version__ = "0.1.4"
|
|
8
13
|
|
|
9
14
|
__all__ = [
|
|
10
15
|
"SmithWaterman",
|
|
11
16
|
"NeedlemanWunsch",
|
|
17
|
+
"NeedlemanWunschAffine",
|
|
12
18
|
"AlignmentResult",
|
|
19
|
+
"AffineAlignmentResult",
|
|
13
20
|
"LocalAlignmentResult",
|
|
14
21
|
]
|
|
@@ -1,12 +1,19 @@
|
|
|
1
1
|
"""Core alignment algorithms."""
|
|
2
2
|
|
|
3
|
-
from pyseqalign.core.alignment import AlignmentResult, LocalAlignmentResult
|
|
3
|
+
from pyseqalign.core.alignment import (
|
|
4
|
+
AffineAlignmentResult,
|
|
5
|
+
AlignmentResult,
|
|
6
|
+
LocalAlignmentResult,
|
|
7
|
+
)
|
|
4
8
|
from pyseqalign.core.needleman_wunsch import NeedlemanWunsch
|
|
9
|
+
from pyseqalign.core.nw_affine import NeedlemanWunschAffine
|
|
5
10
|
from pyseqalign.core.smith_waterman import SmithWaterman
|
|
6
11
|
|
|
7
12
|
__all__ = [
|
|
8
13
|
"SmithWaterman",
|
|
9
14
|
"NeedlemanWunsch",
|
|
15
|
+
"NeedlemanWunschAffine",
|
|
10
16
|
"AlignmentResult",
|
|
17
|
+
"AffineAlignmentResult",
|
|
11
18
|
"LocalAlignmentResult",
|
|
12
19
|
]
|
|
@@ -22,6 +22,19 @@ class AlignmentResult:
|
|
|
22
22
|
length: int
|
|
23
23
|
|
|
24
24
|
|
|
25
|
+
@dataclass
class AffineAlignmentResult(AlignmentResult):
    """Extended result from affine-gap alignment.

    Carries every field inherited from ``AlignmentResult`` plus two gap
    counters.

    Attributes:
        gap_opens: Number of gap-open events in both sequences combined.
        gap_extensions: Number of gap-extension events.
    """

    # Both counters default to 0, so they may be omitted when constructing
    # the result.
    gap_opens: int = 0
    gap_extensions: int = 0
|
|
36
|
+
|
|
37
|
+
|
|
25
38
|
@dataclass
|
|
26
39
|
class LocalAlignmentResult:
|
|
27
40
|
"""Result of a single local (Smith-Waterman) alignment.
|
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
"""Needleman-Wunsch global alignment with affine gap penalties.
|
|
2
|
+
|
|
3
|
+
Translated from the legacy C++ AlignerAffine::_align() implementation.
|
|
4
|
+
Uses three DP matrices (M, Ix, Iy) to distinguish gap-open from gap-extend.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import numpy as np
|
|
10
|
+
|
|
11
|
+
from pyseqalign.core.alignment import AffineAlignmentResult
|
|
12
|
+
from pyseqalign.scoring.protocols import ScoringFunction
|
|
13
|
+
|
|
14
|
+
# Matrix indices.
_M = 0  # match/mismatch
_IX = 1  # gap in target (consuming query element)
_IY = 2  # gap in query (consuming target element)


class NeedlemanWunschAffine:
    """Needleman-Wunsch with affine gap penalties.

    Recurrences (similarity mode):
        M[i][j]  = score(q[i], t[j]) + max(M[i-1][j-1], Ix[i-1][j-1], Iy[i-1][j-1])
        Ix[i][j] = max(M[i-1][j] + gap_open, Ix[i-1][j] + gap_extend, Iy[i-1][j] + gap_open)
        Iy[i][j] = max(M[i][j-1] + gap_open, Iy[i][j-1] + gap_extend, Ix[i][j-1] + gap_open)

    Args:
        scoring: Scoring function (element ID 0 = gap). Only its
            ``score(a, b)`` method is called.
        gap_open: Cost for opening a new gap (should be negative for penalties).
        gap_extend: Cost for extending an existing gap (should be negative,
            typically less severe than gap_open).
    """

    def __init__(
        self,
        scoring: ScoringFunction,
        gap_open: float = -2.5,
        gap_extend: float = -0.25,
    ) -> None:
        self.scoring = scoring
        self.gap_open = gap_open
        self.gap_extend = gap_extend

    def align(self, seq1: list[int], seq2: list[int]) -> AffineAlignmentResult:
        """Compute the optimal global alignment with affine gap penalties.

        Args:
            seq1: Query sequence (list of integer element IDs).
            seq2: Target sequence.

        Returns:
            An ``AffineAlignmentResult`` with aligned sequences and gap statistics.
            Gap positions in the aligned sequences are encoded as ``0``.
        """
        n = len(seq1)
        m = len(seq2)

        F, B = self._fill_matrices(seq1, seq2)

        # --- Find best endpoint ---
        # A global alignment may end in any of the three states.
        end_scores = (F[_M, n, m], F[_IX, n, m], F[_IY, n, m])
        best_end = _argmax3(end_scores)
        score = end_scores[best_end]

        # --- Traceback ---
        align1, align2, gap_opens, gap_extensions = self._traceback(B, seq1, seq2, best_end, n, m)

        return AffineAlignmentResult(
            query=align1,
            target=align2,
            score=float(score),
            length=len(align1),
            gap_opens=gap_opens,
            gap_extensions=gap_extensions,
        )

    def _fill_matrices(self, seq1: list[int], seq2: list[int]) -> tuple[np.ndarray, np.ndarray]:
        """Fill the score and traceback matrices for *seq1* against *seq2*.

        Returns:
            ``(F, B)`` where ``F[k, i, j]`` is the best score of a prefix
            alignment ending in state ``k`` (``_M``, ``_IX`` or ``_IY``) and
            ``B[k, i, j]`` holds the predecessor ``(from_k, from_i, from_j)``;
            cells that were never reached keep ``-inf`` / ``-1``.
        """
        n = len(seq1)
        m = len(seq2)

        NEG_INF = -np.inf

        # F[k, i, j] for k in {M=0, Ix=1, Iy=2}
        F = np.full((3, n + 1, m + 1), NEG_INF, dtype=np.float64)
        # Traceback: B[k, i, j, :] = (from_k, from_i, from_j)
        B = np.full((3, n + 1, m + 1, 3), -1, dtype=np.int32)

        F[_M, 0, 0] = 0.0

        d = self.gap_open
        e = self.gap_extend

        # --- Border initialization: gaps along query (Ix column) ---
        # M and Iy stay -inf along this border.
        for i in range(1, n + 1):
            F[_IX, i, 0] = d if i == 1 else F[_IX, i - 1, 0] + e
            B[_IX, i, 0] = [_IX, i - 1, 0]

        # --- Border initialization: gaps along target (Iy row) ---
        # M and Ix stay -inf along this border.
        for j in range(1, m + 1):
            F[_IY, 0, j] = d if j == 1 else F[_IY, 0, j - 1] + e
            B[_IY, 0, j] = [_IY, 0, j - 1]

        # The Iy candidates below are ordered (M, Iy, Ix), so the winning
        # slot is NOT a matrix index and must be translated before it is
        # stored as a traceback pointer.
        iy_sources = (_M, _IY, _IX)

        # --- Main DP fill ---
        for i in range(1, n + 1):
            for j in range(1, m + 1):
                # Match/mismatch: diagonal transition.
                s = self.scoring.score(seq1[i - 1], seq2[j - 1])
                candidates_m = (
                    F[_M, i - 1, j - 1] + s,
                    F[_IX, i - 1, j - 1] + s,
                    F[_IY, i - 1, j - 1] + s,
                )
                best_k = _argmax3(candidates_m)
                F[_M, i, j] = candidates_m[best_k]
                B[_M, i, j] = [best_k, i - 1, j - 1]

                # Ix: gap in target (consume query[i], skip target).
                # Candidate order equals matrix order, so the slot is the index.
                candidates_ix = (
                    F[_M, i - 1, j] + d,  # new gap
                    F[_IX, i - 1, j] + e,  # extend gap
                    F[_IY, i - 1, j] + d,  # new gap
                )
                best_k = _argmax3(candidates_ix)
                F[_IX, i, j] = candidates_ix[best_k]
                B[_IX, i, j] = [best_k, i - 1, j]

                # Iy: gap in query (skip query, consume target[j]).
                candidates_iy = (
                    F[_M, i, j - 1] + d,  # new gap
                    F[_IY, i, j - 1] + e,  # extend gap
                    F[_IX, i, j - 1] + d,  # new gap
                )
                slot = _argmax3(candidates_iy)
                F[_IY, i, j] = candidates_iy[slot]
                B[_IY, i, j] = [iy_sources[slot], i, j - 1]

        return F, B

    @staticmethod
    def _traceback(
        B: np.ndarray,
        seq1: list[int],
        seq2: list[int],
        start_k: int,
        start_i: int,
        start_j: int,
    ) -> tuple[list[int], list[int], int, int]:
        """Walk the traceback matrix to produce aligned sequences.

        Args:
            B: Traceback matrix with ``B[k, i, j] = (from_k, from_i, from_j)``.
            seq1: Query sequence.
            seq2: Target sequence.
            start_k: State (``_M``, ``_IX`` or ``_IY``) in which the alignment ends.
            start_i: Row to start from (number of query elements consumed).
            start_j: Column to start from (number of target elements consumed).

        Returns:
            ``(align1, align2, gap_opens, gap_extensions)``; gaps are written
            as ``0`` in the aligned sequences.
        """
        align1: list[int] = []
        align2: list[int] = []
        gap_opens = 0
        gap_extensions = 0

        k, i, j = start_k, start_i, start_j
        # State of the column emitted just before this one, i.e. the column to
        # its right in the final alignment (the walk runs end -> start).
        prev_k = -1

        while i > 0 or j > 0:
            from_k, from_i, from_j = int(B[k, i, j, 0]), int(B[k, i, j, 1]), int(B[k, i, j, 2])

            if from_i < 0 or from_j < 0:
                # Reached uninitialised border — shouldn't happen.
                break

            if k == _M:
                # Diagonal: match/mismatch.
                align1.append(seq1[i - 1])
                align2.append(seq2[j - 1])
            elif k == _IX:
                # Gap in target.
                align1.append(seq1[i - 1])
                align2.append(0)
                # Each maximal run of Ix columns counts as one open plus
                # (run length - 1) extensions.
                if prev_k != _IX:
                    gap_opens += 1
                else:
                    gap_extensions += 1
            else:  # _IY
                # Gap in query.
                align1.append(0)
                align2.append(seq2[j - 1])
                if prev_k != _IY:
                    gap_opens += 1
                else:
                    gap_extensions += 1

            prev_k = k
            k, i, j = from_k, from_i, from_j

        # Columns were collected end -> start.
        align1.reverse()
        align2.reverse()
        return align1, align2, gap_opens, gap_extensions


def _argmax3(vals: tuple[float, float, float]) -> int:
    """Return index of maximum among exactly three values.

    Ties are resolved in favour of the lowest index.
    """
    if vals[0] >= vals[1]:
        return 0 if vals[0] >= vals[2] else 2
    return 1 if vals[1] >= vals[2] else 2
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"""Relational sequence logos — position-specific profiles of logical atoms."""
|
|
2
|
+
|
|
3
|
+
from pyseqalign.logo.probability import FreqDist, LidstoneProbDist, MLEProbDist
|
|
4
|
+
from pyseqalign.logo.profile import PositionProfile, RelationalProfile
|
|
5
|
+
from pyseqalign.logo.render import column_ic, lgg_atoms, relational_logo, term_str
|
|
6
|
+
|
|
7
|
+
__all__ = [
|
|
8
|
+
'FreqDist',
|
|
9
|
+
'MLEProbDist',
|
|
10
|
+
'LidstoneProbDist',
|
|
11
|
+
'PositionProfile',
|
|
12
|
+
'RelationalProfile',
|
|
13
|
+
'relational_logo',
|
|
14
|
+
'column_ic',
|
|
15
|
+
'lgg_atoms',
|
|
16
|
+
'term_str',
|
|
17
|
+
]
|
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
"""Frequency and probability distributions for relational sequence logos.
|
|
2
|
+
|
|
3
|
+
Simplified, modern-Python reimplementation of the NLTK-derived legacy
|
|
4
|
+
``Probability.py``. Only the distributions needed for logo construction
|
|
5
|
+
are included: :class:`FreqDist`, :class:`MLEProbDist` (maximum-likelihood),
|
|
6
|
+
and :class:`LidstoneProbDist` (smoothed).
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import builtins as _builtins
|
|
12
|
+
import math
|
|
13
|
+
from collections.abc import Hashable, Iterator
|
|
14
|
+
from typing import Any
|
|
15
|
+
|
|
16
|
+
_builtin_max = _builtins.max
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class FreqDist:
    """Tally of how often each hashable sample has been observed.

    >>> fd = FreqDist()
    >>> fd.inc("a"); fd.inc("a"); fd.inc("b")
    >>> fd.count("a")
    2
    >>> fd.freq("a")  # doctest: +ELLIPSIS
    0.666...
    """

    __slots__ = ('_counts', '_total', '_max_cache')

    def __init__(self) -> None:
        self._counts: dict[Any, int] = {}
        self._total: int = 0
        # Memoised result of max(); reset to None on every effective inc().
        self._max_cache: Any | None = None

    # -- Mutation -----------------------------------------------------------

    def inc(self, sample: Hashable, count: int = 1) -> None:
        """Add *count* observations of *sample*; a zero *count* is a no-op."""
        if count != 0:
            self._counts[sample] = self.count(sample) + count
            self._total += count
            self._max_cache = None

    # -- Queries ------------------------------------------------------------

    @property
    def total(self) -> int:
        """Sum of all recorded counts (``N``)."""
        return self._total

    @property
    def num_bins(self) -> int:
        """Number of distinct recorded samples (``B``)."""
        return len(self._counts)

    def count(self, sample: Hashable) -> int:
        """Return how often *sample* was recorded (0 if never)."""
        return self._counts.get(sample, 0)

    def freq(self, sample: Hashable) -> float:
        """Return ``count(sample) / N``, or ``0.0`` while nothing is recorded."""
        return self.count(sample) / self._total if self._total else 0.0

    def samples(self) -> list[Any]:
        """Return the recorded samples in first-seen order."""
        return list(self._counts)

    def max(self) -> Any | None:
        """Return the most frequent sample, or ``None`` when empty."""
        cached = self._max_cache
        if cached is not None:
            return cached
        if not self._counts:
            return None
        self._max_cache = _builtin_max(self._counts, key=self._counts.__getitem__)
        return self._max_cache

    def sorted_samples(self) -> list[Any]:
        """Return the recorded samples ordered from most to least frequent."""
        by_count = self._counts.__getitem__
        return sorted(self._counts, key=by_count, reverse=True)

    # -- Container protocol -------------------------------------------------

    def __contains__(self, sample: object) -> bool:
        return sample in self._counts

    def __len__(self) -> int:
        return len(self._counts)

    def __iter__(self) -> Iterator[Any]:
        return iter(self._counts)

    # -- Representation -----------------------------------------------------

    def __repr__(self) -> str:
        return f'<FreqDist with {self._total} outcomes, {len(self._counts)} bins>'

    def __str__(self) -> str:
        ranked = sorted(self._counts.items(), key=lambda kv: -kv[1])
        body = ', '.join(f'{sample!r}: {n}' for sample, n in ranked)
        return f'<FreqDist: {body}>'
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
class MLEProbDist:
    """Unsmoothed (maximum-likelihood) view of a :class:`FreqDist`.

    ``P(sample) = count(sample) / N``
    """

    __slots__ = ('_freqdist',)

    def __init__(self, freqdist: FreqDist) -> None:
        # An empty tally has no defined relative frequencies.
        if freqdist.total == 0:
            raise ValueError('Cannot build MLE distribution from empty FreqDist.')
        self._freqdist = freqdist

    @property
    def freqdist(self) -> FreqDist:
        """The frequency distribution this estimate is based on."""
        return self._freqdist

    def prob(self, sample: Hashable) -> float:
        """Return the relative frequency of *sample*."""
        return self._freqdist.freq(sample)

    def logprob(self, sample: Hashable) -> float:
        """Return the natural log of ``prob(sample)``; ``-inf`` when it is not positive."""
        likelihood = self.prob(sample)
        if likelihood > 0:
            return math.log(likelihood)
        return float('-inf')

    def max(self) -> Any | None:
        """Return the most frequent sample of the underlying distribution."""
        return self._freqdist.max()

    def samples(self) -> list[Any]:
        """Return the samples recorded in the underlying distribution."""
        return self._freqdist.samples()

    def __repr__(self) -> str:
        return f'<MLEProbDist based on {self._freqdist.total} outcomes>'
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
class LidstoneProbDist:
    """Additively smoothed (Lidstone) estimate over a :class:`FreqDist`.

    ``P(sample) = (count(sample) + gamma) / (N + B * gamma)``

    ``gamma = 1`` is Laplace smoothing; ``gamma = 0.5`` is the Expected
    Likelihood Estimate (ELE).
    """

    __slots__ = ('_freqdist', '_gamma', '_bins', '_N')

    def __init__(
        self,
        freqdist: FreqDist,
        gamma: float = 1.0,
        bins: int | None = None,
    ) -> None:
        observed = freqdist.num_bins
        if bins is None:
            # Default to exactly the bins that were observed.
            bins = observed
        elif bins < observed:
            raise ValueError(f'bins ({bins}) must be >= FreqDist.num_bins ({observed})')
        if bins == 0:
            raise ValueError('Lidstone distribution must have at least one bin.')

        self._freqdist = freqdist
        self._gamma = float(gamma)
        self._bins = bins
        # N is captured once here; per-sample counts are read on each prob() call.
        self._N = freqdist.total

    @property
    def freqdist(self) -> FreqDist:
        """The frequency distribution this estimate is based on."""
        return self._freqdist

    def prob(self, sample: Hashable) -> float:
        """Return the smoothed probability of *sample*."""
        numerator = self._freqdist.count(sample) + self._gamma
        denominator = self._N + self._bins * self._gamma
        return numerator / denominator

    def logprob(self, sample: Hashable) -> float:
        """Return the natural log of ``prob(sample)``; ``-inf`` when it is not positive."""
        likelihood = self.prob(sample)
        if likelihood > 0:
            return math.log(likelihood)
        return float('-inf')

    def max(self) -> Any | None:
        """Return the most frequent sample of the underlying distribution."""
        return self._freqdist.max()

    def samples(self) -> list[Any]:
        """Return the samples recorded in the underlying distribution."""
        return self._freqdist.samples()

    def __repr__(self) -> str:
        return f'<LidstoneProbDist gamma={self._gamma} based on {self._N} outcomes>'
|