pyseqalignment 0.1.2__tar.gz → 0.1.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pyseqalignment-0.1.2/src/pyseqalignment.egg-info → pyseqalignment-0.1.4}/PKG-INFO +50 -5
- {pyseqalignment-0.1.2 → pyseqalignment-0.1.4}/README.md +49 -4
- {pyseqalignment-0.1.2 → pyseqalignment-0.1.4}/pyproject.toml +9 -2
- pyseqalignment-0.1.4/setup.py +58 -0
- {pyseqalignment-0.1.2 → pyseqalignment-0.1.4}/src/pyseqalign/__init__.py +9 -2
- {pyseqalignment-0.1.2 → pyseqalignment-0.1.4}/src/pyseqalign/core/__init__.py +8 -1
- {pyseqalignment-0.1.2 → pyseqalignment-0.1.4}/src/pyseqalign/core/alignment.py +13 -0
- pyseqalignment-0.1.4/src/pyseqalign/core/nw_affine.py +202 -0
- pyseqalignment-0.1.4/src/pyseqalign/logo/__init__.py +17 -0
- pyseqalignment-0.1.4/src/pyseqalign/logo/probability.py +192 -0
- pyseqalignment-0.1.4/src/pyseqalign/logo/profile.py +249 -0
- pyseqalignment-0.1.4/src/pyseqalign/logo/render.py +254 -0
- pyseqalignment-0.1.4/src/pyseqalign/msa/__init__.py +17 -0
- pyseqalignment-0.1.4/src/pyseqalign/msa/consensus.py +72 -0
- pyseqalignment-0.1.4/src/pyseqalign/msa/distance_matrix.py +118 -0
- pyseqalignment-0.1.4/src/pyseqalign/msa/guide_tree.py +191 -0
- pyseqalignment-0.1.4/src/pyseqalign/msa/progressive.py +221 -0
- pyseqalignment-0.1.4/src/pyseqalign/scoring/protocols.py +28 -0
- {pyseqalignment-0.1.2 → pyseqalignment-0.1.4/src/pyseqalignment.egg-info}/PKG-INFO +50 -5
- {pyseqalignment-0.1.2 → pyseqalignment-0.1.4}/src/pyseqalignment.egg-info/SOURCES.txt +13 -0
- pyseqalignment-0.1.4/tests/test_msa_logo.py +62 -0
- {pyseqalignment-0.1.2 → pyseqalignment-0.1.4}/LICENSE +0 -0
- {pyseqalignment-0.1.2 → pyseqalignment-0.1.4}/setup.cfg +0 -0
- {pyseqalignment-0.1.2 → pyseqalignment-0.1.4}/src/pyseqalign/accel.py +0 -0
- {pyseqalignment-0.1.2 → pyseqalignment-0.1.4}/src/pyseqalign/core/needleman_wunsch.py +0 -0
- {pyseqalignment-0.1.2 → pyseqalignment-0.1.4}/src/pyseqalign/core/smith_waterman.py +0 -0
- {pyseqalignment-0.1.2 → pyseqalignment-0.1.4}/src/pyseqalign/cpp/build_cpp_aligner.sh +0 -0
- {pyseqalignment-0.1.2 → pyseqalignment-0.1.4}/src/pyseqalign/cpp/cpp_aligner.cpp +0 -0
- {pyseqalignment-0.1.2 → pyseqalignment-0.1.4}/src/pyseqalign/learning/__init__.py +0 -0
- {pyseqalignment-0.1.2 → pyseqalignment-0.1.4}/src/pyseqalign/learning/aleph.py +0 -0
- {pyseqalignment-0.1.2 → pyseqalignment-0.1.4}/src/pyseqalign/learning/aleph_files/__init__.py +0 -0
- {pyseqalignment-0.1.2 → pyseqalignment-0.1.4}/src/pyseqalign/learning/aleph_files/aleph_swi_ak.pl +0 -0
- {pyseqalignment-0.1.2 → pyseqalignment-0.1.4}/src/pyseqalign/learning/base.py +0 -0
- {pyseqalignment-0.1.2 → pyseqalignment-0.1.4}/src/pyseqalign/learning/popper.py +0 -0
- {pyseqalignment-0.1.2 → pyseqalignment-0.1.4}/src/pyseqalign/learning/task_builder.py +0 -0
- {pyseqalignment-0.1.2 → pyseqalignment-0.1.4}/src/pyseqalign/prolog/__init__.py +0 -0
- {pyseqalignment-0.1.2 → pyseqalignment-0.1.4}/src/pyseqalign/prolog/engine.py +0 -0
- {pyseqalignment-0.1.2 → pyseqalignment-0.1.4}/src/pyseqalign/prolog/knowledge/__init__.py +0 -0
- {pyseqalignment-0.1.2 → pyseqalignment-0.1.4}/src/pyseqalign/prolog/knowledge/amino_acids.pl +0 -0
- {pyseqalignment-0.1.2 → pyseqalignment-0.1.4}/src/pyseqalign/prolog/knowledge/blosum50.pl +0 -0
- {pyseqalignment-0.1.2 → pyseqalignment-0.1.4}/src/pyseqalign/prolog/knowledge/defaults.pl +0 -0
- {pyseqalignment-0.1.2 → pyseqalignment-0.1.4}/src/pyseqalign/prolog/knowledge/distances.pl +0 -0
- {pyseqalignment-0.1.2 → pyseqalignment-0.1.4}/src/pyseqalign/scoring/__init__.py +0 -0
- {pyseqalignment-0.1.2 → pyseqalignment-0.1.4}/src/pyseqalign/scoring/distance.py +0 -0
- {pyseqalignment-0.1.2 → pyseqalignment-0.1.4}/src/pyseqalign/scoring/matrices.py +0 -0
- {pyseqalignment-0.1.2 → pyseqalignment-0.1.4}/src/pyseqalign/scoring/matrix_data/BLOSUM100 +0 -0
- {pyseqalignment-0.1.2 → pyseqalignment-0.1.4}/src/pyseqalign/scoring/matrix_data/BLOSUM50 +0 -0
- {pyseqalignment-0.1.2 → pyseqalignment-0.1.4}/src/pyseqalign/scoring/matrix_data/BLOSUM60 +0 -0
- {pyseqalignment-0.1.2 → pyseqalignment-0.1.4}/src/pyseqalign/scoring/matrix_data/BLOSUM62 +0 -0
- {pyseqalignment-0.1.2 → pyseqalignment-0.1.4}/src/pyseqalign/scoring/matrix_data/BLOSUM70 +0 -0
- {pyseqalignment-0.1.2 → pyseqalignment-0.1.4}/src/pyseqalign/scoring/matrix_data/BLOSUM80 +0 -0
- {pyseqalignment-0.1.2 → pyseqalignment-0.1.4}/src/pyseqalign/scoring/matrix_data/BLOSUM90 +0 -0
- {pyseqalignment-0.1.2 → pyseqalignment-0.1.4}/src/pyseqalign/scoring/matrix_data/PAM150 +0 -0
- {pyseqalignment-0.1.2 → pyseqalignment-0.1.4}/src/pyseqalign/scoring/matrix_data/PAM200 +0 -0
- {pyseqalignment-0.1.2 → pyseqalignment-0.1.4}/src/pyseqalign/scoring/matrix_data/PAM250 +0 -0
- {pyseqalignment-0.1.2 → pyseqalignment-0.1.4}/src/pyseqalign/scoring/matrix_data/PAM50 +0 -0
- {pyseqalignment-0.1.2 → pyseqalignment-0.1.4}/src/pyseqalign/scoring/matrix_data/__init__.py +0 -0
- {pyseqalignment-0.1.2 → pyseqalignment-0.1.4}/src/pyseqalign/utils/__init__.py +0 -0
- {pyseqalignment-0.1.2 → pyseqalignment-0.1.4}/src/pyseqalign/utils/helpers.py +0 -0
- {pyseqalignment-0.1.2 → pyseqalignment-0.1.4}/src/pyseqalignment.egg-info/dependency_links.txt +0 -0
- {pyseqalignment-0.1.2 → pyseqalignment-0.1.4}/src/pyseqalignment.egg-info/requires.txt +0 -0
- {pyseqalignment-0.1.2 → pyseqalignment-0.1.4}/src/pyseqalignment.egg-info/top_level.txt +0 -0
- {pyseqalignment-0.1.2 → pyseqalignment-0.1.4}/tests/test_learning.py +0 -0
- {pyseqalignment-0.1.2 → pyseqalignment-0.1.4}/tests/test_needleman_wunsch.py +0 -0
- {pyseqalignment-0.1.2 → pyseqalignment-0.1.4}/tests/test_scoring.py +0 -0
- {pyseqalignment-0.1.2 → pyseqalignment-0.1.4}/tests/test_smith_waterman.py +0 -0
- {pyseqalignment-0.1.2 → pyseqalignment-0.1.4}/tests/test_utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: pyseqalignment
|
|
3
|
-
Version: 0.1.2
|
|
3
|
+
Version: 0.1.4
|
|
4
4
|
Summary: pySeqAlign -- sequence alignment with Prolog-style distance functions and ILP learning
|
|
5
5
|
Author-email: Andreas Karwath <a.karwath@bham.ac.uk>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -56,7 +56,9 @@ pySeqAlign provides Smith-Waterman (local) and Needleman-Wunsch (global) sequenc
|
|
|
56
56
|
## Features
|
|
57
57
|
|
|
58
58
|
- **Smith-Waterman** local alignment with k-best non-overlapping results
|
|
59
|
-
- **Needleman-Wunsch** global alignment
|
|
59
|
+
- **Needleman-Wunsch** global alignment (linear and **affine** gap costs)
|
|
60
|
+
- **Multiple sequence alignment** -- progressive MSA over a neighbor-joining guide tree, parameterised by any scoring function (`pyseqalign.msa`)
|
|
61
|
+
- **Relational sequence logos** -- information-content logos of aligned logical atoms, with per-column least-general generalisation, after Karwath & Kersting (ILP 2006) (`pyseqalign.logo`)
|
|
60
62
|
- **Prolog-based distance functions** via SWI-Prolog integration (optional)
|
|
61
63
|
- **Substitution matrices** -- BLOSUM (50, 60, 62, 70, 80, 90, 100) and PAM (50, 150, 200, 250) bundled; any NCBI-format matrix loadable from file or downloaded at runtime
|
|
62
64
|
- **Nienhuys-Cheng distance** for recursive structural comparison of logical atoms
|
|
@@ -309,19 +311,62 @@ For reference, other notable systems in the field include:
|
|
|
309
311
|
- [Metagol](https://github.com/metagol/metagol) -- Meta-Interpretive Learning
|
|
310
312
|
- [DeepStochLog](https://github.com/ML-KULeuven/deepstochlog) -- Neural-symbolic ILP combining logic and neural networks
|
|
311
313
|
|
|
314
|
+
## Multiple alignment & relational logos
|
|
315
|
+
|
|
316
|
+
pySeqAlign can align *and* summarise sequences of structured logical atoms,
|
|
317
|
+
reproducing Karwath & Kersting, *Relational Sequence Alignments and Logos*
|
|
318
|
+
(ILP 2006) — with no learning involved:
|
|
319
|
+
|
|
320
|
+
```python
|
|
321
|
+
from pyseqalign.msa import progressive_msa
|
|
322
|
+
from pyseqalign.logo import relational_logo
|
|
323
|
+
from pyseqalign.scoring.distance import AtomDistance
|
|
324
|
+
|
|
325
|
+
# atoms as structured tuples: id -> (predicate, *args); 0 = gap
|
|
326
|
+
atom_store = {1: ('h', 'a', 'r', 'm'), 2: ('h', 'a', 'r', 'l'), 3: ('s', 'p', 'm')}
|
|
327
|
+
seqs = {'d1': [1, 2, 3], 'd2': [1, 3, 2], 'd3': [2, 3]}
|
|
328
|
+
|
|
329
|
+
scoring = AtomDistance(atom_store=atom_store, gap_score=-0.5) # Nienhuys-Cheng
|
|
330
|
+
msa = progressive_msa(seqs, scoring, gap_open=-1.0, gap_extend=-0.1)
|
|
331
|
+
rows = list(msa.aligned_sequences.values())
|
|
332
|
+
|
|
333
|
+
relational_logo(rows, atom_store, 'logo.png', title='example fold')
|
|
334
|
+
```
|
|
335
|
+
|
|
336
|
+
`progressive_msa` accepts **any** scoring function, so a reward matrix learned
|
|
337
|
+
by [pyREAL](https://github.com/athro/pyREAL)'s boosting can drive the alignment
|
|
338
|
+
in place of the fixed distance. Runnable reproductions of the paper's SCOP and
|
|
339
|
+
balloon logos are in [`examples/`](examples/).
|
|
340
|
+
|
|
312
341
|
## Fast C++ aligner (optional)
|
|
313
342
|
|
|
314
343
|
The pure-Python aligners are fine for typical use. For heavy workloads (e.g.
|
|
315
344
|
boosting that re-aligns thousands of sequence pairs per iteration), an optional
|
|
316
|
-
**C++ affine-gap Needleman-Wunsch** kernel is provided.
|
|
317
|
-
|
|
345
|
+
**C++ affine-gap Needleman-Wunsch** kernel is provided.
|
|
346
|
+
|
|
347
|
+
**It is compiled automatically at install time** (best effort). The package is
|
|
348
|
+
distributed as an sdist, so `pip install pyseqalignment` builds from source and
|
|
349
|
+
tries to compile the accelerator for your Python/ABI/platform using a C++
|
|
350
|
+
compiler + pybind11 (pulled in as a build dependency). On macOS/Linux with a
|
|
351
|
+
compiler present this "just works"; if no compiler is available the install
|
|
352
|
+
still succeeds and the library falls back to the pure-Python aligner. Check with:
|
|
353
|
+
|
|
354
|
+
```python
|
|
355
|
+
from pyseqalign.accel import cpp_available
|
|
356
|
+
print(cpp_available()) # True if the accelerator compiled at install
|
|
357
|
+
```
|
|
358
|
+
|
|
359
|
+
If you installed without a compiler and later want the accelerator, install one
|
|
360
|
+
(Xcode Command Line Tools on macOS, `build-essential` on Debian/Ubuntu) and
|
|
361
|
+
either reinstall (`pip install --force-reinstall --no-cache-dir --no-binary pyseqalignment pyseqalignment`)
|
|
362
|
+
or build it in place once:
|
|
318
363
|
|
|
319
364
|
```bash
|
|
320
365
|
pip install pybind11
|
|
321
366
|
src/pyseqalign/cpp/build_cpp_aligner.sh # or: PY=$(which python) src/.../build_cpp_aligner.sh
|
|
322
367
|
```
|
|
323
368
|
|
|
324
|
-
|
|
369
|
+
Either way it compiles the extension into the `pyseqalign` package. Then:
|
|
325
370
|
|
|
326
371
|
```python
|
|
327
372
|
from pyseqalign.accel import cpp_available, load
|
|
@@ -18,7 +18,9 @@ pySeqAlign provides Smith-Waterman (local) and Needleman-Wunsch (global) sequenc
|
|
|
18
18
|
## Features
|
|
19
19
|
|
|
20
20
|
- **Smith-Waterman** local alignment with k-best non-overlapping results
|
|
21
|
-
- **Needleman-Wunsch** global alignment
|
|
21
|
+
- **Needleman-Wunsch** global alignment (linear and **affine** gap costs)
|
|
22
|
+
- **Multiple sequence alignment** -- progressive MSA over a neighbor-joining guide tree, parameterised by any scoring function (`pyseqalign.msa`)
|
|
23
|
+
- **Relational sequence logos** -- information-content logos of aligned logical atoms, with per-column least-general generalisation, after Karwath & Kersting (ILP 2006) (`pyseqalign.logo`)
|
|
22
24
|
- **Prolog-based distance functions** via SWI-Prolog integration (optional)
|
|
23
25
|
- **Substitution matrices** -- BLOSUM (50, 60, 62, 70, 80, 90, 100) and PAM (50, 150, 200, 250) bundled; any NCBI-format matrix loadable from file or downloaded at runtime
|
|
24
26
|
- **Nienhuys-Cheng distance** for recursive structural comparison of logical atoms
|
|
@@ -271,19 +273,62 @@ For reference, other notable systems in the field include:
|
|
|
271
273
|
- [Metagol](https://github.com/metagol/metagol) -- Meta-Interpretive Learning
|
|
272
274
|
- [DeepStochLog](https://github.com/ML-KULeuven/deepstochlog) -- Neural-symbolic ILP combining logic and neural networks
|
|
273
275
|
|
|
276
|
+
## Multiple alignment & relational logos
|
|
277
|
+
|
|
278
|
+
pySeqAlign can align *and* summarise sequences of structured logical atoms,
|
|
279
|
+
reproducing Karwath & Kersting, *Relational Sequence Alignments and Logos*
|
|
280
|
+
(ILP 2006) — with no learning involved:
|
|
281
|
+
|
|
282
|
+
```python
|
|
283
|
+
from pyseqalign.msa import progressive_msa
|
|
284
|
+
from pyseqalign.logo import relational_logo
|
|
285
|
+
from pyseqalign.scoring.distance import AtomDistance
|
|
286
|
+
|
|
287
|
+
# atoms as structured tuples: id -> (predicate, *args); 0 = gap
|
|
288
|
+
atom_store = {1: ('h', 'a', 'r', 'm'), 2: ('h', 'a', 'r', 'l'), 3: ('s', 'p', 'm')}
|
|
289
|
+
seqs = {'d1': [1, 2, 3], 'd2': [1, 3, 2], 'd3': [2, 3]}
|
|
290
|
+
|
|
291
|
+
scoring = AtomDistance(atom_store=atom_store, gap_score=-0.5) # Nienhuys-Cheng
|
|
292
|
+
msa = progressive_msa(seqs, scoring, gap_open=-1.0, gap_extend=-0.1)
|
|
293
|
+
rows = list(msa.aligned_sequences.values())
|
|
294
|
+
|
|
295
|
+
relational_logo(rows, atom_store, 'logo.png', title='example fold')
|
|
296
|
+
```
|
|
297
|
+
|
|
298
|
+
`progressive_msa` accepts **any** scoring function, so a reward matrix learned
|
|
299
|
+
by [pyREAL](https://github.com/athro/pyREAL)'s boosting can drive the alignment
|
|
300
|
+
in place of the fixed distance. Runnable reproductions of the paper's SCOP and
|
|
301
|
+
balloon logos are in [`examples/`](examples/).
|
|
302
|
+
|
|
274
303
|
## Fast C++ aligner (optional)
|
|
275
304
|
|
|
276
305
|
The pure-Python aligners are fine for typical use. For heavy workloads (e.g.
|
|
277
306
|
boosting that re-aligns thousands of sequence pairs per iteration), an optional
|
|
278
|
-
**C++ affine-gap Needleman-Wunsch** kernel is provided.
|
|
279
|
-
|
|
307
|
+
**C++ affine-gap Needleman-Wunsch** kernel is provided.
|
|
308
|
+
|
|
309
|
+
**It is compiled automatically at install time** (best effort). The package is
|
|
310
|
+
distributed as an sdist, so `pip install pyseqalignment` builds from source and
|
|
311
|
+
tries to compile the accelerator for your Python/ABI/platform using a C++
|
|
312
|
+
compiler + pybind11 (pulled in as a build dependency). On macOS/Linux with a
|
|
313
|
+
compiler present this "just works"; if no compiler is available the install
|
|
314
|
+
still succeeds and the library falls back to the pure-Python aligner. Check with:
|
|
315
|
+
|
|
316
|
+
```python
|
|
317
|
+
from pyseqalign.accel import cpp_available
|
|
318
|
+
print(cpp_available()) # True if the accelerator compiled at install
|
|
319
|
+
```
|
|
320
|
+
|
|
321
|
+
If you installed without a compiler and later want the accelerator, install one
|
|
322
|
+
(Xcode Command Line Tools on macOS, `build-essential` on Debian/Ubuntu) and
|
|
323
|
+
either reinstall (`pip install --force-reinstall --no-cache-dir --no-binary pyseqalignment pyseqalignment`)
|
|
324
|
+
or build it in place once:
|
|
280
325
|
|
|
281
326
|
```bash
|
|
282
327
|
pip install pybind11
|
|
283
328
|
src/pyseqalign/cpp/build_cpp_aligner.sh # or: PY=$(which python) src/.../build_cpp_aligner.sh
|
|
284
329
|
```
|
|
285
330
|
|
|
286
|
-
|
|
331
|
+
Either way it compiles the extension into the `pyseqalign` package. Then:
|
|
287
332
|
|
|
288
333
|
```python
|
|
289
334
|
from pyseqalign.accel import cpp_available, load
|
|
@@ -1,12 +1,14 @@
|
|
|
1
1
|
[build-system]
|
|
2
|
-
|
|
2
|
+
# pybind11 is a build-time dep so the OPTIONAL C++ aligner can be compiled at
|
|
3
|
+
# install time (best-effort; see setup.py). It is not a runtime dependency.
|
|
4
|
+
requires = ["setuptools>=68.0", "pybind11>=2.10"]
|
|
3
5
|
build-backend = "setuptools.build_meta"
|
|
4
6
|
|
|
5
7
|
[project]
|
|
6
8
|
# PyPI distribution name (the import package is `pyseqalign`; the name
|
|
7
9
|
# `pyseqalign` was blocked by PyPI's similarity guard vs. an existing project).
|
|
8
10
|
name = "pyseqalignment"
|
|
9
|
-
version = "0.1.2"
|
|
11
|
+
version = "0.1.4"
|
|
10
12
|
description = "pySeqAlign -- sequence alignment with Prolog-style distance functions and ILP learning"
|
|
11
13
|
readme = "README.md"
|
|
12
14
|
license = "MIT"
|
|
@@ -78,6 +80,11 @@ line-length = 100
|
|
|
78
80
|
|
|
79
81
|
[tool.ruff.lint]
|
|
80
82
|
select = ["E", "F", "W", "I", "N", "UP"]
|
|
83
|
+
ignore = [
|
|
84
|
+
"N803", # uppercase argument names (matrix math convention: M, Ix, Iy)
|
|
85
|
+
"N806", # uppercase local variables in functions (same reason)
|
|
86
|
+
"E741", # ambiguous variable name 'l' (used in tree (m, l, r) unpacking)
|
|
87
|
+
]
|
|
81
88
|
|
|
82
89
|
[tool.mypy]
|
|
83
90
|
python_version = "3.10"
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"""Best-effort build of the optional C++ affine-gap aligner.
|
|
2
|
+
|
|
3
|
+
pySeqAlign's core is pure Python. The C++ aligner (``pyseqalign.cpp_aligner``)
|
|
4
|
+
is an OPTIONAL accelerator (~100-270x faster, identical results). We try to
|
|
5
|
+
compile it at install time on any platform that has a C++ compiler + pybind11;
|
|
6
|
+
if that fails (no compiler, no pybind11, unsupported platform, ...) the install
|
|
7
|
+
STILL SUCCEEDS and the library transparently falls back to the pure-Python
|
|
8
|
+
aligner -- see ``pyseqalign.accel``.
|
|
9
|
+
|
|
10
|
+
This is why the project publishes an sdist (not a pure-Python wheel): pip builds
|
|
11
|
+
from source on the target machine, giving every install a chance to compile the
|
|
12
|
+
accelerator locally for its own Python/ABI/platform.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import sys
|
|
18
|
+
|
|
19
|
+
from setuptools import Extension, setup
|
|
20
|
+
from setuptools.command.build_ext import build_ext
|
|
21
|
+
|
|
22
|
+
# Source of the optional pybind11 accelerator, relative to the project root.
_CPP = 'src/pyseqalign/cpp/cpp_aligner.cpp'

# Stays empty (pure-Python install) unless the extension can be declared below.
ext_modules: list[Extension] = []

# Skip Windows entirely: the compile flags below are GCC/Clang style.
if sys.platform != 'win32':
    try:
        import pybind11

        ext_modules.append(
            Extension(
                'pyseqalign.cpp_aligner',
                [_CPP],
                language='c++',
                include_dirs=[pybind11.get_include()],
                extra_compile_args=['-O3', '-std=c++14'],
                optional=True,  # setuptools won't fail the build if this ext won't compile
            )
        )
    except Exception as exc:  # pybind11 missing -> skip the accelerator
        print(f'pyseqalign: skipping optional C++ aligner ({exc}); pure-Python fallback.')
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class BestEffortBuildExt(build_ext):
    """``build_ext`` variant that reports build failures instead of raising them.

    Both the overall ``run`` step and each per-extension ``build_extension``
    step are guarded, so an exception raised while compiling the accelerator is
    printed and the install carries on without it.
    """

    @staticmethod
    def _attempt(step, *args) -> None:
        """Call ``step(*args)``; print a fallback notice if it raises."""
        try:
            step(*args)
        except Exception as exc:  # pragma: no cover - depends on build env
            print(f'pyseqalign: optional C++ aligner not built ({exc}); pure-Python fallback.')

    def run(self) -> None:
        self._attempt(super().run)

    def build_extension(self, ext) -> None:
        self._attempt(super().build_extension, ext)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
# ``ext_modules`` is empty when the accelerator was skipped above; the custom
# ``build_ext`` command keeps a failed compile from aborting the install.
# NOTE(review): remaining project metadata presumably comes from pyproject.toml.
setup(ext_modules=ext_modules, cmdclass={'build_ext': BestEffortBuildExt})
|
|
@@ -1,14 +1,21 @@
|
|
|
1
1
|
"""pySeqAlign -- Sequence alignment with Prolog-style distance functions and ILP learning."""
|
|
2
2
|
|
|
3
|
-
from pyseqalign.core.alignment import AlignmentResult, LocalAlignmentResult
|
|
3
|
+
from pyseqalign.core.alignment import (
|
|
4
|
+
AffineAlignmentResult,
|
|
5
|
+
AlignmentResult,
|
|
6
|
+
LocalAlignmentResult,
|
|
7
|
+
)
|
|
4
8
|
from pyseqalign.core.needleman_wunsch import NeedlemanWunsch
|
|
9
|
+
from pyseqalign.core.nw_affine import NeedlemanWunschAffine
|
|
5
10
|
from pyseqalign.core.smith_waterman import SmithWaterman
|
|
6
11
|
|
|
7
|
-
__version__ = "0.1.2"
|
|
12
|
+
__version__ = "0.1.4"
|
|
8
13
|
|
|
9
14
|
__all__ = [
|
|
10
15
|
"SmithWaterman",
|
|
11
16
|
"NeedlemanWunsch",
|
|
17
|
+
"NeedlemanWunschAffine",
|
|
12
18
|
"AlignmentResult",
|
|
19
|
+
"AffineAlignmentResult",
|
|
13
20
|
"LocalAlignmentResult",
|
|
14
21
|
]
|
|
@@ -1,12 +1,19 @@
|
|
|
1
1
|
"""Core alignment algorithms."""
|
|
2
2
|
|
|
3
|
-
from pyseqalign.core.alignment import AlignmentResult, LocalAlignmentResult
|
|
3
|
+
from pyseqalign.core.alignment import (
|
|
4
|
+
AffineAlignmentResult,
|
|
5
|
+
AlignmentResult,
|
|
6
|
+
LocalAlignmentResult,
|
|
7
|
+
)
|
|
4
8
|
from pyseqalign.core.needleman_wunsch import NeedlemanWunsch
|
|
9
|
+
from pyseqalign.core.nw_affine import NeedlemanWunschAffine
|
|
5
10
|
from pyseqalign.core.smith_waterman import SmithWaterman
|
|
6
11
|
|
|
7
12
|
__all__ = [
|
|
8
13
|
"SmithWaterman",
|
|
9
14
|
"NeedlemanWunsch",
|
|
15
|
+
"NeedlemanWunschAffine",
|
|
10
16
|
"AlignmentResult",
|
|
17
|
+
"AffineAlignmentResult",
|
|
11
18
|
"LocalAlignmentResult",
|
|
12
19
|
]
|
|
@@ -22,6 +22,19 @@ class AlignmentResult:
|
|
|
22
22
|
length: int
|
|
23
23
|
|
|
24
24
|
|
|
25
|
+
@dataclass
class AffineAlignmentResult(AlignmentResult):
    """Extended result from affine-gap alignment.

    Adds gap statistics on top of the fields inherited from
    ``AlignmentResult``. Both counters default to 0, so a result can be built
    without supplying them.

    Attributes:
        gap_opens: Number of gap-open events in both sequences combined.
        gap_extensions: Number of gap-extension events.
    """

    gap_opens: int = 0
    gap_extensions: int = 0
|
|
36
|
+
|
|
37
|
+
|
|
25
38
|
@dataclass
|
|
26
39
|
class LocalAlignmentResult:
|
|
27
40
|
"""Result of a single local (Smith-Waterman) alignment.
|
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
"""Needleman-Wunsch global alignment with affine gap penalties.
|
|
2
|
+
|
|
3
|
+
Translated from the legacy C++ AlignerAffine::_align() implementation.
|
|
4
|
+
Uses three DP matrices (M, Ix, Iy) to distinguish gap-open from gap-extend.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import numpy as np
|
|
10
|
+
|
|
11
|
+
from pyseqalign.core.alignment import AffineAlignmentResult
|
|
12
|
+
from pyseqalign.scoring.protocols import ScoringFunction
|
|
13
|
+
|
|
14
|
+
# Layer indices along the first axis of the score and back-pointer tensors.
_M = 0  # match/mismatch
_IX = 1  # gap in target (consuming query element)
_IY = 2  # gap in query (consuming target element)

# Source layer of each Iy candidate, in candidate order: open from M, extend Iy,
# open from Ix. Unlike the M and Ix candidate tuples, this order differs from
# the layer numbering, so the winning candidate index has to be translated
# before it is stored as a back-pointer.
_IY_SOURCES = (_M, _IY, _IX)


class NeedlemanWunschAffine:
    """Needleman-Wunsch with affine gap penalties.

    Three DP layers are kept per cell: ``M`` (the cell ends in a
    match/mismatch), ``Ix`` (it ends in a gap in the target) and ``Iy`` (it
    ends in a gap in the query).

    Recurrences (similarity mode):
        M[i][j]  = score(q[i], t[j]) + max(M[i-1][j-1], Ix[i-1][j-1], Iy[i-1][j-1])
        Ix[i][j] = max(M[i-1][j] + gap_open, Ix[i-1][j] + gap_extend, Iy[i-1][j] + gap_open)
        Iy[i][j] = max(M[i][j-1] + gap_open, Iy[i][j-1] + gap_extend, Ix[i][j-1] + gap_open)

    The first element of a gap costs ``gap_open`` only; every further element
    of the same gap costs ``gap_extend``.

    Args:
        scoring: Scoring function (element ID 0 = gap).
        gap_open: Cost for opening a new gap (should be negative for penalties).
        gap_extend: Cost for extending an existing gap (should be negative,
            typically less severe than gap_open).
    """

    def __init__(
        self,
        scoring: ScoringFunction,
        gap_open: float = -2.5,
        gap_extend: float = -0.25,
    ) -> None:
        self.scoring = scoring
        self.gap_open = gap_open
        self.gap_extend = gap_extend

    def align(self, seq1: list[int], seq2: list[int]) -> AffineAlignmentResult:
        """Compute the optimal global alignment with affine gap penalties.

        Args:
            seq1: Query sequence (list of integer element IDs).
            seq2: Target sequence.

        Returns:
            An ``AffineAlignmentResult`` with aligned sequences and gap statistics.
        """
        n = len(seq1)
        m = len(seq2)

        F, B = self._fill_matrices(seq1, seq2)

        # The alignment may end in any of the three layers; take the best one.
        end_scores = (F[_M, n, m], F[_IX, n, m], F[_IY, n, m])
        best_end = _argmax3(end_scores)
        score = end_scores[best_end]

        align1, align2, gap_opens, gap_extensions = self._traceback(B, seq1, seq2, best_end, n, m)

        return AffineAlignmentResult(
            query=align1,
            target=align2,
            score=float(score),
            length=len(align1),
            gap_opens=gap_opens,
            gap_extensions=gap_extensions,
        )

    def _fill_matrices(self, seq1: list[int], seq2: list[int]) -> tuple[np.ndarray, np.ndarray]:
        """Fill the score tensor ``F`` and the back-pointer tensor ``B``.

        Returns:
            ``(F, B)`` where ``F[k, i, j]`` is the best score of aligning
            ``seq1[:i]`` with ``seq2[:j]`` ending in layer ``k``, and
            ``B[k, i, j]`` holds ``(from_k, from_i, from_j)`` of its
            predecessor (``-1`` entries mark cells that were never set).
        """
        n = len(seq1)
        m = len(seq2)
        d = self.gap_open
        e = self.gap_extend
        pair_score = self.scoring.score

        F = np.full((3, n + 1, m + 1), -np.inf, dtype=np.float64)
        B = np.full((3, n + 1, m + 1, 3), -1, dtype=np.int32)
        F[_M, 0, 0] = 0.0

        # Left border: a single leading gap in the target (Ix layer).
        # M and Iy stay at -inf along this border.
        for i in range(1, n + 1):
            F[_IX, i, 0] = d if i == 1 else F[_IX, i - 1, 0] + e
            B[_IX, i, 0] = (_IX, i - 1, 0)

        # Top border: a single leading gap in the query (Iy layer).
        # M and Ix stay at -inf along this border.
        for j in range(1, m + 1):
            F[_IY, 0, j] = d if j == 1 else F[_IY, 0, j - 1] + e
            B[_IY, 0, j] = (_IY, 0, j - 1)

        for i in range(1, n + 1):
            for j in range(1, m + 1):
                # M: diagonal transition; candidate order equals layer order.
                s = pair_score(seq1[i - 1], seq2[j - 1])
                candidates_m = (
                    F[_M, i - 1, j - 1] + s,
                    F[_IX, i - 1, j - 1] + s,
                    F[_IY, i - 1, j - 1] + s,
                )
                best_k = _argmax3(candidates_m)
                F[_M, i, j] = candidates_m[best_k]
                B[_M, i, j] = (best_k, i - 1, j - 1)

                # Ix: consume query[i], skip target; candidate order equals layer order.
                candidates_ix = (
                    F[_M, i - 1, j] + d,  # new gap
                    F[_IX, i - 1, j] + e,  # extend gap
                    F[_IY, i - 1, j] + d,  # new gap
                )
                best_k = _argmax3(candidates_ix)
                F[_IX, i, j] = candidates_ix[best_k]
                B[_IX, i, j] = (best_k, i - 1, j)

                # Iy: skip query, consume target[j]. The candidates are ordered
                # (M, Iy, Ix), so the winning index is mapped to its real source
                # layer; storing the raw index would send the traceback into Ix
                # whenever an Iy gap is extended (and into Iy when opened from Ix).
                candidates_iy = (
                    F[_M, i, j - 1] + d,  # new gap
                    F[_IY, i, j - 1] + e,  # extend gap
                    F[_IX, i, j - 1] + d,  # new gap
                )
                best_k = _argmax3(candidates_iy)
                F[_IY, i, j] = candidates_iy[best_k]
                B[_IY, i, j] = (_IY_SOURCES[best_k], i, j - 1)

        return F, B

    @staticmethod
    def _traceback(
        B: np.ndarray,
        seq1: list[int],
        seq2: list[int],
        start_k: int,
        start_i: int,
        start_j: int,
    ) -> tuple[list[int], list[int], int, int]:
        """Walk the traceback matrix to produce aligned sequences.

        Args:
            B: Back-pointer tensor; ``B[k, i, j]`` is ``(from_k, from_i, from_j)``.
            seq1: Query sequence.
            seq2: Target sequence.
            start_k: Layer in which the walk starts.
            start_i: Row at which the walk starts.
            start_j: Column at which the walk starts.

        Returns:
            ``(aligned_query, aligned_target, gap_opens, gap_extensions)``,
            with gaps written as element ID 0.
        """
        align1: list[int] = []
        align2: list[int] = []
        gap_opens = 0
        gap_extensions = 0

        k, i, j = start_k, start_i, start_j
        # Layer of the column emitted just before this one, i.e. the column to
        # its right in the final alignment (the walk runs end to start).
        prev_k = -1

        while i > 0 or j > 0:
            from_k, from_i, from_j = int(B[k, i, j, 0]), int(B[k, i, j, 1]), int(B[k, i, j, 2])

            if from_i < 0 or from_j < 0:
                # Reached an unset back-pointer -- shouldn't happen; stop
                # and return what has been collected so far.
                break

            if k == _M:
                # Diagonal: match/mismatch.
                align1.append(seq1[i - 1])
                align2.append(seq2[j - 1])
            elif k == _IX:
                # Gap in target. A run of same-layer columns counts as one
                # open (its first visited column) plus extensions.
                align1.append(seq1[i - 1])
                align2.append(0)
                if prev_k != _IX:
                    gap_opens += 1
                else:
                    gap_extensions += 1
            else:  # _IY
                # Gap in query.
                align1.append(0)
                align2.append(seq2[j - 1])
                if prev_k != _IY:
                    gap_opens += 1
                else:
                    gap_extensions += 1

            prev_k = k
            k, i, j = from_k, from_i, from_j

        # Columns were collected end-to-start.
        align1.reverse()
        align2.reverse()
        return align1, align2, gap_opens, gap_extensions


def _argmax3(vals: tuple[float, float, float]) -> int:
    """Return index of maximum among exactly three values.

    Ties resolve toward the lower index.
    """
    if vals[0] >= vals[1]:
        return 0 if vals[0] >= vals[2] else 2
    return 1 if vals[1] >= vals[2] else 2
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"""Relational sequence logos — position-specific profiles of logical atoms."""
|
|
2
|
+
|
|
3
|
+
from pyseqalign.logo.probability import FreqDist, LidstoneProbDist, MLEProbDist
|
|
4
|
+
from pyseqalign.logo.profile import PositionProfile, RelationalProfile
|
|
5
|
+
from pyseqalign.logo.render import column_ic, lgg_atoms, relational_logo, term_str
|
|
6
|
+
|
|
7
|
+
__all__ = [
|
|
8
|
+
'FreqDist',
|
|
9
|
+
'MLEProbDist',
|
|
10
|
+
'LidstoneProbDist',
|
|
11
|
+
'PositionProfile',
|
|
12
|
+
'RelationalProfile',
|
|
13
|
+
'relational_logo',
|
|
14
|
+
'column_ic',
|
|
15
|
+
'lgg_atoms',
|
|
16
|
+
'term_str',
|
|
17
|
+
]
|