pyseqalignment 0.1.1__tar.gz → 0.1.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pyseqalignment-0.1.1/src/pyseqalignment.egg-info → pyseqalignment-0.1.2}/PKG-INFO +31 -1
- {pyseqalignment-0.1.1 → pyseqalignment-0.1.2}/README.md +30 -0
- {pyseqalignment-0.1.1 → pyseqalignment-0.1.2}/pyproject.toml +2 -1
- pyseqalignment-0.1.2/src/pyseqalign/accel.py +33 -0
- pyseqalignment-0.1.2/src/pyseqalign/cpp/build_cpp_aligner.sh +12 -0
- pyseqalignment-0.1.2/src/pyseqalign/cpp/cpp_aligner.cpp +140 -0
- {pyseqalignment-0.1.1 → pyseqalignment-0.1.2/src/pyseqalignment.egg-info}/PKG-INFO +31 -1
- {pyseqalignment-0.1.1 → pyseqalignment-0.1.2}/src/pyseqalignment.egg-info/SOURCES.txt +3 -0
- {pyseqalignment-0.1.1 → pyseqalignment-0.1.2}/LICENSE +0 -0
- {pyseqalignment-0.1.1 → pyseqalignment-0.1.2}/setup.cfg +0 -0
- {pyseqalignment-0.1.1 → pyseqalignment-0.1.2}/src/pyseqalign/__init__.py +0 -0
- {pyseqalignment-0.1.1 → pyseqalignment-0.1.2}/src/pyseqalign/core/__init__.py +0 -0
- {pyseqalignment-0.1.1 → pyseqalignment-0.1.2}/src/pyseqalign/core/alignment.py +0 -0
- {pyseqalignment-0.1.1 → pyseqalignment-0.1.2}/src/pyseqalign/core/needleman_wunsch.py +0 -0
- {pyseqalignment-0.1.1 → pyseqalignment-0.1.2}/src/pyseqalign/core/smith_waterman.py +0 -0
- {pyseqalignment-0.1.1 → pyseqalignment-0.1.2}/src/pyseqalign/learning/__init__.py +0 -0
- {pyseqalignment-0.1.1 → pyseqalignment-0.1.2}/src/pyseqalign/learning/aleph.py +0 -0
- {pyseqalignment-0.1.1 → pyseqalignment-0.1.2}/src/pyseqalign/learning/aleph_files/__init__.py +0 -0
- {pyseqalignment-0.1.1 → pyseqalignment-0.1.2}/src/pyseqalign/learning/aleph_files/aleph_swi_ak.pl +0 -0
- {pyseqalignment-0.1.1 → pyseqalignment-0.1.2}/src/pyseqalign/learning/base.py +0 -0
- {pyseqalignment-0.1.1 → pyseqalignment-0.1.2}/src/pyseqalign/learning/popper.py +0 -0
- {pyseqalignment-0.1.1 → pyseqalignment-0.1.2}/src/pyseqalign/learning/task_builder.py +0 -0
- {pyseqalignment-0.1.1 → pyseqalignment-0.1.2}/src/pyseqalign/prolog/__init__.py +0 -0
- {pyseqalignment-0.1.1 → pyseqalignment-0.1.2}/src/pyseqalign/prolog/engine.py +0 -0
- {pyseqalignment-0.1.1 → pyseqalignment-0.1.2}/src/pyseqalign/prolog/knowledge/__init__.py +0 -0
- {pyseqalignment-0.1.1 → pyseqalignment-0.1.2}/src/pyseqalign/prolog/knowledge/amino_acids.pl +0 -0
- {pyseqalignment-0.1.1 → pyseqalignment-0.1.2}/src/pyseqalign/prolog/knowledge/blosum50.pl +0 -0
- {pyseqalignment-0.1.1 → pyseqalignment-0.1.2}/src/pyseqalign/prolog/knowledge/defaults.pl +0 -0
- {pyseqalignment-0.1.1 → pyseqalignment-0.1.2}/src/pyseqalign/prolog/knowledge/distances.pl +0 -0
- {pyseqalignment-0.1.1 → pyseqalignment-0.1.2}/src/pyseqalign/scoring/__init__.py +0 -0
- {pyseqalignment-0.1.1 → pyseqalignment-0.1.2}/src/pyseqalign/scoring/distance.py +0 -0
- {pyseqalignment-0.1.1 → pyseqalignment-0.1.2}/src/pyseqalign/scoring/matrices.py +0 -0
- {pyseqalignment-0.1.1 → pyseqalignment-0.1.2}/src/pyseqalign/scoring/matrix_data/BLOSUM100 +0 -0
- {pyseqalignment-0.1.1 → pyseqalignment-0.1.2}/src/pyseqalign/scoring/matrix_data/BLOSUM50 +0 -0
- {pyseqalignment-0.1.1 → pyseqalignment-0.1.2}/src/pyseqalign/scoring/matrix_data/BLOSUM60 +0 -0
- {pyseqalignment-0.1.1 → pyseqalignment-0.1.2}/src/pyseqalign/scoring/matrix_data/BLOSUM62 +0 -0
- {pyseqalignment-0.1.1 → pyseqalignment-0.1.2}/src/pyseqalign/scoring/matrix_data/BLOSUM70 +0 -0
- {pyseqalignment-0.1.1 → pyseqalignment-0.1.2}/src/pyseqalign/scoring/matrix_data/BLOSUM80 +0 -0
- {pyseqalignment-0.1.1 → pyseqalignment-0.1.2}/src/pyseqalign/scoring/matrix_data/BLOSUM90 +0 -0
- {pyseqalignment-0.1.1 → pyseqalignment-0.1.2}/src/pyseqalign/scoring/matrix_data/PAM150 +0 -0
- {pyseqalignment-0.1.1 → pyseqalignment-0.1.2}/src/pyseqalign/scoring/matrix_data/PAM200 +0 -0
- {pyseqalignment-0.1.1 → pyseqalignment-0.1.2}/src/pyseqalign/scoring/matrix_data/PAM250 +0 -0
- {pyseqalignment-0.1.1 → pyseqalignment-0.1.2}/src/pyseqalign/scoring/matrix_data/PAM50 +0 -0
- {pyseqalignment-0.1.1 → pyseqalignment-0.1.2}/src/pyseqalign/scoring/matrix_data/__init__.py +0 -0
- {pyseqalignment-0.1.1 → pyseqalignment-0.1.2}/src/pyseqalign/utils/__init__.py +0 -0
- {pyseqalignment-0.1.1 → pyseqalignment-0.1.2}/src/pyseqalign/utils/helpers.py +0 -0
- {pyseqalignment-0.1.1 → pyseqalignment-0.1.2}/src/pyseqalignment.egg-info/dependency_links.txt +0 -0
- {pyseqalignment-0.1.1 → pyseqalignment-0.1.2}/src/pyseqalignment.egg-info/requires.txt +0 -0
- {pyseqalignment-0.1.1 → pyseqalignment-0.1.2}/src/pyseqalignment.egg-info/top_level.txt +0 -0
- {pyseqalignment-0.1.1 → pyseqalignment-0.1.2}/tests/test_learning.py +0 -0
- {pyseqalignment-0.1.1 → pyseqalignment-0.1.2}/tests/test_needleman_wunsch.py +0 -0
- {pyseqalignment-0.1.1 → pyseqalignment-0.1.2}/tests/test_scoring.py +0 -0
- {pyseqalignment-0.1.1 → pyseqalignment-0.1.2}/tests/test_smith_waterman.py +0 -0
- {pyseqalignment-0.1.1 → pyseqalignment-0.1.2}/tests/test_utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: pyseqalignment
|
|
3
|
-
Version: 0.1.1
|
|
3
|
+
Version: 0.1.2
|
|
4
4
|
Summary: pySeqAlign -- sequence alignment with Prolog-style distance functions and ILP learning
|
|
5
5
|
Author-email: Andreas Karwath <a.karwath@bham.ac.uk>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -64,6 +64,7 @@ pySeqAlign provides Smith-Waterman (local) and Needleman-Wunsch (global) sequenc
|
|
|
64
64
|
- **Aleph** backend -- classic ILP system (Srinivasan, 2001)
|
|
65
65
|
- **Popper** backend -- modern ILP via learning from failures (Cropper & Morel, 2021)
|
|
66
66
|
- **Pure Python** core -- no C extension required (unlike the legacy version)
|
|
67
|
+
- **Optional fast C++ aligner** -- a pybind11 affine-gap Needleman-Wunsch kernel (`pyseqalign.accel`) for large all-pairs / iterative workloads; identical results to the Python aligner, ~100-270x faster
|
|
67
68
|
|
|
68
69
|
## Installation
|
|
69
70
|
|
|
@@ -308,6 +309,35 @@ For reference, other notable systems in the field include:
|
|
|
308
309
|
- [Metagol](https://github.com/metagol/metagol) -- Meta-Interpretive Learning
|
|
309
310
|
- [DeepStochLog](https://github.com/ML-KULeuven/deepstochlog) -- Neural-symbolic ILP combining logic and neural networks
|
|
310
311
|
|
|
312
|
+
## Fast C++ aligner (optional)
|
|
313
|
+
|
|
314
|
+
The pure-Python aligners are fine for typical use. For heavy workloads (e.g.
|
|
315
|
+
boosting that re-aligns thousands of sequence pairs per iteration), an optional
|
|
316
|
+
**C++ affine-gap Needleman-Wunsch** kernel is provided. It is NOT built by default
|
|
317
|
+
(the core stays pure Python); build it once with a C++ compiler + pybind11:
|
|
318
|
+
|
|
319
|
+
```bash
|
|
320
|
+
pip install pybind11
|
|
321
|
+
src/pyseqalign/cpp/build_cpp_aligner.sh # or: PY=$(which python) src/.../build_cpp_aligner.sh
|
|
322
|
+
```
|
|
323
|
+
|
|
324
|
+
This compiles the extension into the `pyseqalign` package. Then:
|
|
325
|
+
|
|
326
|
+
```python
|
|
327
|
+
from pyseqalign.accel import cpp_available, load
|
|
328
|
+
assert cpp_available()
|
|
329
|
+
cpp = load() # the compiled module
|
|
330
|
+
al = cpp.CppAligner(num_ids, gap_open, gap_extend)
|
|
331
|
+
al.set_matrix(flat_score_matrix) # (num_ids+1)^2 row-major scores
|
|
332
|
+
r = al.align(query_ids, target_ids) # r.score, r.query, r.target, r.gap_opens, ...
|
|
333
|
+
```
|
|
334
|
+
|
|
335
|
+
The kernel implements the same 3-matrix (M/Ix/Iy) affine recurrence as a numeric
|
|
336
|
+
aligner over a dense score matrix, so its results are interchangeable with a
|
|
337
|
+
pure-Python affine Needleman-Wunsch (validated bit-for-bit). If the extension is
|
|
338
|
+
not built, `cpp_available()` is `False` and `load()` raises a helpful error --
|
|
339
|
+
callers should fall back to the Python aligner.
|
|
340
|
+
|
|
311
341
|
## Background
|
|
312
342
|
|
|
313
343
|
**pySeqAlign** is a modern, pure-Python reimplementation that revives the name of one of its own ancestors. It succeeds two legacy libraries behind the ILP 2006 / ICDM 2008 work: the original **pyAlign** (SWIG-wrapped C with YAP Prolog bindings for alignment) and the original **pySeqAlign** (which held the Aleph ILP framework for learning rules from alignment examples). This version is pure Python, with optional SWI-Prolog integration via [Janus](https://www.swi-prolog.org/packs/list?p=janus) (the modern Python-Prolog bridge, replacing the older pyswip).
|
|
@@ -26,6 +26,7 @@ pySeqAlign provides Smith-Waterman (local) and Needleman-Wunsch (global) sequenc
|
|
|
26
26
|
- **Aleph** backend -- classic ILP system (Srinivasan, 2001)
|
|
27
27
|
- **Popper** backend -- modern ILP via learning from failures (Cropper & Morel, 2021)
|
|
28
28
|
- **Pure Python** core -- no C extension required (unlike the legacy version)
|
|
29
|
+
- **Optional fast C++ aligner** -- a pybind11 affine-gap Needleman-Wunsch kernel (`pyseqalign.accel`) for large all-pairs / iterative workloads; identical results to the Python aligner, ~100-270x faster
|
|
29
30
|
|
|
30
31
|
## Installation
|
|
31
32
|
|
|
@@ -270,6 +271,35 @@ For reference, other notable systems in the field include:
|
|
|
270
271
|
- [Metagol](https://github.com/metagol/metagol) -- Meta-Interpretive Learning
|
|
271
272
|
- [DeepStochLog](https://github.com/ML-KULeuven/deepstochlog) -- Neural-symbolic ILP combining logic and neural networks
|
|
272
273
|
|
|
274
|
+
## Fast C++ aligner (optional)
|
|
275
|
+
|
|
276
|
+
The pure-Python aligners are fine for typical use. For heavy workloads (e.g.
|
|
277
|
+
boosting that re-aligns thousands of sequence pairs per iteration), an optional
|
|
278
|
+
**C++ affine-gap Needleman-Wunsch** kernel is provided. It is NOT built by default
|
|
279
|
+
(the core stays pure Python); build it once with a C++ compiler + pybind11:
|
|
280
|
+
|
|
281
|
+
```bash
|
|
282
|
+
pip install pybind11
|
|
283
|
+
src/pyseqalign/cpp/build_cpp_aligner.sh # or: PY=$(which python) src/.../build_cpp_aligner.sh
|
|
284
|
+
```
|
|
285
|
+
|
|
286
|
+
This compiles the extension into the `pyseqalign` package. Then:
|
|
287
|
+
|
|
288
|
+
```python
|
|
289
|
+
from pyseqalign.accel import cpp_available, load
|
|
290
|
+
assert cpp_available()
|
|
291
|
+
cpp = load() # the compiled module
|
|
292
|
+
al = cpp.CppAligner(num_ids, gap_open, gap_extend)
|
|
293
|
+
al.set_matrix(flat_score_matrix) # (num_ids+1)^2 row-major scores
|
|
294
|
+
r = al.align(query_ids, target_ids) # r.score, r.query, r.target, r.gap_opens, ...
|
|
295
|
+
```
|
|
296
|
+
|
|
297
|
+
The kernel implements the same 3-matrix (M/Ix/Iy) affine recurrence as a numeric
|
|
298
|
+
aligner over a dense score matrix, so its results are interchangeable with a
|
|
299
|
+
pure-Python affine Needleman-Wunsch (validated bit-for-bit). If the extension is
|
|
300
|
+
not built, `cpp_available()` is `False` and `load()` raises a helpful error --
|
|
301
|
+
callers should fall back to the Python aligner.
|
|
302
|
+
|
|
273
303
|
## Background
|
|
274
304
|
|
|
275
305
|
**pySeqAlign** is a modern, pure-Python reimplementation that revives the name of one of its own ancestors. It succeeds two legacy libraries behind the ILP 2006 / ICDM 2008 work: the original **pyAlign** (SWIG-wrapped C with YAP Prolog bindings for alignment) and the original **pySeqAlign** (which held the Aleph ILP framework for learning rules from alignment examples). This version is pure Python, with optional SWI-Prolog integration via [Janus](https://www.swi-prolog.org/packs/list?p=janus) (the modern Python-Prolog bridge, replacing the older pyswip).
|
|
@@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta"
|
|
|
6
6
|
# PyPI distribution name (the import package is `pyseqalign`; the name
|
|
7
7
|
# `pyseqalign` was blocked by PyPI's similarity guard vs. an existing project).
|
|
8
8
|
name = "pyseqalignment"
|
|
9
|
-
version = "0.1.1"
|
|
9
|
+
version = "0.1.2"
|
|
10
10
|
description = "pySeqAlign -- sequence alignment with Prolog-style distance functions and ILP learning"
|
|
11
11
|
readme = "README.md"
|
|
12
12
|
license = "MIT"
|
|
@@ -66,6 +66,7 @@ where = ["src"]
|
|
|
66
66
|
"pyseqalign.prolog.knowledge" = ["*.pl"]
|
|
67
67
|
"pyseqalign.learning.aleph_files" = ["*.pl"]
|
|
68
68
|
"pyseqalign.scoring.matrix_data" = ["*"]
|
|
69
|
+
"pyseqalign" = ["cpp/*.cpp", "cpp/*.sh"] # optional C++ aligner source (build manually)
|
|
69
70
|
|
|
70
71
|
[tool.pytest.ini_options]
|
|
71
72
|
testpaths = ["tests"]
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
"""Optional fast C++ affine-gap aligner (pybind11).
|
|
2
|
+
|
|
3
|
+
pySeqAlign's core is pure Python; this accelerated aligner is OPTIONAL. Build it
|
|
4
|
+
with ``src/pyseqalign/cpp/build_cpp_aligner.sh`` (needs a C++ compiler + pybind11).
|
|
5
|
+
|
|
6
|
+
It implements the same 3-matrix affine Needleman-Wunsch recurrence as a numeric
|
|
7
|
+
kernel over a dense score matrix, so results are interchangeable with a
|
|
8
|
+
pure-Python affine aligner. Exposes ``CppAligner(num_ids, gap_open, gap_extend)``
|
|
9
|
+
with ``set_matrix(flat_row_major)`` and ``align(q, t) -> AlignResult``
|
|
10
|
+
(``.score/.query/.target/.gap_opens/.gap_extensions/.length``).
|
|
11
|
+
"""
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def cpp_available() -> bool:
    """Report whether the optional compiled aligner can be imported.

    The probe is deliberately best-effort: any exception raised while
    importing the extension is treated as "not available".
    """
    try:
        from . import cpp_aligner as _probe  # noqa: F401
    except Exception:
        available = False
    else:
        available = True
    return available
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def load():
    """Import and return the compiled ``cpp_aligner`` extension module.

    Raises:
        ImportError: if the extension cannot be imported; the message tells
            the user how to build it and the original error is chained as
            the cause.
    """
    try:
        from . import cpp_aligner as compiled
    except ImportError as err:  # pragma: no cover
        hint = (
            'pySeqAlign C++ aligner not built. Run '
            'src/pyseqalign/cpp/build_cpp_aligner.sh (needs a C++ compiler + pybind11).'
        )
        raise ImportError(hint) from err
    return compiled
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
#!/bin/bash
# Build the optional fast C++ affine-gap aligner (pybind11) for the active Python.
# Usage: PY=/path/to/python CXX=/path/to/c++ ./build_cpp_aligner.sh
#   PY   interpreter to build against (default: python3)
#   CXX  C++ compiler (default: clang++ when on PATH, otherwise c++)
set -e
PY="${PY:-python3}"
if [ -z "${CXX:-}" ]; then
    # Keep clang++ as the first choice, but do not fail on systems that
    # only ship a generic `c++` driver (e.g. g++ on most Linux distros).
    if command -v clang++ >/dev/null 2>&1; then CXX=clang++; else CXX=c++; fi
fi
HERE="$(cd "$(dirname "$0")" && pwd)"
PYINC=$("$PY" -c "import sysconfig;print(sysconfig.get_path('include'))")
PBINC=$("$PY" -c "import pybind11;print(pybind11.get_include())")
SUF=$("$PY" -c "import sysconfig;print(sysconfig.get_config_var('EXT_SUFFIX'))")
OUT="$HERE/../cpp_aligner$SUF"
# `-undefined dynamic_lookup` is a macOS linker option; other linkers reject
# it, so it is only added on Darwin.
PLATFORM_FLAGS=""
if [ "$(uname -s)" = "Darwin" ]; then
    PLATFORM_FLAGS="-undefined dynamic_lookup"
fi
# PLATFORM_FLAGS is intentionally unquoted so it splits into separate words
# (or disappears entirely when empty).
"$CXX" -O3 -std=c++14 -shared -fPIC $PLATFORM_FLAGS \
    -I"$PYINC" -I"$PBINC" "$HERE/cpp_aligner.cpp" -o "$OUT"
echo "built $OUT"
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
// Fast C++ Needleman-Wunsch affine-gap aligner for pyREAL boosting.
|
|
2
|
+
//
|
|
3
|
+
// Mirrors pyreal.core.nw_affine.NeedlemanWunschAffine EXACTLY (3-matrix M/Ix/Iy
|
|
4
|
+
// DP, same border init, same argmax tie-breaking, same traceback + gap counting)
|
|
5
|
+
// so the Python and C++ backends are interchangeable. Adapted from the pyAlign2
|
|
6
|
+
// AlignerAffine structure, but YAP-free: scores come from a flat distance matrix
|
|
7
|
+
// (the boosting reward matrix), set once per reward update; align() is then a
|
|
8
|
+
// pure numeric kernel callable thousands of times without recomputing scores.
|
|
9
|
+
//
|
|
10
|
+
// Build: see build_cpp_aligner.sh
|
|
11
|
+
#include <pybind11/pybind11.h>
|
|
12
|
+
#include <pybind11/stl.h>
|
|
13
|
+
#include <vector>
|
|
14
|
+
#include <limits>
|
|
15
|
+
#include <algorithm>
|
|
16
|
+
#include <stdexcept>
|
|
17
|
+
|
|
18
|
+
namespace py = pybind11;
|
|
19
|
+
|
|
20
|
+
// DP layer / traceback state ids. In the traceback below, M emits a
// query/target pair, IX emits a query id over a gap (0) in the target, and IY
// emits a gap (0) in the query over a target id.
static const int M = 0, IX = 1, IY = 2;

// Result of one global alignment.
struct AlignResult {
    double score = 0.0;        // best score over the three end states
    std::vector<int> query;    // aligned query (0 = gap)
    std::vector<int> target;   // aligned target (0 = gap)
    int gap_opens = 0;         // number of gap runs found by the traceback
    int gap_extensions = 0;    // gap columns beyond the first of each run
    int length = 0;            // number of alignment columns
};

// Affine-gap Needleman-Wunsch aligner over integer id sequences, scored from
// a dense, row-major (num_ids+1) x (num_ids+1) matrix.
class CppAligner {
public:
    // num_ids = number of distinct atom ids (gap id 0 excluded). The score
    // matrix is (num_ids+1) x (num_ids+1), row-major, indexed by atom id.
    // gap_open / gap_extend are ADDED to the running score, so penalties are
    // expected to be passed as negative numbers.
    // Throws std::invalid_argument if num_ids is negative or too large.
    CppAligner(int num_ids, double gap_open, double gap_extend)
        : n1_(checked_dim(num_ids)), gap_open_(gap_open), gap_extend_(gap_extend),
          mat_((size_t)n1_ * (size_t)n1_, 0.0) {}

    // Replace the score matrix. `flat` must hold exactly (num_ids+1)^2 values
    // in row-major order; otherwise std::runtime_error is thrown.
    void set_matrix(const std::vector<double>& flat) {
        if (flat.size() != mat_.size())
            throw std::runtime_error("score matrix size mismatch");
        mat_ = flat;
    }

    // Substitution score for ids a and b (no bounds check; align() validates
    // its inputs before calling this).
    inline double score(int a, int b) const { return mat_[(size_t)a * n1_ + b]; }

    // Globally align q against t with the 3-layer (M/Ix/Iy) affine recurrence
    // and return the score, the aligned id sequences and the gap statistics.
    // Throws std::out_of_range if any id is outside [0, num_ids].
    AlignResult align(const std::vector<int>& q, const std::vector<int>& t) const {
        // Reject ids that would index outside the score matrix.
        check_ids(q, "query contains an id outside [0, num_ids]");
        check_ids(t, "target contains an id outside [0, num_ids]");

        const int n = (int)q.size(), m = (int)t.size();
        const double NEG = -std::numeric_limits<double>::infinity();
        const double d = gap_open_, e = gap_extend_;

        // F[k][i][j] holds layer scores; B stores (from_k, from_i, from_j).
        std::vector<double> F((size_t)3 * (n + 1) * (m + 1), NEG);
        std::vector<int> B((size_t)3 * (n + 1) * (m + 1) * 3, -1);
        auto Fi = [&](int k, int i, int j) -> double& {
            return F[((size_t)k * (n + 1) + i) * (m + 1) + j];
        };
        auto Bi = [&](int k, int i, int j, int c) -> int& {
            return B[(((size_t)k * (n + 1) + i) * (m + 1) + j) * 3 + c];
        };
        auto setB = [&](int k, int i, int j, int fk, int fi, int fj) {
            Bi(k, i, j, 0) = fk; Bi(k, i, j, 1) = fi; Bi(k, i, j, 2) = fj;
        };

        // Borders: the first column can only be reached through Ix, the first
        // row only through Iy; the first gap column costs d, later ones e.
        Fi(M, 0, 0) = 0.0;
        for (int i = 1; i <= n; ++i) {
            Fi(IX, i, 0) = (i > 1) ? Fi(IX, i - 1, 0) + e : d;
            setB(IX, i, 0, IX, i - 1, 0);
        }
        for (int j = 1; j <= m; ++j) {
            Fi(IY, 0, j) = (j > 1) ? Fi(IY, 0, j - 1) + e : d;
            setB(IY, 0, j, IY, 0, j - 1);
        }

        // The Iy candidates are ordered {M, IY, IX} so that, on ties,
        // extending the current Iy gap wins over opening one from Ix. The
        // winning slot must be translated back to its real state id before
        // it is stored as a back-pointer, otherwise the traceback follows
        // the wrong layer.
        static const int iy_slot_state[3] = {M, IY, IX};

        for (int i = 1; i <= n; ++i) {
            for (int j = 1; j <= m; ++j) {
                const double s = score(q[i - 1], t[j - 1]);

                // M: slots are already in state-id order {M, IX, IY}.
                double cm[3] = {Fi(M, i - 1, j - 1) + s,
                                Fi(IX, i - 1, j - 1) + s,
                                Fi(IY, i - 1, j - 1) + s};
                int bk = argmax3(cm);
                Fi(M, i, j) = cm[bk];
                setB(M, i, j, bk, i - 1, j - 1);

                // Ix: slots are already in state-id order {M, IX, IY}.
                double cx[3] = {Fi(M, i - 1, j) + d,
                                Fi(IX, i - 1, j) + e,
                                Fi(IY, i - 1, j) + d};
                bk = argmax3(cx);
                Fi(IX, i, j) = cx[bk];
                setB(IX, i, j, bk, i - 1, j);

                // Iy: slots are {M, IY, IX}; map the slot to its state id.
                double cy[3] = {Fi(M, i, j - 1) + d,
                                Fi(IY, i, j - 1) + e,
                                Fi(IX, i, j - 1) + d};
                bk = argmax3(cy);
                Fi(IY, i, j) = cy[bk];
                setB(IY, i, j, iy_slot_state[bk], i, j - 1);
            }
        }

        double ends[3] = {Fi(M, n, m), Fi(IX, n, m), Fi(IY, n, m)};
        const int best = argmax3(ends);

        AlignResult r;
        r.score = ends[best];

        // Traceback from (best, n, m). prev_k is the state of the column to
        // the right (emitted just before), so the rightmost column of every
        // gap run is counted as the open and the rest as extensions.
        int k = best, i = n, j = m, prev_k = -1;
        while (i > 0 || j > 0) {
            const int fk = Bi(k, i, j, 0), fi = Bi(k, i, j, 1), fj = Bi(k, i, j, 2);
            if (fi < 0 || fj < 0) break;  // cell was never filled: stop
            if (k == M) {
                r.query.push_back(q[i - 1]);
                r.target.push_back(t[j - 1]);
            } else if (k == IX) {
                r.query.push_back(q[i - 1]);
                r.target.push_back(0);
                if (prev_k != IX) r.gap_opens++; else r.gap_extensions++;
            } else {
                r.query.push_back(0);
                r.target.push_back(t[j - 1]);
                if (prev_k != IY) r.gap_opens++; else r.gap_extensions++;
            }
            prev_k = k; k = fk; i = fi; j = fj;
        }
        std::reverse(r.query.begin(), r.query.end());
        std::reverse(r.target.begin(), r.target.end());
        r.length = (int)r.query.size();
        return r;
    }

private:
    int n1_;                      // matrix side length, num_ids + 1
    double gap_open_, gap_extend_;
    std::vector<double> mat_;     // row-major scores, n1_ * n1_ entries

    // Validate num_ids and return the matrix side length (num_ids + 1).
    static int checked_dim(int num_ids) {
        if (num_ids < 0 || num_ids == std::numeric_limits<int>::max())
            throw std::invalid_argument("num_ids must be non-negative");
        return num_ids + 1;
    }

    // Throw std::out_of_range(what) if any id cannot index the score matrix.
    void check_ids(const std::vector<int>& ids, const char* what) const {
        for (int id : ids) {
            if (id < 0 || id >= n1_) throw std::out_of_range(what);
        }
    }

    // Index of the largest of three values; the earliest index wins ties.
    static inline int argmax3(const double v[3]) {
        if (v[0] >= v[1]) return (v[0] >= v[2]) ? 0 : 2;
        return (v[1] >= v[2]) ? 1 : 2;
    }
};
|
|
127
|
+
|
|
128
|
+
// Python bindings: registers AlignResult (read-only fields) and CppAligner
// (constructor, set_matrix, align) on the `cpp_aligner` extension module.
PYBIND11_MODULE(cpp_aligner, mod) {
    // Alignment result record; every field is exposed read-only.
    py::class_<AlignResult> result_cls(mod, "AlignResult");
    result_cls.def_readonly("score", &AlignResult::score);
    result_cls.def_readonly("query", &AlignResult::query);
    result_cls.def_readonly("target", &AlignResult::target);
    result_cls.def_readonly("gap_opens", &AlignResult::gap_opens);
    result_cls.def_readonly("gap_extensions", &AlignResult::gap_extensions);
    result_cls.def_readonly("length", &AlignResult::length);

    // Aligner: CppAligner(num_ids, gap_open, gap_extend).
    py::class_<CppAligner> aligner_cls(mod, "CppAligner");
    aligner_cls.def(py::init<int, double, double>());
    aligner_cls.def("set_matrix", &CppAligner::set_matrix);
    aligner_cls.def("align", &CppAligner::align);
}
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: pyseqalignment
|
|
3
|
-
Version: 0.1.1
|
|
3
|
+
Version: 0.1.2
|
|
4
4
|
Summary: pySeqAlign -- sequence alignment with Prolog-style distance functions and ILP learning
|
|
5
5
|
Author-email: Andreas Karwath <a.karwath@bham.ac.uk>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -64,6 +64,7 @@ pySeqAlign provides Smith-Waterman (local) and Needleman-Wunsch (global) sequenc
|
|
|
64
64
|
- **Aleph** backend -- classic ILP system (Srinivasan, 2001)
|
|
65
65
|
- **Popper** backend -- modern ILP via learning from failures (Cropper & Morel, 2021)
|
|
66
66
|
- **Pure Python** core -- no C extension required (unlike the legacy version)
|
|
67
|
+
- **Optional fast C++ aligner** -- a pybind11 affine-gap Needleman-Wunsch kernel (`pyseqalign.accel`) for large all-pairs / iterative workloads; identical results to the Python aligner, ~100-270x faster
|
|
67
68
|
|
|
68
69
|
## Installation
|
|
69
70
|
|
|
@@ -308,6 +309,35 @@ For reference, other notable systems in the field include:
|
|
|
308
309
|
- [Metagol](https://github.com/metagol/metagol) -- Meta-Interpretive Learning
|
|
309
310
|
- [DeepStochLog](https://github.com/ML-KULeuven/deepstochlog) -- Neural-symbolic ILP combining logic and neural networks
|
|
310
311
|
|
|
312
|
+
## Fast C++ aligner (optional)
|
|
313
|
+
|
|
314
|
+
The pure-Python aligners are fine for typical use. For heavy workloads (e.g.
|
|
315
|
+
boosting that re-aligns thousands of sequence pairs per iteration), an optional
|
|
316
|
+
**C++ affine-gap Needleman-Wunsch** kernel is provided. It is NOT built by default
|
|
317
|
+
(the core stays pure Python); build it once with a C++ compiler + pybind11:
|
|
318
|
+
|
|
319
|
+
```bash
|
|
320
|
+
pip install pybind11
|
|
321
|
+
src/pyseqalign/cpp/build_cpp_aligner.sh # or: PY=$(which python) src/.../build_cpp_aligner.sh
|
|
322
|
+
```
|
|
323
|
+
|
|
324
|
+
This compiles the extension into the `pyseqalign` package. Then:
|
|
325
|
+
|
|
326
|
+
```python
|
|
327
|
+
from pyseqalign.accel import cpp_available, load
|
|
328
|
+
assert cpp_available()
|
|
329
|
+
cpp = load() # the compiled module
|
|
330
|
+
al = cpp.CppAligner(num_ids, gap_open, gap_extend)
|
|
331
|
+
al.set_matrix(flat_score_matrix) # (num_ids+1)^2 row-major scores
|
|
332
|
+
r = al.align(query_ids, target_ids) # r.score, r.query, r.target, r.gap_opens, ...
|
|
333
|
+
```
|
|
334
|
+
|
|
335
|
+
The kernel implements the same 3-matrix (M/Ix/Iy) affine recurrence as a numeric
|
|
336
|
+
aligner over a dense score matrix, so its results are interchangeable with a
|
|
337
|
+
pure-Python affine Needleman-Wunsch (validated bit-for-bit). If the extension is
|
|
338
|
+
not built, `cpp_available()` is `False` and `load()` raises a helpful error --
|
|
339
|
+
callers should fall back to the Python aligner.
|
|
340
|
+
|
|
311
341
|
## Background
|
|
312
342
|
|
|
313
343
|
**pySeqAlign** is a modern, pure-Python reimplementation that revives the name of one of its own ancestors. It succeeds two legacy libraries behind the ILP 2006 / ICDM 2008 work: the original **pyAlign** (SWIG-wrapped C with YAP Prolog bindings for alignment) and the original **pySeqAlign** (which held the Aleph ILP framework for learning rules from alignment examples). This version is pure Python, with optional SWI-Prolog integration via [Janus](https://www.swi-prolog.org/packs/list?p=janus) (the modern Python-Prolog bridge, replacing the older pyswip).
|
|
@@ -2,10 +2,13 @@ LICENSE
|
|
|
2
2
|
README.md
|
|
3
3
|
pyproject.toml
|
|
4
4
|
src/pyseqalign/__init__.py
|
|
5
|
+
src/pyseqalign/accel.py
|
|
5
6
|
src/pyseqalign/core/__init__.py
|
|
6
7
|
src/pyseqalign/core/alignment.py
|
|
7
8
|
src/pyseqalign/core/needleman_wunsch.py
|
|
8
9
|
src/pyseqalign/core/smith_waterman.py
|
|
10
|
+
src/pyseqalign/cpp/build_cpp_aligner.sh
|
|
11
|
+
src/pyseqalign/cpp/cpp_aligner.cpp
|
|
9
12
|
src/pyseqalign/learning/__init__.py
|
|
10
13
|
src/pyseqalign/learning/aleph.py
|
|
11
14
|
src/pyseqalign/learning/base.py
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{pyseqalignment-0.1.1 → pyseqalignment-0.1.2}/src/pyseqalign/learning/aleph_files/__init__.py
RENAMED
|
File without changes
|
{pyseqalignment-0.1.1 → pyseqalignment-0.1.2}/src/pyseqalign/learning/aleph_files/aleph_swi_ak.pl
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{pyseqalignment-0.1.1 → pyseqalignment-0.1.2}/src/pyseqalign/prolog/knowledge/amino_acids.pl
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{pyseqalignment-0.1.1 → pyseqalignment-0.1.2}/src/pyseqalign/scoring/matrix_data/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{pyseqalignment-0.1.1 → pyseqalignment-0.1.2}/src/pyseqalignment.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|