pyseqalignment 0.1.3__tar.gz → 0.1.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. {pyseqalignment-0.1.3/src/pyseqalignment.egg-info → pyseqalignment-0.1.4}/PKG-INFO +31 -2
  2. {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/README.md +30 -1
  3. {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/pyproject.toml +6 -1
  4. {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/__init__.py +9 -2
  5. {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/core/__init__.py +8 -1
  6. {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/core/alignment.py +13 -0
  7. pyseqalignment-0.1.4/src/pyseqalign/core/nw_affine.py +202 -0
  8. pyseqalignment-0.1.4/src/pyseqalign/logo/__init__.py +17 -0
  9. pyseqalignment-0.1.4/src/pyseqalign/logo/probability.py +192 -0
  10. pyseqalignment-0.1.4/src/pyseqalign/logo/profile.py +249 -0
  11. pyseqalignment-0.1.4/src/pyseqalign/logo/render.py +254 -0
  12. pyseqalignment-0.1.4/src/pyseqalign/msa/__init__.py +17 -0
  13. pyseqalignment-0.1.4/src/pyseqalign/msa/consensus.py +72 -0
  14. pyseqalignment-0.1.4/src/pyseqalign/msa/distance_matrix.py +118 -0
  15. pyseqalignment-0.1.4/src/pyseqalign/msa/guide_tree.py +191 -0
  16. pyseqalignment-0.1.4/src/pyseqalign/msa/progressive.py +221 -0
  17. pyseqalignment-0.1.4/src/pyseqalign/scoring/protocols.py +28 -0
  18. {pyseqalignment-0.1.3 → pyseqalignment-0.1.4/src/pyseqalignment.egg-info}/PKG-INFO +31 -2
  19. {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalignment.egg-info/SOURCES.txt +12 -0
  20. pyseqalignment-0.1.4/tests/test_msa_logo.py +62 -0
  21. {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/LICENSE +0 -0
  22. {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/setup.cfg +0 -0
  23. {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/setup.py +0 -0
  24. {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/accel.py +0 -0
  25. {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/core/needleman_wunsch.py +0 -0
  26. {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/core/smith_waterman.py +0 -0
  27. {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/cpp/build_cpp_aligner.sh +0 -0
  28. {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/cpp/cpp_aligner.cpp +0 -0
  29. {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/learning/__init__.py +0 -0
  30. {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/learning/aleph.py +0 -0
  31. {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/learning/aleph_files/__init__.py +0 -0
  32. {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/learning/aleph_files/aleph_swi_ak.pl +0 -0
  33. {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/learning/base.py +0 -0
  34. {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/learning/popper.py +0 -0
  35. {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/learning/task_builder.py +0 -0
  36. {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/prolog/__init__.py +0 -0
  37. {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/prolog/engine.py +0 -0
  38. {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/prolog/knowledge/__init__.py +0 -0
  39. {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/prolog/knowledge/amino_acids.pl +0 -0
  40. {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/prolog/knowledge/blosum50.pl +0 -0
  41. {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/prolog/knowledge/defaults.pl +0 -0
  42. {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/prolog/knowledge/distances.pl +0 -0
  43. {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/scoring/__init__.py +0 -0
  44. {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/scoring/distance.py +0 -0
  45. {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/scoring/matrices.py +0 -0
  46. {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/scoring/matrix_data/BLOSUM100 +0 -0
  47. {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/scoring/matrix_data/BLOSUM50 +0 -0
  48. {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/scoring/matrix_data/BLOSUM60 +0 -0
  49. {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/scoring/matrix_data/BLOSUM62 +0 -0
  50. {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/scoring/matrix_data/BLOSUM70 +0 -0
  51. {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/scoring/matrix_data/BLOSUM80 +0 -0
  52. {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/scoring/matrix_data/BLOSUM90 +0 -0
  53. {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/scoring/matrix_data/PAM150 +0 -0
  54. {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/scoring/matrix_data/PAM200 +0 -0
  55. {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/scoring/matrix_data/PAM250 +0 -0
  56. {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/scoring/matrix_data/PAM50 +0 -0
  57. {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/scoring/matrix_data/__init__.py +0 -0
  58. {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/utils/__init__.py +0 -0
  59. {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalign/utils/helpers.py +0 -0
  60. {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalignment.egg-info/dependency_links.txt +0 -0
  61. {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalignment.egg-info/requires.txt +0 -0
  62. {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/src/pyseqalignment.egg-info/top_level.txt +0 -0
  63. {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/tests/test_learning.py +0 -0
  64. {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/tests/test_needleman_wunsch.py +0 -0
  65. {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/tests/test_scoring.py +0 -0
  66. {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/tests/test_smith_waterman.py +0 -0
  67. {pyseqalignment-0.1.3 → pyseqalignment-0.1.4}/tests/test_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pyseqalignment
3
- Version: 0.1.3
3
+ Version: 0.1.4
4
4
  Summary: pySeqAlign -- sequence alignment with Prolog-style distance functions and ILP learning
5
5
  Author-email: Andreas Karwath <a.karwath@bham.ac.uk>
6
6
  License-Expression: MIT
@@ -56,7 +56,9 @@ pySeqAlign provides Smith-Waterman (local) and Needleman-Wunsch (global) sequenc
56
56
  ## Features
57
57
 
58
58
  - **Smith-Waterman** local alignment with k-best non-overlapping results
59
- - **Needleman-Wunsch** global alignment
59
+ - **Needleman-Wunsch** global alignment (linear and **affine** gap costs)
60
+ - **Multiple sequence alignment** -- progressive MSA over a neighbor-joining guide tree, parameterised by any scoring function (`pyseqalign.msa`)
61
+ - **Relational sequence logos** -- information-content logos of aligned logical atoms, with per-column least-general generalisation, after Karwath & Kersting (ILP 2006) (`pyseqalign.logo`)
60
62
  - **Prolog-based distance functions** via SWI-Prolog integration (optional)
61
63
  - **Substitution matrices** -- BLOSUM (50, 60, 62, 70, 80, 90, 100) and PAM (50, 150, 200, 250) bundled; any NCBI-format matrix loadable from file or downloaded at runtime
62
64
  - **Nienhuys-Cheng distance** for recursive structural comparison of logical atoms
@@ -309,6 +311,33 @@ For reference, other notable systems in the field include:
309
311
  - [Metagol](https://github.com/metagol/metagol) -- Meta-Interpretive Learning
310
312
  - [DeepStochLog](https://github.com/ML-KULeuven/deepstochlog) -- Neural-symbolic ILP combining logic and neural networks
311
313
 
314
+ ## Multiple alignment & relational logos
315
+
316
+ pySeqAlign can align *and* summarise sequences of structured logical atoms,
317
+ reproducing Karwath & Kersting, *Relational Sequence Alignments and Logos*
318
+ (ILP 2006) — with no learning involved:
319
+
320
+ ```python
321
+ from pyseqalign.msa import progressive_msa
322
+ from pyseqalign.logo import relational_logo
323
+ from pyseqalign.scoring.distance import AtomDistance
324
+
325
+ # atoms as structured tuples: id -> (predicate, *args); 0 = gap
326
+ atom_store = {1: ('h', 'a', 'r', 'm'), 2: ('h', 'a', 'r', 'l'), 3: ('s', 'p', 'm')}
327
+ seqs = {'d1': [1, 2, 3], 'd2': [1, 3, 2], 'd3': [2, 3]}
328
+
329
+ scoring = AtomDistance(atom_store=atom_store, gap_score=-0.5) # Nienhuys-Cheng
330
+ msa = progressive_msa(seqs, scoring, gap_open=-1.0, gap_extend=-0.1)
331
+ rows = list(msa.aligned_sequences.values())
332
+
333
+ relational_logo(rows, atom_store, 'logo.png', title='example fold')
334
+ ```
335
+
336
+ `progressive_msa` accepts **any** scoring function, so a reward matrix learned
337
+ by [pyREAL](https://github.com/athro/pyREAL)'s boosting can drive the alignment
338
+ in place of the fixed distance. Runnable reproductions of the paper's SCOP and
339
+ balloon logos are in [`examples/`](examples/).
340
+
312
341
  ## Fast C++ aligner (optional)
313
342
 
314
343
  The pure-Python aligners are fine for typical use. For heavy workloads (e.g.
@@ -18,7 +18,9 @@ pySeqAlign provides Smith-Waterman (local) and Needleman-Wunsch (global) sequenc
18
18
  ## Features
19
19
 
20
20
  - **Smith-Waterman** local alignment with k-best non-overlapping results
21
- - **Needleman-Wunsch** global alignment
21
+ - **Needleman-Wunsch** global alignment (linear and **affine** gap costs)
22
+ - **Multiple sequence alignment** -- progressive MSA over a neighbor-joining guide tree, parameterised by any scoring function (`pyseqalign.msa`)
23
+ - **Relational sequence logos** -- information-content logos of aligned logical atoms, with per-column least-general generalisation, after Karwath & Kersting (ILP 2006) (`pyseqalign.logo`)
22
24
  - **Prolog-based distance functions** via SWI-Prolog integration (optional)
23
25
  - **Substitution matrices** -- BLOSUM (50, 60, 62, 70, 80, 90, 100) and PAM (50, 150, 200, 250) bundled; any NCBI-format matrix loadable from file or downloaded at runtime
24
26
  - **Nienhuys-Cheng distance** for recursive structural comparison of logical atoms
@@ -271,6 +273,33 @@ For reference, other notable systems in the field include:
271
273
  - [Metagol](https://github.com/metagol/metagol) -- Meta-Interpretive Learning
272
274
  - [DeepStochLog](https://github.com/ML-KULeuven/deepstochlog) -- Neural-symbolic ILP combining logic and neural networks
273
275
 
276
+ ## Multiple alignment & relational logos
277
+
278
+ pySeqAlign can align *and* summarise sequences of structured logical atoms,
279
+ reproducing Karwath & Kersting, *Relational Sequence Alignments and Logos*
280
+ (ILP 2006) — with no learning involved:
281
+
282
+ ```python
283
+ from pyseqalign.msa import progressive_msa
284
+ from pyseqalign.logo import relational_logo
285
+ from pyseqalign.scoring.distance import AtomDistance
286
+
287
+ # atoms as structured tuples: id -> (predicate, *args); 0 = gap
288
+ atom_store = {1: ('h', 'a', 'r', 'm'), 2: ('h', 'a', 'r', 'l'), 3: ('s', 'p', 'm')}
289
+ seqs = {'d1': [1, 2, 3], 'd2': [1, 3, 2], 'd3': [2, 3]}
290
+
291
+ scoring = AtomDistance(atom_store=atom_store, gap_score=-0.5) # Nienhuys-Cheng
292
+ msa = progressive_msa(seqs, scoring, gap_open=-1.0, gap_extend=-0.1)
293
+ rows = list(msa.aligned_sequences.values())
294
+
295
+ relational_logo(rows, atom_store, 'logo.png', title='example fold')
296
+ ```
297
+
298
+ `progressive_msa` accepts **any** scoring function, so a reward matrix learned
299
+ by [pyREAL](https://github.com/athro/pyREAL)'s boosting can drive the alignment
300
+ in place of the fixed distance. Runnable reproductions of the paper's SCOP and
301
+ balloon logos are in [`examples/`](examples/).
302
+
274
303
  ## Fast C++ aligner (optional)
275
304
 
276
305
  The pure-Python aligners are fine for typical use. For heavy workloads (e.g.
@@ -8,7 +8,7 @@ build-backend = "setuptools.build_meta"
8
8
  # PyPI distribution name (the import package is `pyseqalign`; the name
9
9
  # `pyseqalign` was blocked by PyPI's similarity guard vs. an existing project).
10
10
  name = "pyseqalignment"
11
- version = "0.1.3"
11
+ version = "0.1.4"
12
12
  description = "pySeqAlign -- sequence alignment with Prolog-style distance functions and ILP learning"
13
13
  readme = "README.md"
14
14
  license = "MIT"
@@ -80,6 +80,11 @@ line-length = 100
80
80
 
81
81
  [tool.ruff.lint]
82
82
  select = ["E", "F", "W", "I", "N", "UP"]
83
+ ignore = [
84
+ "N803", # uppercase argument names (matrix math convention: M, Ix, Iy)
85
+ "N806", # uppercase local variables in functions (same reason)
86
+ "E741", # ambiguous variable name 'l' (used in tree (m, l, r) unpacking)
87
+ ]
83
88
 
84
89
  [tool.mypy]
85
90
  python_version = "3.10"
@@ -1,14 +1,21 @@
1
1
  """pySeqAlign -- Sequence alignment with Prolog-style distance functions and ILP learning."""
2
2
 
3
- from pyseqalign.core.alignment import AlignmentResult, LocalAlignmentResult
3
+ from pyseqalign.core.alignment import (
4
+ AffineAlignmentResult,
5
+ AlignmentResult,
6
+ LocalAlignmentResult,
7
+ )
4
8
  from pyseqalign.core.needleman_wunsch import NeedlemanWunsch
9
+ from pyseqalign.core.nw_affine import NeedlemanWunschAffine
5
10
  from pyseqalign.core.smith_waterman import SmithWaterman
6
11
 
7
- __version__ = "0.1.3"
12
+ __version__ = "0.1.4"
8
13
 
9
14
  __all__ = [
10
15
  "SmithWaterman",
11
16
  "NeedlemanWunsch",
17
+ "NeedlemanWunschAffine",
12
18
  "AlignmentResult",
19
+ "AffineAlignmentResult",
13
20
  "LocalAlignmentResult",
14
21
  ]
@@ -1,12 +1,19 @@
1
1
  """Core alignment algorithms."""
2
2
 
3
- from pyseqalign.core.alignment import AlignmentResult, LocalAlignmentResult
3
+ from pyseqalign.core.alignment import (
4
+ AffineAlignmentResult,
5
+ AlignmentResult,
6
+ LocalAlignmentResult,
7
+ )
4
8
  from pyseqalign.core.needleman_wunsch import NeedlemanWunsch
9
+ from pyseqalign.core.nw_affine import NeedlemanWunschAffine
5
10
  from pyseqalign.core.smith_waterman import SmithWaterman
6
11
 
7
12
  __all__ = [
8
13
  "SmithWaterman",
9
14
  "NeedlemanWunsch",
15
+ "NeedlemanWunschAffine",
10
16
  "AlignmentResult",
17
+ "AffineAlignmentResult",
11
18
  "LocalAlignmentResult",
12
19
  ]
@@ -22,6 +22,19 @@ class AlignmentResult:
22
22
  length: int
23
23
 
24
24
 
25
@dataclass
class AffineAlignmentResult(AlignmentResult):
    """Extended result from affine-gap alignment.

    Adds gap statistics on top of the fields inherited from
    ``AlignmentResult``.  Both counters default to 0.

    Attributes:
        gap_opens: Number of gap-open events in both sequences combined.
        gap_extensions: Number of gap-extension events.
    """

    gap_opens: int = 0
    gap_extensions: int = 0
36
+
37
+
25
38
  @dataclass
26
39
  class LocalAlignmentResult:
27
40
  """Result of a single local (Smith-Waterman) alignment.
@@ -0,0 +1,202 @@
1
+ """Needleman-Wunsch global alignment with affine gap penalties.
2
+
3
+ Translated from the legacy C++ AlignerAffine::_align() implementation.
4
+ Uses three DP matrices (M, Ix, Iy) to distinguish gap-open from gap-extend.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import numpy as np
10
+
11
+ from pyseqalign.core.alignment import AffineAlignmentResult
12
+ from pyseqalign.scoring.protocols import ScoringFunction
13
+
14
# Matrix indices: first axis of the score (F) and traceback (B) arrays.
_M = 0  # match/mismatch
_IX = 1  # gap in target (consuming query element)
_IY = 2  # gap in query (consuming target element)

# Source state of every candidate, in the order the recurrences list them.
# ``_argmax3`` returns a *position* inside the candidate tuple; these tables
# translate that position into the state index the traceback has to follow.
# The Iy recurrence lists "extend Iy" before "open from Ix", so its positions
# are NOT the state indices and must be mapped explicitly.
_M_SOURCES = (_M, _IX, _IY)
_IX_SOURCES = (_M, _IX, _IY)
_IY_SOURCES = (_M, _IY, _IX)


class NeedlemanWunschAffine:
    """Needleman-Wunsch with affine gap penalties.

    Recurrences (similarity mode):
        M[i][j]  = score(q[i], t[j]) + max(M[i-1][j-1], Ix[i-1][j-1], Iy[i-1][j-1])
        Ix[i][j] = max(M[i-1][j] + gap_open, Ix[i-1][j] + gap_extend, Iy[i-1][j] + gap_open)
        Iy[i][j] = max(M[i][j-1] + gap_open, Iy[i][j-1] + gap_extend, Ix[i][j-1] + gap_open)

    A gap of length ``L`` therefore costs ``gap_open + (L - 1) * gap_extend``.

    Args:
        scoring: Scoring function (element ID 0 = gap).
        gap_open: Cost for opening a new gap (should be negative for penalties).
        gap_extend: Cost for extending an existing gap (should be negative,
            typically less severe than gap_open).
    """

    def __init__(
        self,
        scoring: ScoringFunction,
        gap_open: float = -2.5,
        gap_extend: float = -0.25,
    ) -> None:
        self.scoring = scoring
        self.gap_open = gap_open
        self.gap_extend = gap_extend

    def align(self, seq1: list[int], seq2: list[int]) -> AffineAlignmentResult:
        """Compute the optimal global alignment with affine gap penalties.

        Args:
            seq1: Query sequence (list of integer element IDs).
            seq2: Target sequence.

        Returns:
            An ``AffineAlignmentResult`` with aligned sequences and gap statistics.
        """
        n = len(seq1)
        m = len(seq2)

        F, B = self._fill(seq1, seq2)

        # The alignment may end in any of the three states; take the best one.
        end_scores = (F[_M, n, m], F[_IX, n, m], F[_IY, n, m])
        best_end = _argmax3(end_scores)
        score = end_scores[best_end]

        align1, align2, gap_opens, gap_extensions = self._traceback(B, seq1, seq2, best_end, n, m)

        return AffineAlignmentResult(
            query=align1,
            target=align2,
            score=float(score),
            length=len(align1),
            gap_opens=gap_opens,
            gap_extensions=gap_extensions,
        )

    def _fill(self, seq1: list[int], seq2: list[int]) -> tuple[np.ndarray, np.ndarray]:
        """Fill the score and traceback arrays for *seq1* against *seq2*.

        Returns:
            ``(F, B)`` where ``F[k, i, j]`` is the best score of a prefix
            alignment ending in state ``k`` and ``B[k, i, j]`` holds the
            predecessor ``(from_k, from_i, from_j)`` (``-1`` where unset).
        """
        n = len(seq1)
        m = len(seq2)
        d = self.gap_open
        e = self.gap_extend
        score = self.scoring.score

        # F[k, i, j] for k in {M=0, Ix=1, Iy=2}
        F = np.full((3, n + 1, m + 1), -np.inf, dtype=np.float64)
        # Traceback: B[k, i, j, :] = (from_k, from_i, from_j)
        B = np.full((3, n + 1, m + 1, 3), -1, dtype=np.int32)

        F[_M, 0, 0] = 0.0

        # Border: leading gaps in the target (Ix column). M and Iy stay -inf.
        for i in range(1, n + 1):
            F[_IX, i, 0] = d if i == 1 else F[_IX, i - 1, 0] + e
            B[_IX, i, 0] = (_IX, i - 1, 0)

        # Border: leading gaps in the query (Iy row). M and Ix stay -inf.
        for j in range(1, m + 1):
            F[_IY, 0, j] = d if j == 1 else F[_IY, 0, j - 1] + e
            B[_IY, 0, j] = (_IY, 0, j - 1)

        for i in range(1, n + 1):
            for j in range(1, m + 1):
                # Match/mismatch: diagonal transition.
                s = score(seq1[i - 1], seq2[j - 1])
                candidates = (
                    F[_M, i - 1, j - 1] + s,
                    F[_IX, i - 1, j - 1] + s,
                    F[_IY, i - 1, j - 1] + s,
                )
                pos = _argmax3(candidates)
                F[_M, i, j] = candidates[pos]
                B[_M, i, j] = (_M_SOURCES[pos], i - 1, j - 1)

                # Ix: gap in target (consume query[i], skip target).
                candidates = (
                    F[_M, i - 1, j] + d,  # new gap
                    F[_IX, i - 1, j] + e,  # extend gap
                    F[_IY, i - 1, j] + d,  # new gap
                )
                pos = _argmax3(candidates)
                F[_IX, i, j] = candidates[pos]
                B[_IX, i, j] = (_IX_SOURCES[pos], i - 1, j)

                # Iy: gap in query (skip query, consume target[j]).
                candidates = (
                    F[_M, i, j - 1] + d,  # new gap
                    F[_IY, i, j - 1] + e,  # extend gap
                    F[_IX, i, j - 1] + d,  # new gap
                )
                pos = _argmax3(candidates)
                F[_IY, i, j] = candidates[pos]
                # Position 1 is Iy and position 2 is Ix here, so the tuple
                # position must be mapped; storing it raw sends the traceback
                # into the wrong layer.
                B[_IY, i, j] = (_IY_SOURCES[pos], i, j - 1)

        return F, B

    @staticmethod
    def _traceback(
        B: np.ndarray,
        seq1: list[int],
        seq2: list[int],
        start_k: int,
        start_i: int,
        start_j: int,
    ) -> tuple[list[int], list[int], int, int]:
        """Walk the traceback matrix to produce aligned sequences.

        Args:
            B: Traceback array as produced by the DP fill.
            seq1: Query sequence.
            seq2: Target sequence.
            start_k: State (M / Ix / Iy) the alignment ends in.
            start_i: Row to start from (normally ``len(seq1)``).
            start_j: Column to start from (normally ``len(seq2)``).

        Returns:
            ``(aligned_query, aligned_target, gap_opens, gap_extensions)``;
            gaps are written as element ID 0.
        """
        align1: list[int] = []
        align2: list[int] = []
        gap_opens = 0
        gap_extensions = 0

        k, i, j = start_k, start_i, start_j
        # State of the column emitted just before this one, i.e. the column to
        # the *right* in the final alignment (the walk runs end -> start).
        prev_k = -1

        while i > 0 or j > 0:
            from_k, from_i, from_j = int(B[k, i, j, 0]), int(B[k, i, j, 1]), int(B[k, i, j, 2])

            if from_i < 0 or from_j < 0:
                # Reached uninitialised border — shouldn't happen.
                break

            if k == _M:
                # Diagonal: match/mismatch.
                align1.append(seq1[i - 1])
                align2.append(seq2[j - 1])
            elif k == _IX:
                # Gap in target.
                align1.append(seq1[i - 1])
                align2.append(0)
                # Each run of same-state gap columns contributes one open and
                # (length - 1) extensions, whichever end the run is entered from.
                if prev_k != _IX:
                    gap_opens += 1
                else:
                    gap_extensions += 1
            else:  # _IY
                # Gap in query.
                align1.append(0)
                align2.append(seq2[j - 1])
                if prev_k != _IY:
                    gap_opens += 1
                else:
                    gap_extensions += 1

            prev_k = k
            k, i, j = from_k, from_i, from_j

        align1.reverse()
        align2.reverse()
        return align1, align2, gap_opens, gap_extensions


def _argmax3(vals: tuple[float, float, float]) -> int:
    """Return index of maximum among exactly three values.

    Ties are resolved in favour of the lowest index.
    """
    if vals[0] >= vals[1]:
        return 0 if vals[0] >= vals[2] else 2
    return 1 if vals[1] >= vals[2] else 2
@@ -0,0 +1,17 @@
1
+ """Relational sequence logos — position-specific profiles of logical atoms."""
2
+
3
+ from pyseqalign.logo.probability import FreqDist, LidstoneProbDist, MLEProbDist
4
+ from pyseqalign.logo.profile import PositionProfile, RelationalProfile
5
+ from pyseqalign.logo.render import column_ic, lgg_atoms, relational_logo, term_str
6
+
7
+ __all__ = [
8
+ 'FreqDist',
9
+ 'MLEProbDist',
10
+ 'LidstoneProbDist',
11
+ 'PositionProfile',
12
+ 'RelationalProfile',
13
+ 'relational_logo',
14
+ 'column_ic',
15
+ 'lgg_atoms',
16
+ 'term_str',
17
+ ]
@@ -0,0 +1,192 @@
1
+ """Frequency and probability distributions for relational sequence logos.
2
+
3
+ Simplified, modern-Python reimplementation of the NLTK-derived legacy
4
+ ``Probability.py``. Only the distributions needed for logo construction
5
+ are included: :class:`FreqDist`, :class:`MLEProbDist` (maximum-likelihood),
6
+ and :class:`LidstoneProbDist` (smoothed).
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import builtins as _builtins
12
+ import math
13
+ from collections.abc import Hashable, Iterator
14
+ from typing import Any
15
+
16
# FreqDist defines its own ``max`` method; keep a module-level handle on the
# builtin so the two are never confused.
_builtin_max = _builtins.max


class FreqDist:
    """Tally of how often each hashable outcome has been observed.

    >>> fd = FreqDist()
    >>> fd.inc("a"); fd.inc("a"); fd.inc("b")
    >>> fd.count("a")
    2
    >>> fd.freq("a")  # doctest: +ELLIPSIS
    0.666...
    """

    __slots__ = ('_counts', '_total', '_max_cache')

    def __init__(self) -> None:
        self._counts: dict[Any, int] = {}
        self._total: int = 0
        # Memoised result of max(); reset whenever a count changes.
        self._max_cache: Any | None = None

    # -- Mutation -----------------------------------------------------------

    def inc(self, sample: Hashable, count: int = 1) -> None:
        """Add *count* observations of *sample* (a zero count is a no-op)."""
        if count != 0:
            previous = self._counts.get(sample, 0)
            self._counts[sample] = previous + count
            self._total += count
            self._max_cache = None

    # -- Queries ------------------------------------------------------------

    @property
    def total(self) -> int:
        """Sum of all recorded counts (``N``)."""
        return self._total

    @property
    def num_bins(self) -> int:
        """Number of distinct recorded samples (``B``)."""
        return len(self._counts)

    def count(self, sample: Hashable) -> int:
        """Return how often *sample* was recorded (0 if never)."""
        return self._counts.get(sample, 0)

    def freq(self, sample: Hashable) -> float:
        """Return ``count(sample) / N``, or 0.0 while nothing is recorded."""
        if self._total == 0:
            return 0.0
        observed = self._counts.get(sample, 0)
        return observed / self._total

    def samples(self) -> list[Any]:
        """Return every recorded sample, in first-seen order."""
        return list(self._counts)

    def max(self) -> Any | None:
        """Return the most frequent sample, or ``None`` when empty."""
        cached = self._max_cache
        if cached is not None:
            return cached
        if not self._counts:
            return None
        counts = self._counts
        self._max_cache = _builtin_max(counts, key=lambda sample: counts[sample])
        return self._max_cache

    def sorted_samples(self) -> list[Any]:
        """Return the samples ordered from most to least frequent."""
        ranked = sorted(self._counts.items(), key=lambda kv: kv[1], reverse=True)
        return [sample for sample, _ in ranked]

    # -- Container protocol -------------------------------------------------

    def __contains__(self, sample: object) -> bool:
        return sample in self._counts

    def __len__(self) -> int:
        return len(self._counts)

    def __iter__(self) -> Iterator[Any]:
        return iter(self._counts)

    # -- Representation -----------------------------------------------------

    def __repr__(self) -> str:
        return f'<FreqDist with {self._total} outcomes, {self.num_bins} bins>'

    def __str__(self) -> str:
        ranked = sorted(self._counts.items(), key=lambda kv: kv[1], reverse=True)
        items = ', '.join(f'{sample!r}: {n}' for sample, n in ranked)
        return f'<FreqDist: {items}>'
108
+
109
+
110
class MLEProbDist:
    """Unsmoothed (maximum-likelihood) view of a :class:`FreqDist`.

    ``P(sample) = count(sample) / N``
    """

    __slots__ = ('_freqdist',)

    def __init__(self, freqdist: FreqDist) -> None:
        # With no observations every probability would be undefined.
        if freqdist.total == 0:
            raise ValueError('Cannot build MLE distribution from empty FreqDist.')
        self._freqdist = freqdist

    @property
    def freqdist(self) -> FreqDist:
        """The underlying frequency distribution."""
        return self._freqdist

    def prob(self, sample: Hashable) -> float:
        """Return the relative frequency of *sample*."""
        return self._freqdist.freq(sample)

    def logprob(self, sample: Hashable) -> float:
        """Return the natural log of ``prob(sample)``; ``-inf`` when it is not positive."""
        probability = self.prob(sample)
        if not probability > 0:
            return float('-inf')
        return math.log(probability)

    def max(self) -> Any | None:
        """Return the most probable sample, as reported by the FreqDist."""
        return self._freqdist.max()

    def samples(self) -> list[Any]:
        """Return the samples known to the FreqDist."""
        return self._freqdist.samples()

    def __repr__(self) -> str:
        outcomes = self._freqdist.total
        return f'<MLEProbDist based on {outcomes} outcomes>'
142
+
143
+
144
class LidstoneProbDist:
    """Lidstone-smoothed probability distribution.

    ``P(sample) = (count(sample) + gamma) / (N + B * gamma)``

    With ``gamma = 1`` this is Laplace smoothing; ``gamma = 0.5`` gives the
    Expected Likelihood Estimate (ELE).

    Args:
        freqdist: Frequency distribution supplying the counts.
        gamma: Smoothing constant added to every count.
        bins: Number of possible outcomes ``B``; defaults to the number of
            distinct samples in *freqdist*.

    Raises:
        ValueError: If *bins* is smaller than the number of observed samples,
            or if the resulting number of bins is zero.
    """

    __slots__ = ('_freqdist', '_gamma', '_bins', '_N')

    def __init__(
        self,
        freqdist: FreqDist,
        gamma: float = 1.0,
        bins: int | None = None,
    ) -> None:
        if bins is not None and bins < freqdist.num_bins:
            raise ValueError(f'bins ({bins}) must be >= FreqDist.num_bins ({freqdist.num_bins})')
        if bins is None:
            bins = freqdist.num_bins
        if bins == 0:
            raise ValueError('Lidstone distribution must have at least one bin.')

        self._freqdist = freqdist
        self._gamma = float(gamma)
        self._bins = bins
        # N is captured at construction time.
        self._N = freqdist.total

    @property
    def freqdist(self) -> FreqDist:
        """The underlying frequency distribution."""
        return self._freqdist

    def prob(self, sample: Hashable) -> float:
        """Return the smoothed probability of *sample*.

        Returns 0.0 when the normaliser ``N + B * gamma`` is zero (e.g. an
        empty frequency distribution with ``gamma = 0``) instead of raising
        ``ZeroDivisionError``.
        """
        divisor = self._N + self._bins * self._gamma
        if divisor == 0:
            # Degenerate case: no observations and no smoothing mass.
            return 0.0
        c = self._freqdist.count(sample)
        return (c + self._gamma) / divisor

    def logprob(self, sample: Hashable) -> float:
        """Return the natural log of ``prob(sample)``; ``-inf`` when it is not positive."""
        p = self.prob(sample)
        return math.log(p) if p > 0 else float('-inf')

    def max(self) -> Any | None:
        """Return the most frequent sample, as reported by the FreqDist."""
        return self._freqdist.max()

    def samples(self) -> list[Any]:
        """Return the samples known to the FreqDist."""
        return self._freqdist.samples()

    def __repr__(self) -> str:
        return f'<LidstoneProbDist gamma={self._gamma} based on {self._N} outcomes>'