PyPI - pyseqalignment - Versions diffs - 0.1.0__py3-none-any.whl - Mend

pyseqalignment 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (41) hide show

pyseqalign/__init__.py +14 -0
pyseqalign/core/__init__.py +12 -0
pyseqalign/core/alignment.py +67 -0
pyseqalign/core/needleman_wunsch.py +122 -0
pyseqalign/core/smith_waterman.py +173 -0
pyseqalign/learning/__init__.py +20 -0
pyseqalign/learning/aleph.py +212 -0
pyseqalign/learning/aleph_files/__init__.py +0 -0
pyseqalign/learning/aleph_files/aleph_swi_ak.pl +10420 -0
pyseqalign/learning/base.py +68 -0
pyseqalign/learning/popper.py +215 -0
pyseqalign/learning/task_builder.py +213 -0
pyseqalign/prolog/__init__.py +5 -0
pyseqalign/prolog/engine.py +102 -0
pyseqalign/prolog/knowledge/__init__.py +0 -0
pyseqalign/prolog/knowledge/amino_acids.pl +53 -0
pyseqalign/prolog/knowledge/blosum50.pl +800 -0
pyseqalign/prolog/knowledge/defaults.pl +15 -0
pyseqalign/prolog/knowledge/distances.pl +119 -0
pyseqalign/scoring/__init__.py +11 -0
pyseqalign/scoring/distance.py +100 -0
pyseqalign/scoring/matrices.py +362 -0
pyseqalign/scoring/matrix_data/BLOSUM100 +31 -0
pyseqalign/scoring/matrix_data/BLOSUM50 +31 -0
pyseqalign/scoring/matrix_data/BLOSUM60 +31 -0
pyseqalign/scoring/matrix_data/BLOSUM62 +31 -0
pyseqalign/scoring/matrix_data/BLOSUM70 +31 -0
pyseqalign/scoring/matrix_data/BLOSUM80 +31 -0
pyseqalign/scoring/matrix_data/BLOSUM90 +31 -0
pyseqalign/scoring/matrix_data/PAM150 +34 -0
pyseqalign/scoring/matrix_data/PAM200 +34 -0
pyseqalign/scoring/matrix_data/PAM250 +34 -0
pyseqalign/scoring/matrix_data/PAM50 +34 -0
pyseqalign/scoring/matrix_data/__init__.py +0 -0
pyseqalign/utils/__init__.py +9 -0
pyseqalign/utils/helpers.py +47 -0
pyseqalignment-0.1.0.dist-info/METADATA +317 -0
pyseqalignment-0.1.0.dist-info/RECORD +41 -0
pyseqalignment-0.1.0.dist-info/WHEEL +5 -0
pyseqalignment-0.1.0.dist-info/licenses/LICENSE +21 -0
pyseqalignment-0.1.0.dist-info/top_level.txt +1 -0

pyseqalign/__init__.py ADDED Viewed

@@ -0,0 +1,14 @@
+"""pySeqAlign -- Sequence alignment with Prolog-style distance functions and ILP learning."""
+from pyseqalign.core.alignment import AlignmentResult, LocalAlignmentResult
+from pyseqalign.core.needleman_wunsch import NeedlemanWunsch
+from pyseqalign.core.smith_waterman import SmithWaterman
+__version__ = "0.1.0"
+__all__ = [
+    "SmithWaterman",
+    "NeedlemanWunsch",
+    "AlignmentResult",
+    "LocalAlignmentResult",
+]

pyseqalign/core/__init__.py ADDED Viewed

@@ -0,0 +1,12 @@
+"""Core alignment algorithms."""
+from pyseqalign.core.alignment import AlignmentResult, LocalAlignmentResult
+from pyseqalign.core.needleman_wunsch import NeedlemanWunsch
+from pyseqalign.core.smith_waterman import SmithWaterman
+__all__ = [
+    "SmithWaterman",
+    "NeedlemanWunsch",
+    "AlignmentResult",
+    "LocalAlignmentResult",
+]

pyseqalign/core/alignment.py ADDED Viewed

@@ -0,0 +1,67 @@
+"""Data structures for alignment results."""
+from __future__ import annotations
+from dataclasses import dataclass, field
+@dataclass
+class AlignmentResult:
+    """Result of a global (Needleman-Wunsch) alignment.
+    Both aligned sequences are lists of integer element IDs in which the ID
+    ``0`` marks a gap position.  As populated by ``NeedlemanWunsch.align``,
+    ``query`` and ``target`` always have the same number of entries.
+    Attributes:
+        query: Aligned query sequence (with gaps represented as 0).
+        target: Aligned target sequence (with gaps represented as 0).
+        score: Alignment score (the final cell of the dynamic-programming
+            matrix when produced by ``NeedlemanWunsch.align``).
+        length: Length of the alignment, i.e. the number of aligned columns.
+    """
+    # Field order is part of the public constructor signature -- do not reorder.
+    query: list[int]
+    target: list[int]
+    score: float
+    length: int
+@dataclass
+class LocalAlignmentResult:
+    """Result of a single local (Smith-Waterman) alignment.
+    As populated by ``SmithWaterman``, every index stored here is a coordinate
+    of the score matrix, which has one extra leading row and column: matrix
+    index ``k`` corresponds to sequence element ``k - 1``.
+    Attributes:
+        query_path: Matrix row indices along the alignment, ordered from the
+            start cell to the end cell.
+        target_path: Matrix column indices along the alignment, same order and
+            same number of entries as ``query_path``.
+        start_query: Row index of the cell where the traceback stopped.
+        start_target: Column index of the cell where the traceback stopped.
+        end_query: Row index of the cell the traceback started from.
+        end_target: Column index of the cell the traceback started from.
+        length: Number of cells on the path (start cell included).
+        score: Alignment score.  ``SmithWaterman`` fills this with the sum of
+            the matrix values of the path cells other than the start cell.
+    """
+    # Field order is part of the public constructor signature -- do not reorder.
+    query_path: list[int]
+    target_path: list[int]
+    start_query: int
+    start_target: int
+    end_query: int
+    end_target: int
+    length: int
+    score: float
+@dataclass
+class KLocalAlignmentResults:
+    """Container for k non-overlapping local alignments.
+    Supports ``len()``, indexing and iteration; all three simply delegate to
+    the ``alignments`` list.  The container itself neither sorts nor filters
+    its contents -- ordering and non-overlap are guaranteed by the producer
+    (``SmithWaterman.align``).
+    Attributes:
+        alignments: List of local alignment results, sorted by score descending.
+            Defaults to a fresh empty list per instance.
+    """
+    alignments: list[LocalAlignmentResult] = field(default_factory=list)
+    def __len__(self) -> int:
+        """Return the number of stored alignments."""
+        return len(self.alignments)
+    def __getitem__(self, index: int) -> LocalAlignmentResult:
+        """Return the alignment at position *index* of ``alignments``."""
+        return self.alignments[index]
+    def __iter__(self):
+        """Iterate over the stored alignments in list order."""
+        return iter(self.alignments)

pyseqalign/core/needleman_wunsch.py ADDED Viewed

@@ -0,0 +1,122 @@
+"""Needleman-Wunsch global sequence alignment.
+Translated from the legacy C implementation in pyAlign.c.
+"""
+from __future__ import annotations
+from pyseqalign.core.alignment import AlignmentResult
+from pyseqalign.core.smith_waterman import ScoringFunction
+class NeedlemanWunsch:
+    """Needleman-Wunsch global alignment.
+    Args:
+        scoring: A scoring function implementing the ``ScoringFunction`` protocol.
+        gap_penalty: Cost applied when introducing a gap.  The scoring function is
+            called with element ID ``0`` to represent a gap character.
+    """
+    def __init__(self, scoring: ScoringFunction, gap_penalty: float | None = None) -> None:
+        self.scoring = scoring
+        self._explicit_gap_penalty = gap_penalty
+    @property
+    def gap_penalty(self) -> float:
+        """Return gap cost -- derived from scoring(0,0) when not set explicitly."""
+        if self._explicit_gap_penalty is not None:
+            return self._explicit_gap_penalty
+        return self.scoring.score(0, 0)
+    def align(self, seq1: list[int], seq2: list[int]) -> AlignmentResult:
+        """Compute the optimal global alignment of *seq1* and *seq2*.
+        Args:
+            seq1: First input sequence (list of integer element IDs).
+            seq2: Second input sequence.
+        Returns:
+            An ``AlignmentResult`` with aligned sequences, score, and length.
+        """
+        rows = len(seq1) + 1
+        cols = len(seq2) + 1
+        gap = self.gap_penalty
+        # Initialise F-matrix.
+        f_matrix = [[0.0] * cols for _ in range(rows)]
+        tb_matrix = [[-1.0] * cols for _ in range(rows)]
+        # Fill border gaps.
+        for i in range(1, rows):
+            f_matrix[i][0] = gap * i
+        for j in range(1, cols):
+            f_matrix[0][j] = gap * j
+        # Fill matrices.
+        for i in range(1, rows):
+            for j in range(1, cols):
+                match = f_matrix[i - 1][j - 1] + self.scoring.score(seq1[i - 1], seq2[j - 1])
+                delete = f_matrix[i - 1][j] + self.scoring.score(seq1[i - 1], 0)
+                insert = f_matrix[i][j - 1] + self.scoring.score(0, seq2[j - 1])
+                choices = [match, delete, insert]
+                best = _argmax(choices)
+                f_matrix[i][j] = choices[best]
+                tb_matrix[i][j] = float(best)
+        score = f_matrix[rows - 1][cols - 1]
+        # Traceback.
+        align1: list[int] = []
+        align2: list[int] = []
+        i = rows - 1
+        j = cols - 1
+        while i > 0 and j > 0:
+            if tb_matrix[i][j] == 0.0:
+                # Diagonal -- match/mismatch.
+                i -= 1
+                j -= 1
+                align1.append(seq1[i])
+                align2.append(seq2[j])
+            elif tb_matrix[i][j] == 1.0:
+                # Up -- gap in seq2.
+                i -= 1
+                align1.append(seq1[i])
+                align2.append(0)
+            else:
+                # Left -- gap in seq1.
+                j -= 1
+                align1.append(0)
+                align2.append(seq2[j])
+        while i > 0:
+            i -= 1
+            align1.append(seq1[i])
+            align2.append(0)
+        while j > 0:
+            j -= 1
+            align1.append(0)
+            align2.append(seq2[j])
+        align1.reverse()
+        align2.reverse()
+        return AlignmentResult(
+            query=align1,
+            target=align2,
+            score=score,
+            length=len(align1),
+        )
+def _argmax(values: list[float]) -> int:
+    best = 0
+    for i in range(1, len(values)):
+        if values[i] > values[best]:
+            best = i
+    return best

pyseqalign/core/smith_waterman.py ADDED Viewed

@@ -0,0 +1,173 @@
+"""Smith-Waterman local sequence alignment.
+Translated from the legacy C implementation in swAlign.c.
+Computes the k best non-overlapping local alignments between two sequences.
+"""
+from __future__ import annotations
+from typing import Protocol
+from pyseqalign.core.alignment import KLocalAlignmentResults, LocalAlignmentResult
+class ScoringFunction(Protocol):
+    """Protocol for scoring/distance functions used by alignment algorithms.
+    Any object with a compatible ``score`` method satisfies it structurally;
+    no subclassing is required.  ``NeedlemanWunsch`` additionally passes the
+    element ID ``0`` to represent a gap.
+    """
+    def score(self, a: int, b: int) -> float:
+        """Return the similarity score between elements *a* and *b*."""
+        # Protocol stub: implementations supply the body.
+        ...
+class SmithWaterman:
+    """Smith-Waterman local alignment.
+    Args:
+        scoring: A scoring function implementing the ``ScoringFunction`` protocol.
+        gap_penalty: Cost applied when introducing a gap (should be positive;
+            it is subtracted internally).
+    """
+    def __init__(self, scoring: ScoringFunction, gap_penalty: float = 8.0) -> None:
+        self.scoring = scoring
+        self.gap_penalty = gap_penalty
+    # ------------------------------------------------------------------
+    # Public API
+    # ------------------------------------------------------------------
+    def align(
+        self,
+        seq1: list[int],
+        seq2: list[int],
+        k: int = 1,
+        cutoff: float = 0.0,
+        min_score: float = 2.0,
+    ) -> KLocalAlignmentResults:
+        """Compute up to *k* best non-overlapping local alignments.
+        Args:
+            seq1: First input sequence (list of integer element IDs).
+            seq2: Second input sequence.
+            k: Maximum number of non-overlapping alignments to return.
+            cutoff: Minimum cell value to keep in the F-matrix (default 0 for SW).
+            min_score: Cells with score above this are considered trace start candidates.
+        Returns:
+            A ``KLocalAlignmentResults`` containing up to *k* alignments sorted
+            by score descending.
+        """
+        if k == 0:
+            return KLocalAlignmentResults()
+        rows = len(seq1) + 1
+        cols = len(seq2) + 1
+        # Initialise F-matrix and traceback matrix.
+        f_matrix = [[0.0] * cols for _ in range(rows)]
+        traceback = [[(-10, -10)] * cols for _ in range(rows)]
+        # Fill the matrices and collect high-scoring cells.
+        max_traces: list[tuple[int, int]] = []
+        d = self.gap_penalty
+        for i in range(1, rows):
+            for j in range(1, cols):
+                match = f_matrix[i - 1][j - 1] + self.scoring.score(seq1[i - 1], seq2[j - 1])
+                delete = f_matrix[i - 1][j] - d
+                insert = f_matrix[i][j - 1] - d
+                choices = [cutoff, match, delete, insert]
+                best_idx = _argmax(choices)
+                f_matrix[i][j] = choices[best_idx]
+                if best_idx == 1:
+                    traceback[i][j] = (i - 1, j - 1)
+                elif best_idx == 2:
+                    traceback[i][j] = (i - 1, j)
+                elif best_idx == 3:
+                    traceback[i][j] = (i, j - 1)
+                if choices[best_idx] > min_score:
+                    max_traces.append((i, j))
+        # Generate all candidate traces (sorted by score descending).
+        candidates = self._generate_traces(f_matrix, traceback, max_traces, rows, cols)
+        # Select up to k non-overlapping alignments.
+        selected: list[LocalAlignmentResult] = []
+        for candidate in candidates:
+            if len(selected) >= k:
+                break
+            if not any(self._overlaps(s, candidate) for s in selected):
+                selected.append(candidate)
+        return KLocalAlignmentResults(alignments=selected)
+    # ------------------------------------------------------------------
+    # Internal helpers
+    # ------------------------------------------------------------------
+    @staticmethod
+    def _generate_traces(
+        f_matrix: list[list[float]],
+        traceback: list[list[tuple[int, int]]],
+        max_traces: list[tuple[int, int]],
+        rows: int,
+        cols: int,
+    ) -> list[LocalAlignmentResult]:
+        """Traceback from each high-scoring cell to produce alignment candidates."""
+        results: list[LocalAlignmentResult] = []
+        for end_i, end_j in max_traces:
+            path_a: list[int] = []
+            path_b: list[int] = []
+            score = 0.0
+            ci, cj = end_i, end_j
+            while traceback[ci][cj] != (-10, -10):
+                path_a.append(ci)
+                path_b.append(cj)
+                score += f_matrix[ci][cj]
+                ci, cj = traceback[ci][cj]
+            path_a.append(ci)
+            path_b.append(cj)
+            # Reverse to get start-to-end order.
+            path_a.reverse()
+            path_b.reverse()
+            length = len(path_a)
+            results.append(
+                LocalAlignmentResult(
+                    query_path=path_a,
+                    target_path=path_b,
+                    start_query=ci,
+                    start_target=cj,
+                    end_query=end_i,
+                    end_target=end_j,
+                    length=length,
+                    score=score,
+                )
+            )
+        # Sort by score descending.
+        results.sort(key=lambda r: r.score, reverse=True)
+        return results
+    @staticmethod
+    def _overlaps(a: LocalAlignmentResult, b: LocalAlignmentResult) -> bool:
+        """Check whether two local alignments share any (i, j) cell."""
+        cells_a = set(zip(a.query_path, a.target_path))
+        cells_b = set(zip(b.query_path, b.target_path))
+        return bool(cells_a & cells_b)
+def _argmax(values: list[float]) -> int:
+    """Return the index of the maximum value."""
+    best = 0
+    for i in range(1, len(values)):
+        if values[i] > values[best]:
+            best = i
+    return best

pyseqalign/learning/__init__.py ADDED Viewed

@@ -0,0 +1,20 @@
+"""Inductive Logic Programming (ILP) backends for learning alignment rules.
+This subpackage provides a common interface for learning scoring functions and
+alignment rules from example alignments.  Two backends are supported:
+- **Aleph** -- the classic ILP system (Srinivasan, 2001) via SWI-Prolog.
+  Ported from the legacy pySeqAlign code.
+- **Popper** -- a modern ILP system (Cropper & Morel, 2021) that learns from
+  failures using ASP/SAT solvers.  Recommended for new projects.
+"""
+from pyseqalign.learning.base import ILPLearner, ILPTask, LearnedProgram
+from pyseqalign.learning.task_builder import AlignmentTaskBuilder
+__all__ = [
+    "ILPTask",
+    "LearnedProgram",
+    "ILPLearner",
+    "AlignmentTaskBuilder",
+]

pyseqalign/learning/aleph.py ADDED Viewed

@@ -0,0 +1,212 @@
+"""Aleph ILP backend.
+Runs the Aleph ILP system (Srinivasan, 2001) via SWI-Prolog to learn
+Prolog clauses from alignment examples.
+Requires SWI-Prolog to be installed.  This backend launches it as a
+subprocess, so the ``swipl`` executable (or the command passed as
+``swipl_cmd``) must be reachable.
+The bundled ``aleph_swi_ak.pl`` file is a SWI-Prolog compatible version
+of Aleph 5, originally ported from YAP Prolog in the legacy pySeqAlign
+codebase.
+"""
+from __future__ import annotations
+import subprocess
+import tempfile
+from pathlib import Path
+from pyseqalign.learning.base import ILPTask, LearnedProgram
+class AlephLearner:
+    """Aleph ILP backend.
+    Uses SWI-Prolog to run Aleph's ``induce/1`` on the provided task.
+    Args:
+        aleph_path: Path to the ``aleph_swi_ak.pl`` file.  Defaults to the
+            bundled version shipped with pyseqalign.
+        swipl_cmd: Command to invoke SWI-Prolog (default ``"swipl"``).
+        induce_mode: Aleph induction mode.  One of ``"induce"``,
+            ``"induce_max"``, ``"induce_cover"``, ``"induce_tree"``,
+            ``"induce_features"``, ``"induce_constraints"``,
+            ``"induce_incremental"`` (default ``"induce"``).
+        timeout: Maximum seconds for the SWI-Prolog process (default 300).
+    """
+    VALID_MODES = {
+        "induce",
+        "induce_max",
+        "induce_cover",
+        "induce_tree",
+        "induce_features",
+        "induce_constraints",
+        "induce_incremental",
+        "induce_theory",
+    }
+    def __init__(
+        self,
+        aleph_path: str | Path | None = None,
+        swipl_cmd: str = "swipl",
+        induce_mode: str = "induce",
+        timeout: int = 300,
+    ) -> None:
+        if aleph_path is None:
+            aleph_path = Path(__file__).parent / "aleph_files" / "aleph_swi_ak.pl"
+        self.aleph_path = Path(aleph_path)
+        self.swipl_cmd = swipl_cmd
+        self.timeout = timeout
+        if induce_mode not in self.VALID_MODES:
+            raise ValueError(
+                f"Unknown induce_mode '{induce_mode}'. "
+                f"Valid modes: {sorted(self.VALID_MODES)}"
+            )
+        self.induce_mode = induce_mode
+    def learn(self, task: ILPTask) -> LearnedProgram:
+        """Run Aleph on the given task.
+        Writes the task to temporary files, invokes SWI-Prolog with Aleph,
+        and parses the output for learned clauses.
+        """
+        work_dir = task.work_dir or Path(tempfile.mkdtemp(prefix="pyseqalign_aleph_"))
+        work_dir = Path(work_dir)
+        work_dir.mkdir(parents=True, exist_ok=True)
+        # Write Aleph-format files.
+        bk_lines = []
+        for k, v in task.settings.items():
+            bk_lines.append(f":- set({k},{v}).")
+        bk_lines.extend(task.bias)
+        bk_lines.append("")
+        bk_lines.extend(task.background)
+        (work_dir / "task.b").write_text("\n".join(bk_lines) + "\n")
+        (work_dir / "task.f").write_text("\n".join(task.positive) + "\n")
+        (work_dir / "task.n").write_text("\n".join(task.negative) + "\n")
+        # Construct SWI-Prolog script.
+        aleph_abs = self.aleph_path.resolve()
+        task_abs = (work_dir / "task").resolve()
+        result_abs = (work_dir / "result.pl").resolve()
+        script = (
+            f":- consult('{aleph_abs}').\n"
+            f":- read_all('{task_abs}').\n"
+            f":- {self.induce_mode}.\n"
+            f":- write_rules('{result_abs}').\n"
+            f":- halt.\n"
+        )
+        script_path = work_dir / "run_aleph.pl"
+        script_path.write_text(script)
+        # Run SWI-Prolog.
+        try:
+            result = subprocess.run(
+                [self.swipl_cmd, "-s", str(script_path)],
+                capture_output=True,
+                text=True,
+                timeout=self.timeout,
+                cwd=str(work_dir),
+            )
+            raw_output = result.stdout + result.stderr
+        except FileNotFoundError:
+            raise RuntimeError(
+                f"SWI-Prolog not found at '{self.swipl_cmd}'. "
+                "Install SWI-Prolog or set swipl_cmd to the correct path."
+            )
+        except subprocess.TimeoutExpired:
+            return LearnedProgram(
+                raw_output=f"Aleph timed out after {self.timeout}s",
+                stats={"timeout": True},
+            )
+        # Parse results.
+        clauses = self._parse_output(raw_output, result_abs)
+        return LearnedProgram(
+            clauses=clauses,
+            score=self._extract_score(raw_output),
+            stats=self._extract_stats(raw_output),
+            raw_output=raw_output,
+        )
+    # ------------------------------------------------------------------
+    # Parsing helpers
+    # ------------------------------------------------------------------
+    @staticmethod
+    def _parse_output(raw_output: str, result_file: Path) -> list[str]:
+        """Extract learned clauses from Aleph output."""
+        clauses: list[str] = []
+        # Try reading the written rules file first.
+        if result_file.exists():
+            text = result_file.read_text()
+            for line in text.strip().splitlines():
+                line = line.strip()
+                if line and not line.startswith("%"):
+                    clauses.append(line)
+            if clauses:
+                return clauses
+        # Fall back to parsing stdout for [Rule N] blocks.
+        in_rule = False
+        current: list[str] = []
+        for line in raw_output.splitlines():
+            if "[Rule" in line:
+                in_rule = True
+                current = []
+                continue
+            if in_rule:
+                stripped = line.strip()
+                if stripped == "":
+                    if current:
+                        clauses.append(" ".join(current))
+                        current = []
+                    in_rule = False
+                else:
+                    current.append(stripped)
+        if current:
+            clauses.append(" ".join(current))
+        return clauses
+    @staticmethod
+    def _extract_score(raw_output: str) -> float:
+        """Extract accuracy or coverage score from Aleph output."""
+        for line in raw_output.splitlines():
+            if "Accuracy" in line or "accuracy" in line:
+                parts = line.split()
+                for p in parts:
+                    try:
+                        return float(p.strip("()%,"))
+                    except ValueError:
+                        continue
+        return 0.0
+    @staticmethod
+    def _extract_stats(raw_output: str) -> dict[str, object]:
+        """Extract statistics from Aleph output."""
+        stats: dict[str, object] = {}
+        for line in raw_output.splitlines():
+            if "clauses constructed" in line.lower():
+                parts = line.split()
+                for p in parts:
+                    try:
+                        stats["clauses_constructed"] = int(p)
+                        break
+                    except ValueError:
+                        continue
+            if "nodes explored" in line.lower() or "nodes visited" in line.lower():
+                parts = line.split()
+                for p in parts:
+                    try:
+                        stats["nodes_explored"] = int(p)
+                        break
+                    except ValueError:
+                        continue
+        return stats

pyseqalign/learning/aleph_files/__init__.py ADDED Viewed

File without changes