PyPI - pyseqalignment - Versions diffs - 0.1.0__py3-none-any.whl - Mend

pyseqalignment 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (41) hide show

pyseqalign/__init__.py +14 -0
pyseqalign/core/__init__.py +12 -0
pyseqalign/core/alignment.py +67 -0
pyseqalign/core/needleman_wunsch.py +122 -0
pyseqalign/core/smith_waterman.py +173 -0
pyseqalign/learning/__init__.py +20 -0
pyseqalign/learning/aleph.py +212 -0
pyseqalign/learning/aleph_files/__init__.py +0 -0
pyseqalign/learning/aleph_files/aleph_swi_ak.pl +10420 -0
pyseqalign/learning/base.py +68 -0
pyseqalign/learning/popper.py +215 -0
pyseqalign/learning/task_builder.py +213 -0
pyseqalign/prolog/__init__.py +5 -0
pyseqalign/prolog/engine.py +102 -0
pyseqalign/prolog/knowledge/__init__.py +0 -0
pyseqalign/prolog/knowledge/amino_acids.pl +53 -0
pyseqalign/prolog/knowledge/blosum50.pl +800 -0
pyseqalign/prolog/knowledge/defaults.pl +15 -0
pyseqalign/prolog/knowledge/distances.pl +119 -0
pyseqalign/scoring/__init__.py +11 -0
pyseqalign/scoring/distance.py +100 -0
pyseqalign/scoring/matrices.py +362 -0
pyseqalign/scoring/matrix_data/BLOSUM100 +31 -0
pyseqalign/scoring/matrix_data/BLOSUM50 +31 -0
pyseqalign/scoring/matrix_data/BLOSUM60 +31 -0
pyseqalign/scoring/matrix_data/BLOSUM62 +31 -0
pyseqalign/scoring/matrix_data/BLOSUM70 +31 -0
pyseqalign/scoring/matrix_data/BLOSUM80 +31 -0
pyseqalign/scoring/matrix_data/BLOSUM90 +31 -0
pyseqalign/scoring/matrix_data/PAM150 +34 -0
pyseqalign/scoring/matrix_data/PAM200 +34 -0
pyseqalign/scoring/matrix_data/PAM250 +34 -0
pyseqalign/scoring/matrix_data/PAM50 +34 -0
pyseqalign/scoring/matrix_data/__init__.py +0 -0
pyseqalign/utils/__init__.py +9 -0
pyseqalign/utils/helpers.py +47 -0
pyseqalignment-0.1.0.dist-info/METADATA +317 -0
pyseqalignment-0.1.0.dist-info/RECORD +41 -0
pyseqalignment-0.1.0.dist-info/WHEEL +5 -0
pyseqalignment-0.1.0.dist-info/licenses/LICENSE +21 -0
pyseqalignment-0.1.0.dist-info/top_level.txt +1 -0

pyseqalign/prolog/knowledge/defaults.pl ADDED Viewed

@@ -0,0 +1,15 @@
+% Default knowledge base: gap score, gap characters, initial learning rate,
+% the assign/2 helper and the gap clauses of dist/7.
+%
+% The facts are installed with assert/1 at load time so that assign/2 can
+% retract and replace them afterwards.
+:- assert(gapDefault(-1.0)).
+:- assert(gapChar('$gap')).
+:- assert(gapChar('real_gap')).
+:- assert(learningRate(0,1.0)).
+% assign(+Name, +Value)
+% Replace a stored unary fact Name(_) by Name(Value).  Does not succeed when
+% no Name/1 fact can be retracted.
+assign(X,V) :-
+	Old =..[X,_], retract(Old),
+	New =..[X,V], assert(New).
+% When the fifth or the sixth argument is 0, the result is the gap default.
+dist(sym,atomDistance,nc,0,0,_,Dist):-	gapDefault(Dist).
+dist(sym,atomDistance,nc,0,_,0,Dist):-	gapDefault(Dist).
+% Load the sibling knowledge files under the names they are shipped with in
+% this directory (amino_acids.pl and blosum50.pl); the previous names
+% 'aminoAcids.pl' and 'blossum_50.pl' do not exist in the package.
+:- consult('amino_acids.pl'),consult('blosum50.pl').

pyseqalign/prolog/knowledge/distances.pl ADDED Viewed

@@ -0,0 +1,119 @@
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%                                       %
+%      distances for logical atoms      %
+%                                       %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+% The predicate dist/7 is meant to be called with instantiated atom IDs.
+%
+% The form is:
+% dist(Mode,TypeOfDistance,NameOfDistance,Iteration,AtomID1,AtomID2,Distance)
+% where Mode is `dist` (distance) or `sym` (similarity, i.e. 1.0 - distance).
+%
+%
+%
+% Load-time setup: declarations and default facts used by the predicates
+% in this file.
+%:- use(module(library(lists))).
+% NOTE(review): source/0 is not an ISO directive and looks dialect-specific
+% -- confirm that the Prolog system this file is loaded into accepts it.
+:- source.
+%:- dynamic gapDefault/1.
+%:- dynamic gapChar/1.
+:- dynamic x/1 .	      % this may be required in some Prologs
+% Default gap score, the two atoms treated as gap characters, and the
+% learning rate recorded for iteration 0.  They are asserted (not written as
+% plain facts) so that assign/2 can retract and replace them.
+:- assert(gapDefault(-1.0)).
+:- assert(gapChar('$gap')).
+:- assert(gapChar('real_gap')).
+:- assert(learningRate(0,1.0)).
+x(0).			% An initial value is required in this example
+% assign(+Name, +Value)
+% Swap the stored unary fact Name(_) for Name(Value): the existing fact is
+% retracted first, then the replacement is asserted.
+assign(Name,Value) :-
+	Previous =.. [Name,_],
+	retract(Previous),
+	Replacement =.. [Name,Value],
+	assert(Replacement).
+% Nienhuys-Cheng Distance
+%
+% dist(Mode,TypeOfDistance,NameOfDistance,0,AtomID1,AtomID2,Distance)
+%
+% Mode is `dist` or `sym`; the fourth argument is always 0 in the clauses
+% below.  Each ID is resolved to its atom through example/2, which is not
+% defined in this file.  No clause contains a cut, so more than one clause
+% can answer the same call on backtracking.
+%
+% Mode = dist, first atom is a gap character: the gap default is returned.
+dist(dist,TypeOfDistance,NameOfDistance,0,AtomID1,AtomID2,Distance) :-
+	example(AtomID1,Atom1),
+	gapChar(Atom1),
+	gapDefault(Distance).
+% Mode = dist, second atom is a gap character: the gap default is returned.
+dist(dist,TypeOfDistance,NameOfDistance,0,AtomID1,AtomID2,Distance) :-
+	example(AtomID2,Atom2),
+	gapChar(Atom2),
+	gapDefault(Distance).
+% Mode = dist, general case: Distance is whatever distSub/6 computes.
+dist(dist,TypeOfDistance,NameOfDistance,0,AtomID1,AtomID2,Distance) :-
+	example(AtomID1,Atom1),
+	example(AtomID2,Atom2),
+	distSub(TypeOfDistance,NameOfDistance,0,Atom1,Atom2,Distance).
+% Mode = sym, first atom is a gap character: the gap default is returned.
+dist(sym,TypeOfDistance,NameOfDistance,0,AtomID1,AtomID2,Distance) :-
+	example(AtomID1,Atom1),
+	gapChar(Atom1),
+	gapDefault(Distance).
+% Mode = sym, second atom is a gap character: the gap default is returned.
+dist(sym,TypeOfDistance,NameOfDistance,0,AtomID1,AtomID2,Distance) :-
+	example(AtomID2,Atom2),
+	gapChar(Atom2),
+	gapDefault(Distance).
+% Mode = sym, general case: the distSub/6 result is turned into a
+% similarity by subtracting it from 1.0.
+dist(sym,TypeOfDistance,NameOfDistance,0,AtomID1,AtomID2,Distance) :-
+	example(AtomID1,Atom1),
+	example(AtomID2,Atom2),
+	distSub(TypeOfDistance,NameOfDistance,0,Atom1,Atom2,DistanceI),
+	Distance is 1.0-DistanceI.
+% distSub(atomDistance,nc,0,Term1,Term2,Dist)
+% Recursive distance between two terms.
+%
+% Clause 1: both arguments unify with the same term -> 0.0 (committed by
+% the cut).
+distSub(atomDistance,nc,0,Atom,Atom,Dist):-
+	!,Dist is 0.0.
+%distSub(atomDistance,nc,0,A,_,Dist):-
+%	gapChar(A),
+%	gapDefault(Dist).
+%distSub(atomDistance,nc,0,_,B,Dist):-
+%	gapChar(B),
+%	gapDefault(Dist).
+% Clause 2: same functor name and same number of arguments -> the summed
+% argument distances (from distSub_helper/6) scaled by 1/(2*Length).
+% The cut after the two length/2 goals commits to this clause once name and
+% arity are known to match.
+% NOTE(review): Length would be 0 only for two argument-less terms with the
+% same name, which presumably always unify in clause 1 -- confirm.
+distSub(atomDistance,nc,0,A,B,Dist) :-
+	A =.. [PredA|AL],
+	B =.. [PredB|BL],
+	PredA == PredB,
+	length(AL,Length),
+	length(BL,Length),!,
+	distSub_helper(atomDistance,nc,0,AL,BL,SumDist),
+	Dist is 1.0/(2*Length)*SumDist.
+% Clause 3: reached when clause 2 fails before its cut (different functor
+% name or different arity) -> 1.0.
+distSub(atomDistance,nc,0,A,B,Dist) :-
+	Dist is 1.0.
+% distSub_helper(atomDistance,nc,0,Args1,Args2,Sum)
+% Sum of the pairwise distSub/6 distances over two argument lists, walked in
+% parallel.  Only the two cases below are covered: both lists empty, or both
+% lists non-empty.
+%
+% Base case: two empty lists sum to 0.0.
+distSub_helper(atomDistance,nc,0,[],[],Dist) :-
+	!,Dist is 0.0.
+% Recursive case: distance of the two heads plus the sum over the tails.
+% The cut commits to the first solution found for the head and the tail.
+distSub_helper(atomDistance,nc,0,[A1|R1],[A2|R2],Dists) :-
+	distSub(atomDistance,nc,0,A1,A2,DistHere),
+	distSub_helper(atomDistance,nc,0,R1,R2,DistsThere),!,
+	Dists is DistHere+DistsThere.
+%distSub(atomDistance,nc,Iteration,Atom1,Atom2,Dist):-
+%	IterBefore is Iteration-1,
+%	distSub(atomDistance,nc,IterBefore,Atom1,Atom2,DistBefore),
+% this delta/5 works on the prolog programs resulting from a tilde run.
+%
+% delta(Num, Iteration, Atom1, Atom2, Delta)
+% For Iteration = 0 the result is 0.0.  Otherwise both atoms are looked up
+% through exampleC/2 as tag(Word,Tag) terms, temporary word/2, tag/3, q/2
+% and t/2 facts keyed by Num are asserted, delta/3 (defined elsewhere) is
+% called to obtain Delta, and the temporary facts are retracted again.
+delta(_, 0, _,_, 0.0).
+delta(Num, Iteration, Atom1,Atom2, Delta) :-
+	exampleC(Atom1,tag(What1,Tag1)),
+	assert(word(Num,What1)),
+	assert(tag(Num,What1,Tag1)),
+	assert(q(Num,What1)),
+	exampleC(Atom2,tag(What2,Tag2)),
+	% only assert if different
+	% Fix: the second word is What2.  The previous code asserted
+	% word(Num,What1) a second time, so the cleanup below found no
+	% word(Num,What2) to retract and left a duplicate word(Num,What1).
+	(What2 \== What1 ->
+	    assert(word(Num,What2));
+	    true),
+	% only assert if different
+	% NOTE(review): this requires both the tag and the word to differ;
+	% confirm that a different word carrying the same tag should really
+	% get no tag/3 fact.
+	((Tag1 \== Tag2, What2 \== What1) ->
+	    assert(tag(Num,What2,Tag2));
+	    true),
+	assert(t(Num,What2)),!,
+	delta(Iteration, Num, [Delta]),
+	% Cleanup: q/2 and t/2 were always asserted; the word/2 and tag/3
+	% facts for the second atom may be absent, hence the `;true` fallbacks.
+	retract(q(Num,What1)),
+	retract(t(Num,What2)),
+        (retract(word(Num,What1));true),
+        (retract(word(Num,What2));true),
+        (retract(tag(Num,What1,Tag1));true),
+        (retract(tag(Num,What2,Tag2));true).

pyseqalign/scoring/__init__.py ADDED Viewed

@@ -0,0 +1,11 @@
+"""Scoring and distance functions for sequence alignment."""
+from pyseqalign.scoring.distance import AtomDistance, SimpleMatch
+from pyseqalign.scoring.matrices import Blosum50, SubstitutionMatrix
+__all__ = [
+    "Blosum50",
+    "SubstitutionMatrix",
+    "AtomDistance",
+    "SimpleMatch",
+]

pyseqalign/scoring/distance.py ADDED Viewed

@@ -0,0 +1,100 @@
+"""Distance-based scoring functions.
+Includes the Nienhuys-Cheng atom distance from the legacy distances.pl, as well
+as a simple identity-match scorer useful for testing.
+"""
+from __future__ import annotations
+class SimpleMatch:
+    """Simple identity-based scoring: +match_score for equal elements, +mismatch_score otherwise.
+    Args:
+        match_score: Score when elements are identical.
+        mismatch_score: Score when elements differ.
+        gap_score: Score for gap characters (element ID 0).
+    """
+    def __init__(
+        self,
+        match_score: float = 5.0,
+        mismatch_score: float = -4.0,
+        gap_score: float = -8.0,
+    ) -> None:
+        self.match_score = match_score
+        self.mismatch_score = mismatch_score
+        self.gap_score = gap_score
+    def score(self, a: int, b: int) -> float:
+        if a == 0 or b == 0:
+            return self.gap_score
+        return self.match_score if a == b else self.mismatch_score
+class AtomDistance:
+    """Nienhuys-Cheng distance for structured atoms.
+    This is a Python port of the recursive atom distance from the legacy
+    distances.pl Prolog knowledge base.  It operates on structured
+    representations where each atom is a tuple of ``(predicate, *args)`` and
+    computes a normalised distance in [0, 1].
+    For the integer-ID based interface used by the alignment algorithms, use
+    ``AtomDistance`` with an *atom_store* mapping IDs to structured atoms.
+    Args:
+        atom_store: Mapping from integer element IDs to structured atoms
+            (tuples).  ID 0 is reserved for gaps.
+        gap_score: Score returned for gap characters.
+        similarity: If ``True``, return ``1 - distance`` (similarity mode,
+            matching the legacy ``sym`` mode).
+    """
+    def __init__(
+        self,
+        atom_store: dict[int, tuple] | None = None,
+        gap_score: float = -1.0,
+        similarity: bool = True,
+    ) -> None:
+        self.atom_store = atom_store or {}
+        self.gap_score = gap_score
+        self.similarity = similarity
+    def score(self, a: int, b: int) -> float:
+        """Return the (dis)similarity score between atom IDs *a* and *b*."""
+        if a == 0 or b == 0:
+            return self.gap_score
+        atom_a = self.atom_store.get(a)
+        atom_b = self.atom_store.get(b)
+        if atom_a is None or atom_b is None:
+            return self.gap_score
+        dist = self._atom_distance(atom_a, atom_b)
+        if self.similarity:
+            return 1.0 - dist
+        return dist
+    def _atom_distance(self, a: tuple, b: tuple) -> float:
+        """Recursive Nienhuys-Cheng distance between two structured atoms."""
+        if a == b:
+            return 0.0
+        # Atoms must be tuples: (predicate, arg1, arg2, ...).
+        if not isinstance(a, tuple) or not isinstance(b, tuple):
+            return 1.0
+        pred_a, *args_a = a
+        pred_b, *args_b = b
+        # Different predicate or arity => maximal distance.
+        if pred_a != pred_b or len(args_a) != len(args_b):
+            return 1.0
+        if len(args_a) == 0:
+            return 0.0
+        total = sum(self._atom_distance(ai, bi) for ai, bi in zip(args_a, args_b))
+        return total / (2 * len(args_a))

pyseqalign/scoring/matrices.py ADDED Viewed

@@ -0,0 +1,362 @@
+"""Substitution matrices for amino acid sequence alignment.
+Supports loading matrices dynamically from NCBI-format text files.
+A set of commonly used BLOSUM and PAM matrices are bundled with
+the package and can be loaded by name.
+Example usage::
+    # Load a bundled matrix by name
+    scoring = SubstitutionMatrix.from_bundled("BLOSUM62")
+    # Load from any NCBI-format file on disk
+    scoring = SubstitutionMatrix.from_file("/path/to/my/MATRIX")
+    # Download directly from NCBI FTP
+    scoring = SubstitutionMatrix.from_ncbi("PAM120")
+    # Legacy convenience alias (still works)
+    scoring = Blosum50()
+"""
+from __future__ import annotations
+from pathlib import Path
+from typing import TextIO
+# Standard one-letter amino acid codes, indexed 1..20 to match the legacy encoding.
+# Index 0 is reserved for the gap character '-'.
+AMINO_ACIDS = [
+    "-",  # 0 -- gap
+    "a",  # 1
+    "r",  # 2
+    "n",  # 3
+    "d",  # 4
+    "c",  # 5
+    "q",  # 6
+    "e",  # 7
+    "g",  # 8
+    "h",  # 9
+    "i",  # 10
+    "l",  # 11
+    "k",  # 12
+    "m",  # 13
+    "f",  # 14
+    "p",  # 15
+    "s",  # 16
+    "t",  # 17
+    "w",  # 18
+    "y",  # 19
+    "v",  # 20
+]
+# Reverse lookup: one-letter code -> integer ID.
+_AA_TO_ID: dict[str, int] = {aa: idx for idx, aa in enumerate(AMINO_ACIDS)}
+# Directory containing bundled NCBI matrix files.
+_MATRIX_DATA_DIR = Path(__file__).parent / "matrix_data"
+# NCBI FTP base URL for substitution matrices.
+_NCBI_FTP_URL = "https://ftp.ncbi.nlm.nih.gov/blast/matrices"
+def _parse_ncbi_matrix(source: TextIO) -> dict[tuple[int, int], float]:
+    """Parse an NCBI-format substitution matrix into ``{(id_i, id_j): score}``.
+    The format consists of:
+    - Comment lines starting with ``#``
+    - A header row of single-letter amino acid codes
+    - Data rows: amino acid letter followed by integer scores
+    Only the 20 standard amino acids (A R N D C Q E G H I L K M F P S T W Y V)
+    are extracted; columns for ambiguity codes (B, Z, X) and stop (*) are ignored.
+    """
+    col_ids: list[int] = []
+    matrix: dict[tuple[int, int], float] = {}
+    for line in source:
+        line = line.strip()
+        if not line or line.startswith("#"):
+            continue
+        tokens = line.split()
+        # Detect the header row: first token is a single letter that appears
+        # in our amino acid alphabet (case-insensitive).
+        if not col_ids:
+            # Header row -- all tokens should be single letters.
+            if all(len(t) == 1 for t in tokens):
+                for t in tokens:
+                    aa_id = _AA_TO_ID.get(t.lower(), -1)
+                    col_ids.append(aa_id)
+                continue
+            # Some files start the header with a leading letter (the row label
+            # coincides with the header).  Try treating the first token as a
+            # letter and the rest as letters too.
+        if not col_ids:
+            continue
+        # Data row: first token is the amino acid letter, rest are scores.
+        row_aa = tokens[0].lower()
+        row_id = _AA_TO_ID.get(row_aa, -1)
+        if row_id <= 0:
+            # Not a standard amino acid row (B, Z, X, *), skip.
+            continue
+        scores = tokens[1:]
+        for col_idx, score_str in enumerate(scores):
+            if col_idx >= len(col_ids):
+                break
+            col_id = col_ids[col_idx]
+            if col_id <= 0:
+                # Not a standard amino acid column, skip.
+                continue
+            val = float(score_str)
+            matrix[(row_id, col_id)] = val
+            matrix[(col_id, row_id)] = val
+    return matrix
+class SubstitutionMatrix:
+    """A substitution matrix scoring function loaded from an NCBI-format file.
+    Implements the ``ScoringFunction`` protocol so it can be passed directly
+    to ``SmithWaterman`` or ``NeedlemanWunsch``.
+    Args:
+        matrix: Symmetric ``{(id_i, id_j): score}`` mapping.
+        name: Human-readable name for the matrix (e.g. ``"BLOSUM62"``).
+        gap_score: Score returned when either element is a gap (ID 0).
+    """
+    def __init__(
+        self,
+        matrix: dict[tuple[int, int], float],
+        name: str = "custom",
+        gap_score: float = -8.0,
+    ) -> None:
+        self._matrix = matrix
+        self.name = name
+        self._gap_score = gap_score
+    def score(self, a: int, b: int) -> float:
+        """Return the substitution score for element IDs *a* and *b*."""
+        if a == 0 or b == 0:
+            return self._gap_score
+        return self._matrix.get((a, b), 0.0)
+    @classmethod
+    def from_file(
+        cls,
+        path: str | Path,
+        gap_score: float = -8.0,
+    ) -> SubstitutionMatrix:
+        """Load a substitution matrix from an NCBI-format text file.
+        Args:
+            path: Path to the matrix file.
+            gap_score: Score returned for gap characters.
+        Example::
+            scoring = SubstitutionMatrix.from_file("my_matrices/BLOSUM45")
+        """
+        path = Path(path)
+        with open(path) as f:
+            matrix = _parse_ncbi_matrix(f)
+        return cls(matrix, name=path.stem, gap_score=gap_score)
+    @classmethod
+    def from_string(
+        cls,
+        text: str,
+        name: str = "custom",
+        gap_score: float = -8.0,
+    ) -> SubstitutionMatrix:
+        """Parse a substitution matrix from an NCBI-format string.
+        Args:
+            text: The matrix text in NCBI format.
+            name: Name to assign to the matrix.
+            gap_score: Score returned for gap characters.
+        """
+        import io
+        matrix = _parse_ncbi_matrix(io.StringIO(text))
+        return cls(matrix, name=name, gap_score=gap_score)
+    @classmethod
+    def from_bundled(
+        cls,
+        name: str,
+        gap_score: float = -8.0,
+    ) -> SubstitutionMatrix:
+        """Load one of the bundled NCBI matrices by name.
+        Available matrices: BLOSUM50, BLOSUM60, BLOSUM62, BLOSUM70, BLOSUM80,
+        BLOSUM90, BLOSUM100, PAM50, PAM150, PAM200, PAM250.
+        Args:
+            name: Matrix name (case-insensitive), e.g. ``"BLOSUM62"`` or ``"pam250"``.
+            gap_score: Score returned for gap characters.
+        Raises:
+            FileNotFoundError: If no bundled matrix with that name exists.
+        Example::
+            scoring = SubstitutionMatrix.from_bundled("PAM250")
+        """
+        path = _MATRIX_DATA_DIR / name.upper()
+        if not path.exists():
+            available = sorted(
+                p.name for p in _MATRIX_DATA_DIR.iterdir()
+                if p.is_file() and not p.name.startswith(".")
+            )
+            raise FileNotFoundError(
+                f"No bundled matrix '{name}'. Available: {', '.join(available)}"
+            )
+        return cls.from_file(path, gap_score=gap_score)
+    @classmethod
+    def from_ncbi(
+        cls,
+        name: str,
+        gap_score: float = -8.0,
+    ) -> SubstitutionMatrix:
+        """Download a substitution matrix directly from the NCBI FTP server.
+        This fetches the matrix at runtime from
+        ``https://ftp.ncbi.nlm.nih.gov/blast/matrices/<name>``.
+        Args:
+            name: Matrix name as it appears on the NCBI FTP server
+                  (e.g. ``"BLOSUM45"``, ``"PAM120"``).
+            gap_score: Score returned for gap characters.
+        Raises:
+            urllib.error.URLError: If the download fails.
+        """
+        import io
+        import urllib.request
+        url = f"{_NCBI_FTP_URL}/{name}"
+        with urllib.request.urlopen(url) as resp:
+            text = resp.read().decode("ascii")
+        matrix = _parse_ncbi_matrix(io.StringIO(text))
+        return cls(matrix, name=name, gap_score=gap_score)
+    @classmethod
+    def list_bundled(cls) -> list[str]:
+        """Return the names of all bundled matrices."""
+        if not _MATRIX_DATA_DIR.exists():
+            return []
+        return sorted(
+            p.name for p in _MATRIX_DATA_DIR.iterdir()
+            if p.is_file() and not p.name.startswith(".")
+        )
+    def __repr__(self) -> str:
+        return f"SubstitutionMatrix(name={self.name!r}, gap_score={self._gap_score})"
+# ---------------------------------------------------------------------------
+# Legacy convenience aliases
+# ---------------------------------------------------------------------------
+# Keep the old hardcoded BLOSUM50 data for backward compatibility.
+# Keys are (row_id, col_id) pairs of element IDs in the 1..20 amino-acid
+# encoding.  Only pairs with row_id <= col_id are listed (upper triangle,
+# diagonal included); the per-row comments give the letter and ID of the row.
+_BLOSUM50_RAW: dict[tuple[int, int], float] = {
+    # a (1)
+    (1, 1): 5, (1, 2): -2, (1, 3): -1, (1, 4): -2, (1, 5): -1,
+    (1, 6): -1, (1, 7): -1, (1, 8): 0, (1, 9): -2, (1, 10): -1,
+    (1, 11): -2, (1, 12): -1, (1, 13): -1, (1, 14): -3, (1, 15): -1,
+    (1, 16): 1, (1, 17): 0, (1, 18): -3, (1, 19): -2, (1, 20): 0,
+    # r (2)
+    (2, 2): 7, (2, 3): -1, (2, 4): -2, (2, 5): -4,
+    (2, 6): 1, (2, 7): 0, (2, 8): -3, (2, 9): 0, (2, 10): -4,
+    (2, 11): -3, (2, 12): 3, (2, 13): -2, (2, 14): -3, (2, 15): -3,
+    (2, 16): -1, (2, 17): -1, (2, 18): -3, (2, 19): -1, (2, 20): -3,
+    # n (3)
+    (3, 3): 7, (3, 4): 2, (3, 5): -2,
+    (3, 6): 0, (3, 7): 0, (3, 8): 0, (3, 9): 1, (3, 10): -3,
+    (3, 11): -4, (3, 12): 0, (3, 13): -2, (3, 14): -4, (3, 15): -2,
+    (3, 16): 1, (3, 17): 0, (3, 18): -4, (3, 19): -2, (3, 20): -3,
+    # d (4)
+    (4, 4): 8, (4, 5): -4,
+    (4, 6): 0, (4, 7): 2, (4, 8): -1, (4, 9): -1, (4, 10): -4,
+    (4, 11): -4, (4, 12): -1, (4, 13): -4, (4, 14): -5, (4, 15): -1,
+    (4, 16): 0, (4, 17): -1, (4, 18): -5, (4, 19): -3, (4, 20): -4,
+    # c (5)
+    (5, 5): 13, (5, 6): -3, (5, 7): -3, (5, 8): -3, (5, 9): -3, (5, 10): -2,
+    (5, 11): -2, (5, 12): -3, (5, 13): -2, (5, 14): -2, (5, 15): -4,
+    (5, 16): -1, (5, 17): -1, (5, 18): -5, (5, 19): -3, (5, 20): -1,
+    # q (6)
+    (6, 6): 7, (6, 7): 2, (6, 8): -2, (6, 9): 1, (6, 10): -3,
+    (6, 11): -2, (6, 12): 2, (6, 13): 0, (6, 14): -4, (6, 15): -1,
+    (6, 16): 0, (6, 17): -1, (6, 18): -1, (6, 19): -1, (6, 20): -3,
+    # e (7)
+    (7, 7): 6, (7, 8): -3, (7, 9): 0, (7, 10): -4,
+    (7, 11): -3, (7, 12): 1, (7, 13): -2, (7, 14): -3, (7, 15): -1,
+    (7, 16): -1, (7, 17): -1, (7, 18): -3, (7, 19): -2, (7, 20): -3,
+    # g (8)
+    (8, 8): 8, (8, 9): -2, (8, 10): -4,
+    (8, 11): -4, (8, 12): -2, (8, 13): -3, (8, 14): -4, (8, 15): -2,
+    (8, 16): 0, (8, 17): -2, (8, 18): -3, (8, 19): -3, (8, 20): -4,
+    # h (9)
+    (9, 9): 10, (9, 10): -4,
+    (9, 11): -3, (9, 12): 0, (9, 13): -1, (9, 14): -1, (9, 15): -2,
+    (9, 16): -1, (9, 17): -2, (9, 18): -3, (9, 19): 2, (9, 20): -4,
+    # i (10)
+    (10, 10): 5, (10, 11): 2, (10, 12): -3, (10, 13): 2, (10, 14): 0,
+    (10, 15): -3, (10, 16): -3, (10, 17): -1, (10, 18): -3, (10, 19): -1, (10, 20): 4,
+    # l (11)
+    (11, 11): 5, (11, 12): -3, (11, 13): 3, (11, 14): 1,
+    (11, 15): -4, (11, 16): -3, (11, 17): -1, (11, 18): -2, (11, 19): -1, (11, 20): 1,
+    # k (12)
+    (12, 12): 6, (12, 13): -2, (12, 14): -4,
+    (12, 15): -1, (12, 16): 0, (12, 17): -1, (12, 18): -3, (12, 19): -2, (12, 20): -3,
+    # m (13)
+    (13, 13): 7, (13, 14): 0,
+    (13, 15): -3, (13, 16): -2, (13, 17): -1, (13, 18): -1, (13, 19): 0, (13, 20): 1,
+    # f (14)
+    (14, 14): 8, (14, 15): -4, (14, 16): -3, (14, 17): -2,
+    (14, 18): 1, (14, 19): 4, (14, 20): -1,
+    # p (15)
+    (15, 15): 10, (15, 16): -1, (15, 17): -1,
+    (15, 18): -4, (15, 19): -3, (15, 20): -3,
+    # s (16)
+    (16, 16): 5, (16, 17): 2, (16, 18): -4, (16, 19): -2, (16, 20): -2,
+    # t (17)
+    (17, 17): 5, (17, 18): -3, (17, 19): -2, (17, 20): 0,
+    # w (18)
+    (18, 18): 15, (18, 19): 2, (18, 20): -3,
+    # y (19)
+    (19, 19): 8, (19, 20): -1,
+    # v (20)
+    (20, 20): 5,
+}
+class Blosum50:
+    """BLOSUM50 substitution matrix scoring function (legacy convenience class).
+    Element IDs follow the legacy encoding (1..20 for amino acids, 0 for gap).
+    For new code, prefer ``SubstitutionMatrix.from_bundled("BLOSUM50")``.
+    """
+    def __init__(self, gap_score: float = -8.0) -> None:
+        self._gap_score = gap_score
+        # Build symmetric lookup.
+        self._matrix: dict[tuple[int, int], float] = {}
+        for (i, j), v in _BLOSUM50_RAW.items():
+            self._matrix[(i, j)] = v
+            self._matrix[(j, i)] = v
+    def score(self, a: int, b: int) -> float:
+        """Return BLOSUM50 score for element IDs *a* and *b*."""
+        if a == 0 or b == 0:
+            return self._gap_score
+        return self._matrix.get((a, b), 0.0)

pyseqalign/scoring/matrix_data/BLOSUM100 ADDED Viewed

@@ -0,0 +1,31 @@
+#  Matrix made by matblas from blosum100_3.iij
+#  * column uses minimum score
+#  BLOSUM Clustered Scoring Matrix in 1/3 Bit Units
+#  Blocks Database = /data/blocks_5.0/blocks.dat
+#  Cluster Percentage: >= 100
+#  Entropy =   1.4516, Expected =  -1.0948
+   A  R  N  D  C  Q  E  G  H  I  L  K  M  F  P  S  T  W  Y  V  B  Z  X  *
+A  8 -3 -4 -5 -2 -2 -3 -1 -4 -4 -4 -2 -3 -5 -2  1 -1 -6 -5 -2 -4 -2 -2 -10
+R -3 10 -2 -5 -8  0 -2 -6 -1 -7 -6  3 -4 -6 -5 -3 -3 -7 -5 -6 -4 -1 -3 -10
+N -4 -2 11  1 -5 -1 -2 -2  0 -7 -7 -1 -5 -7 -5  0 -1 -8 -5 -7  5 -2 -3 -10
+D -5 -5  1 10 -8 -2  2 -4 -3 -8 -8 -3 -8 -8 -5 -2 -4 -10 -7 -8  6  0 -4 -10
+C -2 -8 -5 -8 14 -7 -9 -7 -8 -3 -5 -8 -4 -4 -8 -3 -3 -7 -6 -3 -7 -8 -5 -10
+Q -2  0 -1 -2 -7 11  2 -5  1 -6 -5  2 -2 -6 -4 -2 -3 -5 -4 -5 -2  5 -2 -10
+E -3 -2 -2  2 -9  2 10 -6 -2 -7 -7  0 -5 -8 -4 -2 -3 -8 -7 -5  0  7 -3 -10
+G -1 -6 -2 -4 -7 -5 -6  9 -6 -9 -8 -5 -7 -8 -6 -2 -5 -7 -8 -8 -3 -5 -4 -10
+H -4 -1  0 -3 -8  1 -2 -6 13 -7 -6 -3 -5 -4 -5 -3 -4 -5  1 -7 -2 -1 -4 -10
+I -4 -7 -7 -8 -3 -6 -7 -9 -7  8  2 -6  1 -2 -7 -5 -3 -6 -4  4 -8 -7 -3 -10
+L -4 -6 -7 -8 -5 -5 -7 -8 -6  2  8 -6  3  0 -7 -6 -4 -5 -4  0 -8 -6 -3 -10
+K -2  3 -1 -3 -8  2  0 -5 -3 -6 -6 10 -4 -6 -3 -2 -3 -8 -5 -5 -2  0 -3 -10
+M -3 -4 -5 -8 -4 -2 -5 -7 -5  1  3 -4 12 -1 -5 -4 -2 -4 -5  0 -7 -4 -3 -10
+F -5 -6 -7 -8 -4 -6 -8 -8 -4 -2  0 -6 -1 11 -7 -5 -5  0  4 -3 -7 -7 -4 -10
+P -2 -5 -5 -5 -8 -4 -4 -6 -5 -7 -7 -3 -5 -7 12 -3 -4 -8 -7 -6 -5 -4 -4 -10
+S  1 -3  0 -2 -3 -2 -2 -2 -3 -5 -6 -2 -4 -5 -3  9  2 -7 -5 -4 -1 -2 -2 -10
+T -1 -3 -1 -4 -3 -3 -3 -5 -4 -3 -4 -3 -2 -5 -4  2  9 -7 -5 -1 -2 -3 -2 -10
+W -6 -7 -8 -10 -7 -5 -8 -7 -5 -6 -5 -8 -4  0 -8 -7 -7 17  2 -5 -9 -7 -6 -10
+Y -5 -5 -5 -7 -6 -4 -7 -8  1 -4 -4 -5 -5  4 -7 -5 -5  2 12 -5 -6 -6 -4 -10
+V -2 -6 -7 -8 -3 -5 -5 -8 -7  4  0 -5  0 -3 -6 -4 -1 -5 -5  8 -7 -5 -3 -10
+B -4 -4  5  6 -7 -2  0 -3 -2 -8 -8 -2 -7 -7 -5 -1 -2 -9 -6 -7  6  0 -4 -10
+Z -2 -1 -2  0 -8  5  7 -5 -1 -7 -6  0 -4 -7 -4 -2 -3 -7 -6 -5  0  6 -2 -10
+X -2 -3 -3 -4 -5 -2 -3 -4 -4 -3 -3 -3 -3 -4 -4 -2 -2 -6 -4 -3 -4 -2 -3 -10
+* -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10  1

pyseqalign/scoring/matrix_data/BLOSUM50 ADDED Viewed

@@ -0,0 +1,31 @@
+#  Matrix made by matblas from blosum50.iij
+#  * column uses minimum score
+#  BLOSUM Clustered Scoring Matrix in 1/3 Bit Units
+#  Blocks Database = /data/blocks_5.0/blocks.dat
+#  Cluster Percentage: >= 50
+#  Entropy =   0.4808, Expected =  -0.3573
+   A  R  N  D  C  Q  E  G  H  I  L  K  M  F  P  S  T  W  Y  V  B  Z  X  *
+A  5 -2 -1 -2 -1 -1 -1  0 -2 -1 -2 -1 -1 -3 -1  1  0 -3 -2  0 -2 -1 -1 -5
+R -2  7 -1 -2 -4  1  0 -3  0 -4 -3  3 -2 -3 -3 -1 -1 -3 -1 -3 -1  0 -1 -5
+N -1 -1  7  2 -2  0  0  0  1 -3 -4  0 -2 -4 -2  1  0 -4 -2 -3  4  0 -1 -5
+D -2 -2  2  8 -4  0  2 -1 -1 -4 -4 -1 -4 -5 -1  0 -1 -5 -3 -4  5  1 -1 -5
+C -1 -4 -2 -4 13 -3 -3 -3 -3 -2 -2 -3 -2 -2 -4 -1 -1 -5 -3 -1 -3 -3 -2 -5
+Q -1  1  0  0 -3  7  2 -2  1 -3 -2  2  0 -4 -1  0 -1 -1 -1 -3  0  4 -1 -5
+E -1  0  0  2 -3  2  6 -3  0 -4 -3  1 -2 -3 -1 -1 -1 -3 -2 -3  1  5 -1 -5
+G  0 -3  0 -1 -3 -2 -3  8 -2 -4 -4 -2 -3 -4 -2  0 -2 -3 -3 -4 -1 -2 -2 -5
+H -2  0  1 -1 -3  1  0 -2 10 -4 -3  0 -1 -1 -2 -1 -2 -3  2 -4  0  0 -1 -5
+I -1 -4 -3 -4 -2 -3 -4 -4 -4  5  2 -3  2  0 -3 -3 -1 -3 -1  4 -4 -3 -1 -5
+L -2 -3 -4 -4 -2 -2 -3 -4 -3  2  5 -3  3  1 -4 -3 -1 -2 -1  1 -4 -3 -1 -5
+K -1  3  0 -1 -3  2  1 -2  0 -3 -3  6 -2 -4 -1  0 -1 -3 -2 -3  0  1 -1 -5
+M -1 -2 -2 -4 -2  0 -2 -3 -1  2  3 -2  7  0 -3 -2 -1 -1  0  1 -3 -1 -1 -5
+F -3 -3 -4 -5 -2 -4 -3 -4 -1  0  1 -4  0  8 -4 -3 -2  1  4 -1 -4 -4 -2 -5
+P -1 -3 -2 -1 -4 -1 -1 -2 -2 -3 -4 -1 -3 -4 10 -1 -1 -4 -3 -3 -2 -1 -2 -5
+S  1 -1  1  0 -1  0 -1  0 -1 -3 -3  0 -2 -3 -1  5  2 -4 -2 -2  0  0 -1 -5
+T  0 -1  0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1  2  5 -3 -2  0  0 -1  0 -5
+W -3 -3 -4 -5 -5 -1 -3 -3 -3 -3 -2 -3 -1  1 -4 -4 -3 15  2 -3 -5 -2 -3 -5
+Y -2 -1 -2 -3 -3 -1 -2 -3  2 -1 -1 -2  0  4 -3 -2 -2  2  8 -1 -3 -2 -1 -5
+V  0 -3 -3 -4 -1 -3 -3 -4 -4  4  1 -3  1 -1 -3 -2  0 -3 -1  5 -4 -3 -1 -5
+B -2 -1  4  5 -3  0  1 -1  0 -4 -4  0 -3 -4 -2  0  0 -5 -3 -4  5  2 -1 -5
+Z -1  0  0  1 -3  4  5 -2  0 -3 -3  1 -1 -4 -1  0 -1 -2 -2 -3  2  5 -1 -5
+X -1 -1 -1 -1 -2 -1 -1 -2 -1 -1 -1 -1 -1 -2 -2 -1  0 -3 -1 -1 -1 -1 -1 -5
+* -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5  1