pyseqalignment 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. pyseqalign/__init__.py +14 -0
  2. pyseqalign/core/__init__.py +12 -0
  3. pyseqalign/core/alignment.py +67 -0
  4. pyseqalign/core/needleman_wunsch.py +122 -0
  5. pyseqalign/core/smith_waterman.py +173 -0
  6. pyseqalign/learning/__init__.py +20 -0
  7. pyseqalign/learning/aleph.py +212 -0
  8. pyseqalign/learning/aleph_files/__init__.py +0 -0
  9. pyseqalign/learning/aleph_files/aleph_swi_ak.pl +10420 -0
  10. pyseqalign/learning/base.py +68 -0
  11. pyseqalign/learning/popper.py +215 -0
  12. pyseqalign/learning/task_builder.py +213 -0
  13. pyseqalign/prolog/__init__.py +5 -0
  14. pyseqalign/prolog/engine.py +102 -0
  15. pyseqalign/prolog/knowledge/__init__.py +0 -0
  16. pyseqalign/prolog/knowledge/amino_acids.pl +53 -0
  17. pyseqalign/prolog/knowledge/blosum50.pl +800 -0
  18. pyseqalign/prolog/knowledge/defaults.pl +15 -0
  19. pyseqalign/prolog/knowledge/distances.pl +119 -0
  20. pyseqalign/scoring/__init__.py +11 -0
  21. pyseqalign/scoring/distance.py +100 -0
  22. pyseqalign/scoring/matrices.py +362 -0
  23. pyseqalign/scoring/matrix_data/BLOSUM100 +31 -0
  24. pyseqalign/scoring/matrix_data/BLOSUM50 +31 -0
  25. pyseqalign/scoring/matrix_data/BLOSUM60 +31 -0
  26. pyseqalign/scoring/matrix_data/BLOSUM62 +31 -0
  27. pyseqalign/scoring/matrix_data/BLOSUM70 +31 -0
  28. pyseqalign/scoring/matrix_data/BLOSUM80 +31 -0
  29. pyseqalign/scoring/matrix_data/BLOSUM90 +31 -0
  30. pyseqalign/scoring/matrix_data/PAM150 +34 -0
  31. pyseqalign/scoring/matrix_data/PAM200 +34 -0
  32. pyseqalign/scoring/matrix_data/PAM250 +34 -0
  33. pyseqalign/scoring/matrix_data/PAM50 +34 -0
  34. pyseqalign/scoring/matrix_data/__init__.py +0 -0
  35. pyseqalign/utils/__init__.py +9 -0
  36. pyseqalign/utils/helpers.py +47 -0
  37. pyseqalignment-0.1.0.dist-info/METADATA +317 -0
  38. pyseqalignment-0.1.0.dist-info/RECORD +41 -0
  39. pyseqalignment-0.1.0.dist-info/WHEEL +5 -0
  40. pyseqalignment-0.1.0.dist-info/licenses/LICENSE +21 -0
  41. pyseqalignment-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,15 @@
1
% Default settings for the alignment distance predicates.
%
%   gapDefault/1    score answered when one side of a comparison is a gap
%   gapChar/1       terms treated as gap symbols
%   learningRate/2  learning rate stored for iteration 0
:- assert(gapDefault(-1.0)).
:- assert(gapChar('$gap')).
:- assert(gapChar('real_gap')).
:- assert(learningRate(0,1.0)).

% assign(+Name, +Value)
% Replaces the stored unary fact Name(_) by Name(Value).  Fails when no
% Name/1 fact exists, because retract/1 then has nothing to remove.
assign(X,V) :-
    Old =..[X,_], retract(Old),
    New =..[X,V], assert(New).

% A 0 in either ID position is answered with the default gap score
% (presumably 0 is the reserved gap ID -- confirm against the callers).
dist(sym,atomDistance,nc,0,0,_,Dist):- gapDefault(Dist).
dist(sym,atomDistance,nc,0,_,0,Dist):- gapDefault(Dist).

% Load the companion knowledge files.  The names must match the files that
% are shipped next to this one (amino_acids.pl and blosum50.pl); the former
% names 'aminoAcids.pl' and 'blossum_50.pl' do not exist in the package.
:- consult('amino_acids.pl'),consult('blosum50.pl').
14
+
15
+
@@ -0,0 +1,119 @@
1
+ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2
+ % %
3
+ % distances for logical atoms %
4
+ % %
5
+ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
6
+
7
+ % The predicate dist/6 is meant to be called with instantiated atoms.
8
+ %
9
+ % The form is:
10
+ % dist(TypeOfDistance,NameOfDistance,Iteration,Atom1,Atom2,Distance)
11
+ %
12
+ %
13
+ %
14
+
15
%:- use(module(library(lists))).
% NOTE(review): source/0 looks like a system-specific directive -- confirm it
% exists (or is harmless) on the Prolog system used to load this file.
:- source.

%:- dynamic gapDefault/1.
%:- dynamic gapChar/1.
:- dynamic x/1 .   % this may be required in some Prologs

% Seed the settings consulted by dist/7: the gap score, the terms that count
% as gap symbols, and the learning rate for iteration 0.
:- assert(gapDefault(-1.0)),
   assert(gapChar('$gap')),
   assert(gapChar('real_gap')),
   assert(learningRate(0,1.0)).

x(0).   % An initial value is required in this example

% assign(+Name, +Value)
% Swaps the stored unary fact Name(_) for Name(Value).  The goal fails when
% no Name/1 fact is present, since retract/1 finds nothing to remove.
assign(Name, Value) :-
    functor(Previous, Name, 1),
    retract(Previous),
    Updated =.. [Name, Value],
    assert(Updated).
31
+
32
+ % Nienhuys-Cheng Distance
33
+
34
+
35
% dist(+Mode, +TypeOfDistance, +NameOfDistance, 0, +AtomID1, +AtomID2, -Distance)
%
% Both IDs are resolved to terms through example/2.
%   Mode = dist : Distance is the value computed by distSub/6.
%   Mode = sym  : Distance is 1.0 minus the value computed by distSub/6.
% In either mode, when the term behind one of the IDs is a gapChar/1, the
% gapDefault/1 value is answered instead.  The clauses contain no cuts, so
% the remaining alternatives stay reachable on backtracking.

dist(dist, _Type, _Name, 0, IdA, _IdB, GapScore) :-
    example(IdA, TermA),
    gapChar(TermA),
    gapDefault(GapScore).
dist(dist, _Type, _Name, 0, _IdA, IdB, GapScore) :-
    example(IdB, TermB),
    gapChar(TermB),
    gapDefault(GapScore).
dist(dist, Type, Name, 0, IdA, IdB, Distance) :-
    example(IdA, TermA),
    example(IdB, TermB),
    distSub(Type, Name, 0, TermA, TermB, Distance).

dist(sym, _Type, _Name, 0, IdA, _IdB, GapScore) :-
    example(IdA, TermA),
    gapChar(TermA),
    gapDefault(GapScore).
dist(sym, _Type, _Name, 0, _IdA, IdB, GapScore) :-
    example(IdB, TermB),
    gapChar(TermB),
    gapDefault(GapScore).
dist(sym, Type, Name, 0, IdA, IdB, Similarity) :-
    example(IdA, TermA),
    example(IdB, TermB),
    distSub(Type, Name, 0, TermA, TermB, Raw),
    Similarity is 1.0-Raw.
61
+
62
+
63
% distSub(atomDistance, nc, 0, +TermA, +TermB, -Dist)
%
% Recursive distance between two terms:
%   * terms that unify                      -> 0.0
%   * same functor name and same arity N    -> (sum of the pairwise argument
%                                              distances) / (2*N)
%   * anything else                         -> 1.0
distSub(atomDistance, nc, 0, Term, Term, Zero) :-
    !,
    Zero is 0.0.
%distSub(atomDistance,nc,0,A,_,Dist):-
%   gapChar(A),
%   gapDefault(Dist).
%distSub(atomDistance,nc,0,_,B,Dist):-
%   gapChar(B),
%   gapDefault(Dist).
distSub(atomDistance, nc, 0, Left, Right, Dist) :-
    Left  =.. [NameL|ArgsL],
    Right =.. [NameR|ArgsR],
    NameL == NameR,
    length(ArgsL, Arity),
    length(ArgsR, Arity),
    !,
    distSub_helper(atomDistance, nc, 0, ArgsL, ArgsR, Total),
    Dist is 1.0/(2*Arity)*Total.
distSub(atomDistance, nc, 0, _Left, _Right, One) :-
    One is 1.0.

% distSub_helper(atomDistance, nc, 0, +ArgsA, +ArgsB, -Sum)
% Sum is the total of the distSub/6 distances of the argument lists taken
% pairwise; two empty lists sum to 0.0.
distSub_helper(atomDistance, nc, 0, [], [], Sum) :-
    !,
    Sum is 0.0.
distSub_helper(atomDistance, nc, 0, [HeadA|TailA], [HeadB|TailB], Sum) :-
    distSub(atomDistance, nc, 0, HeadA, HeadB, HeadDist),
    distSub_helper(atomDistance, nc, 0, TailA, TailB, TailSum),
    !,
    Sum is HeadDist+TailSum.
88
+
89
+
90
+ %distSub(atomDistance,nc,Iteration,Atom1,Atom2,Dist):-
91
+ % IterBefore is Iteration-1,
92
+ % distSub(atomDistance,nc,IterBefore,Atom1,Atom2,DistBefore),
93
+
94
+
95
% delta(+Num, +Iteration, +Atom1, +Atom2, -Delta)
% This delta/5 works on the Prolog programs resulting from a tilde run.
%
% Iteration 0 always answers 0.0.  Otherwise both examples are looked up via
% exampleC/2, described temporarily as word/2, tag/3, q/2 and t/2 facts
% under the key Num, delta/3 is called to compute Delta, and the temporary
% facts are retracted again.
delta(_, 0, _,_, 0.0).
delta(Num, Iteration, Atom1,Atom2, Delta) :-
    exampleC(Atom1,tag(What1,Tag1)),
    assert(word(Num,What1)),
    assert(tag(Num,What1,Tag1)),
    assert(q(Num,What1)),
    exampleC(Atom2,tag(What2,Tag2)),
    % Only assert the second word if it differs from the first one:
    % word(Num,What1) is already stored, so the fact that is still missing
    % here is word(Num,What2).  (Asserting What1 again left a duplicate
    % behind after the clean-up below and never stored What2.)
    (What2 \== What1 ->
        assert(word(Num,What2));
        true),
    % only assert if different
    % NOTE(review): the tag for What2 is stored only when both the tag and
    % the word differ -- confirm this conjunction is intended.
    ((Tag1 \== Tag2, What2 \== What1) ->
        assert(tag(Num,What2,Tag2));
        true),
    assert(t(Num,What2)),!,
    delta(Iteration, Num, [Delta]),
    % Clean up the temporary facts; the optional ones may not exist.
    retract(q(Num,What1)),
    retract(t(Num,What2)),
    (retract(word(Num,What1));true),
    (retract(word(Num,What2));true),
    (retract(tag(Num,What1,Tag1));true),
    (retract(tag(Num,What2,Tag2));true).
119
+
@@ -0,0 +1,11 @@
1
+ """Scoring and distance functions for sequence alignment."""
2
+
3
+ from pyseqalign.scoring.distance import AtomDistance, SimpleMatch
4
+ from pyseqalign.scoring.matrices import Blosum50, SubstitutionMatrix
5
+
6
+ __all__ = [
7
+ "Blosum50",
8
+ "SubstitutionMatrix",
9
+ "AtomDistance",
10
+ "SimpleMatch",
11
+ ]
@@ -0,0 +1,100 @@
1
+ """Distance-based scoring functions.
2
+
3
+ Includes the Nienhuys-Cheng atom distance from the legacy distances.pl, as well
4
+ as a simple identity-match scorer useful for testing.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+
10
class SimpleMatch:
    """Identity scorer: equal elements earn one score, unequal ones another.

    Element ID 0 is treated as a gap and always yields the gap score.

    Args:
        match_score: Score when elements are identical.
        mismatch_score: Score when elements differ.
        gap_score: Score for gap characters (element ID 0).
    """

    def __init__(
        self,
        match_score: float = 5.0,
        mismatch_score: float = -4.0,
        gap_score: float = -8.0,
    ) -> None:
        self.match_score = match_score
        self.mismatch_score = mismatch_score
        self.gap_score = gap_score

    def score(self, a: int, b: int) -> float:
        """Return the score for aligning element IDs *a* and *b*."""
        # A gap on either side takes precedence over match/mismatch.
        if a == 0 or b == 0:
            return self.gap_score
        if a == b:
            return self.match_score
        return self.mismatch_score
33
+
34
+
35
class AtomDistance:
    """Nienhuys-Cheng distance for structured atoms.

    This is a Python port of the recursive atom distance from the legacy
    distances.pl Prolog knowledge base. Each atom is a tuple of
    ``(predicate, *args)`` and the computed distance is normalised to [0, 1].

    The alignment algorithms work on integer element IDs, so the scorer
    resolves IDs to structured atoms through *atom_store* before comparing
    them.

    Args:
        atom_store: Mapping from integer element IDs to structured atoms
            (tuples). ID 0 is reserved for gaps. The mapping is kept by
            reference, so entries added by the caller afterwards are seen
            by the scorer. ``None`` creates a fresh empty store.
        gap_score: Score returned for gap characters.
        similarity: If ``True``, return ``1 - distance`` (similarity mode,
            matching the legacy ``sym`` mode).
    """

    def __init__(
        self,
        atom_store: dict[int, tuple] | None = None,
        gap_score: float = -1.0,
        similarity: bool = True,
    ) -> None:
        # Compare against None instead of using ``or``: an empty dict passed
        # by the caller must be kept (it may be filled in later), not
        # silently replaced by a private one.
        self.atom_store = atom_store if atom_store is not None else {}
        self.gap_score = gap_score
        self.similarity = similarity

    def score(self, a: int, b: int) -> float:
        """Return the (dis)similarity score between atom IDs *a* and *b*.

        Returns ``gap_score`` when either ID is 0 or is missing from the
        atom store; otherwise the distance, or ``1 - distance`` in
        similarity mode.
        """
        if a == 0 or b == 0:
            return self.gap_score

        atom_a = self.atom_store.get(a)
        atom_b = self.atom_store.get(b)

        # IDs without a stored atom are scored like gaps.
        if atom_a is None or atom_b is None:
            return self.gap_score

        dist = self._atom_distance(atom_a, atom_b)
        if self.similarity:
            return 1.0 - dist
        return dist

    def _atom_distance(self, a: tuple, b: tuple) -> float:
        """Recursive Nienhuys-Cheng distance between two structured atoms.

        Equal values are at distance 0.0. Two tuples with the same predicate
        and arity are at ``sum(argument distances) / (2 * arity)``. Every
        other pair is at the maximal distance 1.0.
        """
        if a == b:
            return 0.0

        # Atoms must be tuples: (predicate, arg1, arg2, ...).
        if not isinstance(a, tuple) or not isinstance(b, tuple):
            return 1.0

        # An empty tuple has no predicate to unpack; since the values are
        # already known to differ, treat the pair as maximally distant.
        if not a or not b:
            return 1.0

        pred_a, *args_a = a
        pred_b, *args_b = b

        # Different predicate or arity => maximal distance.
        if pred_a != pred_b or len(args_a) != len(args_b):
            return 1.0

        # Guard against division by zero for argument-less atoms.
        if len(args_a) == 0:
            return 0.0

        total = sum(self._atom_distance(ai, bi) for ai, bi in zip(args_a, args_b))
        return total / (2 * len(args_a))
@@ -0,0 +1,362 @@
1
+ """Substitution matrices for amino acid sequence alignment.
2
+
3
+ Supports loading matrices dynamically from NCBI-format text files.
4
+ A set of commonly used BLOSUM and PAM matrices are bundled with
5
+ the package and can be loaded by name.
6
+
7
+ Example usage::
8
+
9
+ # Load a bundled matrix by name
10
+ scoring = SubstitutionMatrix.from_bundled("BLOSUM62")
11
+
12
+ # Load from any NCBI-format file on disk
13
+ scoring = SubstitutionMatrix.from_file("/path/to/my/MATRIX")
14
+
15
+ # Download directly from NCBI FTP
16
+ scoring = SubstitutionMatrix.from_ncbi("PAM120")
17
+
18
+ # Legacy convenience alias (still works)
19
+ scoring = Blosum50()
20
+ """
21
+
22
+ from __future__ import annotations
23
+
24
+ from pathlib import Path
25
+ from typing import TextIO
26
+
27
+ # Standard one-letter amino acid codes, indexed 1..20 to match the legacy encoding.
28
+ # Index 0 is reserved for the gap character '-'.
29
+ AMINO_ACIDS = [
30
+ "-", # 0 -- gap
31
+ "a", # 1
32
+ "r", # 2
33
+ "n", # 3
34
+ "d", # 4
35
+ "c", # 5
36
+ "q", # 6
37
+ "e", # 7
38
+ "g", # 8
39
+ "h", # 9
40
+ "i", # 10
41
+ "l", # 11
42
+ "k", # 12
43
+ "m", # 13
44
+ "f", # 14
45
+ "p", # 15
46
+ "s", # 16
47
+ "t", # 17
48
+ "w", # 18
49
+ "y", # 19
50
+ "v", # 20
51
+ ]
52
+
53
+ # Reverse lookup: one-letter code -> integer ID.
54
+ _AA_TO_ID: dict[str, int] = {aa: idx for idx, aa in enumerate(AMINO_ACIDS)}
55
+
56
+ # Directory containing bundled NCBI matrix files.
57
+ _MATRIX_DATA_DIR = Path(__file__).parent / "matrix_data"
58
+
59
+ # NCBI FTP base URL for substitution matrices.
60
+ _NCBI_FTP_URL = "https://ftp.ncbi.nlm.nih.gov/blast/matrices"
61
+
62
+
63
def _parse_ncbi_matrix(source: TextIO) -> dict[tuple[int, int], float]:
    """Parse an NCBI-format substitution matrix into ``{(id_i, id_j): score}``.

    Expected layout of *source*:

    - blank lines and comment lines starting with ``#`` (skipped)
    - one header row made up solely of single-character column labels
    - data rows: a row label followed by one score per column

    Labels are matched case-insensitively against the amino acid alphabet.
    Rows and columns whose label is not one of the 20 standard amino acids
    (e.g. B, Z, X, ``*``) are ignored. Every parsed score is stored under
    both ``(row, col)`` and ``(col, row)``.
    """
    header_ids: list[int] = []
    table: dict[tuple[int, int], float] = {}

    for raw_line in source:
        stripped = raw_line.strip()
        if not stripped or stripped.startswith("#"):
            continue

        fields = stripped.split()

        if not header_ids:
            # Still looking for the header: it is the first row whose
            # fields are all single characters. Anything before it is
            # skipped. Unknown labels are recorded as -1 so that column
            # positions stay aligned with the data rows.
            if all(len(field) == 1 for field in fields):
                header_ids.extend(
                    _AA_TO_ID.get(field.lower(), -1) for field in fields
                )
            continue

        # Data row: leading label, then the scores.
        label, *cells = fields
        row_id = _AA_TO_ID.get(label.lower(), -1)
        if row_id <= 0:
            # Gap or non-standard row label.
            continue

        # zip() stops at the shorter sequence, so surplus scores beyond the
        # header width are ignored.
        for col_id, cell in zip(header_ids, cells):
            if col_id <= 0:
                # Gap or non-standard column label.
                continue
            value = float(cell)
            table[(row_id, col_id)] = value
            table[(col_id, row_id)] = value

    return table
120
+
121
+
122
class SubstitutionMatrix:
    """A substitution matrix scoring function loaded from an NCBI-format file.

    Implements the ``ScoringFunction`` protocol so it can be passed directly
    to ``SmithWaterman`` or ``NeedlemanWunsch``.

    Args:
        matrix: Symmetric ``{(id_i, id_j): score}`` mapping.
        name: Human-readable name for the matrix (e.g. ``"BLOSUM62"``).
        gap_score: Score returned when either element is a gap (ID 0).
    """

    def __init__(
        self,
        matrix: dict[tuple[int, int], float],
        name: str = "custom",
        gap_score: float = -8.0,
    ) -> None:
        self._matrix = matrix
        self.name = name
        self._gap_score = gap_score

    def score(self, a: int, b: int) -> float:
        """Return the substitution score for element IDs *a* and *b*.

        Gaps (ID 0) yield the gap score; pairs missing from the matrix
        yield 0.0.
        """
        if a == 0 or b == 0:
            return self._gap_score
        return self._matrix.get((a, b), 0.0)

    @classmethod
    def from_file(
        cls,
        path: str | Path,
        gap_score: float = -8.0,
    ) -> SubstitutionMatrix:
        """Load a substitution matrix from an NCBI-format text file.

        The matrix is named after the file's stem.

        Args:
            path: Path to the matrix file.
            gap_score: Score returned for gap characters.

        Example::

            scoring = SubstitutionMatrix.from_file("my_matrices/BLOSUM45")
        """
        path = Path(path)
        with open(path) as f:
            matrix = _parse_ncbi_matrix(f)
        return cls(matrix, name=path.stem, gap_score=gap_score)

    @classmethod
    def from_string(
        cls,
        text: str,
        name: str = "custom",
        gap_score: float = -8.0,
    ) -> SubstitutionMatrix:
        """Parse a substitution matrix from an NCBI-format string.

        Args:
            text: The matrix text in NCBI format.
            name: Name to assign to the matrix.
            gap_score: Score returned for gap characters.
        """
        import io

        matrix = _parse_ncbi_matrix(io.StringIO(text))
        return cls(matrix, name=name, gap_score=gap_score)

    @classmethod
    def from_bundled(
        cls,
        name: str,
        gap_score: float = -8.0,
    ) -> SubstitutionMatrix:
        """Load one of the bundled NCBI matrices by name.

        Use :meth:`list_bundled` to see which matrices are available.

        Args:
            name: Matrix name (case-insensitive), e.g. ``"BLOSUM62"`` or ``"pam250"``.
            gap_score: Score returned for gap characters.

        Raises:
            FileNotFoundError: If no bundled matrix with that name exists.

        Example::

            scoring = SubstitutionMatrix.from_bundled("PAM250")
        """
        key = name.upper()
        available = cls.list_bundled()
        # Validate against the directory listing rather than joining the
        # raw name onto the data directory: this keeps package files such
        # as __init__.py and names containing path separators from being
        # opened as matrices.
        if key not in available:
            raise FileNotFoundError(
                f"No bundled matrix '{name}'. Available: {', '.join(available)}"
            )
        return cls.from_file(_MATRIX_DATA_DIR / key, gap_score=gap_score)

    @classmethod
    def from_ncbi(
        cls,
        name: str,
        gap_score: float = -8.0,
    ) -> SubstitutionMatrix:
        """Download a substitution matrix directly from the NCBI FTP server.

        This fetches the matrix at runtime from
        ``https://ftp.ncbi.nlm.nih.gov/blast/matrices/<name>``.

        Args:
            name: Matrix name as it appears on the NCBI FTP server
                (e.g. ``"BLOSUM45"``, ``"PAM120"``).
            gap_score: Score returned for gap characters.

        Raises:
            urllib.error.URLError: If the download fails.
        """
        import io
        import urllib.request

        url = f"{_NCBI_FTP_URL}/{name}"
        with urllib.request.urlopen(url) as resp:
            text = resp.read().decode("ascii")
        matrix = _parse_ncbi_matrix(io.StringIO(text))
        return cls(matrix, name=name, gap_score=gap_score)

    @classmethod
    def list_bundled(cls) -> list[str]:
        """Return the sorted names of all bundled matrices.

        Hidden files and package files (names starting with ``.`` or ``_``,
        such as ``__init__.py``) live in the same directory but are not
        matrices, so they are left out. Returns an empty list when the data
        directory does not exist.
        """
        if not _MATRIX_DATA_DIR.exists():
            return []
        return sorted(
            p.name for p in _MATRIX_DATA_DIR.iterdir()
            if p.is_file() and not p.name.startswith((".", "_"))
        )

    def __repr__(self) -> str:
        return f"SubstitutionMatrix(name={self.name!r}, gap_score={self._gap_score})"
263
+
264
+
265
+ # ---------------------------------------------------------------------------
266
+ # Legacy convenience aliases
267
+ # ---------------------------------------------------------------------------
268
+
269
+ # Keep the old hardcoded BLOSUM50 data for backward compatibility.
270
+ _BLOSUM50_RAW: dict[tuple[int, int], float] = {
271
+ # a (1)
272
+ (1, 1): 5, (1, 2): -2, (1, 3): -1, (1, 4): -2, (1, 5): -1,
273
+ (1, 6): -1, (1, 7): -1, (1, 8): 0, (1, 9): -2, (1, 10): -1,
274
+ (1, 11): -2, (1, 12): -1, (1, 13): -1, (1, 14): -3, (1, 15): -1,
275
+ (1, 16): 1, (1, 17): 0, (1, 18): -3, (1, 19): -2, (1, 20): 0,
276
+ # r (2)
277
+ (2, 2): 7, (2, 3): -1, (2, 4): -2, (2, 5): -4,
278
+ (2, 6): 1, (2, 7): 0, (2, 8): -3, (2, 9): 0, (2, 10): -4,
279
+ (2, 11): -3, (2, 12): 3, (2, 13): -2, (2, 14): -3, (2, 15): -3,
280
+ (2, 16): -1, (2, 17): -1, (2, 18): -3, (2, 19): -1, (2, 20): -3,
281
+ # n (3)
282
+ (3, 3): 7, (3, 4): 2, (3, 5): -2,
283
+ (3, 6): 0, (3, 7): 0, (3, 8): 0, (3, 9): 1, (3, 10): -3,
284
+ (3, 11): -4, (3, 12): 0, (3, 13): -2, (3, 14): -4, (3, 15): -2,
285
+ (3, 16): 1, (3, 17): 0, (3, 18): -4, (3, 19): -2, (3, 20): -3,
286
+ # d (4)
287
+ (4, 4): 8, (4, 5): -4,
288
+ (4, 6): 0, (4, 7): 2, (4, 8): -1, (4, 9): -1, (4, 10): -4,
289
+ (4, 11): -4, (4, 12): -1, (4, 13): -4, (4, 14): -5, (4, 15): -1,
290
+ (4, 16): 0, (4, 17): -1, (4, 18): -5, (4, 19): -3, (4, 20): -4,
291
+ # c (5)
292
+ (5, 5): 13, (5, 6): -3, (5, 7): -3, (5, 8): -3, (5, 9): -3, (5, 10): -2,
293
+ (5, 11): -2, (5, 12): -3, (5, 13): -2, (5, 14): -2, (5, 15): -4,
294
+ (5, 16): -1, (5, 17): -1, (5, 18): -5, (5, 19): -3, (5, 20): -1,
295
+ # q (6)
296
+ (6, 6): 7, (6, 7): 2, (6, 8): -2, (6, 9): 1, (6, 10): -3,
297
+ (6, 11): -2, (6, 12): 2, (6, 13): 0, (6, 14): -4, (6, 15): -1,
298
+ (6, 16): 0, (6, 17): -1, (6, 18): -1, (6, 19): -1, (6, 20): -3,
299
+ # e (7)
300
+ (7, 7): 6, (7, 8): -3, (7, 9): 0, (7, 10): -4,
301
+ (7, 11): -3, (7, 12): 1, (7, 13): -2, (7, 14): -3, (7, 15): -1,
302
+ (7, 16): -1, (7, 17): -1, (7, 18): -3, (7, 19): -2, (7, 20): -3,
303
+ # g (8)
304
+ (8, 8): 8, (8, 9): -2, (8, 10): -4,
305
+ (8, 11): -4, (8, 12): -2, (8, 13): -3, (8, 14): -4, (8, 15): -2,
306
+ (8, 16): 0, (8, 17): -2, (8, 18): -3, (8, 19): -3, (8, 20): -4,
307
+ # h (9)
308
+ (9, 9): 10, (9, 10): -4,
309
+ (9, 11): -3, (9, 12): 0, (9, 13): -1, (9, 14): -1, (9, 15): -2,
310
+ (9, 16): -1, (9, 17): -2, (9, 18): -3, (9, 19): 2, (9, 20): -4,
311
+ # i (10)
312
+ (10, 10): 5, (10, 11): 2, (10, 12): -3, (10, 13): 2, (10, 14): 0,
313
+ (10, 15): -3, (10, 16): -3, (10, 17): -1, (10, 18): -3, (10, 19): -1, (10, 20): 4,
314
+ # l (11)
315
+ (11, 11): 5, (11, 12): -3, (11, 13): 3, (11, 14): 1,
316
+ (11, 15): -4, (11, 16): -3, (11, 17): -1, (11, 18): -2, (11, 19): -1, (11, 20): 1,
317
+ # k (12)
318
+ (12, 12): 6, (12, 13): -2, (12, 14): -4,
319
+ (12, 15): -1, (12, 16): 0, (12, 17): -1, (12, 18): -3, (12, 19): -2, (12, 20): -3,
320
+ # m (13)
321
+ (13, 13): 7, (13, 14): 0,
322
+ (13, 15): -3, (13, 16): -2, (13, 17): -1, (13, 18): -1, (13, 19): 0, (13, 20): 1,
323
+ # f (14)
324
+ (14, 14): 8, (14, 15): -4, (14, 16): -3, (14, 17): -2,
325
+ (14, 18): 1, (14, 19): 4, (14, 20): -1,
326
+ # p (15)
327
+ (15, 15): 10, (15, 16): -1, (15, 17): -1,
328
+ (15, 18): -4, (15, 19): -3, (15, 20): -3,
329
+ # s (16)
330
+ (16, 16): 5, (16, 17): 2, (16, 18): -4, (16, 19): -2, (16, 20): -2,
331
+ # t (17)
332
+ (17, 17): 5, (17, 18): -3, (17, 19): -2, (17, 20): 0,
333
+ # w (18)
334
+ (18, 18): 15, (18, 19): 2, (18, 20): -3,
335
+ # y (19)
336
+ (19, 19): 8, (19, 20): -1,
337
+ # v (20)
338
+ (20, 20): 5,
339
+ }
340
+
341
+
342
class Blosum50:
    """Legacy convenience scorer backed by the hardcoded BLOSUM50 table.

    Element IDs follow the legacy encoding (1..20 for amino acids, 0 for gap).

    For new code, prefer ``SubstitutionMatrix.from_bundled("BLOSUM50")``.
    """

    def __init__(self, gap_score: float = -8.0) -> None:
        self._gap_score = gap_score
        # The raw table stores only one orientation of each pair; mirror
        # every entry so lookups work regardless of argument order.
        mirrored: dict[tuple[int, int], float] = {}
        for pair, value in _BLOSUM50_RAW.items():
            first, second = pair
            mirrored[(first, second)] = value
            mirrored[(second, first)] = value
        self._matrix = mirrored

    def score(self, a: int, b: int) -> float:
        """Return BLOSUM50 score for element IDs *a* and *b*.

        Gaps (ID 0) yield the gap score; unknown pairs yield 0.0.
        """
        if a == 0 or b == 0:
            return self._gap_score
        return self._matrix.get((a, b), 0.0)
@@ -0,0 +1,31 @@
1
+ # Matrix made by matblas from blosum100_3.iij
2
+ # * column uses minimum score
3
+ # BLOSUM Clustered Scoring Matrix in 1/3 Bit Units
4
+ # Blocks Database = /data/blocks_5.0/blocks.dat
5
+ # Cluster Percentage: >= 100
6
+ # Entropy = 1.4516, Expected = -1.0948
7
+ A R N D C Q E G H I L K M F P S T W Y V B Z X *
8
+ A 8 -3 -4 -5 -2 -2 -3 -1 -4 -4 -4 -2 -3 -5 -2 1 -1 -6 -5 -2 -4 -2 -2 -10
9
+ R -3 10 -2 -5 -8 0 -2 -6 -1 -7 -6 3 -4 -6 -5 -3 -3 -7 -5 -6 -4 -1 -3 -10
10
+ N -4 -2 11 1 -5 -1 -2 -2 0 -7 -7 -1 -5 -7 -5 0 -1 -8 -5 -7 5 -2 -3 -10
11
+ D -5 -5 1 10 -8 -2 2 -4 -3 -8 -8 -3 -8 -8 -5 -2 -4 -10 -7 -8 6 0 -4 -10
12
+ C -2 -8 -5 -8 14 -7 -9 -7 -8 -3 -5 -8 -4 -4 -8 -3 -3 -7 -6 -3 -7 -8 -5 -10
13
+ Q -2 0 -1 -2 -7 11 2 -5 1 -6 -5 2 -2 -6 -4 -2 -3 -5 -4 -5 -2 5 -2 -10
14
+ E -3 -2 -2 2 -9 2 10 -6 -2 -7 -7 0 -5 -8 -4 -2 -3 -8 -7 -5 0 7 -3 -10
15
+ G -1 -6 -2 -4 -7 -5 -6 9 -6 -9 -8 -5 -7 -8 -6 -2 -5 -7 -8 -8 -3 -5 -4 -10
16
+ H -4 -1 0 -3 -8 1 -2 -6 13 -7 -6 -3 -5 -4 -5 -3 -4 -5 1 -7 -2 -1 -4 -10
17
+ I -4 -7 -7 -8 -3 -6 -7 -9 -7 8 2 -6 1 -2 -7 -5 -3 -6 -4 4 -8 -7 -3 -10
18
+ L -4 -6 -7 -8 -5 -5 -7 -8 -6 2 8 -6 3 0 -7 -6 -4 -5 -4 0 -8 -6 -3 -10
19
+ K -2 3 -1 -3 -8 2 0 -5 -3 -6 -6 10 -4 -6 -3 -2 -3 -8 -5 -5 -2 0 -3 -10
20
+ M -3 -4 -5 -8 -4 -2 -5 -7 -5 1 3 -4 12 -1 -5 -4 -2 -4 -5 0 -7 -4 -3 -10
21
+ F -5 -6 -7 -8 -4 -6 -8 -8 -4 -2 0 -6 -1 11 -7 -5 -5 0 4 -3 -7 -7 -4 -10
22
+ P -2 -5 -5 -5 -8 -4 -4 -6 -5 -7 -7 -3 -5 -7 12 -3 -4 -8 -7 -6 -5 -4 -4 -10
23
+ S 1 -3 0 -2 -3 -2 -2 -2 -3 -5 -6 -2 -4 -5 -3 9 2 -7 -5 -4 -1 -2 -2 -10
24
+ T -1 -3 -1 -4 -3 -3 -3 -5 -4 -3 -4 -3 -2 -5 -4 2 9 -7 -5 -1 -2 -3 -2 -10
25
+ W -6 -7 -8 -10 -7 -5 -8 -7 -5 -6 -5 -8 -4 0 -8 -7 -7 17 2 -5 -9 -7 -6 -10
26
+ Y -5 -5 -5 -7 -6 -4 -7 -8 1 -4 -4 -5 -5 4 -7 -5 -5 2 12 -5 -6 -6 -4 -10
27
+ V -2 -6 -7 -8 -3 -5 -5 -8 -7 4 0 -5 0 -3 -6 -4 -1 -5 -5 8 -7 -5 -3 -10
28
+ B -4 -4 5 6 -7 -2 0 -3 -2 -8 -8 -2 -7 -7 -5 -1 -2 -9 -6 -7 6 0 -4 -10
29
+ Z -2 -1 -2 0 -8 5 7 -5 -1 -7 -6 0 -4 -7 -4 -2 -3 -7 -6 -5 0 6 -2 -10
30
+ X -2 -3 -3 -4 -5 -2 -3 -4 -4 -3 -3 -3 -3 -4 -4 -2 -2 -6 -4 -3 -4 -2 -3 -10
31
+ * -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 1
@@ -0,0 +1,31 @@
1
+ # Matrix made by matblas from blosum50.iij
2
+ # * column uses minimum score
3
+ # BLOSUM Clustered Scoring Matrix in 1/3 Bit Units
4
+ # Blocks Database = /data/blocks_5.0/blocks.dat
5
+ # Cluster Percentage: >= 50
6
+ # Entropy = 0.4808, Expected = -0.3573
7
+ A R N D C Q E G H I L K M F P S T W Y V B Z X *
8
+ A 5 -2 -1 -2 -1 -1 -1 0 -2 -1 -2 -1 -1 -3 -1 1 0 -3 -2 0 -2 -1 -1 -5
9
+ R -2 7 -1 -2 -4 1 0 -3 0 -4 -3 3 -2 -3 -3 -1 -1 -3 -1 -3 -1 0 -1 -5
10
+ N -1 -1 7 2 -2 0 0 0 1 -3 -4 0 -2 -4 -2 1 0 -4 -2 -3 4 0 -1 -5
11
+ D -2 -2 2 8 -4 0 2 -1 -1 -4 -4 -1 -4 -5 -1 0 -1 -5 -3 -4 5 1 -1 -5
12
+ C -1 -4 -2 -4 13 -3 -3 -3 -3 -2 -2 -3 -2 -2 -4 -1 -1 -5 -3 -1 -3 -3 -2 -5
13
+ Q -1 1 0 0 -3 7 2 -2 1 -3 -2 2 0 -4 -1 0 -1 -1 -1 -3 0 4 -1 -5
14
+ E -1 0 0 2 -3 2 6 -3 0 -4 -3 1 -2 -3 -1 -1 -1 -3 -2 -3 1 5 -1 -5
15
+ G 0 -3 0 -1 -3 -2 -3 8 -2 -4 -4 -2 -3 -4 -2 0 -2 -3 -3 -4 -1 -2 -2 -5
16
+ H -2 0 1 -1 -3 1 0 -2 10 -4 -3 0 -1 -1 -2 -1 -2 -3 2 -4 0 0 -1 -5
17
+ I -1 -4 -3 -4 -2 -3 -4 -4 -4 5 2 -3 2 0 -3 -3 -1 -3 -1 4 -4 -3 -1 -5
18
+ L -2 -3 -4 -4 -2 -2 -3 -4 -3 2 5 -3 3 1 -4 -3 -1 -2 -1 1 -4 -3 -1 -5
19
+ K -1 3 0 -1 -3 2 1 -2 0 -3 -3 6 -2 -4 -1 0 -1 -3 -2 -3 0 1 -1 -5
20
+ M -1 -2 -2 -4 -2 0 -2 -3 -1 2 3 -2 7 0 -3 -2 -1 -1 0 1 -3 -1 -1 -5
21
+ F -3 -3 -4 -5 -2 -4 -3 -4 -1 0 1 -4 0 8 -4 -3 -2 1 4 -1 -4 -4 -2 -5
22
+ P -1 -3 -2 -1 -4 -1 -1 -2 -2 -3 -4 -1 -3 -4 10 -1 -1 -4 -3 -3 -2 -1 -2 -5
23
+ S 1 -1 1 0 -1 0 -1 0 -1 -3 -3 0 -2 -3 -1 5 2 -4 -2 -2 0 0 -1 -5
24
+ T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 2 5 -3 -2 0 0 -1 0 -5
25
+ W -3 -3 -4 -5 -5 -1 -3 -3 -3 -3 -2 -3 -1 1 -4 -4 -3 15 2 -3 -5 -2 -3 -5
26
+ Y -2 -1 -2 -3 -3 -1 -2 -3 2 -1 -1 -2 0 4 -3 -2 -2 2 8 -1 -3 -2 -1 -5
27
+ V 0 -3 -3 -4 -1 -3 -3 -4 -4 4 1 -3 1 -1 -3 -2 0 -3 -1 5 -4 -3 -1 -5
28
+ B -2 -1 4 5 -3 0 1 -1 0 -4 -4 0 -3 -4 -2 0 0 -5 -3 -4 5 2 -1 -5
29
+ Z -1 0 0 1 -3 4 5 -2 0 -3 -3 1 -1 -4 -1 0 -1 -2 -2 -3 2 5 -1 -5
30
+ X -1 -1 -1 -1 -2 -1 -1 -2 -1 -1 -1 -1 -1 -2 -2 -1 0 -3 -1 -1 -1 -1 -1 -5
31
+ * -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 1