weirdo 2.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,386 @@
1
+ """Similarity-based foreignness scorer.
2
+
3
+ Scores peptides based on minimum distance to reference k-mers
4
+ using substitution matrices (BLOSUM, PMBEC).
5
+ """
6
+
7
+ from typing import Any, Dict, Iterator, List, Literal, Optional, Sequence, Tuple, Union
8
+
9
+ import numpy as np
10
+
11
+ from .base import BatchScorer
12
+ from .reference import BaseReference
13
+ from .registry import register_scorer
14
+
15
+
16
# Closed set of strategy names accepted as ``distance_metric`` by
# SimilarityScorer; each name selects one branch in ``_kmer_distance``.
DistanceMetric = Literal['min_distance', 'mean_distance', 'max_similarity']
17
+
18
+
19
+ def _get_matrix(name: str) -> Tuple[np.ndarray, Dict[str, int]]:
20
+ """Load a substitution matrix by name.
21
+
22
+ Parameters
23
+ ----------
24
+ name : str
25
+ Matrix name: 'blosum30', 'blosum50', 'blosum62', or 'pmbec'.
26
+
27
+ Returns
28
+ -------
29
+ matrix : np.ndarray
30
+ The substitution matrix.
31
+ aa_indices : dict
32
+ Mapping from amino acid letter to matrix index.
33
+ """
34
+ name = name.lower()
35
+
36
+ if name == 'blosum30':
37
+ from ..blosum import blosum30_matrix
38
+ matrix = blosum30_matrix
39
+ elif name == 'blosum50':
40
+ from ..blosum import blosum50_matrix
41
+ matrix = blosum50_matrix
42
+ elif name == 'blosum62':
43
+ from ..blosum import blosum62_matrix
44
+ matrix = blosum62_matrix
45
+ elif name == 'pmbec':
46
+ from ..pmbec import pmbec_matrix
47
+ matrix = pmbec_matrix
48
+ else:
49
+ raise ValueError(
50
+ f"Unknown matrix '{name}'. Available: blosum30, blosum50, blosum62, pmbec"
51
+ )
52
+
53
+ # Build amino acid index mapping (canonical 20 amino acids)
54
+ from ..amino_acid_alphabet import canonical_amino_acid_letters
55
+ aa_indices = {aa: i for i, aa in enumerate(canonical_amino_acid_letters)}
56
+
57
+ return matrix, aa_indices
58
+
59
+
60
@register_scorer('similarity', description='Similarity-based scoring using substitution matrices')
class SimilarityScorer(BatchScorer):
    """Similarity-based foreignness scorer.

    Scores peptides by computing the minimum distance (or a related metric)
    between each of their k-mers and a cached set of reference k-mers, using
    a substitution matrix such as BLOSUM or PMBEC.

    Higher scores indicate more "foreign" peptides (more distant from the
    reference).

    Parameters
    ----------
    k : int, default=8
        K-mer size for decomposing peptides. Must be at least 1.
    matrix : str, default='blosum62'
        Substitution matrix to use: 'blosum30', 'blosum50', 'blosum62', 'pmbec'.
    distance_metric : str, default='min_distance'
        How to compute the distance of one query k-mer:
        - 'min_distance': minimum distance to any cached reference k-mer
        - 'mean_distance': mean distance over the cached reference k-mers
        - 'max_similarity': negative of the maximum similarity
    max_candidates : int, default=1000
        Upper bound on the number of reference k-mers kept by ``fit``. When
        the reference yields more k-mers than this, a random subset of this
        size is drawn once at fit time and every query k-mer is compared
        against that same subset. Must be at least 1.
    aggregate : str, default='mean'
        How to aggregate k-mer distances: 'mean', 'max', 'min', 'sum'.
    batch_size : int, default=1000
        Forwarded to the base class constructor.
    **kwargs
        Forwarded to the base class constructor.

    Raises
    ------
    ValueError
        If ``k`` or ``max_candidates`` is smaller than 1, or if
        ``distance_metric``, ``aggregate`` or ``matrix`` is not a
        supported name.

    Example
    -------
    >>> ref = SwissProtReference(categories=['human']).load()
    >>> scorer = SimilarityScorer(matrix='blosum62')
    >>> scorer.fit(ref)
    >>> scores = scorer.score(['MTMDKSEL', 'XXXXXXXX'])
    """

    # Per-position penalty used when either residue is absent from the
    # matrix alphabet (added to distances, subtracted from similarities).
    _UNKNOWN_PENALTY = 10.0
    _DISTANCE_METRICS = ('min_distance', 'mean_distance', 'max_similarity')
    _AGGREGATES = ('mean', 'max', 'min', 'sum')

    def __init__(
        self,
        k: int = 8,
        matrix: str = 'blosum62',
        distance_metric: DistanceMetric = 'min_distance',
        max_candidates: int = 1000,
        aggregate: str = 'mean',
        batch_size: int = 1000,
        **kwargs
    ):
        # Fail fast on configuration that would otherwise only surface at
        # scoring time (or silently yield meaningless scores).
        if k < 1:
            raise ValueError(f"k must be a positive integer, got {k!r}")
        if max_candidates < 1:
            raise ValueError(
                f"max_candidates must be a positive integer, got {max_candidates!r}"
            )
        if distance_metric not in self._DISTANCE_METRICS:
            raise ValueError(f"Unknown distance metric: {distance_metric}")
        if aggregate not in self._AGGREGATES:
            raise ValueError(f"Unknown aggregation method: {aggregate}")

        super().__init__(batch_size=batch_size, **kwargs)
        self._params.update({
            'k': k,
            'matrix': matrix,
            'distance_metric': distance_metric,
            'max_candidates': max_candidates,
            'aggregate': aggregate,
        })

        # Load the substitution matrix and the residue -> index mapping.
        self._matrix, self._aa_indices = _get_matrix(matrix)
        # Populated by fit(); None until then.
        self._ref_kmers: Optional[List[str]] = None

    @property
    def k(self) -> int:
        """Get k-mer size."""
        return self._params['k']

    @property
    def matrix_name(self) -> str:
        """Get substitution matrix name."""
        return self._params['matrix']

    @property
    def distance_metric(self) -> DistanceMetric:
        """Get distance metric."""
        return self._params['distance_metric']

    @property
    def max_candidates(self) -> int:
        """Get the maximum number of reference k-mers kept by ``fit``."""
        return self._params['max_candidates']

    @property
    def aggregate(self) -> str:
        """Get aggregation method."""
        return self._params['aggregate']

    def fit(self, reference: BaseReference) -> 'SimilarityScorer':
        """Fit the scorer to a reference dataset.

        All k-mers yielded by ``reference.iter_kmers()`` are collected; if
        there are more than ``max_candidates`` of them, a random subset of
        that size is kept instead. Sampling uses the global ``random``
        module state, so the subset varies between fits unless the caller
        seeds ``random`` beforehand.

        Parameters
        ----------
        reference : BaseReference
            Reference dataset providing k-mers to compare against.

        Returns
        -------
        self : SimilarityScorer
            Returns self for method chaining.

        Raises
        ------
        RuntimeError
            If the reference has not been loaded.
        """
        if not reference.is_loaded:
            raise RuntimeError(
                "Reference is not loaded. Call reference.load() first."
            )
        self._reference = reference

        all_kmers = list(reference.iter_kmers())
        limit = self.max_candidates

        if len(all_kmers) <= limit:
            self._ref_kmers = all_kmers
        else:
            # Reference is larger than the budget: keep a random subset.
            import random
            self._ref_kmers = random.sample(all_kmers, limit)

        self._is_fitted = True
        return self

    def score(self, peptides: Union[str, Sequence[str]]) -> np.ndarray:
        """Score peptide(s) for foreignness.

        Parameters
        ----------
        peptides : str or sequence of str
            Single peptide or list of peptides to score.

        Returns
        -------
        scores : np.ndarray
            Array of foreignness scores, one per peptide.
            Higher = more foreign/distant.
        """
        self._check_is_fitted()
        peptides = self._ensure_list(peptides)
        return np.array([self._score_peptide(p) for p in peptides])

    def _score_peptide(self, peptide: str) -> float:
        """Score a single peptide.

        The peptide is split into all overlapping k-mers, each k-mer is
        scored with ``_kmer_distance`` and the results are combined with
        ``_aggregate_scores``.

        Parameters
        ----------
        peptide : str
            Peptide sequence.

        Returns
        -------
        score : float
            Foreignness score (distance-based); ``inf`` when the peptide is
            shorter than ``k``.
        """
        k = self.k
        if len(peptide) < k:
            return float('inf')

        kmer_distances = np.array([
            self._kmer_distance(peptide[start:start + k])
            for start in range(len(peptide) - k + 1)
        ])
        return self._aggregate_scores(kmer_distances)

    def _kmer_distance(self, kmer: str) -> float:
        """Compute distance of a k-mer to the cached reference k-mers.

        Parameters
        ----------
        kmer : str
            K-mer to score.

        Returns
        -------
        distance : float
            Distance to reference (metric depends on ``distance_metric``);
            ``inf`` when no reference k-mers are cached.

        Raises
        ------
        ValueError
            If ``distance_metric`` is not a supported name.
        """
        ref_kmers = self._ref_kmers
        if not ref_kmers:
            return float('inf')

        metric = self.distance_metric

        if metric == 'min_distance':
            best = float('inf')
            for ref_kmer in ref_kmers:
                dist = self._sequence_distance(kmer, ref_kmer)
                if dist < best:
                    best = dist
                    # Stop on an exact match.
                    # NOTE(review): this assumes no off-diagonal matrix
                    # entry exceeds its row's diagonal (i.e. distances are
                    # never negative) - confirm for every supported matrix.
                    if dist == 0:
                        break
            return float(best)

        if metric == 'mean_distance':
            return float(np.mean([
                self._sequence_distance(kmer, ref_kmer)
                for ref_kmer in ref_kmers
            ]))

        if metric == 'max_similarity':
            best_similarity = max(
                self._sequence_similarity(kmer, ref_kmer)
                for ref_kmer in ref_kmers
            )
            # Negate so that higher still means more foreign.
            return float(-best_similarity)

        raise ValueError(f"Unknown distance metric: {metric}")

    def _sequence_distance(self, seq1: str, seq2: str) -> float:
        """Compute distance between two sequences.

        For every aligned position the contribution is
        ``matrix[a, a] - matrix[a, b]`` where ``a`` comes from ``seq1`` and
        ``b`` from ``seq2``; the measure is therefore not guaranteed to be
        symmetric in its arguments. Positions where either residue is not
        in the matrix alphabet contribute a fixed penalty instead.

        Parameters
        ----------
        seq1, seq2 : str
            Sequences to compare (same length).

        Returns
        -------
        distance : float
            Total distance between sequences; ``inf`` when the lengths
            differ.
        """
        if len(seq1) != len(seq2):
            return float('inf')

        matrix = self._matrix
        index_of = self._aa_indices.get
        total_dist = 0.0

        for a, b in zip(seq1, seq2):
            idx_a = index_of(a)
            idx_b = index_of(b)

            if idx_a is None or idx_b is None:
                # Unknown residue: fixed penalty, even if both are equal.
                total_dist += self._UNKNOWN_PENALTY
                continue

            # Gap between the self-match score of `a` and the actual score.
            total_dist += matrix[idx_a, idx_a] - matrix[idx_a, idx_b]

        return total_dist

    def _sequence_similarity(self, seq1: str, seq2: str) -> float:
        """Compute similarity between two sequences.

        Sums the substitution-matrix score of every aligned residue pair;
        positions where either residue is not in the matrix alphabet
        subtract a fixed penalty instead.

        Parameters
        ----------
        seq1, seq2 : str
            Sequences to compare (same length).

        Returns
        -------
        similarity : float
            Total similarity score; ``-inf`` when the lengths differ.
        """
        if len(seq1) != len(seq2):
            return float('-inf')

        matrix = self._matrix
        index_of = self._aa_indices.get
        total_sim = 0.0

        for a, b in zip(seq1, seq2):
            idx_a = index_of(a)
            idx_b = index_of(b)

            if idx_a is None or idx_b is None:
                total_sim -= self._UNKNOWN_PENALTY
                continue

            total_sim += matrix[idx_a, idx_b]

        return total_sim

    def _aggregate_scores(self, scores: np.ndarray) -> float:
        """Aggregate k-mer scores into a single score.

        Returns ``inf`` for an empty array and raises ``ValueError`` when
        the configured aggregation method is not supported.
        """
        if len(scores) == 0:
            return float('inf')

        agg = self.aggregate
        if agg == 'mean':
            return float(np.mean(scores))
        if agg == 'max':
            return float(np.max(scores))
        if agg == 'min':
            return float(np.min(scores))
        if agg == 'sum':
            return float(np.sum(scores))
        raise ValueError(f"Unknown aggregation method: {agg}")

    def get_closest_reference(self, kmer: str, n: int = 5) -> List[Tuple[str, float]]:
        """Find closest reference k-mers to a query k-mer.

        Useful for understanding why a k-mer has a particular score. Only
        the reference k-mers cached by ``fit`` are searched.

        Parameters
        ----------
        kmer : str
            K-mer to find matches for.
        n : int, default=5
            Number of closest matches to return. Values of 0 or less yield
            an empty list.

        Returns
        -------
        matches : list of (str, float)
            List of (reference_kmer, distance) tuples, sorted by distance.
            Ties keep the order of the cached reference k-mers.
        """
        import heapq

        self._check_is_fitted()

        if n <= 0 or not self._ref_kmers:
            return []

        scored = (
            (ref_kmer, self._sequence_distance(kmer, ref_kmer))
            for ref_kmer in self._ref_kmers
        )
        # nsmallest is equivalent to sorted(...)[:n] but avoids sorting the
        # entire reference when only a handful of matches is requested.
        return heapq.nsmallest(n, scored, key=lambda pair: pair[1])