weirdo 2.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- weirdo/__init__.py +104 -0
- weirdo/amino_acid.py +33 -0
- weirdo/amino_acid_alphabet.py +158 -0
- weirdo/amino_acid_properties.py +358 -0
- weirdo/api.py +372 -0
- weirdo/blosum.py +74 -0
- weirdo/chou_fasman.py +73 -0
- weirdo/cli.py +597 -0
- weirdo/common.py +22 -0
- weirdo/data_manager.py +475 -0
- weirdo/distances.py +16 -0
- weirdo/matrices/BLOSUM30 +25 -0
- weirdo/matrices/BLOSUM50 +21 -0
- weirdo/matrices/BLOSUM62 +27 -0
- weirdo/matrices/__init__.py +0 -0
- weirdo/matrices/amino_acid_properties.txt +829 -0
- weirdo/matrices/helix_vs_coil.txt +28 -0
- weirdo/matrices/helix_vs_strand.txt +27 -0
- weirdo/matrices/pmbec.mat +21 -0
- weirdo/matrices/strand_vs_coil.txt +27 -0
- weirdo/model_manager.py +346 -0
- weirdo/peptide_vectorizer.py +78 -0
- weirdo/pmbec.py +85 -0
- weirdo/reduced_alphabet.py +61 -0
- weirdo/residue_contact_energies.py +74 -0
- weirdo/scorers/__init__.py +95 -0
- weirdo/scorers/base.py +223 -0
- weirdo/scorers/config.py +299 -0
- weirdo/scorers/mlp.py +1126 -0
- weirdo/scorers/reference.py +265 -0
- weirdo/scorers/registry.py +282 -0
- weirdo/scorers/similarity.py +386 -0
- weirdo/scorers/swissprot.py +510 -0
- weirdo/scorers/trainable.py +219 -0
- weirdo/static_data.py +17 -0
- weirdo-2.1.0.dist-info/METADATA +294 -0
- weirdo-2.1.0.dist-info/RECORD +41 -0
- weirdo-2.1.0.dist-info/WHEEL +5 -0
- weirdo-2.1.0.dist-info/entry_points.txt +2 -0
- weirdo-2.1.0.dist-info/licenses/LICENSE +201 -0
- weirdo-2.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,386 @@
|
|
|
1
|
+
"""Similarity-based foreignness scorer.
|
|
2
|
+
|
|
3
|
+
Scores peptides based on minimum distance to reference k-mers
|
|
4
|
+
using substitution matrices (BLOSUM, PMBEC).
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from typing import Any, Dict, Iterator, List, Literal, Optional, Sequence, Tuple, Union
|
|
8
|
+
|
|
9
|
+
import numpy as np
|
|
10
|
+
|
|
11
|
+
from .base import BatchScorer
|
|
12
|
+
from .reference import BaseReference
|
|
13
|
+
from .registry import register_scorer
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# Closed set of strategy names accepted for the ``distance_metric`` setting.
DistanceMetric = Literal['min_distance', 'mean_distance', 'max_similarity']
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _get_matrix(name: str) -> Tuple[np.ndarray, Dict[str, int]]:
|
|
20
|
+
"""Load a substitution matrix by name.
|
|
21
|
+
|
|
22
|
+
Parameters
|
|
23
|
+
----------
|
|
24
|
+
name : str
|
|
25
|
+
Matrix name: 'blosum30', 'blosum50', 'blosum62', or 'pmbec'.
|
|
26
|
+
|
|
27
|
+
Returns
|
|
28
|
+
-------
|
|
29
|
+
matrix : np.ndarray
|
|
30
|
+
The substitution matrix.
|
|
31
|
+
aa_indices : dict
|
|
32
|
+
Mapping from amino acid letter to matrix index.
|
|
33
|
+
"""
|
|
34
|
+
name = name.lower()
|
|
35
|
+
|
|
36
|
+
if name == 'blosum30':
|
|
37
|
+
from ..blosum import blosum30_matrix
|
|
38
|
+
matrix = blosum30_matrix
|
|
39
|
+
elif name == 'blosum50':
|
|
40
|
+
from ..blosum import blosum50_matrix
|
|
41
|
+
matrix = blosum50_matrix
|
|
42
|
+
elif name == 'blosum62':
|
|
43
|
+
from ..blosum import blosum62_matrix
|
|
44
|
+
matrix = blosum62_matrix
|
|
45
|
+
elif name == 'pmbec':
|
|
46
|
+
from ..pmbec import pmbec_matrix
|
|
47
|
+
matrix = pmbec_matrix
|
|
48
|
+
else:
|
|
49
|
+
raise ValueError(
|
|
50
|
+
f"Unknown matrix '{name}'. Available: blosum30, blosum50, blosum62, pmbec"
|
|
51
|
+
)
|
|
52
|
+
|
|
53
|
+
# Build amino acid index mapping (canonical 20 amino acids)
|
|
54
|
+
from ..amino_acid_alphabet import canonical_amino_acid_letters
|
|
55
|
+
aa_indices = {aa: i for i, aa in enumerate(canonical_amino_acid_letters)}
|
|
56
|
+
|
|
57
|
+
return matrix, aa_indices
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
@register_scorer('similarity', description='Similarity-based scoring using substitution matrices')
class SimilarityScorer(BatchScorer):
    """Similarity-based foreignness scorer.

    Scores peptides by computing the minimum distance (or related metric)
    to reference k-mers using substitution matrices like BLOSUM or PMBEC.

    Higher scores indicate more "foreign" peptides (more distant from reference).

    Parameters
    ----------
    k : int, default=8
        K-mer size for decomposing peptides.
    matrix : str, default='blosum62'
        Substitution matrix to use: 'blosum30', 'blosum50', 'blosum62', 'pmbec'.
    distance_metric : str, default='min_distance'
        How to compute distance:
        - 'min_distance': minimum distance to any retained reference k-mer (default)
        - 'mean_distance': mean distance to the retained reference k-mers
        - 'max_similarity': negative of maximum similarity
    max_candidates : int, default=1000
        Maximum number of reference k-mers retained by ``fit``. Every query
        k-mer is compared against this same retained set.
    aggregate : str, default='mean'
        How to aggregate k-mer distances: 'mean', 'max', 'min', 'sum'.
    batch_size : int, default=1000
        Forwarded to the ``BatchScorer`` constructor.
    **kwargs
        Forwarded to the ``BatchScorer`` constructor.

    Example
    -------
    >>> ref = SwissProtReference(categories=['human']).load()
    >>> scorer = SimilarityScorer(matrix='blosum62')
    >>> scorer.fit(ref)
    >>> scores = scorer.score(['MTMDKSEL', 'XXXXXXXX'])
    """

    # Per-position penalty used when either residue is absent from the
    # substitution-matrix index: added to distances, subtracted from
    # similarities. The value itself is arbitrary but deliberately high.
    _UNKNOWN_RESIDUE_PENALTY = 10.0

    def __init__(
        self,
        k: int = 8,
        matrix: str = 'blosum62',
        distance_metric: DistanceMetric = 'min_distance',
        max_candidates: int = 1000,
        aggregate: str = 'mean',
        batch_size: int = 1000,
        **kwargs
    ):
        super().__init__(batch_size=batch_size, **kwargs)
        self._params.update({
            'k': k,
            'matrix': matrix,
            'distance_metric': distance_metric,
            'max_candidates': max_candidates,
            'aggregate': aggregate,
        })

        # Load matrix and the residue -> index mapping used to address it.
        self._matrix, self._aa_indices = _get_matrix(matrix)
        # Populated by fit(); None until then.
        self._ref_kmers: Optional[List[str]] = None

    @property
    def k(self) -> int:
        """Get k-mer size."""
        return self._params['k']

    @property
    def matrix_name(self) -> str:
        """Get substitution matrix name."""
        return self._params['matrix']

    @property
    def distance_metric(self) -> DistanceMetric:
        """Get distance metric."""
        return self._params['distance_metric']

    @property
    def max_candidates(self) -> int:
        """Get maximum number of reference k-mers retained by fit()."""
        return self._params['max_candidates']

    @property
    def aggregate(self) -> str:
        """Get aggregation method."""
        return self._params['aggregate']

    def fit(self, reference: BaseReference) -> 'SimilarityScorer':
        """Fit the scorer to a reference dataset.

        All k-mers yielded by ``reference.iter_kmers()`` are collected. If
        there are more than ``max_candidates`` of them, a random subset of
        that size is kept. The sampling uses the global ``random`` module
        and is not seeded here, so repeated fits may keep different subsets.

        Parameters
        ----------
        reference : BaseReference
            Reference dataset providing k-mers to compare against.

        Returns
        -------
        self : SimilarityScorer
            Returns self for method chaining.

        Raises
        ------
        RuntimeError
            If the reference has not been loaded.
        """
        if not reference.is_loaded:
            raise RuntimeError(
                "Reference is not loaded. Call reference.load() first."
            )
        self._reference = reference

        # Cache reference k-mers; large references are down-sampled so each
        # query k-mer is compared against at most max_candidates entries.
        all_kmers = list(reference.iter_kmers())

        if len(all_kmers) <= self.max_candidates:
            self._ref_kmers = all_kmers
        else:
            import random
            self._ref_kmers = random.sample(all_kmers, self.max_candidates)

        self._is_fitted = True
        return self

    def score(self, peptides: Union[str, Sequence[str]]) -> np.ndarray:
        """Score peptide(s) for foreignness.

        Parameters
        ----------
        peptides : str or sequence of str
            Single peptide or list of peptides to score.

        Returns
        -------
        scores : np.ndarray
            Array of foreignness scores. Higher = more foreign/distant.
        """
        self._check_is_fitted()
        peptides = self._ensure_list(peptides)

        scores = np.array([self._score_peptide(p) for p in peptides])
        return scores

    def _score_peptide(self, peptide: str) -> float:
        """Score a single peptide.

        The peptide is split into all overlapping k-mers, each k-mer is
        scored against the reference, and the per-k-mer values are
        aggregated.

        Parameters
        ----------
        peptide : str
            Peptide sequence.

        Returns
        -------
        score : float
            Foreignness score (distance-based). ``inf`` when the peptide is
            shorter than ``k``.
        """
        k = self.k
        if len(peptide) < k:
            return float('inf')

        # One distance per overlapping window of length k.
        kmer_distances = np.array([
            self._kmer_distance(peptide[i:i + k])
            for i in range(len(peptide) - k + 1)
        ])
        return self._aggregate_scores(kmer_distances)

    def _kmer_distance(self, kmer: str) -> float:
        """Compute distance of a k-mer to reference.

        Parameters
        ----------
        kmer : str
            K-mer to score.

        Returns
        -------
        distance : float
            Distance to reference (metric depends on distance_metric setting).
            ``inf`` when no reference k-mers are cached.

        Raises
        ------
        ValueError
            If ``distance_metric`` is not a recognised metric name.
        """
        if not self._ref_kmers:
            return float('inf')

        metric = self.distance_metric

        if metric == 'min_distance':
            min_dist = float('inf')
            for ref_kmer in self._ref_kmers:
                dist = self._sequence_distance(kmer, ref_kmer)
                if dist < min_dist:
                    min_dist = dist
                    if dist == 0:
                        break  # Can't get better than 0
            return float(min_dist)

        elif metric == 'mean_distance':
            distances = [
                self._sequence_distance(kmer, ref_kmer)
                for ref_kmer in self._ref_kmers
            ]
            # Cast so every branch returns a plain Python float.
            return float(np.mean(distances))

        elif metric == 'max_similarity':
            max_sim = max(
                self._sequence_similarity(kmer, ref_kmer)
                for ref_kmer in self._ref_kmers
            )
            # Return negative similarity (so higher = more foreign)
            return float(-max_sim)

        else:
            raise ValueError(f"Unknown distance metric: {metric}")

    def _sequence_distance(self, seq1: str, seq2: str) -> float:
        """Compute distance between two sequences.

        Distance is computed as sum of (max_score - pairwise_score)
        for each position, where max_score is the matrix diagonal entry
        for the residue of ``seq1``. Positions where either residue is
        missing from the matrix index add a fixed penalty instead.

        Parameters
        ----------
        seq1, seq2 : str
            Sequences to compare (same length).

        Returns
        -------
        distance : float
            Total distance between sequences; ``inf`` if lengths differ.
        """
        if len(seq1) != len(seq2):
            return float('inf')

        matrix = self._matrix
        aa_indices = self._aa_indices
        total_dist = 0.0
        for a, b in zip(seq1, seq2):
            idx_a = aa_indices.get(a)
            idx_b = aa_indices.get(b)

            if idx_a is None or idx_b is None:
                # Unknown amino acid - fixed high penalty
                total_dist += self._UNKNOWN_RESIDUE_PENALTY
                continue

            # Gap between the self-match score (diagonal) and the pair score.
            total_dist += matrix[idx_a, idx_a] - matrix[idx_a, idx_b]

        return float(total_dist)

    def _sequence_similarity(self, seq1: str, seq2: str) -> float:
        """Compute similarity between two sequences.

        Similarity is the sum of matrix scores over aligned positions.
        Positions where either residue is missing from the matrix index
        subtract a fixed penalty instead.

        Parameters
        ----------
        seq1, seq2 : str
            Sequences to compare (same length).

        Returns
        -------
        similarity : float
            Total similarity score; ``-inf`` if lengths differ.
        """
        if len(seq1) != len(seq2):
            return float('-inf')

        matrix = self._matrix
        aa_indices = self._aa_indices
        total_sim = 0.0
        for a, b in zip(seq1, seq2):
            idx_a = aa_indices.get(a)
            idx_b = aa_indices.get(b)

            if idx_a is None or idx_b is None:
                # Penalty for unknown residue
                total_sim -= self._UNKNOWN_RESIDUE_PENALTY
                continue

            total_sim += matrix[idx_a, idx_b]

        return float(total_sim)

    def _aggregate_scores(self, scores: np.ndarray) -> float:
        """Aggregate k-mer scores into a single score.

        Parameters
        ----------
        scores : np.ndarray
            Per-k-mer scores.

        Returns
        -------
        score : float
            Aggregated value, or ``inf`` when ``scores`` is empty.

        Raises
        ------
        ValueError
            If ``aggregate`` is not 'mean', 'max', 'min', or 'sum'
            (only checked when ``scores`` is non-empty).
        """
        if len(scores) == 0:
            return float('inf')

        agg = self.aggregate
        if agg == 'mean':
            return float(np.mean(scores))
        elif agg == 'max':
            return float(np.max(scores))
        elif agg == 'min':
            return float(np.min(scores))
        elif agg == 'sum':
            return float(np.sum(scores))
        else:
            raise ValueError(f"Unknown aggregation method: {agg}")

    def get_closest_reference(self, kmer: str, n: int = 5) -> List[Tuple[str, float]]:
        """Find closest reference k-mers to a query k-mer.

        Useful for understanding why a k-mer has a particular score.

        Parameters
        ----------
        kmer : str
            K-mer to find matches for.
        n : int, default=5
            Number of closest matches to return. Values <= 0 yield an
            empty list.

        Returns
        -------
        matches : list of (str, float)
            List of (reference_kmer, distance) tuples, sorted by distance.
            Reference k-mers whose length differs from ``kmer`` have
            distance ``inf``.
        """
        import heapq

        self._check_is_fitted()

        if not self._ref_kmers:
            return []

        # nsmallest keeps only the n best candidates instead of sorting every
        # reference distance; ties keep their original reference order.
        scored = (
            (ref_kmer, self._sequence_distance(kmer, ref_kmer))
            for ref_kmer in self._ref_kmers
        )
        return heapq.nsmallest(n, scored, key=lambda pair: pair[1])
|