weirdo 2.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
weirdo/scorers/mlp.py ADDED
@@ -0,0 +1,1126 @@
1
+ """MLP-based origin scorer.
2
+
3
+ Neural network model for learning category probabilities from labeled data.
4
+ Uses rich peptide features including amino acid properties and composition
5
+ statistics.
6
+ """
7
+
8
+ import pickle
9
+ from pathlib import Path
10
+ from typing import Dict, List, Optional, Sequence, Tuple, Union
11
+
12
+ import numpy as np
13
+ from sklearn.neural_network import MLPRegressor
14
+ from sklearn.preprocessing import StandardScaler
15
+
16
+ from .trainable import TrainableScorer
17
+ from .registry import register_scorer
18
+ from ..reduced_alphabet import alphabets as REDUCED_ALPHABETS
19
+
20
+
21
# Canonical amino-acid ordering and index lookup shared by the feature
# extractors below. Index 20 ('X') is reserved for unknown/padding residues.
AA_TO_IDX = {
    'A': 0, 'C': 1, 'D': 2, 'E': 3, 'F': 4,
    'G': 5, 'H': 6, 'I': 7, 'K': 8, 'L': 9,
    'M': 10, 'N': 11, 'P': 12, 'Q': 13, 'R': 14,
    'S': 15, 'T': 16, 'V': 17, 'W': 18, 'Y': 19,
    'X': 20,  # Unknown/padding
}
# The 20 standard amino acids, in the order used for composition vectors.
AMINO_ACIDS = 'ACDEFGHIKLMNPQRSTVWY'
NUM_AMINO_ACIDS = 21  # 20 standard residues plus the 'X' placeholder

# Physicochemical residue categories consumed by _compute_structural_features.
# Categories deliberately overlap (e.g. 'A' is hydrophobic, aliphatic, tiny,
# small, and disorder-promoting).
POSITIVE_CHARGED = set('KRH')  # Basic residues (H partially charged at pH 7)
NEGATIVE_CHARGED = set('DE')  # Acidic residues
HYDROPHOBIC = set('AILMFVPWG')
AROMATIC = set('FWY')
ALIPHATIC = set('AVILM')
POLAR_UNCHARGED = set('STNQ')
TINY = set('AGS')
SMALL = set('AGSCTDNPV')
DISORDER_PROMOTING = set('AEGRQSKP')  # Disorder-promoting residues
ORDER_PROMOTING = set('WFYILMVC')  # Order-promoting residues

# Chou-Fasman secondary structure propensities (values > 1 favor forming the
# corresponding structural element). NOTE(review): values look like the classic
# Chou-Fasman table — confirm against the original publication before relying
# on exact numbers.
HELIX_PROPENSITY = {
    'A': 1.42, 'C': 0.70, 'D': 1.01, 'E': 1.51, 'F': 1.13,
    'G': 0.57, 'H': 1.00, 'I': 1.08, 'K': 1.16, 'L': 1.21,
    'M': 1.45, 'N': 0.67, 'P': 0.57, 'Q': 1.11, 'R': 0.98,
    'S': 0.77, 'T': 0.83, 'V': 1.06, 'W': 1.08, 'Y': 0.69,
}
SHEET_PROPENSITY = {
    'A': 0.83, 'C': 1.19, 'D': 0.54, 'E': 0.37, 'F': 1.38,
    'G': 0.75, 'H': 0.87, 'I': 1.60, 'K': 0.74, 'L': 1.30,
    'M': 1.05, 'N': 0.89, 'P': 0.55, 'Q': 1.10, 'R': 0.93,
    'S': 0.75, 'T': 1.19, 'V': 1.70, 'W': 1.37, 'Y': 1.47,
}
TURN_PROPENSITY = {
    'A': 0.66, 'C': 1.19, 'D': 1.46, 'E': 0.74, 'F': 0.60,
    'G': 1.56, 'H': 0.95, 'I': 0.47, 'K': 1.01, 'L': 0.59,
    'M': 0.60, 'N': 1.56, 'P': 1.52, 'Q': 0.98, 'R': 0.95,
    'S': 1.43, 'T': 0.96, 'V': 0.50, 'W': 0.96, 'Y': 1.14,
}
63
+
64
+
65
def _get_aa_properties() -> Dict[str, Dict[str, float]]:
    """Return the named amino-acid property tables used for featurization.

    Each value is a per-residue lookup (amino acid letter -> float) pulled
    from the package's amino_acid_properties module. Imported lazily so the
    tables are only loaded when features are actually computed.
    """
    from .. import amino_acid_properties as _props

    property_names = [
        'accessible_surface_area',
        'accessible_surface_area_folded',
        'hydropathy',
        'hydrophilicity',
        'local_flexibility',
        'mass',
        'pK_side_chain',
        'polarity',
        'prct_exposed_residues',
        'refractivity',
        'solvent_exposed_area',
        'volume',
    ]
    return {name: getattr(_props, name) for name in property_names}
95
+
96
+
97
+ def _compute_property_features(peptide: str, properties: Dict[str, Dict[str, float]]) -> np.ndarray:
98
+ """Compute aggregate statistics from amino acid properties.
99
+
100
+ For each property, compute: mean, std, min, max over the peptide.
101
+ Returns array of shape (n_properties * 4,).
102
+ """
103
+ features = []
104
+
105
+ for prop_name, prop_dict in properties.items():
106
+ # Get property values for each residue
107
+ values = [prop_dict[aa] for aa in peptide if aa in prop_dict]
108
+
109
+ if values:
110
+ features.extend([
111
+ np.mean(values),
112
+ np.std(values),
113
+ np.min(values),
114
+ np.max(values),
115
+ ])
116
+ else:
117
+ # Unknown amino acids - use zeros
118
+ features.extend([0.0, 0.0, 0.0, 0.0])
119
+
120
+ return np.array(features, dtype=np.float32)
121
+
122
+
123
+ def _compute_composition_features(peptide: str) -> np.ndarray:
124
+ """Compute amino acid composition (frequencies).
125
+
126
+ Returns array of shape (20,) with frequency of each amino acid.
127
+ """
128
+ counts = np.zeros(20, dtype=np.float32)
129
+ for aa in peptide:
130
+ idx = AMINO_ACIDS.find(aa)
131
+ if idx >= 0:
132
+ counts[idx] += 1
133
+
134
+ # Normalize to frequencies
135
+ if len(peptide) > 0:
136
+ counts /= len(peptide)
137
+
138
+ return counts
139
+
140
+
141
+ def _compute_dipeptide_features(peptide: str) -> np.ndarray:
142
+ """Compute dipeptide composition (frequencies of AA pairs).
143
+
144
+ Returns array of shape (400,) with frequency of each dipeptide.
145
+ """
146
+ counts = np.zeros(400, dtype=np.float32) # 20 * 20
147
+
148
+ for i in range(len(peptide) - 1):
149
+ aa1_idx = AMINO_ACIDS.find(peptide[i])
150
+ aa2_idx = AMINO_ACIDS.find(peptide[i + 1])
151
+ if aa1_idx >= 0 and aa2_idx >= 0:
152
+ counts[aa1_idx * 20 + aa2_idx] += 1
153
+
154
+ # Normalize to frequencies
155
+ n_dipeptides = max(1, len(peptide) - 1)
156
+ counts /= n_dipeptides
157
+
158
+ return counts
159
+
160
+
161
def _compute_structural_features(peptide: str) -> np.ndarray:
    """Compute structural and physicochemical category features.

    Returns array with:
    - Secondary structure propensities (helix, sheet, turn) - 12 features (3 props × 4 stats)
    - Category fractions (9 features: charged+/-, hydrophobic, aromatic, etc.)
    - Charge features (4 features: net charge per residue, charge transitions,
      max same-sign charge cluster, R/(R+K) ratio)
    - Disorder features (2 features: disorder/order promoting fractions)

    Total: 27 features
    """
    # Guard denominator: empty peptide divides by 1 so all fractions are 0.
    n = len(peptide) if peptide else 1
    features = []

    # Secondary structure propensities - mean, std, min, max for each
    for prop_dict in [HELIX_PROPENSITY, SHEET_PROPENSITY, TURN_PROPENSITY]:
        # The .get default is never hit (membership is checked), kept for safety.
        values = [prop_dict.get(aa, 1.0) for aa in peptide if aa in prop_dict]
        if values:
            features.extend([np.mean(values), np.std(values), np.min(values), np.max(values)])
        else:
            # Neutral propensity (1.0) when no residue is in the table.
            features.extend([1.0, 0.0, 1.0, 1.0])

    # Category fractions (9 features)
    features.append(sum(1 for aa in peptide if aa in POSITIVE_CHARGED) / n)  # Positive charged
    features.append(sum(1 for aa in peptide if aa in NEGATIVE_CHARGED) / n)  # Negative charged
    features.append(sum(1 for aa in peptide if aa in HYDROPHOBIC) / n)  # Hydrophobic
    features.append(sum(1 for aa in peptide if aa in AROMATIC) / n)  # Aromatic
    features.append(sum(1 for aa in peptide if aa in ALIPHATIC) / n)  # Aliphatic
    features.append(sum(1 for aa in peptide if aa in POLAR_UNCHARGED) / n)  # Polar uncharged
    features.append(sum(1 for aa in peptide if aa in TINY) / n)  # Tiny
    features.append(sum(1 for aa in peptide if aa in SMALL) / n)  # Small
    features.append(sum(1 for aa in peptide if aa == 'C') / n)  # Cysteine (viral)

    # Charge features (4 features)
    pos_count = sum(1 for aa in peptide if aa in POSITIVE_CHARGED)
    neg_count = sum(1 for aa in peptide if aa in NEGATIVE_CHARGED)
    net_charge = pos_count - neg_count
    features.append(net_charge / n)  # Net charge per residue

    # Charge transitions (+ to - or - to +) between adjacent residues
    transitions = 0
    for i in range(len(peptide) - 1):
        curr_pos = peptide[i] in POSITIVE_CHARGED
        curr_neg = peptide[i] in NEGATIVE_CHARGED
        next_pos = peptide[i+1] in POSITIVE_CHARGED
        next_neg = peptide[i+1] in NEGATIVE_CHARGED
        if (curr_pos and next_neg) or (curr_neg and next_pos):
            transitions += 1
    features.append(transitions / max(1, n - 1))  # Charge transitions

    # Charge clustering - max consecutive same-sign charges.
    # Note: an uncharged residue between two like charges resets the run.
    max_cluster = 0
    current_cluster = 0
    current_sign = None
    for aa in peptide:
        if aa in POSITIVE_CHARGED:
            sign = '+'
        elif aa in NEGATIVE_CHARGED:
            sign = '-'
        else:
            sign = None
        if sign and sign == current_sign:
            current_cluster += 1
        elif sign:
            current_cluster = 1
            current_sign = sign
        else:
            current_cluster = 0
            current_sign = None
        max_cluster = max(max_cluster, current_cluster)
    features.append(max_cluster / n)  # Max charge cluster size

    # Arginine depletion (viruses often have less R) - R/(R+K) ratio
    r_count = sum(1 for aa in peptide if aa == 'R')
    k_count = sum(1 for aa in peptide if aa == 'K')
    if r_count + k_count > 0:
        features.append(r_count / (r_count + k_count))
    else:
        features.append(0.5)  # Neutral when no R or K present

    # Disorder features (2 features)
    disorder_promoting = sum(1 for aa in peptide if aa in DISORDER_PROMOTING)
    order_promoting = sum(1 for aa in peptide if aa in ORDER_PROMOTING)
    features.append(disorder_promoting / n)  # Disorder-promoting fraction
    features.append(order_promoting / n)  # Order-promoting fraction

    return np.array(features, dtype=np.float32)
248
+
249
+
250
def _build_reduced_alphabet_index():
    """Index every reduced alphabet's groups in a deterministic order.

    Returns the alphabet names (REDUCED_ALPHABETS key order) and, per
    alphabet, its group representatives ordered by first appearance while
    scanning the 20 standard amino acids, plus a representative-to-index
    reverse lookup. Ordering is stable so feature vectors line up across
    runs and serialized models.
    """
    names = list(REDUCED_ALPHABETS.keys())
    index = {}
    for name in names:
        mapping = REDUCED_ALPHABETS[name]
        reps: List[str] = []
        seen = set()
        for aa in AMINO_ACIDS:
            rep = mapping.get(aa)
            # Record each group representative once, in first-seen order.
            if rep is not None and rep not in seen:
                seen.add(rep)
                reps.append(rep)
        index[name] = {
            'groups': reps,
            'rep_to_idx': {rep: pos for pos, rep in enumerate(reps)},
        }
    return names, index


REDUCED_ALPHABET_ORDER, REDUCED_ALPHABET_GROUPS = _build_reduced_alphabet_index()
271
+
272
+
273
def _compute_sequence_stats(peptide: str) -> np.ndarray:
    """Compute sequence-level non-positional statistics.

    Returns a 12-element float32 vector:
    length, log1p(length), sqrt(length), fraction of unknown residues,
    fraction of the 20 standard AAs present, max homopolymer run fraction,
    adjacent-repeat fraction, normalized composition entropy, effective
    alphabet size (exp(entropy)/20), max AA frequency, top-2 AA frequency
    sum, and Gini impurity of the composition.
    """
    if not peptide:
        return np.zeros(12, dtype=np.float32)

    n = len(peptide)
    log_len = np.log1p(n)
    sqrt_len = np.sqrt(n)

    # Count standard residues; anything outside AMINO_ACIDS is "unknown".
    counts = np.zeros(20, dtype=np.float32)
    unknown = 0
    for aa in peptide:
        idx = AMINO_ACIDS.find(aa)
        if idx >= 0:
            counts[idx] += 1
        else:
            unknown += 1

    total = counts.sum()
    if total > 0:
        freqs = counts / total
        # Shannon entropy over the observed composition, normalized by the
        # maximum possible (uniform over 20 AAs) so it lies in [0, 1].
        nonzero = freqs[freqs > 0]
        entropy = -np.sum(nonzero * np.log(nonzero))
        entropy_norm = entropy / np.log(20)
        effective = np.exp(entropy) / 20.0  # Effective alphabet size / 20
        gini = 1.0 - np.sum(freqs ** 2)  # Gini impurity of composition
        max_freq = float(freqs.max())
        top2 = float(np.sort(freqs)[-2:].sum())
        unique_frac = float(np.count_nonzero(counts) / 20.0)
    else:
        # Peptide consists entirely of unknown residues.
        entropy_norm = 0.0
        effective = 0.0
        gini = 0.0
        max_freq = 0.0
        top2 = 0.0
        unique_frac = 0.0

    # Run-length and repeat statistics over the raw sequence (unknown
    # residues included — identical adjacent characters always count).
    max_run = 1
    repeats = 0
    current_run = 1
    for i in range(1, n):
        if peptide[i] == peptide[i - 1]:
            repeats += 1
            current_run += 1
        else:
            current_run = 1
        if current_run > max_run:
            max_run = current_run

    max_run_frac = max_run / n
    repeat_frac = repeats / max(1, n - 1)
    frac_unknown = unknown / n

    return np.array([
        n,
        log_len,
        sqrt_len,
        frac_unknown,
        unique_frac,
        max_run_frac,
        repeat_frac,
        entropy_norm,
        effective,
        max_freq,
        top2,
        gini,
    ], dtype=np.float32)
341
+
342
+
343
def _compute_reduced_alphabet_features(peptide: str) -> np.ndarray:
    """Concatenate per-alphabet group frequency vectors for the peptide.

    For each reduced alphabet (in REDUCED_ALPHABET_ORDER), counts how often
    each group representative occurs and normalizes by the number of mapped
    residues; residues absent from a mapping are ignored. An empty peptide
    yields an all-zero vector of the full concatenated width.
    """
    if not peptide:
        width = sum(
            len(REDUCED_ALPHABET_GROUPS[name]['groups'])
            for name in REDUCED_ALPHABET_ORDER
        )
        return np.zeros(width, dtype=np.float32)

    parts: List[np.ndarray] = []
    for name in REDUCED_ALPHABET_ORDER:
        mapping = REDUCED_ALPHABETS[name]
        info = REDUCED_ALPHABET_GROUPS[name]
        rep_to_idx = info['rep_to_idx']
        vec = np.zeros(len(info['groups']), dtype=np.float32)
        mapped = 0
        for residue in peptide:
            rep = mapping.get(residue)
            if rep is None:
                continue
            vec[rep_to_idx[rep]] += 1
            mapped += 1
        if mapped:
            vec /= mapped
        parts.append(vec)

    return np.concatenate(parts) if parts else np.array([], dtype=np.float32)
370
+
371
+
372
+ def _compute_dipeptide_summary(dipeptide_freqs: np.ndarray) -> np.ndarray:
373
+ """Compute summary statistics from dipeptide frequencies."""
374
+ if dipeptide_freqs.size == 0:
375
+ return np.zeros(5, dtype=np.float32)
376
+
377
+ total = dipeptide_freqs.sum()
378
+ if total > 0:
379
+ probs = dipeptide_freqs / total
380
+ nonzero = probs[probs > 0]
381
+ entropy = -np.sum(nonzero * np.log(nonzero))
382
+ entropy_norm = entropy / np.log(probs.size)
383
+ gini = 1.0 - np.sum(probs ** 2)
384
+ max_freq = float(probs.max())
385
+ top2 = float(np.sort(probs)[-2:].sum()) if probs.size >= 2 else max_freq
386
+ homodipep = float(np.trace(probs.reshape(20, 20)))
387
+ else:
388
+ entropy_norm = 0.0
389
+ gini = 0.0
390
+ max_freq = 0.0
391
+ top2 = 0.0
392
+ homodipep = 0.0
393
+
394
+ return np.array(
395
+ [entropy_norm, gini, max_freq, top2, homodipep],
396
+ dtype=np.float32,
397
+ )
398
+
399
+
400
def extract_features(peptide: str, k: int = 8, use_dipeptides: bool = True) -> np.ndarray:
    """Extract the full feature vector for a single peptide.

    Concatenates, in order:
    - Amino acid property statistics (12 props × 4 stats = 48 features)
    - Structural/physicochemical features (27 features)
    - Amino acid composition (20 features)
    - Sequence-level statistics (12 features)
    - Reduced alphabet compositions (80 features)
    - Dipeptide summary statistics (5 features, when enabled)
    - Dipeptide composition (400 features, when enabled)

    Parameters
    ----------
    peptide : str
        Peptide sequence.
    k : int
        Unused; retained for backward compatibility.
    use_dipeptides : bool
        Include dipeptide composition features.

    Returns
    -------
    np.ndarray
        Concatenated feature vector.
    """
    props = _get_aa_properties()

    parts = [
        _compute_property_features(peptide, props),
        _compute_structural_features(peptide),
        _compute_composition_features(peptide),
        _compute_sequence_stats(peptide),
        _compute_reduced_alphabet_features(peptide),
    ]

    if use_dipeptides:
        dipep = _compute_dipeptide_features(peptide)
        # Summary statistics precede the raw 400-dim composition.
        parts.append(_compute_dipeptide_summary(dipep))
        parts.append(dipep)

    return np.concatenate(parts)
442
+
443
+
444
+ @register_scorer('mlp', description='MLP foreignness scorer with rich peptide features')
445
+ class MLPScorer(TrainableScorer):
446
+ """MLP-based origin scorer using rich peptide features.
447
+
448
+ Combines multiple feature types:
449
+ - Amino acid properties (hydropathy, mass, polarity, etc.)
450
+ - Amino acid composition (single AA frequencies)
451
+ - Dipeptide composition (AA pair frequencies)
452
+ - Sequence-level statistics (entropy, repeats, complexity)
453
+ - Reduced alphabet compositions (Murphy/GBMR/SDM, etc.)
454
+
455
+ All features are normalized using StandardScaler before training.
456
+
457
+ Parameters
458
+ ----------
459
+ k : int, default=8
460
+ K-mer size used to window long peptides for aggregation.
461
+ hidden_layer_sizes : tuple of int, default=(256, 128, 64)
462
+ Sizes of hidden layers.
463
+ activation : str, default='relu'
464
+ Activation function: 'relu', 'tanh', 'logistic'.
465
+ alpha : float, default=0.0001
466
+ L2 regularization strength.
467
+ max_iter : int, default=200
468
+ Maximum training iterations.
469
+ early_stopping : bool, default=True
470
+ Use early stopping with validation split.
471
+ use_dipeptides : bool, default=True
472
+ Include dipeptide composition features.
473
+ batch_size : int, default=256
474
+ Batch size for training.
475
+
476
+ Example
477
+ -------
478
+ >>> from weirdo.scorers import MLPScorer
479
+ >>>
480
+ >>> scorer = MLPScorer(hidden_layer_sizes=(256, 128))
481
+ >>> scorer.train(peptides, labels, target_categories=['human', 'viruses'])
482
+ >>> scores = scorer.score(['MTMDKSEL', 'XXXXXXXX'])
483
+ """
484
+
485
+ def __init__(
486
+ self,
487
+ k: int = 8,
488
+ hidden_layer_sizes: Tuple[int, ...] = (256, 128, 64),
489
+ activation: str = 'relu',
490
+ alpha: float = 0.0001,
491
+ learning_rate_init: float = 0.001,
492
+ max_iter: int = 200,
493
+ early_stopping: bool = True,
494
+ use_dipeptides: bool = True,
495
+ batch_size: int = 256,
496
+ random_state: Optional[int] = None,
497
+ **kwargs
498
+ ):
499
+ super().__init__(k=k, batch_size=batch_size, **kwargs)
500
+ self._params.update({
501
+ 'hidden_layer_sizes': hidden_layer_sizes,
502
+ 'activation': activation,
503
+ 'alpha': alpha,
504
+ 'learning_rate_init': learning_rate_init,
505
+ 'max_iter': max_iter,
506
+ 'early_stopping': early_stopping,
507
+ 'use_dipeptides': use_dipeptides,
508
+ 'random_state': random_state,
509
+ })
510
+ self._model: Optional[MLPRegressor] = None
511
+ self._scaler: Optional[StandardScaler] = None
512
+ self._target_categories: Optional[List[str]] = None
513
+
514
+ def _extract_features(self, peptides: Sequence[str]) -> np.ndarray:
515
+ """Extract features from a list of peptides."""
516
+ return np.array([
517
+ extract_features(p, self.k, self._params['use_dipeptides'])
518
+ for p in peptides
519
+ ])
520
+
521
+ def _predict_raw_kmers(self, kmers: Sequence[str]) -> np.ndarray:
522
+ """Predict raw model outputs for k-mers (no aggregation)."""
523
+ if not self._is_trained:
524
+ raise RuntimeError("Model must be trained before scoring.")
525
+ if self._model is None or self._scaler is None:
526
+ raise RuntimeError("Model is not initialized. Train or load a model first.")
527
+
528
+ X = self._extract_features(kmers)
529
+ X_scaled = self._scaler.transform(X)
530
+ return self._model.predict(X_scaled)
531
+
532
    def train(
        self,
        peptides: Sequence[str],
        labels: Sequence[float],
        val_peptides: Optional[Sequence[str]] = None,
        val_labels: Optional[Sequence[float]] = None,
        epochs: Optional[int] = None,
        learning_rate: Optional[float] = None,
        verbose: bool = True,
        target_categories: Optional[List[str]] = None,
        plot_loss: Union[bool, str, Path, None] = None,
        **kwargs
    ) -> 'MLPScorer':
        """Train the MLP on labeled peptide data.

        Parameters
        ----------
        peptides : sequence of str
            Training peptide sequences.
        labels : sequence of float or 2D array
            Target labels in [0, 1]. Can be 1D (single foreignness score)
            or 2D (multi-label with one column per category).
        val_peptides : sequence of str, optional
            Not used (sklearn handles validation internally).
        val_labels : sequence of float, optional
            Not used.
        epochs : int, optional
            Maximum training iterations (maps to max_iter). Defaults to max_iter.
        learning_rate : float, optional
            Initial learning rate. Defaults to learning_rate_init if not provided.
        verbose : bool, default=True
            Print training progress.
        target_categories : list of str, optional
            Names of target categories (for multi-label training).
            E.g., ['human', 'viruses', 'bacteria', 'mammals'].
        plot_loss : bool, str, or Path, optional
            Save loss curve plot. If True, saves to 'loss_curve.png' in
            current directory. If a path, saves to that location.

        Returns
        -------
        self : MLPScorer

        Raises
        ------
        ValueError
            If target_categories length does not match the label dimensions.
        """
        self._target_categories = target_categories
        # Fall back to constructor-time hyperparameters when not overridden.
        if epochs is None:
            epochs = self._params['max_iter']
        if learning_rate is None:
            learning_rate = self._params['learning_rate_init']
        # Extract features
        X = self._extract_features(peptides)
        y = np.array(labels)
        # Validate that the label shape matches the declared categories
        # before any expensive fitting happens.
        if target_categories is not None:
            if y.ndim == 1 and len(target_categories) != 1:
                raise ValueError("target_categories length must match label dimensions.")
            if y.ndim == 2 and y.shape[1] != len(target_categories):
                raise ValueError("target_categories length must match label dimensions.")

        # Scale features to zero mean, unit variance
        self._scaler = StandardScaler()
        X_scaled = self._scaler.fit_transform(X)

        # Disable early stopping if dataset is too small: a 10% validation
        # split of < 20 samples would be too tiny to be meaningful.
        use_early_stopping = self._params['early_stopping']
        if use_early_stopping and len(peptides) < 20:
            use_early_stopping = False
            if verbose:
                print("Note: Early stopping disabled (dataset too small)")

        # Create and train model. MLPRegressor applies a sigmoid downstream
        # (in predict_proba) rather than here; raw outputs are unbounded.
        self._model = MLPRegressor(
            hidden_layer_sizes=self._params['hidden_layer_sizes'],
            activation=self._params['activation'],
            alpha=self._params['alpha'],
            learning_rate_init=learning_rate,
            max_iter=epochs,
            early_stopping=use_early_stopping,
            # validation_fraction is ignored by sklearn when early_stopping
            # is False, so 0.0 is safe here.
            validation_fraction=0.1 if use_early_stopping else 0.0,
            n_iter_no_change=10,
            random_state=self._params['random_state'],
            verbose=verbose,
        )

        self._model.fit(X_scaled, y)

        self._is_trained = True
        self._is_fitted = True

        # Save training metadata
        self._metadata['n_train'] = len(peptides)
        self._metadata['n_features'] = X.shape[1]
        self._metadata['n_epochs'] = self._model.n_iter_
        self._metadata['final_train_loss'] = float(self._model.loss_)
        # best_loss_ only exists / is set when early stopping was active.
        if hasattr(self._model, 'best_loss_') and self._model.best_loss_ is not None:
            self._metadata['best_val_loss'] = float(self._model.best_loss_)

        self._training_history = [
            {'epoch': i + 1, 'loss': loss}
            for i, loss in enumerate(self._model.loss_curve_)
        ]

        if verbose:
            print(f"\nTraining complete:")
            print(f"  Features: {X.shape[1]}")
            print(f"  Iterations: {self._model.n_iter_}")
            print(f"  Final loss: {self._model.loss_:.4f}")

        # Save loss curve plot if requested
        if plot_loss:
            self._save_loss_plot(plot_loss, verbose=verbose)

        return self
643
+
644
    def _save_loss_plot(
        self,
        path: Union[bool, str, Path],
        verbose: bool = True,
    ) -> Path:
        """Save loss curve plot to file.

        Parameters
        ----------
        path : bool, str, or Path
            If True, saves to 'loss_curve.png'. If str/Path, saves to that location.
        verbose : bool
            Print save location.

        Returns
        -------
        save_path : Path
            Path where plot was saved.

        Raises
        ------
        RuntimeError
            If called before the model has been trained.
        """
        # Imported lazily so matplotlib is only required when plotting.
        import matplotlib.pyplot as plt

        if not self._is_trained or self._model is None:
            raise RuntimeError("Model must be trained before saving loss plot.")

        # Determine save path
        if path is True:
            save_path = Path('loss_curve.png')
        else:
            save_path = Path(path)

        # Get loss curve
        loss_curve = self._model.loss_curve_

        # Create plot
        fig, ax = plt.subplots(figsize=(10, 6))
        epochs = range(1, len(loss_curve) + 1)
        ax.plot(epochs, loss_curve, 'b-', linewidth=2)
        ax.set_xlabel('Epoch', fontsize=12)
        ax.set_ylabel('Loss', fontsize=12)
        ax.set_title(f'MLPScorer Training Loss ({self._metadata.get("n_train", "?")} samples)', fontsize=14)
        ax.grid(True, alpha=0.3)

        # Use log scale if loss spans multiple orders of magnitude
        # (max(..., 1e-10) guards against a zero final loss).
        if len(loss_curve) > 1 and loss_curve[0] / max(loss_curve[-1], 1e-10) > 10:
            ax.set_yscale('log')

        # Add annotations for start and end loss values.
        ax.annotate(
            f'Start: {loss_curve[0]:.3f}',
            xy=(1, loss_curve[0]),
            xytext=(len(loss_curve) * 0.1, loss_curve[0] * 1.2),
            fontsize=10,
            arrowprops=dict(arrowstyle='->', color='gray', alpha=0.7),
        )
        ax.annotate(
            f'End: {loss_curve[-1]:.4f}',
            xy=(len(loss_curve), loss_curve[-1]),
            xytext=(len(loss_curve) * 0.7, loss_curve[-1] * 2),
            fontsize=10,
            arrowprops=dict(arrowstyle='->', color='gray', alpha=0.7),
        )

        plt.tight_layout()
        plt.savefig(save_path, dpi=150)
        # Close explicitly to free the figure in long-running sessions.
        plt.close(fig)

        if verbose:
            print(f"  Loss plot saved to: {save_path}")

        return save_path
714
+
715
    def score(
        self,
        peptides: Union[str, Sequence[str]],
        aggregate: str = 'mean',
        pathogen_categories: Optional[List[str]] = None,
        self_categories: Optional[List[str]] = None,
    ) -> np.ndarray:
        """Score peptides for foreignness.

        For variable-length peptides, scores are computed per k-mer and aggregated.

        Parameters
        ----------
        peptides : str or sequence of str
            Peptide(s) to score.
        aggregate : str, default='mean'
            How to aggregate k-mer probabilities: 'mean', 'max', 'min'.
        pathogen_categories : list of str, optional
            Categories considered "foreign" (default: ['bacteria', 'viruses']).
            Only used for multi-label models.
        self_categories : list of str, optional
            Categories considered "self" (default: ['human', 'rodents', 'mammals']).
            Only used for multi-label models.

        Returns
        -------
        scores : np.ndarray
            Foreignness scores (higher = more foreign).

        Raises
        ------
        RuntimeError
            If the model is untrained, or is multi-output but was trained
            without target_categories.
        """
        if not self._is_trained:
            raise RuntimeError("Model must be trained before scoring.")

        # Single-output models return sigmoid probabilities directly;
        # multi-label models delegate to foreignness().
        if self._target_categories is None:
            probs = self.predict_proba(peptides, aggregate=aggregate)
            if probs.shape[1] > 1:
                raise RuntimeError(
                    "Model has multiple outputs. Use predict_proba() or train with "
                    "target_categories to compute foreignness."
                )
            return probs.ravel()

        return self.foreignness(
            peptides,
            pathogen_categories=pathogen_categories,
            self_categories=self_categories,
            aggregate=aggregate,
        )
760
+
761
    def _score_batch_impl(self, batch: List[str]) -> np.ndarray:
        """Score a batch of peptides.

        Hook for the base class's batched-scoring machinery; delegates to
        score() with its default aggregation and category settings.
        """
        return self.score(batch)
764
+
765
+ def predict_proba(
766
+ self,
767
+ peptides: Union[str, Sequence[str]],
768
+ aggregate: str = 'mean',
769
+ ) -> np.ndarray:
770
+ """Predict category probabilities using sigmoid activation.
771
+
772
+ Parameters
773
+ ----------
774
+ peptides : str or sequence of str
775
+ Peptide(s) to predict.
776
+ aggregate : str, default='mean'
777
+ How to aggregate k-mer probabilities for long peptides: 'mean', 'max', 'min'.
778
+
779
+ Returns
780
+ -------
781
+ probs : np.ndarray
782
+ Probabilities for each category, shape (n_peptides, n_categories).
783
+ Values are in [0, 1] via sigmoid transformation.
784
+ """
785
+ if not self._is_trained:
786
+ raise RuntimeError("Model must be trained before prediction.")
787
+
788
+ peptides = self._ensure_list(peptides)
789
+ results: List[np.ndarray] = []
790
+
791
+ for peptide in peptides:
792
+ kmers = self._extract_kmers(peptide)
793
+ raw = self._predict_raw_kmers(kmers)
794
+ probs = 1 / (1 + np.exp(-raw))
795
+ if probs.ndim == 1:
796
+ probs = probs.reshape(-1, 1)
797
+
798
+ if aggregate == 'mean':
799
+ agg_probs = probs.mean(axis=0)
800
+ elif aggregate == 'max':
801
+ agg_probs = probs.max(axis=0)
802
+ elif aggregate == 'min':
803
+ agg_probs = probs.min(axis=0)
804
+ else:
805
+ raise ValueError(f"Unknown aggregate method: {aggregate}")
806
+
807
+ results.append(agg_probs)
808
+
809
+ return np.vstack(results)
810
+
811
    def foreignness(
        self,
        peptides: Union[str, Sequence[str]],
        pathogen_categories: Optional[List[str]] = None,
        self_categories: Optional[List[str]] = None,
        aggregate: str = 'mean',
    ) -> np.ndarray:
        """Compute foreignness score from category probabilities.

        Foreignness = max(pathogens) / (max(pathogens) + max(self))

        Parameters
        ----------
        peptides : str or sequence of str
            Peptide(s) to score.
        pathogen_categories : list of str, optional
            Categories considered "foreign" (default: ['bacteria', 'viruses']).
        self_categories : list of str, optional
            Categories considered "self" (default: ['human', 'rodents', 'mammals']).
        aggregate : str, default='mean'
            How to aggregate k-mer probabilities for long peptides.

        Returns
        -------
        foreignness : np.ndarray
            Foreignness scores in [0, 1]. Higher = more foreign.

        Raises
        ------
        RuntimeError
            If the model was trained without target_categories.
        ValueError
            If none of the requested categories exist in the model.
        """
        if self._target_categories is None:
            raise RuntimeError(
                "Model must be trained with target_categories to use foreignness(). "
                "Use score() for single-output models."
            )

        if pathogen_categories is None:
            pathogen_categories = ['bacteria', 'viruses']
        if self_categories is None:
            self_categories = ['human', 'rodents', 'mammals']

        # Get category indices; requested categories absent from the model
        # are silently skipped (only an entirely-empty set is an error).
        pathogen_idx = [
            self._target_categories.index(cat)
            for cat in pathogen_categories
            if cat in self._target_categories
        ]
        self_idx = [
            self._target_categories.index(cat)
            for cat in self_categories
            if cat in self._target_categories
        ]

        if not pathogen_idx:
            raise ValueError(
                f"No pathogen categories found. Available: {self._target_categories}"
            )
        if not self_idx:
            raise ValueError(
                f"No self categories found. Available: {self._target_categories}"
            )

        # Get probabilities
        probs = self.predict_proba(peptides, aggregate=aggregate)

        # Compute foreignness: max(pathogens) / (max(pathogens) + max(self))
        max_pathogen = probs[:, pathogen_idx].max(axis=1)
        max_self = probs[:, self_idx].max(axis=1)

        # Avoid division by zero
        denominator = max_pathogen + max_self
        foreignness = np.where(
            denominator > 0,
            max_pathogen / denominator,
            0.5  # Neutral when both are zero
        )

        return foreignness
886
+
887
    @property
    def target_categories(self) -> Optional[List[str]]:
        """Get target category names (if trained with multi-label).

        Returns None when the model was trained without target_categories.
        """
        return self._target_categories
891
+
892
    def predict_dataframe(
        self,
        peptides: Sequence[str],
        pathogen_categories: Optional[List[str]] = None,
        self_categories: Optional[List[str]] = None,
        aggregate: str = 'mean',
    ) -> 'pd.DataFrame':
        """Predict category probabilities and foreignness for variable-length peptides.

        For peptides longer than k, breaks into overlapping k-mers and aggregates.

        Parameters
        ----------
        peptides : sequence of str
            Peptide sequences (can be variable length).
        pathogen_categories : list of str, optional
            Categories considered "foreign" (default: ['bacteria', 'viruses']).
        self_categories : list of str, optional
            Categories considered "self" (default: ['human', 'rodents', 'mammals']).
        aggregate : str, default='mean'
            How to aggregate k-mer scores for long peptides: 'mean', 'max', 'min'.

        Returns
        -------
        df : pd.DataFrame
            DataFrame with columns:
            - 'peptide': input peptide sequence
            - One column per target category (probabilities)
            - 'foreignness': foreignness score

        Raises
        ------
        RuntimeError
            If the model was trained without target_categories.
        """
        # Imported lazily so pandas is only required for DataFrame output.
        import pandas as pd

        if self._target_categories is None:
            raise RuntimeError(
                "Model must be trained with target_categories to use predict_dataframe()."
            )

        if pathogen_categories is None:
            pathogen_categories = ['bacteria', 'viruses']
        if self_categories is None:
            self_categories = ['human', 'rodents', 'mammals']

        # Get category indices for foreignness calculation. Unlike
        # foreignness(), an empty index set here is not an error — the
        # corresponding max defaults to 0.0 below.
        pathogen_idx = [
            self._target_categories.index(cat)
            for cat in pathogen_categories
            if cat in self._target_categories
        ]
        self_idx = [
            self._target_categories.index(cat)
            for cat in self_categories
            if cat in self._target_categories
        ]

        peptides = list(peptides)
        probs = self.predict_proba(peptides, aggregate=aggregate)
        results = []
        for peptide, row_probs in zip(peptides, probs):
            max_pathogen = row_probs[pathogen_idx].max() if pathogen_idx else 0.0
            max_self = row_probs[self_idx].max() if self_idx else 0.0
            denom = max_pathogen + max_self
            # 0.5 is the neutral score when both sides have zero probability.
            foreignness = max_pathogen / denom if denom > 0 else 0.5

            row = {'peptide': peptide}
            for cat, p in zip(self._target_categories, row_probs):
                row[cat] = float(p)
            row['foreignness'] = float(foreignness)
            results.append(row)

        # Create DataFrame with consistent column order
        columns = ['peptide'] + self._target_categories + ['foreignness']
        return pd.DataFrame(results, columns=columns)
964
+
965
+ def features_dataframe(
966
+ self,
967
+ peptides: Sequence[str],
968
+ aggregate: str = 'mean',
969
+ include_peptide: bool = True,
970
+ ) -> 'pd.DataFrame':
971
+ """Extract features for peptides as a DataFrame.
972
+
973
+ For peptides longer than k, breaks into overlapping k-mers and aggregates.
974
+
975
+ Parameters
976
+ ----------
977
+ peptides : sequence of str
978
+ Peptide sequences (can be variable length).
979
+ aggregate : str, default='mean'
980
+ How to aggregate k-mer features for long peptides: 'mean', 'max', 'min'.
981
+ include_peptide : bool, default=True
982
+ Include peptide sequence as first column.
983
+
984
+ Returns
985
+ -------
986
+ df : pd.DataFrame
987
+ DataFrame with 592 feature columns (+ peptide column if include_peptide=True).
988
+ Features: 48 AA properties, 27 structural, 20 AA composition,
989
+ 12 sequence stats, 80 reduced alphabet frequencies, 5 dipeptide
990
+ summaries, 400 dipeptides (if enabled).
991
+ """
992
+ import pandas as pd
993
+
994
+ feature_names = self.get_feature_names()
995
+ results = []
996
+
997
+ for peptide in peptides:
998
+ if len(peptide) < self.k:
999
+ # Pad short peptides
1000
+ kmers = [peptide + 'X' * (self.k - len(peptide))]
1001
+ elif len(peptide) == self.k:
1002
+ kmers = [peptide]
1003
+ else:
1004
+ # Extract overlapping k-mers
1005
+ kmers = [peptide[i:i+self.k] for i in range(len(peptide) - self.k + 1)]
1006
+
1007
+ # Extract features for all k-mers
1008
+ kmer_features = np.array([
1009
+ extract_features(kmer, self.k, self._params['use_dipeptides'])
1010
+ for kmer in kmers
1011
+ ])
1012
+
1013
+ # Aggregate across k-mers
1014
+ if aggregate == 'mean':
1015
+ features = kmer_features.mean(axis=0)
1016
+ elif aggregate == 'max':
1017
+ features = kmer_features.max(axis=0)
1018
+ elif aggregate == 'min':
1019
+ features = kmer_features.min(axis=0)
1020
+ else:
1021
+ raise ValueError(f"Unknown aggregate method: {aggregate}")
1022
+
1023
+ if include_peptide:
1024
+ row = {'peptide': peptide}
1025
+ row.update(dict(zip(feature_names, features)))
1026
+ else:
1027
+ row = dict(zip(feature_names, features))
1028
+ results.append(row)
1029
+
1030
+ # Create DataFrame with consistent column order
1031
+ if include_peptide:
1032
+ columns = ['peptide'] + feature_names
1033
+ else:
1034
+ columns = feature_names
1035
+ return pd.DataFrame(results, columns=columns)
1036
+
1037
+ def _save_model(self, path: Path) -> None:
1038
+ """Save model weights."""
1039
+ model_data = {
1040
+ 'model': self._model,
1041
+ 'scaler': self._scaler,
1042
+ 'target_categories': self._target_categories,
1043
+ }
1044
+ with open(path, 'wb') as f:
1045
+ pickle.dump(model_data, f)
1046
+
1047
+ def _load_model(self, path: Path) -> None:
1048
+ """Load model weights."""
1049
+ with open(path, 'rb') as f:
1050
+ model_data = pickle.load(f)
1051
+ self._model = model_data['model']
1052
+ self._scaler = model_data['scaler']
1053
+ self._target_categories = model_data.get('target_categories')
1054
+
1055
+ def get_feature_names(self) -> List[str]:
1056
+ """Get names of all features used by the model.
1057
+
1058
+ Returns
1059
+ -------
1060
+ names : list of str
1061
+ Feature names.
1062
+ """
1063
+ properties = list(_get_aa_properties().keys())
1064
+ names = []
1065
+
1066
+ # Property statistics (12 props × 4 stats = 48 features)
1067
+ for prop in properties:
1068
+ for stat in ['mean', 'std', 'min', 'max']:
1069
+ names.append(f'{prop}_{stat}')
1070
+
1071
+ # Structural features (27 features)
1072
+ for struct in ['helix', 'sheet', 'turn']:
1073
+ for stat in ['mean', 'std', 'min', 'max']:
1074
+ names.append(f'{struct}_propensity_{stat}')
1075
+ names.extend([
1076
+ 'frac_positive_charged', 'frac_negative_charged', 'frac_hydrophobic',
1077
+ 'frac_aromatic', 'frac_aliphatic', 'frac_polar_uncharged',
1078
+ 'frac_tiny', 'frac_small', 'frac_cysteine',
1079
+ 'net_charge_per_residue', 'charge_transitions', 'max_charge_cluster',
1080
+ 'arginine_ratio', # R/(R+K) - lower in viruses
1081
+ 'frac_disorder_promoting', 'frac_order_promoting',
1082
+ ])
1083
+
1084
+ # Amino acid composition (20 features)
1085
+ for aa in AMINO_ACIDS:
1086
+ names.append(f'aa_freq_{aa}')
1087
+
1088
+ # Sequence statistics (12 features)
1089
+ names.extend([
1090
+ 'seq_length',
1091
+ 'seq_log_length',
1092
+ 'seq_sqrt_length',
1093
+ 'frac_unknown',
1094
+ 'unique_frac',
1095
+ 'max_run_frac',
1096
+ 'repeat_frac',
1097
+ 'entropy_aa',
1098
+ 'effective_aa',
1099
+ 'max_aa_freq',
1100
+ 'top2_aa_freq',
1101
+ 'gini_aa',
1102
+ ])
1103
+
1104
+ # Reduced alphabet compositions (80 features)
1105
+ for name in REDUCED_ALPHABET_ORDER:
1106
+ groups = REDUCED_ALPHABET_GROUPS[name]['groups']
1107
+ for rep in groups:
1108
+ names.append(f'{name}_freq_{rep}')
1109
+
1110
+ # Dipeptide summary (5 features)
1111
+ if self._params.get('use_dipeptides', True):
1112
+ names.extend([
1113
+ 'dipep_entropy',
1114
+ 'dipep_gini',
1115
+ 'dipep_max_freq',
1116
+ 'dipep_top2_freq',
1117
+ 'dipep_homodimer_frac',
1118
+ ])
1119
+
1120
+ # Dipeptide composition (400 features)
1121
+ if self._params.get('use_dipeptides', True):
1122
+ for aa1 in AMINO_ACIDS:
1123
+ for aa2 in AMINO_ACIDS:
1124
+ names.append(f'dipep_{aa1}{aa2}')
1125
+
1126
+ return names