weirdo 2.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- weirdo/__init__.py +104 -0
- weirdo/amino_acid.py +33 -0
- weirdo/amino_acid_alphabet.py +158 -0
- weirdo/amino_acid_properties.py +358 -0
- weirdo/api.py +372 -0
- weirdo/blosum.py +74 -0
- weirdo/chou_fasman.py +73 -0
- weirdo/cli.py +597 -0
- weirdo/common.py +22 -0
- weirdo/data_manager.py +475 -0
- weirdo/distances.py +16 -0
- weirdo/matrices/BLOSUM30 +25 -0
- weirdo/matrices/BLOSUM50 +21 -0
- weirdo/matrices/BLOSUM62 +27 -0
- weirdo/matrices/__init__.py +0 -0
- weirdo/matrices/amino_acid_properties.txt +829 -0
- weirdo/matrices/helix_vs_coil.txt +28 -0
- weirdo/matrices/helix_vs_strand.txt +27 -0
- weirdo/matrices/pmbec.mat +21 -0
- weirdo/matrices/strand_vs_coil.txt +27 -0
- weirdo/model_manager.py +346 -0
- weirdo/peptide_vectorizer.py +78 -0
- weirdo/pmbec.py +85 -0
- weirdo/reduced_alphabet.py +61 -0
- weirdo/residue_contact_energies.py +74 -0
- weirdo/scorers/__init__.py +95 -0
- weirdo/scorers/base.py +223 -0
- weirdo/scorers/config.py +299 -0
- weirdo/scorers/mlp.py +1126 -0
- weirdo/scorers/reference.py +265 -0
- weirdo/scorers/registry.py +282 -0
- weirdo/scorers/similarity.py +386 -0
- weirdo/scorers/swissprot.py +510 -0
- weirdo/scorers/trainable.py +219 -0
- weirdo/static_data.py +17 -0
- weirdo-2.1.0.dist-info/METADATA +294 -0
- weirdo-2.1.0.dist-info/RECORD +41 -0
- weirdo-2.1.0.dist-info/WHEEL +5 -0
- weirdo-2.1.0.dist-info/entry_points.txt +2 -0
- weirdo-2.1.0.dist-info/licenses/LICENSE +201 -0
- weirdo-2.1.0.dist-info/top_level.txt +1 -0
weirdo/scorers/mlp.py
ADDED
@@ -0,0 +1,1126 @@
"""MLP-based origin scorer.

Neural network model for learning category probabilities from labeled data.
Uses rich peptide features including amino acid properties and composition
statistics.
"""

import pickle
from pathlib import Path
from typing import Dict, List, Optional, Sequence, Tuple, Union

import numpy as np
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler

from .trainable import TrainableScorer
from .registry import register_scorer
from ..reduced_alphabet import alphabets as REDUCED_ALPHABETS


# Amino acid to index mapping
AA_TO_IDX = {
    'A': 0, 'C': 1, 'D': 2, 'E': 3, 'F': 4,
    'G': 5, 'H': 6, 'I': 7, 'K': 8, 'L': 9,
    'M': 10, 'N': 11, 'P': 12, 'Q': 13, 'R': 14,
    'S': 15, 'T': 16, 'V': 17, 'W': 18, 'Y': 19,
    'X': 20,  # Unknown/padding
}
AMINO_ACIDS = 'ACDEFGHIKLMNPQRSTVWY'
NUM_AMINO_ACIDS = 21

# Amino acid categories for derived features
POSITIVE_CHARGED = set('KRH')  # Basic residues (H partially charged at pH 7)
NEGATIVE_CHARGED = set('DE')  # Acidic residues
HYDROPHOBIC = set('AILMFVPWG')
AROMATIC = set('FWY')
ALIPHATIC = set('AVILM')
POLAR_UNCHARGED = set('STNQ')
TINY = set('AGS')
SMALL = set('AGSCTDNPV')
DISORDER_PROMOTING = set('AEGRQSKP')  # Disorder-promoting residues
ORDER_PROMOTING = set('WFYILMVC')  # Order-promoting residues

# Chou-Fasman secondary structure propensities
HELIX_PROPENSITY = {
    'A': 1.42, 'C': 0.70, 'D': 1.01, 'E': 1.51, 'F': 1.13,
    'G': 0.57, 'H': 1.00, 'I': 1.08, 'K': 1.16, 'L': 1.21,
    'M': 1.45, 'N': 0.67, 'P': 0.57, 'Q': 1.11, 'R': 0.98,
    'S': 0.77, 'T': 0.83, 'V': 1.06, 'W': 1.08, 'Y': 0.69,
}
SHEET_PROPENSITY = {
    'A': 0.83, 'C': 1.19, 'D': 0.54, 'E': 0.37, 'F': 1.38,
    'G': 0.75, 'H': 0.87, 'I': 1.60, 'K': 0.74, 'L': 1.30,
    'M': 1.05, 'N': 0.89, 'P': 0.55, 'Q': 1.10, 'R': 0.93,
    'S': 0.75, 'T': 1.19, 'V': 1.70, 'W': 1.37, 'Y': 1.47,
}
TURN_PROPENSITY = {
    'A': 0.66, 'C': 1.19, 'D': 1.46, 'E': 0.74, 'F': 0.60,
    'G': 1.56, 'H': 0.95, 'I': 0.47, 'K': 1.01, 'L': 0.59,
    'M': 0.60, 'N': 1.56, 'P': 1.52, 'Q': 0.98, 'R': 0.95,
    'S': 1.43, 'T': 0.96, 'V': 0.50, 'W': 0.96, 'Y': 1.14,
}
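
# Illustrative aside (not part of the published file): a Chou-Fasman propensity
# above 1.0 means the residue favors that structure, so a windowed mean can be
# read straight off the tables above, e.g. for the helix-forming 4-mer 'MAEL':
#
#     >>> np.mean([HELIX_PROPENSITY[aa] for aa in 'MAEL'])
#     1.3975
#
# (1.45 + 1.42 + 1.51 + 1.21) / 4 = 1.3975, i.e. strongly helix-favoring.
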
def _get_aa_properties() -> Dict[str, Dict[str, float]]:
    """Load all amino acid property dictionaries."""
    from ..amino_acid_properties import (
        accessible_surface_area,
        accessible_surface_area_folded,
        hydropathy,
        hydrophilicity,
        local_flexibility,
        mass,
        pK_side_chain,
        polarity,
        prct_exposed_residues,
        refractivity,
        solvent_exposed_area,
        volume,
    )
    return {
        'accessible_surface_area': accessible_surface_area,
        'accessible_surface_area_folded': accessible_surface_area_folded,
        'hydropathy': hydropathy,
        'hydrophilicity': hydrophilicity,
        'local_flexibility': local_flexibility,
        'mass': mass,
        'pK_side_chain': pK_side_chain,
        'polarity': polarity,
        'prct_exposed_residues': prct_exposed_residues,
        'refractivity': refractivity,
        'solvent_exposed_area': solvent_exposed_area,
        'volume': volume,
    }


def _compute_property_features(peptide: str, properties: Dict[str, Dict[str, float]]) -> np.ndarray:
    """Compute aggregate statistics from amino acid properties.

    For each property, compute: mean, std, min, max over the peptide.
    Returns array of shape (n_properties * 4,).
    """
    features = []

    for prop_name, prop_dict in properties.items():
        # Get property values for each residue
        values = [prop_dict[aa] for aa in peptide if aa in prop_dict]

        if values:
            features.extend([
                np.mean(values),
                np.std(values),
                np.min(values),
                np.max(values),
            ])
        else:
            # Unknown amino acids - use zeros
            features.extend([0.0, 0.0, 0.0, 0.0])

    return np.array(features, dtype=np.float32)
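
# Illustrative aside (not part of the published file): with the 12 property
# tables returned by _get_aa_properties(), this yields a 12 * 4 = 48-wide
# vector laid out as [mean, std, min, max] per property, in dict order:
#
#     >>> feats = _compute_property_features('MTMDKSEL', _get_aa_properties())
#     >>> feats.shape
#     (48,)
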
def _compute_composition_features(peptide: str) -> np.ndarray:
    """Compute amino acid composition (frequencies).

    Returns array of shape (20,) with frequency of each amino acid.
    """
    counts = np.zeros(20, dtype=np.float32)
    for aa in peptide:
        idx = AMINO_ACIDS.find(aa)
        if idx >= 0:
            counts[idx] += 1

    # Normalize to frequencies
    if len(peptide) > 0:
        counts /= len(peptide)

    return counts


def _compute_dipeptide_features(peptide: str) -> np.ndarray:
    """Compute dipeptide composition (frequencies of AA pairs).

    Returns array of shape (400,) with frequency of each dipeptide.
    """
    counts = np.zeros(400, dtype=np.float32)  # 20 * 20

    for i in range(len(peptide) - 1):
        aa1_idx = AMINO_ACIDS.find(peptide[i])
        aa2_idx = AMINO_ACIDS.find(peptide[i + 1])
        if aa1_idx >= 0 and aa2_idx >= 0:
            counts[aa1_idx * 20 + aa2_idx] += 1

    # Normalize to frequencies
    n_dipeptides = max(1, len(peptide) - 1)
    counts /= n_dipeptides

    return counts
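
# Illustrative aside (not part of the published file): dipeptide (i, j) maps to
# flat index i * 20 + j, so 'AC' -> 0 * 20 + 1 = 1 and 'CA' -> 1 * 20 + 0 = 20.
# The 3-mer 'ACA' has two dipeptides ('AC', 'CA'), each with frequency 1/2:
#
#     >>> freqs = _compute_dipeptide_features('ACA')
#     >>> freqs[1], freqs[20]
#     (0.5, 0.5)
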
def _compute_structural_features(peptide: str) -> np.ndarray:
    """Compute structural and physicochemical category features.

    Returns array with:
    - Secondary structure propensities (helix, sheet, turn) - 12 features (3 props × 4 stats)
    - Category fractions (9 features: charged+/-, hydrophobic, aromatic, etc.)
    - Charge features (4 features: net charge, charge transitions, charge clustering, R/(R+K) ratio)
    - Disorder features (2 features: disorder/order-promoting fractions)

    Total: 27 features
    """
    n = len(peptide) if peptide else 1
    features = []

    # Secondary structure propensities - mean, std, min, max for each
    for prop_dict in [HELIX_PROPENSITY, SHEET_PROPENSITY, TURN_PROPENSITY]:
        values = [prop_dict.get(aa, 1.0) for aa in peptide if aa in prop_dict]
        if values:
            features.extend([np.mean(values), np.std(values), np.min(values), np.max(values)])
        else:
            features.extend([1.0, 0.0, 1.0, 1.0])

    # Category fractions (9 features)
    features.append(sum(1 for aa in peptide if aa in POSITIVE_CHARGED) / n)  # Positive charged
    features.append(sum(1 for aa in peptide if aa in NEGATIVE_CHARGED) / n)  # Negative charged
    features.append(sum(1 for aa in peptide if aa in HYDROPHOBIC) / n)  # Hydrophobic
    features.append(sum(1 for aa in peptide if aa in AROMATIC) / n)  # Aromatic
    features.append(sum(1 for aa in peptide if aa in ALIPHATIC) / n)  # Aliphatic
    features.append(sum(1 for aa in peptide if aa in POLAR_UNCHARGED) / n)  # Polar uncharged
    features.append(sum(1 for aa in peptide if aa in TINY) / n)  # Tiny
    features.append(sum(1 for aa in peptide if aa in SMALL) / n)  # Small
    features.append(sum(1 for aa in peptide if aa == 'C') / n)  # Cysteine (viral)

    # Charge features (4 features)
    pos_count = sum(1 for aa in peptide if aa in POSITIVE_CHARGED)
    neg_count = sum(1 for aa in peptide if aa in NEGATIVE_CHARGED)
    net_charge = pos_count - neg_count
    features.append(net_charge / n)  # Net charge per residue

    # Charge transitions (+ to - or - to +)
    transitions = 0
    for i in range(len(peptide) - 1):
        curr_pos = peptide[i] in POSITIVE_CHARGED
        curr_neg = peptide[i] in NEGATIVE_CHARGED
        next_pos = peptide[i + 1] in POSITIVE_CHARGED
        next_neg = peptide[i + 1] in NEGATIVE_CHARGED
        if (curr_pos and next_neg) or (curr_neg and next_pos):
            transitions += 1
    features.append(transitions / max(1, n - 1))  # Charge transitions

    # Charge clustering - max consecutive same-sign charges
    max_cluster = 0
    current_cluster = 0
    current_sign = None
    for aa in peptide:
        if aa in POSITIVE_CHARGED:
            sign = '+'
        elif aa in NEGATIVE_CHARGED:
            sign = '-'
        else:
            sign = None
        if sign and sign == current_sign:
            current_cluster += 1
        elif sign:
            current_cluster = 1
            current_sign = sign
        else:
            current_cluster = 0
            current_sign = None
        max_cluster = max(max_cluster, current_cluster)
    features.append(max_cluster / n)  # Max charge cluster size

    # Arginine depletion (viruses often have less R) - R/(R+K) ratio
    r_count = sum(1 for aa in peptide if aa == 'R')
    k_count = sum(1 for aa in peptide if aa == 'K')
    if r_count + k_count > 0:
        features.append(r_count / (r_count + k_count))
    else:
        features.append(0.5)  # Neutral when no R or K present

    # Disorder features (2 features)
    disorder_promoting = sum(1 for aa in peptide if aa in DISORDER_PROMOTING)
    order_promoting = sum(1 for aa in peptide if aa in ORDER_PROMOTING)
    features.append(disorder_promoting / n)  # Disorder-promoting fraction
    features.append(order_promoting / n)  # Order-promoting fraction

    return np.array(features, dtype=np.float32)
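
# Illustrative aside (not part of the published file): for the peptide 'KDKE'
# the charge block works out to net charge (2 - 2) / 4 = 0.0, a sign flip at
# all three adjacent pairs (transitions = 3 / 3 = 1.0), and a maximum
# same-sign run of one residue (cluster fraction 1 / 4 = 0.25).
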
def _build_reduced_alphabet_index():
    """Build reduced alphabet group indices in a stable order."""
    alphabet_order = list(REDUCED_ALPHABETS.keys())
    alphabet_groups = {}
    for name in alphabet_order:
        mapping = REDUCED_ALPHABETS[name]
        groups: List[str] = []
        for aa in AMINO_ACIDS:
            rep = mapping.get(aa)
            if rep is None:
                continue
            if rep not in groups:
                groups.append(rep)
        alphabet_groups[name] = {
            'groups': groups,
            'rep_to_idx': {rep: idx for idx, rep in enumerate(groups)},
        }
    return alphabet_order, alphabet_groups


REDUCED_ALPHABET_ORDER, REDUCED_ALPHABET_GROUPS = _build_reduced_alphabet_index()


def _compute_sequence_stats(peptide: str) -> np.ndarray:
    """Compute sequence-level non-positional statistics."""
    if not peptide:
        return np.zeros(12, dtype=np.float32)

    n = len(peptide)
    log_len = np.log1p(n)
    sqrt_len = np.sqrt(n)

    counts = np.zeros(20, dtype=np.float32)
    unknown = 0
    for aa in peptide:
        idx = AMINO_ACIDS.find(aa)
        if idx >= 0:
            counts[idx] += 1
        else:
            unknown += 1

    total = counts.sum()
    if total > 0:
        freqs = counts / total
        nonzero = freqs[freqs > 0]
        entropy = -np.sum(nonzero * np.log(nonzero))
        entropy_norm = entropy / np.log(20)
        effective = np.exp(entropy) / 20.0
        gini = 1.0 - np.sum(freqs ** 2)
        max_freq = float(freqs.max())
        top2 = float(np.sort(freqs)[-2:].sum())
        unique_frac = float(np.count_nonzero(counts) / 20.0)
    else:
        entropy_norm = 0.0
        effective = 0.0
        gini = 0.0
        max_freq = 0.0
        top2 = 0.0
        unique_frac = 0.0

    # Run-length and repeat statistics
    max_run = 1
    repeats = 0
    current_run = 1
    for i in range(1, n):
        if peptide[i] == peptide[i - 1]:
            repeats += 1
            current_run += 1
        else:
            current_run = 1
        if current_run > max_run:
            max_run = current_run

    max_run_frac = max_run / n
    repeat_frac = repeats / max(1, n - 1)
    frac_unknown = unknown / n

    return np.array([
        n,
        log_len,
        sqrt_len,
        frac_unknown,
        unique_frac,
        max_run_frac,
        repeat_frac,
        entropy_norm,
        effective,
        max_freq,
        top2,
        gini,
    ], dtype=np.float32)
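
# Illustrative aside (not part of the published file): the entropy feature is
# Shannon entropy over the 20 observed AA frequencies, normalized by log(20).
# A homopolymer like 'AAAA' scores 0, while a peptide using four residues
# equally, e.g. 'ACDE', scores log(4) / log(20) ~= 0.463.
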
def _compute_reduced_alphabet_features(peptide: str) -> np.ndarray:
    """Compute reduced alphabet composition features."""
    if not peptide:
        total_features = sum(
            len(REDUCED_ALPHABET_GROUPS[name]['groups'])
            for name in REDUCED_ALPHABET_ORDER
        )
        return np.zeros(total_features, dtype=np.float32)

    features: List[np.ndarray] = []
    for name in REDUCED_ALPHABET_ORDER:
        mapping = REDUCED_ALPHABETS[name]
        groups = REDUCED_ALPHABET_GROUPS[name]['groups']
        rep_to_idx = REDUCED_ALPHABET_GROUPS[name]['rep_to_idx']
        counts = np.zeros(len(groups), dtype=np.float32)
        total = 0
        for aa in peptide:
            rep = mapping.get(aa)
            if rep is None:
                continue
            counts[rep_to_idx[rep]] += 1
            total += 1
        if total > 0:
            counts /= total
        features.append(counts)

    return np.concatenate(features) if features else np.array([], dtype=np.float32)


def _compute_dipeptide_summary(dipeptide_freqs: np.ndarray) -> np.ndarray:
    """Compute summary statistics from dipeptide frequencies."""
    if dipeptide_freqs.size == 0:
        return np.zeros(5, dtype=np.float32)

    total = dipeptide_freqs.sum()
    if total > 0:
        probs = dipeptide_freqs / total
        nonzero = probs[probs > 0]
        entropy = -np.sum(nonzero * np.log(nonzero))
        entropy_norm = entropy / np.log(probs.size)
        gini = 1.0 - np.sum(probs ** 2)
        max_freq = float(probs.max())
        top2 = float(np.sort(probs)[-2:].sum()) if probs.size >= 2 else max_freq
        homodipep = float(np.trace(probs.reshape(20, 20)))
    else:
        entropy_norm = 0.0
        gini = 0.0
        max_freq = 0.0
        top2 = 0.0
        homodipep = 0.0

    return np.array(
        [entropy_norm, gini, max_freq, top2, homodipep],
        dtype=np.float32,
    )


def extract_features(peptide: str, k: int = 8, use_dipeptides: bool = True) -> np.ndarray:
    """Extract all features from a peptide.

    Features include (in concatenation order):
    - Amino acid property statistics (12 props × 4 stats = 48 features)
    - Structural/physicochemical features (27 features)
    - Amino acid composition (20 features)
    - Sequence-level statistics (12 features)
    - Reduced alphabet compositions (80 features)
    - Dipeptide summary statistics (5 features, optional)
    - Dipeptide composition (400 features, optional)

    Parameters
    ----------
    peptide : str
        Peptide sequence.
    k : int
        Unused; retained for backward compatibility.
    use_dipeptides : bool
        Include dipeptide composition features.

    Returns
    -------
    features : np.ndarray
        Feature vector.
    """
    properties = _get_aa_properties()

    feature_parts = [
        _compute_property_features(peptide, properties),  # 48 features (12 props × 4 stats)
        _compute_structural_features(peptide),  # 27 features
        _compute_composition_features(peptide),  # 20 features
        _compute_sequence_stats(peptide),  # 12 features
        _compute_reduced_alphabet_features(peptide),  # 80 features
    ]

    if use_dipeptides:
        dipep_freqs = _compute_dipeptide_features(peptide)
        feature_parts.append(_compute_dipeptide_summary(dipep_freqs))  # 5 features
        feature_parts.append(dipep_freqs)  # 400 features

    return np.concatenate(feature_parts)
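
# Illustrative aside (not part of the published file): with dipeptides enabled
# the full vector is 48 + 27 + 20 + 12 + 80 + 5 + 400 = 592 features, matching
# the 592 columns documented in MLPScorer.features_dataframe() below:
#
#     >>> extract_features('MTMDKSEL').shape
#     (592,)
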
@register_scorer('mlp', description='MLP foreignness scorer with rich peptide features')
class MLPScorer(TrainableScorer):
    """MLP-based origin scorer using rich peptide features.

    Combines multiple feature types:
    - Amino acid properties (hydropathy, mass, polarity, etc.)
    - Amino acid composition (single AA frequencies)
    - Dipeptide composition (AA pair frequencies)
    - Sequence-level statistics (entropy, repeats, complexity)
    - Reduced alphabet compositions (Murphy/GBMR/SDM, etc.)

    All features are normalized using StandardScaler before training.

    Parameters
    ----------
    k : int, default=8
        K-mer size used to window long peptides for aggregation.
    hidden_layer_sizes : tuple of int, default=(256, 128, 64)
        Sizes of hidden layers.
    activation : str, default='relu'
        Activation function: 'relu', 'tanh', 'logistic'.
    alpha : float, default=0.0001
        L2 regularization strength.
    max_iter : int, default=200
        Maximum training iterations.
    early_stopping : bool, default=True
        Use early stopping with validation split.
    use_dipeptides : bool, default=True
        Include dipeptide composition features.
    batch_size : int, default=256
        Batch size for training.

    Example
    -------
    >>> from weirdo.scorers import MLPScorer
    >>>
    >>> scorer = MLPScorer(hidden_layer_sizes=(256, 128))
    >>> scorer.train(peptides, labels, target_categories=['human', 'viruses'])
    >>> scores = scorer.score(['MTMDKSEL', 'XXXXXXXX'])
    """

    def __init__(
        self,
        k: int = 8,
        hidden_layer_sizes: Tuple[int, ...] = (256, 128, 64),
        activation: str = 'relu',
        alpha: float = 0.0001,
        learning_rate_init: float = 0.001,
        max_iter: int = 200,
        early_stopping: bool = True,
        use_dipeptides: bool = True,
        batch_size: int = 256,
        random_state: Optional[int] = None,
        **kwargs
    ):
        super().__init__(k=k, batch_size=batch_size, **kwargs)
        self._params.update({
            'hidden_layer_sizes': hidden_layer_sizes,
            'activation': activation,
            'alpha': alpha,
            'learning_rate_init': learning_rate_init,
            'max_iter': max_iter,
            'early_stopping': early_stopping,
            'use_dipeptides': use_dipeptides,
            'random_state': random_state,
        })
        self._model: Optional[MLPRegressor] = None
        self._scaler: Optional[StandardScaler] = None
        self._target_categories: Optional[List[str]] = None

    def _extract_features(self, peptides: Sequence[str]) -> np.ndarray:
        """Extract features from a list of peptides."""
        return np.array([
            extract_features(p, self.k, self._params['use_dipeptides'])
            for p in peptides
        ])

    def _predict_raw_kmers(self, kmers: Sequence[str]) -> np.ndarray:
        """Predict raw model outputs for k-mers (no aggregation)."""
        if not self._is_trained:
            raise RuntimeError("Model must be trained before scoring.")
        if self._model is None or self._scaler is None:
            raise RuntimeError("Model is not initialized. Train or load a model first.")

        X = self._extract_features(kmers)
        X_scaled = self._scaler.transform(X)
        return self._model.predict(X_scaled)

    def train(
        self,
        peptides: Sequence[str],
        labels: Sequence[float],
        val_peptides: Optional[Sequence[str]] = None,
        val_labels: Optional[Sequence[float]] = None,
        epochs: Optional[int] = None,
        learning_rate: Optional[float] = None,
        verbose: bool = True,
        target_categories: Optional[List[str]] = None,
        plot_loss: Union[bool, str, Path, None] = None,
        **kwargs
    ) -> 'MLPScorer':
        """Train the MLP on labeled peptide data.

        Parameters
        ----------
        peptides : sequence of str
            Training peptide sequences.
        labels : sequence of float or 2D array
            Target labels in [0, 1]. Can be 1D (single foreignness score)
            or 2D (multi-label with one column per category).
        val_peptides : sequence of str, optional
            Not used (sklearn handles validation internally).
        val_labels : sequence of float, optional
            Not used.
        epochs : int, optional
            Maximum training iterations (maps to max_iter). Defaults to max_iter.
        learning_rate : float, optional
            Initial learning rate. Defaults to learning_rate_init if not provided.
        verbose : bool, default=True
            Print training progress.
        target_categories : list of str, optional
            Names of target categories (for multi-label training).
            E.g., ['human', 'viruses', 'bacteria', 'mammals'].
        plot_loss : bool, str, or Path, optional
            Save loss curve plot. If True, saves to 'loss_curve.png' in the
            current directory. If a path, saves to that location.

        Returns
        -------
        self : MLPScorer
        """
        self._target_categories = target_categories
        if epochs is None:
            epochs = self._params['max_iter']
        if learning_rate is None:
            learning_rate = self._params['learning_rate_init']
        # Extract features
        X = self._extract_features(peptides)
        y = np.array(labels)
        if target_categories is not None:
            if y.ndim == 1 and len(target_categories) != 1:
                raise ValueError("target_categories length must match label dimensions.")
            if y.ndim == 2 and y.shape[1] != len(target_categories):
                raise ValueError("target_categories length must match label dimensions.")

        # Scale features to zero mean, unit variance
        self._scaler = StandardScaler()
        X_scaled = self._scaler.fit_transform(X)

        # Disable early stopping if dataset is too small
        use_early_stopping = self._params['early_stopping']
        if use_early_stopping and len(peptides) < 20:
            use_early_stopping = False
            if verbose:
                print("Note: Early stopping disabled (dataset too small)")

        # Create and train model
        self._model = MLPRegressor(
            hidden_layer_sizes=self._params['hidden_layer_sizes'],
            activation=self._params['activation'],
            alpha=self._params['alpha'],
            learning_rate_init=learning_rate,
            max_iter=epochs,
            early_stopping=use_early_stopping,
            validation_fraction=0.1 if use_early_stopping else 0.0,
            n_iter_no_change=10,
            random_state=self._params['random_state'],
            verbose=verbose,
        )

        self._model.fit(X_scaled, y)

        self._is_trained = True
        self._is_fitted = True

        # Save training metadata
        self._metadata['n_train'] = len(peptides)
        self._metadata['n_features'] = X.shape[1]
        self._metadata['n_epochs'] = self._model.n_iter_
        self._metadata['final_train_loss'] = float(self._model.loss_)
        if hasattr(self._model, 'best_loss_') and self._model.best_loss_ is not None:
            self._metadata['best_val_loss'] = float(self._model.best_loss_)

        self._training_history = [
            {'epoch': i + 1, 'loss': loss}
            for i, loss in enumerate(self._model.loss_curve_)
        ]

        if verbose:
            print("\nTraining complete:")
            print(f"  Features: {X.shape[1]}")
            print(f"  Iterations: {self._model.n_iter_}")
            print(f"  Final loss: {self._model.loss_:.4f}")

        # Save loss curve plot if requested
        if plot_loss:
            self._save_loss_plot(plot_loss, verbose=verbose)

        return self
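
    # Illustrative aside (not part of the published file): a minimal
    # multi-label training sketch; the peptide data and four category
    # columns are assumed, not shipped with the package:
    #
    #     >>> scorer = MLPScorer(k=8, random_state=0)
    #     >>> # y has one column per category, values in [0, 1]
    #     >>> scorer.train(train_peptides, y,
    #     ...              target_categories=['human', 'mammals', 'bacteria', 'viruses'],
    #     ...              plot_loss='mlp_loss.png')
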
    def _save_loss_plot(
        self,
        path: Union[bool, str, Path],
        verbose: bool = True,
    ) -> Path:
        """Save loss curve plot to file.

        Parameters
        ----------
        path : bool, str, or Path
            If True, saves to 'loss_curve.png'. If str/Path, saves to that location.
        verbose : bool
            Print save location.

        Returns
        -------
        save_path : Path
            Path where plot was saved.
        """
        import matplotlib.pyplot as plt

        if not self._is_trained or self._model is None:
            raise RuntimeError("Model must be trained before saving loss plot.")

        # Determine save path
        if path is True:
            save_path = Path('loss_curve.png')
        else:
            save_path = Path(path)

        # Get loss curve
        loss_curve = self._model.loss_curve_

        # Create plot
        fig, ax = plt.subplots(figsize=(10, 6))
        epochs = range(1, len(loss_curve) + 1)
        ax.plot(epochs, loss_curve, 'b-', linewidth=2)
        ax.set_xlabel('Epoch', fontsize=12)
        ax.set_ylabel('Loss', fontsize=12)
        ax.set_title(f'MLPScorer Training Loss ({self._metadata.get("n_train", "?")} samples)', fontsize=14)
        ax.grid(True, alpha=0.3)

        # Use log scale if loss spans multiple orders of magnitude
        if len(loss_curve) > 1 and loss_curve[0] / max(loss_curve[-1], 1e-10) > 10:
            ax.set_yscale('log')

        # Add annotations
        ax.annotate(
            f'Start: {loss_curve[0]:.3f}',
            xy=(1, loss_curve[0]),
            xytext=(len(loss_curve) * 0.1, loss_curve[0] * 1.2),
            fontsize=10,
            arrowprops=dict(arrowstyle='->', color='gray', alpha=0.7),
        )
        ax.annotate(
            f'End: {loss_curve[-1]:.4f}',
            xy=(len(loss_curve), loss_curve[-1]),
            xytext=(len(loss_curve) * 0.7, loss_curve[-1] * 2),
            fontsize=10,
            arrowprops=dict(arrowstyle='->', color='gray', alpha=0.7),
        )

        plt.tight_layout()
        plt.savefig(save_path, dpi=150)
        plt.close(fig)

        if verbose:
            print(f"  Loss plot saved to: {save_path}")

        return save_path

    def score(
        self,
        peptides: Union[str, Sequence[str]],
        aggregate: str = 'mean',
        pathogen_categories: Optional[List[str]] = None,
        self_categories: Optional[List[str]] = None,
    ) -> np.ndarray:
        """Score peptides for foreignness.

        For variable-length peptides, scores are computed per k-mer and aggregated.

        Parameters
        ----------
        peptides : str or sequence of str
            Peptide(s) to score.
        aggregate : str, default='mean'
            How to aggregate k-mer probabilities: 'mean', 'max', 'min'.
        pathogen_categories : list of str, optional
            Categories considered "foreign" (default: ['bacteria', 'viruses']).
        self_categories : list of str, optional
            Categories considered "self" (default: ['human', 'rodents', 'mammals']).

        Returns
        -------
        scores : np.ndarray
            Foreignness scores (higher = more foreign).
        """
        if not self._is_trained:
            raise RuntimeError("Model must be trained before scoring.")

        if self._target_categories is None:
            probs = self.predict_proba(peptides, aggregate=aggregate)
            if probs.shape[1] > 1:
                raise RuntimeError(
                    "Model has multiple outputs. Use predict_proba() or train with "
                    "target_categories to compute foreignness."
                )
            return probs.ravel()

        return self.foreignness(
            peptides,
            pathogen_categories=pathogen_categories,
            self_categories=self_categories,
            aggregate=aggregate,
        )

    def _score_batch_impl(self, batch: List[str]) -> np.ndarray:
        """Score a batch of peptides."""
        return self.score(batch)

    def predict_proba(
        self,
        peptides: Union[str, Sequence[str]],
        aggregate: str = 'mean',
    ) -> np.ndarray:
        """Predict category probabilities using sigmoid activation.

        Parameters
        ----------
        peptides : str or sequence of str
            Peptide(s) to predict.
        aggregate : str, default='mean'
            How to aggregate k-mer probabilities for long peptides: 'mean', 'max', 'min'.

        Returns
        -------
        probs : np.ndarray
            Probabilities for each category, shape (n_peptides, n_categories).
            Values are in [0, 1] via sigmoid transformation.
        """
        if not self._is_trained:
            raise RuntimeError("Model must be trained before prediction.")

        peptides = self._ensure_list(peptides)
        results: List[np.ndarray] = []

        for peptide in peptides:
            kmers = self._extract_kmers(peptide)
            raw = self._predict_raw_kmers(kmers)
            probs = 1 / (1 + np.exp(-raw))
            if probs.ndim == 1:
                probs = probs.reshape(-1, 1)

            if aggregate == 'mean':
                agg_probs = probs.mean(axis=0)
            elif aggregate == 'max':
                agg_probs = probs.max(axis=0)
            elif aggregate == 'min':
                agg_probs = probs.min(axis=0)
            else:
                raise ValueError(f"Unknown aggregate method: {aggregate}")

            results.append(agg_probs)

        return np.vstack(results)
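
    # Illustrative aside (not part of the published file): raw regressor
    # outputs are squashed per k-mer with the logistic sigmoid
    # 1 / (1 + exp(-x)). Raw outputs of -2, 0 and 2 for three k-mers give
    # probabilities of roughly 0.119, 0.5 and 0.881, so 'mean' aggregation
    # reports ~0.5 while 'max' reports ~0.881.
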
    def foreignness(
        self,
        peptides: Union[str, Sequence[str]],
        pathogen_categories: Optional[List[str]] = None,
        self_categories: Optional[List[str]] = None,
        aggregate: str = 'mean',
    ) -> np.ndarray:
        """Compute foreignness score from category probabilities.

        Foreignness = max(pathogens) / (max(pathogens) + max(self))

        Parameters
        ----------
        peptides : str or sequence of str
            Peptide(s) to score.
        pathogen_categories : list of str, optional
            Categories considered "foreign" (default: ['bacteria', 'viruses']).
        self_categories : list of str, optional
            Categories considered "self" (default: ['human', 'rodents', 'mammals']).
        aggregate : str, default='mean'
            How to aggregate k-mer probabilities for long peptides.

        Returns
        -------
        foreignness : np.ndarray
            Foreignness scores in [0, 1]. Higher = more foreign.
        """
        if self._target_categories is None:
            raise RuntimeError(
                "Model must be trained with target_categories to use foreignness(). "
                "Use score() for single-output models."
            )

        if pathogen_categories is None:
            pathogen_categories = ['bacteria', 'viruses']
        if self_categories is None:
            self_categories = ['human', 'rodents', 'mammals']

        # Get category indices
        pathogen_idx = [
            self._target_categories.index(cat)
            for cat in pathogen_categories
            if cat in self._target_categories
        ]
        self_idx = [
            self._target_categories.index(cat)
            for cat in self_categories
            if cat in self._target_categories
        ]

        if not pathogen_idx:
            raise ValueError(
                f"No pathogen categories found. Available: {self._target_categories}"
            )
        if not self_idx:
            raise ValueError(
                f"No self categories found. Available: {self._target_categories}"
            )

        # Get probabilities
        probs = self.predict_proba(peptides, aggregate=aggregate)

        # Compute foreignness: max(pathogens) / (max(pathogens) + max(self))
        max_pathogen = probs[:, pathogen_idx].max(axis=1)
        max_self = probs[:, self_idx].max(axis=1)

        # Avoid division by zero
        denominator = max_pathogen + max_self
        foreignness = np.where(
            denominator > 0,
            max_pathogen / denominator,
            0.5,  # Neutral when both are zero
        )

        return foreignness
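
    # Illustrative aside (not part of the published file): with predicted
    # probabilities viruses=0.8, bacteria=0.3, human=0.2, mammals=0.1, the
    # formula above gives
    # max(0.8, 0.3) / (max(0.8, 0.3) + max(0.2, 0.1)) = 0.8 / 1.0 = 0.8.
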
    @property
    def target_categories(self) -> Optional[List[str]]:
        """Get target category names (if trained with multi-label)."""
        return self._target_categories

    def predict_dataframe(
        self,
        peptides: Sequence[str],
        pathogen_categories: Optional[List[str]] = None,
        self_categories: Optional[List[str]] = None,
        aggregate: str = 'mean',
    ) -> 'pd.DataFrame':
        """Predict category probabilities and foreignness for variable-length peptides.

        For peptides longer than k, breaks into overlapping k-mers and aggregates.

        Parameters
        ----------
        peptides : sequence of str
            Peptide sequences (can be variable length).
        pathogen_categories : list of str, optional
            Categories considered "foreign" (default: ['bacteria', 'viruses']).
        self_categories : list of str, optional
            Categories considered "self" (default: ['human', 'rodents', 'mammals']).
        aggregate : str, default='mean'
            How to aggregate k-mer scores for long peptides: 'mean', 'max', 'min'.

        Returns
        -------
        df : pd.DataFrame
            DataFrame with columns:
            - 'peptide': input peptide sequence
            - One column per target category (probabilities)
            - 'foreignness': foreignness score
        """
        import pandas as pd

        if self._target_categories is None:
            raise RuntimeError(
                "Model must be trained with target_categories to use predict_dataframe()."
            )

        if pathogen_categories is None:
            pathogen_categories = ['bacteria', 'viruses']
        if self_categories is None:
            self_categories = ['human', 'rodents', 'mammals']

        # Get category indices for foreignness calculation
        pathogen_idx = [
            self._target_categories.index(cat)
            for cat in pathogen_categories
            if cat in self._target_categories
        ]
        self_idx = [
            self._target_categories.index(cat)
            for cat in self_categories
            if cat in self._target_categories
        ]

        peptides = list(peptides)
        probs = self.predict_proba(peptides, aggregate=aggregate)
        results = []
        for peptide, row_probs in zip(peptides, probs):
            max_pathogen = row_probs[pathogen_idx].max() if pathogen_idx else 0.0
            max_self = row_probs[self_idx].max() if self_idx else 0.0
            denom = max_pathogen + max_self
            foreignness = max_pathogen / denom if denom > 0 else 0.5

            row = {'peptide': peptide}
            for cat, p in zip(self._target_categories, row_probs):
                row[cat] = float(p)
            row['foreignness'] = float(foreignness)
            results.append(row)

        # Create DataFrame with consistent column order
        columns = ['peptide'] + self._target_categories + ['foreignness']
        return pd.DataFrame(results, columns=columns)
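
    # Illustrative aside (not part of the published file): for a model trained
    # with target_categories=['human', 'viruses'],
    # predict_dataframe(['MTMDKSEL']) returns one row per input peptide with
    # columns ['peptide', 'human', 'viruses', 'foreignness'].
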
    def features_dataframe(
        self,
        peptides: Sequence[str],
        aggregate: str = 'mean',
        include_peptide: bool = True,
    ) -> 'pd.DataFrame':
        """Extract features for peptides as a DataFrame.

        For peptides longer than k, breaks into overlapping k-mers and aggregates.

        Parameters
        ----------
        peptides : sequence of str
            Peptide sequences (can be variable length).
        aggregate : str, default='mean'
            How to aggregate k-mer features for long peptides: 'mean', 'max', 'min'.
        include_peptide : bool, default=True
            Include peptide sequence as first column.

        Returns
        -------
        df : pd.DataFrame
            DataFrame with 592 feature columns (+ peptide column if include_peptide=True).
            Features: 48 AA properties, 27 structural, 20 AA composition,
            12 sequence stats, 80 reduced alphabet frequencies, 5 dipeptide
            summaries, 400 dipeptides (if enabled).
        """
        import pandas as pd

        feature_names = self.get_feature_names()
        results = []

        for peptide in peptides:
            if len(peptide) < self.k:
                # Pad short peptides
                kmers = [peptide + 'X' * (self.k - len(peptide))]
            elif len(peptide) == self.k:
                kmers = [peptide]
            else:
                # Extract overlapping k-mers
                kmers = [peptide[i:i+self.k] for i in range(len(peptide) - self.k + 1)]

            # Extract features for all k-mers
            kmer_features = np.array([
                extract_features(kmer, self.k, self._params['use_dipeptides'])
                for kmer in kmers
            ])

            # Aggregate across k-mers
            if aggregate == 'mean':
                features = kmer_features.mean(axis=0)
            elif aggregate == 'max':
                features = kmer_features.max(axis=0)
            elif aggregate == 'min':
                features = kmer_features.min(axis=0)
            else:
                raise ValueError(f"Unknown aggregate method: {aggregate}")

            if include_peptide:
                row = {'peptide': peptide}
                row.update(dict(zip(feature_names, features)))
            else:
                row = dict(zip(feature_names, features))
            results.append(row)

        # Create DataFrame with consistent column order
        if include_peptide:
            columns = ['peptide'] + feature_names
        else:
            columns = feature_names
        return pd.DataFrame(results, columns=columns)

    def _save_model(self, path: Path) -> None:
        """Save model weights."""
        model_data = {
            'model': self._model,
            'scaler': self._scaler,
            'target_categories': self._target_categories,
        }
        with open(path, 'wb') as f:
            pickle.dump(model_data, f)

    def _load_model(self, path: Path) -> None:
        """Load model weights."""
        with open(path, 'rb') as f:
            model_data = pickle.load(f)
        self._model = model_data['model']
        self._scaler = model_data['scaler']
        self._target_categories = model_data.get('target_categories')
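
    # Illustrative aside (not part of the published file): _save_model and
    # _load_model pickle the fitted MLPRegressor, the StandardScaler, and the
    # category names together, so a persisted scorer restores everything
    # needed for scoring. A roundtrip sketch (file path assumed):
    #
    #     >>> scorer._save_model(Path('mlp_weights.pkl'))
    #     >>> restored = MLPScorer(k=8)
    #     >>> restored._load_model(Path('mlp_weights.pkl'))
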
    def get_feature_names(self) -> List[str]:
        """Get names of all features used by the model.

        Returns
        -------
        names : list of str
            Feature names.
        """
        properties = list(_get_aa_properties().keys())
        names = []

        # Property statistics (12 props × 4 stats = 48 features)
        for prop in properties:
            for stat in ['mean', 'std', 'min', 'max']:
                names.append(f'{prop}_{stat}')

        # Structural features (27 features)
        for struct in ['helix', 'sheet', 'turn']:
            for stat in ['mean', 'std', 'min', 'max']:
                names.append(f'{struct}_propensity_{stat}')
        names.extend([
            'frac_positive_charged', 'frac_negative_charged', 'frac_hydrophobic',
            'frac_aromatic', 'frac_aliphatic', 'frac_polar_uncharged',
            'frac_tiny', 'frac_small', 'frac_cysteine',
            'net_charge_per_residue', 'charge_transitions', 'max_charge_cluster',
            'arginine_ratio',  # R/(R+K) - lower in viruses
            'frac_disorder_promoting', 'frac_order_promoting',
        ])

        # Amino acid composition (20 features)
        for aa in AMINO_ACIDS:
            names.append(f'aa_freq_{aa}')

        # Sequence statistics (12 features)
        names.extend([
            'seq_length',
            'seq_log_length',
            'seq_sqrt_length',
            'frac_unknown',
            'unique_frac',
            'max_run_frac',
            'repeat_frac',
            'entropy_aa',
            'effective_aa',
            'max_aa_freq',
            'top2_aa_freq',
            'gini_aa',
        ])

        # Reduced alphabet compositions (80 features)
        for name in REDUCED_ALPHABET_ORDER:
            groups = REDUCED_ALPHABET_GROUPS[name]['groups']
            for rep in groups:
                names.append(f'{name}_freq_{rep}')

        # Dipeptide summary (5 features)
        if self._params.get('use_dipeptides', True):
            names.extend([
                'dipep_entropy',
                'dipep_gini',
                'dipep_max_freq',
                'dipep_top2_freq',
                'dipep_homodimer_frac',
            ])

        # Dipeptide composition (400 features)
        if self._params.get('use_dipeptides', True):
            for aa1 in AMINO_ACIDS:
                for aa2 in AMINO_ACIDS:
                    names.append(f'dipep_{aa1}{aa2}')

        return names