weirdo 2.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
weirdo/api.py ADDED
@@ -0,0 +1,372 @@
1
+ """High-level convenience API for foreignness scoring.
2
+
3
+ Provides simple functions for common use cases without
4
+ needing to understand the full scorer architecture.
5
+
6
+ Example
7
+ -------
8
+ >>> from weirdo import score_peptide, load_model
9
+ >>> scorer = load_model('my-mlp')
10
+ >>> score = score_peptide('MTMDKSEL', model=scorer)
11
+
12
+ >>> from weirdo import score_peptides
13
+ >>> scores = score_peptides(['MTMDKSEL', 'ACDEFGHI'], model=scorer)
14
+ """
15
+
16
+ from typing import Any, Dict, List, Optional, Sequence, Union
17
+
18
+ import numpy as np
19
+
20
+ from .scorers import ScorerConfig, BaseScorer, TrainableScorer
21
+
22
+
23
# Process-wide cache of built scorers. Keys are the strings assembled in
# create_scorer() from the preset name, the sorted overrides and the
# auto_download flag; values are the scorer instances that were built.
_scorer_cache: Dict[str, BaseScorer] = {}
25
+
26
+
27
def create_scorer(
    preset: str = 'default',
    cache: bool = True,
    auto_download: bool = False,
    train_data: Optional[Sequence[str]] = None,
    train_labels: Optional[Any] = None,
    target_categories: Optional[List[str]] = None,
    **overrides
) -> BaseScorer:
    """Build a scorer from a named preset, optionally customised.

    Parameters
    ----------
    preset : str, default='default'
        Name of the preset passed to ``ScorerConfig.from_preset``
        (e.g., 'default', 'fast').
    cache : bool, default=True
        Reuse / store the built scorer in the module-level cache.
        Caching only applies when neither ``train_data`` nor
        ``train_labels`` is given. Pass False to get an independent
        instance.
    auto_download : bool, default=False
        When True, ``auto_download=True`` is added to the reference
        parameters of the config.
    train_data : sequence of str, optional
        Training peptides; converted to a list and handed to
        ``config.build``.
    train_labels : array-like, optional
        Training labels handed to ``config.build``.
    target_categories : list of str, optional
        Category names handed to ``config.build``.
    **overrides : dict
        Config overrides. ``k``, ``scorer`` and ``reference`` replace the
        matching config attribute; ``categories``, ``lazy``, ``use_set``
        and ``data_path`` are stored as reference parameters; every other
        key (e.g., hidden_layer_sizes=(128, 64)) is stored as a scorer
        parameter.

    Returns
    -------
    scorer : BaseScorer
        The object returned by ``config.build``. Trainable scorers are
        returned untrained unless train_data and train_labels are provided.

    Example
    -------
    >>> scorer = create_scorer('default', use_dipeptides=False)
    >>> scorer.train(peptides, labels, target_categories=['human', 'viruses'])

    >>> # Auto-download data on first use
    >>> scorer = create_scorer('default', auto_download=True)
    """
    # The key captures everything that shapes an untrained scorer.
    cache_key = f"{preset}:{sorted(overrides.items())}:auto={auto_download}"

    # Scorers built with training inputs are neither read from nor written
    # to the cache.
    cacheable = cache and train_data is None and train_labels is None

    if cacheable and cache_key in _scorer_cache:
        return _scorer_cache[cache_key]

    config = ScorerConfig.from_preset(preset)

    # Route each override to the part of the config it belongs to.
    # Anything not recognised as a top-level field or a reference
    # parameter is treated as a scorer parameter.
    top_level_fields = ('k', 'scorer', 'reference')
    reference_keys = ('categories', 'lazy', 'use_set', 'data_path')
    for name, value in overrides.items():
        if name in top_level_fields:
            setattr(config, name, value)
        elif name in reference_keys:
            config.reference_params[name] = value
        else:
            config.scorer_params[name] = value

    if auto_download:
        config.reference_params['auto_download'] = True

    training_peptides = None if train_data is None else list(train_data)
    scorer = config.build(
        train_data=training_peptides,
        train_labels=train_labels,
        target_categories=target_categories,
    )

    if cacheable:
        _scorer_cache[cache_key] = scorer

    return scorer
125
+
126
+
127
def score_peptide(
    peptide: str,
    model: Optional[Union[str, BaseScorer]] = None,
    model_dir: Optional[str] = None,
    preset: Optional[str] = None,
    aggregate: str = 'mean',
    **kwargs
) -> float:
    """Return the foreignness score of one peptide.

    Parameters
    ----------
    peptide : str
        Peptide sequence to score.
    model : str or BaseScorer, optional
        Either a saved model name (loaded via ``load_model``) or an
        already instantiated scorer.
    model_dir : str, optional
        Model directory used when ``model`` is a name.
    preset : str, optional
        Preset used to create a scorer when ``model`` is not given.
    aggregate : str, default='mean'
        How to aggregate k-mer probabilities for long peptides.
    **kwargs : dict
        Forwarded to create_scorer(); only used when ``model`` is None.

    Returns
    -------
    score : float
        Foreignness score. Higher = more foreign.

    Raises
    ------
    ValueError
        If neither ``model`` nor ``preset`` is provided.
    RuntimeError
        If the resolved scorer is a TrainableScorer that is not trained.

    Example
    -------
    >>> scorer = load_model('my-mlp')
    >>> score = score_peptide('MTMDKSEL', model=scorer)
    """
    # Resolve the scorer: saved-model name, ready-made instance, or preset.
    if isinstance(model, str):
        scorer = load_model(model, model_dir)
    elif model is not None:
        scorer = model
    elif preset is not None:
        scorer = create_scorer(preset, **kwargs)
    else:
        raise ValueError("Provide a trained model or a preset for non-trainable scorers.")

    untrained = isinstance(scorer, TrainableScorer) and not scorer.is_trained
    if untrained:
        raise RuntimeError("Scorer is not trained. Train or load a trained model before scoring.")

    # Scorers whose score() rejects the `aggregate` keyword are retried
    # without it.
    # NOTE(review): a TypeError raised inside score() for another reason
    # also triggers the retry -- confirm this is intended.
    try:
        result = scorer.score([peptide], aggregate=aggregate)
    except TypeError:
        result = scorer.score([peptide])

    first_score = result[0]
    return float(first_score)
179
+
180
+
181
def score_peptides(
    peptides: Sequence[str],
    model: Optional[Union[str, BaseScorer]] = None,
    model_dir: Optional[str] = None,
    preset: Optional[str] = None,
    aggregate: str = 'mean',
    **kwargs
) -> np.ndarray:
    """Return foreignness scores for a batch of peptides.

    Parameters
    ----------
    peptides : sequence of str
        Peptide sequences to score.
    model : str or BaseScorer, optional
        Either a saved model name (loaded via ``load_model``) or an
        already instantiated scorer.
    model_dir : str, optional
        Model directory used when ``model`` is a name.
    preset : str, optional
        Preset used to create a scorer when ``model`` is not given.
    aggregate : str, default='mean'
        How to aggregate k-mer probabilities for long peptides.
    **kwargs : dict
        Forwarded to create_scorer(); only used when ``model`` is None.

    Returns
    -------
    scores : np.ndarray
        Whatever the scorer's ``score`` method returns for ``peptides``
        (annotated as an array of foreignness scores; higher = more
        foreign).

    Raises
    ------
    ValueError
        If neither ``model`` nor ``preset`` is provided.
    RuntimeError
        If the resolved scorer is a TrainableScorer that is not trained.

    Example
    -------
    >>> scorer = load_model('my-mlp')
    >>> scores = score_peptides(['MTMDKSEL'], model=scorer)
    """
    # Resolve the scorer: saved-model name, ready-made instance, or preset.
    if isinstance(model, str):
        scorer = load_model(model, model_dir)
    elif model is not None:
        scorer = model
    elif preset is not None:
        scorer = create_scorer(preset, **kwargs)
    else:
        raise ValueError("Provide a trained model or a preset for non-trainable scorers.")

    untrained = isinstance(scorer, TrainableScorer) and not scorer.is_trained
    if untrained:
        raise RuntimeError("Scorer is not trained. Train or load a trained model before scoring.")

    # Scorers whose score() rejects the `aggregate` keyword are retried
    # without it.
    # NOTE(review): a TypeError raised inside score() for another reason
    # also triggers the retry -- confirm this is intended.
    try:
        result = scorer.score(peptides, aggregate=aggregate)
    except TypeError:
        result = scorer.score(peptides)
    return result
232
+
233
+
234
def clear_cache() -> None:
    """Empty the module-level scorer cache.

    After this call, create_scorer() builds fresh instances instead of
    returning previously cached ones. Useful to release memory or to
    reset state.
    """
    _scorer_cache.clear()
240
+
241
+
242
def get_available_presets() -> List[str]:
    """Return the names of all known presets.

    Returns
    -------
    presets : list of str
        Preset names as reported by ``scorers.list_presets``.
    """
    # Imported lazily, inside the function, as in the rest of this module.
    from .scorers import list_presets as _list_presets

    preset_names = _list_presets()
    return preset_names
252
+
253
+
254
def get_preset_info(preset: str) -> Dict[str, Any]:
    """Describe a preset as a plain dictionary.

    Parameters
    ----------
    preset : str
        Preset name.

    Returns
    -------
    info : dict
        Result of ``to_dict()`` on the config created for the preset.
    """
    return ScorerConfig.from_preset(preset).to_dict()
269
+
270
+
271
+ # =============================================================================
272
+ # Model Management Functions
273
+ # =============================================================================
274
+
275
def list_models(model_dir: Optional[str] = None) -> List[Any]:
    """Enumerate the trained models that have been saved.

    Thin wrapper around ``model_manager.list_models``.

    Parameters
    ----------
    model_dir : str, optional
        Custom model directory. Defaults to ~/.weirdo/models.

    Returns
    -------
    models : list of ModelInfo
        Information about each saved model.

    Example
    -------
    >>> models = list_models()
    >>> for m in models:
    ...     print(f"{m.name}: {m.scorer_type}")
    """
    from .model_manager import list_models as _list_models

    saved_models = _list_models(model_dir)
    return saved_models
296
+
297
+
298
def load_model(name: str, model_dir: Optional[str] = None) -> BaseScorer:
    """Load a previously saved model by its name.

    Thin wrapper around ``model_manager.load_model``.

    Parameters
    ----------
    name : str
        Model name.
    model_dir : str, optional
        Custom model directory.

    Returns
    -------
    scorer : BaseScorer
        The loaded model, as returned by the model manager
        (presumably a trained TrainableScorer ready for scoring).

    Example
    -------
    >>> model = load_model('my-mlp')
    >>> scores = model.score(['MTMDKSEL'])
    """
    from .model_manager import load_model as _load_model

    loaded = _load_model(name, model_dir)
    return loaded
320
+
321
+
322
def save_model(
    scorer: BaseScorer,
    name: str,
    model_dir: Optional[str] = None,
    overwrite: bool = False,
) -> str:
    """Persist a trained model under the given name.

    Thin wrapper around ``model_manager.save_model``; the value it returns
    is converted to ``str``.

    Parameters
    ----------
    scorer : TrainableScorer
        Trained model to save.
    name : str
        Name for the saved model.
    model_dir : str, optional
        Custom model directory.
    overwrite : bool, default=False
        Overwrite existing model.

    Returns
    -------
    path : str
        Path where model was saved.

    Example
    -------
    >>> scorer = MLPScorer()
    >>> scorer.train(peptides, labels)
    >>> save_model(scorer, 'my-mlp')
    """
    from .model_manager import save_model as _save_model

    saved_to = _save_model(scorer, name, model_dir, overwrite)
    return str(saved_to)
354
+
355
+
356
def get_available_scorers() -> List[str]:
    """Return the names of all available scorer types.

    Thin wrapper around ``scorers.list_scorers``, which is expected to
    report both lookup-based and ML-based scorers.

    Returns
    -------
    scorers : list of str
        Available scorer names.

    Example
    -------
    >>> print(get_available_scorers())
    ['mlp']
    """
    from .scorers import list_scorers as _list_scorers

    scorer_names = _list_scorers()
    return scorer_names
weirdo/blosum.py ADDED
@@ -0,0 +1,74 @@
1
+ # Licensed under the Apache License, Version 2.0 (the "License");
2
+ # you may not use this file except in compliance with the License.
3
+ # You may obtain a copy of the License at
4
+ #
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ #
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+
13
+
14
+ from os.path import join
15
+
16
+ from .static_data import MATRIX_DIR
17
+
18
+ from .amino_acid_alphabet import dict_to_amino_acid_matrix
19
+
20
def parse_blosum_table(table, coeff_type=int, key_type='row'):
    """
    Parse a table of pairwise amino acid coefficients (e.g. BLOSUM50).

    The first non-comment, non-blank line is the header of column labels;
    each following line is a row label followed by one coefficient per
    column.

    Parameters
    ----------
    table : str
        Full text of the matrix file. Lines starting with '#' are
        comments; blank / whitespace-only lines and CR characters are
        ignored.
    coeff_type : callable, default=int
        Converter applied to every coefficient string.
    key_type : {'row', 'pair', 'pair_string'}, default='row'
        Layout of the result:
        'row' -> ``{x: {y: coeff}}``,
        'pair' -> ``{(x, y): coeff}``,
        'pair_string' -> ``{x + y: coeff}``.

    Returns
    -------
    coeffs : dict
        Parsed coefficients in the layout selected by ``key_type``.

    Raises
    ------
    ValueError
        If ``key_type`` is unknown, the table has no header line, the
        header has fewer than 20 labels, a row has fewer than 21 fields,
        or a row has more coefficients than the header has labels.
    """
    # Validate up front with a real exception: `assert` is stripped when
    # Python runs with -O, and the check should not depend on the table
    # having at least one data row.
    if key_type not in ('row', 'pair', 'pair_string'):
        raise ValueError("Unknown key type: %s" % key_type)

    lines = []
    for raw_line in table.split("\n"):
        # drop comments
        if raw_line.startswith("#"):
            continue
        # drop CR endline characters
        line = raw_line.replace("\r", "")
        # skip empty and whitespace-only lines
        if line.strip():
            lines.append(line)

    if not lines:
        raise ValueError(
            "Expected a header line of amino acid labels but table is empty")

    labels = lines[0].split()
    if len(labels) < 20:
        raise ValueError(
            "Expected 20+ amino acids but first line '%s' has %d fields" % (
                lines[0],
                len(labels)))

    coeffs = {}
    for line in lines[1:]:
        fields = line.split()
        if len(fields) < 21:
            raise ValueError(
                "Expected AA and 20+ coefficients but '%s' has %d fields" % (
                    line, len(fields)))
        x = fields[0]
        coeff_strings = fields[1:]
        if len(coeff_strings) > len(labels):
            raise ValueError(
                "Row '%s' has %d coefficients but header has only %d labels" % (
                    x, len(coeff_strings), len(labels)))
        # Coefficients are matched to column labels by position.
        for y, coeff_str in zip(labels, coeff_strings):
            coeff = coeff_type(coeff_str)
            if key_type == 'pair':
                coeffs[(x, y)] = coeff
            elif key_type == 'pair_string':
                coeffs[x + y] = coeff
            else:
                coeffs.setdefault(x, {})[y] = coeff
    return coeffs
61
+
62
+
63
# Load the bundled BLOSUM tables at import time. For each one the text is
# parsed into a nested dict (row letter -> column letter -> coefficient)
# and then converted with dict_to_amino_acid_matrix.

with open(join(MATRIX_DIR, 'BLOSUM30'), 'r') as f:
    blosum30_dict = parse_blosum_table(f.read())
blosum30_matrix = dict_to_amino_acid_matrix(blosum30_dict)

with open(join(MATRIX_DIR, 'BLOSUM50'), 'r') as f:
    blosum50_dict = parse_blosum_table(f.read())
blosum50_matrix = dict_to_amino_acid_matrix(blosum50_dict)

with open(join(MATRIX_DIR, 'BLOSUM62'), 'r') as f:
    blosum62_dict = parse_blosum_table(f.read())
blosum62_matrix = dict_to_amino_acid_matrix(blosum62_dict)
74
+
weirdo/chou_fasman.py ADDED
@@ -0,0 +1,73 @@
1
+ # Licensed under the Apache License, Version 2.0 (the "License");
2
+ # you may not use this file except in compliance with the License.
3
+ # You may obtain a copy of the License at
4
+ #
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ #
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+
13
+
14
+ from .amino_acid_alphabet import amino_acid_name_indices
15
+
16
# Chou-Fasman table of structural properties, taken from
# http://prowl.rockefeller.edu/aainfo/chou.htm
#
# One row per amino acid: the full name followed by seven numeric columns.
# parse_chou_fasman() reads the first three numbers as the alpha-helix,
# beta-sheet and turn scores; the remaining four columns are not used
# (presumably the per-position turn frequencies from the source page --
# verify against the URL above).
chou_fasman_table = """
Alanine 142 83 66 0.06 0.076 0.035 0.058
Arginine 98 93 95 0.070 0.106 0.099 0.085
Aspartic Acid 101 54 146 0.147 0.110 0.179 0.081
Asparagine 67 89 156 0.161 0.083 0.191 0.091
Cysteine 70 119 119 0.149 0.050 0.117 0.128
Glutamic Acid 151 037 74 0.056 0.060 0.077 0.064
Glutamine 111 110 98 0.074 0.098 0.037 0.098
Glycine 57 75 156 0.102 0.085 0.190 0.152
Histidine 100 87 95 0.140 0.047 0.093 0.054
Isoleucine 108 160 47 0.043 0.034 0.013 0.056
Leucine 121 130 59 0.061 0.025 0.036 0.070
Lysine 114 74 101 0.055 0.115 0.072 0.095
Methionine 145 105 60 0.068 0.082 0.014 0.055
Phenylalanine 113 138 60 0.059 0.041 0.065 0.065
Proline 57 55 152 0.102 0.301 0.034 0.068
Serine 77 75 143 0.120 0.139 0.125 0.106
Threonine 83 119 96 0.086 0.108 0.065 0.079
Tryptophan 108 137 96 0.077 0.013 0.064 0.167
Tyrosine 69 147 114 0.082 0.065 0.114 0.125
Valine 106 170 50 0.062 0.048 0.028 0.053
"""
40
+
41
+
42
def parse_chou_fasman(table):
    """
    Parse the Chou-Fasman text table into three score dictionaries.

    Each non-blank line holds an amino acid name (one word, or two words
    when the second is 'Acid') followed by numeric columns. The first three
    numbers are read as integers: alpha-helix, beta-sheet and turn scores.
    Any further columns are ignored.

    Returns a tuple ``(alpha_helix, beta_sheet, turn)`` of dicts keyed by
    the value that ``amino_acid_name_indices`` maps each name to
    (presumably the one-letter amino acid code -- confirm against
    amino_acid_alphabet).

    Raises AssertionError if a name is unknown or if the table does not
    yield exactly 20 entries.
    """
    alpha_by_residue = {}
    beta_by_residue = {}
    turn_by_residue = {}

    for row in table.split("\n"):
        tokens = [tok for tok in row.split(" ") if tok.strip()]
        if not tokens:
            continue

        # Fold two-word names ("Aspartic Acid", "Glutamic Acid") into one
        # token so the numeric columns always start at index 1.
        if tokens[1] == 'Acid':
            tokens = [tokens[0] + " " + tokens[1]] + tokens[2:]
        name = tokens[0]

        assert name in amino_acid_name_indices, "Invalid amino acid name %s" % name
        letter = amino_acid_name_indices[name]

        alpha, beta, turn = int(tokens[1]), int(tokens[2]), int(tokens[3])
        alpha_by_residue[letter] = alpha
        beta_by_residue[letter] = beta
        turn_by_residue[letter] = turn

    # Sanity check: one entry per standard amino acid in every dictionary.
    for scores in (alpha_by_residue, beta_by_residue, turn_by_residue):
        assert len(scores) == 20
    return alpha_by_residue, beta_by_residue, turn_by_residue
71
+
72
# Module-level score lookups, built once at import time from the
# chou_fasman_table text.
(alpha_helix_score, beta_sheet_score, turn_score) = parse_chou_fasman(
    chou_fasman_table)