weirdo 2.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- weirdo/__init__.py +104 -0
- weirdo/amino_acid.py +33 -0
- weirdo/amino_acid_alphabet.py +158 -0
- weirdo/amino_acid_properties.py +358 -0
- weirdo/api.py +372 -0
- weirdo/blosum.py +74 -0
- weirdo/chou_fasman.py +73 -0
- weirdo/cli.py +597 -0
- weirdo/common.py +22 -0
- weirdo/data_manager.py +475 -0
- weirdo/distances.py +16 -0
- weirdo/matrices/BLOSUM30 +25 -0
- weirdo/matrices/BLOSUM50 +21 -0
- weirdo/matrices/BLOSUM62 +27 -0
- weirdo/matrices/__init__.py +0 -0
- weirdo/matrices/amino_acid_properties.txt +829 -0
- weirdo/matrices/helix_vs_coil.txt +28 -0
- weirdo/matrices/helix_vs_strand.txt +27 -0
- weirdo/matrices/pmbec.mat +21 -0
- weirdo/matrices/strand_vs_coil.txt +27 -0
- weirdo/model_manager.py +346 -0
- weirdo/peptide_vectorizer.py +78 -0
- weirdo/pmbec.py +85 -0
- weirdo/reduced_alphabet.py +61 -0
- weirdo/residue_contact_energies.py +74 -0
- weirdo/scorers/__init__.py +95 -0
- weirdo/scorers/base.py +223 -0
- weirdo/scorers/config.py +299 -0
- weirdo/scorers/mlp.py +1126 -0
- weirdo/scorers/reference.py +265 -0
- weirdo/scorers/registry.py +282 -0
- weirdo/scorers/similarity.py +386 -0
- weirdo/scorers/swissprot.py +510 -0
- weirdo/scorers/trainable.py +219 -0
- weirdo/static_data.py +17 -0
- weirdo-2.1.0.dist-info/METADATA +294 -0
- weirdo-2.1.0.dist-info/RECORD +41 -0
- weirdo-2.1.0.dist-info/WHEEL +5 -0
- weirdo-2.1.0.dist-info/entry_points.txt +2 -0
- weirdo-2.1.0.dist-info/licenses/LICENSE +201 -0
- weirdo-2.1.0.dist-info/top_level.txt +1 -0
weirdo/api.py
ADDED
|
@@ -0,0 +1,372 @@
|
|
|
1
|
+
"""High-level convenience API for foreignness scoring.
|
|
2
|
+
|
|
3
|
+
Provides simple functions for common use cases without
|
|
4
|
+
needing to understand the full scorer architecture.
|
|
5
|
+
|
|
6
|
+
Example
|
|
7
|
+
-------
|
|
8
|
+
>>> from weirdo import score_peptide, load_model
|
|
9
|
+
>>> scorer = load_model('my-mlp')
|
|
10
|
+
>>> score = score_peptide('MTMDKSEL', model=scorer)
|
|
11
|
+
|
|
12
|
+
>>> from weirdo import score_peptides
|
|
13
|
+
>>> scores = score_peptides(['MTMDKSEL', 'ACDEFGHI'], model=scorer)
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from typing import Any, Dict, List, Optional, Sequence, Union
|
|
17
|
+
|
|
18
|
+
import numpy as np
|
|
19
|
+
|
|
20
|
+
from .scorers import ScorerConfig, BaseScorer, TrainableScorer
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
# Cache of scorer instances built by create_scorer(). Keys are the strings
# create_scorer() derives from the preset name, the sorted override items and
# the auto_download flag, so the preset name alone does not identify an entry.
_scorer_cache: Dict[str, BaseScorer] = {}
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def create_scorer(
    preset: str = 'default',
    cache: bool = True,
    auto_download: bool = False,
    train_data: Optional[Sequence[str]] = None,
    train_labels: Optional[Any] = None,
    target_categories: Optional[List[str]] = None,
    **overrides
) -> BaseScorer:
    """Build a scorer from a named preset, optionally customised and cached.

    Parameters
    ----------
    preset : str, default='default'
        Name handed to ``ScorerConfig.from_preset`` (e.g., 'default', 'fast').
    cache : bool, default=True
        Reuse / remember the built scorer in the module-level cache.
        Pass False to always get a fresh, independent instance.
    auto_download : bool, default=False
        When True, ``auto_download=True`` is added to the reference params.
    train_data : sequence of str, optional
        Training peptides; copied into a list and forwarded to ``config.build``.
    train_labels : array-like, optional
        Training labels forwarded to ``config.build``.
    target_categories : list of str, optional
        Category names forwarded to ``config.build``.
    **overrides : dict
        Config overrides. ``k``, ``scorer`` and ``reference`` replace the
        config attribute of the same name; ``categories``, ``lazy``,
        ``use_set`` and ``data_path`` go to the reference params; every other
        name (e.g., hidden_layer_sizes=(128, 64)) goes to the scorer params.

    Returns
    -------
    scorer : BaseScorer
        The object returned by ``config.build``. According to the original
        documentation, trainable scorers come back untrained unless
        train_data and train_labels are provided.

    Notes
    -----
    The cache is neither read nor written when ``train_data`` or
    ``train_labels`` is supplied.

    Example
    -------
    >>> scorer = create_scorer('default', use_dipeptides=False)
    >>> scorer.train(peptides, labels, target_categories=['human', 'viruses'])

    >>> # Auto-download data on first use
    >>> scorer = create_scorer('default', auto_download=True)
    """
    # Only untrained, cache-enabled requests ever touch the cache.
    cacheable = cache and train_data is None and train_labels is None
    cache_key = f"{preset}:{sorted(overrides.items())}:auto={auto_download}"

    if cacheable and cache_key in _scorer_cache:
        return _scorer_cache[cache_key]

    config = ScorerConfig.from_preset(preset)

    # Overrides that replace an attribute on the config object itself.
    config_attributes = ('k', 'scorer', 'reference')
    # Overrides that belong to the reference data source.
    reference_keys = {'categories', 'lazy', 'use_set', 'data_path'}

    for name, value in overrides.items():
        if name in config_attributes:
            setattr(config, name, value)
        elif name in reference_keys:
            config.reference_params[name] = value
        else:
            # Known scorer hyperparameters (hidden_layer_sizes, activation,
            # alpha, learning_rate_init, max_iter, early_stopping,
            # use_dipeptides, batch_size, random_state) and any unrecognised
            # name are both treated as scorer params.
            config.scorer_params[name] = value

    if auto_download:
        config.reference_params['auto_download'] = True

    training_peptides = None if train_data is None else list(train_data)
    scorer = config.build(
        train_data=training_peptides,
        train_labels=train_labels,
        target_categories=target_categories,
    )

    if cacheable:
        _scorer_cache[cache_key] = scorer

    return scorer
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def score_peptide(
    peptide: str,
    model: Optional[Union[str, BaseScorer]] = None,
    model_dir: Optional[str] = None,
    preset: Optional[str] = None,
    aggregate: str = 'mean',
    **kwargs
) -> float:
    """Score a single peptide.

    Thin wrapper around :func:`score_peptides`: the peptide is scored as a
    one-element batch so that model resolution, the "is trained" check and
    the handling of ``aggregate`` live in exactly one place.

    Parameters
    ----------
    peptide : str
        Peptide sequence to score.
    model : str or BaseScorer, optional
        Model name (loaded via ``load_model``) or an instantiated scorer.
    model_dir : str, optional
        Custom model directory when loading by name.
    preset : str, optional
        Preset used to build a scorer when ``model`` is None.
    aggregate : str, default='mean'
        Aggregation mode forwarded to the scorer.
    **kwargs : dict
        Additional arguments passed to create_scorer().

    Returns
    -------
    score : float
        First (and only) element of the batch result, converted to float.

    Raises
    ------
    ValueError
        If neither ``model`` nor ``preset`` is given.
    RuntimeError
        If the resolved scorer is a TrainableScorer that is not trained.

    Example
    -------
    >>> scorer = load_model('my-mlp')
    >>> score = score_peptide('MTMDKSEL', model=scorer)
    """
    batch_scores = score_peptides(
        [peptide],
        model=model,
        model_dir=model_dir,
        preset=preset,
        aggregate=aggregate,
        **kwargs
    )
    return float(batch_scores[0])
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def score_peptides(
    peptides: Sequence[str],
    model: Optional[Union[str, BaseScorer]] = None,
    model_dir: Optional[str] = None,
    preset: Optional[str] = None,
    aggregate: str = 'mean',
    **kwargs
) -> np.ndarray:
    """Score multiple peptides.

    Parameters
    ----------
    peptides : sequence of str
        Peptide sequences to score.
    model : str or BaseScorer, optional
        Model name (loaded via ``load_model``) or an instantiated scorer.
    model_dir : str, optional
        Custom model directory when loading by name.
    preset : str, optional
        Preset used to build a scorer with ``create_scorer`` when ``model``
        is None.
    aggregate : str, default='mean'
        Aggregation mode forwarded to ``scorer.score`` when that method
        accepts an ``aggregate`` argument.
    **kwargs : dict
        Additional arguments passed to create_scorer().

    Returns
    -------
    scores : np.ndarray
        Whatever ``scorer.score`` returns for ``peptides``.

    Raises
    ------
    ValueError
        If neither ``model`` nor ``preset`` is given.
    RuntimeError
        If the resolved scorer is a TrainableScorer that is not trained.

    Example
    -------
    >>> scorer = load_model('my-mlp')
    >>> scores = score_peptides(['MTMDKSEL'], model=scorer)
    """
    import inspect

    if model is None:
        if preset is None:
            raise ValueError("Provide a trained model or a preset for non-trainable scorers.")
        scorer = create_scorer(preset, **kwargs)
    elif isinstance(model, str):
        scorer = load_model(model, model_dir)
    else:
        scorer = model

    if isinstance(scorer, TrainableScorer) and not scorer.is_trained:
        raise RuntimeError("Scorer is not trained. Train or load a trained model before scoring.")

    # Decide up front whether ``score`` takes ``aggregate``. Catching TypeError
    # around the call itself would also swallow TypeErrors raised *inside* the
    # scorer, silently drop the requested aggregation and score twice.
    try:
        parameters = inspect.signature(scorer.score).parameters
    except (TypeError, ValueError):
        parameters = None  # signature cannot be introspected

    if parameters is not None:
        declared = parameters.get('aggregate')
        if declared is not None and declared.kind is not inspect.Parameter.POSITIONAL_ONLY:
            return scorer.score(peptides, aggregate=aggregate)

        forwards_keywords = any(
            param.kind is inspect.Parameter.VAR_KEYWORD
            for param in parameters.values()
        )
        if not forwards_keywords:
            # ``score`` cannot accept the keyword at all; skip the doomed call.
            return scorer.score(peptides)

    # Inconclusive signature (opaque callable or a bare ``**kwargs``): keep
    # the original best-effort probe.
    try:
        return scorer.score(peptides, aggregate=aggregate)
    except TypeError:
        return scorer.score(peptides)
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
def clear_cache() -> None:
    """Clear the scorer cache.

    Empties the module-level ``_scorer_cache`` dictionary in place, so
    scorers previously cached by ``create_scorer`` are no longer returned
    from the cache. Use this to free memory or reset state.
    """
    _scorer_cache.clear()
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
def get_available_presets() -> List[str]:
    """Return the names of the available presets.

    Returns
    -------
    presets : list of str
        Result of ``scorers.list_presets()``.
    """
    # Resolved at call time rather than at module import.
    from .scorers import list_presets as _list_presets

    presets = _list_presets()
    return presets
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
def get_preset_info(preset: str) -> Dict[str, Any]:
    """Describe a preset configuration as a dictionary.

    Parameters
    ----------
    preset : str
        Preset name, passed to ``ScorerConfig.from_preset``.

    Returns
    -------
    info : dict
        The preset's configuration as produced by ``ScorerConfig.to_dict``.
    """
    return ScorerConfig.from_preset(preset).to_dict()
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
# =============================================================================
|
|
272
|
+
# Model Management Functions
|
|
273
|
+
# =============================================================================
|
|
274
|
+
|
|
275
|
+
def list_models(model_dir: Optional[str] = None) -> List[Any]:
    """List the trained models that are available.

    Parameters
    ----------
    model_dir : str, optional
        Custom model directory. When omitted, ``model_manager`` picks the
        location (documented as ~/.weirdo/models).

    Returns
    -------
    models : list of ModelInfo
        One entry per saved model, as returned by
        ``model_manager.list_models``.

    Example
    -------
    >>> models = list_models()
    >>> for m in models:
    ...     print(f"{m.name}: {m.scorer_type}")
    """
    # Resolved at call time rather than at module import.
    from .model_manager import list_models as _manager_list_models

    found = _manager_list_models(model_dir)
    return found
|
|
296
|
+
|
|
297
|
+
|
|
298
|
+
def load_model(name: str, model_dir: Optional[str] = None) -> BaseScorer:
    """Load a saved model by name.

    Parameters
    ----------
    name : str
        Model name.
    model_dir : str, optional
        Custom model directory.

    Returns
    -------
    scorer : BaseScorer
        The object returned by ``model_manager.load_model``.

    Example
    -------
    >>> model = load_model('my-mlp')
    >>> scores = model.score(['MTMDKSEL'])
    """
    # Resolved at call time rather than at module import.
    from .model_manager import load_model as _manager_load_model

    loaded = _manager_load_model(name, model_dir)
    return loaded
|
|
320
|
+
|
|
321
|
+
|
|
322
|
+
def save_model(
    scorer: BaseScorer,
    name: str,
    model_dir: Optional[str] = None,
    overwrite: bool = False,
) -> str:
    """Persist a model under the given name.

    Parameters
    ----------
    scorer : BaseScorer
        Model to save (the original documentation expects a trained
        TrainableScorer).
    name : str
        Name for the saved model.
    model_dir : str, optional
        Custom model directory.
    overwrite : bool, default=False
        Overwrite existing model.

    Returns
    -------
    path : str
        String form of the location returned by ``model_manager.save_model``.

    Example
    -------
    >>> scorer = MLPScorer()
    >>> scorer.train(peptides, labels)
    >>> save_model(scorer, 'my-mlp')
    """
    # Resolved at call time rather than at module import.
    from .model_manager import save_model as _manager_save_model

    saved_location = _manager_save_model(scorer, name, model_dir, overwrite)
    return str(saved_location)
|
|
354
|
+
|
|
355
|
+
|
|
356
|
+
def get_available_scorers() -> List[str]:
    """Return the names of the available scorer types.

    Returns
    -------
    scorers : list of str
        Result of ``scorers.list_scorers()``.

    Example
    -------
    >>> names = get_available_scorers()
    """
    # Resolved at call time rather than at module import.
    from .scorers import list_scorers as _list_scorers

    scorer_names = _list_scorers()
    return scorer_names
|
weirdo/blosum.py
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
2
|
+
# you may not use this file except in compliance with the License.
|
|
3
|
+
# You may obtain a copy of the License at
|
|
4
|
+
#
|
|
5
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
#
|
|
7
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
8
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
9
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
10
|
+
# See the License for the specific language governing permissions and
|
|
11
|
+
# limitations under the License.
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
from os.path import join
|
|
15
|
+
|
|
16
|
+
from .static_data import MATRIX_DIR
|
|
17
|
+
|
|
18
|
+
from .amino_acid_alphabet import dict_to_amino_acid_matrix
|
|
19
|
+
|
|
20
|
+
def parse_blosum_table(table, coeff_type=int, key_type='row'):
    """
    Parse a table of pairwise amino acid coefficients (e.g. BLOSUM50).

    The first non-comment, non-blank line is the header of column labels;
    every following line is a row label followed by one coefficient per
    column. Lines starting with "#" are comments; carriage returns and
    blank lines are ignored.

    Parameters
    ----------
    table : str
        Full text of the matrix file.
    coeff_type : callable, default=int
        Converter applied to each coefficient string.
    key_type : {'row', 'pair', 'pair_string'}, default='row'
        Shape of the result:
        'row' gives ``{x: {y: coeff}}``, 'pair' gives ``{(x, y): coeff}``
        and 'pair_string' gives ``{x + y: coeff}``.

    Returns
    -------
    coeffs : dict
        Coefficients keyed as selected by ``key_type``.

    Raises
    ------
    ValueError
        If ``key_type`` is unknown, the table has no header line, the header
        has fewer than 20 labels, a row has fewer than 21 fields, or a row
        has more coefficients than the header has labels.
    """
    # Validate eagerly with real exceptions: ``assert`` is stripped under
    # ``python -O`` and would let malformed input through silently.
    if key_type not in ('row', 'pair', 'pair_string'):
        raise ValueError("Unknown key type: %s" % key_type)

    lines = []
    for raw_line in table.split("\n"):
        # drop comments
        if raw_line.startswith("#"):
            continue
        # drop CR endline characters
        line = raw_line.replace("\r", "")
        # skip empty (or whitespace-only) lines
        if line.strip():
            lines.append(line)

    if not lines:
        raise ValueError("Expected a header line of amino acids but table is empty")

    labels = lines[0].split()

    if len(labels) < 20:
        raise ValueError(
            "Expected 20+ amino acids but first line '%s' has %d fields" % (
                lines[0],
                len(labels)))

    coeffs = {}
    for line in lines[1:]:
        fields = line.split()
        if len(fields) < 21:
            raise ValueError(
                "Expected AA and 20+ coefficients but '%s' has %d fields" % (
                    line, len(fields)))
        if len(fields) - 1 > len(labels):
            raise ValueError(
                "Row '%s' has %d coefficients but header has only %d labels" % (
                    line, len(fields) - 1, len(labels)))

        x = fields[0]
        for y, coeff_str in zip(labels, fields[1:]):
            coeff = coeff_type(coeff_str)
            if key_type == 'pair':
                coeffs[(x, y)] = coeff
            elif key_type == 'pair_string':
                coeffs[x + y] = coeff
            else:
                coeffs.setdefault(x, {})[y] = coeff
    return coeffs
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _load_blosum(filename):
    """Parse the packaged matrix file and return its (dict, matrix) forms."""
    with open(join(MATRIX_DIR, filename), 'r') as handle:
        coeff_dict = parse_blosum_table(handle.read())
    return coeff_dict, dict_to_amino_acid_matrix(coeff_dict)


# Each matrix is exposed both as the nested dict produced by
# parse_blosum_table and as the result of dict_to_amino_acid_matrix.
blosum30_dict, blosum30_matrix = _load_blosum('BLOSUM30')
blosum50_dict, blosum50_matrix = _load_blosum('BLOSUM50')
blosum62_dict, blosum62_matrix = _load_blosum('BLOSUM62')
|
|
74
|
+
|
weirdo/chou_fasman.py
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
2
|
+
# you may not use this file except in compliance with the License.
|
|
3
|
+
# You may obtain a copy of the License at
|
|
4
|
+
#
|
|
5
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
#
|
|
7
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
8
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
9
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
10
|
+
# See the License for the specific language governing permissions and
|
|
11
|
+
# limitations under the License.
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
from .amino_acid_alphabet import amino_acid_name_indices
|
|
15
|
+
|
|
16
|
+
# Chou-Fasman structural propensities, taken from
# http://prowl.rockefeller.edu/aainfo/chou.htm
# One row per amino acid: the residue's full name followed by seven numeric
# columns. parse_chou_fasman() reads the first three as integer alpha-helix,
# beta-sheet and turn scores and does not use the remaining four.
chou_fasman_table = """
Alanine 142 83 66 0.06 0.076 0.035 0.058
Arginine 98 93 95 0.070 0.106 0.099 0.085
Aspartic Acid 101 54 146 0.147 0.110 0.179 0.081
Asparagine 67 89 156 0.161 0.083 0.191 0.091
Cysteine 70 119 119 0.149 0.050 0.117 0.128
Glutamic Acid 151 037 74 0.056 0.060 0.077 0.064
Glutamine 111 110 98 0.074 0.098 0.037 0.098
Glycine 57 75 156 0.102 0.085 0.190 0.152
Histidine 100 87 95 0.140 0.047 0.093 0.054
Isoleucine 108 160 47 0.043 0.034 0.013 0.056
Leucine 121 130 59 0.061 0.025 0.036 0.070
Lysine 114 74 101 0.055 0.115 0.072 0.095
Methionine 145 105 60 0.068 0.082 0.014 0.055
Phenylalanine 113 138 60 0.059 0.041 0.065 0.065
Proline 57 55 152 0.102 0.301 0.034 0.068
Serine 77 75 143 0.120 0.139 0.125 0.106
Threonine 83 119 96 0.086 0.108 0.065 0.079
Tryptophan 108 137 96 0.077 0.013 0.064 0.167
Tyrosine 69 147 114 0.082 0.065 0.114 0.125
Valine 106 170 50 0.062 0.048 0.028 0.053
"""
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def parse_chou_fasman(table):
    """
    Parse a Chou-Fasman table into three per-residue score dictionaries.

    Each non-blank line holds an amino acid name (one word, or two words
    ending in "Acid") followed by numeric columns; the first three are read
    as integer alpha-helix, beta-sheet and turn scores. Any further columns
    are ignored.

    Parameters
    ----------
    table : str
        Table text, one amino acid per line.

    Returns
    -------
    (alpha_helix_score_dict, beta_sheet_score_dict, turn_score_dict) : tuple
        Three dicts keyed by ``amino_acid_name_indices[name]``.

    Raises
    ------
    ValueError
        If a name is not in ``amino_acid_name_indices``, a line has fewer
        than three score columns, a score is not an integer, or the table
        does not yield exactly 20 entries.
    """
    alpha_helix_score_dict = {}
    beta_sheet_score_dict = {}
    turn_score_dict = {}

    for line in table.split("\n"):
        # split() handles runs of spaces and tabs alike and drops empties
        fields = line.split()
        if not fields:
            continue

        # "Aspartic Acid" / "Glutamic Acid" span two tokens
        if len(fields) > 1 and fields[1] == 'Acid':
            name = fields[0] + " " + fields[1]
            values = fields[2:]
        else:
            name = fields[0]
            values = fields[1:]

        # Explicit exceptions instead of ``assert``, which ``python -O`` strips
        if name not in amino_acid_name_indices:
            raise ValueError("Invalid amino acid name %s" % name)
        if len(values) < 3:
            raise ValueError(
                "Expected 3 integer scores for %s but line '%s' has %d value fields" % (
                    name, line, len(values)))

        letter = amino_acid_name_indices[name]
        alpha, beta, turn = (int(value) for value in values[:3])
        alpha_helix_score_dict[letter] = alpha
        beta_sheet_score_dict[letter] = beta
        turn_score_dict[letter] = turn

    parsed = (
        ("alpha helix", alpha_helix_score_dict),
        ("beta sheet", beta_sheet_score_dict),
        ("turn", turn_score_dict),
    )
    for label, scores in parsed:
        if len(scores) != 20:
            raise ValueError(
                "Expected 20 amino acids but parsed %d %s scores" % (
                    len(scores), label))

    return alpha_helix_score_dict, beta_sheet_score_dict, turn_score_dict
|
|
71
|
+
|
|
72
|
+
# Module-level score dictionaries, parsed once from the embedded table.
(alpha_helix_score,
 beta_sheet_score,
 turn_score) = parse_chou_fasman(chou_fasman_table)
|