weirdo 2.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- weirdo/__init__.py +104 -0
- weirdo/amino_acid.py +33 -0
- weirdo/amino_acid_alphabet.py +158 -0
- weirdo/amino_acid_properties.py +358 -0
- weirdo/api.py +372 -0
- weirdo/blosum.py +74 -0
- weirdo/chou_fasman.py +73 -0
- weirdo/cli.py +597 -0
- weirdo/common.py +22 -0
- weirdo/data_manager.py +475 -0
- weirdo/distances.py +16 -0
- weirdo/matrices/BLOSUM30 +25 -0
- weirdo/matrices/BLOSUM50 +21 -0
- weirdo/matrices/BLOSUM62 +27 -0
- weirdo/matrices/__init__.py +0 -0
- weirdo/matrices/amino_acid_properties.txt +829 -0
- weirdo/matrices/helix_vs_coil.txt +28 -0
- weirdo/matrices/helix_vs_strand.txt +27 -0
- weirdo/matrices/pmbec.mat +21 -0
- weirdo/matrices/strand_vs_coil.txt +27 -0
- weirdo/model_manager.py +346 -0
- weirdo/peptide_vectorizer.py +78 -0
- weirdo/pmbec.py +85 -0
- weirdo/reduced_alphabet.py +61 -0
- weirdo/residue_contact_energies.py +74 -0
- weirdo/scorers/__init__.py +95 -0
- weirdo/scorers/base.py +223 -0
- weirdo/scorers/config.py +299 -0
- weirdo/scorers/mlp.py +1126 -0
- weirdo/scorers/reference.py +265 -0
- weirdo/scorers/registry.py +282 -0
- weirdo/scorers/similarity.py +386 -0
- weirdo/scorers/swissprot.py +510 -0
- weirdo/scorers/trainable.py +219 -0
- weirdo/static_data.py +17 -0
- weirdo-2.1.0.dist-info/METADATA +294 -0
- weirdo-2.1.0.dist-info/RECORD +41 -0
- weirdo-2.1.0.dist-info/WHEEL +5 -0
- weirdo-2.1.0.dist-info/entry_points.txt +2 -0
- weirdo-2.1.0.dist-info/licenses/LICENSE +201 -0
- weirdo-2.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,219 @@
|
|
|
1
|
+
"""Base class for trainable (ML-based) scorers.
|
|
2
|
+
|
|
3
|
+
Provides common infrastructure for training, saving, and loading models.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import json
|
|
7
|
+
import os
|
|
8
|
+
from abc import abstractmethod
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
|
|
11
|
+
|
|
12
|
+
import numpy as np
|
|
13
|
+
|
|
14
|
+
from .base import BatchScorer
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class TrainableScorer(BatchScorer):
    """Base class for trainable foreignness scorers.

    Extends BatchScorer with training, saving, and loading capabilities.
    Subclasses implement specific model architectures (MLP, etc.) by
    providing ``train``, ``_save_model`` and ``_load_model``.

    Parameters
    ----------
    k : int, default=8
        K-mer size for decomposing peptides.
    batch_size : int, default=256
        Batch size for training and inference.
    **kwargs
        Forwarded unchanged to ``BatchScorer.__init__``.
    """

    # Index used for 'X' and for any character that is not one of the
    # 20 standard amino acid letters (unknown residue / padding).
    _UNKNOWN_AA_INDEX = 20

    # Amino acid letter -> integer index. Defined once on the class so it
    # is not rebuilt on every _peptide_to_indices() call.
    _AA_TO_IDX = {
        'A': 0, 'C': 1, 'D': 2, 'E': 3, 'F': 4,
        'G': 5, 'H': 6, 'I': 7, 'K': 8, 'L': 9,
        'M': 10, 'N': 11, 'P': 12, 'Q': 13, 'R': 14,
        'S': 15, 'T': 16, 'V': 17, 'W': 18, 'Y': 19,
        'X': 20,  # Unknown/padding
    }

    def __init__(
        self,
        k: int = 8,
        batch_size: int = 256,
        **kwargs
    ):
        super().__init__(batch_size=batch_size, **kwargs)
        # `k` is stored in the shared params dict so that it is written to
        # config.json by save() and passed back to the constructor by load().
        self._params.update({'k': k})
        self._model = None
        self._is_trained = False
        self._training_history: List[Dict[str, float]] = []
        self._metadata: Dict[str, Any] = {}

    @property
    def k(self) -> int:
        """Get k-mer size."""
        return self._params['k']

    @property
    def is_trained(self) -> bool:
        """Check if model has been trained."""
        return self._is_trained

    @property
    def training_history(self) -> List[Dict[str, float]]:
        """Get training history (loss per epoch).

        Returns a shallow copy of the list: adding or removing entries on
        the returned list does not affect the scorer, but the per-epoch
        dicts themselves are shared.
        """
        return self._training_history.copy()

    @abstractmethod
    def train(
        self,
        peptides: Sequence[str],
        labels: Sequence[float],
        val_peptides: Optional[Sequence[str]] = None,
        val_labels: Optional[Sequence[float]] = None,
        epochs: Optional[int] = None,
        learning_rate: Optional[float] = None,
        verbose: bool = True,
    ) -> 'TrainableScorer':
        """Train the model on labeled data.

        Parameters
        ----------
        peptides : sequence of str
            Training peptide sequences.
        labels : sequence of float or 2D array
            Target labels. For multi-label classification, use a 2D array.
        val_peptides : sequence of str, optional
            Validation peptides for early stopping.
        val_labels : sequence of float, optional
            Validation labels.
        epochs : int, optional
            Number of training epochs. Defaults to model's max_iter.
        learning_rate : float, optional
            Learning rate for optimizer. Defaults to model's learning_rate_init.
        verbose : bool, default=True
            Print training progress.

        Returns
        -------
        self : TrainableScorer
            Returns self for method chaining.
        """
        pass

    @abstractmethod
    def _save_model(self, path: Path) -> None:
        """Save model weights to path (implemented by subclass)."""
        pass

    @abstractmethod
    def _load_model(self, path: Path) -> None:
        """Load model weights from path (implemented by subclass)."""
        pass

    def save(self, path: Union[str, Path]) -> None:
        """Save trained model to disk.

        Creates a directory containing:
        - model.pt: Model weights
        - config.json: Model configuration
        - metadata.json: Training metadata

        Parameters
        ----------
        path : str or Path
            Directory path to save model. Created (with parents) if missing.

        Raises
        ------
        RuntimeError
            If the model has not been trained yet.
        """
        if not self._is_trained:
            raise RuntimeError("Model must be trained before saving.")

        path = Path(path)
        path.mkdir(parents=True, exist_ok=True)

        # Save model weights
        self._save_model(path / 'model.pt')

        # Save configuration
        config = {
            'scorer_type': self.__class__.__name__,
            'params': self._params,
        }
        with open(path / 'config.json', 'w', encoding='utf-8') as f:
            json.dump(config, f, indent=2)

        # Save metadata. The live training history must win over any
        # 'training_history' entry still present in self._metadata: load()
        # stores the whole metadata.json dict there, so after
        # load -> train -> save a stale copy could otherwise overwrite the
        # current history.
        metadata: Dict[str, Any] = {'training_history': self._training_history}
        metadata.update(
            (key, value)
            for key, value in self._metadata.items()
            if key != 'training_history'
        )
        with open(path / 'metadata.json', 'w', encoding='utf-8') as f:
            json.dump(metadata, f, indent=2)

    @classmethod
    def load(cls, path: Union[str, Path]) -> 'TrainableScorer':
        """Load a trained model from disk.

        The instance is built by passing the ``params`` entry of
        ``config.json`` to the constructor, after which the weights are
        restored through ``_load_model``. ``metadata.json`` is optional.

        Parameters
        ----------
        path : str or Path
            Directory path containing saved model.

        Returns
        -------
        scorer : TrainableScorer
            Loaded model ready for inference.
        """
        path = Path(path)

        # Load configuration
        with open(path / 'config.json', 'r', encoding='utf-8') as f:
            config = json.load(f)

        # Create instance with saved params
        instance = cls(**config['params'])

        # Load model weights
        instance._load_model(path / 'model.pt')
        instance._is_trained = True
        instance._is_fitted = True

        # Load metadata
        metadata_path = path / 'metadata.json'
        if metadata_path.exists():
            with open(metadata_path, 'r', encoding='utf-8') as f:
                metadata = json.load(f)
            instance._training_history = metadata.get('training_history', [])
            instance._metadata = metadata

        return instance

    def fit(self, reference=None) -> 'TrainableScorer':
        """For compatibility with BaseScorer interface.

        Trainable scorers use train() instead of fit().
        If already trained, this is a no-op (apart from marking the scorer
        as fitted); ``reference`` is ignored.

        Raises
        ------
        RuntimeError
            If the scorer has not been trained or loaded.
        """
        if self._is_trained:
            self._is_fitted = True
            return self
        raise RuntimeError(
            "Trainable scorers must be trained with train() or loaded with load(). "
            "Use scorer.train(peptides, labels) to train a new model."
        )

    def _extract_kmers(self, peptide: str) -> List[str]:
        """Extract overlapping k-mers from peptide.

        Peptides shorter than ``k`` are right-padded with 'X' to length
        ``k``, so they yield exactly one k-mer.
        """
        k = self.k
        if len(peptide) < k:
            # Pad short peptides
            peptide = peptide + 'X' * (k - len(peptide))
        return [peptide[i:i+k] for i in range(len(peptide) - k + 1)]

    def _peptide_to_indices(self, peptide: str) -> List[List[int]]:
        """Convert peptide to list of k-mer amino acid indices.

        Returns list of k-mers, each as list of AA indices (0-20).
        Characters missing from the lookup table map to index 20.
        """
        lookup = self._AA_TO_IDX.get
        unknown = self._UNKNOWN_AA_INDEX
        return [
            [lookup(aa, unknown) for aa in kmer]
            for kmer in self._extract_kmers(peptide)
        ]
|
weirdo/static_data.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
2
|
+
# you may not use this file except in compliance with the License.
|
|
3
|
+
# You may obtain a copy of the License at
|
|
4
|
+
#
|
|
5
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
#
|
|
7
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
8
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
9
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
10
|
+
# See the License for the specific language governing permissions and
|
|
11
|
+
# limitations under the License.
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
from os.path import dirname, realpath, join
|
|
15
|
+
|
|
16
|
+
# Directory containing this module, with symlinks in the path resolved.
PACKAGE_DIR = dirname(realpath(__file__))

# The `matrices` subdirectory that sits next to this module inside the package.
MATRIX_DIR = join(PACKAGE_DIR, 'matrices')
|
|
@@ -0,0 +1,294 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: weirdo
|
|
3
|
+
Version: 2.1.0
|
|
4
|
+
Summary: Metrics of immunological foreignness for candidate T-cell epitopes
|
|
5
|
+
Home-page: https://github.com/pirl-unc/weirdo
|
|
6
|
+
Author: Alex Rubinsteyn
|
|
7
|
+
Author-email: alex.rubinsteyn@unc.edu
|
|
8
|
+
License: http://www.apache.org/licenses/LICENSE-2.0.html
|
|
9
|
+
Classifier: Development Status :: 4 - Beta
|
|
10
|
+
Classifier: Environment :: Console
|
|
11
|
+
Classifier: Operating System :: OS Independent
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
20
|
+
Requires-Python: >=3.9
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
License-File: LICENSE
|
|
23
|
+
Requires-Dist: numpy<2.0.0,>=1.20.0
|
|
24
|
+
Requires-Dist: pandas<3.0.0,>=1.3.0
|
|
25
|
+
Requires-Dist: scikit-learn<2.0.0,>=1.0.0
|
|
26
|
+
Provides-Extra: dev
|
|
27
|
+
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
28
|
+
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
|
29
|
+
Requires-Dist: pylint>=2.0.0; extra == "dev"
|
|
30
|
+
Requires-Dist: sphinx>=5.0.0; extra == "dev"
|
|
31
|
+
Requires-Dist: sphinx-rtd-theme>=1.0.0; extra == "dev"
|
|
32
|
+
Requires-Dist: sphinx-autodoc-typehints>=1.0.0; extra == "dev"
|
|
33
|
+
Requires-Dist: tqdm>=4.0.0; extra == "dev"
|
|
34
|
+
Provides-Extra: docs
|
|
35
|
+
Requires-Dist: sphinx>=5.0.0; extra == "docs"
|
|
36
|
+
Requires-Dist: sphinx-rtd-theme>=1.0.0; extra == "docs"
|
|
37
|
+
Requires-Dist: sphinx-autodoc-typehints>=1.0.0; extra == "docs"
|
|
38
|
+
Dynamic: author
|
|
39
|
+
Dynamic: author-email
|
|
40
|
+
Dynamic: classifier
|
|
41
|
+
Dynamic: description
|
|
42
|
+
Dynamic: description-content-type
|
|
43
|
+
Dynamic: home-page
|
|
44
|
+
Dynamic: license
|
|
45
|
+
Dynamic: license-file
|
|
46
|
+
Dynamic: provides-extra
|
|
47
|
+
Dynamic: requires-dist
|
|
48
|
+
Dynamic: requires-python
|
|
49
|
+
Dynamic: summary
|
|
50
|
+
|
|
51
|
+
[Tests](https://github.com/pirl-unc/weirdo/actions/workflows/tests.yml)
|
|
52
|
+
[Docs](https://github.com/pirl-unc/weirdo/actions/workflows/docs.yml)
|
|
53
|
+
[PyPI](https://pypi.python.org/pypi/weirdo/)
|
|
54
|
+
[License: Apache 2.0](https://opensource.org/licenses/Apache-2.0)
|
|
55
|
+
|
|
56
|
+
# WEIRDO
|
|
57
|
+
|
|
58
|
+
**W**idely **E**stimated **I**mmunological **R**ecognition and **D**etection of **O**utliers
|
|
59
|
+
|
|
60
|
+
A Python library for computing peptide foreignness scores—predicting whether a peptide sequence is likely from a pathogen (bacteria, virus) or from self (human, mammalian).
|
|
61
|
+
|
|
62
|
+
## Overview
|
|
63
|
+
|
|
64
|
+
WEIRDO trains a multi-layer perceptron (MLP) on k-mer presence data from SwissProt to predict organism category membership. Given any peptide, it outputs:
|
|
65
|
+
|
|
66
|
+
- **Category probabilities**: likelihood of appearing in human, bacteria, viruses, mammals, etc.
|
|
67
|
+
- **Foreignness score**: `max(pathogens) / (max(pathogens) + max(self))`
|
|
68
|
+
|
|
69
|
+
## Quick Start
|
|
70
|
+
|
|
71
|
+
```python
|
|
72
|
+
from weirdo.scorers import SwissProtReference, MLPScorer
|
|
73
|
+
|
|
74
|
+
# Load reference data (SwissProt 8-mers with organism labels)
|
|
75
|
+
ref = SwissProtReference().load()
|
|
76
|
+
|
|
77
|
+
# Define organism categories
|
|
78
|
+
categories = [
|
|
79
|
+
'archaea', 'bacteria', 'fungi', 'human', 'invertebrates',
|
|
80
|
+
'mammals', 'plants', 'rodents', 'vertebrates', 'viruses'
|
|
81
|
+
]
|
|
82
|
+
|
|
83
|
+
# Get training data: each 8-mer labeled with organism presence
|
|
84
|
+
peptides, labels = ref.get_training_data(
|
|
85
|
+
target_categories=categories,
|
|
86
|
+
multi_label=True,
|
|
87
|
+
max_samples=200000 # Optional: sample for faster training
|
|
88
|
+
)
|
|
89
|
+
|
|
90
|
+
# Train the MLP
|
|
91
|
+
scorer = MLPScorer(k=8, hidden_layer_sizes=(256, 128, 64))
|
|
92
|
+
scorer.train(peptides, labels, target_categories=categories, epochs=200)
|
|
93
|
+
|
|
94
|
+
# Score new peptides (any length)
|
|
95
|
+
df = scorer.predict_dataframe(['MTMDKSEL', 'SIINFEKL', 'NLVPMVATV'])
|
|
96
|
+
print(df)
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
**Output:**
|
|
100
|
+
```
|
|
101
|
+
peptide human viruses bacteria mammals ... foreignness
|
|
102
|
+
MTMDKSEL 0.82 0.12 0.08 0.79 ... 0.127
|
|
103
|
+
SIINFEKL 0.15 0.73 0.21 0.18 ... 0.802
|
|
104
|
+
NLVPMVATV 0.31 0.68 0.15 0.35 ... 0.660
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
## Installation
|
|
108
|
+
|
|
109
|
+
```bash
|
|
110
|
+
pip install weirdo
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
Download reference data (~2.5 GB compressed / ~7.5 GB uncompressed) for training:
|
|
114
|
+
```bash
|
|
115
|
+
weirdo data download
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
## Training Data
|
|
119
|
+
|
|
120
|
+
WEIRDO uses pre-computed 8-mer data from SwissProt (~100M unique k-mers):
|
|
121
|
+
|
|
122
|
+
| Category | Description |
|
|
123
|
+
|----------|-------------|
|
|
124
|
+
| human | Homo sapiens proteins |
|
|
125
|
+
| rodents | Mouse, rat proteins |
|
|
126
|
+
| mammals | Other mammals (dog, cow, primates, etc.) |
|
|
127
|
+
| vertebrates | Fish, birds, reptiles, amphibians |
|
|
128
|
+
| invertebrates | Insects, worms, mollusks |
|
|
129
|
+
| bacteria | Bacterial proteins |
|
|
130
|
+
| viruses | Viral proteins |
|
|
131
|
+
| archaea | Archaeal proteins |
|
|
132
|
+
| fungi | Fungal proteins |
|
|
133
|
+
| plants | Plant proteins |
|
|
134
|
+
|
|
135
|
+
Each 8-mer has True/False labels for each category, indicating whether it appears in proteins from that organism group.
|
|
136
|
+
|
|
137
|
+
## Feature Extraction
|
|
138
|
+
|
|
139
|
+
The MLP uses **592 features** extracted from each peptide:
|
|
140
|
+
|
|
141
|
+
### Amino Acid Properties (48 features)
|
|
142
|
+
12 physicochemical properties × 4 statistics (mean, std, min, max):
|
|
143
|
+
- Hydropathy, hydrophilicity
|
|
144
|
+
- Mass, volume
|
|
145
|
+
- Polarity, pK side chain
|
|
146
|
+
- Accessible surface area (folded/unfolded)
|
|
147
|
+
- Local flexibility, refractivity
|
|
148
|
+
- Solvent exposed area, % exposed residues
|
|
149
|
+
|
|
150
|
+
### Structural Features (27 features)
|
|
151
|
+
- **Secondary structure propensities** (12): helix, sheet, turn × 4 stats
|
|
152
|
+
- **Category fractions** (9): positive/negative charged, hydrophobic, aromatic, aliphatic, polar, tiny, small, cysteine
|
|
153
|
+
- **Charge features** (4): net charge, charge transitions, max cluster, R/(R+K) ratio
|
|
154
|
+
- **Disorder features** (2): disorder/order promoting fractions
|
|
155
|
+
|
|
156
|
+
### Composition Features (420 features)
|
|
157
|
+
- **Amino acid frequencies** (20): fraction of each amino acid
|
|
158
|
+
- **Dipeptide frequencies** (400): fraction of each amino acid pair
|
|
159
|
+
|
|
160
|
+
### Sequence Statistics (12 features)
|
|
161
|
+
- Length, log-length, sqrt-length
|
|
162
|
+
- Unknown fraction, unique AA fraction
|
|
163
|
+
- Max run length, repeat fraction
|
|
164
|
+
- Entropy/complexity (entropy, effective AAs, Gini, top-2/maximum frequency)
|
|
165
|
+
|
|
166
|
+
### Reduced Alphabet Frequencies (80 features)
|
|
167
|
+
- Composition across common reduced alphabets (Murphy, GBMR, SDM, etc.)
|
|
168
|
+
|
|
169
|
+
### Dipeptide Summary (5 features)
|
|
170
|
+
- Entropy, Gini, max/top2 frequency, homodipeptide fraction
|
|
171
|
+
|
|
172
|
+
## API Reference
|
|
173
|
+
|
|
174
|
+
### Training
|
|
175
|
+
|
|
176
|
+
```python
|
|
177
|
+
from weirdo.scorers import SwissProtReference, MLPScorer
|
|
178
|
+
|
|
179
|
+
# Load reference
|
|
180
|
+
ref = SwissProtReference().load()
|
|
181
|
+
|
|
182
|
+
# Get training data
|
|
183
|
+
peptides, labels = ref.get_training_data(
|
|
184
|
+
target_categories=['human', 'viruses', 'bacteria', 'mammals'],
|
|
185
|
+
multi_label=True,
|
|
186
|
+
max_samples=100000 # Optional: limit for memory
|
|
187
|
+
)
|
|
188
|
+
|
|
189
|
+
# Train
|
|
190
|
+
scorer = MLPScorer(
|
|
191
|
+
k=8,
|
|
192
|
+
hidden_layer_sizes=(256, 128, 64),
|
|
193
|
+
activation='relu',
|
|
194
|
+
alpha=0.0001, # L2 regularization
|
|
195
|
+
)
|
|
196
|
+
scorer.train(
|
|
197
|
+
peptides, labels,
|
|
198
|
+
target_categories=['human', 'viruses', 'bacteria', 'mammals'],
|
|
199
|
+
epochs=200,
|
|
200
|
+
learning_rate=0.001
|
|
201
|
+
)
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
### Prediction
|
|
205
|
+
|
|
206
|
+
```python
|
|
207
|
+
# Category probabilities (sigmoid-activated)
|
|
208
|
+
probs = scorer.predict_proba(['MTMDKSEL'])
|
|
209
|
+
# Shape: (1, n_categories)
|
|
210
|
+
|
|
211
|
+
# Foreignness score
|
|
212
|
+
foreign = scorer.foreignness(
|
|
213
|
+
['MTMDKSEL'],
|
|
214
|
+
pathogen_categories=['bacteria', 'viruses'],
|
|
215
|
+
self_categories=['human', 'mammals', 'rodents']
|
|
216
|
+
)
|
|
217
|
+
# Returns: max(pathogens) / (max(pathogens) + max(self))
|
|
218
|
+
|
|
219
|
+
# Full DataFrame output (handles variable-length peptides)
|
|
220
|
+
df = scorer.predict_dataframe(['MTMDKSEL', 'SIINFEKL', 'NLVPMVATV'])
|
|
221
|
+
```
|
|
222
|
+
|
|
223
|
+
### Feature Extraction
|
|
224
|
+
|
|
225
|
+
```python
|
|
226
|
+
# Extract features as DataFrame
|
|
227
|
+
df = scorer.features_dataframe(['MTMDKSEL', 'SIINFEKL'])
|
|
228
|
+
# Shape: (2, 593) - 592 features + peptide column
|
|
229
|
+
|
|
230
|
+
# Feature names
|
|
231
|
+
names = scorer.get_feature_names()
|
|
232
|
+
# ['hydropathy_mean', 'hydropathy_std', ..., 'dipep_YY']
|
|
233
|
+
```
|
|
234
|
+
|
|
235
|
+
### Model Persistence
|
|
236
|
+
|
|
237
|
+
```python
|
|
238
|
+
from weirdo import save_model, load_model, list_models
|
|
239
|
+
|
|
240
|
+
# Save trained model
|
|
241
|
+
save_model(scorer, 'my-foreignness-model')
|
|
242
|
+
|
|
243
|
+
# List saved models
|
|
244
|
+
for model in list_models():
|
|
245
|
+
print(f"{model.name}: {model.scorer_type}")
|
|
246
|
+
|
|
247
|
+
# Load model
|
|
248
|
+
scorer = load_model('my-foreignness-model')
|
|
249
|
+
```
|
|
250
|
+
|
|
251
|
+
### CLI
|
|
252
|
+
|
|
253
|
+
```bash
|
|
254
|
+
# Data management
|
|
255
|
+
weirdo data download # Download SwissProt reference
|
|
256
|
+
weirdo data list # Show data status
|
|
257
|
+
|
|
258
|
+
# Model management
|
|
259
|
+
weirdo models list # List trained models
|
|
260
|
+
weirdo models train --data train.csv --name my-model
|
|
261
|
+
weirdo models info my-model # Show model details
|
|
262
|
+
|
|
263
|
+
# Scoring
|
|
264
|
+
weirdo score --model my-model MTMDKSEL SIINFEKL
|
|
265
|
+
```
|
|
266
|
+
|
|
267
|
+
## Architecture
|
|
268
|
+
|
|
269
|
+
```
|
|
270
|
+
weirdo/
|
|
271
|
+
├── scorers/
|
|
272
|
+
│ ├── mlp.py # MLPScorer with feature extraction
|
|
273
|
+
│ ├── swissprot.py # SwissProtReference (training data)
|
|
274
|
+
│ ├── config.py # Presets and configuration
|
|
275
|
+
│ ├── registry.py # Scorer registry
|
|
276
|
+
│ └── trainable.py # TrainableScorer base class
|
|
277
|
+
├── model_manager.py # Save/load trained models
|
|
278
|
+
├── amino_acid_properties.py # 12 AA property dictionaries
|
|
279
|
+
└── api.py # High-level functions
|
|
280
|
+
```
|
|
281
|
+
|
|
282
|
+
## Citation
|
|
283
|
+
|
|
284
|
+
```bibtex
|
|
285
|
+
@software{weirdo,
|
|
286
|
+
title = {WEIRDO: Widely Estimated Immunological Recognition and Detection of Outliers},
|
|
287
|
+
author = {PIRL-UNC},
|
|
288
|
+
url = {https://github.com/pirl-unc/weirdo}
|
|
289
|
+
}
|
|
290
|
+
```
|
|
291
|
+
|
|
292
|
+
## License
|
|
293
|
+
|
|
294
|
+
Apache License 2.0. See [LICENSE](LICENSE) for details.
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
weirdo/__init__.py,sha256=4Y4eon4__PBBd5WBWZBn8qvVAvQhZCe2VYtOCty3jVY,2124
|
|
2
|
+
weirdo/amino_acid.py,sha256=ejd1a3a30qnKeZnrXKpAgTQeCgQ6Q33B6fFH_qzCsOI,1220
|
|
3
|
+
weirdo/amino_acid_alphabet.py,sha256=_0TmcGHP4rCJ3hotHXmFD58_6AkA4GOOrouy219t7qw,4616
|
|
4
|
+
weirdo/amino_acid_properties.py,sha256=5EH1iaO8ryCXIca7QIUJCnWlfpjclr7yxqxB9RlGyM0,6203
|
|
5
|
+
weirdo/api.py,sha256=7tLxZ5NiebQznsBxbS2Yq0mRGFTDIGgMb4h2N3DIrvU,10504
|
|
6
|
+
weirdo/blosum.py,sha256=873ejyPMmTmsLDNS1EM7uNx8hWclw1WCPqFkFuzSIGU,2535
|
|
7
|
+
weirdo/chou_fasman.py,sha256=t_k-JggZtEmm2ZiMVnMHrlGFk47PbQU2e02W35Qoa0A,3214
|
|
8
|
+
weirdo/cli.py,sha256=vk7zdReoXt3LchJlalRfFOL1BDb-rTBa5SWuTfc9DG0,18965
|
|
9
|
+
weirdo/common.py,sha256=iVXKu5ZUQogAaP1arrF53fFu64kFZoTl-vuCv2Pww-k,840
|
|
10
|
+
weirdo/data_manager.py,sha256=NOYLkpznV9FFMAd6ZPJCxefjs-30PnS2NCpC-FEV-s0,14385
|
|
11
|
+
weirdo/distances.py,sha256=CyVyKZc5XOiRsj_kaj73lCXcQam5O8w9qX_tafe3Vxk,648
|
|
12
|
+
weirdo/model_manager.py,sha256=5DkbA4HrWOysbUKaWb_e-HoSgnSnF5lhRTzZDACXFDk,9476
|
|
13
|
+
weirdo/peptide_vectorizer.py,sha256=MeEF5V1t21LvejAPC3OPhrjueAxWTMe0LbAvavILu2U,2817
|
|
14
|
+
weirdo/pmbec.py,sha256=SSJ_Km0DdiGkyYwWn_zHFaCDwz__S89KF5dh808Tr_s,2894
|
|
15
|
+
weirdo/reduced_alphabet.py,sha256=Bkwe8EJ_3f658GM_4NHzric9gr5uDPdP_eAwR_wsKCA,2088
|
|
16
|
+
weirdo/residue_contact_energies.py,sha256=9COim1bw8-q8mGhiKe_yuH9fCMQoO2zjcS7TgN92dNg,2871
|
|
17
|
+
weirdo/static_data.py,sha256=9V5RmhXTTba8ccOrL2n-czEsg5lCNuFHFIm_RPh2s1M,676
|
|
18
|
+
weirdo/matrices/BLOSUM30,sha256=x2ESElo8Qb49iZsPS30KjP57ZYAf_l350ZUBYfk_ahc,1900
|
|
19
|
+
weirdo/matrices/BLOSUM50,sha256=8Q58JEqlHIotcBfE3q1NyBm93WT5xiyrY7TNn5s3-ZA,1322
|
|
20
|
+
weirdo/matrices/BLOSUM62,sha256=7jMEl7Vws5RtKB3HjmCJpWkwDrv75eo2tI-VxqyXDxI,2061
|
|
21
|
+
weirdo/matrices/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
22
|
+
weirdo/matrices/amino_acid_properties.txt,sha256=dpq0L7_MSinEoLAboryhiT28chg49MU-VwzjiqOtKWI,13214
|
|
23
|
+
weirdo/matrices/helix_vs_coil.txt,sha256=britXB_v_TqcCW3IHXz08nSwNe_CB5uaOzykbMYL5wk,2651
|
|
24
|
+
weirdo/matrices/helix_vs_strand.txt,sha256=EanjvMyWH1xkjqMNRawnqqMDGDKxUyWxl37jVrsZ1s4,2692
|
|
25
|
+
weirdo/matrices/pmbec.mat,sha256=psM1Wn2KR46JHA4UXiSPQVwOf5KgwELpS2kcHDpnIag,6746
|
|
26
|
+
weirdo/matrices/strand_vs_coil.txt,sha256=0qv-jyTD_YO1c9oVO2NJ_2t6gLY8pvORP9aH69V1ONA,2610
|
|
27
|
+
weirdo/scorers/__init__.py,sha256=P0YyGdSK1HijaOhPAYcOTlas1Q-muZDz2vOgE2QBKko,2211
|
|
28
|
+
weirdo/scorers/base.py,sha256=7VIDx5Sc6ieuvfW-np1jYwFhywSTifg6fNzksnlo6Lw,6159
|
|
29
|
+
weirdo/scorers/config.py,sha256=eE_9EJSry3QnUsxxOMHlK5c2an2pFdmxZ4qdiAg5Nf4,8275
|
|
30
|
+
weirdo/scorers/mlp.py,sha256=tinatxnu3f9h6SEl4V6l3BWvwLoQe-afhfHI1sCBG8k,39850
|
|
31
|
+
weirdo/scorers/reference.py,sha256=RTIDHLI4tj1ruuu6NVb0HKD7txMhu3NGoctCxM-wEZw,7016
|
|
32
|
+
weirdo/scorers/registry.py,sha256=K2GQtgp-wF975Zf6F4ItQQeh_qsI8UOvfyW8I5DJUJo,7858
|
|
33
|
+
weirdo/scorers/similarity.py,sha256=tIveCPNDCJ8rJmRViVw9VBScW7OKA47Q7xAAsiEz5tk,11658
|
|
34
|
+
weirdo/scorers/swissprot.py,sha256=UGtDgVp4r-lnG2DVTSSFf1gqTLvJKKdd7R2QMhIWtn0,16888
|
|
35
|
+
weirdo/scorers/trainable.py,sha256=XylC98ltHTBaV62rfAN6_iGWQyeD8j-lqMHYfR5J-Vk,6721
|
|
36
|
+
weirdo-2.1.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
37
|
+
weirdo-2.1.0.dist-info/METADATA,sha256=JxwUuGu72tFNEG-KjFCF2mPzTJQH_VI4ll6c95fcESQ,9238
|
|
38
|
+
weirdo-2.1.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
39
|
+
weirdo-2.1.0.dist-info/entry_points.txt,sha256=D6RgZ4ugZW0RrZIW2X_DEeIW-vmdFuyCM6xLlUABqac,42
|
|
40
|
+
weirdo-2.1.0.dist-info/top_level.txt,sha256=aC8Ch-s2qwCn-RVOwY1WqNXNoFJHVyfnW3x9umQZ3K0,7
|
|
41
|
+
weirdo-2.1.0.dist-info/RECORD,,
|