weirdo 2.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,61 @@
1
+ # Licensed under the Apache License, Version 2.0 (the "License");
2
+ # you may not use this file except in compliance with the License.
3
+ # You may obtain a copy of the License at
4
+ #
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ #
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+
13
+
14
def dict_from_list(groups):
    """Map every letter to the representative letter of its group.

    Parameters
    ----------
    groups : iterable of sequences
        Each element is one group of letters (the callers in this module
        pass strings such as ``"KER"``).

    Returns
    -------
    dict
        Maps each letter to the smallest element of the group it belongs
        to. If a letter occurs in more than one group, the group that
        appears last in ``groups`` wins.
    """
    aa_to_group = {}
    for group in groups:
        if not group:
            # An empty group contributes nothing; skipping it also keeps
            # min() from raising on an empty sequence.
            continue
        # The representative is the same for every letter of the group,
        # so compute it once instead of re-sorting inside the inner loop.
        representative = min(group)
        for c in group:
            aa_to_group[c] = representative
    return aa_to_group
21
+
22
+
23
+ """
24
+ Amino acid groupings from
25
+ 'Reduced amino acid alphabets improve the sensitivity...' by
26
+ Peterson, Kondev, et al.
27
+ http://www.rpgroup.caltech.edu/publications/Peterson2008.pdf
28
+ """
29
+
30
+ """
31
+ Other alphabets from
32
+ http://bio.math-inf.uni-greifswald.de/viscose/html/alphabets.html
33
+ """
34
+
35
+
36
# Reduced amino acid alphabets, keyed by alphabet name. Each value is the
# letter -> group-representative mapping built by dict_from_list from the
# listed groups.
alphabets = {
    "gbmr4": dict_from_list(["ADKERNTSQ", "YFLIVMCWH", "G", "P"]),
    "sdm12": dict_from_list(
        ["A", "D", "KER", "N", "TSQ", "YF", "LIVM", "C", "W", "H", "G", "P"]
    ),
    "hsdm17": dict_from_list(
        [
            "A", "D", "KE", "R", "N", "T", "S", "Q", "Y",
            "F", "LIV", "M", "C", "W", "H", "G", "P",
        ]
    ),
    # hydrophilic vs. hydrophobic
    "hp2": dict_from_list(["AGTSNQDEHRKP", "CMFILVWY"]),
    # Murphy reduced alphabets (groupings derived from murphy10 splits/merges)
    "murphy8": dict_from_list(
        ["LVIM", "C", "AG", "STP", "FYW", "EDNQ", "KR", "H"]
    ),
    "murphy10": dict_from_list(
        ["LVIM", "C", "A", "G", "ST", "P", "FYW", "EDNQ", "KR", "H"]
    ),
    "murphy15": dict_from_list(
        [
            "LIV", "M", "C", "A", "G", "S", "T", "P",
            "FY", "W", "ED", "NQ", "K", "R", "H",
        ]
    ),
    "alex6": dict_from_list(["C", "G", "P", "FYW", "AVILM", "STNQRHKDE"]),
    "aromatic2": dict_from_list(["FHWY", "ADKERNTSQLIVMCGP"]),
    "hp_vs_aromatic": dict_from_list(["H", "CMILV", "FWY", "ADKERNTSQGP"]),
}
60
+
61
+
@@ -0,0 +1,74 @@
1
+ # Licensed under the Apache License, Version 2.0 (the "License");
2
+ # you may not use this file except in compliance with the License.
3
+ # You may obtain a copy of the License at
4
+ #
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ #
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+
13
+ from os.path import join
14
+
15
+ from .amino_acid_alphabet import canonical_amino_acid_letters, dict_to_amino_acid_matrix
16
+ from .static_data import MATRIX_DIR
17
+
18
+
19
def parse_interaction_table(table, amino_acid_order="ARNDCQEGHILKMFPSTWYV"):
    """Parse a whitespace-delimited 20x20 table of numeric coefficients.

    Blank lines and lines whose first non-blank character is ``#`` are
    ignored. The remaining lines are the table rows; values within a row
    may be separated by any run of whitespace (spaces or tabs).

    Parameters
    ----------
    table : str
        Text of the table.
    amino_acid_order : str
        Letters labelling the rows and the columns: the i-th data row and
        the i-th column both correspond to ``amino_acid_order[i]``.

    Returns
    -------
    dict
        Nested dict such that ``d[row_letter][column_letter]`` is the
        coefficient as a float.

    Raises
    ------
    AssertionError
        If the table does not consist of exactly 20 data rows with 20
        values each.
    ValueError
        If a value cannot be converted with ``float()``.
    """
    expected_size = 20

    rows = [line.strip() for line in table.strip().split("\n")]
    rows = [line for line in rows if line and not line.startswith("#")]
    # Raised explicitly rather than via ``assert`` so the check still runs
    # under ``python -O``; the exception type seen by callers is unchanged.
    if len(rows) != expected_size:
        raise AssertionError("Malformed amino acid interaction table")

    d = {}
    for i, line in enumerate(rows):
        # split() with no separator treats any whitespace run (including
        # tabs) as one delimiter, so no manual space-collapsing is needed.
        coeff_strings = line.split()
        if len(coeff_strings) != expected_size:
            raise AssertionError(
                "Malformed row in amino acid interaction table")
        d[amino_acid_order[i]] = {
            amino_acid_order[j]: float(coeff_str)
            for j, coeff_str in enumerate(coeff_strings)
        }
    return d
39
+
40
def transpose_interaction_dict(d):
    """Return a new nested dict with the two key levels of ``d`` swapped.

    For every pair of letters ``x``, ``y`` taken from
    ``canonical_amino_acid_letters`` the result satisfies
    ``result[x][y] == d[y][x]``. The input dict is not modified.
    """
    return {
        row: {col: d[col][row] for col in canonical_amino_acid_letters}
        for row in canonical_amino_acid_letters
    }
47
+
48
+
49
# Each table is read from MATRIX_DIR and parsed into a nested dict; the
# reverse-direction table is obtained by transposing it. Every dict is also
# converted with dict_to_amino_acid_matrix. The file is only needed for the
# read, so everything derived from the parsed dict happens after the
# ``with`` block has closed it.

# Strand vs. Coil
with open(join(MATRIX_DIR, 'strand_vs_coil.txt'), 'r') as f:
    strand_vs_coil_dict = parse_interaction_table(f.read())
strand_vs_coil_array = dict_to_amino_acid_matrix(strand_vs_coil_dict)

# Coil vs. Strand
coil_vs_strand_dict = transpose_interaction_dict(strand_vs_coil_dict)
coil_vs_strand_array = dict_to_amino_acid_matrix(coil_vs_strand_dict)

# Helix vs. Strand
with open(join(MATRIX_DIR, 'helix_vs_strand.txt'), 'r') as f:
    helix_vs_strand_dict = parse_interaction_table(f.read())
helix_vs_strand_array = dict_to_amino_acid_matrix(helix_vs_strand_dict)

# Strand vs. Helix
strand_vs_helix_dict = transpose_interaction_dict(helix_vs_strand_dict)
strand_vs_helix_array = dict_to_amino_acid_matrix(strand_vs_helix_dict)

# Helix vs. Coil
with open(join(MATRIX_DIR, 'helix_vs_coil.txt'), 'r') as f:
    helix_vs_coil_dict = parse_interaction_table(f.read())
helix_vs_coil_array = dict_to_amino_acid_matrix(helix_vs_coil_dict)

# Coil vs. Helix
coil_vs_helix_dict = transpose_interaction_dict(helix_vs_coil_dict)
coil_vs_helix_array = dict_to_amino_acid_matrix(coil_vs_helix_dict)
@@ -0,0 +1,95 @@
1
+ """Extensible foreignness scoring system.
2
+
3
+ This module provides a plugin-style architecture for scoring peptides
4
+ based on how "foreign" they are relative to a reference dataset.
5
+
6
+ Quick Start
7
+ -----------
8
+ >>> from weirdo.scorers import MLPScorer
9
+ >>> scorer = MLPScorer(k=8, hidden_layer_sizes=(128, 64))
10
+ >>> scorer.train(peptides, labels, target_categories=['human', 'viruses'])
11
+ >>> scores = scorer.score(['MTMDKSEL', 'ACDEFGHI'])
12
+
13
+ Using Presets
14
+ -------------
15
+ >>> from weirdo.scorers import ScorerConfig
16
+ >>> config = ScorerConfig.from_preset('default')
17
+ >>> scorer = config.build()
18
+ >>> scorer.train(peptides, labels, target_categories=['human', 'viruses'])
19
+ >>> scores = scorer.score(['MTMDKSEL'])
20
+
21
+ Adding Custom Scorers
22
+ ---------------------
23
+ >>> from weirdo.scorers import register_scorer, BaseScorer
24
+ >>>
25
+ >>> @register_scorer('my_scorer', description='My custom scorer')
26
+ ... class MyScorer(BaseScorer):
27
+ ... def fit(self, reference): ...
28
+ ... def score(self, peptides): ...
29
+ """
30
+
31
+ # Base classes
32
+ from .base import BaseScorer, BatchScorer
33
+
34
+ # Reference classes
35
+ from .reference import BaseReference, StreamingReference
36
+
37
+ # Registry
38
+ from .registry import (
39
+ ScorerRegistry,
40
+ registry,
41
+ register_scorer,
42
+ register_reference,
43
+ get_scorer,
44
+ get_reference,
45
+ create_scorer,
46
+ create_reference,
47
+ list_scorers,
48
+ list_references,
49
+ )
50
+
51
+ # Configuration
52
+ from .config import (
53
+ ScorerConfig,
54
+ PRESETS,
55
+ get_preset,
56
+ list_presets,
57
+ )
58
+
59
+ # Trainable base class
60
+ from .trainable import TrainableScorer
61
+
62
+ # Concrete implementations (import to trigger registration)
63
+ from .swissprot import SwissProtReference
64
+
65
+ # ML-based scorer
66
+ from .mlp import MLPScorer
67
+
68
# Public names re-exported by this package, in the same groups as the
# imports they come from.
__all__ = [
    # Base classes
    'BaseScorer', 'BatchScorer',
    'BaseReference', 'StreamingReference',
    'TrainableScorer',
    # Registry
    'ScorerRegistry', 'registry',
    'register_scorer', 'register_reference',
    'get_scorer', 'get_reference',
    'create_scorer', 'create_reference',
    'list_scorers', 'list_references',
    # Configuration
    'ScorerConfig', 'PRESETS',
    'get_preset', 'list_presets',
    # Implementations
    'SwissProtReference',
    # ML scorer
    'MLPScorer',
]
weirdo/scorers/base.py ADDED
@@ -0,0 +1,223 @@
1
+ """Base classes for foreignness scorers.
2
+
3
+ Provides abstract base classes defining the scorer interface,
4
+ following sklearn-style fit/score patterns.
5
+ """
6
+
7
+ from abc import ABC, abstractmethod
8
+ from typing import Any, Dict, List, Optional, Sequence, Union
9
+
10
+ import numpy as np
11
+
12
+
13
class BaseScorer(ABC):
    """Abstract interface shared by all foreignness scorers.

    The life cycle mirrors sklearn estimators:

    1. construct the scorer with its configuration as keyword arguments,
    2. ``fit()`` it to a reference dataset,
    3. ``score()`` new peptides.

    Example
    -------
    >>> scorer = MyScorer(k=8, aggregate='mean')
    >>> scorer.fit(reference)
    >>> scores = scorer.score(['MTMDKSEL', 'ACDEFGHI'])
    """

    def __init__(self, **params):
        """Store the configuration and start in the unfitted state.

        Parameters
        ----------
        **params : dict
            Scorer-specific configuration parameters.
        """
        self._reference = None
        self._is_fitted = False
        self._params = params

    @abstractmethod
    def fit(self, reference: Any) -> 'BaseScorer':
        """Fit the scorer to a reference dataset.

        Parameters
        ----------
        reference : BaseReference
            Reference dataset providing k-mer frequencies or other data.

        Returns
        -------
        self : BaseScorer
            The scorer itself, so calls can be chained.
        """

    @abstractmethod
    def score(self, peptides: Union[str, Sequence[str]]) -> np.ndarray:
        """Score peptide(s) for foreignness.

        Parameters
        ----------
        peptides : str or sequence of str
            A single peptide or a collection of peptides.

        Returns
        -------
        scores : np.ndarray
            Foreignness scores, shape ``(n_peptides,)``.
            Higher = more foreign.
        """

    def fit_score(self, reference: Any, peptides: Union[str, Sequence[str]]) -> np.ndarray:
        """Convenience wrapper: ``fit(reference)`` followed by ``score(peptides)``.

        Parameters
        ----------
        reference : BaseReference
            Reference dataset to fit.
        peptides : str or sequence of str
            Peptides to score.

        Returns
        -------
        scores : np.ndarray
            Foreignness scores.
        """
        self.fit(reference)
        scores = self.score(peptides)
        return scores

    def get_params(self, deep: bool = True) -> Dict[str, Any]:
        """Return a shallow copy of the scorer's parameters.

        Parameters
        ----------
        deep : bool, default=True
            Accepted for sklearn compatibility; this implementation does
            not use it.

        Returns
        -------
        params : dict
            Parameter names mapped to their values.
        """
        return dict(self._params)

    def set_params(self, **params) -> 'BaseScorer':
        """Update parameters and mark the scorer as unfitted.

        Parameters
        ----------
        **params : dict
            Scorer parameters to update.

        Returns
        -------
        self : BaseScorer
            The scorer itself, so calls can be chained.
        """
        # A previous fit may no longer match the new configuration.
        self._is_fitted = False
        self._params.update(params)
        return self

    @property
    def is_fitted(self) -> bool:
        """Whether the scorer is currently marked as fitted."""
        return self._is_fitted

    def _check_is_fitted(self) -> None:
        """Raise ``RuntimeError`` unless the scorer is marked as fitted."""
        if self._is_fitted:
            return
        class_name = self.__class__.__name__
        raise RuntimeError(
            f"{class_name} is not fitted. "
            "Call fit() before score()."
        )

    def _ensure_list(self, peptides: Union[str, Sequence[str]]) -> List[str]:
        """Return ``peptides`` as a list, wrapping a lone string."""
        return [peptides] if isinstance(peptides, str) else list(peptides)
141
+
142
+
143
class BatchScorer(BaseScorer):
    """Base class for scorers that score large peptide sets in chunks.

    Adds ``score_batch()`` on top of BaseScorer. It slices the input into
    chunks of ``batch_size`` peptides and passes each chunk to
    ``_score_batch_impl()``, which subclasses may override with a
    vectorized implementation.
    """

    def __init__(self, batch_size: int = 10000, **params):
        """Initialize batch scorer.

        Parameters
        ----------
        batch_size : int, default=10000
            Number of peptides to process per batch.
        **params : dict
            Additional scorer parameters.
        """
        super().__init__(**params)
        self._params['batch_size'] = batch_size

    @property
    def batch_size(self) -> int:
        """Number of peptides per batch (10000 if the parameter is absent)."""
        return self._params.get('batch_size', 10000)

    def score_batch(
        self,
        peptides: Sequence[str],
        show_progress: bool = False
    ) -> np.ndarray:
        """Score peptides in batches for memory efficiency.

        Parameters
        ----------
        peptides : sequence of str
            Peptides to score.
        show_progress : bool, default=False
            If True, show a progress bar. This needs tqdm; when tqdm is
            not installed, scoring proceeds without a progress bar.

        Returns
        -------
        scores : np.ndarray
            Foreignness scores, one per peptide, in input order.

        Raises
        ------
        RuntimeError
            If the scorer has not been fitted.
        ValueError
            If ``batch_size`` is not positive.
        """
        self._check_is_fitted()

        # Read once so every slice below uses the same step.
        batch_size = self.batch_size
        # A negative step would make range() empty and silently return
        # all-zero scores; reject it up front, like a zero step.
        if batch_size <= 0:
            raise ValueError(
                f"batch_size must be a positive integer, got {batch_size!r}"
            )

        peptides = self._ensure_list(peptides)
        n_peptides = len(peptides)
        scores = np.zeros(n_peptides)

        # Start offsets of the batches.
        batches = range(0, n_peptides, batch_size)
        if show_progress:
            try:
                from tqdm import tqdm
                batches = tqdm(batches, desc="Scoring", unit="batch")
            except ImportError:
                # The progress bar is optional; continue without it.
                pass

        for start in batches:
            batch = peptides[start:start + batch_size]
            scores[start:start + len(batch)] = self._score_batch_impl(batch)

        return scores

    def _score_batch_impl(self, batch: List[str]) -> np.ndarray:
        """Score one batch of peptides.

        Override this for efficient vectorized scoring. The default
        implementation passes the whole batch to ``score()`` in a single
        call, so a subclass whose ``score()`` delegates to
        ``score_batch()`` must override this method to avoid infinite
        recursion.

        Parameters
        ----------
        batch : list of str
            Batch of peptides to score.

        Returns
        -------
        scores : np.ndarray
            Scores for the batch.
        """
        return self.score(batch)