weirdo 2.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,265 @@
1
+ """Base classes for reference datasets.
2
+
3
+ Provides abstract base classes for loading and querying
4
+ reference k-mer data (e.g., from SwissProt).
5
+ """
6
+
7
+ from abc import ABC, abstractmethod
8
+ from typing import Dict, Iterator, List, Optional, Set, Tuple, Union
9
+
10
+
11
class BaseReference(ABC):
    """Abstract base class for reference datasets.

    A reference dataset provides k-mer presence or frequency information
    used by scorers to compute foreignness scores.

    Subclasses must implement ``load``, ``contains``, ``get_frequency``,
    ``get_categories`` and ``iter_kmers``.

    Notes
    -----
    ``__len__`` is defined here as a stub that raises
    ``NotImplementedError``. Because Python falls back to ``__len__`` for
    truth testing, ``bool(ref)`` / ``if ref:`` also raises until a subclass
    overrides ``__len__``; compare against ``None`` explicitly instead.

    Example
    -------
    >>> ref = MyReference(categories=['human'])
    >>> ref.load()
    >>> ref.contains('MTMDKSEL')
    True
    >>> ref.get_frequency('MTMDKSEL')
    1.0
    """

    def __init__(
        self,
        categories: Optional[List[str]] = None,
        k: int = 8,
        **kwargs
    ):
        """Initialize reference.

        Parameters
        ----------
        categories : list of str, optional
            Filter to specific organism categories.
            If None, use all available categories. The sequence is copied,
            so later changes to the caller's list do not alter the filter.
            A single string is treated as one category name.
        k : int, default=8
            K-mer size expected in this reference.
        **kwargs : dict
            Additional implementation-specific parameters.
        """
        self._categories: Optional[List[str]]
        if categories is None:
            self._categories = None
        elif isinstance(categories, str):
            # list('human') would split the name into characters.
            self._categories = [categories]
        else:
            # Copy so the stored filter is not aliased to the caller's list.
            self._categories = list(categories)
        self._k = k
        self._is_loaded = False
        self._kwargs = kwargs

    @abstractmethod
    def load(self) -> 'BaseReference':
        """Load reference data into memory.

        Returns
        -------
        self : BaseReference
            Returns self for method chaining.
        """
        pass

    @abstractmethod
    def contains(self, kmer: str) -> bool:
        """Check if k-mer exists in reference.

        Parameters
        ----------
        kmer : str
            K-mer sequence to look up.

        Returns
        -------
        exists : bool
            True if k-mer is in reference.
        """
        pass

    @abstractmethod
    def get_frequency(self, kmer: str, default: float = 0.0) -> float:
        """Get frequency of k-mer in reference.

        Parameters
        ----------
        kmer : str
            K-mer sequence to look up.
        default : float, default=0.0
            Value to return if k-mer not found.

        Returns
        -------
        frequency : float
            Frequency or presence of k-mer (0.0 to 1.0) or default if not found.
        """
        pass

    @abstractmethod
    def get_categories(self) -> List[str]:
        """Get list of available organism categories.

        Returns
        -------
        categories : list of str
            Available category names (e.g., ['human', 'bacteria']).
        """
        pass

    @abstractmethod
    def iter_kmers(self) -> Iterator[str]:
        """Iterate over all k-mers in reference.

        Yields
        ------
        kmer : str
            Each k-mer sequence in the reference.
        """
        pass

    @property
    def k(self) -> int:
        """Get k-mer size."""
        return self._k

    @property
    def categories(self) -> Optional[List[str]]:
        """Get filtered categories, or None for all."""
        return self._categories

    @property
    def is_loaded(self) -> bool:
        """Check if reference data is loaded."""
        return self._is_loaded

    def _check_is_loaded(self) -> None:
        """Raise error if reference is not loaded.

        Raises
        ------
        RuntimeError
            If the ``_is_loaded`` flag is still False.
        """
        if not self._is_loaded:
            raise RuntimeError(
                f"{self.__class__.__name__} is not loaded. "
                "Call load() first."
            )

    def __len__(self) -> int:
        """Return number of k-mers in reference.

        Raises
        ------
        NotImplementedError
            Always, in this base class; subclasses supply the real count.
        """
        raise NotImplementedError("Subclass must implement __len__")

    def __contains__(self, kmer: str) -> bool:
        """Support 'in' operator by delegating to ``contains``."""
        return self.contains(kmer)
147
+
148
+
149
class StreamingReference(BaseReference):
    """Base class for references that support streaming operations.

    Extends BaseReference with methods for memory-efficient
    iteration over large datasets.

    In addition to the abstract methods of ``BaseReference``, subclasses
    must implement ``iter_kmers_with_counts`` and
    ``iter_kmers_with_categories``.
    """

    def __init__(
        self,
        categories: Optional[List[str]] = None,
        k: int = 8,
        lazy: bool = False,
        use_set: bool = False,
        **kwargs
    ):
        """Initialize streaming reference.

        Parameters
        ----------
        categories : list of str, optional
            Filter to specific organism categories.
        k : int, default=8
            K-mer size.
        lazy : bool, default=False
            If True, don't load data into memory; stream from disk.
        use_set : bool, default=False
            If True, only track k-mer presence (not frequencies).
            Reduces memory by ~50% but loses frequency information.
        **kwargs : dict
            Additional parameters.
        """
        super().__init__(categories=categories, k=k, **kwargs)
        self._lazy = lazy
        self._use_set = use_set

    @property
    def lazy(self) -> bool:
        """Check if using lazy (streaming) mode."""
        return self._lazy

    @property
    def use_set(self) -> bool:
        """Check if only tracking presence (no frequencies)."""
        return self._use_set

    @abstractmethod
    def iter_kmers_with_counts(self) -> Iterator[Tuple[str, int]]:
        """Iterate over k-mers with their counts.

        Yields
        ------
        kmer : str
            K-mer sequence.
        count : int
            Number of times k-mer appears in reference.
        """
        pass

    @abstractmethod
    def iter_kmers_with_categories(
        self
    ) -> Iterator[Tuple[str, Dict[str, bool]]]:
        """Iterate over k-mers with category presence.

        Yields
        ------
        kmer : str
            K-mer sequence.
        categories : dict
            Mapping of category name to presence (True/False).
        """
        pass

    def sample_kmers(
        self,
        n: int,
        seed: Optional[int] = None
    ) -> List[str]:
        """Sample random k-mers from reference.

        The default implementation collects every k-mer from
        ``iter_kmers()`` into a list and samples from it without
        replacement; subclasses can override for memory-efficient sampling.

        Parameters
        ----------
        n : int
            Number of k-mers to sample. Values larger than the number of
            available k-mers are capped to that number.
        seed : int, optional
            Random seed for reproducibility. A seeded call draws from a
            private generator, so it does not reseed the global ``random``
            module state. When None, the global generator is used.

        Returns
        -------
        kmers : list of str
            Sampled k-mer sequences.
        """
        import random

        # A private Random(seed) produces the same draw as seeding the module
        # generator with that seed, without clobbering RNG state that other
        # code in the process may depend on.
        if seed is None:
            draw = random.sample
        else:
            draw = random.Random(seed).sample

        all_kmers = list(self.iter_kmers())
        return draw(all_kmers, min(n, len(all_kmers)))

    def get_kmers_for_category(self, category: str) -> Iterator[str]:
        """Get k-mers present in a specific category.

        Parameters
        ----------
        category : str
            Category name to filter by.

        Yields
        ------
        kmer : str
            K-mers present in the specified category. A category missing
            from a k-mer's mapping counts as absent.
        """
        for kmer, cats in self.iter_kmers_with_categories():
            if cats.get(category, False):
                yield kmer
@@ -0,0 +1,282 @@
1
+ """Registry for scorer and reference implementations.
2
+
3
+ Provides a plugin-style registration system for scorers and references.
4
+ """
5
+
6
+ from typing import Any, Callable, Dict, List, Optional, Type
7
+
8
+ from .base import BaseScorer
9
+ from .reference import BaseReference
10
+
11
+
12
class ScorerRegistry:
    """Registry for scorer and reference implementations.

    Provides decorator-based registration and factory methods
    for creating scorer instances.

    Scorers and references live in two separate tables. Each table maps
    both the canonical name and every alias to the same entry dict with
    keys ``'class'``, ``'name'``, ``'description'`` and ``'aliases'``.
    Registering a name or alias that already exists replaces the earlier
    mapping for that key.

    Example
    -------
    >>> from weirdo.scorers import registry, BaseScorer
    >>>
    >>> @registry.register_scorer('my_scorer', description='My custom scorer')
    ... class MyScorer(BaseScorer):
    ...     def fit(self, reference): ...
    ...     def score(self, peptides): ...
    >>>
    >>> scorer = registry.create_scorer('my_scorer', k=8)
    """

    # Annotations naming BaseScorer / BaseReference are written as strings
    # so they are not evaluated when the class body runs.

    def __init__(self):
        self._scorers: Dict[str, Dict[str, Any]] = {}
        self._references: Dict[str, Dict[str, Any]] = {}

    @staticmethod
    def _make_decorator(
        table: Dict[str, Dict[str, Any]],
        name: str,
        description: str,
        aliases: Optional[List[str]],
    ) -> Callable[[type], type]:
        """Build a class decorator that records ``cls`` in ``table``."""
        # Snapshot the aliases so later changes to the caller's list cannot
        # alter what gets registered.
        alias_names = list(aliases) if aliases else []

        def decorator(cls: type) -> type:
            entry = {
                'class': cls,
                'name': name,
                'description': description,
                'aliases': list(alias_names),
            }
            table[name] = entry
            for alias in alias_names:
                table[alias] = entry
            return cls

        return decorator

    @staticmethod
    def _canonical_names(table: Dict[str, Dict[str, Any]]) -> List[str]:
        """Return the sorted, de-duplicated canonical names in ``table``."""
        return sorted({entry['name'] for entry in table.values()})

    def _lookup(
        self,
        table: Dict[str, Dict[str, Any]],
        kind: str,
        name: str,
    ) -> Dict[str, Any]:
        """Return the entry for ``name`` or raise a descriptive KeyError."""
        try:
            return table[name]
        except KeyError:
            available = self._canonical_names(table)
            raise KeyError(
                f"Unknown {kind} '{name}'. Available: {available}"
            ) from None

    @staticmethod
    def _public_info(entry: Dict[str, Any]) -> Dict[str, Any]:
        """Copy ``entry`` so callers cannot mutate the registry through it."""
        info = dict(entry)
        # dict() is shallow; the alias list needs its own copy.
        info['aliases'] = list(entry['aliases'])
        return info

    def register_scorer(
        self,
        name: str,
        description: str = '',
        aliases: Optional[List[str]] = None
    ) -> Callable[[Type['BaseScorer']], Type['BaseScorer']]:
        """Decorator to register a scorer class.

        Parameters
        ----------
        name : str
            Unique identifier for the scorer.
        description : str, optional
            Human-readable description.
        aliases : list of str, optional
            Alternative names for the scorer.

        Returns
        -------
        decorator : callable
            Decorator function that registers the class and returns it
            unchanged.

        Example
        -------
        >>> @registry.register_scorer('mlp', description='MLP-based scoring')
        ... class MLPScorer(BaseScorer):
        ...     pass
        """
        return self._make_decorator(self._scorers, name, description, aliases)

    def register_reference(
        self,
        name: str,
        description: str = '',
        aliases: Optional[List[str]] = None
    ) -> Callable[[Type['BaseReference']], Type['BaseReference']]:
        """Decorator to register a reference class.

        Parameters
        ----------
        name : str
            Unique identifier for the reference.
        description : str, optional
            Human-readable description.
        aliases : list of str, optional
            Alternative names for the reference.

        Returns
        -------
        decorator : callable
            Decorator function that registers the class and returns it
            unchanged.
        """
        return self._make_decorator(
            self._references, name, description, aliases
        )

    def get_scorer(self, name: str) -> Type['BaseScorer']:
        """Get scorer class by name.

        Parameters
        ----------
        name : str
            Scorer name or alias.

        Returns
        -------
        scorer_class : type
            The scorer class.

        Raises
        ------
        KeyError
            If scorer name is not registered.
        """
        return self._lookup(self._scorers, 'scorer', name)['class']

    def get_reference(self, name: str) -> Type['BaseReference']:
        """Get reference class by name.

        Parameters
        ----------
        name : str
            Reference name or alias.

        Returns
        -------
        reference_class : type
            The reference class.

        Raises
        ------
        KeyError
            If reference name is not registered.
        """
        return self._lookup(self._references, 'reference', name)['class']

    def create_scorer(self, name: str, **params) -> 'BaseScorer':
        """Create scorer instance by name.

        Parameters
        ----------
        name : str
            Scorer name or alias.
        **params : dict
            Parameters to pass to scorer constructor.

        Returns
        -------
        scorer : BaseScorer
            Instantiated scorer.

        Raises
        ------
        KeyError
            If scorer name is not registered.
        """
        scorer_cls = self.get_scorer(name)
        return scorer_cls(**params)

    def create_reference(self, name: str, **params) -> 'BaseReference':
        """Create reference instance by name.

        Parameters
        ----------
        name : str
            Reference name or alias.
        **params : dict
            Parameters to pass to reference constructor.

        Returns
        -------
        reference : BaseReference
            Instantiated reference.

        Raises
        ------
        KeyError
            If reference name is not registered.
        """
        reference_cls = self.get_reference(name)
        return reference_cls(**params)

    def list_scorers(self) -> List[str]:
        """List registered scorer names (excluding aliases).

        Returns
        -------
        names : list of str
            Registered scorer names, sorted.
        """
        return self._canonical_names(self._scorers)

    def list_references(self) -> List[str]:
        """List registered reference names (excluding aliases).

        Returns
        -------
        names : list of str
            Registered reference names, sorted.
        """
        return self._canonical_names(self._references)

    def get_scorer_info(self, name: str) -> Dict[str, Any]:
        """Get metadata about a scorer.

        Parameters
        ----------
        name : str
            Scorer name or alias.

        Returns
        -------
        info : dict
            Scorer metadata (name, description, aliases, class). The dict
            and its alias list are copies; editing them leaves the
            registry untouched.

        Raises
        ------
        KeyError
            If scorer name is not registered.
        """
        return self._public_info(self._lookup(self._scorers, 'scorer', name))

    def get_reference_info(self, name: str) -> Dict[str, Any]:
        """Get metadata about a reference.

        Parameters
        ----------
        name : str
            Reference name or alias.

        Returns
        -------
        info : dict
            Reference metadata (name, description, aliases, class). The
            dict and its alias list are copies; editing them leaves the
            registry untouched.

        Raises
        ------
        KeyError
            If reference name is not registered.
        """
        return self._public_info(
            self._lookup(self._references, 'reference', name)
        )
+
270
+
271
+ # Global registry instance
272
+ registry = ScorerRegistry()
273
+
274
+ # Convenience functions that delegate to global registry
275
+ register_scorer = registry.register_scorer
276
+ register_reference = registry.register_reference
277
+ get_scorer = registry.get_scorer
278
+ get_reference = registry.get_reference
279
+ create_scorer = registry.create_scorer
280
+ create_reference = registry.create_reference
281
+ list_scorers = registry.list_scorers
282
+ list_references = registry.list_references