XspecT 0.5.1__py3-none-any.whl → 0.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of XspecT might be problematic. Click here for more details.
- xspect/classify.py +51 -38
- xspect/definitions.py +50 -10
- xspect/download_models.py +10 -2
- xspect/file_io.py +115 -48
- xspect/filter_sequences.py +36 -66
- xspect/main.py +41 -10
- xspect/mlst_feature/mlst_helper.py +3 -0
- xspect/mlst_feature/pub_mlst_handler.py +43 -1
- xspect/model_management.py +84 -14
- xspect/models/probabilistic_filter_mlst_model.py +75 -37
- xspect/models/probabilistic_filter_model.py +194 -12
- xspect/models/probabilistic_filter_svm_model.py +99 -6
- xspect/models/probabilistic_single_filter_model.py +66 -5
- xspect/models/result.py +77 -10
- xspect/ncbi.py +45 -10
- xspect/train.py +2 -1
- xspect/web.py +68 -12
- xspect/xspect-web/dist/assets/index-Ceo58xui.css +1 -0
- xspect/xspect-web/dist/assets/{index-CMG4V7fZ.js → index-Dt_UlbgE.js} +82 -77
- xspect/xspect-web/dist/index.html +2 -2
- xspect/xspect-web/src/App.tsx +4 -2
- xspect/xspect-web/src/api.tsx +23 -1
- xspect/xspect-web/src/components/filter-form.tsx +16 -3
- xspect/xspect-web/src/components/filtering-result.tsx +65 -0
- xspect/xspect-web/src/components/result.tsx +2 -2
- xspect/xspect-web/src/types.tsx +5 -0
- {xspect-0.5.1.dist-info → xspect-0.5.2.dist-info}/METADATA +1 -1
- {xspect-0.5.1.dist-info → xspect-0.5.2.dist-info}/RECORD +32 -31
- {xspect-0.5.1.dist-info → xspect-0.5.2.dist-info}/WHEEL +1 -1
- xspect/xspect-web/dist/assets/index-jIKg1HIy.css +0 -1
- {xspect-0.5.1.dist-info → xspect-0.5.2.dist-info}/entry_points.txt +0 -0
- {xspect-0.5.1.dist-info → xspect-0.5.2.dist-info}/licenses/LICENSE +0 -0
- {xspect-0.5.1.dist-info → xspect-0.5.2.dist-info}/top_level.txt +0 -0
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
import json
|
|
4
4
|
from math import ceil
|
|
5
5
|
from pathlib import Path
|
|
6
|
+
from typing import Any
|
|
6
7
|
from Bio.Seq import Seq
|
|
7
8
|
from Bio.SeqRecord import SeqRecord
|
|
8
9
|
from Bio import SeqIO
|
|
@@ -28,6 +29,25 @@ class ProbabilisticFilterModel:
|
|
|
28
29
|
num_hashes: int = 7,
|
|
29
30
|
training_accessions: dict[str, list[str]] | None = None,
|
|
30
31
|
) -> None:
|
|
32
|
+
"""
|
|
33
|
+
Initializes the probabilistic filter model.
|
|
34
|
+
|
|
35
|
+
This method sets up the model with the specified parameters, including the k-mer size,
|
|
36
|
+
display name, author information, model type, base path for storage, false positive rate,
|
|
37
|
+
number of hashes, and training accessions.
|
|
38
|
+
|
|
39
|
+
Args:
|
|
40
|
+
k (int): The size of the k-mers to be used in the model.
|
|
41
|
+
model_display_name (str): The display name of the model.
|
|
42
|
+
author (str | None): The name of the author of the model.
|
|
43
|
+
author_email (str | None): The email of the author of the model.
|
|
44
|
+
model_type (str): The type of the model.
|
|
45
|
+
base_path (Path): The base path where the model will be stored.
|
|
46
|
+
fpr (float): The false positive rate for the model. Default is 0.01.
|
|
47
|
+
num_hashes (int): The number of hashes to use in the model. Default is 7.
|
|
48
|
+
training_accessions (dict[str, list[str]] | None): A dictionary mapping filter IDs to
|
|
49
|
+
lists of accession numbers used for training the model. Default is None.
|
|
50
|
+
"""
|
|
31
51
|
if k < 1:
|
|
32
52
|
raise ValueError("Invalid k value, must be greater than 0")
|
|
33
53
|
if not model_display_name:
|
|
@@ -50,11 +70,27 @@ class ProbabilisticFilterModel:
|
|
|
50
70
|
self.training_accessions = training_accessions
|
|
51
71
|
|
|
52
72
|
def get_cobs_index_path(self) -> str:
|
|
53
|
-
"""
|
|
73
|
+
"""
|
|
74
|
+
Returns the path to the cobs index
|
|
75
|
+
|
|
76
|
+
This method constructs the path where the cobs index file will be stored,
|
|
77
|
+
based on the model's slug and the base path.
|
|
78
|
+
|
|
79
|
+
Returns:
|
|
80
|
+
str: The path to the cobs index file.
|
|
81
|
+
"""
|
|
54
82
|
return str(self.base_path / self.slug() / "index.cobs_classic")
|
|
55
83
|
|
|
56
84
|
def to_dict(self) -> dict:
|
|
57
|
-
"""
|
|
85
|
+
"""
|
|
86
|
+
Returns a dictionary representation of the model
|
|
87
|
+
|
|
88
|
+
This method includes all relevant attributes of the model, such as k-mer size,
|
|
89
|
+
display name, author information, model type, and other parameters.
|
|
90
|
+
|
|
91
|
+
Returns:
|
|
92
|
+
dict: A dictionary containing the model's attributes.
|
|
93
|
+
"""
|
|
58
94
|
return {
|
|
59
95
|
"model_slug": self.slug(),
|
|
60
96
|
"k": self.k,
|
|
@@ -70,7 +106,15 @@ class ProbabilisticFilterModel:
|
|
|
70
106
|
}
|
|
71
107
|
|
|
72
108
|
def slug(self) -> str:
|
|
73
|
-
"""
|
|
109
|
+
"""
|
|
110
|
+
Returns a slug representation of the model
|
|
111
|
+
|
|
112
|
+
This method generates a slug based on the model's display name and type,
|
|
113
|
+
which can be used for file naming or identification purposes.
|
|
114
|
+
|
|
115
|
+
Returns:
|
|
116
|
+
str: A slug representation of the model.
|
|
117
|
+
"""
|
|
74
118
|
return slugify(self.model_display_name + "-" + str(self.model_type))
|
|
75
119
|
|
|
76
120
|
def fit(
|
|
@@ -79,7 +123,23 @@ class ProbabilisticFilterModel:
|
|
|
79
123
|
display_names: dict | None = None,
|
|
80
124
|
training_accessions: dict[str, list[str]] | None = None,
|
|
81
125
|
) -> None:
|
|
82
|
-
"""
|
|
126
|
+
"""
|
|
127
|
+
Adds filters to the model
|
|
128
|
+
|
|
129
|
+
This method constructs the model's index from sequence files in the specified directory.
|
|
130
|
+
It reads files with specified extensions (fasta and fastq), constructs a document list,
|
|
131
|
+
and builds a cobs index for efficient searching.
|
|
132
|
+
|
|
133
|
+
Args:
|
|
134
|
+
dir_path (Path): The directory containing sequence files to be indexed.
|
|
135
|
+
display_names (dict | None): A dictionary mapping file names to display names.
|
|
136
|
+
If None, uses file names as display names.
|
|
137
|
+
training_accessions (dict[str, list[str]] | None): A dictionary mapping filter IDs to
|
|
138
|
+
lists of accession numbers used for training the model. If None, no training accessions
|
|
139
|
+
are set.
|
|
140
|
+
Raises:
|
|
141
|
+
ValueError: If the directory path is invalid, does not exist, or is not a directory.
|
|
142
|
+
"""
|
|
83
143
|
|
|
84
144
|
if display_names is None:
|
|
85
145
|
display_names = {}
|
|
@@ -125,8 +185,26 @@ class ProbabilisticFilterModel:
|
|
|
125
185
|
def calculate_hits(
|
|
126
186
|
self, sequence: Seq, filter_ids: list[str] | None = None, step: int = 1
|
|
127
187
|
) -> dict:
|
|
128
|
-
"""
|
|
129
|
-
|
|
188
|
+
"""
|
|
189
|
+
Calculates the hits for a sequence
|
|
190
|
+
|
|
191
|
+
This method searches the model's index for the given sequence and returns a dictionary
|
|
192
|
+
of filter IDs and their corresponding scores. If filter_ids is provided, it filters the
|
|
193
|
+
results to only include those IDs.
|
|
194
|
+
|
|
195
|
+
Args:
|
|
196
|
+
sequence (Seq): The sequence to search for in the model's index.
|
|
197
|
+
filter_ids (list[str] | None): A list of filter IDs to filter the results. If None,
|
|
198
|
+
all results are returned.
|
|
199
|
+
step (int): The step size for the k-mer search. Default is 1.
|
|
200
|
+
|
|
201
|
+
Returns:
|
|
202
|
+
dict: A dictionary where keys are filter IDs and values are scores for the sequence.
|
|
203
|
+
|
|
204
|
+
Raises:
|
|
205
|
+
ValueError: If the sequence is not a valid Bio.Seq or Bio.SeqRecord object,
|
|
206
|
+
if the sequence length is not greater than k, or if the input is invalid.
|
|
207
|
+
"""
|
|
130
208
|
if not isinstance(sequence, (Seq)):
|
|
131
209
|
raise ValueError(
|
|
132
210
|
"Invalid sequence, must be a Bio.Seq or a Bio.SeqRecord object"
|
|
@@ -153,7 +231,30 @@ class ProbabilisticFilterModel:
|
|
|
153
231
|
filter_ids: list[str] = None,
|
|
154
232
|
step: int = 1,
|
|
155
233
|
) -> ModelResult:
|
|
156
|
-
"""
|
|
234
|
+
"""
|
|
235
|
+
Returns a model result object for the sequence(s) based on the filters in the model
|
|
236
|
+
|
|
237
|
+
This method processes the input sequence(s) and calculates hits against the model's index.
|
|
238
|
+
It supports various input types, including single sequences, lists of sequences,
|
|
239
|
+
SeqIO iterators, and file paths. The results are returned as a ModelResult object.
|
|
240
|
+
|
|
241
|
+
Args:
|
|
242
|
+
sequence_input (SeqRecord | list[SeqRecord] | SeqIO.FastaIO.FastaIterator |
|
|
243
|
+
SeqIO.QualityIO.FastqPhredIterator | Path):
|
|
244
|
+
The input sequence(s) to be processed. Can be a single SeqRecord, a list of
|
|
245
|
+
SeqRecords, a SeqIO iterator, or a Path to a fasta/fastq file.
|
|
246
|
+
filter_ids (list[str]): A list of filter IDs to filter the results. If None,
|
|
247
|
+
all results are returned.
|
|
248
|
+
step (int): The step size for the k-mer search. Default is 1.
|
|
249
|
+
|
|
250
|
+
Returns:
|
|
251
|
+
ModelResult: An object containing the hits for each sequence, the number of kmers,
|
|
252
|
+
and the sparse sampling step.
|
|
253
|
+
|
|
254
|
+
Raises:
|
|
255
|
+
ValueError: If the input sequence is not valid, i.e. if it is not a SeqRecord object,
|
|
256
|
+
a list of SeqRecord objects, a SeqIO iterator, or a Path object to a fasta/fastq file.
|
|
257
|
+
"""
|
|
157
258
|
if isinstance(sequence_input, (SeqRecord)):
|
|
158
259
|
return ProbabilisticFilterModel.predict(
|
|
159
260
|
self, [sequence_input], filter_ids, step=step
|
|
@@ -186,7 +287,14 @@ class ProbabilisticFilterModel:
|
|
|
186
287
|
)
|
|
187
288
|
|
|
188
289
|
def save(self) -> None:
|
|
189
|
-
"""
|
|
290
|
+
"""
|
|
291
|
+
Saves the model to disk
|
|
292
|
+
|
|
293
|
+
This method serializes the model's attributes to a JSON file and creates a directory
|
|
294
|
+
for the model based on its slug. The JSON file contains all relevant information about
|
|
295
|
+
the model, including k-mer size, display name, author information, model type, and
|
|
296
|
+
other parameters. The directory structure is created if it does not already exist.
|
|
297
|
+
"""
|
|
190
298
|
json_path = self.base_path / f"{self.slug()}.json"
|
|
191
299
|
filter_path = self.base_path / self.slug()
|
|
192
300
|
filter_path.mkdir(exist_ok=True, parents=True)
|
|
@@ -198,7 +306,23 @@ class ProbabilisticFilterModel:
|
|
|
198
306
|
|
|
199
307
|
@staticmethod
|
|
200
308
|
def load(path: Path) -> "ProbabilisticFilterModel":
|
|
201
|
-
"""
|
|
309
|
+
"""
|
|
310
|
+
Loads the model from a file
|
|
311
|
+
|
|
312
|
+
This static method reads a JSON file containing the model's attributes and constructs
|
|
313
|
+
a ProbabilisticFilterModel object. It also checks for the existence of the cobs index file
|
|
314
|
+
and initializes the index if it exists.
|
|
315
|
+
|
|
316
|
+
Args:
|
|
317
|
+
path (Path): The path to the JSON file containing the model's attributes.
|
|
318
|
+
|
|
319
|
+
Returns:
|
|
320
|
+
ProbabilisticFilterModel: An instance of the ProbabilisticFilterModel class
|
|
321
|
+
initialized with the attributes from the JSON file.
|
|
322
|
+
|
|
323
|
+
Raises:
|
|
324
|
+
FileNotFoundError: If the JSON file or the cobs index file does not exist.
|
|
325
|
+
"""
|
|
202
326
|
with open(path, "r", encoding="utf-8") as file:
|
|
203
327
|
json_object = file.read()
|
|
204
328
|
model_json = json.loads(json_object)
|
|
@@ -223,6 +347,18 @@ class ProbabilisticFilterModel:
|
|
|
223
347
|
return model
|
|
224
348
|
|
|
225
349
|
def _convert_cobs_result_to_dict(self, cobs_result: cobs.SearchResult) -> dict:
|
|
350
|
+
"""
|
|
351
|
+
Converts a cobs SearchResult to a dictionary
|
|
352
|
+
|
|
353
|
+
This method takes a cobs SearchResult object and converts it into a dictionary
|
|
354
|
+
where the keys are document names and the values are their corresponding scores.
|
|
355
|
+
|
|
356
|
+
Args:
|
|
357
|
+
cobs_result (cobs.SearchResult): The result object from a cobs search.
|
|
358
|
+
|
|
359
|
+
Returns:
|
|
360
|
+
dict: A dictionary mapping document names to their scores.
|
|
361
|
+
"""
|
|
226
362
|
return {
|
|
227
363
|
individual_result.doc_name: individual_result.score
|
|
228
364
|
for individual_result in cobs_result
|
|
@@ -239,7 +375,27 @@ class ProbabilisticFilterModel:
|
|
|
239
375
|
),
|
|
240
376
|
step: int = 1,
|
|
241
377
|
) -> int:
|
|
242
|
-
"""
|
|
378
|
+
"""
|
|
379
|
+
Counts the number of kmers in the sequence(s)
|
|
380
|
+
|
|
381
|
+
This method calculates the number of k-mers in a given sequence or list of sequences.
|
|
382
|
+
It supports various input types, including single sequences, SeqRecords, lists of sequences,
|
|
383
|
+
and SeqIO iterators. The step size for the k-mer search can be specified.
|
|
384
|
+
|
|
385
|
+
Args:
|
|
386
|
+
sequence_input (Seq | SeqRecord | list[Seq] | SeqIO.FastaIO.FastaIterator |
|
|
387
|
+
SeqIO.QualityIO.FastqPhredIterator):
|
|
388
|
+
The input sequence(s) to count k-mers in. Can be a single Seq, a SeqRecord,
|
|
389
|
+
a list of Seq objects, or a SeqIO iterator.
|
|
390
|
+
step (int): The step size for the k-mer search. Default is 1.
|
|
391
|
+
|
|
392
|
+
Returns:
|
|
393
|
+
int: The total number of k-mers in the input sequence(s).
|
|
394
|
+
|
|
395
|
+
Raises:
|
|
396
|
+
ValueError: If the input sequence is not valid, or if it is not a Seq object,
|
|
397
|
+
a SeqRecord, a list of Seq objects, or a SeqIO iterator.
|
|
398
|
+
"""
|
|
243
399
|
if isinstance(sequence_input, Seq):
|
|
244
400
|
return self._count_kmers([sequence_input], step=step)
|
|
245
401
|
|
|
@@ -268,12 +424,38 @@ class ProbabilisticFilterModel:
|
|
|
268
424
|
" SeqIO FastaIterator, or a SeqIO FastqPhredIterator"
|
|
269
425
|
)
|
|
270
426
|
|
|
271
|
-
def _is_sequence_list(self, sequence_input):
|
|
427
|
+
def _is_sequence_list(self, sequence_input: Any) -> bool:
|
|
428
|
+
"""
|
|
429
|
+
Checks if the input is a list of SeqRecord objects
|
|
430
|
+
|
|
431
|
+
This method verifies if the input is a list and that all elements in the list
|
|
432
|
+
are instances of SeqRecord. This is useful for ensuring that the input is a valid
|
|
433
|
+
collection of sequence records.
|
|
434
|
+
|
|
435
|
+
Args:
|
|
436
|
+
sequence_input (Any): The input to check.
|
|
437
|
+
|
|
438
|
+
Returns:
|
|
439
|
+
bool: True if the input is a list of SeqRecord objects, False otherwise.
|
|
440
|
+
"""
|
|
272
441
|
return isinstance(sequence_input, list) and all(
|
|
273
442
|
isinstance(seq, (SeqRecord)) for seq in sequence_input
|
|
274
443
|
)
|
|
275
444
|
|
|
276
|
-
def _is_sequence_iterator(self, sequence_input):
|
|
445
|
+
def _is_sequence_iterator(self, sequence_input: Any) -> bool:
|
|
446
|
+
"""
|
|
447
|
+
Checks if the input is a SeqIO iterator
|
|
448
|
+
|
|
449
|
+
This method verifies if the input is an instance of a SeqIO iterator, such as
|
|
450
|
+
FastaIterator or FastqPhredIterator. This is useful for ensuring that the input
|
|
451
|
+
is a valid sequence iterator that can be processed by the model.
|
|
452
|
+
|
|
453
|
+
Args:
|
|
454
|
+
sequence_input (Any): The input to check.
|
|
455
|
+
|
|
456
|
+
Returns:
|
|
457
|
+
bool: True if the input is a SeqIO iterator, False otherwise.
|
|
458
|
+
"""
|
|
277
459
|
return isinstance(
|
|
278
460
|
sequence_input,
|
|
279
461
|
(SeqIO.FastaIO.FastaIterator, SeqIO.QualityIO.FastqPhredIterator),
|
|
@@ -15,7 +15,13 @@ from xspect.models.result import ModelResult
|
|
|
15
15
|
|
|
16
16
|
|
|
17
17
|
class ProbabilisticFilterSVMModel(ProbabilisticFilterModel):
|
|
18
|
-
"""
|
|
18
|
+
"""
|
|
19
|
+
Probabilistic filter SVM model for sequence data
|
|
20
|
+
|
|
21
|
+
In addition to the standard probabilistic filter model, this model uses an SVM to predict
|
|
22
|
+
labels based on their scores and training data. It requires the `scikit-learn` library
|
|
23
|
+
to be installed.
|
|
24
|
+
"""
|
|
19
25
|
|
|
20
26
|
def __init__(
|
|
21
27
|
self,
|
|
@@ -32,6 +38,28 @@ class ProbabilisticFilterSVMModel(ProbabilisticFilterModel):
|
|
|
32
38
|
training_accessions: dict[str, list[str]] | None = None,
|
|
33
39
|
svm_accessions: dict[str, list[str]] | None = None,
|
|
34
40
|
) -> None:
|
|
41
|
+
"""
|
|
42
|
+
Initialize the SVM model with the given parameters.
|
|
43
|
+
|
|
44
|
+
In addition to the standard parameters, this model uses an SVM.
|
|
45
|
+
Therefore, it requires the `kernel` and `C` parameters to be set.
|
|
46
|
+
Furthermore, the `svm_accessions` parameter is used to store which accessions
|
|
47
|
+
are used for training the SVM.
|
|
48
|
+
|
|
49
|
+
Args:
|
|
50
|
+
k (int): The k-mer size for the probabilistic filter.
|
|
51
|
+
model_display_name (str): The display name of the model.
|
|
52
|
+
author (str | None): The author of the model.
|
|
53
|
+
author_email (str | None): The author's email address.
|
|
54
|
+
model_type (str): The type of the model.
|
|
55
|
+
base_path (Path): The base path where the model will be stored.
|
|
56
|
+
kernel (str): The kernel type for the SVM (e.g., 'linear', 'rbf').
|
|
57
|
+
c (float): Regularization parameter for the SVM.
|
|
58
|
+
fpr (float, optional): False positive rate for the probabilistic filter. Defaults to 0.01.
|
|
59
|
+
num_hashes (int, optional): Number of hashes for the probabilistic filter. Defaults to 7.
|
|
60
|
+
training_accessions (dict[str, list[str]] | None, optional): Accessions used for training the probabilistic filter. Defaults to None.
|
|
61
|
+
svm_accessions (dict[str, list[str]] | None, optional): Accessions used for training the SVM. Defaults to None.
|
|
62
|
+
"""
|
|
35
63
|
super().__init__(
|
|
36
64
|
k=k,
|
|
37
65
|
model_display_name=model_display_name,
|
|
@@ -48,6 +76,12 @@ class ProbabilisticFilterSVMModel(ProbabilisticFilterModel):
|
|
|
48
76
|
self.svm_accessions = svm_accessions
|
|
49
77
|
|
|
50
78
|
def to_dict(self) -> dict:
|
|
79
|
+
"""
|
|
80
|
+
Convert the model to a dictionary representation
|
|
81
|
+
|
|
82
|
+
Returns:
|
|
83
|
+
dict: A dictionary containing the model's parameters and state.
|
|
84
|
+
"""
|
|
51
85
|
return super().to_dict() | {
|
|
52
86
|
"kernel": self.kernel,
|
|
53
87
|
"C": self.c,
|
|
@@ -55,7 +89,13 @@ class ProbabilisticFilterSVMModel(ProbabilisticFilterModel):
|
|
|
55
89
|
}
|
|
56
90
|
|
|
57
91
|
def set_svm_params(self, kernel: str, c: float) -> None:
|
|
58
|
-
"""
|
|
92
|
+
"""
|
|
93
|
+
Set the parameters for the SVM
|
|
94
|
+
|
|
95
|
+
Args:
|
|
96
|
+
kernel (str): The kernel type for the SVM (e.g., 'linear', 'rbf').
|
|
97
|
+
c (float): Regularization parameter for the SVM.
|
|
98
|
+
"""
|
|
59
99
|
self.kernel = kernel
|
|
60
100
|
self.c = c
|
|
61
101
|
self.save()
|
|
@@ -69,7 +109,22 @@ class ProbabilisticFilterSVMModel(ProbabilisticFilterModel):
|
|
|
69
109
|
training_accessions: dict[str, list[str]] | None = None,
|
|
70
110
|
svm_accessions: dict[str, list[str]] | None = None,
|
|
71
111
|
) -> None:
|
|
72
|
-
"""
|
|
112
|
+
"""
|
|
113
|
+
Fit the SVM to the sequences and labels.
|
|
114
|
+
|
|
115
|
+
This method first trains the probabilistic filter model and then
|
|
116
|
+
calculates scores for the SVM training. It expects the sequences to be in
|
|
117
|
+
the specified directory and the SVM training sequences to be in the
|
|
118
|
+
specified SVM path. The scores are saved in a CSV file for later use.
|
|
119
|
+
|
|
120
|
+
Args:
|
|
121
|
+
dir_path (Path): The directory containing the training sequences.
|
|
122
|
+
svm_path (Path): The directory containing the SVM training sequences.
|
|
123
|
+
display_names (dict[str, str] | None): A mapping of accession IDs to display names.
|
|
124
|
+
svm_step (int): Step size for sparse sampling in SVM training.
|
|
125
|
+
training_accessions (dict[str, list[str]] | None): Accessions used for training the probabilistic filter.
|
|
126
|
+
svm_accessions (dict[str, list[str]] | None): Accessions used for training the SVM.
|
|
127
|
+
"""
|
|
73
128
|
|
|
74
129
|
# Since the SVM works with score data, we need to train
|
|
75
130
|
# the underlying data structure for score generation first
|
|
@@ -124,7 +179,21 @@ class ProbabilisticFilterSVMModel(ProbabilisticFilterModel):
|
|
|
124
179
|
filter_ids: list[str] = None,
|
|
125
180
|
step: int = 1,
|
|
126
181
|
) -> ModelResult:
|
|
127
|
-
"""
|
|
182
|
+
"""
|
|
183
|
+
Predict the labels of the sequences.
|
|
184
|
+
|
|
185
|
+
This method uses the SVM to predict labels based on the scores generated
|
|
186
|
+
from the sequences. It expects the sequences to be in a format compatible
|
|
187
|
+
with the probabilistic filter model, and it will return a `ModelResult`.
|
|
188
|
+
|
|
189
|
+
Args:
|
|
190
|
+
sequence_input (SeqRecord | list[SeqRecord] | SeqIO.FastaIO.FastaIterator | SeqIO.QualityIO.FastqPhredIterator | Path): The input sequences to predict.
|
|
191
|
+
filter_ids (list[str], optional): A list of IDs to filter the predictions. Defaults to None.
|
|
192
|
+
step (int, optional): Step size for sparse sampling. Defaults to 1.
|
|
193
|
+
|
|
194
|
+
Returns:
|
|
195
|
+
ModelResult: The result of the prediction containing hits, number of kmers, and the predicted label.
|
|
196
|
+
"""
|
|
128
197
|
# get scores and format them for the SVM
|
|
129
198
|
res = super().predict(sequence_input, filter_ids, step=step)
|
|
130
199
|
svm_scores = dict(sorted(res.get_scores()["total"].items()))
|
|
@@ -140,7 +209,19 @@ class ProbabilisticFilterSVMModel(ProbabilisticFilterModel):
|
|
|
140
209
|
)
|
|
141
210
|
|
|
142
211
|
def _get_svm(self, id_keys) -> SVC:
|
|
143
|
-
"""
|
|
212
|
+
"""
|
|
213
|
+
Get the SVM for the given id keys.
|
|
214
|
+
|
|
215
|
+
This method creates a new SVM and trains it on the scores read from the
|
|
216
|
+
scores CSV file. If `id_keys` is provided, it filters the
|
|
217
|
+
training data to only include those keys.
|
|
218
|
+
|
|
219
|
+
Args:
|
|
220
|
+
id_keys (list[str] | None): A list of IDs to filter the training data. If None, all data is used.
|
|
221
|
+
|
|
222
|
+
Returns:
|
|
223
|
+
SVC: The trained SVM model.
|
|
224
|
+
"""
|
|
144
225
|
svm = SVC(kernel=self.kernel, C=self.c)
|
|
145
226
|
# parse csv
|
|
146
227
|
with open(
|
|
@@ -160,7 +241,19 @@ class ProbabilisticFilterSVMModel(ProbabilisticFilterModel):
|
|
|
160
241
|
|
|
161
242
|
@staticmethod
|
|
162
243
|
def load(path: Path) -> "ProbabilisticFilterSVMModel":
|
|
163
|
-
"""
|
|
244
|
+
"""
|
|
245
|
+
Load the model from disk
|
|
246
|
+
|
|
247
|
+
Loads the model from the specified path. The path should point to a JSON file
|
|
248
|
+
containing the model's parameters and state. It also checks for the existence of
|
|
249
|
+
the COBS index file.
|
|
250
|
+
|
|
251
|
+
Args:
|
|
252
|
+
path (Path): The path to the model JSON file.
|
|
253
|
+
|
|
254
|
+
Returns:
|
|
255
|
+
ProbabilisticFilterSVMModel: The loaded model instance.
|
|
256
|
+
"""
|
|
164
257
|
with open(path, "r", encoding="utf-8") as file:
|
|
165
258
|
json_object = file.read()
|
|
166
259
|
model_json = json.loads(json_object)
|
|
@@ -14,7 +14,12 @@ from xspect.file_io import get_record_iterator
|
|
|
14
14
|
|
|
15
15
|
|
|
16
16
|
class ProbabilisticSingleFilterModel(ProbabilisticFilterModel):
|
|
17
|
-
"""
|
|
17
|
+
"""
|
|
18
|
+
Probabilistic filter model for sequence data, with a single filter
|
|
19
|
+
|
|
20
|
+
This model uses a Bloom filter to store k-mers from the training sequences. It is designed to
|
|
21
|
+
be used with a single filter, which is suitable e.g. for genus-level classification.
|
|
22
|
+
"""
|
|
18
23
|
|
|
19
24
|
def __init__(
|
|
20
25
|
self,
|
|
@@ -27,6 +32,21 @@ class ProbabilisticSingleFilterModel(ProbabilisticFilterModel):
|
|
|
27
32
|
fpr: float = 0.01,
|
|
28
33
|
training_accessions: list[str] | None = None,
|
|
29
34
|
) -> None:
|
|
35
|
+
"""Initialize probabilistic single filter model.
|
|
36
|
+
|
|
37
|
+
This model uses a Bloom filter to store k-mers from the training sequences. It is designed to
|
|
38
|
+
be used with a single filter, which is suitable e.g. for genus-level classification.
|
|
39
|
+
|
|
40
|
+
Args:
|
|
41
|
+
k (int): Length of the k-mers to use for filtering
|
|
42
|
+
model_display_name (str): Display name of the model
|
|
43
|
+
author (str | None): Author of the model
|
|
44
|
+
author_email (str | None): Email of the author
|
|
45
|
+
model_type (str): Type of the model, e.g. "probabilistic_single_filter"
|
|
46
|
+
base_path (Path): Base path where the model will be saved
|
|
47
|
+
fpr (float): False positive rate for the Bloom filter, default is 0.01
|
|
48
|
+
training_accessions (list[str] | None): List of accessions used for training, default is None
|
|
49
|
+
"""
|
|
30
50
|
super().__init__(
|
|
31
51
|
k=k,
|
|
32
52
|
model_display_name=model_display_name,
|
|
@@ -46,7 +66,17 @@ class ProbabilisticSingleFilterModel(ProbabilisticFilterModel):
|
|
|
46
66
|
display_name: str,
|
|
47
67
|
training_accessions: list[str] | None = None,
|
|
48
68
|
) -> None:
|
|
49
|
-
"""
|
|
69
|
+
"""
|
|
70
|
+
Fit the bloom filter to the sequences.
|
|
71
|
+
|
|
72
|
+
Trains the model by reading sequences from the provided file path,
|
|
73
|
+
generating k-mers, and adding them to the Bloom filter.
|
|
74
|
+
|
|
75
|
+
Args:
|
|
76
|
+
file_path (Path): Path to the file containing sequences in FASTA format
|
|
77
|
+
display_name (str): Display name for the model
|
|
78
|
+
training_accessions (list[str] | None): List of accessions used for training, default is None
|
|
79
|
+
"""
|
|
50
80
|
self.training_accessions = training_accessions
|
|
51
81
|
|
|
52
82
|
# estimate number of kmers
|
|
@@ -68,7 +98,18 @@ class ProbabilisticSingleFilterModel(ProbabilisticFilterModel):
|
|
|
68
98
|
def calculate_hits(
|
|
69
99
|
self, sequence: Seq | SeqRecord, filter_ids=None, step: int = 1
|
|
70
100
|
) -> dict:
|
|
71
|
-
"""
|
|
101
|
+
"""
|
|
102
|
+
Calculate the hits for the sequence
|
|
103
|
+
|
|
104
|
+
Calculates the number of k-mers in the sequence that are present in the Bloom filter.
|
|
105
|
+
|
|
106
|
+
Args:
|
|
107
|
+
sequence (Seq | SeqRecord): Sequence to calculate hits for, can be a Bio.Seq or Bio.SeqRecord object
|
|
108
|
+
filter_ids (list[str] | None): List of filter IDs to use, default is None
|
|
109
|
+
step (int): Step size for generating k-mers, default is 1
|
|
110
|
+
Returns:
|
|
111
|
+
dict: Dictionary with the display name as key and the number of hits as value
|
|
112
|
+
"""
|
|
72
113
|
if isinstance(sequence, SeqRecord):
|
|
73
114
|
sequence = sequence.seq
|
|
74
115
|
|
|
@@ -85,7 +126,17 @@ class ProbabilisticSingleFilterModel(ProbabilisticFilterModel):
|
|
|
85
126
|
|
|
86
127
|
@staticmethod
|
|
87
128
|
def load(path: Path) -> "ProbabilisticSingleFilterModel":
|
|
88
|
-
"""
|
|
129
|
+
"""
|
|
130
|
+
Load the model from disk
|
|
131
|
+
|
|
132
|
+
This method reads the model's JSON file and the associated Bloom filter file,
|
|
133
|
+
reconstructing the model instance.
|
|
134
|
+
|
|
135
|
+
Args:
|
|
136
|
+
path (Path): Path to the model's JSON file
|
|
137
|
+
Returns:
|
|
138
|
+
ProbabilisticSingleFilterModel: An instance of the model loaded from disk
|
|
139
|
+
"""
|
|
89
140
|
with open(path, "r", encoding="utf-8") as file:
|
|
90
141
|
json_object = file.read()
|
|
91
142
|
model_json = json.loads(json_object)
|
|
@@ -108,7 +159,17 @@ class ProbabilisticSingleFilterModel(ProbabilisticFilterModel):
|
|
|
108
159
|
return model
|
|
109
160
|
|
|
110
161
|
def _generate_kmers(self, sequence: Seq, step: int = 1):
|
|
111
|
-
"""
|
|
162
|
+
"""
|
|
163
|
+
Generate kmers from the sequence
|
|
164
|
+
|
|
165
|
+
Generates k-mers from the sequence, considering both the forward and reverse complement strands.
|
|
166
|
+
|
|
167
|
+
Args:
|
|
168
|
+
sequence (Seq): Sequence to generate k-mers from
|
|
169
|
+
step (int): Step size for generating k-mers, default is 1
|
|
170
|
+
Yields:
|
|
171
|
+
str: The canonical k-mer (the lexicographically smallest k-mer between the forward and reverse complement)
|
|
172
|
+
"""
|
|
112
173
|
num_kmers = ceil((len(sequence) - self.k + 1) / step)
|
|
113
174
|
for i in range(num_kmers):
|
|
114
175
|
start_pos = i * step
|