XspecT 0.5.1__py3-none-any.whl → 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of XspecT might be problematic. Click here for more details.

Files changed (33) hide show
  1. xspect/classify.py +51 -38
  2. xspect/definitions.py +50 -10
  3. xspect/download_models.py +10 -2
  4. xspect/file_io.py +115 -48
  5. xspect/filter_sequences.py +36 -66
  6. xspect/main.py +41 -10
  7. xspect/mlst_feature/mlst_helper.py +3 -0
  8. xspect/mlst_feature/pub_mlst_handler.py +43 -1
  9. xspect/model_management.py +84 -14
  10. xspect/models/probabilistic_filter_mlst_model.py +75 -37
  11. xspect/models/probabilistic_filter_model.py +194 -12
  12. xspect/models/probabilistic_filter_svm_model.py +99 -6
  13. xspect/models/probabilistic_single_filter_model.py +66 -5
  14. xspect/models/result.py +77 -10
  15. xspect/ncbi.py +45 -10
  16. xspect/train.py +2 -1
  17. xspect/web.py +68 -12
  18. xspect/xspect-web/dist/assets/index-Ceo58xui.css +1 -0
  19. xspect/xspect-web/dist/assets/{index-CMG4V7fZ.js → index-Dt_UlbgE.js} +82 -77
  20. xspect/xspect-web/dist/index.html +2 -2
  21. xspect/xspect-web/src/App.tsx +4 -2
  22. xspect/xspect-web/src/api.tsx +23 -1
  23. xspect/xspect-web/src/components/filter-form.tsx +16 -3
  24. xspect/xspect-web/src/components/filtering-result.tsx +65 -0
  25. xspect/xspect-web/src/components/result.tsx +2 -2
  26. xspect/xspect-web/src/types.tsx +5 -0
  27. {xspect-0.5.1.dist-info → xspect-0.5.2.dist-info}/METADATA +1 -1
  28. {xspect-0.5.1.dist-info → xspect-0.5.2.dist-info}/RECORD +32 -31
  29. {xspect-0.5.1.dist-info → xspect-0.5.2.dist-info}/WHEEL +1 -1
  30. xspect/xspect-web/dist/assets/index-jIKg1HIy.css +0 -1
  31. {xspect-0.5.1.dist-info → xspect-0.5.2.dist-info}/entry_points.txt +0 -0
  32. {xspect-0.5.1.dist-info → xspect-0.5.2.dist-info}/licenses/LICENSE +0 -0
  33. {xspect-0.5.1.dist-info → xspect-0.5.2.dist-info}/top_level.txt +0 -0
@@ -3,6 +3,7 @@
3
3
  import json
4
4
  from math import ceil
5
5
  from pathlib import Path
6
+ from typing import Any
6
7
  from Bio.Seq import Seq
7
8
  from Bio.SeqRecord import SeqRecord
8
9
  from Bio import SeqIO
@@ -28,6 +29,25 @@ class ProbabilisticFilterModel:
28
29
  num_hashes: int = 7,
29
30
  training_accessions: dict[str, list[str]] | None = None,
30
31
  ) -> None:
32
+ """
33
+ Initializes the probabilistic filter model.
34
+
35
+ This method sets up the model with the specified parameters, including the k-mer size,
36
+ display name, author information, model type, base path for storage, false positive rate,
37
+ number of hashes, and training accessions.
38
+
39
+ Args:
40
+ k (int): The size of the k-mers to be used in the model.
41
+ model_display_name (str): The display name of the model.
42
+ author (str | None): The name of the author of the model.
43
+ author_email (str | None): The email of the author of the model.
44
+ model_type (str): The type of the model.
45
+ base_path (Path): The base path where the model will be stored.
46
+ fpr (float): The false positive rate for the model. Default is 0.01.
47
+ num_hashes (int): The number of hashes to use in the model. Default is 7.
48
+ training_accessions (dict[str, list[str]] | None): A dictionary mapping filter IDs to
49
+ lists of accession numbers used for training the model. Default is None.
50
+ """
31
51
  if k < 1:
32
52
  raise ValueError("Invalid k value, must be greater than 0")
33
53
  if not model_display_name:
@@ -50,11 +70,27 @@ class ProbabilisticFilterModel:
50
70
  self.training_accessions = training_accessions
51
71
 
52
72
  def get_cobs_index_path(self) -> str:
53
- """Returns the path to the cobs index"""
73
+ """
74
+ Returns the path to the cobs index
75
+
76
+ This method constructs the path where the cobs index file will be stored,
77
+ based on the model's slug and the base path.
78
+
79
+ Returns:
80
+ str: The path to the cobs index file.
81
+ """
54
82
  return str(self.base_path / self.slug() / "index.cobs_classic")
55
83
 
56
84
  def to_dict(self) -> dict:
57
- """Returns a dictionary representation of the model"""
85
+ """
86
+ Returns a dictionary representation of the model
87
+
88
+ This method includes all relevant attributes of the model, such as k-mer size,
89
+ display name, author information, model type, and other parameters.
90
+
91
+ Returns:
92
+ dict: A dictionary containing the model's attributes.
93
+ """
58
94
  return {
59
95
  "model_slug": self.slug(),
60
96
  "k": self.k,
@@ -70,7 +106,15 @@ class ProbabilisticFilterModel:
70
106
  }
71
107
 
72
108
  def slug(self) -> str:
73
- """Returns a slug representation of the model"""
109
+ """
110
+ Returns a slug representation of the model
111
+
112
+ This method generates a slug based on the model's display name and type,
113
+ which can be used for file naming or identification purposes.
114
+
115
+ Returns:
116
+ str: A slug representation of the model.
117
+ """
74
118
  return slugify(self.model_display_name + "-" + str(self.model_type))
75
119
 
76
120
  def fit(
@@ -79,7 +123,23 @@ class ProbabilisticFilterModel:
79
123
  display_names: dict | None = None,
80
124
  training_accessions: dict[str, list[str]] | None = None,
81
125
  ) -> None:
82
- """Adds filters to the model"""
126
+ """
127
+ Adds filters to the model
128
+
129
+ This method constructs the model's index from sequence files in the specified directory.
130
+ It reads files with specified extensions (fasta and fastq), constructs a document list,
131
+ and builds a cobs index for efficient searching.
132
+
133
+ Args:
134
+ dir_path (Path): The directory containing sequence files to be indexed.
135
+ display_names (dict | None): A dictionary mapping file names to display names.
136
+ If None, uses file names as display names.
137
+ training_accessions (dict[str, list[str]] | None): A dictionary mapping filter IDs to
138
+ lists of accession numbers used for training the model. If None, no training accessions
139
+ are set.
140
+ Raises:
141
+ ValueError: If the directory path is invalid, does not exist, or is not a directory.
142
+ """
83
143
 
84
144
  if display_names is None:
85
145
  display_names = {}
@@ -125,8 +185,26 @@ class ProbabilisticFilterModel:
125
185
  def calculate_hits(
126
186
  self, sequence: Seq, filter_ids: list[str] | None = None, step: int = 1
127
187
  ) -> dict:
128
- """Calculates the hits for a sequence"""
129
-
188
+ """
189
+ Calculates the hits for a sequence
190
+
191
+ This method searches the model's index for the given sequence and returns a dictionary
192
+ of filter IDs and their corresponding scores. If filter_ids is provided, it filters the
193
+ results to only include those IDs.
194
+
195
+ Args:
196
+ sequence (Seq): The sequence to search for in the model's index.
197
+ filter_ids (list[str] | None): A list of filter IDs to filter the results. If None,
198
+ all results are returned.
199
+ step (int): The step size for the k-mer search. Default is 1.
200
+
201
+ Returns:
202
+ dict: A dictionary where keys are filter IDs and values are scores for the sequence.
203
+
204
+ Raises:
205
+ ValueError: If the sequence is not a Bio.Seq object (a SeqRecord is not accepted here),
206
+ if the sequence length is not greater than k, or if the input is invalid.
207
+ """
130
208
  if not isinstance(sequence, (Seq)):
131
209
  raise ValueError(
132
210
  "Invalid sequence, must be a Bio.Seq or a Bio.SeqRecord object"
@@ -153,7 +231,30 @@ class ProbabilisticFilterModel:
153
231
  filter_ids: list[str] = None,
154
232
  step: int = 1,
155
233
  ) -> ModelResult:
156
- """Returns scores for the sequence(s) based on the filters in the model"""
234
+ """
235
+ Returns a model result object for the sequence(s) based on the filters in the model
236
+
237
+ This method processes the input sequence(s) and calculates hits against the model's index.
238
+ It supports various input types, including single sequences, lists of sequences,
239
+ SeqIO iterators, and file paths. The results are returned as a ModelResult object.
240
+
241
+ Args:
242
+ sequence_input (SeqRecord | list[SeqRecord] | SeqIO.FastaIO.FastaIterator |
243
+ SeqIO.QualityIO.FastqPhredIterator | Path):
244
+ The input sequence(s) to be processed. Can be a single SeqRecord, a list of
245
+ SeqRecords, a SeqIO iterator, or a Path to a fasta/fastq file.
246
+ filter_ids (list[str]): A list of filter IDs to filter the results. If None,
247
+ all results are returned.
248
+ step (int): The step size for the k-mer search. Default is 1.
249
+
250
+ Returns:
251
+ ModelResult: An object containing the hits for each sequence, the number of kmers,
252
+ and the sparse sampling step.
253
+
254
+ Raises:
255
+ ValueError: If the input sequence is not valid, or if it is not a SeqRecord,
256
+ a list of SeqRecords, a SeqIO iterator, or a Path object to a fasta/fastq file.
257
+ """
157
258
  if isinstance(sequence_input, (SeqRecord)):
158
259
  return ProbabilisticFilterModel.predict(
159
260
  self, [sequence_input], filter_ids, step=step
@@ -186,7 +287,14 @@ class ProbabilisticFilterModel:
186
287
  )
187
288
 
188
289
  def save(self) -> None:
189
- """Saves the model to disk"""
290
+ """
291
+ Saves the model to disk
292
+
293
+ This method serializes the model's attributes to a JSON file and creates a directory
294
+ for the model based on its slug. The JSON file contains all relevant information about
295
+ the model, including k-mer size, display name, author information, model type, and
296
+ other parameters. The directory structure is created if it does not already exist.
297
+ """
190
298
  json_path = self.base_path / f"{self.slug()}.json"
191
299
  filter_path = self.base_path / self.slug()
192
300
  filter_path.mkdir(exist_ok=True, parents=True)
@@ -198,7 +306,23 @@ class ProbabilisticFilterModel:
198
306
 
199
307
  @staticmethod
200
308
  def load(path: Path) -> "ProbabilisticFilterModel":
201
- """Loads the model from a file"""
309
+ """
310
+ Loads the model from a file
311
+
312
+ This static method reads a JSON file containing the model's attributes and constructs
313
+ a ProbabilisticFilterModel object. It also checks for the existence of the cobs index file
314
+ and initializes the index if it exists.
315
+
316
+ Args:
317
+ path (Path): The path to the JSON file containing the model's attributes.
318
+
319
+ Returns:
320
+ ProbabilisticFilterModel: An instance of the ProbabilisticFilterModel class
321
+ initialized with the attributes from the JSON file.
322
+
323
+ Raises:
324
+ FileNotFoundError: If the JSON file or the cobs index file does not exist.
325
+ """
202
326
  with open(path, "r", encoding="utf-8") as file:
203
327
  json_object = file.read()
204
328
  model_json = json.loads(json_object)
@@ -223,6 +347,18 @@ class ProbabilisticFilterModel:
223
347
  return model
224
348
 
225
349
  def _convert_cobs_result_to_dict(self, cobs_result: cobs.SearchResult) -> dict:
350
+ """
351
+ Converts a cobs SearchResult to a dictionary
352
+
353
+ This method takes a cobs SearchResult object and converts it into a dictionary
354
+ where the keys are document names and the values are their corresponding scores.
355
+
356
+ Args:
357
+ cobs_result (cobs.SearchResult): The result object from a cobs search.
358
+
359
+ Returns:
360
+ dict: A dictionary mapping document names to their scores.
361
+ """
226
362
  return {
227
363
  individual_result.doc_name: individual_result.score
228
364
  for individual_result in cobs_result
@@ -239,7 +375,27 @@ class ProbabilisticFilterModel:
239
375
  ),
240
376
  step: int = 1,
241
377
  ) -> int:
242
- """Counts the number of kmers in the sequence(s)"""
378
+ """
379
+ Counts the number of kmers in the sequence(s)
380
+
381
+ This method calculates the number of k-mers in a given sequence or list of sequences.
382
+ It supports various input types, including single sequences, SeqRecords, lists of sequences,
383
+ and SeqIO iterators. The step size for the k-mer search can be specified.
384
+
385
+ Args:
386
+ sequence_input (Seq | SeqRecord | list[Seq] | SeqIO.FastaIO.FastaIterator |
387
+ SeqIO.QualityIO.FastqPhredIterator):
388
+ The input sequence(s) to count k-mers in. Can be a single Seq, a SeqRecord,
389
+ a list of Seq objects, or a SeqIO iterator.
390
+ step (int): The step size for the k-mer search. Default is 1.
391
+
392
+ Returns:
393
+ int: The total number of k-mers in the input sequence(s).
394
+
395
+ Raises:
396
+ ValueError: If the input sequence is not valid, or if it is not a Seq object,
397
+ a SeqRecord, a list of Seq objects, or a SeqIO iterator.
398
+ """
243
399
  if isinstance(sequence_input, Seq):
244
400
  return self._count_kmers([sequence_input], step=step)
245
401
 
@@ -268,12 +424,38 @@ class ProbabilisticFilterModel:
268
424
  " SeqIO FastaIterator, or a SeqIO FastqPhredIterator"
269
425
  )
270
426
 
271
- def _is_sequence_list(self, sequence_input):
427
+ def _is_sequence_list(self, sequence_input: Any) -> bool:
428
+ """
429
+ Checks if the input is a list of SeqRecord objects
430
+
431
+ This method verifies if the input is a list and that all elements in the list
432
+ are instances of SeqRecord. This is useful for ensuring that the input is a valid
433
+ collection of sequence records.
434
+
435
+ Args:
436
+ sequence_input (Any): The input to check.
437
+
438
+ Returns:
439
+ bool: True if the input is a list of SeqRecord objects, False otherwise.
440
+ """
272
441
  return isinstance(sequence_input, list) and all(
273
442
  isinstance(seq, (SeqRecord)) for seq in sequence_input
274
443
  )
275
444
 
276
- def _is_sequence_iterator(self, sequence_input):
445
+ def _is_sequence_iterator(self, sequence_input: Any) -> bool:
446
+ """
447
+ Checks if the input is a SeqIO iterator
448
+
449
+ This method verifies if the input is an instance of a SeqIO iterator, such as
450
+ FastaIterator or FastqPhredIterator. This is useful for ensuring that the input
451
+ is a valid sequence iterator that can be processed by the model.
452
+
453
+ Args:
454
+ sequence_input (Any): The input to check.
455
+
456
+ Returns:
457
+ bool: True if the input is a SeqIO iterator, False otherwise.
458
+ """
277
459
  return isinstance(
278
460
  sequence_input,
279
461
  (SeqIO.FastaIO.FastaIterator, SeqIO.QualityIO.FastqPhredIterator),
@@ -15,7 +15,13 @@ from xspect.models.result import ModelResult
15
15
 
16
16
 
17
17
  class ProbabilisticFilterSVMModel(ProbabilisticFilterModel):
18
- """Probabilistic filter SVM model for sequence data"""
18
+ """
19
+ Probabilistic filter SVM model for sequence data
20
+
21
+ In addition to the standard probabilistic filter model, this model uses an SVM to predict
22
+ labels based on their scores and training data. It requires the `scikit-learn` library
23
+ to be installed.
24
+ """
19
25
 
20
26
  def __init__(
21
27
  self,
@@ -32,6 +38,28 @@ class ProbabilisticFilterSVMModel(ProbabilisticFilterModel):
32
38
  training_accessions: dict[str, list[str]] | None = None,
33
39
  svm_accessions: dict[str, list[str]] | None = None,
34
40
  ) -> None:
41
+ """
42
+ Initialize the SVM model with the given parameters.
43
+
44
+ In addition to the standard parameters, this model uses an SVM.
45
+ Therefore, it requires the `kernel` and `C` parameters to be set.
46
+ Furthermore, the `svm_accessions` parameter is used to store which accessions
47
+ are used for training the SVM.
48
+
49
+ Args:
50
+ k (int): The k-mer size for the probabilistic filter.
51
+ model_display_name (str): The display name of the model.
52
+ author (str | None): The author of the model.
53
+ author_email (str | None): The author's email address.
54
+ model_type (str): The type of the model.
55
+ base_path (Path): The base path where the model will be stored.
56
+ kernel (str): The kernel type for the SVM (e.g., 'linear', 'rbf').
57
+ c (float): Regularization parameter for the SVM.
58
+ fpr (float, optional): False positive rate for the probabilistic filter. Defaults to 0.01.
59
+ num_hashes (int, optional): Number of hashes for the probabilistic filter. Defaults to 7.
60
+ training_accessions (dict[str, list[str]] | None, optional): Accessions used for training the probabilistic filter. Defaults to None.
61
+ svm_accessions (dict[str, list[str]] | None, optional): Accessions used for training the SVM. Defaults to None.
62
+ """
35
63
  super().__init__(
36
64
  k=k,
37
65
  model_display_name=model_display_name,
@@ -48,6 +76,12 @@ class ProbabilisticFilterSVMModel(ProbabilisticFilterModel):
48
76
  self.svm_accessions = svm_accessions
49
77
 
50
78
  def to_dict(self) -> dict:
79
+ """
80
+ Convert the model to a dictionary representation
81
+
82
+ Returns:
83
+ dict: A dictionary containing the model's parameters and state.
84
+ """
51
85
  return super().to_dict() | {
52
86
  "kernel": self.kernel,
53
87
  "C": self.c,
@@ -55,7 +89,13 @@ class ProbabilisticFilterSVMModel(ProbabilisticFilterModel):
55
89
  }
56
90
 
57
91
  def set_svm_params(self, kernel: str, c: float) -> None:
58
- """Set the parameters for the SVM"""
92
+ """
93
+ Set the parameters for the SVM
94
+
95
+ Args:
96
+ kernel (str): The kernel type for the SVM (e.g., 'linear', 'rbf').
97
+ c (float): Regularization parameter for the SVM.
98
+ """
59
99
  self.kernel = kernel
60
100
  self.c = c
61
101
  self.save()
@@ -69,7 +109,22 @@ class ProbabilisticFilterSVMModel(ProbabilisticFilterModel):
69
109
  training_accessions: dict[str, list[str]] | None = None,
70
110
  svm_accessions: dict[str, list[str]] | None = None,
71
111
  ) -> None:
72
- """Fit the SVM to the sequences and labels"""
112
+ """
113
+ Fit the SVM to the sequences and labels.
114
+
115
+ This method first trains the probabilistic filter model and then
116
+ calculates scores for the SVM training. It expects the sequences to be in
117
+ the specified directory and the SVM training sequences to be in the
118
+ specified SVM path. The scores are saved in a CSV file for later use.
119
+
120
+ Args:
121
+ dir_path (Path): The directory containing the training sequences.
122
+ svm_path (Path): The directory containing the SVM training sequences.
123
+ display_names (dict[str, str] | None): A mapping of accession IDs to display names.
124
+ svm_step (int): Step size for sparse sampling in SVM training.
125
+ training_accessions (dict[str, list[str]] | None): Accessions used for training the probabilistic filter.
126
+ svm_accessions (dict[str, list[str]] | None): Accessions used for training the SVM.
127
+ """
73
128
 
74
129
  # Since the SVM works with score data, we need to train
75
130
  # the underlying data structure for score generation first
@@ -124,7 +179,21 @@ class ProbabilisticFilterSVMModel(ProbabilisticFilterModel):
124
179
  filter_ids: list[str] = None,
125
180
  step: int = 1,
126
181
  ) -> ModelResult:
127
- """Predict the labels of the sequences"""
182
+ """
183
+ Predict the labels of the sequences.
184
+
185
+ This method uses the SVM to predict labels based on the scores generated
186
+ from the sequences. It expects the sequences to be in a format compatible
187
+ with the probabilistic filter model, and it will return a `ModelResult`.
188
+
189
+ Args:
190
+ sequence_input (SeqRecord | list[SeqRecord] | SeqIO.FastaIO.FastaIterator | SeqIO.QualityIO.FastqPhredIterator | Path): The input sequences to predict.
191
+ filter_ids (list[str], optional): A list of IDs to filter the predictions. Defaults to None.
192
+ step (int, optional): Step size for sparse sampling. Defaults to 1.
193
+
194
+ Returns:
195
+ ModelResult: The result of the prediction containing hits, number of kmers, and the predicted label.
196
+ """
128
197
  # get scores and format them for the SVM
129
198
  res = super().predict(sequence_input, filter_ids, step=step)
130
199
  svm_scores = dict(sorted(res.get_scores()["total"].items()))
@@ -140,7 +209,19 @@ class ProbabilisticFilterSVMModel(ProbabilisticFilterModel):
140
209
  )
141
210
 
142
211
  def _get_svm(self, id_keys) -> SVC:
143
- """Get the SVM for the given id keys"""
212
+ """
213
+ Get the SVM for the given id keys.
214
+
215
+ This method loads the SVM model from the scores CSV file and trains it
216
+ using the scores from the CSV. If `id_keys` is provided, it filters the
217
+ training data to only include those keys.
218
+
219
+ Args:
220
+ id_keys (list[str] | None): A list of IDs to filter the training data. If None, all data is used.
221
+
222
+ Returns:
223
+ SVC: The trained SVM model.
224
+ """
144
225
  svm = SVC(kernel=self.kernel, C=self.c)
145
226
  # parse csv
146
227
  with open(
@@ -160,7 +241,19 @@ class ProbabilisticFilterSVMModel(ProbabilisticFilterModel):
160
241
 
161
242
  @staticmethod
162
243
  def load(path: Path) -> "ProbabilisticFilterSVMModel":
163
- """Load the model from disk"""
244
+ """
245
+ Load the model from disk
246
+
247
+ Loads the model from the specified path. The path should point to a JSON file
248
+ containing the model's parameters and state. It also checks for the existence of
249
+ the COBS index file.
250
+
251
+ Args:
252
+ path (Path): The path to the model JSON file.
253
+
254
+ Returns:
255
+ ProbabilisticFilterSVMModel: The loaded model instance.
256
+ """
164
257
  with open(path, "r", encoding="utf-8") as file:
165
258
  json_object = file.read()
166
259
  model_json = json.loads(json_object)
@@ -14,7 +14,12 @@ from xspect.file_io import get_record_iterator
14
14
 
15
15
 
16
16
  class ProbabilisticSingleFilterModel(ProbabilisticFilterModel):
17
- """Base probabilistic filter model for sequence data"""
17
+ """
18
+ Probabilistic filter model for sequence data, with a single filter
19
+
20
+ This model uses a Bloom filter to store k-mers from the training sequences. It is designed to
21
+ be used with a single filter, which is suitable e.g. for genus-level classification.
22
+ """
18
23
 
19
24
  def __init__(
20
25
  self,
@@ -27,6 +32,21 @@ class ProbabilisticSingleFilterModel(ProbabilisticFilterModel):
27
32
  fpr: float = 0.01,
28
33
  training_accessions: list[str] | None = None,
29
34
  ) -> None:
35
+ """Initialize probabilistic single filter model.
36
+
37
+ This model uses a Bloom filter to store k-mers from the training sequences. It is designed to
38
+ be used with a single filter, which is suitable e.g. for genus-level classification.
39
+
40
+ Args:
41
+ k (int): Length of the k-mers to use for filtering
42
+ model_display_name (str): Display name of the model
43
+ author (str | None): Author of the model
44
+ author_email (str | None): Email of the author
45
+ model_type (str): Type of the model, e.g. "probabilistic_single_filter"
46
+ base_path (Path): Base path where the model will be saved
47
+ fpr (float): False positive rate for the Bloom filter, default is 0.01
48
+ training_accessions (list[str] | None): List of accessions used for training, default is None
49
+ """
30
50
  super().__init__(
31
51
  k=k,
32
52
  model_display_name=model_display_name,
@@ -46,7 +66,17 @@ class ProbabilisticSingleFilterModel(ProbabilisticFilterModel):
46
66
  display_name: str,
47
67
  training_accessions: list[str] | None = None,
48
68
  ) -> None:
49
- """Fit the cobs classic index to the sequences and labels"""
69
+ """
70
+ Fit the Bloom filter to the sequences.
71
+
72
+ Trains the model by reading sequences from the provided file path,
73
+ generating k-mers, and adding them to the Bloom filter.
74
+
75
+ Args:
76
+ file_path (Path): Path to the file containing sequences in FASTA format
77
+ display_name (str): Display name for the model
78
+ training_accessions (list[str] | None): List of accessions used for training, default is None
79
+ """
50
80
  self.training_accessions = training_accessions
51
81
 
52
82
  # estimate number of kmers
@@ -68,7 +98,18 @@ class ProbabilisticSingleFilterModel(ProbabilisticFilterModel):
68
98
  def calculate_hits(
69
99
  self, sequence: Seq | SeqRecord, filter_ids=None, step: int = 1
70
100
  ) -> dict:
71
- """Calculate the hits for the sequence"""
101
+ """
102
+ Calculate the hits for the sequence
103
+
104
+ Calculates the number of k-mers in the sequence that are present in the Bloom filter.
105
+
106
+ Args:
107
+ sequence (Seq | SeqRecord): Sequence to calculate hits for, can be a Bio.Seq or Bio.SeqRecord object
108
+ filter_ids (list[str] | None): List of filter IDs to use, default is None
109
+ step (int): Step size for generating k-mers, default is 1
110
+ Returns:
111
+ dict: Dictionary with the display name as key and the number of hits as value
112
+ """
72
113
  if isinstance(sequence, SeqRecord):
73
114
  sequence = sequence.seq
74
115
 
@@ -85,7 +126,17 @@ class ProbabilisticSingleFilterModel(ProbabilisticFilterModel):
85
126
 
86
127
  @staticmethod
87
128
  def load(path: Path) -> "ProbabilisticSingleFilterModel":
88
- """Load the model from disk"""
129
+ """
130
+ Load the model from disk
131
+
132
+ This method reads the model's JSON file and the associated Bloom filter file,
133
+ reconstructing the model instance.
134
+
135
+ Args:
136
+ path (Path): Path to the model's JSON file
137
+ Returns:
138
+ ProbabilisticSingleFilterModel: An instance of the model loaded from disk
139
+ """
89
140
  with open(path, "r", encoding="utf-8") as file:
90
141
  json_object = file.read()
91
142
  model_json = json.loads(json_object)
@@ -108,7 +159,17 @@ class ProbabilisticSingleFilterModel(ProbabilisticFilterModel):
108
159
  return model
109
160
 
110
161
  def _generate_kmers(self, sequence: Seq, step: int = 1):
111
- """Generate kmers from the sequence"""
162
+ """
163
+ Generate kmers from the sequence
164
+
165
+ Generates k-mers from the sequence, considering both the forward and reverse complement strands.
166
+
167
+ Args:
168
+ sequence (Seq): Sequence to generate k-mers from
169
+ step (int): Step size for generating k-mers, default is 1
170
+ Yields:
171
+ str: The minimizer k-mer (the lexicographically smallest k-mer between the forward and reverse complement)
172
+ """
112
173
  num_kmers = ceil((len(sequence) - self.k + 1) / step)
113
174
  for i in range(num_kmers):
114
175
  start_pos = i * step