XspecT: xspect-0.5.3-py3-none-any.whl → xspect-0.5.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of XspecT might be problematic. Click here for more details.
- xspect/classify.py +31 -8
- xspect/definitions.py +11 -10
- xspect/file_io.py +2 -1
- xspect/filter_sequences.py +20 -4
- xspect/main.py +63 -26
- xspect/mlst_feature/mlst_helper.py +15 -19
- xspect/mlst_feature/pub_mlst_handler.py +16 -19
- xspect/model_management.py +14 -17
- xspect/models/probabilistic_filter_mlst_model.py +11 -10
- xspect/models/probabilistic_filter_model.py +21 -5
- xspect/models/probabilistic_filter_svm_model.py +30 -15
- xspect/models/probabilistic_single_filter_model.py +9 -7
- xspect/models/result.py +20 -15
- xspect/web.py +13 -4
- {xspect-0.5.3.dist-info → xspect-0.5.4.dist-info}/METADATA +1 -1
- {xspect-0.5.3.dist-info → xspect-0.5.4.dist-info}/RECORD +20 -20
- {xspect-0.5.3.dist-info → xspect-0.5.4.dist-info}/WHEEL +0 -0
- {xspect-0.5.3.dist-info → xspect-0.5.4.dist-info}/entry_points.txt +0 -0
- {xspect-0.5.3.dist-info → xspect-0.5.4.dist-info}/licenses/LICENSE +0 -0
- {xspect-0.5.3.dist-info → xspect-0.5.4.dist-info}/top_level.txt +0 -0
xspect/classify.py
CHANGED
|
@@ -1,11 +1,13 @@
|
|
|
1
|
+
"""Classification module"""
|
|
2
|
+
|
|
1
3
|
from pathlib import Path
|
|
2
|
-
from
|
|
4
|
+
from importlib import import_module
|
|
3
5
|
import xspect.model_management as mm
|
|
4
|
-
from xspect.models.probabilistic_filter_mlst_model import (
|
|
5
|
-
ProbabilisticFilterMlstSchemeModel,
|
|
6
|
-
)
|
|
7
6
|
from xspect.file_io import prepare_input_output_paths
|
|
8
7
|
|
|
8
|
+
# inline imports lead to "invalid name" issues
|
|
9
|
+
# pylint: disable=invalid-name
|
|
10
|
+
|
|
9
11
|
|
|
10
12
|
def classify_genus(
|
|
11
13
|
model_genus: str, input_path: Path, output_path: Path, step: int = 1
|
|
@@ -22,7 +24,12 @@ def classify_genus(
|
|
|
22
24
|
output_path (Path): The path to the output file where results will be saved.
|
|
23
25
|
step (int): The amount of kmers to be skipped.
|
|
24
26
|
"""
|
|
25
|
-
|
|
27
|
+
ProbabilisticSingleFilterModel = import_module(
|
|
28
|
+
"xspect.models.probabilistic_single_filter_model"
|
|
29
|
+
).ProbabilisticSingleFilterModel
|
|
30
|
+
|
|
31
|
+
model_path = mm.get_genus_model_path(model_genus)
|
|
32
|
+
model = ProbabilisticSingleFilterModel.load(model_path)
|
|
26
33
|
input_paths, get_output_path = prepare_input_output_paths(input_path)
|
|
27
34
|
|
|
28
35
|
for idx, current_path in enumerate(input_paths):
|
|
@@ -34,7 +41,11 @@ def classify_genus(
|
|
|
34
41
|
|
|
35
42
|
|
|
36
43
|
def classify_species(
|
|
37
|
-
model_genus: str,
|
|
44
|
+
model_genus: str,
|
|
45
|
+
input_path: Path,
|
|
46
|
+
output_path: Path,
|
|
47
|
+
step: int = 1,
|
|
48
|
+
display_name: bool = False,
|
|
38
49
|
):
|
|
39
50
|
"""
|
|
40
51
|
Classify the species of sequences.
|
|
@@ -47,12 +58,18 @@ def classify_species(
|
|
|
47
58
|
input_path (Path): The path to the input file/directory containing sequences.
|
|
48
59
|
output_path (Path): The path to the output file where results will be saved.
|
|
49
60
|
step (int): The amount of kmers to be skipped.
|
|
61
|
+
display_name (bool): Includes a display name for each tax_ID.
|
|
50
62
|
"""
|
|
51
|
-
|
|
63
|
+
ProbabilisticFilterSVMModel = import_module(
|
|
64
|
+
"xspect.models.probabilistic_filter_svm_model"
|
|
65
|
+
).ProbabilisticFilterSVMModel
|
|
66
|
+
|
|
67
|
+
model_path = mm.get_species_model_path(model_genus)
|
|
68
|
+
model = ProbabilisticFilterSVMModel.load(model_path)
|
|
52
69
|
input_paths, get_output_path = prepare_input_output_paths(input_path)
|
|
53
70
|
|
|
54
71
|
for idx, current_path in enumerate(input_paths):
|
|
55
|
-
result = model.predict(current_path, step=step)
|
|
72
|
+
result = model.predict(current_path, step=step, display_name=display_name)
|
|
56
73
|
result.input_source = current_path.name
|
|
57
74
|
cls_path = get_output_path(idx, output_path)
|
|
58
75
|
result.save(cls_path)
|
|
@@ -68,6 +85,12 @@ def classify_mlst(input_path: Path, output_path: Path, limit: bool):
|
|
|
68
85
|
output_path (Path): The path to the output file where results will be saved.
|
|
69
86
|
limit (bool): A limit for the highest allele_id results that are shown.
|
|
70
87
|
"""
|
|
88
|
+
pick_scheme_from_models_dir = import_module(
|
|
89
|
+
"xspect.mlst_feature.mlst_helper"
|
|
90
|
+
).pick_scheme_from_models_dir
|
|
91
|
+
ProbabilisticFilterMlstSchemeModel = import_module(
|
|
92
|
+
"xspect.models.probabilistic_filter_mlst_model"
|
|
93
|
+
).ProbabilisticFilterMlstSchemeModel
|
|
71
94
|
|
|
72
95
|
scheme_path = pick_scheme_from_models_dir()
|
|
73
96
|
model = ProbabilisticFilterMlstSchemeModel.load(scheme_path)
|
xspect/definitions.py
CHANGED
|
@@ -11,8 +11,9 @@ def get_xspect_root_path() -> Path:
|
|
|
11
11
|
"""
|
|
12
12
|
Return the root path for XspecT data.
|
|
13
13
|
|
|
14
|
-
Returns the path to the XspecT data directory, which can be located either in the user's home
|
|
15
|
-
If neither exists, it creates the directory in
|
|
14
|
+
Returns the path to the XspecT data directory, which can be located either in the user's home
|
|
15
|
+
directory or in the current working directory. If neither exists, it creates the directory in
|
|
16
|
+
the user's home directory.
|
|
16
17
|
|
|
17
18
|
Returns:
|
|
18
19
|
Path: The path to the XspecT data directory.
|
|
@@ -34,8 +35,8 @@ def get_xspect_model_path() -> Path:
|
|
|
34
35
|
"""
|
|
35
36
|
Return the path to the XspecT models.
|
|
36
37
|
|
|
37
|
-
Returns the path to the XspecT models directory, which is located within the XspecT data
|
|
38
|
-
If the directory does not exist, it creates the directory.
|
|
38
|
+
Returns the path to the XspecT models directory, which is located within the XspecT data
|
|
39
|
+
directory. If the directory does not exist, it creates the directory.
|
|
39
40
|
|
|
40
41
|
Returns:
|
|
41
42
|
Path: The path to the XspecT models directory.
|
|
@@ -49,8 +50,8 @@ def get_xspect_upload_path() -> Path:
|
|
|
49
50
|
"""
|
|
50
51
|
Return the path to the XspecT upload directory.
|
|
51
52
|
|
|
52
|
-
Returns the path to the XspecT uploads directory, which is located within the XspecT data
|
|
53
|
-
If the directory does not exist, it creates the directory.
|
|
53
|
+
Returns the path to the XspecT uploads directory, which is located within the XspecT data
|
|
54
|
+
directory. If the directory does not exist, it creates the directory.
|
|
54
55
|
|
|
55
56
|
Returns:
|
|
56
57
|
Path: The path to the XspecT uploads directory.
|
|
@@ -64,8 +65,8 @@ def get_xspect_runs_path() -> Path:
|
|
|
64
65
|
"""
|
|
65
66
|
Return the path to the XspecT runs directory.
|
|
66
67
|
|
|
67
|
-
Returns the path to the XspecT runs directory, which is located within the XspecT data
|
|
68
|
-
If the directory does not exist, it creates the directory.
|
|
68
|
+
Returns the path to the XspecT runs directory, which is located within the XspecT data
|
|
69
|
+
directory. If the directory does not exist, it creates the directory.
|
|
69
70
|
|
|
70
71
|
Returns:
|
|
71
72
|
Path: The path to the XspecT runs directory.
|
|
@@ -79,8 +80,8 @@ def get_xspect_mlst_path() -> Path:
|
|
|
79
80
|
"""
|
|
80
81
|
Return the path to the XspecT MLST directory.
|
|
81
82
|
|
|
82
|
-
Returns the path to the XspecT MLST directory, which is located within the XspecT data
|
|
83
|
-
If the directory does not exist, it creates the directory.
|
|
83
|
+
Returns the path to the XspecT MLST directory, which is located within the XspecT data
|
|
84
|
+
directory. If the directory does not exist, it creates the directory.
|
|
84
85
|
|
|
85
86
|
Returns:
|
|
86
87
|
Path: The path to the XspecT MLST directory.
|
xspect/file_io.py
CHANGED
|
@@ -113,7 +113,8 @@ def concatenate_metagenome(fasta_dir: Path, meta_path: Path) -> None:
|
|
|
113
113
|
Concatenate all fasta files in a directory into one file.
|
|
114
114
|
|
|
115
115
|
This function searches for all fasta files in the specified directory and writes their contents
|
|
116
|
-
into a single output file. The output file will contain the concatenated sequences from all
|
|
116
|
+
into a single output file. The output file will contain the concatenated sequences from all
|
|
117
|
+
fasta files.
|
|
117
118
|
|
|
118
119
|
Args:
|
|
119
120
|
fasta_dir (Path): Path to the directory with the fasta files.
|
xspect/filter_sequences.py
CHANGED
|
@@ -1,7 +1,13 @@
|
|
|
1
|
+
"""Sequence filtering module"""
|
|
2
|
+
|
|
1
3
|
from pathlib import Path
|
|
2
|
-
from
|
|
4
|
+
from importlib import import_module
|
|
5
|
+
from xspect.model_management import get_genus_model_path, get_species_model_path
|
|
3
6
|
from xspect.file_io import filter_sequences, prepare_input_output_paths
|
|
4
7
|
|
|
8
|
+
# inline imports lead to "invalid name" issues
|
|
9
|
+
# pylint: disable=invalid-name
|
|
10
|
+
|
|
5
11
|
|
|
6
12
|
def filter_species(
|
|
7
13
|
model_genus: str,
|
|
@@ -31,7 +37,12 @@ def filter_species(
|
|
|
31
37
|
available species scores.
|
|
32
38
|
sparse_sampling_step (int): The step size for sparse sampling. Defaults to 1.
|
|
33
39
|
"""
|
|
34
|
-
|
|
40
|
+
ProbabilisticFilterSVMModel = import_module(
|
|
41
|
+
"xspect.models.probabilistic_filter_svm_model"
|
|
42
|
+
).ProbabilisticFilterSVMModel
|
|
43
|
+
|
|
44
|
+
species_model_path = get_species_model_path(model_genus)
|
|
45
|
+
species_model = ProbabilisticFilterSVMModel.load(species_model_path)
|
|
35
46
|
input_paths, get_output_path = prepare_input_output_paths(input_path)
|
|
36
47
|
|
|
37
48
|
for idx, current_path in enumerate(input_paths):
|
|
@@ -82,11 +93,16 @@ def filter_genus(
|
|
|
82
93
|
sparse_sampling_step (int): The step size for sparse sampling. Defaults to 1.
|
|
83
94
|
|
|
84
95
|
"""
|
|
85
|
-
|
|
96
|
+
ProbabilisticSingleFilterModel = import_module(
|
|
97
|
+
"xspect.models.probabilistic_single_filter_model"
|
|
98
|
+
).ProbabilisticSingleFilterModel
|
|
99
|
+
|
|
100
|
+
genus_model_path = get_genus_model_path(model_genus)
|
|
101
|
+
genus_model = ProbabilisticSingleFilterModel.load(genus_model_path)
|
|
86
102
|
input_paths, get_output_path = prepare_input_output_paths(input_path)
|
|
87
103
|
|
|
88
104
|
for idx, current_path in enumerate(input_paths):
|
|
89
|
-
result =
|
|
105
|
+
result = genus_model.predict(current_path, step=sparse_sampling_step)
|
|
90
106
|
result.input_source = current_path.name
|
|
91
107
|
|
|
92
108
|
if classification_output_path:
|
xspect/main.py
CHANGED
|
@@ -2,25 +2,12 @@
|
|
|
2
2
|
|
|
3
3
|
from pathlib import Path
|
|
4
4
|
from uuid import uuid4
|
|
5
|
+
from importlib import import_module
|
|
5
6
|
import click
|
|
6
|
-
import
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
from xspect import filter_sequences
|
|
11
|
-
from xspect.train import train_from_directory, train_from_ncbi
|
|
12
|
-
from xspect.definitions import (
|
|
13
|
-
get_xspect_model_path,
|
|
14
|
-
)
|
|
15
|
-
from xspect.mlst_feature.mlst_helper import pick_scheme
|
|
16
|
-
from xspect.mlst_feature.pub_mlst_handler import PubMLSTHandler
|
|
17
|
-
from xspect.models.probabilistic_filter_mlst_model import (
|
|
18
|
-
ProbabilisticFilterMlstSchemeModel,
|
|
19
|
-
)
|
|
20
|
-
from xspect.model_management import (
|
|
21
|
-
get_model_metadata,
|
|
22
|
-
get_models,
|
|
23
|
-
)
|
|
7
|
+
from xspect.model_management import get_models
|
|
8
|
+
|
|
9
|
+
# inline imports lead to "invalid name" issues
|
|
10
|
+
# pylint: disable=invalid-name
|
|
24
11
|
|
|
25
12
|
|
|
26
13
|
@click.group()
|
|
@@ -32,7 +19,10 @@ def cli():
|
|
|
32
19
|
@cli.command()
|
|
33
20
|
def web():
|
|
34
21
|
"""Open the XspecT web application."""
|
|
35
|
-
|
|
22
|
+
app = import_module("xspect.web").app
|
|
23
|
+
run = import_module("uvicorn").run
|
|
24
|
+
|
|
25
|
+
run(app, host="0.0.0.0", port=8000)
|
|
36
26
|
|
|
37
27
|
|
|
38
28
|
# # # # # # # # # # # # # # #
|
|
@@ -49,6 +39,8 @@ def models():
|
|
|
49
39
|
def download():
|
|
50
40
|
"""Download models."""
|
|
51
41
|
click.echo("Downloading models, this may take a while...")
|
|
42
|
+
download_test_models = import_module("xspect.download_models").download_test_models
|
|
43
|
+
|
|
52
44
|
download_test_models(
|
|
53
45
|
"https://assets.adrianromberg.com/science/xspect-models-07-08-2025.zip"
|
|
54
46
|
)
|
|
@@ -64,7 +56,6 @@ def list_models():
|
|
|
64
56
|
if not available_models:
|
|
65
57
|
click.echo("No models found.")
|
|
66
58
|
return
|
|
67
|
-
# todo: make this machine readable
|
|
68
59
|
click.echo("Models found:")
|
|
69
60
|
click.echo("--------------")
|
|
70
61
|
for model_type, names in available_models.items():
|
|
@@ -100,6 +91,8 @@ def train_ncbi(model_genus, svm_steps, author, author_email):
|
|
|
100
91
|
"""Train a species and a genus model based on NCBI data."""
|
|
101
92
|
click.echo(f"Training {model_genus} species and genus metagenome model.")
|
|
102
93
|
try:
|
|
94
|
+
train_from_ncbi = import_module("xspect.train").train_from_ncbi
|
|
95
|
+
|
|
103
96
|
train_from_ncbi(model_genus, svm_steps, author, author_email)
|
|
104
97
|
except ValueError as e:
|
|
105
98
|
click.echo(f"Error: {e}")
|
|
@@ -143,6 +136,8 @@ def train_ncbi(model_genus, svm_steps, author, author_email):
|
|
|
143
136
|
def train_directory(model_genus, input_path, svm_steps, meta, author, author_email):
|
|
144
137
|
"""Train a model based on data from a directory for a given genus."""
|
|
145
138
|
click.echo(f"Training {model_genus} model with {svm_steps} SVM steps.")
|
|
139
|
+
train_from_directory = import_module("xspect.train").train_from_directory
|
|
140
|
+
|
|
146
141
|
train_from_directory(
|
|
147
142
|
model_genus,
|
|
148
143
|
Path(input_path),
|
|
@@ -167,12 +162,28 @@ def train_directory(model_genus, input_path, svm_steps, meta, author, author_ema
|
|
|
167
162
|
def train_mlst(choose_schemes):
|
|
168
163
|
"""Download alleles and train bloom filters."""
|
|
169
164
|
click.echo("Updating alleles")
|
|
165
|
+
mlst_helper = import_module("xspect.mlst_feature.mlst_helper")
|
|
166
|
+
pick_scheme = mlst_helper.pick_scheme
|
|
167
|
+
|
|
168
|
+
pub_mlst_handler = import_module("xspect.mlst_feature.pub_mlst_handler")
|
|
169
|
+
PubMLSTHandler = pub_mlst_handler.PubMLSTHandler
|
|
170
|
+
|
|
171
|
+
probabilistic_filter_mlst_model = import_module(
|
|
172
|
+
"xspect.models.probabilistic_filter_mlst_model"
|
|
173
|
+
)
|
|
174
|
+
ProbabilisticFilterMlstSchemeModel = (
|
|
175
|
+
probabilistic_filter_mlst_model.ProbabilisticFilterMlstSchemeModel
|
|
176
|
+
)
|
|
177
|
+
|
|
178
|
+
definitions = import_module("xspect.definitions")
|
|
179
|
+
get_xspect_model_path = definitions.get_xspect_model_path
|
|
180
|
+
|
|
170
181
|
handler = PubMLSTHandler()
|
|
171
182
|
handler.download_alleles(choose_schemes)
|
|
172
183
|
click.echo("Download finished")
|
|
173
184
|
scheme_path = pick_scheme(handler.get_scheme_paths())
|
|
174
185
|
species_name = str(scheme_path).split("/")[-2]
|
|
175
|
-
scheme_name = str(scheme_path).
|
|
186
|
+
scheme_name = str(scheme_path).rsplit("/", maxsplit=1)[-1]
|
|
176
187
|
scheme_url = handler.scheme_mapping[str(scheme_path)]
|
|
177
188
|
model = ProbabilisticFilterMlstSchemeModel(
|
|
178
189
|
31, f"{species_name}:{scheme_name}", get_xspect_model_path(), scheme_url
|
|
@@ -230,6 +241,8 @@ def classify_seqs():
|
|
|
230
241
|
def classify_genus(model_genus, input_path, output_path, sparse_sampling_step):
|
|
231
242
|
"""Classify samples using a genus model."""
|
|
232
243
|
click.echo("Classifying...")
|
|
244
|
+
classify = import_module("xspect.classify")
|
|
245
|
+
|
|
233
246
|
classify.classify_genus(
|
|
234
247
|
model_genus, Path(input_path), Path(output_path), sparse_sampling_step
|
|
235
248
|
)
|
|
@@ -268,11 +281,25 @@ def classify_genus(model_genus, input_path, output_path, sparse_sampling_step):
|
|
|
268
281
|
help="Sparse sampling step (e. g. only every 500th kmer for '--sparse-sampling-step 500').",
|
|
269
282
|
default=1,
|
|
270
283
|
)
|
|
271
|
-
|
|
284
|
+
@click.option(
|
|
285
|
+
"-n",
|
|
286
|
+
"--display-names",
|
|
287
|
+
help="Includes the display names next to taxonomy-IDs.",
|
|
288
|
+
is_flag=True,
|
|
289
|
+
)
|
|
290
|
+
def classify_species(
|
|
291
|
+
model_genus, input_path, output_path, sparse_sampling_step, display_names
|
|
292
|
+
):
|
|
272
293
|
"""Classify samples using a species model."""
|
|
273
294
|
click.echo("Classifying...")
|
|
295
|
+
classify = import_module("xspect.classify")
|
|
296
|
+
|
|
274
297
|
classify.classify_species(
|
|
275
|
-
model_genus,
|
|
298
|
+
model_genus,
|
|
299
|
+
Path(input_path),
|
|
300
|
+
Path(output_path),
|
|
301
|
+
sparse_sampling_step,
|
|
302
|
+
display_names,
|
|
276
303
|
)
|
|
277
304
|
|
|
278
305
|
|
|
@@ -301,6 +328,8 @@ def classify_species(model_genus, input_path, output_path, sparse_sampling_step)
|
|
|
301
328
|
def classify_mlst(input_path, output_path, limit):
|
|
302
329
|
"""MLST classify a sample."""
|
|
303
330
|
click.echo("Classifying...")
|
|
331
|
+
classify = import_module("xspect.classify")
|
|
332
|
+
|
|
304
333
|
classify.classify_mlst(Path(input_path), Path(output_path), limit)
|
|
305
334
|
|
|
306
335
|
|
|
@@ -372,6 +401,7 @@ def filter_genus(
|
|
|
372
401
|
):
|
|
373
402
|
"""Filter samples using a genus model."""
|
|
374
403
|
click.echo("Filtering...")
|
|
404
|
+
filter_sequences = import_module("xspect.filter_sequences")
|
|
375
405
|
|
|
376
406
|
filter_sequences.filter_genus(
|
|
377
407
|
model_genus,
|
|
@@ -426,14 +456,16 @@ def filter_genus(
|
|
|
426
456
|
"-t",
|
|
427
457
|
"--threshold",
|
|
428
458
|
type=float,
|
|
429
|
-
help="Threshold for filtering (default: 0.7). Use -1 to filter for the highest scoring
|
|
459
|
+
help="Threshold for filtering (default: 0.7). Use -1 to filter for the highest scoring "
|
|
460
|
+
"species.",
|
|
430
461
|
default=0.7,
|
|
431
462
|
prompt=True,
|
|
432
463
|
)
|
|
433
464
|
@click.option(
|
|
434
465
|
"--sparse-sampling-step",
|
|
435
466
|
type=int,
|
|
436
|
-
help="Sparse sampling step (e. g. only every 500th kmer for
|
|
467
|
+
help="Sparse sampling step (e. g. only every 500th kmer for "
|
|
468
|
+
"'--sparse-sampling-step 500').",
|
|
437
469
|
default=1,
|
|
438
470
|
)
|
|
439
471
|
def filter_species(
|
|
@@ -449,9 +481,12 @@ def filter_species(
|
|
|
449
481
|
|
|
450
482
|
if threshold != -1 and (threshold < 0 or threshold > 1):
|
|
451
483
|
raise click.BadParameter(
|
|
452
|
-
"Threshold must be between 0 and 1, or -1 for filtering by the highest
|
|
484
|
+
"Threshold must be between 0 and 1, or -1 for filtering by the highest "
|
|
485
|
+
"scoring species."
|
|
453
486
|
)
|
|
454
487
|
|
|
488
|
+
get_model_metadata = import_module("xspect.model_management").get_model_metadata
|
|
489
|
+
|
|
455
490
|
available_species = get_model_metadata(f"{model_genus}-species")["display_names"]
|
|
456
491
|
available_species = {
|
|
457
492
|
id: name.replace(f"{model_genus} ", "")
|
|
@@ -476,6 +511,8 @@ def filter_species(
|
|
|
476
511
|
][0]
|
|
477
512
|
|
|
478
513
|
click.echo("Filtering...")
|
|
514
|
+
filter_sequences = import_module("xspect.filter_sequences")
|
|
515
|
+
|
|
479
516
|
filter_sequences.filter_species(
|
|
480
517
|
model_genus,
|
|
481
518
|
model_species,
|
xspect/mlst_feature/mlst_helper.py
CHANGED
|
@@ -2,10 +2,10 @@
|
|
|
2
2
|
|
|
3
3
|
__author__ = "Cetin, Oemer"
|
|
4
4
|
|
|
5
|
-
import requests
|
|
6
5
|
import json
|
|
7
|
-
from io import StringIO
|
|
8
6
|
from pathlib import Path
|
|
7
|
+
from io import StringIO
|
|
8
|
+
import requests
|
|
9
9
|
from Bio import SeqIO
|
|
10
10
|
from xspect.definitions import get_xspect_model_path
|
|
11
11
|
|
|
@@ -29,7 +29,7 @@ def create_fasta_files(locus_path: Path, fasta_batch: str) -> None:
|
|
|
29
29
|
output_fasta_file = locus_path / f"Allele_ID_{number}.fasta"
|
|
30
30
|
if output_fasta_file.exists():
|
|
31
31
|
continue # Ignore existing ones
|
|
32
|
-
with open(output_fasta_file, "w") as allele:
|
|
32
|
+
with open(output_fasta_file, "w", encoding="utf-8") as allele:
|
|
33
33
|
SeqIO.write(record, allele, "fasta")
|
|
34
34
|
|
|
35
35
|
|
|
@@ -59,10 +59,9 @@ def pick_species_number_from_db(available_species: dict) -> str:
|
|
|
59
59
|
if int(choice) in available_species.keys():
|
|
60
60
|
chosen_species = available_species.get(int(choice))
|
|
61
61
|
return chosen_species
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
)
|
|
62
|
+
print(
|
|
63
|
+
"Wrong input! Try again with a number that is available in the list above."
|
|
64
|
+
)
|
|
66
65
|
except ValueError:
|
|
67
66
|
print(
|
|
68
67
|
"Wrong input! Try again with a number that is available in the list above."
|
|
@@ -95,10 +94,9 @@ def pick_scheme_number_from_db(available_schemes: dict) -> str:
|
|
|
95
94
|
if int(choice) in available_schemes.keys():
|
|
96
95
|
chosen_scheme = available_schemes.get(int(choice))[1]
|
|
97
96
|
return chosen_scheme
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
)
|
|
97
|
+
print(
|
|
98
|
+
"Wrong input! Try again with a number that is available in the above list."
|
|
99
|
+
)
|
|
102
100
|
except ValueError:
|
|
103
101
|
print(
|
|
104
102
|
"Wrong input! Try again with a number that is available in the above list."
|
|
@@ -162,12 +160,12 @@ def pick_scheme(available_schemes: dict) -> Path:
|
|
|
162
160
|
for counter, scheme in available_schemes.items():
|
|
163
161
|
# For Strain Typing with an API-POST Request to the db
|
|
164
162
|
if str(scheme).startswith("http"):
|
|
165
|
-
scheme_json = requests.get(scheme).json()
|
|
163
|
+
scheme_json = requests.get(scheme, timeout=10).json()
|
|
166
164
|
print(str(counter) + ":" + scheme_json["description"])
|
|
167
165
|
|
|
168
166
|
# To pick a scheme after download for fitting
|
|
169
167
|
else:
|
|
170
|
-
print(str(counter) + ":" + str(scheme).
|
|
168
|
+
print(str(counter) + ":" + str(scheme).rsplit("/", maxsplit=1)[-1])
|
|
171
169
|
|
|
172
170
|
print("\nPick a scheme for strain type prediction")
|
|
173
171
|
while True:
|
|
@@ -176,10 +174,9 @@ def pick_scheme(available_schemes: dict) -> Path:
|
|
|
176
174
|
if int(choice) in available_schemes.keys():
|
|
177
175
|
chosen_scheme = available_schemes.get(int(choice))
|
|
178
176
|
return chosen_scheme
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
)
|
|
177
|
+
print(
|
|
178
|
+
"Wrong input! Try again with a number that is available in the above list."
|
|
179
|
+
)
|
|
183
180
|
except ValueError:
|
|
184
181
|
print(
|
|
185
182
|
"Wrong input! Try again with a number that is available in the above list."
|
|
@@ -209,8 +206,7 @@ class MlstResult:
|
|
|
209
206
|
Returns:
|
|
210
207
|
dict: The result dictionary with a sequence ID as key and the Strain type as value.
|
|
211
208
|
"""
|
|
212
|
-
|
|
213
|
-
return results
|
|
209
|
+
return dict(self.hits.items())
|
|
214
210
|
|
|
215
211
|
def to_dict(self) -> dict:
|
|
216
212
|
"""
|
xspect/mlst_feature/pub_mlst_handler.py
CHANGED
|
@@ -2,8 +2,8 @@
|
|
|
2
2
|
|
|
3
3
|
__author__ = "Cetin, Oemer"
|
|
4
4
|
|
|
5
|
-
import requests
|
|
6
5
|
import json
|
|
6
|
+
import requests
|
|
7
7
|
from xspect.mlst_feature.mlst_helper import (
|
|
8
8
|
create_fasta_files,
|
|
9
9
|
pick_species_number_from_db,
|
|
@@ -51,7 +51,7 @@ class PubMLSTHandler:
|
|
|
51
51
|
counter = 1
|
|
52
52
|
# retrieve all available species
|
|
53
53
|
species_url = PubMLSTHandler.base_url
|
|
54
|
-
for species_databases in requests.get(species_url).json():
|
|
54
|
+
for species_databases in requests.get(species_url, timeout=10).json():
|
|
55
55
|
for database in species_databases["databases"]:
|
|
56
56
|
if database["name"].endswith("seqdef"):
|
|
57
57
|
available_species[counter] = database["name"]
|
|
@@ -61,7 +61,7 @@ class PubMLSTHandler:
|
|
|
61
61
|
|
|
62
62
|
counter = 1
|
|
63
63
|
scheme_url = f"{species_url}/{chosen_species}/schemes"
|
|
64
|
-
for scheme in requests.get(scheme_url).json()["schemes"]:
|
|
64
|
+
for scheme in requests.get(scheme_url, timeout=10).json()["schemes"]:
|
|
65
65
|
# scheme["description"] stores the name of a scheme.
|
|
66
66
|
# scheme["scheme"] stores the URL that is needed for downloading all loci.
|
|
67
67
|
available_schemes[counter] = [scheme["description"], scheme["scheme"]]
|
|
@@ -70,11 +70,8 @@ class PubMLSTHandler:
|
|
|
70
70
|
# Selection process of available scheme from a species for download (doubles are caught!)
|
|
71
71
|
while True:
|
|
72
72
|
chosen_scheme = pick_scheme_number_from_db(available_schemes)
|
|
73
|
-
|
|
73
|
+
if chosen_scheme not in chosen_schemes:
|
|
74
74
|
chosen_schemes.append(chosen_scheme)
|
|
75
|
-
if chosen_scheme not in chosen_schemes
|
|
76
|
-
else None
|
|
77
|
-
)
|
|
78
75
|
choice = input(
|
|
79
76
|
"Do you want to pick another scheme to download? (y/n):"
|
|
80
77
|
).lower()
|
|
@@ -97,7 +94,7 @@ class PubMLSTHandler:
|
|
|
97
94
|
self.choose_schemes() # changes the scheme_list attribute
|
|
98
95
|
|
|
99
96
|
for scheme in self.scheme_list:
|
|
100
|
-
scheme_json = requests.get(scheme).json()
|
|
97
|
+
scheme_json = requests.get(scheme, timeout=10).json()
|
|
101
98
|
# We only want the name and the respective featured loci of a scheme
|
|
102
99
|
scheme_name = scheme_json["description"]
|
|
103
100
|
locus_list = scheme_json["loci"]
|
|
@@ -117,7 +114,7 @@ class PubMLSTHandler:
|
|
|
117
114
|
if not locus_path.exists():
|
|
118
115
|
locus_path.mkdir(exist_ok=True, parents=True)
|
|
119
116
|
|
|
120
|
-
alleles = requests.get(f"{locus_url}/alleles_fasta").text
|
|
117
|
+
alleles = requests.get(f"{locus_url}/alleles_fasta", timeout=10).text
|
|
121
118
|
create_fasta_files(locus_path, alleles)
|
|
122
119
|
|
|
123
120
|
def assign_strain_type_by_db(self) -> None:
|
|
@@ -132,13 +129,15 @@ class PubMLSTHandler:
|
|
|
132
129
|
str(pick_scheme(scheme_list_to_dict(self.scheme_list))) + "/sequence"
|
|
133
130
|
)
|
|
134
131
|
fasta_file = get_xspect_upload_path() / "Test.fna"
|
|
135
|
-
with open(fasta_file, "r") as file:
|
|
132
|
+
with open(fasta_file, "r", encoding="utf-8") as file:
|
|
136
133
|
data = file.read()
|
|
137
134
|
payload = { # Essential API-POST-Body
|
|
138
135
|
"sequence": data,
|
|
139
136
|
"filetype": "fasta",
|
|
140
137
|
}
|
|
141
|
-
response = requests.post(
|
|
138
|
+
response = requests.post(
|
|
139
|
+
scheme_url, data=json.dumps(payload), timeout=10
|
|
140
|
+
).json()
|
|
142
141
|
|
|
143
142
|
for locus, meta_data in response["exact_matches"].items():
|
|
144
143
|
# meta_data is a list containing a dictionary, therefore [0] and then key value.
|
|
@@ -170,18 +169,16 @@ class PubMLSTHandler:
|
|
|
170
169
|
}
|
|
171
170
|
}
|
|
172
171
|
|
|
173
|
-
response = requests.post(post_url + "/designations", json=payload)
|
|
172
|
+
response = requests.post(post_url + "/designations", json=payload, timeout=10)
|
|
174
173
|
|
|
175
174
|
if response.status_code == 200:
|
|
176
175
|
data = response.json()
|
|
177
176
|
if "fields" in data:
|
|
178
177
|
post_response = data["fields"]
|
|
179
178
|
return post_response
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
post_response += "Possibly a novel Strain Type."
|
|
183
|
-
return post_response
|
|
184
|
-
else:
|
|
185
|
-
post_response = "Error:" + str(response.status_code)
|
|
186
|
-
post_response += response.text
|
|
179
|
+
post_response = "No matching Strain Type found in the database. "
|
|
180
|
+
post_response += "Possibly a novel Strain Type."
|
|
187
181
|
return post_response
|
|
182
|
+
post_response = "Error:" + str(response.status_code)
|
|
183
|
+
post_response += response.text
|
|
184
|
+
return post_response
|
xspect/model_management.py
CHANGED
|
@@ -2,45 +2,41 @@
|
|
|
2
2
|
|
|
3
3
|
from json import loads, dumps
|
|
4
4
|
from pathlib import Path
|
|
5
|
-
from xspect.models.probabilistic_single_filter_model import (
|
|
6
|
-
ProbabilisticSingleFilterModel,
|
|
7
|
-
)
|
|
8
|
-
from xspect.models.probabilistic_filter_svm_model import ProbabilisticFilterSVMModel
|
|
9
5
|
from xspect.definitions import get_xspect_model_path
|
|
10
6
|
|
|
11
7
|
|
|
12
|
-
def
|
|
8
|
+
def get_genus_model_path(genus) -> Path:
|
|
13
9
|
"""
|
|
14
|
-
Get a genus model for the specified genus.
|
|
10
|
+
Get a genus model path for the specified genus.
|
|
15
11
|
|
|
16
|
-
This function retrieves a pre-trained genus classification model based on the
|
|
12
|
+
This function retrieves the path of a pre-trained genus classification model based on the
|
|
13
|
+
provided genus name.
|
|
17
14
|
|
|
18
15
|
Args:
|
|
19
16
|
genus (str): The genus name for which the model is to be retrieved.
|
|
20
17
|
|
|
21
18
|
Returns:
|
|
22
|
-
|
|
19
|
+
Path: The file path of the genus classification model.
|
|
23
20
|
"""
|
|
24
21
|
genus_model_path = get_xspect_model_path() / (genus.lower() + "-genus.json")
|
|
25
|
-
|
|
26
|
-
return genus_filter_model
|
|
22
|
+
return genus_model_path
|
|
27
23
|
|
|
28
24
|
|
|
29
|
-
def
|
|
25
|
+
def get_species_model_path(genus) -> Path:
|
|
30
26
|
"""
|
|
31
|
-
Get a species
|
|
27
|
+
Get a species model path for the specified genus.
|
|
32
28
|
|
|
33
|
-
This function retrieves a pre-trained species classification model based on the
|
|
29
|
+
This function retrieves the path of a pre-trained species classification model based on the
|
|
30
|
+
provided genus name.
|
|
34
31
|
|
|
35
32
|
Args:
|
|
36
33
|
genus (str): The genus name for which the species model is to be retrieved.
|
|
37
34
|
|
|
38
35
|
Returns:
|
|
39
|
-
|
|
36
|
+
Path: The file path of the species classification model.
|
|
40
37
|
"""
|
|
41
38
|
species_model_path = get_xspect_model_path() / (genus.lower() + "-species.json")
|
|
42
|
-
|
|
43
|
-
return species_filter_model
|
|
39
|
+
return species_model_path
|
|
44
40
|
|
|
45
41
|
|
|
46
42
|
def get_model_metadata(model: str | Path) -> dict:
|
|
@@ -121,7 +117,8 @@ def get_models() -> dict[str, list[dict]]:
|
|
|
121
117
|
This function scans the model directory for JSON files and organizes them by their model type.
|
|
122
118
|
|
|
123
119
|
Returns:
|
|
124
|
-
dict[str, list[dict]]: A dictionary where keys are model types and values are lists of
|
|
120
|
+
dict[str, list[dict]]: A dictionary where keys are model types and values are lists of
|
|
121
|
+
model display names.
|
|
125
122
|
"""
|
|
126
123
|
model_dict = {}
|
|
127
124
|
for model_file in get_xspect_model_path().glob("*.json"):
|
xspect/models/probabilistic_filter_mlst_model.py
CHANGED
|
@@ -2,14 +2,14 @@
|
|
|
2
2
|
|
|
3
3
|
__author__ = "Cetin, Oemer"
|
|
4
4
|
|
|
5
|
-
import cobs_index
|
|
6
5
|
import json
|
|
7
6
|
from pathlib import Path
|
|
7
|
+
from collections import defaultdict
|
|
8
|
+
import cobs_index
|
|
9
|
+
from cobs_index import DocumentList
|
|
8
10
|
from Bio import SeqIO
|
|
9
11
|
from Bio.Seq import Seq
|
|
10
12
|
from Bio.SeqRecord import SeqRecord
|
|
11
|
-
from cobs_index import DocumentList
|
|
12
|
-
from collections import defaultdict
|
|
13
13
|
from xspect.file_io import get_record_iterator
|
|
14
14
|
from xspect.mlst_feature.mlst_helper import MlstResult
|
|
15
15
|
from xspect.mlst_feature.pub_mlst_handler import PubMLSTHandler
|
|
@@ -100,11 +100,11 @@ class ProbabilisticFilterMlstSchemeModel:
|
|
|
100
100
|
"Scheme not found. Please make sure to download the schemes prior!"
|
|
101
101
|
)
|
|
102
102
|
|
|
103
|
-
scheme = str(scheme_path).
|
|
103
|
+
scheme = str(scheme_path).rsplit("/", maxsplit=1)[-1]
|
|
104
104
|
cobs_path = ""
|
|
105
105
|
# COBS structure for every locus (default = 7 for Oxford or Pasteur scheme)
|
|
106
106
|
for locus_path in sorted(scheme_path.iterdir()):
|
|
107
|
-
locus = str(locus_path).
|
|
107
|
+
locus = str(locus_path).rsplit("/", maxsplit=1)[-1]
|
|
108
108
|
# counts all fasta files that belong to a locus
|
|
109
109
|
self.loci[locus] = sum(
|
|
110
110
|
(1 for _ in locus_path.iterdir() if not str(_).endswith("cache"))
|
|
@@ -112,7 +112,7 @@ class ProbabilisticFilterMlstSchemeModel:
|
|
|
112
112
|
|
|
113
113
|
# determine the avg base pair size of alleles
|
|
114
114
|
fasta_file = next(locus_path.glob("*.fasta"), None)
|
|
115
|
-
with open(fasta_file, "r") as handle:
|
|
115
|
+
with open(fasta_file, "r", encoding="utf-8") as handle:
|
|
116
116
|
record = next(SeqIO.parse(handle, "fasta"))
|
|
117
117
|
self.avg_locus_bp_size.append(len(record.seq))
|
|
118
118
|
|
|
@@ -134,7 +134,8 @@ class ProbabilisticFilterMlstSchemeModel:
|
|
|
134
134
|
|
|
135
135
|
def save(self) -> None:
|
|
136
136
|
"""Saves the model to disk"""
|
|
137
|
-
|
|
137
|
+
# [-1] contains the scheme name
|
|
138
|
+
scheme = str(self.scheme_path).rsplit("/", maxsplit=1)[-1]
|
|
138
139
|
json_path = self.base_path / scheme / f"{scheme}.json"
|
|
139
140
|
json_object = json.dumps(self.to_dict(), indent=4)
|
|
140
141
|
|
|
@@ -152,7 +153,7 @@ class ProbabilisticFilterMlstSchemeModel:
|
|
|
152
153
|
Returns:
|
|
153
154
|
ProbabilisticFilterMlstSchemeModel: A trained model from the disk in JSON format.
|
|
154
155
|
"""
|
|
155
|
-
scheme_name = str(scheme_path).
|
|
156
|
+
scheme_name = str(scheme_path).rsplit("/", maxsplit=1)[-1]
|
|
156
157
|
json_path = scheme_path / f"{scheme_name}.json"
|
|
157
158
|
with open(json_path, "r", encoding="utf-8") as file:
|
|
158
159
|
json_object = file.read()
|
|
@@ -221,7 +222,7 @@ class ProbabilisticFilterMlstSchemeModel:
|
|
|
221
222
|
for entry in sorted(cobs_path.iterdir()):
|
|
222
223
|
if str(entry).endswith(".json"):
|
|
223
224
|
continue
|
|
224
|
-
file_name = str(entry).
|
|
225
|
+
file_name = str(entry).rsplit("/", maxsplit=1)[-1] # file_name = locus
|
|
225
226
|
scheme_path_list.append(file_name.split(".")[0]) # without the file ending
|
|
226
227
|
|
|
227
228
|
result_dict = {}
|
|
@@ -442,7 +443,7 @@ class ProbabilisticFilterMlstSchemeModel:
|
|
|
442
443
|
Returns:
|
|
443
444
|
bool: True if any locus score >= 0.5 * its avg base pair size, False otherwise.
|
|
444
445
|
"""
|
|
445
|
-
for i, (
|
|
446
|
+
for i, (_, allele_score_dict) in enumerate(highest_results.items()):
|
|
446
447
|
if not allele_score_dict:
|
|
447
448
|
continue # skip empty values
|
|
448
449
|
|
|
@@ -135,8 +135,8 @@ class ProbabilisticFilterModel:
|
|
|
135
135
|
display_names (dict | None): A dictionary mapping file names to display names.
|
|
136
136
|
If None, uses file names as display names.
|
|
137
137
|
training_accessions (dict[str, list[str]] | None): A dictionary mapping filter IDs to
|
|
138
|
-
lists of accession numbers used for training the model. If None, no training
|
|
139
|
-
are set.
|
|
138
|
+
lists of accession numbers used for training the model. If None, no training
|
|
139
|
+
accessions are set.
|
|
140
140
|
Raises:
|
|
141
141
|
ValueError: If the directory path is invalid, does not exist, or is not a directory.
|
|
142
142
|
"""
|
|
@@ -230,6 +230,7 @@ class ProbabilisticFilterModel:
|
|
|
230
230
|
),
|
|
231
231
|
filter_ids: list[str] = None,
|
|
232
232
|
step: int = 1,
|
|
233
|
+
display_name: bool = False,
|
|
233
234
|
) -> ModelResult:
|
|
234
235
|
"""
|
|
235
236
|
Returns a model result object for the sequence(s) based on the filters in the model
|
|
@@ -246,6 +247,7 @@ class ProbabilisticFilterModel:
|
|
|
246
247
|
filter_ids (list[str]): A list of filter IDs to filter the results. If None,
|
|
247
248
|
all results are returned.
|
|
248
249
|
step (int): The step size for the k-mer search. Default is 1.
|
|
250
|
+
display_name (bool): Includes a display name for each tax_ID.
|
|
249
251
|
|
|
250
252
|
Returns:
|
|
251
253
|
ModelResult: An object containing the hits for each sequence, the number of kmers,
|
|
@@ -253,11 +255,12 @@ class ProbabilisticFilterModel:
|
|
|
253
255
|
|
|
254
256
|
Raises:
|
|
255
257
|
ValueError: If the input sequence is not valid, or if it is not a Seq object,
|
|
256
|
-
a list of Seq objects, a SeqIO iterator, or a Path object to a fasta/fastq
|
|
258
|
+
a list of Seq objects, a SeqIO iterator, or a Path object to a fasta/fastq
|
|
259
|
+
file.
|
|
257
260
|
"""
|
|
258
261
|
if isinstance(sequence_input, (SeqRecord)):
|
|
259
262
|
return ProbabilisticFilterModel.predict(
|
|
260
|
-
self, [sequence_input], filter_ids, step=step
|
|
263
|
+
self, [sequence_input], filter_ids, step=step, display_name=display_name
|
|
261
264
|
)
|
|
262
265
|
|
|
263
266
|
if self._is_sequence_list(sequence_input) | self._is_sequence_iterator(
|
|
@@ -272,12 +275,25 @@ class ProbabilisticFilterModel:
|
|
|
272
275
|
num_kmers[individual_sequence.id] = self._count_kmers(
|
|
273
276
|
individual_sequence, step=step
|
|
274
277
|
)
|
|
278
|
+
if display_name:
|
|
279
|
+
individual_hits.update(
|
|
280
|
+
{
|
|
281
|
+
f"{key} -{self.display_names.get(key, 'Unknown').replace(
|
|
282
|
+
self.model_display_name, '', 1)}": individual_hits.pop(
|
|
283
|
+
key
|
|
284
|
+
)
|
|
285
|
+
for key in list(individual_hits.keys())
|
|
286
|
+
}
|
|
287
|
+
)
|
|
275
288
|
hits[individual_sequence.id] = individual_hits
|
|
276
289
|
return ModelResult(self.slug(), hits, num_kmers, sparse_sampling_step=step)
|
|
277
290
|
|
|
278
291
|
if isinstance(sequence_input, Path):
|
|
279
292
|
return ProbabilisticFilterModel.predict(
|
|
280
|
-
self,
|
|
293
|
+
self,
|
|
294
|
+
get_record_iterator(sequence_input),
|
|
295
|
+
step=step,
|
|
296
|
+
display_name=display_name,
|
|
281
297
|
)
|
|
282
298
|
|
|
283
299
|
raise ValueError(
|
|
@@ -55,10 +55,14 @@ class ProbabilisticFilterSVMModel(ProbabilisticFilterModel):
|
|
|
55
55
|
base_path (Path): The base path where the model will be stored.
|
|
56
56
|
kernel (str): The kernel type for the SVM (e.g., 'linear', 'rbf').
|
|
57
57
|
c (float): Regularization parameter for the SVM.
|
|
58
|
-
fpr (float, optional): False positive rate for the probabilistic filter.
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
58
|
+
fpr (float, optional): False positive rate for the probabilistic filter.
|
|
59
|
+
Defaults to 0.01.
|
|
60
|
+
num_hashes (int, optional): Number of hashes for the probabilistic filter.
|
|
61
|
+
Defaults to 7.
|
|
62
|
+
training_accessions (dict[str, list[str]] | None, optional): Accessions used for
|
|
63
|
+
training the probabilistic filter. Defaults to None.
|
|
64
|
+
svm_accessions (dict[str, list[str]] | None, optional): Accessions used for
|
|
65
|
+
training the SVM. Defaults to None.
|
|
62
66
|
"""
|
|
63
67
|
super().__init__(
|
|
64
68
|
k=k,
|
|
@@ -112,17 +116,18 @@ class ProbabilisticFilterSVMModel(ProbabilisticFilterModel):
|
|
|
112
116
|
"""
|
|
113
117
|
Fit the SVM to the sequences and labels.
|
|
114
118
|
|
|
115
|
-
This method first trains the probabilistic filter model and then
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
+
This method first trains the probabilistic filter model and then calculates scores for
|
|
120
|
+
the SVM training. It expects the sequences to be in the specified directory and the SVM
|
|
121
|
+
training sequences to be in the specified SVM path. The scores are saved in a CSV file
|
|
122
|
+
for later use.
|
|
119
123
|
|
|
120
124
|
Args:
|
|
121
125
|
dir_path (Path): The directory containing the training sequences.
|
|
122
126
|
svm_path (Path): The directory containing the SVM training sequences.
|
|
123
127
|
display_names (dict[str, str] | None): A mapping of accession IDs to display names.
|
|
124
128
|
svm_step (int): Step size for sparse sampling in SVM training.
|
|
125
|
-
training_accessions (dict[str, list[str]] | None): Accessions used for training the
|
|
129
|
+
training_accessions (dict[str, list[str]] | None): Accessions used for training the
|
|
130
|
+
probabilistic filter.
|
|
126
131
|
svm_accessions (dict[str, list[str]] | None): Accessions used for training the SVM.
|
|
127
132
|
"""
|
|
128
133
|
|
|
@@ -178,6 +183,7 @@ class ProbabilisticFilterSVMModel(ProbabilisticFilterModel):
|
|
|
178
183
|
),
|
|
179
184
|
filter_ids: list[str] = None,
|
|
180
185
|
step: int = 1,
|
|
186
|
+
display_name: bool = False,
|
|
181
187
|
) -> ModelResult:
|
|
182
188
|
"""
|
|
183
189
|
Predict the labels of the sequences.
|
|
@@ -187,25 +193,33 @@ class ProbabilisticFilterSVMModel(ProbabilisticFilterModel):
|
|
|
187
193
|
with the probabilistic filter model, and it will return a `ModelResult`.
|
|
188
194
|
|
|
189
195
|
Args:
|
|
190
|
-
sequence_input (SeqRecord | list[SeqRecord] | SeqIO.FastaIO.FastaIterator |
|
|
191
|
-
|
|
196
|
+
sequence_input (SeqRecord | list[SeqRecord] | SeqIO.FastaIO.FastaIterator |
|
|
197
|
+
SeqIO.QualityIO.FastqPhredIterator | Path): The input sequences to predict.
|
|
198
|
+
filter_ids (list[str], optional): A list of IDs to filter the predictions.
|
|
192
199
|
step (int, optional): Step size for sparse sampling. Defaults to 1.
|
|
200
|
+
display_name (bool): Includes a display name for each tax_ID.
|
|
193
201
|
|
|
194
202
|
Returns:
|
|
195
|
-
ModelResult: The result of the prediction containing hits, number of kmers, and the
|
|
203
|
+
ModelResult: The result of the prediction containing hits, number of kmers, and the
|
|
204
|
+
predicted label.
|
|
196
205
|
"""
|
|
197
206
|
# get scores and format them for the SVM
|
|
198
|
-
res = super().predict(sequence_input, filter_ids, step
|
|
207
|
+
res = super().predict(sequence_input, filter_ids, step, display_name)
|
|
199
208
|
svm_scores = dict(sorted(res.get_scores()["total"].items()))
|
|
200
209
|
svm_scores = [list(svm_scores.values())]
|
|
201
210
|
|
|
202
211
|
svm = self._get_svm(filter_ids)
|
|
212
|
+
svm_prediction = str(svm.predict(svm_scores)[0])
|
|
213
|
+
if display_name:
|
|
214
|
+
svm_prediction = f"{svm_prediction} -{self.display_names.get(svm_prediction, 'Unknown')}".replace(
|
|
215
|
+
self.model_display_name, "", 1
|
|
216
|
+
)
|
|
203
217
|
return ModelResult(
|
|
204
218
|
self.slug(),
|
|
205
219
|
res.hits,
|
|
206
220
|
res.num_kmers,
|
|
207
221
|
sparse_sampling_step=step,
|
|
208
|
-
prediction=
|
|
222
|
+
prediction=svm_prediction,
|
|
209
223
|
)
|
|
210
224
|
|
|
211
225
|
def _get_svm(self, id_keys) -> SVC:
|
|
@@ -217,7 +231,8 @@ class ProbabilisticFilterSVMModel(ProbabilisticFilterModel):
|
|
|
217
231
|
training data to only include those keys.
|
|
218
232
|
|
|
219
233
|
Args:
|
|
220
|
-
id_keys (list[str] | None): A list of IDs to filter the training data.
|
|
234
|
+
id_keys (list[str] | None): A list of IDs to filter the training data.
|
|
235
|
+
If None, all data is used.
|
|
221
236
|
|
|
222
237
|
Returns:
|
|
223
238
|
SVC: The trained SVM model.
|
|
@@ -34,8 +34,8 @@ class ProbabilisticSingleFilterModel(ProbabilisticFilterModel):
|
|
|
34
34
|
) -> None:
|
|
35
35
|
"""Initialize probabilistic single filter model.
|
|
36
36
|
|
|
37
|
-
This model uses a Bloom filter to store k-mers from the training sequences. It is designed
|
|
38
|
-
be used with a single filter, which is suitable e.g. for genus-level classification.
|
|
37
|
+
This model uses a Bloom filter to store k-mers from the training sequences. It is designed
|
|
38
|
+
to be used with a single filter, which is suitable e.g. for genus-level classification.
|
|
39
39
|
|
|
40
40
|
Args:
|
|
41
41
|
k (int): Length of the k-mers to use for filtering
|
|
@@ -45,7 +45,7 @@ class ProbabilisticSingleFilterModel(ProbabilisticFilterModel):
|
|
|
45
45
|
model_type (str): Type of the model, e.g. "probabilistic_single_filter"
|
|
46
46
|
base_path (Path): Base path where the model will be saved
|
|
47
47
|
fpr (float): False positive rate for the Bloom filter, default is 0.01
|
|
48
|
-
training_accessions (list[str] | None): List of accessions used for training
|
|
48
|
+
training_accessions (list[str] | None): List of accessions used for training
|
|
49
49
|
"""
|
|
50
50
|
super().__init__(
|
|
51
51
|
k=k,
|
|
@@ -75,7 +75,7 @@ class ProbabilisticSingleFilterModel(ProbabilisticFilterModel):
|
|
|
75
75
|
Args:
|
|
76
76
|
file_path (Path): Path to the file containing sequences in FASTA format
|
|
77
77
|
display_name (str): Display name for the model
|
|
78
|
-
training_accessions (list[str] | None): List of accessions used for training
|
|
78
|
+
training_accessions (list[str] | None): List of accessions used for training
|
|
79
79
|
"""
|
|
80
80
|
self.training_accessions = training_accessions
|
|
81
81
|
|
|
@@ -104,7 +104,7 @@ class ProbabilisticSingleFilterModel(ProbabilisticFilterModel):
|
|
|
104
104
|
Calculates the number of k-mers in the sequence that are present in the Bloom filter.
|
|
105
105
|
|
|
106
106
|
Args:
|
|
107
|
-
sequence (Seq | SeqRecord): Sequence to calculate hits for
|
|
107
|
+
sequence (Seq | SeqRecord): Sequence to calculate hits for
|
|
108
108
|
filter_ids (list[str] | None): List of filter IDs to use, default is None
|
|
109
109
|
step (int): Step size for generating k-mers, default is 1
|
|
110
110
|
Returns:
|
|
@@ -162,13 +162,15 @@ class ProbabilisticSingleFilterModel(ProbabilisticFilterModel):
|
|
|
162
162
|
"""
|
|
163
163
|
Generate kmers from the sequence
|
|
164
164
|
|
|
165
|
-
Generates k-mers from the sequence, considering both the forward and reverse complement
|
|
165
|
+
Generates k-mers from the sequence, considering both the forward and reverse complement
|
|
166
|
+
strands.
|
|
166
167
|
|
|
167
168
|
Args:
|
|
168
169
|
sequence (Seq): Sequence to generate k-mers from
|
|
169
170
|
step (int): Step size for generating k-mers, default is 1
|
|
170
171
|
Yields:
|
|
171
|
-
str: The minimizer k-mer (the lexicographically smallest k-mer between the forward and
|
|
172
|
+
str: The minimizer k-mer (the lexicographically smallest k-mer between the forward and
|
|
173
|
+
reverse complement)
|
|
172
174
|
"""
|
|
173
175
|
num_kmers = ceil((len(sequence) - self.k + 1) / step)
|
|
174
176
|
for i in range(num_kmers):
|
xspect/models/result.py
CHANGED
|
@@ -50,7 +50,8 @@ class ModelResult:
|
|
|
50
50
|
|
|
51
51
|
Returns:
|
|
52
52
|
dict: A dictionary where keys are subsequence names and values are dictionaries
|
|
53
|
-
with labels as keys and scores as values. Also includes a 'total' key for
|
|
53
|
+
with labels as keys and scores as values. Also includes a 'total' key for
|
|
54
|
+
overall scores.
|
|
54
55
|
"""
|
|
55
56
|
scores = {
|
|
56
57
|
subsequence: {
|
|
@@ -78,7 +79,8 @@ class ModelResult:
|
|
|
78
79
|
The total hits are calculated by summing the hits for each label across all subsequences.
|
|
79
80
|
|
|
80
81
|
Returns:
|
|
81
|
-
dict: A dictionary where keys are labels and values are the total number of hits for
|
|
82
|
+
dict: A dictionary where keys are labels and values are the total number of hits for
|
|
83
|
+
that label.
|
|
82
84
|
"""
|
|
83
85
|
total_hits = {label: 0 for label in list(self.hits.values())[0]}
|
|
84
86
|
for _, subsequence_hits in self.hits.items():
|
|
@@ -97,8 +99,8 @@ class ModelResult:
|
|
|
97
99
|
|
|
98
100
|
Args:
|
|
99
101
|
label (str): The label for which to filter the subsequences.
|
|
100
|
-
filter_threshold (float): The threshold for filtering subsequences. Must be between 0
|
|
101
|
-
or -1 to return the subsequence with the maximum score for the label.
|
|
102
|
+
filter_threshold (float): The threshold for filtering subsequences. Must be between 0
|
|
103
|
+
and 1, or -1 to return the subsequence with the maximum score for the label.
|
|
102
104
|
|
|
103
105
|
Returns:
|
|
104
106
|
dict[str, bool]: A dictionary where keys are subsequence names and values are booleans
|
|
@@ -114,11 +116,10 @@ class ModelResult:
|
|
|
114
116
|
subsequence: score[label] >= filter_threshold
|
|
115
117
|
for subsequence, score in scores.items()
|
|
116
118
|
}
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
}
|
|
119
|
+
return {
|
|
120
|
+
subsequence: score[label] == max(score.values())
|
|
121
|
+
for subsequence, score in scores.items()
|
|
122
|
+
}
|
|
122
123
|
|
|
123
124
|
def get_filtered_subsequence_labels(
|
|
124
125
|
self, label: str, filter_threshold: float = 0.7
|
|
@@ -126,15 +127,17 @@ class ModelResult:
|
|
|
126
127
|
"""
|
|
127
128
|
Return the labels of filtered subsequences.
|
|
128
129
|
|
|
129
|
-
This method filters subsequences based on the scores for a given label and a filter
|
|
130
|
+
This method filters subsequences based on the scores for a given label and a filter
|
|
131
|
+
threshold.
|
|
130
132
|
|
|
131
133
|
Args:
|
|
132
134
|
label (str): The label for which to filter the subsequences.
|
|
133
|
-
filter_threshold (float): The threshold for filtering subsequences. Must be between 0
|
|
134
|
-
or -1 to return the subsequence with the maximum score for the label.
|
|
135
|
+
filter_threshold (float): The threshold for filtering subsequences. Must be between 0
|
|
136
|
+
and 1, or -1 to return the subsequence with the maximum score for the label.
|
|
135
137
|
|
|
136
138
|
Returns:
|
|
137
|
-
list[str]: A list of subsequence names that meet the filter criteria for the given
|
|
139
|
+
list[str]: A list of subsequence names that meet the filter criteria for the given
|
|
140
|
+
label.
|
|
138
141
|
"""
|
|
139
142
|
return [
|
|
140
143
|
subsequence
|
|
@@ -148,11 +151,13 @@ class ModelResult:
|
|
|
148
151
|
"""
|
|
149
152
|
Return the result as a dictionary.
|
|
150
153
|
|
|
151
|
-
This method converts the ModelResult object into a dictionary format suitable for
|
|
154
|
+
This method converts the ModelResult object into a dictionary format suitable for
|
|
155
|
+
serialization.
|
|
152
156
|
|
|
153
157
|
Returns:
|
|
154
158
|
dict: A dictionary representation of the ModelResult object, including model slug,
|
|
155
|
-
sparse sampling step, hits, scores, number of k-mers, input source, and prediction if
|
|
159
|
+
sparse sampling step, hits, scores, number of k-mers, input source, and prediction if
|
|
160
|
+
available.
|
|
156
161
|
"""
|
|
157
162
|
res = {
|
|
158
163
|
"model_slug": self.model_slug,
|
xspect/web.py
CHANGED
|
@@ -1,17 +1,26 @@
|
|
|
1
1
|
"""FastAPI-based web application for XspecT."""
|
|
2
2
|
|
|
3
|
+
# pylint: disable=too-many-arguments,too-many-positional-arguments
|
|
4
|
+
|
|
5
|
+
|
|
3
6
|
from uuid import uuid4
|
|
4
7
|
import json
|
|
5
8
|
from shutil import copyfileobj
|
|
6
9
|
import importlib.resources as pkg_resources
|
|
7
|
-
from fastapi import
|
|
10
|
+
from fastapi import (
|
|
11
|
+
APIRouter,
|
|
12
|
+
BackgroundTasks,
|
|
13
|
+
FastAPI,
|
|
14
|
+
HTTPException,
|
|
15
|
+
UploadFile,
|
|
16
|
+
)
|
|
8
17
|
from fastapi.responses import FileResponse, RedirectResponse
|
|
18
|
+
from fastapi.staticfiles import StaticFiles
|
|
9
19
|
from xspect.definitions import get_xspect_runs_path, get_xspect_upload_path
|
|
10
20
|
from xspect.download_models import download_test_models
|
|
11
21
|
import xspect.model_management as mm
|
|
12
22
|
from xspect.train import train_from_ncbi
|
|
13
23
|
from xspect import classify, filter_sequences
|
|
14
|
-
from fastapi.staticfiles import StaticFiles
|
|
15
24
|
|
|
16
25
|
app = FastAPI()
|
|
17
26
|
app.mount(
|
|
@@ -72,7 +81,7 @@ def classify_post(
|
|
|
72
81
|
)
|
|
73
82
|
return {"message": "Classification started.", "uuid": uuid}
|
|
74
83
|
|
|
75
|
-
|
|
84
|
+
if classification_type == "Species":
|
|
76
85
|
background_tasks.add_task(
|
|
77
86
|
classify.classify_species,
|
|
78
87
|
model,
|
|
@@ -119,7 +128,7 @@ def filter_post(
|
|
|
119
128
|
)
|
|
120
129
|
return {"message": "Genus filtering started.", "uuid": uuid}
|
|
121
130
|
|
|
122
|
-
|
|
131
|
+
if filter_type == "Species":
|
|
123
132
|
if not filter_species:
|
|
124
133
|
raise ValueError("filter_species must be provided for species filtering.")
|
|
125
134
|
background_tasks.add_task(
|
|
@@ -1,23 +1,23 @@
|
|
|
1
1
|
xspect/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
-
xspect/classify.py,sha256=
|
|
3
|
-
xspect/definitions.py,sha256=
|
|
2
|
+
xspect/classify.py,sha256=SOTfUsEarZVbVtBuwNTlOYQ_MvoW0ADAqH7rTASWEI8,3936
|
|
3
|
+
xspect/definitions.py,sha256=8PpU8bpzcwv8PPacncywz-Na_MicMl-JsvjiX3e46yo,2734
|
|
4
4
|
xspect/download_models.py,sha256=VALcnowzkUpR-OAvgB5BUdEq9WnyNbli0CxH3OT40Rc,1121
|
|
5
|
-
xspect/file_io.py,sha256=
|
|
6
|
-
xspect/filter_sequences.py,sha256=
|
|
7
|
-
xspect/main.py,sha256=
|
|
8
|
-
xspect/model_management.py,sha256=
|
|
5
|
+
xspect/file_io.py,sha256=QX2nBtlLAexBdfUr7rtHLlWOuXiaKvfRdpn1Dn0avnY,8120
|
|
6
|
+
xspect/filter_sequences.py,sha256=QKjgUCk3RBY3U9hHmyvSQeQt8n1voBna-NjOoTqdp3A,5196
|
|
7
|
+
xspect/main.py,sha256=vRCsSH_QVKY6usU5d5pjehPAhk4WJfZ_eyl_0xUGu5E,14137
|
|
8
|
+
xspect/model_management.py,sha256=yWbCk6tUn7-OYpzH0BViX2oWr4cdNkEBjrvnaw5GPdQ,4893
|
|
9
9
|
xspect/ncbi.py,sha256=VRbFvtfGR4WTsc3buZE9UCabE3OJUTRphDRY20g63-E,11704
|
|
10
10
|
xspect/train.py,sha256=jxjK4OqzTywmd5KGPou9A-doH8Nwhlv_xF4X7M6X_jI,11588
|
|
11
|
-
xspect/web.py,sha256=
|
|
11
|
+
xspect/web.py,sha256=kM4BZ3fA0f731EEXScAaiGrJZvjjfep1iC1iZemfazw,7039
|
|
12
12
|
xspect/mlst_feature/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
13
|
-
xspect/mlst_feature/mlst_helper.py,sha256=
|
|
14
|
-
xspect/mlst_feature/pub_mlst_handler.py,sha256=
|
|
13
|
+
xspect/mlst_feature/mlst_helper.py,sha256=pxRX_nRbrTSIFPf_FDV3dxR_FonmGtxttFgqNS7sIxE,8130
|
|
14
|
+
xspect/mlst_feature/pub_mlst_handler.py,sha256=gX0bgAqXTaW9weWgxcbsiD7UtMGuDD9veE9mj42Ffm8,7685
|
|
15
15
|
xspect/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
16
|
-
xspect/models/probabilistic_filter_mlst_model.py,sha256=
|
|
17
|
-
xspect/models/probabilistic_filter_model.py,sha256=
|
|
18
|
-
xspect/models/probabilistic_filter_svm_model.py,sha256=
|
|
19
|
-
xspect/models/probabilistic_single_filter_model.py,sha256=
|
|
20
|
-
xspect/models/result.py,sha256=
|
|
16
|
+
xspect/models/probabilistic_filter_mlst_model.py,sha256=w9ibUkAYA-DSOEkU8fBenlENrs8JwRRLaF5KO1HVKoM,17716
|
|
17
|
+
xspect/models/probabilistic_filter_model.py,sha256=pUgkN4E2EO-gePVR4BMndgMhJcyvOfVzfjVypjIz2JA,19047
|
|
18
|
+
xspect/models/probabilistic_filter_svm_model.py,sha256=n_HMARvcUMP1i-csiW8uvskcocrvhWMjue7kfsaKPpI,11146
|
|
19
|
+
xspect/models/probabilistic_single_filter_model.py,sha256=vJvKZrAybYHq_UdKQ2GvvVwgTYwqRrL-nDDQZxb6RRc,6828
|
|
20
|
+
xspect/models/result.py,sha256=Wpsm9EYrvMazDO0JAqF51Sb8BJqAZwYx4G6-SUOt5-c,7070
|
|
21
21
|
xspect/xspect-web/.gitignore,sha256=_nGOe6uxTzy60tl_CIibnOUhXtP-DkOyuM-_s7m4ROg,253
|
|
22
22
|
xspect/xspect-web/README.md,sha256=Fa5cCk66ohbqD_AAVgnXUZLhuzshnLxhlUFhxyscScc,1942
|
|
23
23
|
xspect/xspect-web/components.json,sha256=5emhfq5JRW9J8Zga-1N5jAcj4B-r8VREXnH7Z6tZGNk,425
|
|
@@ -78,9 +78,9 @@ xspect/xspect-web/src/components/ui/switch.tsx,sha256=uIqRXtd41ba0eusIEUWVyYZv82
|
|
|
78
78
|
xspect/xspect-web/src/components/ui/table.tsx,sha256=M2-TIHKwPFWuXrwysSufdQRSMJT-K9jPzGOokfU6PXo,2463
|
|
79
79
|
xspect/xspect-web/src/components/ui/tabs.tsx,sha256=BImHKcdDCtrS3CCV1AGgn8qg0b65RB5P-QdH49IAhx0,1955
|
|
80
80
|
xspect/xspect-web/src/lib/utils.ts,sha256=66ibdQiEHKftZBq1OMLmOKqWma1BkO-O60rc1IQYwLE,165
|
|
81
|
-
xspect-0.5.
|
|
82
|
-
xspect-0.5.
|
|
83
|
-
xspect-0.5.
|
|
84
|
-
xspect-0.5.
|
|
85
|
-
xspect-0.5.
|
|
86
|
-
xspect-0.5.
|
|
81
|
+
xspect-0.5.4.dist-info/licenses/LICENSE,sha256=bhBGDKIRUVwYIHGOGO5hshzuVHyqFJajvSOA3XXOLKI,1094
|
|
82
|
+
xspect-0.5.4.dist-info/METADATA,sha256=T1EVSE_qesDZjlSCaq3xgnUN57n0NIFjOIQCi4swsEo,4569
|
|
83
|
+
xspect-0.5.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
84
|
+
xspect-0.5.4.dist-info/entry_points.txt,sha256=L7qliX3pIuwupQxpuOSsrBJCSHYPOPNEzH8KZKQGGUw,43
|
|
85
|
+
xspect-0.5.4.dist-info/top_level.txt,sha256=hdoa4cnBv6OVzpyhMmyxpJxEydH5n2lDciy8urc1paE,7
|
|
86
|
+
xspect-0.5.4.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|