XspecT 0.5.0__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of XspecT might be problematic. Click here for more details.
- xspect/classify.py +41 -6
- xspect/definitions.py +11 -3
- xspect/filter_sequences.py +104 -22
- xspect/main.py +52 -32
- xspect/models/probabilistic_filter_model.py +7 -7
- xspect/models/probabilistic_filter_svm_model.py +7 -7
- xspect/models/probabilistic_single_filter_model.py +7 -4
- xspect/ncbi.py +3 -2
- xspect/train.py +17 -10
- {xspect-0.5.0.dist-info → xspect-0.5.1.dist-info}/METADATA +11 -5
- {xspect-0.5.0.dist-info → xspect-0.5.1.dist-info}/RECORD +15 -15
- {xspect-0.5.0.dist-info → xspect-0.5.1.dist-info}/WHEEL +1 -1
- {xspect-0.5.0.dist-info → xspect-0.5.1.dist-info}/entry_points.txt +0 -0
- {xspect-0.5.0.dist-info → xspect-0.5.1.dist-info}/licenses/LICENSE +0 -0
- {xspect-0.5.0.dist-info → xspect-0.5.1.dist-info}/top_level.txt +0 -0
xspect/classify.py
CHANGED
|
@@ -4,6 +4,7 @@ import xspect.model_management as mm
|
|
|
4
4
|
from xspect.models.probabilistic_filter_mlst_model import (
|
|
5
5
|
ProbabilisticFilterMlstSchemeModel,
|
|
6
6
|
)
|
|
7
|
+
from xspect.definitions import fasta_endings, fastq_endings
|
|
7
8
|
|
|
8
9
|
|
|
9
10
|
def classify_genus(
|
|
@@ -11,17 +12,51 @@ def classify_genus(
|
|
|
11
12
|
):
|
|
12
13
|
"""Classify the input file using the genus model."""
|
|
13
14
|
model = mm.get_genus_model(model_genus)
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
15
|
+
|
|
16
|
+
input_paths = []
|
|
17
|
+
input_is_dir = input_path.is_dir()
|
|
18
|
+
ending_wildcards = [f"*.{ending}" for ending in fasta_endings + fastq_endings]
|
|
19
|
+
|
|
20
|
+
if input_is_dir:
|
|
21
|
+
input_paths = [p for e in ending_wildcards for p in input_path.glob(e)]
|
|
22
|
+
elif input_path.is_file():
|
|
23
|
+
input_paths = [input_path]
|
|
24
|
+
|
|
25
|
+
for idx, current_path in enumerate(input_paths):
|
|
26
|
+
result = model.predict(current_path, step=step)
|
|
27
|
+
result.input_source = current_path.name
|
|
28
|
+
output_name = (
|
|
29
|
+
f"{output_path.stem}_{idx+1}{output_path.suffix}"
|
|
30
|
+
if input_is_dir
|
|
31
|
+
else output_path.name
|
|
32
|
+
)
|
|
33
|
+
result.save(output_path.parent / output_name)
|
|
34
|
+
print(f"Saved result as {output_name}")
|
|
17
35
|
|
|
18
36
|
|
|
19
37
|
def classify_species(model_genus, input_path, output_path, step=1):
|
|
20
38
|
"""Classify the input file using the species model."""
|
|
21
39
|
model = mm.get_species_model(model_genus)
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
40
|
+
|
|
41
|
+
input_paths = []
|
|
42
|
+
input_is_dir = input_path.is_dir()
|
|
43
|
+
ending_wildcards = [f"*.{ending}" for ending in fasta_endings + fastq_endings]
|
|
44
|
+
|
|
45
|
+
if input_is_dir:
|
|
46
|
+
input_paths = [p for e in ending_wildcards for p in input_path.glob(e)]
|
|
47
|
+
elif input_path.is_file():
|
|
48
|
+
input_paths = [input_path]
|
|
49
|
+
|
|
50
|
+
for idx, current_path in enumerate(input_paths):
|
|
51
|
+
result = model.predict(current_path, step=step)
|
|
52
|
+
result.input_source = current_path.name
|
|
53
|
+
output_name = (
|
|
54
|
+
f"{output_path.stem}_{idx+1}{output_path.suffix}"
|
|
55
|
+
if input_is_dir
|
|
56
|
+
else output_path.name
|
|
57
|
+
)
|
|
58
|
+
result.save(output_path.parent / output_name)
|
|
59
|
+
print(f"Saved result as {output_name}")
|
|
25
60
|
|
|
26
61
|
|
|
27
62
|
def classify_mlst(input_path, output_path):
|
xspect/definitions.py
CHANGED
|
@@ -9,9 +9,17 @@ fastq_endings = ["fastq", "fq"]
|
|
|
9
9
|
|
|
10
10
|
def get_xspect_root_path():
|
|
11
11
|
"""Return the root path for XspecT data."""
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
12
|
+
|
|
13
|
+
home_based_dir = Path.home() / "xspect-data"
|
|
14
|
+
if home_based_dir.exists():
|
|
15
|
+
return home_based_dir
|
|
16
|
+
|
|
17
|
+
cwd_based_dir = Path(getcwd()) / "xspect-data"
|
|
18
|
+
if cwd_based_dir.exists():
|
|
19
|
+
return cwd_based_dir
|
|
20
|
+
|
|
21
|
+
home_based_dir.mkdir(exist_ok=True, parents=True)
|
|
22
|
+
return home_based_dir
|
|
15
23
|
|
|
16
24
|
|
|
17
25
|
def get_xspect_model_path():
|
xspect/filter_sequences.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
from pathlib import Path
|
|
2
2
|
from xspect.model_management import get_genus_model, get_species_model
|
|
3
3
|
from xspect.file_io import filter_sequences
|
|
4
|
+
from xspect.definitions import fasta_endings, fastq_endings
|
|
4
5
|
|
|
5
6
|
|
|
6
7
|
def filter_species(
|
|
@@ -9,6 +10,7 @@ def filter_species(
|
|
|
9
10
|
input_path: Path,
|
|
10
11
|
output_path: Path,
|
|
11
12
|
threshold: float,
|
|
13
|
+
classification_output_path: Path | None = None,
|
|
12
14
|
):
|
|
13
15
|
"""Filter sequences by species.
|
|
14
16
|
This function filters sequences from the input file based on the species model.
|
|
@@ -20,20 +22,56 @@ def filter_species(
|
|
|
20
22
|
model_species (str): The species model slug.
|
|
21
23
|
input_path (Path): The path to the input file containing sequences.
|
|
22
24
|
output_path (Path): The path to the output file where filtered sequences will be saved.
|
|
25
|
+
above this threshold will be included in the output file. A threshold of -1 will
|
|
26
|
+
include only sequences if the species score is the highest among the
|
|
27
|
+
available species scores.
|
|
28
|
+
classification_output_path (Path): Optional path to save the classification results.
|
|
23
29
|
threshold (float): The threshold for filtering sequences. Only sequences with a score
|
|
24
|
-
above this threshold will be included in the output file.
|
|
30
|
+
above this threshold will be included in the output file. A threshold of -1 will
|
|
31
|
+
include only sequences if the species score is the highest among the
|
|
32
|
+
available species scores.
|
|
25
33
|
"""
|
|
26
34
|
species_model = get_species_model(model_genus)
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
input_path
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
35
|
+
|
|
36
|
+
input_paths = []
|
|
37
|
+
input_is_dir = input_path.is_dir()
|
|
38
|
+
ending_wildcards = [f"*.{ending}" for ending in fasta_endings + fastq_endings]
|
|
39
|
+
|
|
40
|
+
if input_is_dir:
|
|
41
|
+
input_paths = [p for e in ending_wildcards for p in input_path.glob(e)]
|
|
42
|
+
elif input_path.is_file():
|
|
43
|
+
input_paths = [input_path]
|
|
44
|
+
|
|
45
|
+
for idx, current_path in enumerate(input_paths):
|
|
46
|
+
result = species_model.predict(current_path)
|
|
47
|
+
result.input_source = current_path.name
|
|
48
|
+
|
|
49
|
+
if classification_output_path:
|
|
50
|
+
classification_output_name = (
|
|
51
|
+
f"{classification_output_path.stem}_{idx+1}{classification_output_path.suffix}"
|
|
52
|
+
if input_is_dir
|
|
53
|
+
else classification_output_path.name
|
|
54
|
+
)
|
|
55
|
+
result.save(classification_output_path.parent / classification_output_name)
|
|
56
|
+
print(
|
|
57
|
+
f"Saved classification results from {current_path.name} as {classification_output_name}"
|
|
58
|
+
)
|
|
59
|
+
|
|
60
|
+
included_ids = result.get_filtered_subsequence_labels(model_species, threshold)
|
|
61
|
+
if not included_ids:
|
|
62
|
+
print(f"No sequences found for the given species in {current_path.name}.")
|
|
63
|
+
continue
|
|
64
|
+
output_name = (
|
|
65
|
+
f"{output_path.stem}_{idx+1}{output_path.suffix}"
|
|
66
|
+
if input_is_dir
|
|
67
|
+
else output_path.name
|
|
68
|
+
)
|
|
69
|
+
filter_sequences(
|
|
70
|
+
current_path,
|
|
71
|
+
output_path.parent / output_name,
|
|
72
|
+
included_ids,
|
|
73
|
+
)
|
|
74
|
+
print(f"Saved filtered sequences from {current_path.name} as {output_name}")
|
|
37
75
|
|
|
38
76
|
|
|
39
77
|
def filter_genus(
|
|
@@ -41,16 +79,60 @@ def filter_genus(
|
|
|
41
79
|
input_path: Path,
|
|
42
80
|
output_path: Path,
|
|
43
81
|
threshold: float,
|
|
82
|
+
classification_output_path: Path | None = None,
|
|
44
83
|
):
|
|
84
|
+
"""Filter sequences by genus.
|
|
85
|
+
This function filters sequences from the input file based on the genus model.
|
|
86
|
+
It uses the genus model to identify the genus of the sequences and then applies
|
|
87
|
+
the filtering based on the provided threshold.
|
|
88
|
+
|
|
89
|
+
Args:
|
|
90
|
+
model_genus (str): The genus model slug.
|
|
91
|
+
input_path (Path): The path to the input file containing sequences.
|
|
92
|
+
output_path (Path): The path to the output file where filtered sequences will be saved.
|
|
93
|
+
threshold (float): The threshold for filtering sequences. Only sequences with a score
|
|
94
|
+
above this threshold will be included in the output file.
|
|
95
|
+
classification_output_path (Path): Optional path to save the classification results.
|
|
96
|
+
|
|
97
|
+
"""
|
|
45
98
|
genus_model = get_genus_model(model_genus)
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
)
|
|
99
|
+
|
|
100
|
+
input_paths = []
|
|
101
|
+
input_is_dir = input_path.is_dir()
|
|
102
|
+
ending_wildcards = [f"*.{ending}" for ending in fasta_endings + fastq_endings]
|
|
103
|
+
|
|
104
|
+
if input_is_dir:
|
|
105
|
+
input_paths = [p for e in ending_wildcards for p in input_path.glob(e)]
|
|
106
|
+
elif input_path.is_file():
|
|
107
|
+
input_paths = [input_path]
|
|
108
|
+
|
|
109
|
+
for idx, current_path in enumerate(input_paths):
|
|
110
|
+
result = genus_model.predict(current_path)
|
|
111
|
+
result.input_source = current_path.name
|
|
112
|
+
|
|
113
|
+
if classification_output_path:
|
|
114
|
+
classification_output_name = (
|
|
115
|
+
f"{classification_output_path.stem}_{idx+1}{classification_output_path.suffix}"
|
|
116
|
+
if input_is_dir
|
|
117
|
+
else classification_output_path.name
|
|
118
|
+
)
|
|
119
|
+
result.save(classification_output_path.parent / classification_output_name)
|
|
120
|
+
print(
|
|
121
|
+
f"Saved classification results from {current_path.name} as {classification_output_name}"
|
|
122
|
+
)
|
|
123
|
+
|
|
124
|
+
included_ids = result.get_filtered_subsequence_labels(model_genus, threshold)
|
|
125
|
+
if not included_ids:
|
|
126
|
+
print(f"No sequences found for the given genus in {current_path.name}.")
|
|
127
|
+
continue
|
|
128
|
+
output_name = (
|
|
129
|
+
f"{output_path.stem}_{idx+1}{output_path.suffix}"
|
|
130
|
+
if input_is_dir
|
|
131
|
+
else output_path.name
|
|
132
|
+
)
|
|
133
|
+
filter_sequences(
|
|
134
|
+
current_path,
|
|
135
|
+
output_path.parent / output_name,
|
|
136
|
+
included_ids,
|
|
137
|
+
)
|
|
138
|
+
print(f"Saved filtered sequences from {current_path.name} as {output_name}")
|
xspect/main.py
CHANGED
|
@@ -7,12 +7,12 @@ import uvicorn
|
|
|
7
7
|
from xspect import classify
|
|
8
8
|
from xspect.web import app
|
|
9
9
|
from xspect.download_models import download_test_models
|
|
10
|
-
from xspect
|
|
10
|
+
from xspect import filter_sequences
|
|
11
11
|
from xspect.train import train_from_directory, train_from_ncbi
|
|
12
12
|
from xspect.definitions import (
|
|
13
13
|
get_xspect_model_path,
|
|
14
14
|
)
|
|
15
|
-
from xspect.mlst_feature.mlst_helper import pick_scheme
|
|
15
|
+
from xspect.mlst_feature.mlst_helper import pick_scheme
|
|
16
16
|
from xspect.mlst_feature.pub_mlst_handler import PubMLSTHandler
|
|
17
17
|
from xspect.models.probabilistic_filter_mlst_model import (
|
|
18
18
|
ProbabilisticFilterMlstSchemeModel,
|
|
@@ -211,19 +211,19 @@ def classify_seqs():
|
|
|
211
211
|
help="Path to FASTA or FASTQ file for classification.",
|
|
212
212
|
type=click.Path(exists=True, dir_okay=True, file_okay=True),
|
|
213
213
|
prompt=True,
|
|
214
|
+
default=Path("."),
|
|
214
215
|
)
|
|
215
216
|
@click.option(
|
|
216
217
|
"-o",
|
|
217
218
|
"--output-path",
|
|
218
219
|
help="Path to the output file.",
|
|
219
|
-
type=click.Path(dir_okay=
|
|
220
|
+
type=click.Path(dir_okay=False, file_okay=True),
|
|
220
221
|
default=Path(".") / f"result_{uuid4()}.json",
|
|
221
222
|
)
|
|
222
223
|
def classify_genus(model_genus, input_path, output_path):
|
|
223
224
|
"""Classify samples using a genus model."""
|
|
224
225
|
click.echo("Classifying...")
|
|
225
226
|
classify.classify_genus(model_genus, Path(input_path), Path(output_path))
|
|
226
|
-
click.echo(f"Result saved as {output_path}.")
|
|
227
227
|
|
|
228
228
|
|
|
229
229
|
@classify_seqs.command(
|
|
@@ -244,12 +244,13 @@ def classify_genus(model_genus, input_path, output_path):
|
|
|
244
244
|
help="Path to FASTA or FASTQ file for classification.",
|
|
245
245
|
type=click.Path(exists=True, dir_okay=True, file_okay=True),
|
|
246
246
|
prompt=True,
|
|
247
|
+
default=Path("."),
|
|
247
248
|
)
|
|
248
249
|
@click.option(
|
|
249
250
|
"-o",
|
|
250
251
|
"--output-path",
|
|
251
252
|
help="Path to the output file.",
|
|
252
|
-
type=click.Path(dir_okay=
|
|
253
|
+
type=click.Path(dir_okay=False, file_okay=True),
|
|
253
254
|
default=Path(".") / f"result_{uuid4()}.json",
|
|
254
255
|
)
|
|
255
256
|
@click.option(
|
|
@@ -264,7 +265,6 @@ def classify_species(model_genus, input_path, output_path, sparse_sampling_step)
|
|
|
264
265
|
classify.classify_species(
|
|
265
266
|
model_genus, Path(input_path), Path(output_path), sparse_sampling_step
|
|
266
267
|
)
|
|
267
|
-
click.echo(f"Result saved as {output_path}.")
|
|
268
268
|
|
|
269
269
|
|
|
270
270
|
@classify_seqs.command(
|
|
@@ -275,15 +275,14 @@ def classify_species(model_genus, input_path, output_path, sparse_sampling_step)
|
|
|
275
275
|
"-i",
|
|
276
276
|
"--input-path",
|
|
277
277
|
help="Path to FASTA-file for mlst identification.",
|
|
278
|
-
type=click.Path(exists=True, dir_okay=
|
|
278
|
+
type=click.Path(exists=True, dir_okay=False, file_okay=True),
|
|
279
279
|
prompt=True,
|
|
280
280
|
)
|
|
281
281
|
@click.option(
|
|
282
282
|
"-o",
|
|
283
283
|
"--output-path",
|
|
284
284
|
help="Path to the output file.",
|
|
285
|
-
type=click.Path(dir_okay=
|
|
286
|
-
default=Path(".") / f"result_{uuid4()}.json",
|
|
285
|
+
type=click.Path(dir_okay=False, file_okay=True),
|
|
287
286
|
)
|
|
288
287
|
def classify_mlst(input_path, output_path):
|
|
289
288
|
"""MLST classify a sample."""
|
|
@@ -321,37 +320,42 @@ def filter_seqs():
|
|
|
321
320
|
help="Path to FASTA or FASTQ file for classification.",
|
|
322
321
|
type=click.Path(exists=True, dir_okay=True, file_okay=True),
|
|
323
322
|
prompt=True,
|
|
323
|
+
default=Path("."),
|
|
324
324
|
)
|
|
325
325
|
@click.option(
|
|
326
326
|
"-o",
|
|
327
327
|
"--output-path",
|
|
328
328
|
help="Path to the output file.",
|
|
329
|
-
type=click.Path(dir_okay=
|
|
329
|
+
type=click.Path(dir_okay=False, file_okay=True),
|
|
330
330
|
prompt=True,
|
|
331
|
+
default=Path(".") / f"genus_filtered_{uuid4()}.fasta",
|
|
331
332
|
)
|
|
332
333
|
@click.option(
|
|
334
|
+
"--classification-output-path",
|
|
335
|
+
help="Optional path to the classification output file.",
|
|
336
|
+
type=click.Path(dir_okay=False, file_okay=True),
|
|
337
|
+
)
|
|
338
|
+
@click.option(
|
|
339
|
+
"-t",
|
|
333
340
|
"--threshold",
|
|
334
|
-
type=
|
|
341
|
+
type=click.FloatRange(0, 1),
|
|
335
342
|
help="Threshold for filtering (default: 0.7).",
|
|
336
343
|
default=0.7,
|
|
337
344
|
prompt=True,
|
|
338
345
|
)
|
|
339
|
-
def filter_genus(
|
|
346
|
+
def filter_genus(
|
|
347
|
+
model_genus, input_path, output_path, classification_output_path, threshold
|
|
348
|
+
):
|
|
340
349
|
"""Filter samples using a genus model."""
|
|
341
350
|
click.echo("Filtering...")
|
|
342
|
-
genus_model = get_genus_model(model_genus)
|
|
343
|
-
result = genus_model.predict(Path(input_path))
|
|
344
|
-
included_ids = result.get_filtered_subsequence_labels(model_genus, threshold)
|
|
345
|
-
if not included_ids:
|
|
346
|
-
click.echo("No sequences found for the given genus.")
|
|
347
|
-
return
|
|
348
351
|
|
|
349
|
-
filter_sequences(
|
|
352
|
+
filter_sequences.filter_genus(
|
|
353
|
+
model_genus,
|
|
350
354
|
Path(input_path),
|
|
351
355
|
Path(output_path),
|
|
352
|
-
|
|
356
|
+
threshold,
|
|
357
|
+
Path(classification_output_path) if classification_output_path else None,
|
|
353
358
|
)
|
|
354
|
-
click.echo(f"Filtered sequences saved at {output_path}.")
|
|
355
359
|
|
|
356
360
|
|
|
357
361
|
@filter_seqs.command(
|
|
@@ -378,24 +382,44 @@ def filter_genus(model_genus, input_path, output_path, threshold):
|
|
|
378
382
|
help="Path to FASTA or FASTQ file for classification.",
|
|
379
383
|
type=click.Path(exists=True, dir_okay=True, file_okay=True),
|
|
380
384
|
prompt=True,
|
|
385
|
+
default=Path("."),
|
|
381
386
|
)
|
|
382
387
|
@click.option(
|
|
383
388
|
"-o",
|
|
384
389
|
"--output-path",
|
|
385
390
|
help="Path to the output file.",
|
|
386
|
-
type=click.Path(dir_okay=
|
|
391
|
+
type=click.Path(dir_okay=False, file_okay=True),
|
|
387
392
|
prompt=True,
|
|
393
|
+
default=Path(".") / f"species_filtered_{uuid4()}.fasta",
|
|
394
|
+
)
|
|
395
|
+
@click.option(
|
|
396
|
+
"--classification-output-path",
|
|
397
|
+
help="Optional path to the classification output file.",
|
|
398
|
+
type=click.Path(dir_okay=False, file_okay=True),
|
|
388
399
|
)
|
|
389
400
|
@click.option(
|
|
401
|
+
"-t",
|
|
390
402
|
"--threshold",
|
|
391
403
|
type=float,
|
|
392
404
|
help="Threshold for filtering (default: 0.7). Use -1 to filter for the highest scoring species.",
|
|
393
405
|
default=0.7,
|
|
394
406
|
prompt=True,
|
|
395
407
|
)
|
|
396
|
-
def filter_species(
|
|
408
|
+
def filter_species(
|
|
409
|
+
model_genus,
|
|
410
|
+
model_species,
|
|
411
|
+
input_path,
|
|
412
|
+
output_path,
|
|
413
|
+
threshold,
|
|
414
|
+
classification_output_path,
|
|
415
|
+
):
|
|
397
416
|
"""Filter a sample using the species model."""
|
|
398
417
|
|
|
418
|
+
if threshold != -1 and (threshold < 0 or threshold > 1):
|
|
419
|
+
raise click.BadParameter(
|
|
420
|
+
"Threshold must be between 0 and 1, or -1 for filtering by the highest scoring species."
|
|
421
|
+
)
|
|
422
|
+
|
|
399
423
|
available_species = get_model_metadata(f"{model_genus}-species")["display_names"]
|
|
400
424
|
available_species = {
|
|
401
425
|
id: name.replace(f"{model_genus} ", "")
|
|
@@ -420,18 +444,14 @@ def filter_species(model_genus, model_species, input_path, output_path, threshol
|
|
|
420
444
|
][0]
|
|
421
445
|
|
|
422
446
|
click.echo("Filtering...")
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
if not included_ids:
|
|
427
|
-
click.echo("No sequences found for the given species.")
|
|
428
|
-
return
|
|
429
|
-
filter_sequences(
|
|
447
|
+
filter_sequences.filter_species(
|
|
448
|
+
model_genus,
|
|
449
|
+
model_species,
|
|
430
450
|
Path(input_path),
|
|
431
451
|
Path(output_path),
|
|
432
|
-
|
|
452
|
+
threshold,
|
|
453
|
+
Path(classification_output_path) if classification_output_path else None,
|
|
433
454
|
)
|
|
434
|
-
click.echo(f"Filtered sequences saved at {output_path}.")
|
|
435
455
|
|
|
436
456
|
|
|
437
457
|
if __name__ == "__main__":
|
|
@@ -20,13 +20,13 @@ class ProbabilisticFilterModel:
|
|
|
20
20
|
self,
|
|
21
21
|
k: int,
|
|
22
22
|
model_display_name: str,
|
|
23
|
-
author: str,
|
|
24
|
-
author_email: str,
|
|
23
|
+
author: str | None,
|
|
24
|
+
author_email: str | None,
|
|
25
25
|
model_type: str,
|
|
26
26
|
base_path: Path,
|
|
27
27
|
fpr: float = 0.01,
|
|
28
28
|
num_hashes: int = 7,
|
|
29
|
-
training_accessions: dict[str, list[str]] = None,
|
|
29
|
+
training_accessions: dict[str, list[str]] | None = None,
|
|
30
30
|
) -> None:
|
|
31
31
|
if k < 1:
|
|
32
32
|
raise ValueError("Invalid k value, must be greater than 0")
|
|
@@ -49,7 +49,7 @@ class ProbabilisticFilterModel:
|
|
|
49
49
|
self.index = None
|
|
50
50
|
self.training_accessions = training_accessions
|
|
51
51
|
|
|
52
|
-
def get_cobs_index_path(self) ->
|
|
52
|
+
def get_cobs_index_path(self) -> str:
|
|
53
53
|
"""Returns the path to the cobs index"""
|
|
54
54
|
return str(self.base_path / self.slug() / "index.cobs_classic")
|
|
55
55
|
|
|
@@ -76,8 +76,8 @@ class ProbabilisticFilterModel:
|
|
|
76
76
|
def fit(
|
|
77
77
|
self,
|
|
78
78
|
dir_path: Path,
|
|
79
|
-
display_names: dict = None,
|
|
80
|
-
training_accessions: dict[str, list[str]] = None,
|
|
79
|
+
display_names: dict | None = None,
|
|
80
|
+
training_accessions: dict[str, list[str]] | None = None,
|
|
81
81
|
) -> None:
|
|
82
82
|
"""Adds filters to the model"""
|
|
83
83
|
|
|
@@ -123,7 +123,7 @@ class ProbabilisticFilterModel:
|
|
|
123
123
|
self.index = cobs.Search(self.get_cobs_index_path(), True)
|
|
124
124
|
|
|
125
125
|
def calculate_hits(
|
|
126
|
-
self, sequence: Seq, filter_ids: list[str] = None, step: int = 1
|
|
126
|
+
self, sequence: Seq, filter_ids: list[str] | None = None, step: int = 1
|
|
127
127
|
) -> dict:
|
|
128
128
|
"""Calculates the hits for a sequence"""
|
|
129
129
|
|
|
@@ -21,16 +21,16 @@ class ProbabilisticFilterSVMModel(ProbabilisticFilterModel):
|
|
|
21
21
|
self,
|
|
22
22
|
k: int,
|
|
23
23
|
model_display_name: str,
|
|
24
|
-
author: str,
|
|
25
|
-
author_email: str,
|
|
24
|
+
author: str | None,
|
|
25
|
+
author_email: str | None,
|
|
26
26
|
model_type: str,
|
|
27
27
|
base_path: Path,
|
|
28
28
|
kernel: str,
|
|
29
29
|
c: float,
|
|
30
30
|
fpr: float = 0.01,
|
|
31
31
|
num_hashes: int = 7,
|
|
32
|
-
training_accessions: dict[str, list[str]] = None,
|
|
33
|
-
svm_accessions: dict[str, list[str]] = None,
|
|
32
|
+
training_accessions: dict[str, list[str]] | None = None,
|
|
33
|
+
svm_accessions: dict[str, list[str]] | None = None,
|
|
34
34
|
) -> None:
|
|
35
35
|
super().__init__(
|
|
36
36
|
k=k,
|
|
@@ -64,10 +64,10 @@ class ProbabilisticFilterSVMModel(ProbabilisticFilterModel):
|
|
|
64
64
|
self,
|
|
65
65
|
dir_path: Path,
|
|
66
66
|
svm_path: Path,
|
|
67
|
-
display_names: dict = None,
|
|
67
|
+
display_names: dict[str, str] | None = None,
|
|
68
68
|
svm_step: int = 1,
|
|
69
|
-
training_accessions: list[str] = None,
|
|
70
|
-
svm_accessions: list[str] = None,
|
|
69
|
+
training_accessions: dict[str, list[str]] | None = None,
|
|
70
|
+
svm_accessions: dict[str, list[str]] | None = None,
|
|
71
71
|
) -> None:
|
|
72
72
|
"""Fit the SVM to the sequences and labels"""
|
|
73
73
|
|
|
@@ -20,12 +20,12 @@ class ProbabilisticSingleFilterModel(ProbabilisticFilterModel):
|
|
|
20
20
|
self,
|
|
21
21
|
k: int,
|
|
22
22
|
model_display_name: str,
|
|
23
|
-
author: str,
|
|
24
|
-
author_email: str,
|
|
23
|
+
author: str | None,
|
|
24
|
+
author_email: str | None,
|
|
25
25
|
model_type: str,
|
|
26
26
|
base_path: Path,
|
|
27
27
|
fpr: float = 0.01,
|
|
28
|
-
training_accessions: list[str] = None,
|
|
28
|
+
training_accessions: list[str] | None = None,
|
|
29
29
|
) -> None:
|
|
30
30
|
super().__init__(
|
|
31
31
|
k=k,
|
|
@@ -41,7 +41,10 @@ class ProbabilisticSingleFilterModel(ProbabilisticFilterModel):
|
|
|
41
41
|
self.bf = None
|
|
42
42
|
|
|
43
43
|
def fit(
|
|
44
|
-
self,
|
|
44
|
+
self,
|
|
45
|
+
file_path: Path,
|
|
46
|
+
display_name: str,
|
|
47
|
+
training_accessions: list[str] | None = None,
|
|
45
48
|
) -> None:
|
|
46
49
|
"""Fit the cobs classic index to the sequences and labels"""
|
|
47
50
|
self.training_accessions = training_accessions
|
xspect/ncbi.py
CHANGED
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
from enum import Enum
|
|
4
4
|
from pathlib import Path
|
|
5
5
|
import time
|
|
6
|
+
from loguru import logger
|
|
6
7
|
import requests
|
|
7
8
|
|
|
8
9
|
# pylint: disable=line-too-long
|
|
@@ -34,7 +35,7 @@ class NCBIHandler:
|
|
|
34
35
|
|
|
35
36
|
def __init__(
|
|
36
37
|
self,
|
|
37
|
-
api_key: str = None,
|
|
38
|
+
api_key: str | None = None,
|
|
38
39
|
):
|
|
39
40
|
"""Initialise the NCBI handler."""
|
|
40
41
|
self.api_key = api_key
|
|
@@ -229,7 +230,7 @@ class NCBIHandler:
|
|
|
229
230
|
== "OK"
|
|
230
231
|
]
|
|
231
232
|
except (IndexError, KeyError, TypeError):
|
|
232
|
-
|
|
233
|
+
logger.debug(
|
|
233
234
|
f"Could not get {assembly_level.value} accessions for taxon with ID: {taxon_id}. Skipping."
|
|
234
235
|
)
|
|
235
236
|
return []
|
xspect/train.py
CHANGED
|
@@ -25,12 +25,12 @@ def train_from_directory(
|
|
|
25
25
|
display_name: str,
|
|
26
26
|
dir_path: Path,
|
|
27
27
|
meta: bool = False,
|
|
28
|
-
training_accessions: dict[str, list[str]] = None,
|
|
29
|
-
svm_accessions: list[str] = None,
|
|
28
|
+
training_accessions: dict[str, list[str]] | None = None,
|
|
29
|
+
svm_accessions: dict[str, list[str]] | None = None,
|
|
30
30
|
svm_step: int = 1,
|
|
31
|
-
translation_dict: dict[str, str] = None,
|
|
32
|
-
author: str = None,
|
|
33
|
-
author_email: str = None,
|
|
31
|
+
translation_dict: dict[str, str] | None = None,
|
|
32
|
+
author: str | None = None,
|
|
33
|
+
author_email: str | None = None,
|
|
34
34
|
):
|
|
35
35
|
"""
|
|
36
36
|
Train a model from a directory containing training data.
|
|
@@ -113,10 +113,11 @@ def train_from_directory(
|
|
|
113
113
|
species_dir = tmp_dir / "species"
|
|
114
114
|
species_dir.mkdir(parents=True, exist_ok=True)
|
|
115
115
|
|
|
116
|
-
|
|
116
|
+
logger.info("Concatenating genomes for species training...")
|
|
117
117
|
concatenate_species_fasta_files(cobs_folders, species_dir)
|
|
118
118
|
|
|
119
119
|
if svm_path.exists():
|
|
120
|
+
logger.info("Training species SVM model...")
|
|
120
121
|
species_model = ProbabilisticFilterSVMModel(
|
|
121
122
|
k=21,
|
|
122
123
|
model_display_name=display_name,
|
|
@@ -136,6 +137,7 @@ def train_from_directory(
|
|
|
136
137
|
svm_accessions=svm_accessions,
|
|
137
138
|
)
|
|
138
139
|
else:
|
|
140
|
+
logger.info("Training species model...")
|
|
139
141
|
species_model = ProbabilisticFilterModel(
|
|
140
142
|
k=21,
|
|
141
143
|
model_display_name=display_name,
|
|
@@ -153,9 +155,11 @@ def train_from_directory(
|
|
|
153
155
|
species_model.save()
|
|
154
156
|
|
|
155
157
|
if meta:
|
|
158
|
+
logger.info("Concatenating genomes for metagenome training...")
|
|
156
159
|
meta_fasta = tmp_dir / f"{display_name}.fasta"
|
|
157
160
|
concatenate_metagenome(species_dir, meta_fasta)
|
|
158
161
|
|
|
162
|
+
logger.info("Training metagenome model...")
|
|
159
163
|
genus_model = ProbabilisticSingleFilterModel(
|
|
160
164
|
k=21,
|
|
161
165
|
model_display_name=display_name,
|
|
@@ -179,8 +183,9 @@ def train_from_directory(
|
|
|
179
183
|
def train_from_ncbi(
|
|
180
184
|
genus: str,
|
|
181
185
|
svm_step: int = 1,
|
|
182
|
-
author: str = None,
|
|
183
|
-
author_email: str = None,
|
|
186
|
+
author: str | None = None,
|
|
187
|
+
author_email: str | None = None,
|
|
188
|
+
ncbi_api_key: str | None = None,
|
|
184
189
|
):
|
|
185
190
|
"""Train a model using NCBI assembly data for a given genus.
|
|
186
191
|
|
|
@@ -193,6 +198,7 @@ def train_from_ncbi(
|
|
|
193
198
|
svm_step (int, optional): Step size for SVM training. Defaults to 1.
|
|
194
199
|
author (str, optional): Author of the model. Defaults to None.
|
|
195
200
|
author_email (str, optional): Author's email. Defaults to None.
|
|
201
|
+
ncbi_api_key (str, optional): NCBI API key for accessing NCBI resources. Defaults to None.
|
|
196
202
|
|
|
197
203
|
Raises:
|
|
198
204
|
TypeError: If `genus` is not a string.
|
|
@@ -205,7 +211,8 @@ def train_from_ncbi(
|
|
|
205
211
|
if not isinstance(genus, str):
|
|
206
212
|
raise TypeError("genus must be a string")
|
|
207
213
|
|
|
208
|
-
|
|
214
|
+
logger.info("Getting NCBI metadata...")
|
|
215
|
+
ncbi_handler = NCBIHandler(api_key=ncbi_api_key)
|
|
209
216
|
genus_tax_id = ncbi_handler.get_genus_taxon_id(genus)
|
|
210
217
|
species_ids = ncbi_handler.get_species(genus_tax_id)
|
|
211
218
|
species_names = ncbi_handler.get_taxon_names(species_ids)
|
|
@@ -243,7 +250,7 @@ def train_from_ncbi(
|
|
|
243
250
|
cobs_dir.mkdir(parents=True, exist_ok=True)
|
|
244
251
|
svm_dir.mkdir(parents=True, exist_ok=True)
|
|
245
252
|
|
|
246
|
-
|
|
253
|
+
logger.info("Downloading genomes from NCBI...")
|
|
247
254
|
all_accessions = sum(accessions.values(), [])
|
|
248
255
|
batch_size = 100
|
|
249
256
|
accession_paths = {}
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: XspecT
|
|
3
|
-
Version: 0.5.0
|
|
3
|
+
Version: 0.5.1
|
|
4
4
|
Summary: Tool to monitor and characterize pathogens using Bloom filters.
|
|
5
5
|
License: MIT License
|
|
6
6
|
|
|
@@ -55,7 +55,7 @@ Requires-Dist: pytest-retry; extra == "test"
|
|
|
55
55
|
Requires-Dist: httpx; extra == "test"
|
|
56
56
|
Dynamic: license-file
|
|
57
57
|
|
|
58
|
-
# XspecT
|
|
58
|
+
# XspecT
|
|
59
59
|
<!-- start intro -->
|
|
60
60
|

|
|
61
61
|
[](https://github.com/pylint-dev/pylint)
|
|
@@ -63,7 +63,7 @@ Dynamic: license-file
|
|
|
63
63
|
|
|
64
64
|
XspecT is a Python-based tool to taxonomically classify sequence-reads (or assembled genomes) on the species and/or MLST level using [kmer indices] and a [Support Vector Machine].
|
|
65
65
|
|
|
66
|
-
XspecT utilizes the uniqueness of kmers and compares extracted kmers from the input-data to a
|
|
66
|
+
XspecT utilizes the uniqueness of kmers and compares extracted kmers from the input-data to a kmer index. Probabilistic data structures ensure a fast lookup in this process. For a final prediction, the results are classified using a Support Vector Machine.
|
|
67
67
|
|
|
68
68
|
The tool is available as a web-based application and as a command line interface.
|
|
69
69
|
|
|
@@ -91,16 +91,22 @@ xspect models train ncbi
|
|
|
91
91
|
```
|
|
92
92
|
|
|
93
93
|
### How to run the web app
|
|
94
|
-
To run the web app,
|
|
94
|
+
To run the web app, simply execute:
|
|
95
95
|
```
|
|
96
96
|
xspect web
|
|
97
97
|
```
|
|
98
98
|
|
|
99
|
+
This will start a local web server. You can access the web app by navigating to `http://localhost:8000` in your web browser.
|
|
100
|
+
|
|
99
101
|
### How to use the XspecT command line interface
|
|
100
|
-
|
|
102
|
+
To use the XspecT command line interface, execute `xspect` with the desired subcommand and parameters.
|
|
103
|
+
|
|
104
|
+
**Example**:
|
|
101
105
|
```
|
|
102
106
|
xspect classify species
|
|
103
107
|
```
|
|
108
|
+
|
|
109
|
+
If you do not provide the required parameters, the command line interface will prompt you for them.
|
|
104
110
|
For further instructions on how to use the command line interface, please refer to the [documentation] or execute:
|
|
105
111
|
```
|
|
106
112
|
xspect --help
|
|
@@ -1,22 +1,22 @@
|
|
|
1
1
|
xspect/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
-
xspect/classify.py,sha256=
|
|
3
|
-
xspect/definitions.py,sha256=
|
|
2
|
+
xspect/classify.py,sha256=ZLTtgsaPK_Eo-B52zvYIlBOr46lSJOO1UlsXvTn7BaY,2426
|
|
3
|
+
xspect/definitions.py,sha256=s3oGQiF3ZQAyqUCcH7qLytvOZB6uRUZhaEW-bH0lfUM,1407
|
|
4
4
|
xspect/download_models.py,sha256=y1wFJZa1xOJfvUP78zKkRs46O-WqKBL90vmo5AYUio0,853
|
|
5
5
|
xspect/file_io.py,sha256=-3xm7IfOlmWJHxfrKPX5Qqit10zqsBUVt5Z7z1No2AI,5669
|
|
6
|
-
xspect/filter_sequences.py,sha256=
|
|
7
|
-
xspect/main.py,sha256=
|
|
6
|
+
xspect/filter_sequences.py,sha256=KNTjTQuv2eeCcOHdjYLNUnqNYP5WIBWZppZP2mmPZJk,5698
|
|
7
|
+
xspect/main.py,sha256=8fh43RFw88DtUzR-Egmj9vV-879LEfJvHq-VUmLnqt0,12138
|
|
8
8
|
xspect/model_management.py,sha256=UbmAr3YOZ4oy_9iVvApCLstYHGkcmneHEC_yftRIqCI,3010
|
|
9
|
-
xspect/ncbi.py,sha256=
|
|
10
|
-
xspect/train.py,sha256=
|
|
9
|
+
xspect/ncbi.py,sha256=Zn5YIIzbclM3rHAnpOcUZAqopcbix7_K0tl3mUyuIBI,10140
|
|
10
|
+
xspect/train.py,sha256=nUrj4kbAF4rR_MZjsd1nVHTjdRwuNEUC2DSId11Mfc8,11583
|
|
11
11
|
xspect/web.py,sha256=M4fQUbmCnkpmdJeL-j-FD8r115EctWtWQZttuZWEsL8,5115
|
|
12
12
|
xspect/mlst_feature/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
13
13
|
xspect/mlst_feature/mlst_helper.py,sha256=Ff0kUdu-80969ZyYL6qUJNwEqf9eq72CC8zUyuoDElk,8114
|
|
14
14
|
xspect/mlst_feature/pub_mlst_handler.py,sha256=Ez5YHKfhsLsKdHf1aNMfz7JJVVV_DpA27mah9fgNeJc,5919
|
|
15
15
|
xspect/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
16
16
|
xspect/models/probabilistic_filter_mlst_model.py,sha256=v9yriJX_i8-SRzU8W8RvDPrBvlR_ONSMVypZWGAJpv8,16164
|
|
17
|
-
xspect/models/probabilistic_filter_model.py,sha256=
|
|
18
|
-
xspect/models/probabilistic_filter_svm_model.py,sha256=
|
|
19
|
-
xspect/models/probabilistic_single_filter_model.py,sha256=
|
|
17
|
+
xspect/models/probabilistic_filter_model.py,sha256=CX-D3BFQ_l1SqV09knsJ7ueik-VbxjvZMA6LBnIDsHc,10109
|
|
18
|
+
xspect/models/probabilistic_filter_svm_model.py,sha256=ZWErmMgoEC-mlwnzhf4IiBuntkw8p85fmu4v3RYmVH4,6326
|
|
19
|
+
xspect/models/probabilistic_single_filter_model.py,sha256=QGjcTYrx7He2I-Jr_oJSrDCl6zyxdjnv42LmdaZL-XI,4143
|
|
20
20
|
xspect/models/result.py,sha256=ELWiDlQPlxNG7ceLpth60Z_Hb1ZdopDJ3vgHBPgSRm8,3989
|
|
21
21
|
xspect/xspect-web/.gitignore,sha256=_nGOe6uxTzy60tl_CIibnOUhXtP-DkOyuM-_s7m4ROg,253
|
|
22
22
|
xspect/xspect-web/README.md,sha256=Fa5cCk66ohbqD_AAVgnXUZLhuzshnLxhlUFhxyscScc,1942
|
|
@@ -77,9 +77,9 @@ xspect/xspect-web/src/components/ui/switch.tsx,sha256=uIqRXtd41ba0eusIEUWVyYZv82
|
|
|
77
77
|
xspect/xspect-web/src/components/ui/table.tsx,sha256=M2-TIHKwPFWuXrwysSufdQRSMJT-K9jPzGOokfU6PXo,2463
|
|
78
78
|
xspect/xspect-web/src/components/ui/tabs.tsx,sha256=BImHKcdDCtrS3CCV1AGgn8qg0b65RB5P-QdH49IAhx0,1955
|
|
79
79
|
xspect/xspect-web/src/lib/utils.ts,sha256=66ibdQiEHKftZBq1OMLmOKqWma1BkO-O60rc1IQYwLE,165
|
|
80
|
-
xspect-0.5.
|
|
81
|
-
xspect-0.5.
|
|
82
|
-
xspect-0.5.
|
|
83
|
-
xspect-0.5.
|
|
84
|
-
xspect-0.5.
|
|
85
|
-
xspect-0.5.
|
|
80
|
+
xspect-0.5.1.dist-info/licenses/LICENSE,sha256=bhBGDKIRUVwYIHGOGO5hshzuVHyqFJajvSOA3XXOLKI,1094
|
|
81
|
+
xspect-0.5.1.dist-info/METADATA,sha256=o5zoUCtrA5rkvaLDMBnda5W5mTNAdQwV__LrWg4UJ3A,4569
|
|
82
|
+
xspect-0.5.1.dist-info/WHEEL,sha256=zaaOINJESkSfm_4HQVc5ssNzHCPXhJm0kEUakpsEHaU,91
|
|
83
|
+
xspect-0.5.1.dist-info/entry_points.txt,sha256=L7qliX3pIuwupQxpuOSsrBJCSHYPOPNEzH8KZKQGGUw,43
|
|
84
|
+
xspect-0.5.1.dist-info/top_level.txt,sha256=hdoa4cnBv6OVzpyhMmyxpJxEydH5n2lDciy8urc1paE,7
|
|
85
|
+
xspect-0.5.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|