XspecT 0.2.6__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of XspecT might be problematic. Click here for more details.
- xspect/definitions.py +0 -7
- xspect/download_models.py +25 -24
- xspect/fastapi.py +23 -26
- xspect/file_io.py +86 -2
- xspect/main.py +333 -98
- xspect/mlst_feature/mlst_helper.py +5 -7
- xspect/model_management.py +6 -0
- xspect/models/probabilistic_filter_model.py +16 -5
- xspect/models/probabilistic_filter_svm_model.py +33 -18
- xspect/models/probabilistic_single_filter_model.py +8 -1
- xspect/models/result.py +15 -61
- xspect/ncbi.py +265 -0
- xspect/train.py +258 -247
- {XspecT-0.2.6.dist-info → xspect-0.4.0.dist-info}/METADATA +14 -21
- xspect-0.4.0.dist-info/RECORD +24 -0
- {XspecT-0.2.6.dist-info → xspect-0.4.0.dist-info}/WHEEL +1 -1
- XspecT-0.2.6.dist-info/RECORD +0 -34
- xspect/pipeline.py +0 -201
- xspect/run.py +0 -38
- xspect/train_filter/__init__.py +0 -0
- xspect/train_filter/create_svm.py +0 -45
- xspect/train_filter/extract_and_concatenate.py +0 -124
- xspect/train_filter/html_scrap.py +0 -114
- xspect/train_filter/ncbi_api/__init__.py +0 -0
- xspect/train_filter/ncbi_api/download_assemblies.py +0 -31
- xspect/train_filter/ncbi_api/ncbi_assembly_metadata.py +0 -110
- xspect/train_filter/ncbi_api/ncbi_children_tree.py +0 -53
- xspect/train_filter/ncbi_api/ncbi_taxon_metadata.py +0 -55
- {XspecT-0.2.6.dist-info → xspect-0.4.0.dist-info}/entry_points.txt +0 -0
- {XspecT-0.2.6.dist-info → xspect-0.4.0.dist-info/licenses}/LICENSE +0 -0
- {XspecT-0.2.6.dist-info → xspect-0.4.0.dist-info}/top_level.txt +0 -0
xspect/main.py
CHANGED
|
@@ -1,28 +1,26 @@
|
|
|
1
1
|
"""Project CLI"""
|
|
2
2
|
|
|
3
3
|
from pathlib import Path
|
|
4
|
-
import
|
|
5
|
-
import uuid
|
|
4
|
+
from uuid import uuid4
|
|
6
5
|
import click
|
|
7
6
|
import uvicorn
|
|
8
7
|
from xspect import fastapi
|
|
9
8
|
from xspect.download_models import download_test_models
|
|
10
|
-
from xspect.
|
|
11
|
-
from xspect.
|
|
12
|
-
StepType,
|
|
13
|
-
)
|
|
9
|
+
from xspect.file_io import filter_sequences
|
|
10
|
+
from xspect.train import train_from_directory, train_from_ncbi
|
|
14
11
|
from xspect.definitions import (
|
|
15
|
-
get_xspect_runs_path,
|
|
16
|
-
fasta_endings,
|
|
17
|
-
fastq_endings,
|
|
18
12
|
get_xspect_model_path,
|
|
19
13
|
)
|
|
20
|
-
from xspect.pipeline import ModelExecution, Pipeline, PipelineStep
|
|
21
14
|
from xspect.mlst_feature.mlst_helper import pick_scheme, pick_scheme_from_models_dir
|
|
22
15
|
from xspect.mlst_feature.pub_mlst_handler import PubMLSTHandler
|
|
23
16
|
from xspect.models.probabilistic_filter_mlst_model import (
|
|
24
17
|
ProbabilisticFilterMlstSchemeModel,
|
|
25
18
|
)
|
|
19
|
+
from xspect.model_management import (
|
|
20
|
+
get_genus_model,
|
|
21
|
+
get_models,
|
|
22
|
+
get_species_model,
|
|
23
|
+
)
|
|
26
24
|
|
|
27
25
|
|
|
28
26
|
@click.group()
|
|
@@ -32,103 +30,133 @@ def cli():
|
|
|
32
30
|
|
|
33
31
|
|
|
34
32
|
@cli.command()
|
|
35
|
-
def
|
|
33
|
+
def web():
|
|
34
|
+
"""Open the XspecT web application."""
|
|
35
|
+
uvicorn.run(fastapi.app, host="0.0.0.0", port=8000)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
# # # # # # # # # # # # # # #
|
|
39
|
+
# Model management commands #
|
|
40
|
+
# # # # # # # # # # # # # # #
|
|
41
|
+
@cli.group()
|
|
42
|
+
def models():
|
|
43
|
+
"""Model management commands."""
|
|
44
|
+
pass
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
@models.command(
|
|
48
|
+
help="Download models from the internet.",
|
|
49
|
+
)
|
|
50
|
+
def download():
|
|
36
51
|
"""Download models."""
|
|
37
52
|
click.echo("Downloading models, this may take a while...")
|
|
38
|
-
download_test_models("
|
|
53
|
+
download_test_models("http://assets.adrianromberg.com/xspect-models.zip")
|
|
39
54
|
|
|
40
55
|
|
|
41
|
-
@
|
|
42
|
-
|
|
43
|
-
|
|
56
|
+
@models.command(
|
|
57
|
+
name="list",
|
|
58
|
+
help="List all models in the model directory.",
|
|
59
|
+
)
|
|
60
|
+
def list_models():
|
|
61
|
+
"""List models."""
|
|
62
|
+
available_models = get_models()
|
|
63
|
+
if not available_models:
|
|
64
|
+
click.echo("No models found.")
|
|
65
|
+
return
|
|
66
|
+
# todo: make this machine readable
|
|
67
|
+
click.echo("Models found:")
|
|
68
|
+
click.echo("--------------")
|
|
69
|
+
for model_type, names in available_models.items():
|
|
70
|
+
if not names:
|
|
71
|
+
continue
|
|
72
|
+
click.echo(f" {model_type}:")
|
|
73
|
+
for name in names:
|
|
74
|
+
click.echo(f" - {name}")
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
@models.group()
|
|
78
|
+
def train():
|
|
79
|
+
"""Train models."""
|
|
80
|
+
pass
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
@train.command(
|
|
84
|
+
name="ncbi",
|
|
85
|
+
help="Train a species and a genus model based on NCBI data.",
|
|
86
|
+
)
|
|
87
|
+
@click.option("-g", "--genus", "model_genus", prompt=True)
|
|
88
|
+
@click.option("--svm_steps", type=int, default=1)
|
|
44
89
|
@click.option(
|
|
45
|
-
"
|
|
46
|
-
"
|
|
47
|
-
|
|
48
|
-
default=False,
|
|
90
|
+
"--author",
|
|
91
|
+
help="Author of the model.",
|
|
92
|
+
default=None,
|
|
49
93
|
)
|
|
50
94
|
@click.option(
|
|
51
|
-
"-
|
|
52
|
-
"
|
|
53
|
-
|
|
54
|
-
default=1,
|
|
95
|
+
"--author-email",
|
|
96
|
+
help="Email of the author.",
|
|
97
|
+
default=None,
|
|
55
98
|
)
|
|
56
|
-
def
|
|
57
|
-
"""
|
|
58
|
-
click.echo("
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
for f in Path(path).iterdir()
|
|
66
|
-
if f.is_file() and f.suffix[1:] in fasta_endings + fastq_endings
|
|
67
|
-
]
|
|
68
|
-
else:
|
|
69
|
-
file_paths = [Path(path)]
|
|
70
|
-
|
|
71
|
-
# define pipeline
|
|
72
|
-
pipeline = Pipeline(genus + " classification", "Test Author", "test@example.com")
|
|
73
|
-
species_execution = ModelExecution(
|
|
74
|
-
genus.lower() + "-species", sparse_sampling_step=step
|
|
75
|
-
)
|
|
76
|
-
if meta:
|
|
77
|
-
species_filtering_step = PipelineStep(
|
|
78
|
-
StepType.FILTERING, genus, 0.7, species_execution
|
|
79
|
-
)
|
|
80
|
-
genus_execution = ModelExecution(
|
|
81
|
-
genus.lower() + "-genus", sparse_sampling_step=step
|
|
82
|
-
)
|
|
83
|
-
genus_execution.add_pipeline_step(species_filtering_step)
|
|
84
|
-
pipeline.add_pipeline_step(genus_execution)
|
|
85
|
-
else:
|
|
86
|
-
pipeline.add_pipeline_step(species_execution)
|
|
87
|
-
|
|
88
|
-
for idx, file_path in enumerate(file_paths):
|
|
89
|
-
run = pipeline.run(file_path)
|
|
90
|
-
time_str = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
|
|
91
|
-
save_path = get_xspect_runs_path() / f"run_{time_str}_{uuid.uuid4()}.json"
|
|
92
|
-
run.save(save_path)
|
|
93
|
-
print(
|
|
94
|
-
f"[{idx+1}/{len(file_paths)}] Run finished. Results saved to '{save_path}'."
|
|
95
|
-
)
|
|
99
|
+
def train_ncbi(model_genus, svm_steps, author, author_email):
|
|
100
|
+
"""Train a species and a genus model based on NCBI data."""
|
|
101
|
+
click.echo(f"Training {model_genus} species and genus metagenome model.")
|
|
102
|
+
try:
|
|
103
|
+
train_from_ncbi(model_genus, svm_steps, author, author_email)
|
|
104
|
+
except ValueError as e:
|
|
105
|
+
click.echo(f"Error: {e}")
|
|
106
|
+
return
|
|
107
|
+
click.echo(f"Training of {model_genus} model finished.")
|
|
96
108
|
|
|
97
109
|
|
|
98
|
-
@
|
|
99
|
-
|
|
110
|
+
@train.command(
|
|
111
|
+
name="directory",
|
|
112
|
+
help="Train a species (and possibly a genus) model based on local data.",
|
|
113
|
+
)
|
|
114
|
+
@click.option("-g", "--genus", "model_genus", prompt=True)
|
|
100
115
|
@click.option(
|
|
101
|
-
"-
|
|
102
|
-
"--
|
|
103
|
-
|
|
104
|
-
|
|
116
|
+
"-i",
|
|
117
|
+
"--input-path",
|
|
118
|
+
type=click.Path(exists=True, dir_okay=True, file_okay=True),
|
|
119
|
+
prompt=True,
|
|
105
120
|
)
|
|
106
121
|
@click.option(
|
|
107
|
-
"
|
|
108
|
-
|
|
109
|
-
help="
|
|
110
|
-
|
|
122
|
+
"--meta",
|
|
123
|
+
is_flag=True,
|
|
124
|
+
help="Train a metagenome model for the genus.",
|
|
125
|
+
default=True,
|
|
111
126
|
)
|
|
112
127
|
@click.option(
|
|
113
|
-
"-
|
|
114
|
-
|
|
128
|
+
"--svm-steps",
|
|
129
|
+
type=int,
|
|
115
130
|
help="SVM Sparse sampling step size (e. g. only every 500th kmer for step=500).",
|
|
116
131
|
default=1,
|
|
117
132
|
)
|
|
118
|
-
|
|
119
|
-
""
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
133
|
+
@click.option(
|
|
134
|
+
"--author",
|
|
135
|
+
help="Author of the model.",
|
|
136
|
+
default=None,
|
|
137
|
+
)
|
|
138
|
+
@click.option(
|
|
139
|
+
"--author-email",
|
|
140
|
+
help="Email of the author.",
|
|
141
|
+
default=None,
|
|
142
|
+
)
|
|
143
|
+
def train_directory(model_genus, input_path, svm_steps, meta, author, author_email):
|
|
144
|
+
"""Train a model based on data from a directory for a given genus."""
|
|
145
|
+
click.echo(f"Training {model_genus} model with {svm_steps} SVM steps.")
|
|
146
|
+
train_from_directory(
|
|
147
|
+
model_genus,
|
|
148
|
+
Path(input_path),
|
|
149
|
+
svm_step=svm_steps,
|
|
150
|
+
meta=meta,
|
|
151
|
+
author=author,
|
|
152
|
+
author_email=author_email,
|
|
153
|
+
)
|
|
129
154
|
|
|
130
155
|
|
|
131
|
-
@
|
|
156
|
+
@train.command(
|
|
157
|
+
name="mlst",
|
|
158
|
+
help="Train a MLST model based on PubMLST data.",
|
|
159
|
+
)
|
|
132
160
|
@click.option(
|
|
133
161
|
"-c",
|
|
134
162
|
"--choose_schemes",
|
|
@@ -154,27 +182,234 @@ def train_mlst(choose_schemes):
|
|
|
154
182
|
click.echo(f"Saved at {model.cobs_path}")
|
|
155
183
|
|
|
156
184
|
|
|
157
|
-
|
|
185
|
+
# # # # # # # # # # # # # # #
|
|
186
|
+
# Classification commands #
|
|
187
|
+
# # # # # # # # # # # # # # #
|
|
188
|
+
@cli.group(
|
|
189
|
+
name="classify",
|
|
190
|
+
help="Classify sequences using XspecT models.",
|
|
191
|
+
)
|
|
192
|
+
def classify_seqs():
|
|
193
|
+
"""Classification commands."""
|
|
194
|
+
pass
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
@classify_seqs.command()
|
|
198
|
+
@click.option(
|
|
199
|
+
"-g",
|
|
200
|
+
"--genus",
|
|
201
|
+
"model_genus",
|
|
202
|
+
help="Genus of the model to classify.",
|
|
203
|
+
type=click.Choice(get_models().get("Genus"), None),
|
|
204
|
+
prompt=True,
|
|
205
|
+
)
|
|
206
|
+
@click.option(
|
|
207
|
+
"-i",
|
|
208
|
+
"--input-path",
|
|
209
|
+
help="Path to FASTA or FASTQ file for classification.",
|
|
210
|
+
type=click.Path(exists=True, dir_okay=True, file_okay=True),
|
|
211
|
+
prompt=True,
|
|
212
|
+
)
|
|
213
|
+
@click.option(
|
|
214
|
+
"-o",
|
|
215
|
+
"--output-path",
|
|
216
|
+
help="Path to the output file.",
|
|
217
|
+
type=click.Path(dir_okay=True, file_okay=True),
|
|
218
|
+
default=Path(".") / f"result_{uuid4()}.json",
|
|
219
|
+
)
|
|
220
|
+
def genus(model_genus, input_path, output_path):
|
|
221
|
+
"""Classify samples using a genus model."""
|
|
222
|
+
click.echo("Classifying...")
|
|
223
|
+
genus_model = get_genus_model(model_genus)
|
|
224
|
+
result = genus_model.predict(Path(input_path))
|
|
225
|
+
result.save(output_path)
|
|
226
|
+
click.echo(f"Result saved as {output_path}.")
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
@classify_seqs.command()
|
|
158
230
|
@click.option(
|
|
159
|
-
"-
|
|
160
|
-
"--
|
|
231
|
+
"-g",
|
|
232
|
+
"--genus",
|
|
233
|
+
"model_genus",
|
|
234
|
+
help="Genus of the model to classify.",
|
|
235
|
+
type=click.Choice(get_models().get("Species"), None),
|
|
236
|
+
prompt=True,
|
|
237
|
+
)
|
|
238
|
+
@click.option(
|
|
239
|
+
"-i",
|
|
240
|
+
"--input-path",
|
|
241
|
+
help="Path to FASTA or FASTQ file for classification.",
|
|
242
|
+
type=click.Path(exists=True, dir_okay=True, file_okay=True),
|
|
243
|
+
prompt=True,
|
|
244
|
+
)
|
|
245
|
+
@click.option(
|
|
246
|
+
"-o",
|
|
247
|
+
"--output-path",
|
|
248
|
+
help="Path to the output file.",
|
|
249
|
+
type=click.Path(dir_okay=True, file_okay=True),
|
|
250
|
+
default=Path(".") / f"result_{uuid4()}.json",
|
|
251
|
+
)
|
|
252
|
+
@click.option(
|
|
253
|
+
"--sparse-sampling-step",
|
|
254
|
+
type=int,
|
|
255
|
+
help="Sparse sampling step size (e. g. only every 500th kmer for '--sparse-sampling-step 500').",
|
|
256
|
+
default=1,
|
|
257
|
+
)
|
|
258
|
+
def species(model_genus, input_path, output_path, sparse_sampling_step):
|
|
259
|
+
"""Classify samples using a species model."""
|
|
260
|
+
click.echo("Classifying...")
|
|
261
|
+
species_model = get_species_model(model_genus)
|
|
262
|
+
result = species_model.predict(Path(input_path), step=sparse_sampling_step)
|
|
263
|
+
result.save(output_path)
|
|
264
|
+
click.echo(f"Result saved as {output_path}.")
|
|
265
|
+
|
|
266
|
+
|
|
267
|
+
@classify_seqs.command(
|
|
268
|
+
name="mlst",
|
|
269
|
+
help="Classify samples using a MLST model.",
|
|
270
|
+
)
|
|
271
|
+
@click.option(
|
|
272
|
+
"-i",
|
|
273
|
+
"--input-path",
|
|
161
274
|
help="Path to FASTA-file for mlst identification.",
|
|
162
275
|
type=click.Path(exists=True, dir_okay=True, file_okay=True),
|
|
276
|
+
prompt=True,
|
|
277
|
+
)
|
|
278
|
+
@click.option(
|
|
279
|
+
"-o",
|
|
280
|
+
"--output-path",
|
|
281
|
+
help="Path to the output file.",
|
|
282
|
+
type=click.Path(dir_okay=True, file_okay=True),
|
|
283
|
+
default=Path(".") / f"result_{uuid4()}.json",
|
|
163
284
|
)
|
|
164
|
-
def classify_mlst(
|
|
285
|
+
def classify_mlst(input_path, output_path):
|
|
165
286
|
"""MLST classify a sample."""
|
|
166
287
|
click.echo("Classifying...")
|
|
167
|
-
|
|
288
|
+
input_path = Path(input_path)
|
|
168
289
|
scheme_path = pick_scheme_from_models_dir()
|
|
169
290
|
model = ProbabilisticFilterMlstSchemeModel.load(scheme_path)
|
|
170
|
-
model.predict(scheme_path,
|
|
171
|
-
|
|
291
|
+
result = model.predict(scheme_path, input_path)
|
|
292
|
+
result.save(output_path)
|
|
293
|
+
click.echo(f"Result saved as {output_path}.")
|
|
172
294
|
|
|
173
295
|
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
296
|
+
# # # # # # # # # # # # # # #
|
|
297
|
+
# Filtering commands #
|
|
298
|
+
# # # # # # # # # # # # # # #
|
|
299
|
+
@cli.group(
|
|
300
|
+
name="filter",
|
|
301
|
+
help="Filter sequences using XspecT models.",
|
|
302
|
+
)
|
|
303
|
+
def filter_seqs():
|
|
304
|
+
"""Filter commands."""
|
|
305
|
+
pass
|
|
306
|
+
|
|
307
|
+
|
|
308
|
+
@filter_seqs.command(
|
|
309
|
+
name="genus",
|
|
310
|
+
help="Filter sequences using a genus model.",
|
|
311
|
+
)
|
|
312
|
+
@click.option(
|
|
313
|
+
"-g",
|
|
314
|
+
"--genus",
|
|
315
|
+
"model_genus",
|
|
316
|
+
help="Genus of the model to use for filtering.",
|
|
317
|
+
type=click.Choice(get_models().get("Species"), None),
|
|
318
|
+
prompt=True,
|
|
319
|
+
)
|
|
320
|
+
@click.option(
|
|
321
|
+
"-i",
|
|
322
|
+
"--input-path",
|
|
323
|
+
help="Path to FASTA or FASTQ file for classification.",
|
|
324
|
+
type=click.Path(exists=True, dir_okay=True, file_okay=True),
|
|
325
|
+
prompt=True,
|
|
326
|
+
)
|
|
327
|
+
@click.option(
|
|
328
|
+
"-o",
|
|
329
|
+
"--output-path",
|
|
330
|
+
help="Path to the output file.",
|
|
331
|
+
type=click.Path(dir_okay=True, file_okay=True),
|
|
332
|
+
prompt=True,
|
|
333
|
+
)
|
|
334
|
+
@click.option(
|
|
335
|
+
"--threshold",
|
|
336
|
+
type=float,
|
|
337
|
+
help="Threshold for filtering (default: 0.7).",
|
|
338
|
+
default=0.7,
|
|
339
|
+
)
|
|
340
|
+
def filter_genus(model_genus, input_path, output_path, threshold):
|
|
341
|
+
"""Filter samples using a genus model."""
|
|
342
|
+
click.echo("Filtering...")
|
|
343
|
+
genus_model = get_genus_model(model_genus)
|
|
344
|
+
result = genus_model.predict(Path(input_path))
|
|
345
|
+
included_ids = result.get_filtered_subsequence_labels(model_genus, threshold)
|
|
346
|
+
if not included_ids:
|
|
347
|
+
click.echo("No sequences found for the given genus.")
|
|
348
|
+
return
|
|
349
|
+
|
|
350
|
+
filter_sequences(
|
|
351
|
+
Path(input_path),
|
|
352
|
+
Path(output_path),
|
|
353
|
+
included_ids=included_ids,
|
|
354
|
+
)
|
|
355
|
+
click.echo(f"Filtered sequences saved at {output_path}.")
|
|
356
|
+
|
|
357
|
+
|
|
358
|
+
@filter_seqs.command(
|
|
359
|
+
name="species",
|
|
360
|
+
help="Filter sequences using a species model.",
|
|
361
|
+
)
|
|
362
|
+
@click.option(
|
|
363
|
+
"-g",
|
|
364
|
+
"--genus",
|
|
365
|
+
"model_genus",
|
|
366
|
+
help="Genus of the model to use for filtering.",
|
|
367
|
+
type=click.Choice(get_models().get("Species"), None),
|
|
368
|
+
prompt=True,
|
|
369
|
+
)
|
|
370
|
+
@click.option(
|
|
371
|
+
# todo: this should be a choice of the species in the model w/ display names
|
|
372
|
+
"-s",
|
|
373
|
+
"--species",
|
|
374
|
+
"model_species",
|
|
375
|
+
help="Species of the model to filter for.",
|
|
376
|
+
prompt=True,
|
|
377
|
+
)
|
|
378
|
+
@click.option(
|
|
379
|
+
"-i",
|
|
380
|
+
"--input-path",
|
|
381
|
+
help="Path to FASTA or FASTQ file for classification.",
|
|
382
|
+
type=click.Path(exists=True, dir_okay=True, file_okay=True),
|
|
383
|
+
prompt=True,
|
|
384
|
+
)
|
|
385
|
+
@click.option(
|
|
386
|
+
"-o",
|
|
387
|
+
"--output-path",
|
|
388
|
+
help="Path to the output file.",
|
|
389
|
+
type=click.Path(dir_okay=True, file_okay=True),
|
|
390
|
+
prompt=True,
|
|
391
|
+
)
|
|
392
|
+
@click.option(
|
|
393
|
+
"--threshold",
|
|
394
|
+
type=float,
|
|
395
|
+
help="Threshold for filtering (default: 0.7).",
|
|
396
|
+
default=0.7,
|
|
397
|
+
)
|
|
398
|
+
def filter_species(model_genus, model_species, input_path, output_path, threshold):
|
|
399
|
+
"""Filter a sample using the species model."""
|
|
400
|
+
click.echo("Filtering...")
|
|
401
|
+
species_model = get_species_model(model_genus)
|
|
402
|
+
result = species_model.predict(Path(input_path))
|
|
403
|
+
included_ids = result.get_filtered_subsequence_labels(model_species, threshold)
|
|
404
|
+
if not included_ids:
|
|
405
|
+
click.echo("No sequences found for the given species.")
|
|
406
|
+
return
|
|
407
|
+
filter_sequences(
|
|
408
|
+
Path(input_path),
|
|
409
|
+
Path(output_path),
|
|
410
|
+
included_ids=included_ids,
|
|
411
|
+
)
|
|
412
|
+
click.echo(f"Filtered sequences saved at {output_path}.")
|
|
178
413
|
|
|
179
414
|
|
|
180
415
|
if __name__ == "__main__":
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
"""
|
|
1
|
+
"""Module for utility functions used in other modules regarding MLST."""
|
|
2
2
|
|
|
3
3
|
__author__ = "Cetin, Oemer"
|
|
4
4
|
|
|
@@ -144,12 +144,10 @@ class MlstResult:
|
|
|
144
144
|
}
|
|
145
145
|
return result
|
|
146
146
|
|
|
147
|
-
def save(self,
|
|
148
|
-
"""Saves the result
|
|
149
|
-
|
|
150
|
-
json_path = get_xspect_runs_path() / "MLST" / f"{file_name}-{display}.json"
|
|
151
|
-
json_path.parent.mkdir(exist_ok=True, parents=True)
|
|
147
|
+
def save(self, output_path: Path) -> None:
|
|
148
|
+
"""Saves the result as a JSON file."""
|
|
149
|
+
output_path.parent.mkdir(exist_ok=True, parents=True)
|
|
152
150
|
json_object = json.dumps(self.to_dict(), indent=4)
|
|
153
151
|
|
|
154
|
-
with open(
|
|
152
|
+
with open(output_path, "w", encoding="utf-8") as file:
|
|
155
153
|
file.write(json_object)
|
xspect/model_management.py
CHANGED
|
@@ -85,3 +85,9 @@ def get_models():
|
|
|
85
85
|
model_metadata["model_display_name"]
|
|
86
86
|
)
|
|
87
87
|
return model_dict
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def get_model_display_names(model_slug: str):
|
|
91
|
+
"""Get the display names included in a model."""
|
|
92
|
+
model_metadata = get_model_metadata(model_slug)
|
|
93
|
+
return list(model_metadata["display_names"].values())
|
|
@@ -26,6 +26,7 @@ class ProbabilisticFilterModel:
|
|
|
26
26
|
base_path: Path,
|
|
27
27
|
fpr: float = 0.01,
|
|
28
28
|
num_hashes: int = 7,
|
|
29
|
+
training_accessions: dict[str, list[str]] = None,
|
|
29
30
|
) -> None:
|
|
30
31
|
if k < 1:
|
|
31
32
|
raise ValueError("Invalid k value, must be greater than 0")
|
|
@@ -46,6 +47,7 @@ class ProbabilisticFilterModel:
|
|
|
46
47
|
self.fpr = fpr
|
|
47
48
|
self.num_hashes = num_hashes
|
|
48
49
|
self.index = None
|
|
50
|
+
self.training_accessions = training_accessions
|
|
49
51
|
|
|
50
52
|
def get_cobs_index_path(self) -> Path:
|
|
51
53
|
"""Returns the path to the cobs index"""
|
|
@@ -63,13 +65,19 @@ class ProbabilisticFilterModel:
|
|
|
63
65
|
"display_names": self.display_names,
|
|
64
66
|
"fpr": self.fpr,
|
|
65
67
|
"num_hashes": self.num_hashes,
|
|
68
|
+
"training_accessions": self.training_accessions,
|
|
66
69
|
}
|
|
67
70
|
|
|
68
71
|
def slug(self) -> str:
|
|
69
72
|
"""Returns a slug representation of the model"""
|
|
70
73
|
return slugify(self.model_display_name + "-" + str(self.model_type))
|
|
71
74
|
|
|
72
|
-
def fit(
|
|
75
|
+
def fit(
|
|
76
|
+
self,
|
|
77
|
+
dir_path: Path,
|
|
78
|
+
display_names: dict = None,
|
|
79
|
+
training_accessions: dict[str, list[str]] = None,
|
|
80
|
+
) -> None:
|
|
73
81
|
"""Adds filters to the model"""
|
|
74
82
|
|
|
75
83
|
if display_names is None:
|
|
@@ -84,16 +92,18 @@ class ProbabilisticFilterModel:
|
|
|
84
92
|
if not dir_path.is_dir():
|
|
85
93
|
raise ValueError("Directory path must be a directory")
|
|
86
94
|
|
|
95
|
+
self.training_accessions = training_accessions
|
|
96
|
+
|
|
87
97
|
doclist = cobs.DocumentList()
|
|
88
98
|
for file in dir_path.iterdir():
|
|
89
99
|
if file.is_file() and file.suffix[1:] in fasta_endings + fastq_endings:
|
|
90
100
|
# cobs only uses the file name to the first "." as the document name
|
|
91
|
-
if file.
|
|
92
|
-
self.display_names[file.
|
|
93
|
-
file.
|
|
101
|
+
if file.stem in display_names:
|
|
102
|
+
self.display_names[file.stem.split(".")[0]] = display_names[
|
|
103
|
+
file.stem
|
|
94
104
|
]
|
|
95
105
|
else:
|
|
96
|
-
self.display_names[file.
|
|
106
|
+
self.display_names[file.stem.split(".")[0]] = file.stem
|
|
97
107
|
doclist.add(str(file))
|
|
98
108
|
|
|
99
109
|
if len(doclist) == 0:
|
|
@@ -200,6 +210,7 @@ class ProbabilisticFilterModel:
|
|
|
200
210
|
path.parent,
|
|
201
211
|
model_json["fpr"],
|
|
202
212
|
model_json["num_hashes"],
|
|
213
|
+
model_json["training_accessions"],
|
|
203
214
|
)
|
|
204
215
|
model.display_names = model_json["display_names"]
|
|
205
216
|
|