XspecT 0.5.3__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of XspecT might be problematic.
- xspect/classify.py +38 -8
- xspect/definitions.py +30 -10
- xspect/file_io.py +2 -1
- xspect/filter_sequences.py +20 -4
- xspect/main.py +126 -28
- xspect/misclassification_detection/__init__.py +0 -0
- xspect/misclassification_detection/mapping.py +168 -0
- xspect/misclassification_detection/point_pattern_analysis.py +102 -0
- xspect/misclassification_detection/simulate_reads.py +55 -0
- xspect/mlst_feature/mlst_helper.py +15 -19
- xspect/mlst_feature/pub_mlst_handler.py +16 -19
- xspect/model_management.py +14 -17
- xspect/models/probabilistic_filter_mlst_model.py +11 -10
- xspect/models/probabilistic_filter_model.py +142 -8
- xspect/models/probabilistic_filter_svm_model.py +29 -14
- xspect/models/probabilistic_single_filter_model.py +9 -7
- xspect/models/result.py +22 -15
- xspect/ncbi.py +82 -7
- xspect/train.py +21 -4
- xspect/web.py +13 -4
- {xspect-0.5.3.dist-info → xspect-0.6.0.dist-info}/METADATA +4 -1
- {xspect-0.5.3.dist-info → xspect-0.6.0.dist-info}/RECORD +26 -22
- {xspect-0.5.3.dist-info → xspect-0.6.0.dist-info}/WHEEL +0 -0
- {xspect-0.5.3.dist-info → xspect-0.6.0.dist-info}/entry_points.txt +0 -0
- {xspect-0.5.3.dist-info → xspect-0.6.0.dist-info}/licenses/LICENSE +0 -0
- {xspect-0.5.3.dist-info → xspect-0.6.0.dist-info}/top_level.txt +0 -0
xspect/classify.py
CHANGED
@@ -1,11 +1,13 @@
+"""Classification module"""
+
 from pathlib import Path
-from
+from importlib import import_module
 import xspect.model_management as mm
-from xspect.models.probabilistic_filter_mlst_model import (
-    ProbabilisticFilterMlstSchemeModel,
-)
 from xspect.file_io import prepare_input_output_paths

+# inline imports lead to "invalid name" issues
+# pylint: disable=invalid-name
+

 def classify_genus(
     model_genus: str, input_path: Path, output_path: Path, step: int = 1
@@ -22,7 +24,12 @@ def classify_genus(
         output_path (Path): The path to the output file where results will be saved.
         step (int): The amount of kmers to be skipped.
     """
-
+    ProbabilisticSingleFilterModel = import_module(
+        "xspect.models.probabilistic_single_filter_model"
+    ).ProbabilisticSingleFilterModel
+
+    model_path = mm.get_genus_model_path(model_genus)
+    model = ProbabilisticSingleFilterModel.load(model_path)
     input_paths, get_output_path = prepare_input_output_paths(input_path)

     for idx, current_path in enumerate(input_paths):
@@ -34,7 +41,12 @@ def classify_genus(


 def classify_species(
-    model_genus: str,
+    model_genus: str,
+    input_path: Path,
+    output_path: Path,
+    step: int = 1,
+    display_name: bool = False,
+    validation: bool = False,
 ):
     """
     Classify the species of sequences.
@@ -47,12 +59,24 @@ def classify_species(
         input_path (Path): The path to the input file/directory containing sequences.
         output_path (Path): The path to the output file where results will be saved.
         step (int): The amount of kmers to be skipped.
+        display_name (bool): Includes a display name for each tax_ID.
+        validation (bool): Sorts out misclassified reads.
     """
-
+    ProbabilisticFilterSVMModel = import_module(
+        "xspect.models.probabilistic_filter_svm_model"
+    ).ProbabilisticFilterSVMModel
+
+    model_path = mm.get_species_model_path(model_genus)
+    model = ProbabilisticFilterSVMModel.load(model_path)
     input_paths, get_output_path = prepare_input_output_paths(input_path)

     for idx, current_path in enumerate(input_paths):
-        result = model.predict(
+        result = model.predict(
+            current_path,
+            step=step,
+            display_name=display_name,
+            validation=validation,
+        )
         result.input_source = current_path.name
         cls_path = get_output_path(idx, output_path)
         result.save(cls_path)
@@ -68,6 +92,12 @@ def classify_mlst(input_path: Path, output_path: Path, limit: bool):
         output_path (Path): The path to the output file where results will be saved.
         limit (bool): A limit for the highest allele_id results that are shown.
     """
+    pick_scheme_from_models_dir = import_module(
+        "xspect.mlst_feature.mlst_helper"
+    ).pick_scheme_from_models_dir
+    ProbabilisticFilterMlstSchemeModel = import_module(
+        "xspect.models.probabilistic_filter_mlst_model"
+    ).ProbabilisticFilterMlstSchemeModel

     scheme_path = pick_scheme_from_models_dir()
     model = ProbabilisticFilterMlstSchemeModel.load(scheme_path)
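
For orientation, the pattern used throughout this release replaces module-level imports with importlib lookups inside the functions, so the model classes are only resolved when a command actually runs. A minimal sketch of the same pattern (the wrapper function below is illustrative, not part of the package):

from importlib import import_module
from pathlib import Path


def load_genus_model(model_path: Path):
    # Resolve the class at call time, mirroring the import_module(...) usage in the hunks above.
    ProbabilisticSingleFilterModel = import_module(
        "xspect.models.probabilistic_single_filter_model"
    ).ProbabilisticSingleFilterModel
    return ProbabilisticSingleFilterModel.load(model_path)

This keeps importing xspect.classify cheap while deferring the model modules to the first call.
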
xspect/definitions.py
CHANGED
@@ -11,8 +11,9 @@ def get_xspect_root_path() -> Path:
     """
     Return the root path for XspecT data.

-    Returns the path to the XspecT data directory, which can be located either in the user's home
-    If neither exists, it creates the directory in
+    Returns the path to the XspecT data directory, which can be located either in the user's home
+    directory or in the current working directory. If neither exists, it creates the directory in
+    the user's home directory.

     Returns:
         Path: The path to the XspecT data directory.
@@ -34,8 +35,8 @@ def get_xspect_model_path() -> Path:
     """
     Return the path to the XspecT models.

-    Returns the path to the XspecT models directory, which is located within the XspecT data
-    If the directory does not exist, it creates the directory.
+    Returns the path to the XspecT models directory, which is located within the XspecT data
+    directory. If the directory does not exist, it creates the directory.

     Returns:
         Path: The path to the XspecT models directory.
@@ -49,8 +50,8 @@ def get_xspect_upload_path() -> Path:
     """
     Return the path to the XspecT upload directory.

-    Returns the path to the XspecT uploads directory, which is located within the XspecT data
-    If the directory does not exist, it creates the directory.
+    Returns the path to the XspecT uploads directory, which is located within the XspecT data
+    directory. If the directory does not exist, it creates the directory.

     Returns:
         Path: The path to the XspecT uploads directory.
@@ -64,8 +65,8 @@ def get_xspect_runs_path() -> Path:
     """
     Return the path to the XspecT runs directory.

-    Returns the path to the XspecT runs directory, which is located within the XspecT data
-    If the directory does not exist, it creates the directory.
+    Returns the path to the XspecT runs directory, which is located within the XspecT data
+    directory. If the directory does not exist, it creates the directory.

     Returns:
         Path: The path to the XspecT runs directory.
@@ -79,8 +80,8 @@ def get_xspect_mlst_path() -> Path:
     """
     Return the path to the XspecT MLST directory.

-    Returns the path to the XspecT MLST directory, which is located within the XspecT data
-    If the directory does not exist, it creates the directory.
+    Returns the path to the XspecT MLST directory, which is located within the XspecT data
+    directory. If the directory does not exist, it creates the directory.

     Returns:
         Path: The path to the XspecT MLST directory.
@@ -88,3 +89,22 @@ def get_xspect_mlst_path() -> Path:
     mlst_path = get_xspect_root_path() / "mlst"
     mlst_path.mkdir(exist_ok=True, parents=True)
     return mlst_path
+
+
+def get_xspect_misclassification_path() -> Path:
+    """
+    Notes:
+        Developed by Oemer Cetin as part of a Bsc thesis at Goethe University Frankfurt am Main (2025).
+        (An Integration of Alignment-Free and Alignment-Based Approaches for Bacterial Taxon Assignment)
+
+    Return the path to the XspecT Misclassification directory.
+
+    Returns the path to the XspecT Misclassification directory, which is located within the XspecT data
+    directory. If the directory does not exist, it creates the directory.
+
+    Returns:
+        Path: The path to the XspecT Misclassification directory.
+    """
+    misclassification_path = get_xspect_root_path() / "misclassification"
+    misclassification_path.mkdir(exist_ok=True, parents=True)
+    return misclassification_path
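
The new helper mirrors the existing path helpers: it derives the directory from the XspecT root and creates it on demand. A minimal usage sketch (the report file name is a placeholder, not something the package itself writes):

from xspect.definitions import get_xspect_misclassification_path

# The directory is created on the first call; later calls return the same Path.
out_dir = get_xspect_misclassification_path()
report_path = out_dir / "example_run.tsv"  # placeholder name for illustration
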
xspect/file_io.py
CHANGED
@@ -113,7 +113,8 @@ def concatenate_metagenome(fasta_dir: Path, meta_path: Path) -> None:
     Concatenate all fasta files in a directory into one file.

     This function searches for all fasta files in the specified directory and writes their contents
-    into a single output file. The output file will contain the concatenated sequences from all
+    into a single output file. The output file will contain the concatenated sequences from all
+    fasta files.

     Args:
         fasta_dir (Path): Path to the directory with the fasta files.
xspect/filter_sequences.py
CHANGED
@@ -1,7 +1,13 @@
+"""Sequence filtering module"""
+
 from pathlib import Path
-from
+from importlib import import_module
+from xspect.model_management import get_genus_model_path, get_species_model_path
 from xspect.file_io import filter_sequences, prepare_input_output_paths

+# inline imports lead to "invalid name" issues
+# pylint: disable=invalid-name
+

 def filter_species(
     model_genus: str,
@@ -31,7 +37,12 @@ def filter_species(
             available species scores.
         sparse_sampling_step (int): The step size for sparse sampling. Defaults to 1.
     """
-
+    ProbabilisticFilterSVMModel = import_module(
+        "xspect.models.probabilistic_filter_svm_model"
+    ).ProbabilisticFilterSVMModel
+
+    species_model_path = get_species_model_path(model_genus)
+    species_model = ProbabilisticFilterSVMModel.load(species_model_path)
     input_paths, get_output_path = prepare_input_output_paths(input_path)

     for idx, current_path in enumerate(input_paths):
@@ -82,11 +93,16 @@ def filter_genus(
         sparse_sampling_step (int): The step size for sparse sampling. Defaults to 1.

     """
-
+    ProbabilisticSingleFilterModel = import_module(
+        "xspect.models.probabilistic_single_filter_model"
+    ).ProbabilisticSingleFilterModel
+
+    genus_model_path = get_genus_model_path(model_genus)
+    genus_model = ProbabilisticSingleFilterModel.load(genus_model_path)
     input_paths, get_output_path = prepare_input_output_paths(input_path)

     for idx, current_path in enumerate(input_paths):
-        result =
+        result = genus_model.predict(current_path, step=sparse_sampling_step)
         result.input_source = current_path.name

         if classification_output_path:
xspect/main.py
CHANGED
@@ -2,25 +2,12 @@

 from pathlib import Path
 from uuid import uuid4
+from importlib import import_module
 import click
-import
-
-
-
-from xspect import filter_sequences
-from xspect.train import train_from_directory, train_from_ncbi
-from xspect.definitions import (
-    get_xspect_model_path,
-)
-from xspect.mlst_feature.mlst_helper import pick_scheme
-from xspect.mlst_feature.pub_mlst_handler import PubMLSTHandler
-from xspect.models.probabilistic_filter_mlst_model import (
-    ProbabilisticFilterMlstSchemeModel,
-)
-from xspect.model_management import (
-    get_model_metadata,
-    get_models,
-)
+from xspect.model_management import get_models
+
+# inline imports lead to "invalid name" issues
+# pylint: disable=invalid-name


 @click.group()
@@ -32,7 +19,10 @@ def cli():
 @cli.command()
 def web():
     """Open the XspecT web application."""
-
+    app = import_module("xspect.web").app
+    run = import_module("uvicorn").run
+
+    run(app, host="0.0.0.0", port=8000)


 # # # # # # # # # # # # # # #
@@ -49,6 +39,8 @@ def models():
 def download():
     """Download models."""
     click.echo("Downloading models, this may take a while...")
+    download_test_models = import_module("xspect.download_models").download_test_models
+
     download_test_models(
         "https://assets.adrianromberg.com/science/xspect-models-07-08-2025.zip"
     )
@@ -64,7 +56,6 @@ def list_models():
     if not available_models:
         click.echo("No models found.")
         return
-    # todo: make this machine readable
     click.echo("Models found:")
     click.echo("--------------")
     for model_type, names in available_models.items():
@@ -96,11 +87,62 @@ def train():
     help="Email of the author.",
     default=None,
 )
-def train_ncbi(model_genus, svm_steps, author, author_email):
+@click.option(
+    "--min-n50",
+    type=int,
+    help="Minimum contig N50 to filter the accessions (default: 10000).",
+    default=10000,
+)
+@click.option(
+    "--include-atypical/--exclude-atypical",
+    help="Include or exclude atypical accessions (default: exclude).",
+    default=False,
+)
+@click.option(
+    "--allow-inconclusive",
+    is_flag=True,
+    help="Allow the use of accessions with inconclusive taxonomy check status for training.",
+    default=False,
+)
+@click.option(
+    "--allow-candidatus",
+    is_flag=True,
+    help="Allow the use of Candidatus species for training.",
+    default=False,
+)
+@click.option(
+    "--allow-sp",
+    is_flag=True,
+    help="Allow the use of species with 'sp.' in their names for training.",
+    default=False,
+)
+def train_ncbi(
+    model_genus,
+    svm_steps,
+    author,
+    author_email,
+    min_n50,
+    include_atypical,
+    allow_inconclusive,
+    allow_candidatus,
+    allow_sp,
+):
     """Train a species and a genus model based on NCBI data."""
     click.echo(f"Training {model_genus} species and genus metagenome model.")
     try:
-        train_from_ncbi
+        train_from_ncbi = import_module("xspect.train").train_from_ncbi
+
+        train_from_ncbi(
+            model_genus,
+            svm_steps,
+            author,
+            author_email,
+            min_n50=min_n50,
+            exclude_atypical=not include_atypical,
+            allow_inconclusive=allow_inconclusive,
+            allow_candidatus=allow_candidatus,
+            allow_sp=allow_sp,
+        )
     except ValueError as e:
         click.echo(f"Error: {e}")
         return
@@ -143,6 +185,8 @@ def train_ncbi(model_genus, svm_steps, author, author_email):
 def train_directory(model_genus, input_path, svm_steps, meta, author, author_email):
     """Train a model based on data from a directory for a given genus."""
     click.echo(f"Training {model_genus} model with {svm_steps} SVM steps.")
+    train_from_directory = import_module("xspect.train").train_from_directory
+
     train_from_directory(
         model_genus,
         Path(input_path),
@@ -167,12 +211,28 @@ def train_directory(model_genus, input_path, svm_steps, meta, author, author_ema
 def train_mlst(choose_schemes):
     """Download alleles and train bloom filters."""
     click.echo("Updating alleles")
+    mlst_helper = import_module("xspect.mlst_feature.mlst_helper")
+    pick_scheme = mlst_helper.pick_scheme
+
+    pub_mlst_handler = import_module("xspect.mlst_feature.pub_mlst_handler")
+    PubMLSTHandler = pub_mlst_handler.PubMLSTHandler
+
+    probabilistic_filter_mlst_model = import_module(
+        "xspect.models.probabilistic_filter_mlst_model"
+    )
+    ProbabilisticFilterMlstSchemeModel = (
+        probabilistic_filter_mlst_model.ProbabilisticFilterMlstSchemeModel
+    )
+
+    definitions = import_module("xspect.definitions")
+    get_xspect_model_path = definitions.get_xspect_model_path
+
     handler = PubMLSTHandler()
     handler.download_alleles(choose_schemes)
     click.echo("Download finished")
     scheme_path = pick_scheme(handler.get_scheme_paths())
     species_name = str(scheme_path).split("/")[-2]
-    scheme_name = str(scheme_path).
+    scheme_name = str(scheme_path).rsplit("/", maxsplit=1)[-1]
     scheme_url = handler.scheme_mapping[str(scheme_path)]
     model = ProbabilisticFilterMlstSchemeModel(
         31, f"{species_name}:{scheme_name}", get_xspect_model_path(), scheme_url
@@ -230,6 +290,8 @@ def classify_seqs():
 def classify_genus(model_genus, input_path, output_path, sparse_sampling_step):
     """Classify samples using a genus model."""
     click.echo("Classifying...")
+    classify = import_module("xspect.classify")
+
     classify.classify_genus(
         model_genus, Path(input_path), Path(output_path), sparse_sampling_step
     )
@@ -268,11 +330,37 @@ def classify_genus(model_genus, input_path, output_path, sparse_sampling_step):
     help="Sparse sampling step (e. g. only every 500th kmer for '--sparse-sampling-step 500').",
     default=1,
 )
-def classify_species(model_genus, input_path, output_path, sparse_sampling_step):
+@click.option(
+    "-n",
+    "--display-names",
+    help="Includes the display names next to taxonomy-IDs.",
+    is_flag=True,
+)
+@click.option(
+    "-v",
+    "--validation",
+    help="Detects misclassification for small reads or contigs.",
+    is_flag=True,
+)
+def classify_species(
+    model_genus,
+    input_path,
+    output_path,
+    sparse_sampling_step,
+    display_names,
+    validation,
+):
     """Classify samples using a species model."""
     click.echo("Classifying...")
+    classify = import_module("xspect.classify")
+
     classify.classify_species(
-        model_genus,
+        model_genus,
+        Path(input_path),
+        Path(output_path),
+        sparse_sampling_step,
+        display_names,
+        validation,
     )


@@ -301,6 +389,8 @@ def classify_species(model_genus, input_path, output_path, sparse_sampling_step)
 def classify_mlst(input_path, output_path, limit):
     """MLST classify a sample."""
     click.echo("Classifying...")
+    classify = import_module("xspect.classify")
+
     classify.classify_mlst(Path(input_path), Path(output_path), limit)


@@ -372,6 +462,7 @@ def filter_genus(
 ):
     """Filter samples using a genus model."""
     click.echo("Filtering...")
+    filter_sequences = import_module("xspect.filter_sequences")

     filter_sequences.filter_genus(
         model_genus,
@@ -426,14 +517,16 @@ def filter_genus(
     "-t",
     "--threshold",
     type=float,
-    help="Threshold for filtering (default: 0.7). Use -1 to filter for the highest scoring
+    help="Threshold for filtering (default: 0.7). Use -1 to filter for the highest scoring "
+    "species.",
     default=0.7,
     prompt=True,
 )
 @click.option(
     "--sparse-sampling-step",
     type=int,
-    help="Sparse sampling step (e. g. only every 500th kmer for
+    help="Sparse sampling step (e. g. only every 500th kmer for "
+    "'--sparse-sampling-step 500').",
     default=1,
 )
 def filter_species(
@@ -449,9 +542,12 @@ def filter_species(

     if threshold != -1 and (threshold < 0 or threshold > 1):
         raise click.BadParameter(
-            "Threshold must be between 0 and 1, or -1 for filtering by the highest
+            "Threshold must be between 0 and 1, or -1 for filtering by the highest "
+            "scoring species."
         )

+    get_model_metadata = import_module("xspect.model_management").get_model_metadata
+
     available_species = get_model_metadata(f"{model_genus}-species")["display_names"]
     available_species = {
         id: name.replace(f"{model_genus} ", "")
@@ -476,6 +572,8 @@ def filter_species(
     ][0]

     click.echo("Filtering...")
+    filter_sequences = import_module("xspect.filter_sequences")
+
     filter_sequences.filter_species(
         model_genus,
         model_species,
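
The new CLI flags map onto keyword arguments of train_from_ncbi, with --include-atypical/--exclude-atypical inverted into exclude_atypical. A minimal sketch of the equivalent direct call (genus, step count, and author values are placeholders; the argument names come from the hunk above):

from importlib import import_module

train_from_ncbi = import_module("xspect.train").train_from_ncbi
train_from_ncbi(
    "Acinetobacter",        # model_genus (placeholder)
    5,                      # svm_steps (placeholder)
    "Jane Doe",             # author (placeholder)
    "jane@example.org",     # author_email (placeholder)
    min_n50=10000,
    exclude_atypical=True,  # the CLI default, --exclude-atypical
    allow_inconclusive=False,
    allow_candidatus=False,
    allow_sp=False,
)
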
xspect/misclassification_detection/__init__.py
ADDED
File without changes

xspect/misclassification_detection/mapping.py
ADDED
@@ -0,0 +1,168 @@
+"""
+Mapping handler for the alignment-based misclassification detection.
+
+Notes:
+    Developed by Oemer Cetin as part of a Bsc thesis at Goethe University Frankfurt am Main (2025).
+    (An Integration of Alignment-Free and Alignment-Based Approaches for Bacterial Taxon Assignment)
+"""
+
+import mappy, pysam, os, csv
+from Bio import SeqIO
+from xspect.definitions import fasta_endings
+
+__author__ = "Cetin, Oemer"
+
+
+class MappingHandler:
+    """Handler class for all mapping related procedures."""
+
+    def __init__(self, ref_genome_path: str, reads_path: str) -> None:
+        """
+        Initialise the mapping handler.
+
+        This method sets up the paths to the reference genome and query sequences.
+        Additionally, the paths to the output formats (SAM, BAM and TSV) are generated.
+
+        Args:
+            ref_genome_path (str): The path to the reference genome.
+            reads_path (str): The path to the query sequences.
+        """
+        if not os.path.isfile(ref_genome_path):
+            raise ValueError("The path to the reference genome does not exist.")
+
+        if not os.path.isfile(reads_path):
+            raise ValueError("The path to the reads does not exist.")
+
+        if not ref_genome_path.endswith(tuple(fasta_endings)) and reads_path.endswith(
+            tuple(fasta_endings)
+        ):
+            raise ValueError("The files must be FASTA-files!")
+
+        stem = reads_path.rsplit(".", 1)[0] + "_mapped"
+        self.ref_genome_path = ref_genome_path
+        self.reads_path = reads_path
+        self.sam = stem + ".sam"
+        self.bam = stem + ".sorted.bam"
+        self.tsv = stem + ".start_coordinates.tsv"
+
+    def map_reads_onto_reference(self) -> None:
+        """
+        A Method that maps reads against the respective reference genome.
+
+        This function creates a SAM file via Mappy and converts it into a BAM file.
+        """
+        # create header (entry = sequences of the reference genome)
+        ref_seq = [
+            {"SN": rec.id, "LN": len(rec.seq)}
+            for rec in SeqIO.parse(self.ref_genome_path, "fasta")
+        ]
+        header = {"HD": {"VN": "1.0"}, "SQ": ref_seq}
+        target_id = {sequence["SN"]: number for number, sequence in enumerate(ref_seq)}
+
+        reads = list(SeqIO.parse(self.reads_path, "fasta"))
+        if not reads:
+            raise ValueError("Reads file is empty.")
+
+        read_length = len(reads[0].seq)
+        preset = "map-ont" if read_length > 150 else "sr"
+        # create SAM-file
+        aln = mappy.Aligner(self.ref_genome_path, preset=preset)
+        with pysam.AlignmentFile(self.sam, "w", header=header) as out:
+            for read in reads:
+                read_seq = str(read.seq)
+                for hit in aln.map(read_seq):
+                    if hit.cigar_str is None:
+                        continue
+                    # add soft-clips so CIGAR length == len(read_seq) IMPORTANT!!
+                    leftS = hit.q_st
+                    rightS = len(read_seq) - hit.q_en
+                    cigar = (
+                        (f"{leftS}S" if leftS > 0 else "")
+                        + hit.cigar_str
+                        + (f"{rightS}S" if rightS > 0 else "")
+                    )
+
+                    mapped_region = pysam.AlignedSegment()
+                    mapped_region.query_name = read.id
+                    mapped_region.query_sequence = read_seq
+                    mapped_region.flag = 16 if hit.strand == -1 else 0
+                    mapped_region.reference_id = target_id[hit.ctg]
+                    mapped_region.reference_start = hit.r_st
+                    mapped_region.mapping_quality = (
+                        hit.mapq or 255
+                    )  # 0-60 (255 means unavailable)
+                    mapped_region.cigarstring = cigar
+                    out.write(mapped_region)
+                    break  # keep only primary
+
+        # create BAM-file
+        pysam.sort("-o", self.bam, self.sam)
+        pysam.index(self.bam)
+
+    def get_total_genome_length(self) -> int:
+        """
+        Get the genome length from a BAM-file.
+
+        This function opens a BAM-file and extracts the genome length information.
+
+        Returns:
+            int: The genome length.
+        """
+        with pysam.AlignmentFile(self.bam, "rb") as bam:
+            return sum(bam.lengths)
+
+    def extract_starting_coordinates(self) -> None:
+        """
+        Extract starting coordinates of mapped regions from a BAM-file.
+
+        This function scans through a BAM-file and creates a TSV-file.
+        The information that is extracted is the starting coordinate for each mapped read.
+        """
+        # create tsv-file with all start positions
+        with open(self.tsv, "w") as tsv:
+            tsv.write("reference_genome\tread\tmapped_starting_coordinate\n")
+            try:
+                with pysam.AlignmentFile(self.bam, "rb") as bam:
+                    entry = {
+                        i: seq["SN"] for i, seq in enumerate(bam.header.to_dict()["SQ"])
+                    }
+                    seen = set()
+                    for ref_seq in bam.references:
+                        for hit in bam.fetch(ref_seq):
+                            if (
+                                hit.is_unmapped
+                                or hit.is_secondary
+                                or hit.is_supplementary
+                            ):
+                                continue
+                            key = (hit.reference_id, hit.reference_start)
+                            if key in seen:
+                                continue
+                            seen.add(key)
+                            tsv.write(
+                                f"{entry[hit.reference_id]}\t{hit.query_name}\t{hit.reference_start}\n"
+                            )
+            except ValueError:
+                tsv.write("dummy_reference\tdummy_read\t1000\n")
+
+    def get_start_coordinates(self) -> list[int]:
+        """
+        Get the coordinates of a TSV-file.
+
+        This function opens a TSV-file and saves all starting coordinates in a list.
+
+        Returns:
+            list[int]: The list containing all starting coordinates.
+
+        Raises:
+            ValueError: If no column with starting coordinates is found.
+        """
+        coordinates = []
+        with open(self.tsv, "r", newline="") as f:
+            reader = csv.DictReader(f, delimiter="\t")
+            for row in reader:
+                val = row.get("mapped_starting_coordinate")
+                if val is None:
+                    raise ValueError("Column with starting coordinates not found.")
+                coordinates.append(int(val))
+        return coordinates
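
Taken together, the handler is driven in four steps: map, sort and index, extract start coordinates, then read them back. A minimal usage sketch with placeholder file names (the output names follow the stem logic in __init__, i.e. <reads>_mapped.*):

from xspect.misclassification_detection.mapping import MappingHandler

# Both paths are placeholders and must point to existing FASTA files.
handler = MappingHandler("reference.fasta", "reads.fasta")
handler.map_reads_onto_reference()      # writes reads_mapped.sam and reads_mapped.sorted.bam
handler.extract_starting_coordinates()  # writes reads_mapped.start_coordinates.tsv
starts = handler.get_start_coordinates()
genome_length = handler.get_total_genome_length()
print(f"{len(starts)} unique mapped start positions over {genome_length} bp of reference")
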