XspecT 0.2.6__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of XspecT might be problematic. Click here for more details.
- xspect/definitions.py +0 -7
- xspect/download_models.py +25 -24
- xspect/fastapi.py +23 -26
- xspect/file_io.py +86 -2
- xspect/main.py +333 -98
- xspect/mlst_feature/mlst_helper.py +5 -7
- xspect/model_management.py +6 -0
- xspect/models/probabilistic_filter_model.py +16 -5
- xspect/models/probabilistic_filter_svm_model.py +33 -18
- xspect/models/probabilistic_single_filter_model.py +8 -1
- xspect/models/result.py +15 -61
- xspect/ncbi.py +265 -0
- xspect/train.py +258 -247
- {XspecT-0.2.6.dist-info → xspect-0.4.0.dist-info}/METADATA +14 -21
- xspect-0.4.0.dist-info/RECORD +24 -0
- {XspecT-0.2.6.dist-info → xspect-0.4.0.dist-info}/WHEEL +1 -1
- XspecT-0.2.6.dist-info/RECORD +0 -34
- xspect/pipeline.py +0 -201
- xspect/run.py +0 -38
- xspect/train_filter/__init__.py +0 -0
- xspect/train_filter/create_svm.py +0 -45
- xspect/train_filter/extract_and_concatenate.py +0 -124
- xspect/train_filter/html_scrap.py +0 -114
- xspect/train_filter/ncbi_api/__init__.py +0 -0
- xspect/train_filter/ncbi_api/download_assemblies.py +0 -31
- xspect/train_filter/ncbi_api/ncbi_assembly_metadata.py +0 -110
- xspect/train_filter/ncbi_api/ncbi_children_tree.py +0 -53
- xspect/train_filter/ncbi_api/ncbi_taxon_metadata.py +0 -55
- {XspecT-0.2.6.dist-info → xspect-0.4.0.dist-info}/entry_points.txt +0 -0
- {XspecT-0.2.6.dist-info → xspect-0.4.0.dist-info/licenses}/LICENSE +0 -0
- {XspecT-0.2.6.dist-info → xspect-0.4.0.dist-info}/top_level.txt +0 -0
xspect/train.py
CHANGED
|
@@ -2,271 +2,282 @@
|
|
|
2
2
|
This module contains the main functions for training the models.
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
|
-
import os
|
|
6
5
|
import shutil
|
|
7
6
|
from pathlib import Path
|
|
8
|
-
import
|
|
9
|
-
from time import localtime, perf_counter, asctime, sleep
|
|
7
|
+
from tempfile import TemporaryDirectory
|
|
10
8
|
from loguru import logger
|
|
11
|
-
from xspect.definitions import get_xspect_model_path
|
|
12
|
-
from xspect.file_io import
|
|
9
|
+
from xspect.definitions import get_xspect_model_path
|
|
10
|
+
from xspect.file_io import (
|
|
11
|
+
concatenate_species_fasta_files,
|
|
12
|
+
concatenate_metagenome,
|
|
13
|
+
extract_zip,
|
|
14
|
+
get_ncbi_dataset_accession_paths,
|
|
15
|
+
)
|
|
16
|
+
from xspect.models.probabilistic_filter_model import ProbabilisticFilterModel
|
|
13
17
|
from xspect.models.probabilistic_filter_svm_model import ProbabilisticFilterSVMModel
|
|
14
18
|
from xspect.models.probabilistic_single_filter_model import (
|
|
15
19
|
ProbabilisticSingleFilterModel,
|
|
16
20
|
)
|
|
17
|
-
from xspect.
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
"""The given input of the user will be checked. The input has to be a genus in NCBI.
|
|
32
|
-
|
|
33
|
-
:return: The genus name.
|
|
21
|
+
from xspect.ncbi import AssemblySource, NCBIHandler
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def train_from_directory(
|
|
25
|
+
display_name: str,
|
|
26
|
+
dir_path: Path,
|
|
27
|
+
meta: bool = False,
|
|
28
|
+
training_accessions: dict[str, list[str]] = None,
|
|
29
|
+
svm_accessions: list[str] = None,
|
|
30
|
+
svm_step: int = 1,
|
|
31
|
+
translation_dict: dict[str, str] = None,
|
|
32
|
+
author: str = None,
|
|
33
|
+
author_email: str = None,
|
|
34
|
+
):
|
|
34
35
|
"""
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
36
|
+
Train a model from a directory containing training data.
|
|
37
|
+
|
|
38
|
+
This function trains a probabilistic filter model using the data in the specified directory.
|
|
39
|
+
The training data should be organized in the following way:
|
|
40
|
+
- dir_path
|
|
41
|
+
- cobs
|
|
42
|
+
- <species_name_1>
|
|
43
|
+
- <fasta_file_1>
|
|
44
|
+
- <fasta_file_2>
|
|
45
|
+
- <species_name_2>
|
|
46
|
+
- <fasta_file_1>
|
|
47
|
+
- <fasta_file_2>
|
|
48
|
+
- svm (optional)
|
|
49
|
+
- <species_name_1>
|
|
50
|
+
- <svm_file_1>
|
|
51
|
+
- <svm_file_2>
|
|
52
|
+
- <species_name_2>
|
|
53
|
+
- <svm_file_1>
|
|
54
|
+
- <svm_file_2>
|
|
55
|
+
If no SVM directory is found, the model will be trained without SVM.
|
|
56
|
+
The training data should be in FASTA format. The model is saved to the xspect_data directory.
|
|
57
|
+
|
|
58
|
+
Args:
|
|
59
|
+
display_name (str): Name of the model to be trained.
|
|
60
|
+
dir_path (Path): Path to the directory containing training data.
|
|
61
|
+
meta (bool, optional): Whether to train a metagenome model. Defaults to False.
|
|
62
|
+
training_accessions (dict[str, list[str]], optional): Mapping from species key to its list of training accessions. Defaults to None.
|
|
63
|
+
svm_accessions (list[str], optional): List of SVM accession identifiers. Defaults to None.
|
|
64
|
+
svm_step (int, optional): Step size for SVM training. Defaults to 1.
|
|
65
|
+
translation_dict (dict[str, str], optional): Dictionary for display names. Defaults to None.
|
|
66
|
+
author (str, optional): Author of the model. Defaults to None.
|
|
67
|
+
author_email (str, optional): Author's email. Defaults to None.
|
|
68
|
+
|
|
69
|
+
Raises:
|
|
70
|
+
TypeError: If `display_name` is not a string.
|
|
71
|
+
TypeError: If `dir_path` is not a Path object to a valid directory.
|
|
72
|
+
ValueError: If the "cobs" directory is not found in `dir_path`.
|
|
73
|
+
ValueError: If no folders are found in the "cobs" directory.
|
|
74
|
+
ValueError: If the number of SVM folders does not match the number of COBS folders.
|
|
75
|
+
ValueError: If the names of COBS folders and SVM folders do not match.
|
|
76
|
+
ValueError: If no FASTA files are found in a COBS folder.
|
|
77
|
+
|
|
78
|
+
Notes:
|
|
79
|
+
- If the "svm" directory is not found, the model will be trained without SVM.
|
|
80
|
+
- Temporary directories are used for intermediate processing.
|
|
67
81
|
"""
|
|
68
|
-
genus = dir_name.split("_")[0]
|
|
69
|
-
|
|
70
|
-
# Starting logger.
|
|
71
|
-
logger.remove()
|
|
72
|
-
logger.add(sys.stderr, format="{time:HH:mm:ss} | {level} | {message}", level="INFO")
|
|
73
|
-
log_path = get_xspect_tmp_path() / dir_name / (genus + ".log")
|
|
74
|
-
logger.add(log_path, format="{time:HH:mm:ss} | {level} | {message}", level="DEBUG")
|
|
75
|
-
|
|
76
82
|
|
|
77
|
-
|
|
78
|
-
|
|
83
|
+
if not isinstance(display_name, str):
|
|
84
|
+
raise TypeError("display_name must be a string")
|
|
79
85
|
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
86
|
+
if not isinstance(dir_path, Path) and dir_path.exists() and dir_path.is_dir():
|
|
87
|
+
raise TypeError("dir must be Path object to a valid directory")
|
|
88
|
+
|
|
89
|
+
cobs_training_path = dir_path / "cobs"
|
|
90
|
+
if not cobs_training_path.exists():
|
|
91
|
+
raise ValueError("cobs directory not found")
|
|
92
|
+
|
|
93
|
+
cobs_folders = [f for f in cobs_training_path.iterdir() if f.is_dir()]
|
|
94
|
+
if len(cobs_folders) == 0:
|
|
95
|
+
raise ValueError("no folders found in cobs directory")
|
|
96
|
+
|
|
97
|
+
svm_path = dir_path / "svm"
|
|
98
|
+
if svm_path.exists():
|
|
99
|
+
svm_folders = [f for f in svm_path.iterdir() if f.is_dir()]
|
|
100
|
+
if len(svm_folders) != len(cobs_folders):
|
|
101
|
+
raise ValueError(
|
|
102
|
+
"number of svm folders does not match number of cobs folders"
|
|
103
|
+
)
|
|
92
104
|
|
|
93
|
-
|
|
105
|
+
for cobs_folder, svm_folder in zip(cobs_folders, svm_folders):
|
|
106
|
+
if cobs_folder.name != svm_folder.name:
|
|
107
|
+
raise ValueError("cobs folder and svm folder names do not match")
|
|
108
|
+
else:
|
|
109
|
+
print("SVM directory not found. Model will be trained without SVM.")
|
|
110
|
+
|
|
111
|
+
with TemporaryDirectory() as tmp_dir:
|
|
112
|
+
tmp_dir = Path(tmp_dir)
|
|
113
|
+
species_dir = tmp_dir / "species"
|
|
114
|
+
species_dir.mkdir(parents=True, exist_ok=True)
|
|
115
|
+
|
|
116
|
+
# concatenate files in cobs_training_data for each species
|
|
117
|
+
concatenate_species_fasta_files(cobs_folders, species_dir)
|
|
118
|
+
|
|
119
|
+
if svm_path.exists():
|
|
120
|
+
species_model = ProbabilisticFilterSVMModel(
|
|
121
|
+
k=21,
|
|
122
|
+
model_display_name=display_name,
|
|
123
|
+
author=author,
|
|
124
|
+
author_email=author_email,
|
|
125
|
+
model_type="Species",
|
|
126
|
+
base_path=get_xspect_model_path(),
|
|
127
|
+
kernel="rbf",
|
|
128
|
+
c=1.0,
|
|
129
|
+
)
|
|
130
|
+
species_model.fit(
|
|
131
|
+
species_dir,
|
|
132
|
+
svm_path,
|
|
133
|
+
display_names=translation_dict,
|
|
134
|
+
svm_step=svm_step,
|
|
135
|
+
training_accessions=training_accessions,
|
|
136
|
+
svm_accessions=svm_accessions,
|
|
137
|
+
)
|
|
138
|
+
else:
|
|
139
|
+
species_model = ProbabilisticFilterModel(
|
|
140
|
+
k=21,
|
|
141
|
+
model_display_name=display_name,
|
|
142
|
+
author=author,
|
|
143
|
+
author_email=author_email,
|
|
144
|
+
model_type="Species",
|
|
145
|
+
base_path=get_xspect_model_path(),
|
|
146
|
+
)
|
|
147
|
+
species_model.fit(
|
|
148
|
+
species_dir,
|
|
149
|
+
display_names=translation_dict,
|
|
150
|
+
training_accessions=training_accessions,
|
|
151
|
+
)
|
|
94
152
|
|
|
153
|
+
species_model.save()
|
|
95
154
|
|
|
96
|
-
|
|
97
|
-
|
|
155
|
+
if meta:
|
|
156
|
+
meta_fasta = tmp_dir / f"{display_name}.fasta"
|
|
157
|
+
concatenate_metagenome(species_dir, meta_fasta)
|
|
98
158
|
|
|
99
|
-
|
|
159
|
+
genus_model = ProbabilisticSingleFilterModel(
|
|
160
|
+
k=21,
|
|
161
|
+
model_display_name=display_name,
|
|
162
|
+
author=author,
|
|
163
|
+
author_email=author_email,
|
|
164
|
+
model_type="Genus",
|
|
165
|
+
base_path=get_xspect_model_path(),
|
|
166
|
+
)
|
|
167
|
+
genus_model.fit(
|
|
168
|
+
meta_fasta,
|
|
169
|
+
display_name,
|
|
170
|
+
training_accessions=(
|
|
171
|
+
sum(training_accessions.values(), [])
|
|
172
|
+
if training_accessions
|
|
173
|
+
else None
|
|
174
|
+
),
|
|
175
|
+
)
|
|
176
|
+
genus_model.save()
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def train_from_ncbi(
|
|
180
|
+
genus: str,
|
|
181
|
+
svm_step: int = 1,
|
|
182
|
+
author: str = None,
|
|
183
|
+
author_email: str = None,
|
|
184
|
+
):
|
|
185
|
+
"""Train a model using NCBI assembly data for a given genus.
|
|
186
|
+
|
|
187
|
+
This function trains a probabilistic filter model using the assembly data from NCBI.
|
|
188
|
+
The training data is downloaded and processed, and the model is saved to the
|
|
189
|
+
xspect_data directory.
|
|
190
|
+
|
|
191
|
+
Args:
|
|
192
|
+
genus (str): Genus name for which the model will be trained.
|
|
193
|
+
svm_step (int, optional): Step size for SVM training. Defaults to 1.
|
|
194
|
+
author (str, optional): Author of the model. Defaults to None.
|
|
195
|
+
author_email (str, optional): Author's email. Defaults to None.
|
|
196
|
+
|
|
197
|
+
Raises:
|
|
198
|
+
TypeError: If `genus` is not a string.
|
|
199
|
+
ValueError: If no species with accessions are found.
|
|
200
|
+
|
|
201
|
+
Notes:
|
|
202
|
+
- The function uses NCBI API to fetch assembly metadata.
|
|
203
|
+
- Temporary directories are used for intermediate processing.
|
|
100
204
|
"""
|
|
101
|
-
path = get_xspect_tmp_path() / dir_name / "concatenate"
|
|
102
|
-
files = os.listdir(path)
|
|
103
|
-
for file in files:
|
|
104
|
-
file_split = file.split(".")[0].split("_")
|
|
105
|
-
tax_id = file_split[0]
|
|
106
|
-
new_file_name = f"{tax_id}.fasta"
|
|
107
|
-
os.rename((path / file), (path / new_file_name))
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
def get_current_time():
|
|
111
|
-
"""Returns the current time in the form hh:mm:ss."""
|
|
112
|
-
return asctime(localtime()).split()[3]
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
def train_ncbi(genus: str, svm_step: int = 1):
|
|
116
|
-
"""Train genus and species models with NCBI assemblies from the given genus."""
|
|
117
|
-
|
|
118
205
|
if not isinstance(genus, str):
|
|
119
206
|
raise TypeError("genus must be a string")
|
|
120
207
|
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
logger.info("Getting all species of the genus")
|
|
136
|
-
children_ids = ncbi_children_tree.NCBIChildrenTree(genus).children_ids()
|
|
137
|
-
species_dict = ncbi_taxon_metadata.NCBITaxonMetadata(children_ids).get_metadata()
|
|
138
|
-
|
|
139
|
-
# Get all gcf accessions that have Taxonomy check result OK.
|
|
140
|
-
logger.info("Checking ANI data for updates")
|
|
141
|
-
ani_gcf = html_scrap.TaxonomyCheck().ani_gcf()
|
|
142
|
-
|
|
143
|
-
# Look for up to 8 assembly accessions per species.
|
|
144
|
-
logger.info("Getting assembly metadata")
|
|
145
|
-
all_metadata = ncbi_assembly_metadata.NCBIAssemblyMetadata(
|
|
146
|
-
all_metadata=species_dict, ani_gcf=ani_gcf, count=8, contig_n50=10000
|
|
147
|
-
)
|
|
148
|
-
all_metadata = all_metadata.get_all_metadata()
|
|
149
|
-
|
|
150
|
-
# Ensure that the genus has at least one species with accessions.
|
|
151
|
-
if not all_metadata:
|
|
152
|
-
raise ValueError("No species with accessions found")
|
|
153
|
-
|
|
154
|
-
# Download the chosen assemblies.
|
|
155
|
-
# One file for each species with it's downloaded assemblies in zip format.
|
|
156
|
-
|
|
157
|
-
# Iterate through all species.
|
|
158
|
-
logger.info("Downloading assemblies for bloomfilter training")
|
|
159
|
-
for metadata in all_metadata.values():
|
|
160
|
-
# Only try to download when the species has accessions.
|
|
161
|
-
if len(metadata["accessions"]) >= 1:
|
|
162
|
-
sleep(5)
|
|
163
|
-
species_name = metadata["sci_name"]
|
|
164
|
-
tax_id = metadata["tax_id"]
|
|
165
|
-
logger.info("Downloading {id}_{name}", id=tax_id, name=species_name)
|
|
166
|
-
file_name = f"{tax_id}_{species_name}.zip"
|
|
167
|
-
|
|
168
|
-
# Selecting the first 4 assemblies for training the filters.
|
|
169
|
-
accessions = metadata["accessions"][:4]
|
|
170
|
-
|
|
171
|
-
download_assemblies.download_assemblies(
|
|
172
|
-
accessions=accessions,
|
|
173
|
-
dir_name=dir_name,
|
|
174
|
-
target_folder="zip_files",
|
|
175
|
-
zip_file_name=file_name,
|
|
176
|
-
)
|
|
177
|
-
logger.info("Concatenating and extracting")
|
|
178
|
-
|
|
179
|
-
# Concatenate all assemblies of each species.
|
|
180
|
-
extract_and_concatenate.bf(dir_name=dir_name, delete=True)
|
|
181
|
-
concatenate_meta(get_xspect_tmp_path() / dir_name, genus)
|
|
208
|
+
ncbi_handler = NCBIHandler()
|
|
209
|
+
genus_tax_id = ncbi_handler.get_genus_taxon_id(genus)
|
|
210
|
+
species_ids = ncbi_handler.get_species(genus_tax_id)
|
|
211
|
+
species_names = ncbi_handler.get_taxon_names(species_ids)
|
|
212
|
+
|
|
213
|
+
filtered_species_ids = [
|
|
214
|
+
tax_id
|
|
215
|
+
for tax_id in species_ids
|
|
216
|
+
if "candidatus" not in species_names[tax_id].lower()
|
|
217
|
+
and " sp." not in species_names[tax_id].lower()
|
|
218
|
+
]
|
|
219
|
+
filtered_species_names = {
|
|
220
|
+
str(tax_id): species_names[tax_id] for tax_id in filtered_species_ids
|
|
221
|
+
}
|
|
182
222
|
|
|
183
|
-
# Download assemblies for svm creation.
|
|
184
|
-
logger.info("Downloading assemblies for support-vector-machine training")
|
|
185
223
|
accessions = {}
|
|
186
|
-
for
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
end = perf_counter()
|
|
248
|
-
|
|
249
|
-
logger.info("Program runtime: {time} m", time=(round((end - start) / 60, 2)))
|
|
250
|
-
logger.info("XspecT-trainer is finished.")
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
def train_from_directory(display_name: str, dir_path: Path, meta: bool = False):
|
|
254
|
-
"""Train the gene family and gene filter.
|
|
255
|
-
|
|
256
|
-
:param display_name: Name of the model.
|
|
257
|
-
:param dir: Input directory.
|
|
258
|
-
"""
|
|
259
|
-
|
|
260
|
-
if not isinstance(display_name, str):
|
|
261
|
-
raise TypeError("display_name must be a string")
|
|
262
|
-
|
|
263
|
-
if not isinstance(dir_path, Path) and dir_path.exists() and dir_path.is_dir():
|
|
264
|
-
raise ValueError("dir must be Path object to a valid directory")
|
|
265
|
-
|
|
266
|
-
# check if the directory contains the necessary files
|
|
267
|
-
# copy to temp path
|
|
268
|
-
# check if svm training data exists
|
|
269
|
-
# train model, with svm data if it exists
|
|
270
|
-
# add display names
|
|
271
|
-
# train metagenome model
|
|
272
|
-
# clean up temp path
|
|
224
|
+
for tax_id in filtered_species_ids:
|
|
225
|
+
taxon_accessions = ncbi_handler.get_highest_quality_accessions(
|
|
226
|
+
tax_id, AssemblySource.REFSEQ, 8
|
|
227
|
+
)
|
|
228
|
+
if not taxon_accessions:
|
|
229
|
+
logger.warning(f"No assemblies found for tax_id {tax_id}. Skipping.")
|
|
230
|
+
filtered_species_names.pop(str(tax_id), None)
|
|
231
|
+
continue
|
|
232
|
+
accessions[tax_id] = taxon_accessions
|
|
233
|
+
|
|
234
|
+
if not accessions:
|
|
235
|
+
raise ValueError(
|
|
236
|
+
"No species with accessions found. Please check the genus name."
|
|
237
|
+
)
|
|
238
|
+
|
|
239
|
+
with TemporaryDirectory() as tmp_dir:
|
|
240
|
+
tmp_dir = Path(tmp_dir)
|
|
241
|
+
cobs_dir = tmp_dir / "cobs"
|
|
242
|
+
svm_dir = tmp_dir / "svm"
|
|
243
|
+
cobs_dir.mkdir(parents=True, exist_ok=True)
|
|
244
|
+
svm_dir.mkdir(parents=True, exist_ok=True)
|
|
245
|
+
|
|
246
|
+
ncbi_handler.download_assemblies(
|
|
247
|
+
accessions=sum(accessions.values(), []), output_dir=tmp_dir
|
|
248
|
+
)
|
|
249
|
+
extract_zip(tmp_dir, tmp_dir)
|
|
250
|
+
accession_paths = get_ncbi_dataset_accession_paths(tmp_dir / "ncbi_dataset")
|
|
251
|
+
|
|
252
|
+
# select accessions
|
|
253
|
+
cobs_accessions = {}
|
|
254
|
+
svm_accessions = {}
|
|
255
|
+
for tax_id, accession_list in accessions.items():
|
|
256
|
+
cobs_accessions[tax_id] = accession_list[:4]
|
|
257
|
+
svm_accessions[tax_id] = accession_list[-4:]
|
|
258
|
+
|
|
259
|
+
# move files
|
|
260
|
+
for tax_id, accession_list in cobs_accessions.items():
|
|
261
|
+
tax_id_dir = cobs_dir / str(tax_id)
|
|
262
|
+
tax_id_dir.mkdir(parents=True, exist_ok=True)
|
|
263
|
+
for accession in accession_list:
|
|
264
|
+
accession_path = accession_paths[accession]
|
|
265
|
+
shutil.copy(accession_path, tax_id_dir / f"{accession}.fasta")
|
|
266
|
+
for tax_id, accession_list in svm_accessions.items():
|
|
267
|
+
tax_id_dir = svm_dir / str(tax_id)
|
|
268
|
+
tax_id_dir.mkdir(parents=True, exist_ok=True)
|
|
269
|
+
for accession in accession_list:
|
|
270
|
+
accession_path = accession_paths[accession]
|
|
271
|
+
shutil.copy(accession_path, tax_id_dir / f"{accession}.fasta")
|
|
272
|
+
|
|
273
|
+
train_from_directory(
|
|
274
|
+
display_name=genus,
|
|
275
|
+
dir_path=tmp_dir,
|
|
276
|
+
meta=True,
|
|
277
|
+
training_accessions=cobs_accessions,
|
|
278
|
+
svm_accessions=svm_accessions,
|
|
279
|
+
svm_step=svm_step,
|
|
280
|
+
translation_dict=filtered_species_names,
|
|
281
|
+
author=author,
|
|
282
|
+
author_email=author_email,
|
|
283
|
+
)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
2
|
Name: XspecT
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.4.0
|
|
4
4
|
Summary: Tool to monitor and characterize pathogens using Bloom filters.
|
|
5
5
|
License: MIT License
|
|
6
6
|
|
|
@@ -54,34 +54,27 @@ Requires-Dist: sphinx-autobuild; extra == "docs"
|
|
|
54
54
|
Provides-Extra: test
|
|
55
55
|
Requires-Dist: pytest; extra == "test"
|
|
56
56
|
Requires-Dist: pytest-cov; extra == "test"
|
|
57
|
+
Dynamic: license-file
|
|
57
58
|
|
|
58
59
|
# XspecT - Acinetobacter Species Assignment Tool
|
|
60
|
+
<!-- start intro -->
|
|
59
61
|

|
|
60
62
|
[](https://github.com/pylint-dev/pylint)
|
|
61
63
|
[](https://github.com/psf/black)
|
|
62
64
|
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
<!-- start intro -->
|
|
66
|
-
XspecT is a Python-based tool to taxonomically classify sequence-reads (or assembled genomes) on the species and/or MLST level using [Bloom Filters] and a [Support Vector Machine].
|
|
67
|
-
<br/><br/>
|
|
68
|
-
|
|
69
|
-
XspecT utilizes the uniqueness of kmers and compares extracted kmers from the input-data to a reference database. Bloom Filter ensure a fast lookup in this process. For a final prediction the results are classified using a Support Vector Machine.
|
|
70
|
-
<br/>
|
|
65
|
+
XspecT is a Python-based tool to taxonomically classify sequence-reads (or assembled genomes) on the species and/or MLST level using [kmer indices] and a [Support Vector Machine].
|
|
71
66
|
|
|
72
|
-
|
|
73
|
-
<br/>
|
|
67
|
+
XspecT utilizes the uniqueness of kmers and compares extracted kmers from the input data to a reference database. Bloom filters ensure a fast lookup in this process. For a final prediction, the results are classified using a Support Vector Machine.
|
|
74
68
|
|
|
75
|
-
The tool is available as a web-based application and a
|
|
69
|
+
The tool is available as a web-based application and as a command line interface.
|
|
76
70
|
|
|
77
|
-
[
|
|
71
|
+
[kmer indices]: https://arxiv.org/abs/1905.09624
|
|
78
72
|
[Support Vector Machine]: https://en.wikipedia.org/wiki/Support-vector_machine
|
|
79
|
-
[blaOxa-genes]: https://en.wikipedia.org/wiki/Beta-lactamase#OXA_beta-lactamases_(class_D)
|
|
80
73
|
<!-- end intro -->
|
|
81
74
|
|
|
82
75
|
<!-- start quickstart -->
|
|
83
76
|
## Installation
|
|
84
|
-
To install
|
|
77
|
+
To install XspecT, please download the latest 64-bit Python version and install the package using pip:
|
|
85
78
|
```
|
|
86
79
|
pip install xspect
|
|
87
80
|
```
|
|
@@ -91,23 +84,23 @@ Please note that Windows and Alpine Linux is currently not supported.
|
|
|
91
84
|
### Get the models
|
|
92
85
|
To download basic pre-trained models, you can use the built-in command:
|
|
93
86
|
```
|
|
94
|
-
xspect download
|
|
87
|
+
xspect models download
|
|
95
88
|
```
|
|
96
89
|
Additional species models can be trained using:
|
|
97
90
|
```
|
|
98
|
-
xspect train
|
|
91
|
+
xspect models train ncbi
|
|
99
92
|
```
|
|
100
93
|
|
|
101
94
|
### How to run the web app
|
|
102
95
|
To run the web app, install and run [XspecT Web](https://github.com/aromberg/xspect-web). Additionally, run XspecT in API mode:
|
|
103
96
|
```
|
|
104
|
-
xspect
|
|
97
|
+
xspect web
|
|
105
98
|
```
|
|
106
99
|
|
|
107
100
|
### How to use the XspecT command line interface
|
|
108
|
-
Run
|
|
101
|
+
Run XspecT, passing the desired configuration as command-line arguments.
|
|
109
102
|
```
|
|
110
|
-
xspect classify
|
|
103
|
+
xspect classify species
|
|
111
104
|
```
|
|
112
105
|
For further instructions on how to use the command line interface, please refer to the [documentation] or execute:
|
|
113
106
|
```
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
xspect/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
+
xspect/definitions.py,sha256=fVn_li_s2hriOSGJ69o_H8H-vkw1znvkryhBj7WMnF4,1219
|
|
3
|
+
xspect/download_models.py,sha256=y1wFJZa1xOJfvUP78zKkRs46O-WqKBL90vmo5AYUio0,853
|
|
4
|
+
xspect/fastapi.py,sha256=DOef3MqWPdBmdYBo8Z9SPmWrbJHOsQxQe3GrC4f__Rc,3165
|
|
5
|
+
xspect/file_io.py,sha256=YmfoKEQdHHEi8dO2G5Kt4tSNi5LuWW0VZ74pyYRHiTo,5937
|
|
6
|
+
xspect/main.py,sha256=uVj1fooDU5WW8sMug5YPwuAphb8zd3PDpNFNlTIyXBw,11155
|
|
7
|
+
xspect/model_management.py,sha256=LItMidbfxZfttEZHa8da_nnkwkH7XVLWDM0uVrFUZ0Q,3753
|
|
8
|
+
xspect/ncbi.py,sha256=sSJO3g8n89Qw6UJjAy13bpjOcIGSquTKNKVHNUMbDeM,10072
|
|
9
|
+
xspect/train.py,sha256=7I7-inWGJe_VDzII9dLZ8U-8SUCZDIrhb-eNOZEyfss,10703
|
|
10
|
+
xspect/mlst_feature/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
11
|
+
xspect/mlst_feature/mlst_helper.py,sha256=3zhhEomkk-qiObbQ82TM_YHuyVCJ7_XgyzsYM_4TS0E,5760
|
|
12
|
+
xspect/mlst_feature/pub_mlst_handler.py,sha256=oss3CkJNt6041p3qnMdOfoX8ZgUfpB93CUim-Yakc9A,5031
|
|
13
|
+
xspect/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
14
|
+
xspect/models/probabilistic_filter_mlst_model.py,sha256=JMc0yBJPo7J9b-GpvhDmzhwWPIKOwatAq0edDgM72PE,11735
|
|
15
|
+
xspect/models/probabilistic_filter_model.py,sha256=l8mhcRgHPso7qIgI56buCnE3ZleO3gPWOZEpgrycOBA,10029
|
|
16
|
+
xspect/models/probabilistic_filter_svm_model.py,sha256=xXimcv3iWnG1JiFyrk6UqkP9hFIxWGDdb__fRdQYwro,6245
|
|
17
|
+
xspect/models/probabilistic_single_filter_model.py,sha256=yxWnCt4IP-3ZRLP4pRA3f2VTHc0_4g17PDCyOFayDDg,4090
|
|
18
|
+
xspect/models/result.py,sha256=fhTS43XYAIkNiiAMyNpaif0kM4Ab3xLBnVJnutkOuFU,3400
|
|
19
|
+
xspect-0.4.0.dist-info/licenses/LICENSE,sha256=bhBGDKIRUVwYIHGOGO5hshzuVHyqFJajvSOA3XXOLKI,1094
|
|
20
|
+
xspect-0.4.0.dist-info/METADATA,sha256=mmsNmdiRqOC0RCBe7yW6oofue2OctwErCWVyiJD86nI,4439
|
|
21
|
+
xspect-0.4.0.dist-info/WHEEL,sha256=ck4Vq1_RXyvS4Jt6SI0Vz6fyVs4GWg7AINwpsaGEgPE,91
|
|
22
|
+
xspect-0.4.0.dist-info/entry_points.txt,sha256=L7qliX3pIuwupQxpuOSsrBJCSHYPOPNEzH8KZKQGGUw,43
|
|
23
|
+
xspect-0.4.0.dist-info/top_level.txt,sha256=hdoa4cnBv6OVzpyhMmyxpJxEydH5n2lDciy8urc1paE,7
|
|
24
|
+
xspect-0.4.0.dist-info/RECORD,,
|