XspecT 0.2.7__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- xspect/definitions.py +0 -7
- xspect/download_models.py +25 -24
- xspect/fastapi.py +23 -26
- xspect/file_io.py +86 -2
- xspect/main.py +360 -98
- xspect/mlst_feature/mlst_helper.py +4 -6
- xspect/model_management.py +7 -15
- xspect/models/probabilistic_filter_model.py +16 -5
- xspect/models/probabilistic_filter_svm_model.py +33 -18
- xspect/models/probabilistic_single_filter_model.py +8 -1
- xspect/models/result.py +32 -66
- xspect/ncbi.py +265 -0
- xspect/train.py +258 -242
- {xspect-0.2.7.dist-info → xspect-0.4.1.dist-info}/METADATA +15 -21
- xspect-0.4.1.dist-info/RECORD +24 -0
- {xspect-0.2.7.dist-info → xspect-0.4.1.dist-info}/WHEEL +1 -1
- xspect/pipeline.py +0 -201
- xspect/run.py +0 -38
- xspect/train_filter/__init__.py +0 -0
- xspect/train_filter/create_svm.py +0 -45
- xspect/train_filter/extract_and_concatenate.py +0 -124
- xspect/train_filter/ncbi_api/__init__.py +0 -0
- xspect/train_filter/ncbi_api/download_assemblies.py +0 -31
- xspect/train_filter/ncbi_api/ncbi_assembly_metadata.py +0 -110
- xspect/train_filter/ncbi_api/ncbi_children_tree.py +0 -53
- xspect/train_filter/ncbi_api/ncbi_taxon_metadata.py +0 -55
- xspect-0.2.7.dist-info/RECORD +0 -33
- {xspect-0.2.7.dist-info → xspect-0.4.1.dist-info}/entry_points.txt +0 -0
- {xspect-0.2.7.dist-info → xspect-0.4.1.dist-info/licenses}/LICENSE +0 -0
- {xspect-0.2.7.dist-info → xspect-0.4.1.dist-info}/top_level.txt +0 -0
xspect/pipeline.py
DELETED
@@ -1,201 +0,0 @@
-"""Module for defining the Pipeline class."""
-
-import json
-from pathlib import Path
-from Bio.SeqRecord import SeqRecord
-from Bio import SeqIO
-from xspect.file_io import get_records_by_id
-from xspect.models.result import StepType, SubprocessingStep
-from xspect.run import Run
-from xspect.models.result import ModelResult
-from xspect.model_management import get_model_by_slug
-
-
-class ModelExecution:
-    """Class for storing a processing step of an XspecT pipeline."""
-
-    def __init__(
-        self,
-        model_slug: str,
-        sparse_sampling_step: int = 1,
-    ):
-        self.model_slug = model_slug
-        self.sparse_sampling_step = sparse_sampling_step
-        self.pipeline_steps = []
-
-    def add_pipeline_step(
-        self,
-        pipeline_step: "PipelineStep",
-    ):
-        """Add a subprocessing step to the pipeline step."""
-        self.pipeline_steps.append(pipeline_step)
-
-    def to_dict(self) -> dict:
-        """Return the processing step as a dictionary."""
-        return {
-            "model_slug": self.model_slug,
-            "sparse_sampling_step": self.sparse_sampling_step,
-            "pipeline_steps": [
-                pipeline_step.to_dict() for pipeline_step in self.pipeline_steps
-            ],
-        }
-
-    def run(
-        self,
-        sequence_input: (
-            SeqRecord
-            | list[SeqRecord]
-            | SeqIO.FastaIO.FastaIterator
-            | SeqIO.QualityIO.FastqPhredIterator
-            | Path
-        ),
-    ) -> ModelResult:
-        """Run the model on a given input."""
-        model = get_model_by_slug(self.model_slug)
-        model_result = model.predict(sequence_input, step=self.sparse_sampling_step)
-
-        for pipeline_step in self.pipeline_steps:
-            if pipeline_step.subprocessing_type == StepType.PREDICTION:
-                score = model_result.get_scores()["total"][pipeline_step.label]
-                if score >= pipeline_step.treshold:
-                    prediction_model_result = pipeline_step.model_execution.run(
-                        sequence_input
-                    )
-                    subprocessing_step = SubprocessingStep(
-                        pipeline_step.subprocessing_type,
-                        pipeline_step.label,
-                        pipeline_step.treshold,
-                        prediction_model_result,
-                    )
-                    model_result.add_subprocessing_step(subprocessing_step)
-            elif pipeline_step.subprocessing_type == StepType.FILTERING:
-                filtered_sequence_ids = model_result.get_filtered_subsequences(
-                    pipeline_step.label, pipeline_step.treshold
-                )
-                sequence_input = get_records_by_id(
-                    sequence_input, filtered_sequence_ids
-                )
-
-                filtering_model_result = None
-                if sequence_input:
-                    filtering_model_result = pipeline_step.model_execution.run(
-                        sequence_input
-                    )
-
-                subprocessing_step = SubprocessingStep(
-                    pipeline_step.subprocessing_type,
-                    pipeline_step.label,
-                    pipeline_step.treshold,
-                    filtering_model_result,
-                )
-                model_result.add_subprocessing_step(subprocessing_step)
-            else:
-                raise ValueError(
-                    f"Invalid subprocessing type {pipeline_step.subprocessing_type}"
-                )
-
-        return model_result
-
-
-class PipelineStep:
-    """Class for storing a subprocessing step of an XspecT model."""
-
-    def __init__(
-        self,
-        subprocessing_type: StepType,
-        label: str,
-        treshold: float,
-        model_execution: ModelExecution,
-    ):
-        self.subprocessing_type = subprocessing_type
-        self.label = label
-        self.treshold = treshold
-        self.model_execution = model_execution
-
-    def to_dict(self) -> dict:
-        """Return the subprocessing step as a dictionary."""
-        return {
-            "subprocessing_type": str(self.subprocessing_type),
-            "label": self.label,
-            "treshold": self.treshold,
-            "model_execution": self.model_execution.to_dict(),
-        }
-
-
-class Pipeline:
-    """Class for storing an XspecT pipeline consisting of multiple model processing steps."""
-
-    def __init__(self, display_name: str, author: str, author_email: str):
-        self.display_name = display_name
-        self.author = author
-        self.author_email = author_email
-        self.model_executions = []
-
-    def add_pipeline_step(
-        self,
-        pipeline_step: ModelExecution,
-    ):
-        """Add a processing step to the pipeline."""
-        self.model_executions.append(pipeline_step)
-
-    def to_dict(self) -> dict:
-        """Return the pipeline as a dictionary."""
-        return {
-            "display_name": self.display_name,
-            "author": self.author,
-            "author_email": self.author_email,
-            "model_executions": [
-                model_execution.to_dict() for model_execution in self.model_executions
-            ],
-        }
-
-    def to_json(self) -> str:
-        """Return the pipeline as a JSON string."""
-        return json.dumps(self.to_dict())
-
-    def save(self, path: Path) -> None:
-        """Save the pipeline as a JSON file."""
-        with open(path, "w", encoding="utf-8") as f:
-            f.write(self.to_json())
-
-    @staticmethod
-    def from_file(path: Path) -> "Pipeline":
-        """Load the pipeline from a JSON file."""
-        with open(path, "r", encoding="utf-8") as f:
-            pipeline_json = json.load(f)
-        pipeline = Pipeline(
-            pipeline_json["display_name"],
-            pipeline_json["author"],
-            pipeline_json["author_email"],
-        )
-        for model_execution in pipeline_json["model_executions"]:
-            model_execution = ModelExecution(
-                model_execution["model_slug"],
-                model_execution["sparse_sampling_step"],
-            )
-            for pipeline_step in model_execution["pipeline_steps"]:
-                model_execution.add_pipeline_step(
-                    PipelineStep(
-                        StepType(pipeline_step["subprocessing_type"]),
-                        pipeline_step["label"],
-                        pipeline_step["treshold"],
-                        ModelExecution(
-                            pipeline_step["model_execution"]["model_slug"],
-                            pipeline_step["model_execution"][
-                                "sparse_sampling_step"
-                            ],
-                        ),
-                    )
-                )
-            pipeline.add_pipeline_step(model_execution)
-        return pipeline
-
-    def run(self, input_file: Path) -> Run:
-        """Run the pipeline on a given input."""
-        run = Run(self.display_name, input_file)
-
-        for model_execution in self.model_executions:
-            result = model_execution.run(input_file)
-            run.add_result(result)
-
-        return run
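For readers tracing this removal, here is a minimal sketch of how the deleted API composed models, based only on the constructors and methods visible in the diff above. The model slugs, label, threshold, and file names are hypothetical placeholders, not values shipped with the package.

from pathlib import Path
from xspect.pipeline import Pipeline, ModelExecution, PipelineStep
from xspect.models.result import StepType

# Two chained model executions: the species model only runs when the
# genus prediction score reaches the (placeholder) threshold of 0.7.
genus = ModelExecution("acinetobacter-genus", sparse_sampling_step=10)
species = ModelExecution("acinetobacter-species")
genus.add_pipeline_step(
    PipelineStep(StepType.PREDICTION, "Acinetobacter", 0.7, species)
)

pipeline = Pipeline("Acinetobacter typing", "Jane Doe", "jane@example.org")
pipeline.add_pipeline_step(genus)
pipeline.save(Path("pipeline.json"))  # round-trips via Pipeline.from_file

run = pipeline.run(Path("input.fasta"))  # returns a Run (see xspect/run.py below)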
xspect/run.py
DELETED
@@ -1,38 +0,0 @@
-"""Module with XspecT global run class, which summarizes individual model results."""
-
-import json
-from pathlib import Path
-from xspect.models.result import ModelResult
-
-
-class Run:
-    """Class for storing the results of an XspecT run."""
-
-    def __init__(self, display_name: str, input_file: str):
-        self.display_name = display_name
-        self.input_file = input_file
-        self.results = []
-
-    def add_result(self, result: ModelResult):
-        """Add a result to the run."""
-        self.results.append(result)
-
-    def to_dict(self) -> dict:
-        """Return the run as a dictionary."""
-        return {
-            "display_name": self.display_name,
-            "input_file": str(self.input_file),
-            "results": (
-                [result.to_dict() for result in self.results] if self.results else []
-            ),
-        }
-
-    def to_json(self) -> str:
-        """Return the run as a JSON string."""
-        json_dict = self.to_dict()
-        return json.dumps(json_dict, indent=4)
-
-    def save(self, path: Path) -> None:
-        """Save the run as a JSON file."""
-        with open(path, "w", encoding="utf-8") as f:
-            f.write(self.to_json())
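The removed Run class was the container the pipeline executions returned. A short usage sketch; the names and file paths are placeholders:

from pathlib import Path
from xspect.run import Run

run = Run("example run", Path("input.fasta"))
# ... run.add_result(model_result) for each ModelResult produced ...
run.save(Path("run.json"))  # writes the indent=4 JSON from to_json()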
xspect/train_filter/__init__.py
DELETED
File without changes
xspect/train_filter/create_svm.py
DELETED
@@ -1,45 +0,0 @@
-"""This module contains functions to select and download assemblies for SVM creation."""
-
-from time import sleep
-from loguru import logger
-from xspect.train_filter.ncbi_api import download_assemblies
-
-
-def select_assemblies(accessions):
-    """Selects up to 4 assemblies, ideally assemblies that were not used for training the filters.
-
-    :param accessions: All selected assembly accessions for every species.
-    :type accessions: dict
-    :return: Dict with species name as key and selected accessions as value.
-    """
-
-    all_accessions = {
-        sci_name: curr_accessions[-4:]
-        for sci_name, curr_accessions in accessions.items()
-    }
-
-    return all_accessions
-
-
-def get_svm_assemblies(all_accessions, dir_name):
-    """Download assemblies for svm creation.
-
-    :param all_accessions: Contains lists with all previously selected assemblies for every species.
-    :type all_accessions: dict
-    :param dir_name: Name of the parent directory.
-    :type dir_name: str
-    """
-    # Select accessions for download.
-    selected_accessions = select_assemblies(all_accessions)
-
-    # Download assemblies.
-    for sci_name, accessions in selected_accessions.items():
-        sleep(5)
-        logger.info("Downloading {name}", name=sci_name)
-        file_name = sci_name + ".zip"
-        download_assemblies.download_assemblies(
-            accessions=accessions,
-            dir_name=dir_name,
-            target_folder="training_data_zipped",
-            zip_file_name=file_name,
-        )
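Note that despite its docstring, select_assemblies simply keeps the last four accessions per species (curr_accessions[-4:]). An illustrative input/output pair with made-up accession IDs:

accessions = {"Acinetobacter baumannii": ["A1", "A2", "A3", "A4", "A5"]}
select_assemblies(accessions)
# -> {"Acinetobacter baumannii": ["A2", "A3", "A4", "A5"]}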
xspect/train_filter/extract_and_concatenate.py
DELETED
@@ -1,124 +0,0 @@
-"""Module for extracting and concatenating assemblies."""
-
-__author__ = "Berger, Phillip"
-
-import os
-import shutil
-from pathlib import Path
-from Bio import SeqIO
-from xspect import file_io
-from xspect.definitions import get_xspect_tmp_path, fasta_endings
-
-
-def change_header(assemblies_path, species_accessions: dict):
-    """Change the header of the assemblies to the species name."""
-    files = os.listdir(assemblies_path)
-    # Iterate through all species.
-    for name, accessions in species_accessions.items():
-        # Iterate through all accessions of the current species.
-        for accession in accessions:
-            # Iterate through all file names.
-            for file in files:
-                if accession in file:
-                    file_path = assemblies_path / str(file)
-                    # Change the header.
-                    with open(file_path, "r", encoding="utf-8") as f:
-                        sequence = ""
-                        for line in f.readlines():
-                            if line[0] != ">":
-                                sequence += line
-                    new_header = f">{name}\n"
-                    with open(file_path, "w", encoding="utf-8") as f:
-                        f.write(new_header)
-                        f.write(sequence)
-
-
-def copy_assemblies(unzipped_path, assemblies_path):
-    """Copy all assemblies to a new directory."""
-    os.mkdir(assemblies_path)
-    for folder in os.listdir(unzipped_path):
-        for root, _, files in os.walk(unzipped_path / str(folder)):
-            for file in files:
-                file_ending = file.split(".")[-1]
-                if file_ending in fasta_endings:
-                    file_path = Path(root) / file
-                    shutil.copy(file_path, (assemblies_path / file))
-
-
-def concatenate_bf(unzipped_path, concatenate_path):
-    """Concatenate all assemblies for Bloom filter training."""
-
-    all_assemblies = []
-
-    # Make new directory
-    os.mkdir(concatenate_path)
-
-    # Open the fasta files for each species.
-    for folder in os.listdir(unzipped_path):
-        species_files = []
-        # Walk through dirs to get all fasta files.
-        for root, _, files in os.walk(unzipped_path / folder):
-            for file in files:
-                file_ending = file.split(".")[-1]
-                if file_ending in fasta_endings:
-                    species_files.append(Path(root) / file)
-                    all_assemblies.append(".".join(str(file).split(".")[:-1]))
-
-        # Gather all sequences and headers.
-        sequences = []
-        headers = []
-        for file in species_files:
-            records = SeqIO.parse(file, "fasta")
-            for record in records:
-                headers.append(record.id)
-                sequences.append(str(record.seq))
-
-        # Concatenate sequences
-        species_sequence = "".join(sequences)
-        species_header = ">" + " § ".join(headers) + "\n"
-
-        # Save concatenated sequences and headers
-        species_path = concatenate_path / (folder + ".fasta")
-        with open(species_path, "w", encoding="utf-8") as species_file:
-            species_file.write(species_header)
-            species_file.write(species_sequence)
-
-    return all_assemblies
-
-
-def save_all_assemblies(dir_path: Path, all_assemblies: list[str]):
-    """Save all assemblies to a file."""
-    path = dir_path / "all_bf_assemblies.txt"
-    with open(path, "w", encoding="utf-8") as file:
-        for assembly in all_assemblies:
-            file.write(f"{assembly}\n")
-
-
-def bf(dir_name: str, delete: bool):
-    """Extract and concatenate assemblies for Bloom filter training."""
-    dir_path = get_xspect_tmp_path() / dir_name
-    zip_path = dir_path / "zip_files"
-    unzipped_path = dir_path / "zip_files_extracted"
-    concatenate_path = dir_path / "concatenate"
-    file_io.extract_zip(zip_path, unzipped_path)
-    all_assemblies = concatenate_bf(unzipped_path, concatenate_path)
-    save_all_assemblies(dir_path, all_assemblies)
-    if delete:
-        file_io.delete_zip_files(zip_path)
-        shutil.rmtree(zip_path, ignore_errors=False)
-        shutil.rmtree(unzipped_path, ignore_errors=False)
-
-
-def svm(species_accessions: dict, dir_name: str, delete: bool):
-    """Extract and concatenate assemblies for generating SVM training data."""
-    dir_path = get_xspect_tmp_path() / dir_name
-    zip_path = dir_path / "training_data_zipped"
-    unzipped_path = dir_path / "training_data_unzipped"
-    assemblies_path = dir_path / "training_data"
-    file_io.extract_zip(zip_path, unzipped_path)
-    copy_assemblies(unzipped_path, assemblies_path)
-    change_header(assemblies_path, species_accessions)
-    if delete:
-        file_io.delete_zip_files(zip_path)
-        shutil.rmtree(zip_path, ignore_errors=False)
-        shutil.rmtree(unzipped_path, ignore_errors=False)
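For orientation, the directory contract of the removed bf() entry point, reconstructed from the paths it builds under the XspecT tmp directory; the dir_name is a placeholder:

from xspect.train_filter.extract_and_concatenate import bf

# <tmp>/<dir_name>/zip_files/             input: downloaded assembly zips
# <tmp>/<dir_name>/zip_files_extracted/   created via file_io.extract_zip
# <tmp>/<dir_name>/concatenate/           output: one concatenated FASTA per species
# <tmp>/<dir_name>/all_bf_assemblies.txt  output: names of all used assemblies
bf("my_training_dir", delete=False)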
xspect/train_filter/ncbi_api/__init__.py
DELETED
File without changes
xspect/train_filter/ncbi_api/download_assemblies.py
DELETED
@@ -1,31 +0,0 @@
-"""This module contains methods to download assemblies from the NCBI database."""
-
-__author__ = "Berger, Phillip"
-
-# pylint: disable=line-too-long
-
-import os
-import requests
-from xspect.definitions import get_xspect_tmp_path
-
-
-def download_assemblies(accessions, dir_name, target_folder, zip_file_name):
-    """Download and save assemblies from the NCBI database.
-
-    :param accessions: All collected accessions from the NCBI RefSeq-database.
-    :type accessions: list
-    :param dir_name: Name of the directory where the assemblies will be saved.
-    :type dir_name: str
-    :param target_folder: Name for the folder in which the downloaded files will be stored.
-    :type target_folder: str
-    :param zip_file_name: Name of the zip file. E.g. Klebsiella aerogenes.zip.
-    :type zip_file_name: str
-    """
-
-    path = get_xspect_tmp_path() / dir_name / target_folder / zip_file_name
-    api_url = f"https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/{','.join(accessions)}/download"
-    parameters = {"include_annotation_type": "GENOME_FASTA", "filename": zip_file_name}
-    os.makedirs(os.path.dirname(path), exist_ok=True)
-    genome_download = requests.get(api_url, params=parameters, timeout=30)
-    with open(path, "wb") as f:
-        f.write(genome_download.content)
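A usage sketch of the removed downloader; the accession IDs and names below are placeholders, and the call requires network access to the NCBI Datasets API:

from xspect.train_filter.ncbi_api.download_assemblies import download_assemblies

download_assemblies(
    accessions=["GCF_000000001.1", "GCF_000000002.1"],  # placeholder accessions
    dir_name="my_training_dir",
    target_folder="training_data_zipped",
    zip_file_name="Acinetobacter baumannii.zip",
)
# Writes <tmp>/my_training_dir/training_data_zipped/Acinetobacter baumannii.zip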
xspect/train_filter/ncbi_api/ncbi_assembly_metadata.py
DELETED
@@ -1,110 +0,0 @@
-"""Collects metadata of assemblies from NCBI API"""
-
-__author__ = "Berger, Phillip"
-
-from time import sleep
-
-import requests
-
-from loguru import logger
-
-
-class NCBIAssemblyMetadata:
-    """Class to collect metadata of assemblies from the NCBI API."""
-
-    _all_metadata: dict
-    _count: int
-    _parameters: dict
-    _accessions: list[str]
-    _contig_n50: int
-    _all_metadata_complete: dict
-
-    def __init__(self, all_metadata: dict, count=8, contig_n50=10000):
-        self._all_metadata = all_metadata
-        self._count = count
-        self._contig_n50 = contig_n50
-
-        self._set_parameters()
-
-        tmp_metadata = {}
-        for tax_id, curr_metadata in self._all_metadata.items():
-            sleep(2)
-            species_name = curr_metadata["sci_name"]
-            logger.info("Collecting metadata of {name}", name=species_name)
-            accessions = self._make_request(taxon=tax_id)
-            if len(accessions) != 0:
-                curr_metadata["accessions"] = accessions
-                tmp_metadata[tax_id] = curr_metadata
-
-        self._all_metadata_complete = tmp_metadata
-
-    def _set_parameters(self):
-        params = {
-            "filters.reference_only": "false",
-            "filters.assembly_source": "refseq",
-            "filters.exclude_atypical": "true",
-            "page_size": self._count,
-            "page_token": "",
-        }
-        params_ref = params.copy()
-        params_ref["filters.reference_only"] = "true"
-
-        params_comp_genome = params.copy()
-        params_comp_genome["filters.assembly_level"] = "complete_genome"
-
-        params_chrom = params.copy()
-        params_chrom["filters.assembly_level"] = "chromosome"
-
-        params_scaffold = params.copy()
-        params_scaffold["filters.assembly_level"] = "scaffold"
-
-        params_contig = params.copy()
-        params_contig["filters.assembly_level"] = "contig"
-
-        self._parameters = {
-            "params_ref": params_ref,
-            "params_comp_genome": params_comp_genome,
-            "params_chrom": params_chrom,
-            "params_scaffold": params_scaffold,
-            "params_contig": params_contig,
-        }
-
-    def _make_request(self, taxon: str):
-        api_url = f"https://api.ncbi.nlm.nih.gov/datasets/v2/genome/taxon/{taxon}/dataset_report"
-        accessions = []
-        count = 0
-        for request_type, parameters in self._parameters.items():
-            raw_response = requests.get(api_url, params=parameters, timeout=5)
-            response = raw_response.json()
-            if response:
-                try:
-                    reports = response["reports"]
-                    for report in reports:
-                        accession = report["accession"]
-                        contig_n50 = report["assembly_stats"]["contig_n50"]
-                        taxonomy_check_status = report["average_nucleotide_identity"][
-                            "taxonomy_check_status"
-                        ]
-                        if count < self._count:
-                            if (
-                                taxonomy_check_status == "OK"
-                                and contig_n50 > self._contig_n50
-                            ):
-                                accessions.append(accession)
-                                count += 1
-                        else:
-                            break
-                except KeyError:
-                    logger.debug(
-                        "While requesting: {type} an error response was given",
-                        type=request_type,
-                    )
-                    logger.debug(str(response))
-
-            if count >= self._count:
-                break
-        return accessions
-
-    def get_all_metadata(self):
-        """Returns all metadata of the assemblies."""
-        return self._all_metadata_complete
xspect/train_filter/ncbi_api/ncbi_children_tree.py
DELETED
@@ -1,53 +0,0 @@
-"""This class uses the NCBI Datasets API to get the taxonomy tree of a given Taxon.
-
-The taxonomy tree consists of only the next children to the parent taxon.
-The children are only of the next lower rank of the parent taxon.
-"""
-
-__author__ = "Berger, Phillip"
-
-import sys
-import requests
-
-from loguru import logger
-
-
-class NCBIChildrenTree:
-    _taxon: str
-    _response: dict
-    _parent_taxon_id: str
-    _children_taxon_ids = list()
-
-    def __init__(self, taxon: str):
-        self._taxon = taxon
-        self._request_tree()
-
-    def _request_tree(self):
-        """Make the request for the children tree at the NCBI Datasets API."""
-        api_url = f"https://api.ncbi.nlm.nih.gov/datasets/v2/taxonomy/taxon/{self._taxon}/filtered_subtree"
-        raw_response = requests.get(api_url, timeout=5)
-        self._response = raw_response.json()["edges"]
-        self._parent_taxon_id = str(self._response["1"]["visible_children"][0])
-        try:
-            tmp_children_ids = self._response[self._parent_taxon_id]["visible_children"]
-        except KeyError:
-            logger.error("KeyError for key: {key}", key=self._parent_taxon_id)
-            logger.error("Response: {response}", response=str(self._response))
-            logger.error("Aborting")
-            sys.exit()
-        for child_id in tmp_children_ids:
-            self._children_taxon_ids.append(str(child_id))
-
-    def parent_id(self):
-        """The NCBI taxon ID of the given Taxon at the initialisation.
-
-        :return: The taxon ID.
-        """
-        return self._parent_taxon_id
-
-    def children_ids(self) -> list[str]:
-        """The NCBI taxon IDs of all children of the given Taxon. The children are all of a lower rank than the parent.
-
-        :return: The taxon IDs as a list.
-        """
-        return self._children_taxon_ids
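A usage sketch of the removed class; the taxon is a placeholder and the constructor performs a live request against the NCBI Datasets API:

from xspect.train_filter.ncbi_api.ncbi_children_tree import NCBIChildrenTree

tree = NCBIChildrenTree("Acinetobacter")  # placeholder taxon
print(tree.parent_id())     # taxon ID the API resolved for the query
print(tree.children_ids())  # IDs of the next-lower-rank children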
xspect/train_filter/ncbi_api/ncbi_taxon_metadata.py
DELETED
@@ -1,55 +0,0 @@
-"""This module is used to retrieve metadata from the NCBI taxonomy database."""
-
-__author__ = "Berger, Phillip"
-
-import requests
-
-from loguru import logger
-
-
-class NCBITaxonMetadata:
-    """Class to retrieve metadata from the NCBI taxonomy database."""
-
-    _taxon: str
-    _response: dict
-    _all_metadata: dict
-
-    def __init__(self, taxon: list[str]):
-        self._taxon = ",".join(taxon)
-        self._all_metadata = {}
-        self._request_metadata()
-        self._collect_all_metadata()
-
-    def _request_metadata(self):
-        api_url = f"https://api.ncbi.nlm.nih.gov/datasets/v2/taxonomy/taxon/{str(self._taxon)}"
-        raw_response = requests.get(api_url, timeout=5)
-        self._response = raw_response.json()["taxonomy_nodes"]
-
-    def _collect_all_metadata(self):
-        for child_metadata in self._response:
-            taxonomy = child_metadata["taxonomy"]
-            rank = taxonomy["rank"]
-            name = taxonomy["organism_name"]
-            tax_id = str(taxonomy["tax_id"])
-            lineage = taxonomy["lineage"]
-            if "Candidatus" not in name:
-                if " sp. " not in name:
-                    metadata = {
-                        "sci_name": name,
-                        "tax_id": tax_id,
-                        "rank": rank,
-                        "lineage": lineage,
-                    }
-                    self._all_metadata[tax_id] = metadata
-                else:
-                    logger.debug("{name} was not used for training", name=name)
-            else:
-                logger.debug("{name} was not used for training", name=name)
-
-    def get_response(self):
-        """Returns the raw response from the NCBI taxonomy database."""
-        return self._response
-
-    def get_metadata(self):
-        """Returns the metadata from the NCBI taxonomy database."""
-        return self._all_metadata