XspecT 0.2.6__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff shows the content of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of XspecT might be problematic.

XspecT-0.2.6.dist-info/RECORD DELETED
@@ -1,34 +0,0 @@
- xspect/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- xspect/definitions.py,sha256=Z6RiCOQnsA_i8DPIq-7SUhrKo7KYf1Azp36UZZpcOX4,1419
- xspect/download_models.py,sha256=lml8pSyM0pF-MxghgSRC9noDT4pkUcBZraaTTImVfbA,739
- xspect/fastapi.py,sha256=FDiGXJmLEeTLD83Hem8yV5aoPJ-GhSG5WmDIQRAA_w4,3257
- xspect/file_io.py,sha256=zKhl6Fd9KZAYiD8YgIyje5TbDYk5lxMp1WUrNkGSBo8,2779
- xspect/main.py,sha256=3HqmnMowjkLNwhaZWtY4aeJCyCyT6h_nZWzYIunHfKg,5325
- xspect/model_management.py,sha256=xF-wjVNJbXYv64RajsIcpLfZUvicDyalJEdSeCx3nQI,3542
- xspect/pipeline.py,sha256=h7duhVZ-hupwO_KQPstzFo8KMfMI2yleb9HmtTiMjic,7219
- xspect/run.py,sha256=OJ7pCFqva3AhIYklKjVnqWGooVRO7S3b56kIAy-xabY,1189
- xspect/train.py,sha256=p_5BPh7XNA7R2h8MwpN0-AwzjbNIxdmeMKztP7RU4g8,9499
- xspect/mlst_feature/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- xspect/mlst_feature/mlst_helper.py,sha256=omqKmreah5qaspnJ5qKp_9oZsdHxi0tUJnEzZbpWPEw,5916
- xspect/mlst_feature/pub_mlst_handler.py,sha256=oss3CkJNt6041p3qnMdOfoX8ZgUfpB93CUim-Yakc9A,5031
- xspect/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- xspect/models/probabilistic_filter_mlst_model.py,sha256=JMc0yBJPo7J9b-GpvhDmzhwWPIKOwatAq0edDgM72PE,11735
- xspect/models/probabilistic_filter_model.py,sha256=zCn5dcuq5Z4pvmsV9igS0lQ1plUi9-Kky_zRflfrIkI,9659
- xspect/models/probabilistic_filter_svm_model.py,sha256=uabDrF1_CSuIWf9wWyQAkqjAuRUBzEZLkv3J6YHfJsM,5641
- xspect/models/probabilistic_single_filter_model.py,sha256=TdGbQp8ylOif7dD13OSWaS-zFNJo8mXOb6BaQ0mcPdo,3810
- xspect/models/result.py,sha256=RuYqagyG5QbFlW408haXzDbPYJ6yS2flcWcJ8Dy6UjM,4834
- xspect/train_filter/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- xspect/train_filter/create_svm.py,sha256=w6gq40yHINVfNzLhJfYFykUaNCwpU9AEDcbkUfis3DY,1504
- xspect/train_filter/extract_and_concatenate.py,sha256=lLrczGgfZi2vAGqxq8fcEmJi5pvqyK33JkB_ZoCNYG8,4840
- xspect/train_filter/html_scrap.py,sha256=76VV_ZbvD2I3IxRb62SiQwRPu2tr4fwn1HkfJQYaosM,3809
- xspect/train_filter/ncbi_api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- xspect/train_filter/ncbi_api/download_assemblies.py,sha256=MB_mxSjCTL05DqIt1WQem8AGU3PjtJnzPndeI9J-AOI,1285
- xspect/train_filter/ncbi_api/ncbi_assembly_metadata.py,sha256=puzDIws-yyBAEHwSAIYUM7g8FpLFmvOKh5xH1EsY8ZE,3830
- xspect/train_filter/ncbi_api/ncbi_children_tree.py,sha256=_8puOsnsKp5lsMV2gZY1ijkfD_BZKG9eXZCX09qph5E,1819
- xspect/train_filter/ncbi_api/ncbi_taxon_metadata.py,sha256=O6JDXC4E6AYaf7NPnb34eSJyZhMB8r--bjoVF_ZsEdA,1868
- XspecT-0.2.6.dist-info/LICENSE,sha256=bhBGDKIRUVwYIHGOGO5hshzuVHyqFJajvSOA3XXOLKI,1094
- XspecT-0.2.6.dist-info/METADATA,sha256=cO5cpS3zD45dpARRO92XG9bCfCbJ1LE-bAaIxkb0bwQ,4714
- XspecT-0.2.6.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
- XspecT-0.2.6.dist-info/entry_points.txt,sha256=L7qliX3pIuwupQxpuOSsrBJCSHYPOPNEzH8KZKQGGUw,43
- XspecT-0.2.6.dist-info/top_level.txt,sha256=hdoa4cnBv6OVzpyhMmyxpJxEydH5n2lDciy8urc1paE,7
- XspecT-0.2.6.dist-info/RECORD,,
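Each RECORD entry lists a file's path, its SHA-256 digest encoded as unpadded URL-safe base64, and its size in bytes, per the wheel specification (PEP 376/PEP 627). As a minimal sketch, the digests above could be reproduced from an unpacked 0.2.6 wheel like this (the file path is only an example):

    import base64
    import hashlib
    from pathlib import Path

    def record_entry(path: Path) -> str:
        """Build a RECORD-style line for one file: path,sha256=<digest>,<size>."""
        data = path.read_bytes()
        digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=")
        return f"{path},sha256={digest.decode()},{len(data)}"

    # Example: should reproduce the xspect/pipeline.py entry above.
    print(record_entry(Path("xspect/pipeline.py")))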
xspect/pipeline.py DELETED
@@ -1,201 +0,0 @@
- """ Module for defining the Pipeline class. """
-
- import json
- from pathlib import Path
- from Bio.SeqRecord import SeqRecord
- from Bio import SeqIO
- from xspect.file_io import get_records_by_id
- from xspect.models.result import StepType, SubprocessingStep
- from xspect.run import Run
- from xspect.models.result import ModelResult
- from xspect.model_management import get_model_by_slug
-
-
- class ModelExecution:
-     """Class for storing a processing step of an XspecT pipeline."""
-
-     def __init__(
-         self,
-         model_slug: str,
-         sparse_sampling_step: int = 1,
-     ):
-         self.model_slug = model_slug
-         self.sparse_sampling_step = sparse_sampling_step
-         self.pipeline_steps = []
-
-     def add_pipeline_step(
-         self,
-         pipeline_step: "PipelineStep",
-     ):
-         """Add a subprocessing step to the pipeline step."""
-         self.pipeline_steps.append(pipeline_step)
-
-     def to_dict(self) -> dict:
-         """Return the processing step as a dictionary."""
-         return {
-             "model_slug": self.model_slug,
-             "sparse_sampling_step": self.sparse_sampling_step,
-             "pipeline_steps": [
-                 pipeline_step.to_dict() for pipeline_step in self.pipeline_steps
-             ],
-         }
-
-     def run(
-         self,
-         sequence_input: (
-             SeqRecord
-             | list[SeqRecord]
-             | SeqIO.FastaIO.FastaIterator
-             | SeqIO.QualityIO.FastqPhredIterator
-             | Path
-         ),
-     ) -> ModelResult:
-         """Run the model on a given input."""
-         model = get_model_by_slug(self.model_slug)
-         model_result = model.predict(sequence_input, step=self.sparse_sampling_step)
-
-         for pipeline_step in self.pipeline_steps:
-             if pipeline_step.subprocessing_type == StepType.PREDICTION:
-                 score = model_result.get_scores()["total"][pipeline_step.label]
-                 if score >= pipeline_step.treshold:
-                     prediction_model_result = pipeline_step.model_execution.run(
-                         sequence_input
-                     )
-                     subprocessing_step = SubprocessingStep(
-                         pipeline_step.subprocessing_type,
-                         pipeline_step.label,
-                         pipeline_step.treshold,
-                         prediction_model_result,
-                     )
-                     model_result.add_subprocessing_step(subprocessing_step)
-             elif pipeline_step.subprocessing_type == StepType.FILTERING:
-                 filtered_sequence_ids = model_result.get_filtered_subsequences(
-                     pipeline_step.label, pipeline_step.treshold
-                 )
-                 sequence_input = get_records_by_id(
-                     sequence_input, filtered_sequence_ids
-                 )
-
-                 filtering_model_result = None
-                 if sequence_input:
-                     filtering_model_result = pipeline_step.model_execution.run(
-                         sequence_input
-                     )
-
-                 subprocessing_step = SubprocessingStep(
-                     pipeline_step.subprocessing_type,
-                     pipeline_step.label,
-                     pipeline_step.treshold,
-                     filtering_model_result,
-                 )
-                 model_result.add_subprocessing_step(subprocessing_step)
-             else:
-                 raise ValueError(
-                     f"Invalid subprocessing type {pipeline_step.subprocessing_type}"
-                 )
-
-         return model_result
-
-
- class PipelineStep:
-     """Class for storing a subprocessing step of an XspecT model."""
-
-     def __init__(
-         self,
-         subprocessing_type: StepType,
-         label: str,
-         treshold: float,
-         model_execution: ModelExecution,
-     ):
-         self.subprocessing_type = subprocessing_type
-         self.label = label
-         self.treshold = treshold
-         self.model_execution = model_execution
-
-     def to_dict(self) -> dict:
-         """Return the subprocessing step as a dictionary."""
-         return {
-             "subprocessing_type": str(self.subprocessing_type),
-             "label": self.label,
-             "treshold": self.treshold,
-             "model_execution": self.model_execution.to_dict(),
-         }
-
-
- class Pipeline:
-     """Class for storing an XspecT pipeline consisting of multiple model processing steps."""
-
-     def __init__(self, display_name: str, author: str, author_email: str):
-         self.display_name = display_name
-         self.author = author
-         self.author_email = author_email
-         self.model_executions = []
-
-     def add_pipeline_step(
-         self,
-         pipeline_step: ModelExecution,
-     ):
-         """Add a processing step to the pipeline."""
-         self.model_executions.append(pipeline_step)
-
-     def to_dict(self) -> dict:
-         """Return the pipeline as a dictionary."""
-         return {
-             "display_name": self.display_name,
-             "author": self.author,
-             "author_email": self.author_email,
-             "model_executions": [
-                 model_execution.to_dict() for model_execution in self.model_executions
-             ],
-         }
-
-     def to_json(self) -> str:
-         """Return the pipeline as a JSON string."""
-         return json.dumps(self.to_dict())
-
-     def save(self, path: Path) -> None:
-         """Save the pipeline as a JSON file."""
-         with open(path, "w", encoding="utf-8") as f:
-             f.write(self.to_json())
-
-     @staticmethod
-     def from_file(path: Path) -> "Pipeline":
-         """Load the pipeline from a JSON file."""
-         with open(path, "r", encoding="utf-8") as f:
-             pipeline_json = json.load(f)
-         pipeline = Pipeline(
-             pipeline_json["display_name"],
-             pipeline_json["author"],
-             pipeline_json["author_email"],
-         )
-         for model_execution_dict in pipeline_json["model_executions"]:
-             model_execution = ModelExecution(
-                 model_execution_dict["model_slug"],
-                 model_execution_dict["sparse_sampling_step"],
-             )
-             for pipeline_step in model_execution_dict["pipeline_steps"]:
-                 model_execution.add_pipeline_step(
-                     PipelineStep(
-                         StepType(pipeline_step["subprocessing_type"]),
-                         pipeline_step["label"],
-                         pipeline_step["treshold"],
-                         ModelExecution(
-                             pipeline_step["model_execution"]["model_slug"],
-                             pipeline_step["model_execution"][
-                                 "sparse_sampling_step"
-                             ],
-                         ),
-                     )
-                 )
-             pipeline.add_pipeline_step(model_execution)
-         return pipeline
-
-     def run(self, input_file: Path) -> Run:
-         """Run the pipeline on a given input."""
-         run = Run(self.display_name, input_file)
-
-         for model_execution in self.model_executions:
-             result = model_execution.run(input_file)
-             run.add_result(result)
-
-         return run
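For context, the deleted Pipeline API chained models so that one model's prediction could gate a follow-up model. A minimal usage sketch, assuming hypothetical model slugs, a made-up threshold, and trained models already available via xspect.model_management:

    from pathlib import Path

    from xspect.models.result import StepType
    from xspect.pipeline import ModelExecution, Pipeline, PipelineStep

    # Hypothetical slugs: run a species model only if the genus model
    # scores the label "Acinetobacter" at or above 0.7.
    species = ModelExecution("acinetobacter-species")
    genus = ModelExecution("acinetobacter-genus", sparse_sampling_step=10)
    genus.add_pipeline_step(
        PipelineStep(StepType.PREDICTION, "Acinetobacter", 0.7, species)
    )

    pipeline = Pipeline("Acinetobacter typing", "Jane Doe", "jane@example.org")
    pipeline.add_pipeline_step(genus)
    pipeline.save(Path("pipeline.json"))     # round-trips via Pipeline.from_file
    run = pipeline.run(Path("input.fasta"))  # returns a Run aggregating results
    print(run.to_json())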
xspect/run.py DELETED
@@ -1,38 +0,0 @@
- """ Module with XspecT global run class, which summarizes individual model results. """
-
- import json
- from pathlib import Path
- from xspect.models.result import ModelResult
-
-
- class Run:
-     """Class for storing the results of an XspecT run."""
-
-     def __init__(self, display_name: str, input_file: str):
-         self.display_name = display_name
-         self.input_file = input_file
-         self.results = []
-
-     def add_result(self, result: ModelResult):
-         """Add a result to the run."""
-         self.results.append(result)
-
-     def to_dict(self) -> dict:
-         """Return the run as a dictionary."""
-         return {
-             "display_name": self.display_name,
-             "input_file": str(self.input_file),
-             "results": (
-                 [result.to_dict() for result in self.results] if self.results else []
-             ),
-         }
-
-     def to_json(self) -> str:
-         """Return the run as a JSON string."""
-         json_dict = self.to_dict()
-         return json.dumps(json_dict, indent=4)
-
-     def save(self, path: Path) -> None:
-         """Save the run as a JSON file."""
-         with open(path, "w", encoding="utf-8") as f:
-             f.write(self.to_json())
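Run is a thin container: each ModelResult appended via add_result ends up in the "results" array of the JSON written by save. A short sketch (file names are placeholders):

    from pathlib import Path

    from xspect.run import Run

    run = Run("demo run", "input.fasta")
    # ModelResult objects produced by a model's predict() would be added here:
    # run.add_result(model_result)
    run.save(Path("run.json"))  # writes display_name, input_file, and results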
xspect/train_filter/create_svm.py DELETED
@@ -1,45 +0,0 @@
- """This module contains functions to select and download assemblies for SVM creation."""
-
- from time import sleep
- from loguru import logger
- from xspect.train_filter.ncbi_api import download_assemblies
-
-
- def select_assemblies(accessions):
-     """Selects up to 4 assemblies, ideally assemblies that were not used for training the filters.
-
-     :param accessions: All selected assembly accessions for every species.
-     :type accessions: dict
-     :return: Dict with species name as key and selected accessions as value.
-     """
-
-     all_accessions = {
-         sci_name: curr_accessions[-4:]
-         for sci_name, curr_accessions in accessions.items()
-     }
-
-     return all_accessions
-
-
- def get_svm_assemblies(all_accessions, dir_name):
-     """Download assemblies for SVM creation.
-
-     :param all_accessions: Contains lists with all previously selected assemblies for every species.
-     :type all_accessions: dict
-     :param dir_name: Name of the parent directory.
-     :type dir_name: str
-     """
-     # Select accessions for download.
-     selected_accessions = select_assemblies(all_accessions)
-
-     # Download assemblies.
-     for sci_name, accessions in selected_accessions.items():
-         sleep(5)
-         logger.info("Downloading {name}", name=sci_name)
-         file_name = sci_name + ".zip"
-         download_assemblies.download_assemblies(
-             accessions=accessions,
-             dir_name=dir_name,
-             target_folder="training_data_zipped",
-             zip_file_name=file_name,
-         )
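Note that select_assemblies simply keeps the last four accessions per species (the [-4:] slice), on the assumption that earlier accessions were the ones used for filter training. With hypothetical accessions:

    accessions = {
        "Acinetobacter baumannii": [
            "GCF_00001", "GCF_00002", "GCF_00003", "GCF_00004", "GCF_00005",
        ]
    }
    selected = select_assemblies(accessions)
    # -> {"Acinetobacter baumannii":
    #     ["GCF_00002", "GCF_00003", "GCF_00004", "GCF_00005"]}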
xspect/train_filter/extract_and_concatenate.py DELETED
@@ -1,124 +0,0 @@
- """ Module for extracting and concatenating assemblies. """
-
- __author__ = "Berger, Phillip"
-
- import os
- import shutil
- from pathlib import Path
- from Bio import SeqIO
- from xspect import file_io
- from xspect.definitions import get_xspect_tmp_path, fasta_endings
-
-
- def change_header(assemblies_path, species_accessions: dict):
-     """Change the header of the assemblies to the species name."""
-     files = os.listdir(assemblies_path)
-     # Iterate through all species.
-     for name, accessions in species_accessions.items():
-         # Iterate through all accessions of the current species.
-         for accession in accessions:
-             # Iterate through all file names.
-             for file in files:
-                 if accession in file:
-                     file_path = assemblies_path / str(file)
-                     # Change the header.
-                     with open(file_path, "r", encoding="utf-8") as f:
-                         sequence = ""
-                         for line in f.readlines():
-                             if line[0] != ">":
-                                 sequence += line
-                     new_header = f">{name}\n"
-                     with open(file_path, "w", encoding="utf-8") as f:
-                         f.write(new_header)
-                         f.write(sequence)
-
-
- def copy_assemblies(unzipped_path, assemblies_path):
-     """Copy all assemblies to a new directory."""
-     os.mkdir(assemblies_path)
-     for folder in os.listdir(unzipped_path):
-         for root, _, files in os.walk(unzipped_path / str(folder)):
-             for file in files:
-                 file_ending = file.split(".")[-1]
-                 if file_ending in fasta_endings:
-                     file_path = Path(root) / file
-                     shutil.copy(file_path, (assemblies_path / file))
-
-
- def concatenate_bf(unzipped_path, concatenate_path):
-     """Concatenate all assemblies for Bloom filter training."""
-
-     all_assemblies = []
-
-     # Make new directory
-     os.mkdir(concatenate_path)
-
-     # Open the fasta files for each species.
-     for folder in os.listdir(unzipped_path):
-         species_files = []
-         # Walk through dirs to get all fasta files.
-         for root, _, files in os.walk(unzipped_path / folder):
-             for file in files:
-                 file_ending = file.split(".")[-1]
-                 if file_ending in fasta_endings:
-                     species_files.append(Path(root) / file)
-                     all_assemblies.append(".".join(str(file).split(".")[:-1]))
-
-         # Gather all sequences and headers.
-         sequences = []
-         headers = []
-         for file in species_files:
-             records = SeqIO.parse(file, "fasta")
-             for record in records:
-                 headers.append(record.id)
-                 sequences.append(str(record.seq))
-
-         # Concatenate sequences
-         species_sequence = "".join(sequences)
-         species_header = ">" + " § ".join(headers) + "\n"
-
-         # Save concatenated sequences and headers
-         species_path = concatenate_path / (folder + ".fasta")
-         with open(species_path, "w", encoding="utf-8") as species_file:
-             species_file.write(species_header)
-             species_file.write(species_sequence)
-
-     return all_assemblies
-
-
- def save_all_assemblies(dir_path: Path, all_assemblies: list[str]):
-     """Save all assemblies to a file."""
-     path = dir_path / "all_bf_assemblies.txt"
-     with open(path, "w", encoding="utf-8") as file:
-         for assembly in all_assemblies:
-             file.write(f"{assembly}\n")
-
-
- def bf(dir_name: str, delete: bool):
-     """Extract and concatenate assemblies for Bloom filter training."""
-     dir_path = get_xspect_tmp_path() / dir_name
-     zip_path = dir_path / "zip_files"
-     unzipped_path = dir_path / "zip_files_extracted"
-     concatenate_path = dir_path / "concatenate"
-     file_io.extract_zip(zip_path, unzipped_path)
-     all_assemblies = concatenate_bf(unzipped_path, concatenate_path)
-     save_all_assemblies(dir_path, all_assemblies)
-     if delete:
-         file_io.delete_zip_files(zip_path)
-         shutil.rmtree(zip_path, ignore_errors=False)
-         shutil.rmtree(unzipped_path, ignore_errors=False)
-
-
- def svm(species_accessions: dict, dir_name: str, delete: bool):
-     """Extract and concatenate assemblies for generating SVM training data."""
-     dir_path = get_xspect_tmp_path() / dir_name
-     zip_path = dir_path / "training_data_zipped"
-     unzipped_path = dir_path / "training_data_unzipped"
-     assemblies_path = dir_path / "training_data"
-     file_io.extract_zip(zip_path, unzipped_path)
-     copy_assemblies(unzipped_path, assemblies_path)
-     change_header(assemblies_path, species_accessions)
-     if delete:
-         file_io.delete_zip_files(zip_path)
-         shutil.rmtree(zip_path, ignore_errors=False)
-         shutil.rmtree(unzipped_path, ignore_errors=False)
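Both entry points assume a fixed layout under the XspecT temp directory: bf() reads zip archives from <tmp>/<dir_name>/zip_files/ and writes one concatenated FASTA per species to <tmp>/<dir_name>/concatenate/, while svm() reads <tmp>/<dir_name>/training_data_zipped/ and leaves per-assembly FASTA files in <tmp>/<dir_name>/training_data/. A sketch with a hypothetical directory name:

    from xspect.train_filter import extract_and_concatenate

    # Bloom filter training data: expects archives in <tmp>/demo/zip_files/.
    extract_and_concatenate.bf(dir_name="demo", delete=False)

    # SVM training data: expects archives in <tmp>/demo/training_data_zipped/
    # and rewrites each FASTA header to the species name.
    extract_and_concatenate.svm(
        species_accessions={"Acinetobacter baumannii": ["GCF_00001"]},
        dir_name="demo",
        delete=False,
    )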
xspect/train_filter/html_scrap.py DELETED
@@ -1,114 +0,0 @@
- """ HTML Scraping for the taxonomy check results from NCBI."""
-
- __author__ = "Berger, Phillip"
-
- import datetime
- import pickle
- import sys
- import time
- import requests
- from loguru import logger
- from xspect.definitions import get_xspect_root_path
-
-
- class TaxonomyCheck:
-     """Class to get the GCFs that passed the taxonomy check from NCBI."""
-
-     _main_url = "https://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/ANI_report_prokaryotes.txt"
-     _test_url = "https://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/"
-     _main_path = get_xspect_root_path() / "taxonomy_check.txt"
-     _test_path = get_xspect_root_path() / "tax_check_date.txt"
-     _new_time: list
-     _tax_check_ok = []
-
-     def __init__(self):
-         old_time = self._get_old_tax_date()
-         self._new_time = self._get_new_tax_date()
-         # Both dates could be found.
-         # Check if the HTML file was updated since the last time it was downloaded.
-         # If yes, then update the file.
-         if self._new_time and old_time:
-             if self._new_time == old_time:
-                 logger.info("File was not updated")
-                 self._get_old_file()
-             else:
-                 logger.info("Updating file")
-                 self._update_tax_check()
-
-         # The old date does not exist.
-         # Get the HTML file for the taxonomy check results.
-         elif self._new_time and not old_time:
-             logger.info("No file was found. Creating new file")
-             self._update_tax_check()
-
-         elif not self._new_time and old_time:
-             self._get_old_file()
-
-         else:
-             logger.error("Nothing was found")
-             logger.error("Aborting")
-             sys.exit()
-
-     def _get_old_tax_date(self):
-         try:
-             with open(self._test_path, "rb") as f:
-                 old_time = pickle.load(f)
-                 return old_time
-         except FileNotFoundError:
-             return None
-
-     def _get_new_tax_date(self):
-         raw_response = requests.get(self._test_url, timeout=5)
-         data = raw_response.text.split("\n")
-         for line in data:
-             if "ANI_report_prokaryotes.txt" in line:
-                 line_parts = line.split()
-                 date_parts = line_parts[-3].split("-")
-                 date = datetime.date(
-                     int(date_parts[0]), int(date_parts[1]), int(date_parts[2])
-                 )
-                 time_parts = line_parts[-2].split(":")
-                 combined_time = datetime.time(int(time_parts[0]), int(time_parts[1]))
-                 new_time = [date, combined_time]
-
-                 return new_time
-
-         return None
-
-     def _update_tax_check(self):
-         raw_response = requests.get(self._main_url, timeout=5)
-         all_tax_checks = raw_response.text.split("\n")[1:-1]
-         self._get_gcf_ok(all_tax_checks)
-         self._save_time()
-         self._save_file()
-
-     def _get_gcf_ok(self, all_tax_checks: list):
-         tax_check_ok = []
-         for line in all_tax_checks:
-             line_parts = line.split("\t")
-             gcf = line_parts[1]
-             tax_check_status = line_parts[-1]
-             if tax_check_status == "OK":
-                 tax_check_ok.append(gcf)
-
-         self._tax_check_ok = tax_check_ok
-
-     def _save_time(self):
-         with open(self._test_path, "wb") as f:
-             pickle.dump(self._new_time, f)
-
-     def _save_file(self):
-         with open(self._main_path, "wb") as f:
-             pickle.dump(self._tax_check_ok, f)
-
-     def _get_old_file(self):
-         with open(self._main_path, "rb") as f:
-             self._tax_check_ok = pickle.load(f)
-
-     @staticmethod
-     def _get_current_time():
-         return time.asctime(time.localtime()).split()[3]
-
-     def ani_gcf(self):
-         """Returns ANI GCFs that passed the taxonomy check."""
-         return self._tax_check_ok
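The class caches both the ANI report and its FTP timestamp locally, so repeated instantiations only re-download the report when NCBI has published a newer one. Typical use reduces to a single call:

    from xspect.train_filter.html_scrap import TaxonomyCheck

    # Fetches (or reuses the cached) ANI report and keeps the GCF accessions
    # whose taxonomy check status column is "OK".
    ok_gcfs = TaxonomyCheck().ani_gcf()
    print(len(ok_gcfs), "assemblies passed the taxonomy check")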
xspect/train_filter/ncbi_api/download_assemblies.py DELETED
@@ -1,31 +0,0 @@
- """This module contains methods to download assemblies from the NCBI database."""
-
- __author__ = "Berger, Phillip"
-
- # pylint: disable=line-too-long
-
- import os
- import requests
- from xspect.definitions import get_xspect_tmp_path
-
-
- def download_assemblies(accessions, dir_name, target_folder, zip_file_name):
-     """Download and save assemblies from the NCBI database.
-
-     :param accessions: All collected accessions from the NCBI RefSeq database.
-     :type accessions: list
-     :param dir_name: Name of the directory where the assemblies will be saved.
-     :type dir_name: str
-     :param target_folder: Name for the folder in which the downloaded files will be stored.
-     :type target_folder: str
-     :param zip_file_name: Name of the zip file, e.g. "Klebsiella aerogenes.zip".
-     :type zip_file_name: str
-     """
-
-     path = get_xspect_tmp_path() / dir_name / target_folder / zip_file_name
-     api_url = f"https://api.ncbi.nlm.nih.gov/datasets/v1/genome/accession/{','.join(accessions)}/download"
-     parameters = {"include_annotation_type": "GENOME_FASTA", "filename": zip_file_name}
-     os.makedirs(os.path.dirname(path), exist_ok=True)
-     genome_download = requests.get(api_url, params=parameters, timeout=20)
-     with open(path, "wb") as f:
-         f.write(genome_download.content)
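The function hits the NCBI Datasets v1 genome download endpoint with a comma-joined accession list and writes the resulting zip to the target path. For example, with a single accession (the accession and directory name here are illustrative):

    from xspect.train_filter.ncbi_api.download_assemblies import download_assemblies

    # Writes <tmp>/demo/training_data_zipped/Klebsiella aerogenes.zip
    download_assemblies(
        accessions=["GCF_000215745.1"],  # example RefSeq accession
        dir_name="demo",
        target_folder="training_data_zipped",
        zip_file_name="Klebsiella aerogenes.zip",
    )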