XspecT-0.1.3-py3-none-any.whl → XspecT-0.2.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {XspecT-0.1.3.dist-info → XspecT-0.2.0.dist-info}/METADATA +23 -29
- XspecT-0.2.0.dist-info/RECORD +30 -0
- {XspecT-0.1.3.dist-info → XspecT-0.2.0.dist-info}/WHEEL +1 -1
- xspect/definitions.py +42 -0
- xspect/download_filters.py +11 -26
- xspect/fastapi.py +101 -0
- xspect/file_io.py +34 -103
- xspect/main.py +70 -66
- xspect/model_management.py +88 -0
- xspect/models/__init__.py +0 -0
- xspect/models/probabilistic_filter_model.py +277 -0
- xspect/models/probabilistic_filter_svm_model.py +169 -0
- xspect/models/probabilistic_single_filter_model.py +109 -0
- xspect/models/result.py +148 -0
- xspect/pipeline.py +201 -0
- xspect/run.py +38 -0
- xspect/train.py +304 -0
- xspect/train_filter/create_svm.py +6 -183
- xspect/train_filter/extract_and_concatenate.py +117 -121
- xspect/train_filter/html_scrap.py +16 -28
- xspect/train_filter/ncbi_api/download_assemblies.py +7 -8
- xspect/train_filter/ncbi_api/ncbi_assembly_metadata.py +9 -17
- xspect/train_filter/ncbi_api/ncbi_children_tree.py +3 -2
- xspect/train_filter/ncbi_api/ncbi_taxon_metadata.py +7 -5
- XspecT-0.1.3.dist-info/RECORD +0 -49
- xspect/BF_v2.py +0 -637
- xspect/Bootstrap.py +0 -29
- xspect/Classifier.py +0 -142
- xspect/OXA_Table.py +0 -53
- xspect/WebApp.py +0 -724
- xspect/XspecT_mini.py +0 -1363
- xspect/XspecT_trainer.py +0 -611
- xspect/map_kmers.py +0 -155
- xspect/search_filter.py +0 -504
- xspect/static/How-To.png +0 -0
- xspect/static/Logo.png +0 -0
- xspect/static/Logo2.png +0 -0
- xspect/static/Workflow_AspecT.png +0 -0
- xspect/static/Workflow_ClAssT.png +0 -0
- xspect/static/js.js +0 -615
- xspect/static/main.css +0 -280
- xspect/templates/400.html +0 -64
- xspect/templates/401.html +0 -62
- xspect/templates/404.html +0 -62
- xspect/templates/500.html +0 -62
- xspect/templates/about.html +0 -544
- xspect/templates/home.html +0 -51
- xspect/templates/layoutabout.html +0 -87
- xspect/templates/layouthome.html +0 -63
- xspect/templates/layoutspecies.html +0 -468
- xspect/templates/species.html +0 -33
- xspect/train_filter/README_XspecT_Erweiterung.md +0 -119
- xspect/train_filter/get_paths.py +0 -35
- xspect/train_filter/interface_XspecT.py +0 -204
- xspect/train_filter/k_mer_count.py +0 -162
- {XspecT-0.1.3.dist-info → XspecT-0.2.0.dist-info}/LICENSE +0 -0
- {XspecT-0.1.3.dist-info → XspecT-0.2.0.dist-info}/entry_points.txt +0 -0
- {XspecT-0.1.3.dist-info → XspecT-0.2.0.dist-info}/top_level.txt +0 -0
xspect/train.py
ADDED
@@ -0,0 +1,304 @@
+"""
+This module contains the main functions for training the models.
+"""
+
+import os
+import shutil
+from pathlib import Path
+import sys
+from time import localtime, perf_counter, asctime, sleep
+from loguru import logger
+from xspect.definitions import get_xspect_model_path, get_xspect_tmp_path
+from xspect.file_io import concatenate_meta
+from xspect.models.probabilistic_filter_svm_model import ProbabilisticFilterSVMModel
+from xspect.models.probabilistic_single_filter_model import (
+    ProbabilisticSingleFilterModel,
+)
+from xspect.train_filter.ncbi_api import (
+    ncbi_assembly_metadata,
+    ncbi_taxon_metadata,
+    ncbi_children_tree,
+    download_assemblies,
+)
+from xspect.train_filter import (
+    create_svm,
+    html_scrap,
+    extract_and_concatenate,
+)
+
+
+def check_user_input(user_input: str):
+    """Check the given user input. The input has to be a genus in NCBI.
+
+    :return: The genus name.
+    """
+    taxon_metadata = ncbi_taxon_metadata.NCBITaxonMetadata([user_input])
+    all_metadata = taxon_metadata.get_metadata()
+    for metadata in all_metadata.values():
+        sci_name = metadata["sci_name"]
+        tax_id = metadata["tax_id"]
+        rank = metadata["rank"]
+        lineage = metadata["lineage"]
+        bacteria_id = 2
+        if not sci_name == user_input and not tax_id == user_input:
+            print(
+                f"{get_current_time()}| The given genus: {user_input} was found as"
+                f" genus: {sci_name} ID: {tax_id}"
+            )
+            print(f"{get_current_time()}| Using {sci_name} as genus name.")
+        if rank == "GENUS":
+            if bacteria_id not in lineage:
+                print(f"{get_current_time()}| The given genus is not a bacteria.")
+                print(f"{get_current_time()}| Do you want to continue: [y/n]")
+                choice = input("-> ").lower()
+                if choice == "y":
+                    return str(sci_name)
+                print(f"{get_current_time()}| Exiting...")
+                sys.exit()
+            return str(sci_name)
+        print(f"{get_current_time()}| {user_input} is rank {rank} and not genus.")
+        sys.exit()
+
+
+def copy_custom_data(bf_path: str, svm_path: str, dir_name: str):
+    """Copy custom Bloom filter and SVM training data into the genus metadata directory.
+
+    :param bf_path: Path to the directory with the concatenated assembly files.
+    :param svm_path: Path to the directory with the SVM training files.
+    :param dir_name: Name of the target genus metadata directory.
+    """
+    path = Path(os.getcwd()) / "genus_metadata" / dir_name
+    new_bf_path = path / "concatenate"
+    new_svm_path = path / "training_data"
+
+    # Make the new directories.
+    path.mkdir(exist_ok=True)
+    new_bf_path.mkdir(exist_ok=True)
+    new_svm_path.mkdir(exist_ok=True)
+
+    # Move bloomfilter files.
+    bf_files = os.listdir(bf_path)
+    for file in bf_files:
+        file_path = Path(bf_path) / file
+        new_file_path = new_bf_path / file
+        shutil.copy2(file_path, new_file_path)
+
+    # Move svm files.
+    svm_files = os.listdir(svm_path)
+    for file in svm_files:
+        file_path = Path(svm_path) / file
+        new_file_path = new_svm_path / file
+        shutil.copy2(file_path, new_file_path)
+
+
+def set_logger(dir_name: str):
+    """Sets the logger parameters.
+
+    :param dir_name: Name of the folder where the log should be saved.
+    """
+    genus = dir_name.split("_")[0]
+
+    # Starting logger.
+    logger.remove()
+    logger.add(sys.stderr, format="{time:HH:mm:ss} | {level} | {message}", level="INFO")
+    log_path = get_xspect_tmp_path() / dir_name / (genus + ".log")
+    logger.add(log_path, format="{time:HH:mm:ss} | {level} | {message}", level="DEBUG")
+
+
+def create_translation_dict(dir_name: str) -> dict[str, str]:
+    """Create a translation dictionary to translate the taxon ID to its scientific name.
+
+    :param dir_name: Directory name for current genus.
+    :return: The created translation dictionary.
+    """
+    path = get_xspect_tmp_path() / dir_name / "concatenate"
+    files = os.listdir(path)
+    translation_dict = {}
+    for file in files:
+        file_split = file.split(".")[0].split("_")
+        tax_id = file_split[0]
+        final_file_name = tax_id + ".fasta"
+        name = file_split[1]
+        translation_dict[final_file_name] = name
+
+    return translation_dict
+
+
+def change_bf_assembly_file_names(dir_name: str):
+    """Change all concatenated assembly names to only the taxon ID.
+
+    :param dir_name: Directory name for current genus.
+    """
+    path = get_xspect_tmp_path() / dir_name / "concatenate"
+    files = os.listdir(path)
+    for file in files:
+        file_split = file.split(".")[0].split("_")
+        tax_id = file_split[0]
+        new_file_name = f"{tax_id}.fasta"
+        os.rename((path / file), (path / new_file_name))
+
+
+def get_current_time():
+    """Returns the current time in the form hh:mm:ss."""
+    return asctime(localtime()).split()[3]
+
+
+def train_ncbi(genus: str, svm_step: int = 1):
+    """Train genus and species models with NCBI assemblies from the given genus."""
+
+    if not isinstance(genus, str):
+        raise TypeError("genus must be a string")
+
+    # Check user input.
+    genus = check_user_input(user_input=genus)
+
+    # The directory name is defined in the following format: 'genus'_DD_MM_YYYY_hh-mm-ss
+    curr_time = localtime()
+    dir_name = f"{genus}_{curr_time[2]}_{curr_time[1]}_{curr_time[0]}_{curr_time[3]}-{curr_time[4]}-{curr_time[5]}"
+
+    # Set the logger.
+    set_logger(dir_name)
+
+    # Time for the whole program.
+    start = perf_counter()
+
+    # Search for every defined species of the genus.
+    logger.info("Getting all species of the genus")
+    children_ids = ncbi_children_tree.NCBIChildrenTree(genus).children_ids()
+    species_dict = ncbi_taxon_metadata.NCBITaxonMetadata(children_ids).get_metadata()
+
+    # Get all gcf accessions that have Taxonomy check result OK.
+    logger.info("Checking ANI data for updates")
+    ani_gcf = html_scrap.TaxonomyCheck().ani_gcf()
+
+    # Look for up to 8 assembly accessions per species.
+    logger.info("Getting assembly metadata")
+    all_metadata = ncbi_assembly_metadata.NCBIAssemblyMetadata(
+        all_metadata=species_dict, ani_gcf=ani_gcf, count=8, contig_n50=10000
+    )
+    all_metadata = all_metadata.get_all_metadata()
+
+    # Ensure that the genus has at least one species with accessions.
+    if not all_metadata:
+        raise ValueError("No species with accessions found")
+
+    # Download the chosen assemblies.
+    # One file for each species with its downloaded assemblies in zip format.
+
+    # Iterate through all species.
+    logger.info("Downloading assemblies for bloomfilter training")
+    for metadata in all_metadata.values():
+        # Only try to download when the species has accessions.
+        if len(metadata["accessions"]) >= 1:
+            sleep(5)
+            species_name = metadata["sci_name"]
+            tax_id = metadata["tax_id"]
+            logger.info("Downloading {id}_{name}", id=tax_id, name=species_name)
+            file_name = f"{tax_id}_{species_name}.zip"
+
+            # Selecting the first 4 assemblies for training the filters.
+            accessions = metadata["accessions"][:4]
+
+            download_assemblies.download_assemblies(
+                accessions=accessions,
+                dir_name=dir_name,
+                target_folder="zip_files",
+                zip_file_name=file_name,
+            )
+    logger.info("Concatenating and extracting")
+
+    # Concatenate all assemblies of each species.
+    extract_and_concatenate.bf(dir_name=dir_name, delete=True)
+    concatenate_meta(get_xspect_tmp_path() / dir_name, genus)
+
+    # Download assemblies for svm creation.
+    logger.info("Downloading assemblies for support-vector-machine training")
+    accessions = {}
+    for metadata in all_metadata.values():
+        # Only add taxa with accessions.
+        if len(metadata["accessions"]) >= 1:
+            accessions[metadata["tax_id"]] = metadata["accessions"]
+
+    # Downloading assemblies.
+    create_svm.get_svm_assemblies(all_accessions=accessions, dir_name=dir_name)
+
+    logger.info("Extracting SVM assemblies")
+
+    # Extracting assemblies.
+    extract_and_concatenate.svm(
+        species_accessions=accessions, dir_name=dir_name, delete=True
+    )
+
+    # Make dictionary for translating taxon ID to scientific name.
+    translation_dict = create_translation_dict(dir_name)
+    change_bf_assembly_file_names(dir_name)
+
+    species_files_path = get_xspect_tmp_path() / dir_name / "concatenate"
+    species_result_path = get_xspect_model_path() / genus
+
+    # Train Bloomfilter for complete genus.
+    logger.info("Training metagenome model")
+    mg_files_path = get_xspect_tmp_path() / dir_name
+
+    genus_model = ProbabilisticSingleFilterModel(
+        k=21,
+        model_display_name=genus,
+        author="Test",
+        author_email="test@example.com",
+        model_type="Genus",
+        base_path=Path(species_result_path).parent,
+    )
+    genus_model.fit(mg_files_path / f"{genus}.fasta", genus)
+    genus_model.save()
+
+    logger.info("Training species model")
+
+    species_model = ProbabilisticFilterSVMModel(
+        k=21,
+        model_display_name=genus,
+        author="Test",
+        author_email="test@example.com",
+        model_type="Species",
+        base_path=Path(species_result_path).parent,
+        kernel="rbf",
+        c=1.0,
+    )
+    svm_dir = get_xspect_tmp_path() / dir_name / "training_data"
+    species_model.fit(
+        Path(species_files_path),
+        svm_dir,
+        display_names=translation_dict,
+        svm_step=svm_step,
+    )
+    species_model.save()
+
+    # Cleanup files.
+    shutil.rmtree(get_xspect_tmp_path() / dir_name)
+
+    end = perf_counter()
+
+    logger.info("Program runtime: {time} m", time=(round((end - start) / 60, 2)))
+    logger.info("XspecT-trainer is finished.")
+
+
+def train_from_directory(display_name: str, dir_path: Path, meta: bool = False):
+    """Train the gene family and gene filter.
+
+    :param display_name: Name of the model.
+    :param dir_path: Input directory.
+    """
+
+    if not isinstance(display_name, str):
+        raise TypeError("display_name must be a string")
+
+    if not (isinstance(dir_path, Path) and dir_path.exists() and dir_path.is_dir()):
+        raise ValueError("dir_path must be a Path object pointing to a valid directory")
+
+    # check if the directory contains the necessary files
+    # copy to temp path
+    # check if svm training data exists
+    # train model, with svm data if it exists
+    # add display names
+    # train metagenome model
+    # clean up temp path
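Together with the deletions listed above, the new train.py appears to take over from the removed XspecT_trainer.py as the training entry point. A minimal usage sketch of the two functions it defines (the genus name "Acinetobacter" is an invented example, not part of the diff; note that train_from_directory is still a stub in 0.2.0, as its body consists only of TODO comments):

from pathlib import Path
from xspect.train import train_ncbi, train_from_directory

# Fetch NCBI assemblies for the genus, then train the genus (metagenome)
# model and the species SVM model.
train_ncbi("Acinetobacter", svm_step=1)

# In this release this only validates its arguments; the training body
# is not implemented yet.
train_from_directory("Acinetobacter", Path("assemblies"))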
xspect/train_filter/create_svm.py

@@ -1,19 +1,7 @@
-
-import os
-import pickle
-from pathlib import Path
-from time import sleep
+"""This module contains functions to select and download assemblies for SVM creation."""
 
-from
+from time import sleep
 from loguru import logger
-
-import xspect.BF_v2 as BF_v2
-from xspect.file_io import (
-    delete_non_fasta,
-    get_accessions,
-    get_file_paths,
-    get_species_names,
-)
 from xspect.train_filter.ncbi_api import download_assemblies
 
 

@@ -24,17 +12,11 @@ def select_assemblies(accessions):
     :type accessions: dict
     :return: Dict with species name as key and selected accessions as value.
     """
-    all_accessions = {}
 
-
-
-
-
-            selected_accessions.append(current_accessions[i])
-            if len(selected_accessions) == 4:
-                break
-
-        all_accessions[sci_name] = selected_accessions
+    all_accessions = {
+        sci_name: curr_accessions[-4:]
+        for sci_name, curr_accessions in accessions.items()
+    }
 
     return all_accessions
 

@@ -61,162 +43,3 @@ def get_svm_assemblies(all_accessions, dir_name):
         target_folder="training_data_zipped",
         zip_file_name=file_name,
     )
-    logger.info("Downloads finished")
-
-
-def init_bf(genus, array_size, hashes=7, k=21):
-    """Initializes bloomfilter.
-
-    :param genus: Name of the genus.
-    :type genus: str
-    :param array_size: Size of the bloomfilter.
-    :type array_size: int
-    :param hashes: The number of hash functions the bf uses.
-    :type hashes: int
-    :param k: Length of k-mers.
-    :type k: int
-    :return: The bloomfilter object.
-    """
-    path = Path(os.getcwd()) / "filter"
-
-    # Initialize bloomfilter for genus.
-    BF = BF_v2.AbaumanniiBloomfilter(array_size)
-    BF.set_arraysize(array_size)
-    BF.set_hashes(hashes)
-    BF.set_k(k)
-
-    # Get all species names.
-    names_path = path / "species_names" / ("Filter" + genus + ".txt")
-    with open(names_path, "rb") as f:
-        clonetypes = pickle.load(f)
-
-    # Get bloomfilter paths.
-    bf_path = path / genus
-    paths = sorted(os.listdir(bf_path))
-    for i in range(len(paths)):
-        paths[i] = str(bf_path / str(paths[i]))
-    # Setup bloomfilters.
-    BF.read_clonetypes(paths, clonetypes)
-
-    return BF
-
-
-def perform_lookup(bloomfilter, files, file_paths, accessions, names, spacing):
-    """Performs a lookup on a bloomfilter object and gives the scores as a list.
-
-    :param bloomfilter: The bloomfilter object on which the lookup is performed.
-    :param files: List of file names.
-    :type files: list[str]
-    :param file_paths: List with the file paths.
-    :type file_paths: list[str]
-    :param accessions: List of all accessions.
-    :type accessions: list[str]
-    :param names: List with all species names.
-    :type names: list[str]
-    :return: List with all scores of the lookup.
-    """
-    scores = list()
-    BF = bloomfilter
-
-    # Lookup.
-    for i in range(len(files)):
-        BF.number_of_kmeres = 0
-        BF.hits_per_filter = [0] * BF.clonetypes
-
-        for sequence in SeqIO.parse(file_paths[i], "fasta"):
-            # Dominik: changed sample size to var
-            for j in range(0, len(sequence.seq) - BF.k, spacing):
-                BF.number_of_kmeres += 1
-                BF.lookup_canonical(str(sequence.seq[j : j + BF.k]))
-
-        score = BF.get_score()
-        score = [str(x) for x in score]
-        score = ",".join(score)
-        scores.append(accessions[i] + "," + score + "," + names[i])
-
-    return scores
-
-
-# https://stackoverflow.com/questions/21431052/sort-list-of-strings-by-a-part-of-the-string
-def sort_list(scores, names):
-    """Sorts the scores list by species name.
-
-    :param scores: The scores gathered by a lookup of a bloomfilter.
-    :type scores: list
-    :param names: List with all species names.
-    :type names: list[str]
-    :return: The sorted scores list.
-    """
-    scores.sort(key=lambda x: x.split(",")[-1][:2])
-    names = [x for x in names if x != "none"]
-    names = list(dict.fromkeys(names))
-    scores.insert(0, sorted(names))
-    scores[0] = ["File"] + scores[0] + ["Label"]
-
-    for i in range(1, len(scores)):
-        line = scores[i].split(",")
-        scores[i] = line
-
-    return scores
-
-
-def save_csv(genus, scores):
-    """Saves the scores as csv file.
-
-    :param genus: Name of the genus.
-    :type genus: str
-    :param scores: The scores gathered by a lookup of a bloomfilter.
-    :type scores: list
-    """
-    training_data_path = Path(os.getcwd()) / "Training_data"
-    if not os.path.exists(training_data_path):
-        os.mkdir(training_data_path)
-
-    path = training_data_path / (genus + "_Training_data_spec.csv")
-    with open(path, "w", newline="") as file:
-        writer = csv.writer(file)
-        writer.writerows(scores)
-
-
-# Dominik: added spacing
-def new_helper(spacing, genus, dir_name, array_size, k=21):
-    """Create support vector machine for bloomfilters of a genus.
-
-    :param spacing:
-    :param genus: Name of the genus.
-    :type genus: str
-    :param dir_name: Name of the parent directory.
-    :type dir_name: str
-    :param array_size: Size for the byte array which is the bloomfilter.
-    :type array_size: int
-    :param k: Length of the k-mers.
-    :type k: int
-    """
-    # Get all files.
-    base_path = Path(os.getcwd()) / "genus_metadata" / dir_name / "training_data"
-    files = os.listdir(base_path)
-
-    # Delete all non fasta files.
-    files = delete_non_fasta(files)
-
-    # Get accessions from file names.
-    accessions = get_accessions(files)
-
-    # Get all complete file paths.
-    file_paths = get_file_paths(base_path, files)
-
-    # Get all species names from the header in the fasta files.
-    names = get_species_names(file_paths)
-
-    # Initialize bloomfilter.
-    bf = init_bf(genus, array_size)
-
-    # Perform lookup on bloomfilter.
-    # Dominik: added spacing
-    scores = perform_lookup(bf, files, file_paths, accessions, names, spacing)
-
-    # Sort score list by species names.
-    scores = sort_list(scores, names)
-
-    # Save results in csv file.
-    save_csv(genus, scores)