XspecT 0.1.3__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of XspecT might be problematic. Click here for more details.

Files changed (58):
  1. {XspecT-0.1.3.dist-info → XspecT-0.2.0.dist-info}/METADATA +23 -29
  2. XspecT-0.2.0.dist-info/RECORD +30 -0
  3. {XspecT-0.1.3.dist-info → XspecT-0.2.0.dist-info}/WHEEL +1 -1
  4. xspect/definitions.py +42 -0
  5. xspect/download_filters.py +11 -26
  6. xspect/fastapi.py +101 -0
  7. xspect/file_io.py +34 -103
  8. xspect/main.py +70 -66
  9. xspect/model_management.py +88 -0
  10. xspect/models/__init__.py +0 -0
  11. xspect/models/probabilistic_filter_model.py +277 -0
  12. xspect/models/probabilistic_filter_svm_model.py +169 -0
  13. xspect/models/probabilistic_single_filter_model.py +109 -0
  14. xspect/models/result.py +148 -0
  15. xspect/pipeline.py +201 -0
  16. xspect/run.py +38 -0
  17. xspect/train.py +304 -0
  18. xspect/train_filter/create_svm.py +6 -183
  19. xspect/train_filter/extract_and_concatenate.py +117 -121
  20. xspect/train_filter/html_scrap.py +16 -28
  21. xspect/train_filter/ncbi_api/download_assemblies.py +7 -8
  22. xspect/train_filter/ncbi_api/ncbi_assembly_metadata.py +9 -17
  23. xspect/train_filter/ncbi_api/ncbi_children_tree.py +3 -2
  24. xspect/train_filter/ncbi_api/ncbi_taxon_metadata.py +7 -5
  25. XspecT-0.1.3.dist-info/RECORD +0 -49
  26. xspect/BF_v2.py +0 -637
  27. xspect/Bootstrap.py +0 -29
  28. xspect/Classifier.py +0 -142
  29. xspect/OXA_Table.py +0 -53
  30. xspect/WebApp.py +0 -724
  31. xspect/XspecT_mini.py +0 -1363
  32. xspect/XspecT_trainer.py +0 -611
  33. xspect/map_kmers.py +0 -155
  34. xspect/search_filter.py +0 -504
  35. xspect/static/How-To.png +0 -0
  36. xspect/static/Logo.png +0 -0
  37. xspect/static/Logo2.png +0 -0
  38. xspect/static/Workflow_AspecT.png +0 -0
  39. xspect/static/Workflow_ClAssT.png +0 -0
  40. xspect/static/js.js +0 -615
  41. xspect/static/main.css +0 -280
  42. xspect/templates/400.html +0 -64
  43. xspect/templates/401.html +0 -62
  44. xspect/templates/404.html +0 -62
  45. xspect/templates/500.html +0 -62
  46. xspect/templates/about.html +0 -544
  47. xspect/templates/home.html +0 -51
  48. xspect/templates/layoutabout.html +0 -87
  49. xspect/templates/layouthome.html +0 -63
  50. xspect/templates/layoutspecies.html +0 -468
  51. xspect/templates/species.html +0 -33
  52. xspect/train_filter/README_XspecT_Erweiterung.md +0 -119
  53. xspect/train_filter/get_paths.py +0 -35
  54. xspect/train_filter/interface_XspecT.py +0 -204
  55. xspect/train_filter/k_mer_count.py +0 -162
  56. {XspecT-0.1.3.dist-info → XspecT-0.2.0.dist-info}/LICENSE +0 -0
  57. {XspecT-0.1.3.dist-info → XspecT-0.2.0.dist-info}/entry_points.txt +0 -0
  58. {XspecT-0.1.3.dist-info → XspecT-0.2.0.dist-info}/top_level.txt +0 -0
xspect/XspecT_trainer.py DELETED
@@ -1,611 +0,0 @@
1
- import argparse
2
- import os
3
- import shutil
4
- from linecache import getline
5
- from pathlib import Path
6
- import pickle
7
- from platform import python_version
8
- import sys
9
- from time import localtime, perf_counter, asctime, sleep
10
- from loguru import logger
11
- from numpy import mean
12
- from xspect import file_io
13
- from xspect.file_io import concatenate_meta
14
- from xspect.train_filter.ncbi_api import (
15
- ncbi_assembly_metadata,
16
- ncbi_taxon_metadata,
17
- ncbi_children_tree,
18
- download_assemblies,
19
- )
20
- from xspect.train_filter import (
21
- create_svm,
22
- html_scrap,
23
- extract_and_concatenate,
24
- get_paths,
25
- interface_XspecT,
26
- k_mer_count,
27
- )
28
-
29
-
30
def check_user_input(user_input: str):
    """Validate the user-supplied genus against NCBI taxonomy metadata.

    The input must resolve to a taxon of rank GENUS in NCBI; it may be given
    either as a scientific name or as a taxon ID. If the resolved taxon is not
    a bacterium, the user is asked interactively whether to continue.

    :param user_input: Genus name or NCBI taxon ID entered by the user.
    :return: The scientific name of the genus.
    """
    taxon_metadata = ncbi_taxon_metadata.NCBITaxonMetadata([user_input])
    all_metadata = taxon_metadata.get_metadata()
    for metadata in all_metadata.values():
        sci_name = metadata["sci_name"]
        tax_id = metadata["tax_id"]
        rank = metadata["rank"]
        lineage = metadata["lineage"]
        bacteria_id = 2  # NCBI taxon ID of the Bacteria superkingdom
        # Compare against both name and ID so either input form is accepted;
        # str() guards against the taxon ID being returned as an int.
        if sci_name != user_input and str(tax_id) != user_input:
            print(
                f"{get_current_time()}| The given genus: {user_input} was found as genus: {sci_name} "
                f"ID: {tax_id}"
            )
            print(f"{get_current_time()}| Using {sci_name} as genus name.")
        if rank == "GENUS":
            if bacteria_id not in lineage:
                print(f"{get_current_time()}| The given genus is not a bacteria.")
                print(f"{get_current_time()}| Do you want to continue: [y/n]")
                choice = input("-> ").lower()
                if choice == "y":
                    return str(sci_name)
                print(f"{get_current_time()}| Exiting...")
                # sys.exit instead of the site-provided exit() builtin.
                sys.exit()
            else:
                return str(sci_name)
        else:
            print(f"{get_current_time()}| {user_input} is rank {rank} and not genus.")
            sys.exit()
64
-
65
-
66
def copy_custom_data(bf_path: str, svm_path: str, dir_name: str):
    """Copy user-provided training data into the genus_metadata directory.

    Bloomfilter assemblies are copied to ``<dir_name>/concatenate`` and
    support-vector-machine assemblies to ``<dir_name>/training_data``.

    :param bf_path: Directory with the concatenated bloomfilter fasta files.
    :param svm_path: Directory with the svm training fasta files.
    :param dir_name: Directory name for the current genus.
    """
    path = Path(os.getcwd()) / "genus_metadata" / dir_name
    new_bf_path = path / "concatenate"
    new_svm_path = path / "training_data"

    # Make the new directories; makedirs also creates genus_metadata/<dir_name>
    # if missing (os.mkdir would fail on a missing parent).
    os.makedirs(new_bf_path)
    os.makedirs(new_svm_path)

    # Copy bloomfilter files (copy2 preserves metadata).
    for file in os.listdir(bf_path):
        shutil.copy2(Path(bf_path) / file, new_bf_path / file)

    # Copy svm files.
    for file in os.listdir(svm_path):
        shutil.copy2(Path(svm_path) / file, new_svm_path / file)
96
-
97
-
98
def count_avg_seq_len(dir_name):
    """Compute the average sequence length over the concatenated species files.

    Line 2 of each concatenated fasta file (the first sequence line) is read
    and its length contributes to the average.

    :param dir_name: Directory name for current genus.
    :type dir_name: str
    :return: The average sequence length.
    """
    path = get_paths.get_concatenate_file_path(dir_name)

    # One length per species file.
    lengths = [
        len(getline(str(path / str(file)), 2)) for file in os.listdir(path)
    ]

    # Mean, rounded to the nearest whole number.
    return int(round(float(mean(lengths)), 0))
117
-
118
-
119
def check_meta_file_size(dir_name) -> bool:
    """Verify the metagenome fasta file against its source files by size.

    Sums the sizes of all concatenated species files, rounds both that sum and
    the metagenome file size to whole MiB, and compares them.

    :param dir_name: Directory name for current genus.
    :type dir_name: str
    :return: True or False depending on the answer.
    """
    base = Path(os.getcwd()) / "genus_metadata" / dir_name
    species_dir = base / "concatenate"
    genus = dir_name.split("_")[0]
    meta_file = base / (str(genus) + ".fasta")

    # Total size of every species file that went into the metagenome file.
    total_species = sum(
        os.path.getsize(species_dir / str(name)) for name in os.listdir(species_dir)
    )
    meta_size = os.path.getsize(meta_file)

    # Compare at MiB granularity to tolerate small byte-level differences.
    return round(total_species / (1024**2)) == round(meta_size / (1024**2))
148
-
149
-
150
def check_meta_file_content(dir_name: str):
    """Checks if every sequence used to concatenate the meta file is fully inside the meta file.

    Logs one line per species file stating whether its sequence lines were
    found in the metagenome file.

    :param dir_name: Directory name for current genus.
    """
    path = Path(os.getcwd()) / "genus_metadata" / dir_name
    concatenate_path = path / "concatenate"
    genus = dir_name.split("_")[0]
    mg_file_name = f"{genus}.fasta"
    mg_file_path = path / mg_file_name
    mg_str = ""
    with open(mg_file_path, "r") as mg_file:
        for line in mg_file:
            if line[0] != ">":
                # BUG FIX: accumulate all sequence lines. The original did
                # `mg_str = line`, keeping only the LAST line, so every earlier
                # sequence was reported as missing.
                mg_str += line
    files = os.listdir(concatenate_path)

    for file in files:
        file_path = concatenate_path / file
        with open(file_path, "r") as con_file:
            for line in con_file:
                if line[0] == ">":
                    continue

                if line not in mg_str:
                    logger.error(f"{file} not in metagenome")
                else:
                    logger.info(f"{file} in metagenome")
179
-
180
-
181
def init_argparse() -> argparse.ArgumentParser:
    """Initiate the command line parser for XspecT-trainer.py.

    :return: The configured argument parser.
    """
    parser = argparse.ArgumentParser(
        prog="XspecT-trainer",
        description="Automatically trains bloomfilter, of a given genus, so they can later be used by XspecT to "
        "assign species to assemblies.",
    )
    parser.add_argument(
        "genus",
        type=str,
        help="The name of the genus for which the filters will be trained. Can also be a NCBI Taxon ID",
    )
    parser.add_argument(
        "mode",
        metavar="mode",
        choices=["1", "2", "3"],
        type=str,
        # Fixed: the implicit string concatenations were missing separating
        # spaces ("ncbiRefSeq", "assembliesneed") and "assembles" was a typo.
        help="Declares which mode should be used. 1: Train bloomfilters with assemblies from the ncbi "
        "RefSeq database. 2: Train bloomfilters with custom assemblies. The paths to the assemblies "
        "need to be given. 3: Check if metagenome file was correctly created.",
    )
    parser.add_argument(
        "-bf",
        "--bf_path",
        type=str,
        help="The path to the assemblies that will be used to train the bloomfilters if mode 2 is used.",
    )
    parser.add_argument(
        "-svm",
        "--svm_path",
        type=str,
        help="The path to the assemblies that will be used to train the support-vector-machine if "
        "mode 2 is used.",
    )
    parser.add_argument(
        "-c",
        "--complete",
        action="store_true",
        help="Declares if all of every 500th k-mer should be used to train the bloomfilters.",
    )
    parser.add_argument(
        "-d",
        "--dir_name",
        type=str,
        help="Write the directory name from genus_metadata to check metagenome file.",
    )

    return parser
229
-
230
-
231
def set_logger(dir_name: str):
    """Configure loguru sinks: console at INFO level, log file at DEBUG level.

    :param dir_name: Name of the folder where the log should be saved.
    """
    genus_name = dir_name.split("_")[0]
    log_format = "{time:HH:mm:ss} | {level} | {message}"
    log_file = Path(os.getcwd()) / "genus_metadata" / dir_name / (genus_name + ".log")

    # Drop the default sink, then install console and per-genus file sinks.
    logger.remove()
    logger.add(sys.stderr, format=log_format, level="INFO")
    logger.add(log_file, format=log_format, level="DEBUG")
243
-
244
-
245
def create_translation_dict(dir_name: str) -> dict[str, str]:
    """Build a taxon-ID -> scientific-name mapping from the file names.

    File names in the concatenate folder look like ``<tax_id>_<name>.fasta``.

    :param dir_name: Directory name for current genus.
    :return: The created translation dictionary.
    """
    concatenate_dir = Path(os.getcwd()) / "genus_metadata" / dir_name / "concatenate"
    translation = {}
    for file_name in os.listdir(concatenate_dir):
        # Strip the extension, then split the stem into ID and name parts.
        parts = file_name.split(".")[0].split("_")
        translation[parts[0]] = parts[1]
    return translation
261
-
262
-
263
def save_translation_dict(dir_name: str, translation_dict: dict[str, str]):
    """Saves the translation dict in filter/translation_dicts as pickle file.

    :param dir_name: Directory name for current genus.
    :param translation_dict: A dictionary with taxon ID as key and its corresponding scientific name as value.
    """
    genus = dir_name.split("_")[0]
    folder_path = Path(os.getcwd()) / "filter" / "translation_dicts"
    # Check if folder exists
    if os.path.exists(folder_path):
        # An existing non-directory entry would shadow the target folder.
        if not os.path.isdir(folder_path):
            logger.error("Path: {path} is not a folder", path=folder_path)
            logger.error("Aborting")
            # sys.exit instead of the site-provided exit() builtin.
            sys.exit()
    else:
        # Create folder
        os.mkdir(folder_path)

    file_name = f"{genus}.pickle"
    file_path = folder_path / file_name
    with open(file_path, "wb") as f:
        pickle.dump(translation_dict, f)
286
-
287
-
288
def change_bf_assembly_file_names(dir_name: str):
    """Rename every concatenated assembly file to ``<tax_id>.fasta``.

    :param dir_name: Directory name for current genus.
    """
    concatenate_dir = Path(os.getcwd()) / "genus_metadata" / dir_name / "concatenate"
    for file_name in os.listdir(concatenate_dir):
        # The taxon ID is the part of the stem before the first underscore.
        tax_id = file_name.split(".")[0].split("_")[0]
        os.rename(concatenate_dir / file_name, concatenate_dir / f"{tax_id}.fasta")
300
-
301
-
302
def get_current_time():
    """Returns the current time in the form hh:mm:ss."""
    # asctime() looks like 'Mon Jan  1 12:00:00 2024'; field 3 is the clock.
    fields = asctime(localtime()).split()
    return fields[3]
305
-
306
-
307
def delete_dir(dir_path: Path):
    """Recursively delete the directory tree at *dir_path*.

    Errors are not ignored, so a failure (e.g. a missing path) raises.

    :param dir_path: Path of the directory to remove.
    """
    shutil.rmtree(dir_path, ignore_errors=False, onerror=None)
313
-
314
-
315
def main():
    """Parse the command line and delegate to ``train``.

    Command line shape: python XspecT_trainer.py genus mode path_to_bf_files path_to_svm_files
    """
    args = init_argparse().parse_args()
    train(
        args.genus,
        args.mode,
        args.complete,
        args.bf_path,
        args.svm_path,
        args.dir_name,
    )
326
-
327
-
328
def train(genus, mode, complete, bf_path, svm_path, dir_name):
    """Run the full XspecT training pipeline for one genus.

    :param genus: Genus name or NCBI taxon ID to train filters for.
    :param mode: "1" = download assemblies from NCBI RefSeq, "2" = use custom
        assemblies from ``bf_path``/``svm_path``, "3" = only check an existing
        metagenome file.
    :param complete: If True use every k-mer (spacing 1), else every 500th.
    :param bf_path: Path to bloomfilter assemblies (mode 2 only).
    :param svm_path: Path to support-vector-machine assemblies (mode 2 only).
    :param dir_name: Existing genus_metadata directory name (mode 3 only).
    """
    # Sampling distance between k-mers used for training.
    if complete:
        spacing = 1
    else:
        spacing = 500

    # Check folder structure
    file_io.check_folder_structure()

    # Check user input.
    genus = check_user_input(user_input=genus)

    # The directory name is defined in the following format: 'genus'_DD_MM_YYYY_hh-mm-ss
    # NOTE(review): this unconditionally overwrites the dir_name parameter, so
    # the -d value that mode 3 relies on never reaches the check below — confirm.
    curr_time = localtime()
    dir_name = f"{genus}_{curr_time[2]}_{curr_time[1]}_{curr_time[0]}_{curr_time[3]}-{curr_time[4]}-{curr_time[5]}"

    # Set the logger.
    set_logger(dir_name)

    # Time for the whole program.
    start_all = perf_counter()
    if mode == "1":
        # Search for every defined species of the genus.
        start_tax = perf_counter()
        logger.info("Getting all species of the genus")
        children_ids = ncbi_children_tree.NCBIChildrenTree(genus).children_ids()
        species_dict = ncbi_taxon_metadata.NCBITaxonMetadata(
            children_ids
        ).get_metadata()
        end_tax = perf_counter()

        # Get all gcf accessions that have Taxonomy check result OK.
        logger.info("Checking ANI data for updates")
        ani = html_scrap.TaxonomyCheck()
        ani_gcf = ani.ani_gcf()

        # Look for up to 8 assembly accessions per species.
        start_meta = perf_counter()
        logger.info("Getting assembly metadata")
        all_metadata = ncbi_assembly_metadata.NCBIAssemblyMetadata(
            all_metadata=species_dict, ani_gcf=ani_gcf, count=8, contig_n50=10000
        )
        all_metadata = all_metadata.get_all_metadata()
        logger.info("Finished metadata collecting\n")
        end_meta = perf_counter()

        # Download the chosen assemblies.
        # One file for each species with it's downloaded assemblies in zip format.
        start_download = perf_counter()

        # Iterate through all species.
        logger.info("Downloading assemblies for bloomfilter training")
        for metadata in all_metadata.values():
            # Only try to download when the species has accessions.
            if len(metadata["accessions"]) >= 1:
                # Throttle consecutive requests to the NCBI download service.
                sleep(5)
                species_name = metadata["sci_name"]
                tax_id = metadata["tax_id"]
                logger.info("Downloading {id}_{name}", id=tax_id, name=species_name)
                file_name = f"{tax_id}_{species_name}.zip"

                # Selecting the first 4 assemblies for training the filters.
                accessions = list()
                for accession in metadata["accessions"]:
                    accessions.append(accession)
                    if len(accessions) == 4:
                        break

                download_assemblies.download_assemblies(
                    accessions=accessions,
                    dir_name=dir_name,
                    target_folder="zip_files",
                    zip_file_name=file_name,
                )
        logger.info("Downloads finished\n")
        end_download = perf_counter()

        # Concatenate all assemblies of each species.
        start_concatenate = perf_counter()
        extract_bf = extract_and_concatenate.ExtractConcatenate(
            dir_name=dir_name, delete=True
        )
        extract_bf.bf()
        concatenate_meta(Path(os.getcwd()) / "genus_metadata" / dir_name, genus)
        logger.info("Finished extracting and concatenating\n")
        end_concatenate = perf_counter()

        # Compute average sequence length.
        avg_len = count_avg_seq_len(dir_name)

        # Download assemblies for svm creation.
        start_svm_dl = perf_counter()
        logger.info("Downloading assemblies for support-vector-machine training")
        accessions = dict()
        for metadata in all_metadata.values():
            # Only add taxon with accessions.
            if len(metadata["accessions"]) >= 1:
                accessions[metadata["tax_id"]] = metadata["accessions"]

        # Downloading assemblies.
        create_svm.get_svm_assemblies(all_accessions=accessions, dir_name=dir_name)
        logger.info("Finished downloading\n")

        # Extracting assemblies.
        extract_svm = extract_and_concatenate.ExtractConcatenate(
            dir_name=dir_name, delete=True
        )
        extract_svm.svm(species_accessions=accessions)

        end_svm_dl = perf_counter()

    elif mode == "2":
        # Mode 2 needs to folders one with concatenated fasta files.
        # The files should have .fasta as a file ending and its name should be the species ID and its name without
        # the genus name. e.g. 28901_enterica.fasta for Salmonella enterica. The ID can be any ID. The standard is ncbi
        # taxon IDs. The ID should only contain numbers from 0-9.
        # The second folder should have assembly fasta files for every species. These should have a code in the file
        # name to understand where the data came from. Its header should have > at the start and after the species
        # ID. E.g. >28901\n

        # Check if paths were given.
        if bf_path:
            if not os.path.exists(bf_path):
                logger.error(
                    "The given path to the bloomfilter assemblies doesn't exist"
                )
                logger.error("Aborting")
                exit()
        else:
            logger.error("There was no path to the bloomfilter assemblies given")
            logger.error("Aborting")
            exit()
        if svm_path:
            if not os.path.exists(svm_path):
                logger.error(
                    "The given path to the support-vector-machine assemblies doesn't exist"
                )
                logger.error("Aborting")
                exit()
        else:
            logger.error(
                "There was no path to the support-vector-machine assemblies given"
            )
            logger.error("Aborting")
            exit()

        # Move the given files to genus_metadata.
        logger.info("Copying data given into genus_metadata")
        copy_custom_data(bf_path=bf_path, svm_path=svm_path, dir_name=dir_name)

        # Create Metagenome fasta file of all concatenated fasta files.
        logger.info("Creating meta fasta file")
        concatenate_meta(Path(os.getcwd()) / "genus_metadata" / dir_name, genus)

    elif mode == "3":
        logger.info("Checking metagenome file")
        mg_check_dir_name = dir_name
        if not mg_check_dir_name:
            logger.error("There was no directory name given")
            logger.error("Aborting")
            exit()
        check_meta_file_content(mg_check_dir_name)
        logger.info("Finished")
        logger.opt(record=True).info("Elapsed time: {record[elapsed]}")
        # Mode 3 is check-only: stop before any training happens.
        exit()

    # Check file sizes.
    result = False
    logger.info("Checking if metagenome file was correctly created")
    result = check_meta_file_size(dir_name)
    count = 0
    # Retry building the metagenome file up to 3 times before giving up.
    while not result:
        logger.error("Metagenome file was not correctly created")
        logger.info("Trying to remake metagenome fasta file")
        concatenate_meta(Path(os.getcwd()) / "genus_metadata" / dir_name, genus)
        logger.info("Rechecking metagenome file")
        result = check_meta_file_size(dir_name)
        count += 1
        if count == 3:
            logger.error("Can't create metagenome file")
            logger.error("Aborting")
            exit()

    # Make dictionary for translating taxon ID to scientific name.
    translation_dict = create_translation_dict(dir_name)
    change_bf_assembly_file_names(dir_name)

    # Count all distinct k-mers and return the highest count.
    start_count = perf_counter()
    logger.info("Counting all distinct k-meres")
    highest_counts = k_mer_count.get_highest_k_mer_count(dir_name)
    # Remove the temporary 'output' file — presumably left behind by the
    # k-mer counting step; confirm against k_mer_count.
    output_file_path = Path(os.getcwd()) / "output"
    os.remove(output_file_path)
    end_count = perf_counter()

    # Train new Bloomfilters with concatenated files of each species.
    start_bf = perf_counter()
    # Compute the array size with the highest count of distinct k-mers.
    array_size_species = int(
        round(interface_XspecT.compute_array_size(highest_counts[0]) + 1000000, -6)
    )
    array_size_complete = int(
        round(interface_XspecT.compute_array_size(highest_counts[1]) + 1000000, -6)
    )

    # Save array sizes for XspecT.
    logger.info("Saving bloomfilter sizes\n")
    interface_XspecT.save_array_sizes(
        genus, [str(array_size_species), str(array_size_complete)]
    )

    # Train Bloomfilters of species.
    logger.info("Training bloomfilters")
    species_files_path, species_result_path = interface_XspecT.make_paths(
        dir_name, genus
    )
    interface_XspecT.new_train_core(
        species_files_path, species_result_path, array_size_species
    )
    interface_XspecT.new_write_file_dyn(species_result_path, genus, meta_mode=False)

    # Train Bloomfilter for complete genus.
    logger.info("Training metagenome bloomfilter")
    mg_files_path = get_paths.get_current_dir_file_path(dir_name)
    mg_result_path = get_paths.get_metagenome_filter_path()
    interface_XspecT.new_train_core(
        str(mg_files_path), str(mg_result_path), array_size_complete
    )
    interface_XspecT.new_write_file_dyn(str(mg_result_path), genus, meta_mode=True)

    # Delete concatenated assemblies.
    # Delete species files.
    delete_dir(species_files_path)
    # Delete metagenome file.
    os.remove(mg_files_path / f"{genus}.fasta")

    end_bf = perf_counter()

    # Create support vector machine.
    start_svm = perf_counter()
    logger.info("Training support-vector-machine")
    # Create svm.
    create_svm.new_helper(
        spacing, genus=genus, dir_name=dir_name, array_size=array_size_species, k=21
    )

    # Delete used assemblies.
    assemblies_path = get_paths.get_current_dir_file_path(dir_name) / "training_data"
    delete_dir(assemblies_path)

    end_svm = perf_counter()
    end_all = perf_counter()

    logger.info(
        "Program runtime: {time} m", time=(round((end_all - start_all) / 60, 2))
    )

    if mode == "1":
        # Print and save collected statistics.
        # The per-phase timers below exist only in mode 1, hence the guard.
        logger.info("Saving collected runtime statistics")
        time_print = (
            f"Python version: {python_version()} \n"
            f"Average sequence length: {avg_len:,} \n"
            f"All time: {(end_all-start_all)/60:.2f} m\n"
            f"Tax time: {(end_tax-start_tax):.2f} s\n"
            f"Meta time: {(end_meta-start_meta)/60:.2f} m\n"
            f"Download time: {(end_download-start_download)/60:.2f} m\n"
            f"Concatenate time: {(end_concatenate-start_concatenate):.2f} s\n"
            f"Count time: {(end_count-start_count)/60:.2f} m\n"
            f"Training time: {(end_bf-start_bf)/60:.2f} m\n"
            f"Support vector machine time: {((end_svm+end_svm_dl)-(start_svm+start_svm_dl))/60:.2f} m\n"
        )

        # Save time measurements.
        interface_XspecT.save_time_stats(time_print, dir_name)

    # Save translation dict
    save_translation_dict(dir_name, translation_dict)

    logger.info("XspecT-trainer is finished.")
608
-
609
-
610
if __name__ == "__main__":
    # Allow running the trainer directly as a script.
    main()