XspecT-0.1.2-py3-none-any.whl → XspecT-0.2.0-py3-none-any.whl

This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release.

Files changed (57)
  1. {XspecT-0.1.2.dist-info → XspecT-0.2.0.dist-info}/METADATA +23 -29
  2. XspecT-0.2.0.dist-info/RECORD +30 -0
  3. {XspecT-0.1.2.dist-info → XspecT-0.2.0.dist-info}/WHEEL +1 -1
  4. xspect/definitions.py +42 -0
  5. xspect/download_filters.py +11 -26
  6. xspect/fastapi.py +101 -0
  7. xspect/file_io.py +34 -103
  8. xspect/main.py +70 -66
  9. xspect/model_management.py +88 -0
  10. xspect/models/__init__.py +0 -0
  11. xspect/models/probabilistic_filter_model.py +277 -0
  12. xspect/models/probabilistic_filter_svm_model.py +169 -0
  13. xspect/models/probabilistic_single_filter_model.py +109 -0
  14. xspect/models/result.py +148 -0
  15. xspect/pipeline.py +201 -0
  16. xspect/run.py +38 -0
  17. xspect/train.py +304 -0
  18. xspect/train_filter/create_svm.py +6 -183
  19. xspect/train_filter/extract_and_concatenate.py +117 -121
  20. xspect/train_filter/html_scrap.py +16 -28
  21. xspect/train_filter/ncbi_api/download_assemblies.py +7 -8
  22. xspect/train_filter/ncbi_api/ncbi_assembly_metadata.py +9 -17
  23. xspect/train_filter/ncbi_api/ncbi_children_tree.py +3 -2
  24. xspect/train_filter/ncbi_api/ncbi_taxon_metadata.py +7 -5
  25. XspecT-0.1.2.dist-info/RECORD +0 -48
  26. xspect/BF_v2.py +0 -648
  27. xspect/Bootstrap.py +0 -29
  28. xspect/Classifier.py +0 -142
  29. xspect/OXA_Table.py +0 -53
  30. xspect/WebApp.py +0 -737
  31. xspect/XspecT_mini.py +0 -1377
  32. xspect/XspecT_trainer.py +0 -611
  33. xspect/map_kmers.py +0 -155
  34. xspect/search_filter.py +0 -504
  35. xspect/static/How-To.png +0 -0
  36. xspect/static/Logo.png +0 -0
  37. xspect/static/Logo2.png +0 -0
  38. xspect/static/Workflow_AspecT.png +0 -0
  39. xspect/static/Workflow_ClAssT.png +0 -0
  40. xspect/static/js.js +0 -615
  41. xspect/static/main.css +0 -280
  42. xspect/templates/400.html +0 -64
  43. xspect/templates/401.html +0 -62
  44. xspect/templates/404.html +0 -62
  45. xspect/templates/500.html +0 -62
  46. xspect/templates/about.html +0 -544
  47. xspect/templates/home.html +0 -51
  48. xspect/templates/layoutabout.html +0 -87
  49. xspect/templates/layouthome.html +0 -63
  50. xspect/templates/layoutspecies.html +0 -468
  51. xspect/templates/species.html +0 -33
  52. xspect/train_filter/get_paths.py +0 -35
  53. xspect/train_filter/interface_XspecT.py +0 -204
  54. xspect/train_filter/k_mer_count.py +0 -162
  55. {XspecT-0.1.2.dist-info → XspecT-0.2.0.dist-info}/LICENSE +0 -0
  56. {XspecT-0.1.2.dist-info → XspecT-0.2.0.dist-info}/entry_points.txt +0 -0
  57. {XspecT-0.1.2.dist-info → XspecT-0.2.0.dist-info}/top_level.txt +0 -0
xspect/XspecT_mini.py DELETED
@@ -1,1377 +0,0 @@
- import os
- import warnings
- import time
- import csv
- import pickle
- import statistics
- import sys
- from pathlib import Path
- from Bio import SeqIO, Seq
- from numpy import sum
- import psutil
- import xspect.Classifier as Classifier
- import xspect.search_filter as search_filter
- from xspect.OXA_Table import OXATable
- import xspect.Bootstrap as bs
- from xspect.train_filter.interface_XspecT import load_translation_dict
-
-
- warnings.filterwarnings("ignore")
-
-
- def xspecT_mini(
-     file_path,
-     XspecT,
-     ClAssT,
-     oxa,
-     file_format,
-     read_amount,
-     csv_table,
-     metagenome,
-     genus,
-     mode,
- ):
-     """performs a BF-lookup for a set of genomes for testing purposes"""
-     itemlist = [
-         "albensis",
-         "apis",
-         "baretiae",
-         "baumannii",
-         "baylyi",
-         "beijerinckii",
-         "bereziniae",
-         "bohemicus",
-         "boissieri",
-         "bouvetii",
-         "brisouii",
-         "calcoaceticus",
-         "celticus",
-         "chengduensis",
-         "chinensis",
-         "colistiniresistens",
-         "courvalinii",
-         "cumulans",
-         "defluvii",
-         "dispersus",
-         "equi",
-         "gandensis",
-         "gerneri",
-         "gs06",
-         "gs16",
-         "guerrae",
-         "guillouiae",
-         "gyllenbergii",
-         "haemolyticus",
-         "halotolerans",
-         "harbinensis",
-         "idrijaensis",
-         "indicus",
-         "johnsonii",
-         "junii",
-         "kanungonis",
-         "kookii",
-         "kyonggiensis",
-         "lactucae",
-         "lanii",
-         "larvae",
-         "lwoffii",
-         "marinus",
-         "modestus",
-         "nectaris",
-         "nosocomialis",
-         "oleivorans",
-         "parvus",
-         "piscicola",
-         "pittii",
-         "pollinis",
-         "populi",
-         "portensis",
-         "pseudolwoffii",
-         "pullicarnis",
-         "pragensis",
-         "proteolyticus",
-         "puyangensis",
-         "qingfengensis",
-         "radioresistens",
-         "rathckeae",
-         "rongchengensis",
-         "rudis",
-         "schindleri",
-         "seifertii",
-         "seohaensis",
-         "shaoyimingii",
-         "sichuanensis",
-         "soli",
-         "stercoris",
-         "tandoii",
-         "terrae",
-         "terrestris",
-         "tianfuensis",
-         "tjernbergiae",
-         "towneri",
-         "ursingii",
-         "variabilis",
-         "venetianus",
-         "vivanii",
-         "wanghuae",
-         "wuhouensis",
-         "sp.",
-     ]
-     print("Preparing Bloomfilter...")
-     start = time.time()
-     if XspecT:
-         # BF = search_filter.pre_processing()
-         # Phillip
-         # Getting the array sizes for pre processing of all bloomfilters.
-         genera = search_filter.get_genera_array_sizes()
-
-         # Pre processing of the bloomfilters for the species.
-         BF = search_filter.pre_process_all(genera, k=21, meta_mode=False, genus=[genus])
-
-         # read the current memory usage
-         process = psutil.Process()
-         memory_info = process.memory_info()
-         # print the memory usage
-         print(
-             f"Current memory usage with the species BF: {memory_info.rss / 1024 / 1024:.2f} MB"
-         )
-
-         # BF_1 = search_filter.pre_processing_prefilter()
-         # BF_1_1 = search_filter.pre_processing_prefilter2()
-         # Phillip
-         # Pre processing of the bloomfilters for the metagenome mode.
-         BF_1_1 = search_filter.pre_process_all(
-             genera, k=21, meta_mode=True, genus=[genus]
-         )
-
-         # read the current memory usage
-         process = psutil.Process()
-         memory_info = process.memory_info()
-         # print the memory usage
-         print(
-             f"Current memory usage with the master BF: {memory_info.rss / 1024 / 1024:.2f} MB"
-         )
-
-     if ClAssT:
-         BF_2 = search_filter.pre_processing_ClAssT()
-     if oxa:
-         BF_3 = search_filter.pre_processing_oxa()
-     end = time.time()
-     needed = round(end - start, 2)
-     print("Time needed for preprocessing: ", needed)
-     try:
-         files = sorted(os.listdir(file_path))
-     except FileNotFoundError:
-         print("Error: Invalid filepath!")
-         quit()
-     if file_format == "fna" or file_format == "fasta" or file_format == "fa":
-         for i in range(len(files) - 1, -1, -1):
-             if "fna" in files[i] or "fasta" in files[i]:
-                 continue
-             else:
-                 del files[i]
-     elif file_format == "fastq" or file_format == "fq":
-         for i in range(len(files) - 1, -1, -1):
-             if "fastq" in files[i] or "fq" in files[i]:
-                 continue
-             else:
-                 del files[i]
-     if len(files) == 0:
-         print("Error: No " + str(file_format) + " files in directory!")
-         quit()
-     paths = files[:]
-     file_path2 = file_path[:]
-     for i in range(len(file_path2)):
-         if file_path2[i] == "\\":
-             list_temp = list(file_path2)
-             list_temp[i] = "/"
-             file_path2 = "".join(list_temp)
-     start = time.time()
-     for i in range(len(files)):
-         paths[i] = file_path2 + "/" + paths[i]
-     if XspecT:
-         predictions, scores = xspecT(
-             BF[genus],
-             BF_1_1[genus],
-             files,
-             paths,
-             file_format,
-             read_amount,
-             metagenome,
-             genus,
-             mode,
-         )
-     if ClAssT:
-         predictions_ClAssT, scores_ClAssT = clAssT(
-             BF_2, files, paths, file_format, read_amount
-         )
-     if oxa:
-         scores_oxa, scores_oxa_ind = blaOXA(
-             BF_3, files, paths, file_format, read_amount
-         )
-     print("Preparing results...")
-     print("")
-     end = time.time()
-     needed = round(end - start, 2)
-     print("Time needed: ", needed)
-     print("")
-     header_filename = "Filename"
-     spaces = []
-     space = " "
-     underscore = "________"
-     name_max = len(max(itemlist, key=len))
-     if XspecT:
-         for i in range(len(predictions)):
-             while len(predictions[i]) < name_max:
-                 predictions[i] += " "
-     file_max = len(max(files, key=len))
-     while len(header_filename) < file_max:
-         header_filename += " "
-         underscore += "_"
-     for j in range(len(files)):
-         for i in range(len(header_filename) - len(files[j])):
-             space += " "
-         spaces.append(space)
-         space = " "
-     excel = []
-     # formatting
-     if ClAssT:
-         for i in range(len(predictions_ClAssT)):
-             if predictions_ClAssT[i] != "none" and predictions_ClAssT[i] != "None":
-                 predictions_ClAssT[i] += " "
-     if XspecT and ClAssT:
-         for i in range(len(scores_ClAssT)):
-             if scores[i] == "1.0":
-                 scores[i] += " "
-
-     if XspecT and ClAssT and oxa:
-         excelv2 = []
-         print(scores_oxa)
-         print(scores_oxa_ind)
-         for i in range(len(files)):
-             if scores_oxa == ["None"]:
-                 excel.append(
-                     files[i]
-                     + spaces[i]
-                     + predictions[i]
-                     + " "
-                     + scores[i]
-                     + " "
-                     + predictions_ClAssT[i]
-                     + " "
-                     + scores_ClAssT[i]
-                     + " "
-                     + str(scores_oxa[i])
-                     + " "
-                     + str(scores_oxa_ind[i][0])
-                     + " "
-                     + str(scores_oxa_ind[i][1])
-                 )
-             else:
-                 excel.append(
-                     files[i]
-                     + spaces[i]
-                     + predictions[i]
-                     + " "
-                     + scores[i]
-                     + " "
-                     + predictions_ClAssT[i]
-                     + " "
-                     + scores_ClAssT[i]
-                     + " "
-                     + str(scores_oxa[i])
-                     + " "
-                     + str(scores_oxa_ind[i][0])
-                     + " "
-                     + str(scores_oxa_ind[i][1])
-                 )
-                 excelv2.append(
-                     files[i]
-                     + ","
-                     + predictions[i]
-                     + ","
-                     + scores[i]
-                     + predictions_ClAssT[i]
-                     + ","
-                     + scores_ClAssT[i]
-                     + ","
-                     + str(scores_oxa[i])
-                 )
-         print(
-             header_filename
-             + " Species Score Sub-Type Score blaOXA-Family blaOXA-Gene Score"
-         )
-         print(
-             underscore
-             + "___________________________________________________________________________________________________________________________________________"
-         )
-         for i in excel:
-             print(i)
-         for i in range(0, len(excelv2)):
-             excelv2[i] = [excelv2[i]]
-         if csv_table:
-             with open(r"Results/XspecT_mini_csv/Results.csv", "w", newline="") as file:
-                 writer = csv.writer(file)
-                 writer.writerows(excelv2)
-         print("")
-         print("")
-     elif XspecT and not ClAssT and not oxa:
-         excelv2 = []
-         for i in range(len(files)):
-             excel.append(files[i] + spaces[i] + predictions[i] + " " + scores[i])
-             excelv2.append(files[i] + "," + predictions[i] + "," + scores[i])
-         print(header_filename + " Species Score")
-         print(underscore + "_________________________________________")
-         for i in excel:
-             print(i)
-         for i in range(0, len(excelv2)):
-             excelv2[i] = [excelv2[i]]
-         if csv_table:
-             with open(
-                 r"Results/XspecT_mini_csv/Results_XspecT.csv", "w", newline=""
-             ) as file:
-                 writer = csv.writer(file)
-                 writer.writerows(excelv2)
-         print("")
-         print("")
-     elif ClAssT and not XspecT and not oxa:
-         excelv2 = []
-         for i in range(len(files)):
-             excel.append(
-                 files[i]
-                 + spaces[i]
-                 + predictions_ClAssT[i]
-                 + " "
-                 + scores_ClAssT[i]
-             )
-             excelv2.append(
-                 files[i] + "," + predictions_ClAssT[i] + "," + scores_ClAssT[i]
-             )
-         print(header_filename + " Sub-Type Score")
-         print(underscore + "________________________________")
-         for i in excel:
-             print(i)
-         print("")
-         print("")
-         for i in range(0, len(excelv2)):
-             excelv2[i] = [excelv2[i]]
-         if csv_table:
-             with open(
-                 r"Results/XspecT_mini_csv/Results_ClAssT.csv", "w", newline=""
-             ) as file:
-                 writer = csv.writer(file)
-                 writer.writerows(excelv2)
-     elif oxa and not ClAssT and not XspecT:
-         excelv2 = []
-         for i in range(len(files)):
-             if scores_oxa == ["None"]:
-                 excel.append(
-                     files[i]
-                     + spaces[i]
-                     + str(scores_oxa[i])
-                     + " "
-                     + str(scores_oxa_ind[i][0])
-                     + " "
-                     + str(scores_oxa_ind[i][1])
-                 )
-             else:
-                 excel.append(
-                     files[i]
-                     + spaces[i]
-                     + str(scores_oxa[i])
-                     + " "
-                     + str(scores_oxa_ind[i][0])
-                     + " "
-                     + str(scores_oxa_ind[i][1])
-                 )
-
-             excelv2.append(files[i] + "," + str(scores_oxa[i]))
-         print(
-             header_filename
-             + " blaOXA-Family blaOXA-Gene Score"
-         )
-         print(
-             underscore
-             + "_______________________________________________________________________"
-         )
-         for i in excel:
-             print(i)
-         print("")
-         print("")
-         for i in range(0, len(excelv2)):
-             excelv2[i] = [excelv2[i]]
-         if csv_table:
-             with open(
-                 r"Results/XspecT_mini_csv/Results_Oxa.csv", "w", newline=""
-             ) as file:
-                 writer = csv.writer(file)
-                 writer.writerows(excelv2)
-     elif XspecT and ClAssT and not oxa:
-         excelv2 = []
-         for i in range(len(files)):
-             excel.append(
-                 files[i]
-                 + spaces[i]
-                 + predictions[i]
-                 + " "
-                 + scores[i]
-                 + " "
-                 + predictions_ClAssT[i]
-                 + " "
-                 + scores_ClAssT[i]
-             )
-             excelv2.append(
-                 files[i]
-                 + ","
-                 + predictions[i]
-                 + ","
-                 + scores[i]
-                 + ","
-                 + predictions_ClAssT[i]
-                 + ","
-                 + scores_ClAssT[i]
-             )
-         print(
-             header_filename
-             + " Species Score Sub-Type Score"
-         )
-         print(
-             underscore
-             + "________________________________________________________________________"
-         )
-         for i in excel:
-             print(i)
-         print("")
-         print("")
-         for i in range(0, len(excelv2)):
-             excelv2[i] = [excelv2[i]]
-         if csv_table:
-             with open(r"Results/XspecT_mini_csv/Results.csv", "w", newline="") as file:
-                 writer = csv.writer(file)
-                 writer.writerows(excelv2)
-     elif XspecT and oxa and not ClAssT:
-         excelv2 = []
-         for i in range(len(files)):
-             if scores_oxa == ["None"]:
-                 excel.append(
-                     files[i]
-                     + spaces[i]
-                     + predictions[i]
-                     + " "
-                     + scores[i]
-                     + " "
-                     + str(scores_oxa[i])
-                     + " "
-                     + str(scores_oxa_ind[i][0])
-                     + " "
-                     + str(scores_oxa_ind[i][1])
-                 )
-             else:
-                 excel.append(
-                     files[i]
-                     + spaces[i]
-                     + predictions[i]
-                     + " "
-                     + scores[i]
-                     + " "
-                     + str(scores_oxa[i])
-                     + " "
-                     + str(scores_oxa_ind[i][0])
-                     + " "
-                     + str(scores_oxa_ind[i][1])
-                 )
-                 excelv2.append(
-                     files[i] + "," + predictions[i] + "," + scores[i] + str(scores_oxa[i])
-                 )
-         print(
-             header_filename
-             + " Species Score blaOXA-Family blaOXA-Gene Score"
-         )
-         print(
-             underscore
-             + "_______________________________________________________________________________________________________________"
-         )
-         for i in excel:
-             print(i)
-         for i in range(0, len(excelv2)):
-             excelv2[i] = [excelv2[i]]
-         if csv_table:
-             with open(r"Results/XspecT_mini_csv/Results.csv", "w", newline="") as file:
-                 writer = csv.writer(file)
-                 writer.writerows(excelv2)
-         print("")
-         print("")
-     elif ClAssT and oxa and not XspecT:
-         excelv2 = []
-         for i in range(len(files)):
-             if scores_oxa == ["None"]:
-                 excel.append(
-                     files[i]
-                     + spaces[i]
-                     + predictions_ClAssT[i]
-                     + " "
-                     + scores_ClAssT[i]
-                     + " "
-                     + str(scores_oxa[i])
-                     + " "
-                     + str(scores_oxa_ind[i][0])
-                     + " "
-                     + str(scores_oxa_ind[i][1])
-                 )
-             else:
-                 excel.append(
-                     files[i]
-                     + spaces[i]
-                     + predictions_ClAssT[i]
-                     + " "
-                     + scores_ClAssT[i]
-                     + " "
-                     + str(scores_oxa[i])
-                     + " "
-                     + str(scores_oxa_ind[i][0])
-                     + " "
-                     + str(scores_oxa_ind[i][1])
-                 )
-                 excelv2.append(
-                     files[i]
-                     + ","
-                     + predictions_ClAssT[i]
-                     + ","
-                     + scores_ClAssT[i]
-                     + ","
-                     + str(scores_oxa[i])
-                 )
-         print(
-             header_filename
-             + " Sub-Type Score blaOXA-Family blaOXA-Gene Score"
-         )
-         print(
-             underscore
-             + "______________________________________________________________________________________________________"
-         )
-         for i in excel:
-             print(i)
-         for i in range(0, len(excelv2)):
-             excelv2[i] = [excelv2[i]]
-         if csv_table:
-             with open(r"Results/XspecT_mini_csv/Results.csv", "w", newline="") as file:
-                 writer = csv.writer(file)
-                 writer.writerows(excelv2)
-         print("")
-         print("")
-
-
- def xspecT(BF, BF_1_1, files, paths, file_format, read_amount, metagenome, genus, mode):
-     """performs a BF-lookup for a set of genomes for testing purposes"""
-     print("Starting taxonomic assignment on species-level...")
-     predictions = []
-     scores = []
-     counterx = 0
-     contig_header = []
-     contig_seq = []
-     # Phillip
-     names_path = (
-         Path(os.getcwd()) / "filter" / "species_names" / ("Filter" + genus + ".txt")
-     )
-     with open(names_path, "rb") as fp:
-         names = pickle.load(fp)
-     names = sorted(names)
-     # translation_dict = load_translation_dict(genus)
-     for i in range(len(files)):
-         if (
-             i == int(len(files) / 6)
-             or i == int(len(files) / 3)
-             or i == int(len(files) / 2)
-             or i == int(len(files) / 1.5)
-             or i == int(len(files) / 1.2)
-         ):
-             print("...")
-         BF.number_of_kmeres = 0
-         BF.hits_per_filter = [0] * BF.clonetypes
-         BF_1_1.number_of_kmeres = 0
-         BF_1_1.hits_per_filter = [0]
-         if file_format == "fasta" or file_format == "fna" or file_format == "fa":
-             if metagenome:
-                 contigs = []
-                 contigs_classified = {}
-                 for sequence in SeqIO.parse(paths[i], "fasta"):
-                     contigs = []
-                     contigs_kmers = []
-                     BF_1_1.kmer_hits_single = []
-                     BF_1_1.number_of_kmeres = 0
-                     BF_1_1.hits_per_filter = [0] * BF.clonetypes
-                     # Taking the sum of the list as reference; if the sum has not increased after testing the sampled k-mers,
-                     # then the contig won't be tested further
-                     hit_sum = sum(BF_1_1.hits_per_filter)
-                     hits_per_filter_copy = BF_1_1.hits_per_filter[:]
-                     sample_size = int(len(str(sequence.seq)) ** 0.5)
-                     threshold_contig = sample_size * 0.7
-                     for i in range(0, len(str(sequence.seq)) - BF_1_1.k, sample_size):
-                         if "N" not in str(sequence.seq[i : i + BF_1_1.k]):
-                             BF_1_1.lookup(str(sequence.seq[i : i + BF_1_1.k]).upper())
-
-                     # needs at least 70% hits to continue with the contig
-                     counter = 0
-                     if (sum(BF_1_1.hits_per_filter) - hit_sum) > threshold_contig:
-                         for j in range(len(str(sequence.seq)) - BF_1_1.k):
-                             if "N" not in str(sequence.seq[j : j + BF_1_1.k]):
-                                 contigs_kmers.append(
-                                     str(sequence.seq[j : j + BF_1_1.k]).upper()
-                                 )
-                                 counter += 1
-                             # cap on how many k-mers to use
-                             if counter >= 5000000:
-                                 break
-                             # contigs_kmers.append(str(reverse_sequence[j: j + BF_1_1.k]))
-                         contigs.append(contigs_kmers)
-                         BF_1_1.hits_per_filter = hits_per_filter_copy
-                     else:
-                         # resetting hit counter
-                         BF_1_1.hits_per_filter = hits_per_filter_copy
-                         continue
-
-                     contigs_filtered = []
-                     counter = 0
-                     # Since we classify individual contigs now, the var contigs only contains one item, which makes those loops unnecessary
-                     for i in range(len(contigs)):
-                         threshold = 0
-                         for j in range(len(contigs[i])):
-                             BF_1_1.number_of_kmeres += 1
-                             hits_per_filter_copy = BF_1_1.hits_per_filter[:]
-                             BF_1_1.lookup(contigs[i][j])
-                             if hits_per_filter_copy != BF_1_1.hits_per_filter:
-                                 threshold += 1
-                         # parameter value needs to be determined
-                         if threshold >= (0.7 * len(contigs[i])):
-                             contigs_filtered += contigs[i]
-                             counter += len(contigs[i])
-                             if counter >= 5000:
-                                 break
-
-                     # since we do individual contig classifications we need to reset the BF vars
-                     BF.kmer_hits_single = []
-                     BF.number_of_kmeres = 0
-                     BF.hits_per_filter = [0] * BF.clonetypes
-                     for kmer in contigs_filtered:
-                         BF.number_of_kmeres += 1
-                         kmer_reversed = str(Seq.Seq(kmer).reverse_complement())
-                         if kmer > kmer_reversed:
-                             BF.lookup(kmer)
-                         else:
-                             BF.lookup(kmer_reversed)
-                     score = BF.get_score()
-                     score_edit = [str(x) for x in score]
-                     score_edit = ",".join(score_edit)
-
-                     # making prediction
-                     index_result = max(range(len(score)), key=score.__getitem__)
-                     prediction = names[index_result]
-
-                     # skip ambiguous contigs
-                     if max(score) == sorted(score)[-2]:
-                         continue
-
-                     # bootstrapping
-                     bootstrap_n = 100
-                     samples = bs.bootstrap(
-                         BF.kmer_hits_single, BF.number_of_kmeres, bootstrap_n
-                     )
-                     sample_scores = bs.bootstrap_scores(
-                         samples, BF.number_of_kmeres, BF.clonetypes
-                     )
-                     bootstrap_score = 0
-                     bootstrap_predictions = []
-                     for i in range(len(sample_scores)):
-                         # skip ambiguous contigs (species with same score)
-                         if max(sample_scores[i]) != sorted(sample_scores[i])[-2]:
-                             bootstrap_predictions.append(
-                                 names[
-                                     max(
-                                         range(len(sample_scores[i])),
-                                         key=sample_scores[i].__getitem__,
-                                     )
-                                 ]
-                             )
-                             if (
-                                 max(
-                                     range(len(sample_scores[i])),
-                                     key=sample_scores[i].__getitem__,
-                                 )
-                                 == index_result
-                             ):
-                                 bootstrap_score += 1
-                         else:
-                             continue
-                     bootstrap_score = bootstrap_score / bootstrap_n
-
-                     # ---------------------------------------------------------------------------------------------
-                     # Collect results
-                     # change this var to the species you want your contigs saved from
-                     save_contigs = "none"
-
-                     if (genus[0] + ". " + prediction) not in contigs_classified:
-                         contigs_classified[genus[0] + ". " + prediction] = [
-                             [max(score)],
-                             1,
-                             [len(str(sequence.seq))],
-                             sorted(score)[-2] / max(score),
-                             [bootstrap_score],
-                             contigs_filtered,
-                             None,
-                         ]
-                         if prediction == save_contigs:
-                             contig_header += [sequence.description]
-                             contig_seq += [str(sequence.seq)]
-                     else:
-                         contigs_classified[genus[0] + ". " + prediction][0] += [
-                             max(score)
-                         ]
-                         contigs_classified[genus[0] + ". " + prediction][1] += 1
-                         contigs_classified[genus[0] + ". " + prediction][2] += [
-                             len(str(sequence.seq))
-                         ]
-                         contigs_classified[genus[0] + ". " + prediction][3] += sorted(
-                             score
-                         )[-2] / max(score)
-                         contigs_classified[genus[0] + ". " + prediction][4] += [
-                             bootstrap_score
-                         ]
-                         contigs_classified[genus[0] + ". " + prediction][
-                             5
-                         ] += contigs_filtered
-                         if prediction == save_contigs:
-                             contig_header += [sequence.description]
-                             contig_seq += [str(sequence.seq)]
-                     # scores.append(str(max(score)))
-             else:
-                 # Important! Resetting the kmer_hits_single, otherwise MEMORY LEAK
-                 BF.kmer_hits_single = []
-                 for sequence in SeqIO.parse(paths[i], "fasta"):
-                     for j in range(0, len(sequence.seq) - BF.k, mode):
-                         BF.number_of_kmeres += 1
-                         kmer = str(sequence.seq[j : j + BF.k])
-                         kmer_reversed = str(Seq.Seq(kmer).reverse_complement())
-                         if kmer > kmer_reversed:
-                             BF.lookup(kmer)
-                         else:
-                             BF.lookup(kmer_reversed)
-
-                 score = BF.get_score()
-                 # print("Scores: ", score)
-             if metagenome:
-                 # map kmers to genome for HGT detection
-                 # change later to new functions, this is OLD
-                 if False:
-                     for prediction in contigs_classified:
-                         kmers = contigs_classified[prediction][5]
-                         # Strip "A."
-                         prediction = prediction[2:]
-                         # kmer mapping to genome, start by loading the kmer_dict in
-                         path_pos = (
-                             "filter\kmer_positions\Acinetobacter\\"
-                             + prediction
-                             + "_positions.txt"
-                         )
-                         # delete later
-                         path_posv2 = (
-                             "filter\kmer_positions\Acinetobacter\\"
-                             + prediction
-                             + "_complete_positions.txt"
-                         )
-                         # cluster kmers to contigs
-                         # delete try later
-                         try:
-                             with open(path_pos, "rb") as fp:
-                                 kmer_dict = pickle.load(fp)
-                         except:
-                             with open(path_posv2, "rb") as fp:
-                                 kmer_dict = pickle.load(fp)
-                         contig_amounts_distances = bs.cluster_kmers(kmers, kmer_dict)
-                         contigs_classified[genus[0] + ". " + prediction][
-                             6
-                         ] = contig_amounts_distances
-                         # del kmer_dict
-                 for key, value in contigs_classified.items():
-                     number_of_contigs = value[1]
-                     # save results
-                     results_clustering = [
-                         [
-                             key
-                             + ","
-                             + str(statistics.median(value[0]))
-                             + ","
-                             + str(number_of_contigs),
-                             str(statistics.median(value[2]))
-                             + ","
-                             + str(round(value[3] / number_of_contigs, 2))
-                             + ","
-                             + str(statistics.median(value[4])),
-                         ]
-                     ]
-                     # with open(r'Results/XspecT_mini_csv/Results_Clustering.csv', 'a', newline='') as file:
-                     # writer = csv.writer(file)
-                     # writer.writerows(results_clustering)
-                     value[0] = "Score Median: " + str(statistics.median(value[0]))
-                     value[1] = "Number of Contigs: " + str(number_of_contigs)
-                     value[2] = "Contig-Length Median: " + str(
-                         statistics.median(value[2])
-                     )
-                     value[3] = "Repetitiveness: " + str(
-                         round(value[3] / number_of_contigs, 2)
-                     )
-                     value[4] = "Bootstrap Median: " + str(statistics.median(value[4]))
-                     # value[6] = "Clusters: " + str(value[6])
-                     contigs_classified[key] = value
-                     print("Species: ", key)
-                     print(value[0])
-                     print(value[1])
-                     print(value[2])
-                     print(value[3])
-                     print(value[4])
-                     print(value[6])
-                     print()
-
-                 save_contigs = "none"
-                 if save_contigs != "none":
-                     with open(r"Results/Contigs_saved.fasta", "w") as file:
-                         for j in range(len(contig_header)):
-                             file.write(contig_header[j] + "\n")
-                             file.write(contig_seq[j] + "\n")
-                             file.write("\n")
-         elif file_format == "fastq" or file_format == "fq":
-             if metagenome:
-                 # ---------------------------------------------------------------------------------------------
-                 # initialize variables
-                 BF_1_1.kmer_hits_single = []
-                 BF_1_1.number_of_kmeres = 0
-                 BF_1_1.hits_per_filter = [0] * BF.clonetypes
-                 counter = 0
-                 reads = []
-                 reads_classified = {}
-                 reads_passed = 0
-                 ambiguous_reads = 0
-
-                 # ---------------------------------------------------------------------------------------------
-                 # First prefiltering step: check whether the read hits enough of five probe k-mers
-                 for sequence in SeqIO.parse(paths[i], "fastq"):
-                     dna_composition = {}
-                     dna_composition = calculate_dna_composition(sequence.seq)
-                     BF_1_1.kmer_hits_single = []
-                     BF_1_1.number_of_kmeres = 0
-                     BF_1_1.hits_per_filter = [0] * BF.clonetypes
-                     # reverse_sequence = sequence.seq.reverse_complement()
-                     read_kmers = []
-                     reads = []
-                     if counter < read_amount:
-                         counter += 1
-                     else:
-                         break
-                     k1 = str(sequence.seq[0 : BF_1_1.k])  # first k-mer
-                     k2 = str(
-                         sequence.seq[len(str(sequence.seq)) - BF_1_1.k :]
-                     )  # last k-mer
-                     mid = len(str(sequence.seq)) // 2
-                     k3 = str(sequence.seq[mid : mid + BF_1_1.k])  # k-mer in middle
-                     k4 = str(sequence.seq[BF_1_1.k : BF_1_1.k * 2])
-                     k5 = str(sequence.seq[mid + BF_1_1.k : mid + BF_1_1.k * 2])
-                     # Taking the sum of the list as reference; if the sum has not increased after testing these five probe k-mers,
-                     # then the read won't be tested further
-                     hit_sum = sum(BF_1_1.hits_per_filter)
-                     hits_per_filter_copy = BF_1_1.hits_per_filter[:]
-                     # sample_size = int(len(str(sequence.seq)) ** 0.5)
-                     # threshold_read = sample_size * 0.7
-                     # for i in range(0, len(str(sequence.seq)) - BF_1_1.k, sample_size):
-                     # if "N" not in str(sequence.seq[i: i + BF_1_1.k]):
-                     # BF_1_1.lookup(str(sequence.seq[i: i + BF_1_1.k]))
-                     if "N" not in str(sequence.seq):
-                         BF_1_1.lookup(k1)
-                         BF_1_1.lookup(k2)
-                         BF_1_1.lookup(k3)
-                         BF_1_1.lookup(k4)
-                         BF_1_1.lookup(k5)
-                     else:
-                         continue
-                     # needs more than 3 of the 5 probe hits to continue with the read
-                     if (sum(BF_1_1.hits_per_filter) - hit_sum) > 3:
-                         for j in range(len(str(sequence.seq)) - BF_1_1.k):
-                             read_kmers.append(str(sequence.seq[j : j + BF_1_1.k]))
-                             # read_kmers.append(str(reverse_sequence[j: j + BF_1_1.k]))
-                         reads.append(read_kmers)
-                         BF_1_1.hits_per_filter = hits_per_filter_copy
-                     else:
-                         # resetting hit counter
-                         BF_1_1.hits_per_filter = hits_per_filter_copy
-                         continue
-
-                     # ---------------------------------------------------------------------------------------------
-                     # Second prefiltering step: check if the read contains at least 70% of k-mers from one species
-                     # reads_filtered = set()
-                     reads_filtered = []
-                     for i in range(len(reads)):
-                         threshold = 0
-                         for j in range(len(reads[i])):
-                             BF_1_1.number_of_kmeres += 1
-                             hits_per_filter_copy = BF_1_1.hits_per_filter[:]
-                             if "N" not in reads[i][j]:
-                                 BF_1_1.lookup(reads[i][j])
-                             if hits_per_filter_copy != BF_1_1.hits_per_filter:
-                                 threshold += 1
-                         if threshold >= 0.7 * len(reads[i]):
-                             reads_filtered += reads[i]
-                     if len(reads_filtered) == 0:
-                         continue
-
-                     # ---------------------------------------------------------------------------------------------
-                     # Start of the actual classification
-                     BF.number_of_kmeres = 0
-                     BF.hits_per_filter = [0] * BF.clonetypes
-                     BF.kmer_hits_single = []
-                     for kmer in reads_filtered:
-                         if "N" not in kmer:
-                             BF.number_of_kmeres += 1
-                             kmer_reversed = str(Seq.Seq(kmer).reverse_complement())
-                             if kmer > kmer_reversed:
-                                 BF.lookup(kmer)
-                             else:
-                                 BF.lookup(kmer_reversed)
-                         else:
-                             continue
-                     score = BF.get_score()
-                     score_edit = [str(x) for x in score]
-                     score_edit = ",".join(score_edit)
-
-                     # making prediction
-                     index_result = max(range(len(score)), key=score.__getitem__)
-                     prediction = names[index_result]
-                     if max(score) == sorted(score)[-2]:
-                         ambiguous_reads += 1
-                         # print("Ambiguous read")
-                         # continue
-
-                     # ---------------------------------------------------------------------------------------------
-                     # bootstrapping
-                     bootstrap_n = 100
-                     samples = bs.bootstrap(
-                         BF.kmer_hits_single, BF.number_of_kmeres, bootstrap_n
-                     )
-                     sample_scores = bs.bootstrap_scores(
-                         samples, BF.number_of_kmeres, BF.clonetypes
-                     )
-                     bootstrap_score = 0
-                     bootstrap_predictions = []
-                     for i in range(len(sample_scores)):
-                         if max(sample_scores[i]) != sorted(sample_scores[i])[-2]:
-                             bootstrap_predictions.append(
-                                 names[
-                                     max(
-                                         range(len(sample_scores[i])),
-                                         key=sample_scores[i].__getitem__,
-                                     )
-                                 ]
-                             )
-                             if (
-                                 max(
-                                     range(len(sample_scores[i])),
-                                     key=sample_scores[i].__getitem__,
-                                 )
-                                 == index_result
-                             ):
-                                 bootstrap_score += 1
-                         else:
-                             continue
-                     bootstrap_score = bootstrap_score / bootstrap_n
-
-                     # ---------------------------------------------------------------------------------------------
-                     # HGT identification pipeline start
-
-                     # skip species clear reads
-                     # if max(score) <= 0.9:
-                     # identify split reads from HGT
-                     # split_regions = map.identify_split_reads(score, BF.kmer_hits_single)
-
-                     # split_read contains tuples --> ([first part of list, second part of list], index of species)
-
-                     # check if it is in fact a split read; 0.6 is an arbitrary value, the threshold for the difference between the two regions
-                     # if abs(sum(split_regions[0][0]) - sum(split_regions[0][1])) > 0.6:
-                     # get the species names
-                     # acceptor_species = names[split_regions[0][1]]
-                     # donor_species = names[split_regions[1][1]]
-                     # donor_acceptor = [donor_species, acceptor_species]
-                     # else:
-                     # donor_acceptor = [None]
-
-                     # ---------------------------------------------------------------------------------------------
-                     # Collect results from classification
-                     if (genus[0] + ". " + prediction) not in reads_classified:
-                         reads_classified[genus[0] + ". " + prediction] = [
-                             max(score),
-                             1,
-                             sorted(score)[-2] / max(score),
-                             BF.number_of_kmeres,
-                             [bootstrap_score],
-                             reads_filtered,
-                             None,
-                         ]
-                     else:
-                         reads_classified[genus[0] + ". " + prediction][1] += 1
-                         reads_classified[genus[0] + ". " + prediction][0] += max(score)
-                         reads_classified[genus[0] + ". " + prediction][2] += sorted(
-                             score
-                         )[-2] / max(score)
-                         reads_classified[genus[0] + ". " + prediction][
-                             3
-                         ] += BF.number_of_kmeres
-                         reads_classified[genus[0] + ". " + prediction][4] += [
-                             bootstrap_score
-                         ]
-                         reads_classified[genus[0] + ". " + prediction][
-                             5
-                         ] += reads_filtered
-                     # reads_classified[genus[0] + ". " + prediction][7] += [dna_composition]
-                     # reads_classified[genus[0] + ". " + prediction][8] += [donor_acceptor]
-
-             else:
-                 # classification for sequence-pure reads, check every 10th k-mer (or every one in "complete" mode)
-                 counter = 0
-                 # Important! Resetting the kmer_hits_single, otherwise MEMORY LEAK
-                 BF.kmer_hits_single = []
-                 for sequence in SeqIO.parse(paths[i], "fastq"):
-                     if counter < read_amount:
-                         counter += 1
-                         for j in range(0, len(sequence.seq) - BF.k + 1, mode):
-                             BF.number_of_kmeres += 1
-                             kmer = str(sequence.seq[j : j + BF.k])
-                             kmer_reversed = str(Seq.Seq(kmer).reverse_complement())
-                             if kmer > kmer_reversed:
-                                 BF.lookup(kmer)
-                             else:
-                                 BF.lookup(kmer_reversed)
-                     else:
-                         break
-                 score = BF.get_score()
-
-             if metagenome:
-                 # ---------------------------------------------------------------------------------------------
-                 # map kmers to genome for HGT detection
-                 if False:
-                     # map and cluster single reads to genome
-                     read_clusters = []
-                     for prediction in reads_classified:
-                         # load list of kmers from read
-                         kmers = reads_classified[prediction][5]
-                         # Strip genus name
-                         prediction = prediction[2:]
-
-                         # kmer mapping to genome, start by loading the kmer_dict in
-                         path_pos = (
-                             "filter\kmer_positions\Acinetobacter\\"
-                             + prediction
-                             + "_positions.txt"
-                         )
-                         # delete later
-                         path_posv2 = (
-                             "filter\kmer_positions\Acinetobacter\\"
-                             + prediction
-                             + "_complete_positions.txt"
-                         )
-                         # cluster kmers to reads
-                         # delete try later
-                         try:
-                             with open(path_pos, "rb") as fp:
-                                 kmer_dict = pickle.load(fp)
-                         except:
-                             with open(path_posv2, "rb") as fp:
-                                 kmer_dict = pickle.load(fp)
-                         test = map.map_kmers(kmers, kmer_dict, genus)
-                         clusters = map.cluster_kmers(kmers, kmer_dict)
-                         read_clusters.append(clusters)
-                         reads_classified[genus[0] + ". " + prediction][
-                             6
-                         ] = reads_amounts_distances
-                         # del kmer_dict
-
-                     # now cluster mappings of multiple reads to genome
-                     for cluster in read_clusters:
-                         # TODO
-                         continue
-
-                 # ---------------------------------------------------------------------------------------------
-                 # Collect results from classification
-                 for key, value in reads_classified.items():
-                     if key == "unknown":
-                         continue
-                     value.insert(2, value[0] / value[1])
-                     value.pop(0)
-                     reads_classified[key] = value
-                     print(
-                         key,
-                         value[0],
-                         round(value[1], 2),
-                         round(value[2] / value[0], 2),
-                         round(value[3] / value[0], 2),
-                         statistics.median(value[4]),
-                     )
-         score_edit = [str(x) for x in score]
-         score_edit = ",".join(score_edit)
-         # making prediction
-         if not metagenome:
-             # prediction = Classifier.classify(r'Training_data/Training_data_spec.csv', score, True)
-             # Phillip
-             # file_name = genus + "_Training_data_spec.csv"
-             # path = Path(__file__).parent.absolute() / "Training_data" / file_name
-             # prediction = Classifier.classify(path, score, True)
-             # SVM TURNED OFF for testing!!
-             index_result = max(range(len(score)), key=score.__getitem__)
-             prediction = names[index_result]
-             names_copy = names[:]
-             # sort score in descending order and names_copy accordingly
-             score, names_copy = zip(*sorted(zip(score, names_copy), reverse=True))
-             # print(score[0:3])
-             # print(names_copy[0:3])
-         else:
-             # Phillip
-             # prediction_name = translation_dict[prediction]
-             # predictions.append(prediction_name)
-             index_result = max(range(len(score)), key=score.__getitem__)
-             prediction = names[index_result]
-             translation_dict = load_translation_dict(genus)
-             predictions.append(translation_dict[prediction])
-         scores.append(str(max(score)))
-     print("Taxonomic assignment done...")
-     return predictions, scores
-
-
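A note on the lookup loops in xspecT above: each k-mer and its reverse complement are collapsed to one canonical form before the Bloom filter lookup, by keeping the lexicographically larger of the two strings (many tools keep the smaller; either convention works as long as it matches how the filters were built). A minimal stand-alone sketch of that convention using Biopython; canonical_kmer is an illustrative helper, not part of the package:

from Bio.Seq import Seq

def canonical_kmer(kmer: str) -> str:
    # Collapse a k-mer and its reverse complement to a single lookup key,
    # keeping the lexicographically larger string, as the loops above do.
    rc = str(Seq(kmer).reverse_complement())
    return kmer if kmer > rc else rc

print(canonical_kmer("AAAAA"))  # "TTTTT" -- the reverse complement wins
print(canonical_kmer("TTTTT"))  # "TTTTT" -- both strands map to the same key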
- def clAssT(BF_2, files, paths, file_format, read_amount):
-     print("Starting strain-typing on sub-type-level...")
-     predictions_ClAssT = []
-     scores_ClAssT = []
-     for i in range(len(files)):
-         if (
-             i == int(len(files) / 6)
-             or i == int(len(files) / 3)
-             or i == int(len(files) / 2)
-             or i == int(len(files) / 1.5)
-             or i == int(len(files) / 1.2)
-         ):
-             print("...")
-         BF_2.number_of_kmeres = 0
-         BF_2.hits_per_filter = [0] * BF_2.clonetypes
-         if file_format == "fasta" or file_format == "fna":
-             for sequence in SeqIO.parse(paths[i], "fasta"):
-                 # Originally 10
-                 for j in range(0, len(sequence.seq) - BF_2.k, 500):
-                     BF_2.number_of_kmeres += 1
-                     BF_2.lookup(str(sequence.seq[j : j + BF_2.k]))
-         elif file_format == "fastq" or file_format == "fq":
-             counter = 0
-             for sequence in SeqIO.parse(paths[i], "fastq"):
-                 if counter < read_amount:
-                     counter += 1
-                     for j in range(0, len(sequence.seq) - BF_2.k + 1, 10):
-                         BF_2.number_of_kmeres += 1
-                         BF_2.lookup(str(sequence.seq[j : j + BF_2.k]))
-                 else:
-                     break
-         score_ClAssT = BF_2.get_score()
-         score_edit_ClAssT = [str(x) for x in score_ClAssT]
-         score_edit_ClAssT = ",".join(score_edit_ClAssT)
-         prediction_ClAssT = Classifier.classify(
-             r"Training_data/Training_data_IC.csv",
-             score_ClAssT,
-             [True, True, True, True, True, True, True, True, False],
-         )
-         predictions_ClAssT.append(prediction_ClAssT)
-         scores_ClAssT.append(str(max(score_ClAssT)))
-
-     print("Strain-typing on sub-type-level done...")
-     return predictions_ClAssT, scores_ClAssT
-
-
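clAssT above scores only a strided sample of k-mers (every 500th position for assemblies, every 10th for reads) rather than all of them, trading sensitivity for speed. A sketch of that sampling pattern; sampled_kmers is an illustrative helper, not the package API:

def sampled_kmers(seq: str, k: int = 21, step: int = 500):
    # Yield every step-th k-mer, mirroring the strided range() loops above.
    for j in range(0, len(seq) - k + 1, step):
        yield seq[j : j + k]

print(list(sampled_kmers("ACGTACGTACGT", k=4, step=4)))  # ['ACGT', 'ACGT', 'ACGT']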
- def blaOXA(BF_3, files, paths, file_format, read_amount):
-     start = time.time()
-     print("Start screening for blaOXA-genes...")
-     paths_oxa = sorted(os.listdir(r"filter/OXAs/families"))
-     BF_families = BF_3["OXA-families"]
-     oxas = []
-     scores_oxa = []
-     scores_oxa_ind = []
-     for i in paths_oxa:
-         oxas.append(i[:-4])
-     # print("OXA-families: ", oxas)  # correct
-     for i in range(len(files)):
-         oxa_dic = {}
-         if (
-             i == int(len(files) / 6)
-             or i == int(len(files) / 3)
-             or i == int(len(files) / 2)
-             or i == int(len(files) / 1.5)
-             or i == int(len(files) / 1.2)
-         ):
-             print("...")
-         # Checking file type
-         # if the file is fasta -> concat lines
-         reads = []
-         BF_families.number_of_kmeres = 0
-         BF_families.hits_per_filter = [0] * BF_families.clonetypes
-         BF_families.table = OXATable()
-         BF_families.table.read_dic(r"filter/OXAs_dict/oxa_dict.txt")
-         if file_format == "fasta" or file_format == "fna":
-             for sequence in SeqIO.parse(paths[i], "fasta"):
-                 reads.append(str(sequence.seq))
-             BF_families.lookup_oxa(reads, ".fna")
-         elif file_format == "fastq" or file_format == "fq":
-             counter = 0
-             for sequence in SeqIO.parse(paths[i], "fastq"):
-                 if counter < read_amount:
-                     counter += 1
-                     reads.append(str(sequence.seq))
-                 else:
-                     break
-             BF_families.lookup_oxa(reads, ".fq")
-             # print("Reads used: ", counter)
-         score_oxa = BF_families.get_oxa_score()
-         # print("Score: ", score_oxa)
-         for i in range(len(oxas)):
-             oxa_dic[oxas[i]] = score_oxa[i]
-         for i in range(len(oxa_dic)):
-             if oxa_dic[oxas[i]] < 0.3:
-                 del oxa_dic[oxas[i]]
-         if len(oxa_dic) == 0:
-             oxa_dic = "None"
-         if oxa_dic != "None":
-             oxa_dic = dict(sorted(oxa_dic.items(), key=lambda item: item[1]))
-         scores_oxa.append(oxa_dic)
-         # prepare data for next taxonomic level
-         oxa_names = []
-         # print(oxa_dic)
-         for oxa_family in oxa_dic:
-             oxa_names.append(oxa_family[:-7])
-         for oxa_family in oxa_names:
-             if oxa_dic == "None":
-                 scores_oxa_ind.append(["None", 0])
-                 break
-             # print("blaOXA: ", oxa_dic)
-             oxa_dic_ind = {}
-             ## TODO:
-             # print("blaOXA: ", oxa_family)
-             BF_ind = BF_3[oxa_family]
-             BF_ind.number_of_kmeres = 0
-             BF_ind.hits_per_filter = [0] * BF_ind.clonetypes
-             BF_ind.table = OXATable()
-             BF_ind.table.read_dic(r"filter/OXAs_dict/oxa_dict.txt")
-             paths_oxa = sorted(os.listdir(r"filter/OXAs/individual/" + oxa_family))
-             oxas_ind = []
-             for i in paths_oxa:
-                 oxas_ind.append(i[:-4])
-             if file_format == "fasta" or file_format == "fna":
-                 BF_ind.lookup_oxa(reads, ".fna")
-             elif file_format == "fastq" or file_format == "fq":
-                 BF_ind.lookup_oxa(reads, ".fq")
-             score_oxa = BF_ind.get_oxa_score()
-             # build dict with oxa-gene and its score
-             for i in range(len(oxas_ind)):
-                 oxa_dic_ind[oxas_ind[i]] = score_oxa[i]
-             # filter dict by score
-             if len(oxa_dic_ind) == 0 or max(oxa_dic_ind.values()) < 0.3:
-                 scores_oxa_ind.append("None")
-             else:
-                 scores_oxa_ind.append(
-                     [
-                         max(oxa_dic_ind, key=oxa_dic_ind.get),
-                         oxa_dic_ind[max(oxa_dic_ind, key=oxa_dic_ind.get)],
-                     ]
-                 )
-     end = time.time()
-     needed = round(end - start, 2)
-     print("Time needed: ", needed)
-     print("Screening for blaOXA-genes done...")
-     return scores_oxa, scores_oxa_ind
-
-
- def calculate_dna_composition(sequence):
-     """calculates the DNA composition of a sequence"""
-     composition = {"A": 0, "C": 0, "G": 0, "T": 0}
-
-     total = 0
-     for base in sequence:
-         if base in composition:
-             composition[base] += 1
-             total += 1
-     for base in composition:
-         composition[base] = round(composition[base] / total, 2)
-
-     return composition
-
-
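calculate_dna_composition above counts only A/C/G/T and divides by that count, so it raises ZeroDivisionError on an empty or all-N sequence. An equivalent Counter-based sketch with that edge case guarded (illustrative only, not the shipped helper):

from collections import Counter

def dna_composition(sequence: str) -> dict:
    # Count A/C/G/T only, as the deleted helper does, but guard total == 0.
    counts = Counter(base for base in str(sequence).upper() if base in "ACGT")
    total = sum(counts.values()) or 1
    return {base: round(counts[base] / total, 2) for base in "ACGT"}

print(dna_composition("ACGTAC"))  # {'A': 0.33, 'C': 0.33, 'G': 0.17, 'T': 0.17}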
- def main():
-     """Parse CLI arguments and call respective functions"""
-     arg_list = sys.argv
-     # Phillip
-     genus = arg_list[1]
-     genera = search_filter.get_genera_array_sizes()
-     genera = list(genera.keys())
-
-     if genus not in genera:
-         print(f"{genus} is unknown.")
-         quit()
-     if "XspecT" in arg_list or "xspect" in arg_list:
-         xspect = True
-     else:
-         xspect = False
-     if "ClAssT" in arg_list or "classt" in arg_list and genus == "Acinetobacter":
-         classt = True
-     elif "ClAssT" in arg_list or "classt" in arg_list and genus != "Acinetobacter":
-         print(f"ClAssT unavailable for {genus}")
-     else:
-         classt = False
-     if "Oxa" in arg_list or "oxa" in arg_list and genus == "Acinetobacter":
-         oxa = True
-     elif "Oxa" in arg_list or "oxa" in arg_list and genus != "Acinetobacter":
-         print(f"Oxa unavailable for {genus}")
-     else:
-         oxa = False
-     if "Metagenome" in arg_list or "metagenome" in arg_list:
-         metagenome = True
-     else:
-         metagenome = False
-     if ("fasta" in arg_list) or ("fna" in arg_list) or ("fa" in arg_list):
-         file_format = "fasta"
-         read_amount = 342480
-     elif ("fastq" in arg_list) or ("fq" in arg_list):
-         file_format = "fastq"
-         index = arg_list.index("fastq")
-         if arg_list[index + 1].isdigit():
-             read_amount = int(arg_list[index + 1])
-         else:
-             print("Error: Wrong Input, use a number after fastq!")
-             quit()
-     else:
-         print("Error: Wrong Input, use fasta/fna/fa or fastq/fq!")
-         quit()
-     if "save" in arg_list or "Save" in arg_list:
-         csv_table = True
-     else:
-         csv_table = False
-     if "complete" in arg_list or "Complete" in arg_list:
-         mode = 1
-     else:
-         mode = 500
-
-     file_path = arg_list[-1]
-
-     xspecT_mini(
-         file_path,
-         xspect,
-         classt,
-         oxa,
-         file_format,
-         read_amount,
-         csv_table,
-         metagenome,
-         genus,
-         mode,
-     )
-
-
- if __name__ == "__main__":
-     main()
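One caveat worth flagging in the flag parsing above: Python parses `A or B and C` as `A or (B and C)`, so the ClAssT and Oxa conditions ignore the genus guard whenever the capitalized flag is present, and the elif branch that prints the warning leaves the variable unassigned. A small demonstration of the precedence and the presumably intended grouping (sample values, not the package CLI):

arg_list = ["XspecT_mini.py", "Escherichia", "ClAssT"]
genus = arg_list[1]

# As written above: parsed as A or (B and C), so the genus guard is bypassed.
buggy = "ClAssT" in arg_list or "classt" in arg_list and genus == "Acinetobacter"
# Intended: parenthesize the flag check before applying the genus guard.
fixed = ("ClAssT" in arg_list or "classt" in arg_list) and genus == "Acinetobacter"
print(buggy, fixed)  # True False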