XspecT 0.1.3__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of XspecT might be problematic.

Files changed (58)
  1. {XspecT-0.1.3.dist-info → XspecT-0.2.0.dist-info}/METADATA +23 -29
  2. XspecT-0.2.0.dist-info/RECORD +30 -0
  3. {XspecT-0.1.3.dist-info → XspecT-0.2.0.dist-info}/WHEEL +1 -1
  4. xspect/definitions.py +42 -0
  5. xspect/download_filters.py +11 -26
  6. xspect/fastapi.py +101 -0
  7. xspect/file_io.py +34 -103
  8. xspect/main.py +70 -66
  9. xspect/model_management.py +88 -0
  10. xspect/models/__init__.py +0 -0
  11. xspect/models/probabilistic_filter_model.py +277 -0
  12. xspect/models/probabilistic_filter_svm_model.py +169 -0
  13. xspect/models/probabilistic_single_filter_model.py +109 -0
  14. xspect/models/result.py +148 -0
  15. xspect/pipeline.py +201 -0
  16. xspect/run.py +38 -0
  17. xspect/train.py +304 -0
  18. xspect/train_filter/create_svm.py +6 -183
  19. xspect/train_filter/extract_and_concatenate.py +117 -121
  20. xspect/train_filter/html_scrap.py +16 -28
  21. xspect/train_filter/ncbi_api/download_assemblies.py +7 -8
  22. xspect/train_filter/ncbi_api/ncbi_assembly_metadata.py +9 -17
  23. xspect/train_filter/ncbi_api/ncbi_children_tree.py +3 -2
  24. xspect/train_filter/ncbi_api/ncbi_taxon_metadata.py +7 -5
  25. XspecT-0.1.3.dist-info/RECORD +0 -49
  26. xspect/BF_v2.py +0 -637
  27. xspect/Bootstrap.py +0 -29
  28. xspect/Classifier.py +0 -142
  29. xspect/OXA_Table.py +0 -53
  30. xspect/WebApp.py +0 -724
  31. xspect/XspecT_mini.py +0 -1363
  32. xspect/XspecT_trainer.py +0 -611
  33. xspect/map_kmers.py +0 -155
  34. xspect/search_filter.py +0 -504
  35. xspect/static/How-To.png +0 -0
  36. xspect/static/Logo.png +0 -0
  37. xspect/static/Logo2.png +0 -0
  38. xspect/static/Workflow_AspecT.png +0 -0
  39. xspect/static/Workflow_ClAssT.png +0 -0
  40. xspect/static/js.js +0 -615
  41. xspect/static/main.css +0 -280
  42. xspect/templates/400.html +0 -64
  43. xspect/templates/401.html +0 -62
  44. xspect/templates/404.html +0 -62
  45. xspect/templates/500.html +0 -62
  46. xspect/templates/about.html +0 -544
  47. xspect/templates/home.html +0 -51
  48. xspect/templates/layoutabout.html +0 -87
  49. xspect/templates/layouthome.html +0 -63
  50. xspect/templates/layoutspecies.html +0 -468
  51. xspect/templates/species.html +0 -33
  52. xspect/train_filter/README_XspecT_Erweiterung.md +0 -119
  53. xspect/train_filter/get_paths.py +0 -35
  54. xspect/train_filter/interface_XspecT.py +0 -204
  55. xspect/train_filter/k_mer_count.py +0 -162
  56. {XspecT-0.1.3.dist-info → XspecT-0.2.0.dist-info}/LICENSE +0 -0
  57. {XspecT-0.1.3.dist-info → XspecT-0.2.0.dist-info}/entry_points.txt +0 -0
  58. {XspecT-0.1.3.dist-info → XspecT-0.2.0.dist-info}/top_level.txt +0 -0
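
The restructuring visible above replaces the Flask web application (WebApp.py, templates, static assets) and the monolithic XspecT_mini/XspecT_trainer scripts with a dedicated models package, a FastAPI interface, and separate pipeline, training, and model-management modules. As a rough orientation aid, here is a minimal sketch that imports only module paths taken from the file list; whether these modules expose a usable public API under exactly these names is an assumption, not something the diff confirms:

# Hedged sketch of the 0.2.0 layout; module paths are from the file list
# above, but the diff does not confirm what each module exports.
from xspect import definitions, model_management, pipeline, run, train
from xspect.models import (
    probabilistic_filter_model,
    probabilistic_filter_svm_model,
    probabilistic_single_filter_model,
    result,
)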
xspect/XspecT_mini.py DELETED
@@ -1,1363 +0,0 @@
- import os
- import warnings
- import time
- import csv
- import pickle
- import statistics
- import sys
- from pathlib import Path
- from Bio import SeqIO, Seq
- from numpy import sum
- import psutil
- import xspect.Classifier as Classifier
- import xspect.search_filter as search_filter
- from xspect.OXA_Table import OXATable
- import xspect.Bootstrap as bs
- from xspect.train_filter.interface_XspecT import load_translation_dict
-
-
- warnings.filterwarnings("ignore")
-
-
- def xspecT_mini(
-     file_path,
-     XspecT,
-     ClAssT,
-     oxa,
-     file_format,
-     read_amount,
-     csv_table,
-     metagenome,
-     genus,
-     mode,
- ):
-     """performs a BF-lookup for a set of genomes for testing purpose"""
-     itemlist = [
-         "albensis",
-         "apis",
-         "baretiae",
-         "baumannii",
-         "baylyi",
-         "beijerinckii",
-         "bereziniae",
-         "bohemicus",
-         "boissieri",
-         "bouvetii",
-         "brisouii",
-         "calcoaceticus",
-         "celticus",
-         "chengduensis",
-         "chinensis",
-         "colistiniresistens",
-         "courvalinii",
-         "cumulans",
-         "defluvii",
-         "dispersus",
-         "equi",
-         "gandensis",
-         "gerneri",
-         "gs06",
-         "gs16",
-         "guerrae",
-         "guillouiae",
-         "gyllenbergii",
-         "haemolyticus",
-         "halotolerans",
-         "harbinensis",
-         "idrijaensis",
-         "indicus",
-         "johnsonii",
-         "junii",
-         "kanungonis",
-         "kookii",
-         "kyonggiensis",
-         "lactucae",
-         "lanii",
-         "larvae",
-         "lwoffii",
-         "marinus",
-         "modestus",
-         "nectaris",
-         "nosocomialis",
-         "oleivorans",
-         "parvus",
-         "piscicola",
-         "pittii",
-         "pollinis",
-         "populi",
-         "portensis",
-         "pseudolwoffii",
-         "pullicarnis",
-         "pragensis",
-         "proteolyticus",
-         "puyangensis",
-         "qingfengensis",
-         "radioresistens",
-         "rathckeae",
-         "rongchengensis",
-         "rudis",
-         "schindleri",
-         "seifertii",
-         "seohaensis",
-         "shaoyimingii",
-         "sichuanensis",
-         "soli",
-         "stercoris",
-         "tandoii",
-         "terrae",
-         "terrestris",
-         "tianfuensis",
-         "tjernbergiae",
-         "towneri",
-         "ursingii",
-         "variabilis",
-         "venetianus",
-         "vivanii",
-         "wanghuae",
-         "wuhouensis",
-         "sp.",
-     ]
-     print("Preparing Bloomfilter...")
-     start = time.time()
-     if XspecT:
-         # BF = search_filter.pre_processing()
-         # Phillip
-         # Getting the array sizes for pre processing of all bloomfilters.
-         genera = search_filter.get_genera_array_sizes()
-
-         # Pre processing of the bloomfilters for the species.
-         BF = search_filter.pre_process_all(genera, k=21, meta_mode=False, genus=[genus])
-
-         # read out the current memory usage
-         process = psutil.Process()
-         memory_info = process.memory_info()
-         # print the memory consumption
-         print(
-             f"Current memory consumption with the species BF: {memory_info.rss / 1024 / 1024:.2f} MB"
-         )
-
-         # BF_1 = search_filter.pre_processing_prefilter()
-         # BF_1_1 = search_filter.pre_processing_prefilter2()
-         # Phillip
-         # Pre processing of the bloomfilters for the metagenome mode.
-         BF_1_1 = search_filter.pre_process_all(
-             genera, k=21, meta_mode=True, genus=[genus]
-         )
-
-         # read out the current memory usage
-         process = psutil.Process()
-         memory_info = process.memory_info()
-         # print the memory consumption
-         print(
-             f"Current memory consumption with the master BF: {memory_info.rss / 1024 / 1024:.2f} MB"
-         )
-
-     if ClAssT:
-         BF_2 = search_filter.pre_processing_ClAssT()
-     if oxa:
-         BF_3 = search_filter.pre_processing_oxa()
-     end = time.time()
-     needed = round(end - start, 2)
-     print("Time needed for preprocessing: ", needed)
-     try:
-         files = sorted(os.listdir(file_path))
-     except FileNotFoundError:
-         print("Error: Invalid filepath!")
-         quit()
-     if file_format == "fna" or file_format == "fasta" or file_format == "fa":
-         for i in range(len(files) - 1, -1, -1):
-             if "fna" in files[i] or "fasta" in files[i]:
-                 continue
-             else:
-                 del files[i]
-     elif file_format == "fastq" or file_format == "fq":
-         for i in range(len(files) - 1, -1, -1):
-             if "fastq" in files[i] or "fq" in files[i]:
-                 continue
-             else:
-                 del files[i]
-     if len(files) == 0:
-         print("Error: No " + str(file_format) + " files in directory!")
-         quit()
-     paths = files[:]
-     file_path2 = file_path[:]
-     for i in range(len(file_path2)):
-         if file_path2[i] == "\\":
-             list_temp = list(file_path2)
-             list_temp[i] = "/"
-             file_path2 = "".join(list_temp)
-     start = time.time()
-     for i in range(len(files)):
-         paths[i] = file_path2 + "/" + paths[i]
-     if XspecT:
-         predictions, scores = xspecT(
-             BF[genus],
-             BF_1_1[genus],
-             files,
-             paths,
-             file_format,
-             read_amount,
-             metagenome,
-             genus,
-             mode,
-         )
-     if ClAssT:
-         predictions_ClAssT, scores_ClAssT = clAssT(
-             BF_2, files, paths, file_format, read_amount
-         )
-     if oxa:
-         scores_oxa, scores_oxa_ind = blaOXA(
-             BF_3, files, paths, file_format, read_amount
-         )
-     print("Preparing results...")
-     print("")
-     end = time.time()
-     needed = round(end - start, 2)
-     print("Time needed: ", needed)
-     print("")
-     header_filename = "Filename"
-     spaces = []
-     space = " "
-     underscore = "________"
-     name_max = len(max(itemlist, key=len))
-     if XspecT:
-         for i in range(len(predictions)):
-             while len(predictions[i]) < name_max:
-                 predictions[i] += " "
-     file_max = len(max(files, key=len))
-     while len(header_filename) < file_max:
-         header_filename += " "
-         underscore += "_"
-     for j in range(len(files)):
-         for i in range(len(header_filename) - len(files[j])):
-             space += " "
-         spaces.append(space)
-         space = " "
-     excel = []
-     # formatting
-     if ClAssT:
-         for i in range(len(predictions_ClAssT)):
-             if predictions_ClAssT[i] != "none" and predictions_ClAssT[i] != "None":
-                 predictions_ClAssT[i] += " "
-     if XspecT and ClAssT:
-         for i in range(len(scores_ClAssT)):
-             if scores[i] == "1.0":
-                 scores[i] += " "
-
-     if XspecT and ClAssT and oxa:
-         excelv2 = []
-         print(scores_oxa)
-         print(scores_oxa_ind)
-         for i in range(len(files)):
-             if scores_oxa == ["None"]:
-                 excel.append(
-                     files[i]
-                     + spaces[i]
-                     + predictions[i]
-                     + " "
-                     + scores[i]
-                     + " "
-                     + predictions_ClAssT[i]
-                     + " "
-                     + scores_ClAssT[i]
-                     + " "
-                     + str(scores_oxa[i])
-                     + " "
-                     + str(scores_oxa_ind[i][0])
-                     + " "
-                     + str(scores_oxa_ind[i][1])
-                 )
-             else:
-                 excel.append(
-                     files[i]
-                     + spaces[i]
-                     + predictions[i]
-                     + " "
-                     + scores[i]
-                     + " "
-                     + predictions_ClAssT[i]
-                     + " "
-                     + scores_ClAssT[i]
-                     + " "
-                     + str(scores_oxa[i])
-                     + " "
-                     + str(scores_oxa_ind[i][0])
-                     + " "
-                     + str(scores_oxa_ind[i][1])
-                 )
-             excelv2.append(
-                 files[i]
-                 + ","
-                 + predictions[i]
-                 + ","
-                 + scores[i]
-                 + predictions_ClAssT[i]
-                 + ","
-                 + scores_ClAssT[i]
-                 + ","
-                 + str(scores_oxa[i])
-             )
-         print(
-             header_filename
-             + " Species Score Sub-Type Score blaOXA-Family blaOXA-Gene Score"
-         )
-         print(
-             underscore
-             + "___________________________________________________________________________________________________________________________________________"
-         )
-         for i in excel:
-             print(i)
-         for i in range(0, len(excelv2)):
-             excelv2[i] = [excelv2[i]]
-         if csv_table:
-             with open(r"Results/XspecT_mini_csv/Results.csv", "w", newline="") as file:
-                 writer = csv.writer(file)
-                 writer.writerows(excelv2)
-         print("")
-         print("")
-     elif XspecT and not ClAssT and not oxa:
-         excelv2 = []
-         for i in range(len(files)):
-             excel.append(files[i] + spaces[i] + predictions[i] + " " + scores[i])
-             excelv2.append(files[i] + "," + predictions[i] + "," + scores[i])
-         print(header_filename + " Species Score")
-         print(underscore + "_________________________________________")
-         for i in excel:
-             print(i)
-         for i in range(0, len(excelv2)):
-             excelv2[i] = [excelv2[i]]
-         if csv_table:
-             with open(
-                 r"Results/XspecT_mini_csv/Results_XspecT.csv", "w", newline=""
-             ) as file:
-                 writer = csv.writer(file)
-                 writer.writerows(excelv2)
-         print("")
-         print("")
-     elif ClAssT and not XspecT and not oxa:
-         excelv2 = []
-         for i in range(len(files)):
-             excel.append(
-                 files[i]
-                 + spaces[i]
-                 + predictions_ClAssT[i]
-                 + " "
-                 + scores_ClAssT[i]
-             )
-             excelv2.append(
-                 files[i] + "," + predictions_ClAssT[i] + "," + scores_ClAssT[i]
-             )
-         print(header_filename + " Sub-Type Score")
-         print(underscore + "________________________________")
-         for i in excel:
-             print(i)
-         print("")
-         print("")
-         for i in range(0, len(excelv2)):
-             excelv2[i] = [excelv2[i]]
-         if csv_table:
-             with open(
-                 r"Results/XspecT_mini_csv/Results_ClAssT.csv", "w", newline=""
-             ) as file:
-                 writer = csv.writer(file)
-                 writer.writerows(excelv2)
-     elif oxa and not ClAssT and not XspecT:
-         excelv2 = []
-         for i in range(len(files)):
-             if scores_oxa == ["None"]:
-                 excel.append(
-                     files[i]
-                     + spaces[i]
-                     + str(scores_oxa[i])
-                     + " "
-                     + str(scores_oxa_ind[i][0])
-                     + " "
-                     + str(scores_oxa_ind[i][1])
-                 )
-             else:
-                 excel.append(
-                     files[i]
-                     + spaces[i]
-                     + str(scores_oxa[i])
-                     + " "
-                     + str(scores_oxa_ind[i][0])
-                     + " "
-                     + str(scores_oxa_ind[i][1])
-                 )
-
-             excelv2.append(files[i] + "," + str(scores_oxa[i]))
-         print(
-             header_filename
-             + " blaOXA-Family blaOXA-Gene Score"
-         )
-         print(
-             underscore
-             + "_______________________________________________________________________"
-         )
-         for i in excel:
-             print(i)
-         print("")
-         print("")
-         for i in range(0, len(excelv2)):
-             excelv2[i] = [excelv2[i]]
-         if csv_table:
-             with open(
-                 r"Results/XspecT_mini_csv/Results_Oxa.csv", "w", newline=""
-             ) as file:
-                 writer = csv.writer(file)
-                 writer.writerows(excelv2)
-     elif XspecT and ClAssT and not oxa:
-         excelv2 = []
-         for i in range(len(files)):
-             excel.append(
-                 files[i]
-                 + spaces[i]
-                 + predictions[i]
-                 + " "
-                 + scores[i]
-                 + " "
-                 + predictions_ClAssT[i]
-                 + " "
-                 + scores_ClAssT[i]
-             )
-             excelv2.append(
-                 files[i]
-                 + ","
-                 + predictions[i]
-                 + ","
-                 + scores[i]
-                 + ","
-                 + predictions_ClAssT[i]
-                 + ","
-                 + scores_ClAssT[i]
-             )
-         print(
-             header_filename
-             + " Species Score Sub-Type Score"
-         )
-         print(
-             underscore
-             + "________________________________________________________________________"
-         )
-         for i in excel:
-             print(i)
-         print("")
-         print("")
-         for i in range(0, len(excelv2)):
-             excelv2[i] = [excelv2[i]]
-         if csv_table:
-             with open(r"Results/XspecT_mini_csv/Results.csv", "w", newline="") as file:
-                 writer = csv.writer(file)
-                 writer.writerows(excelv2)
-     elif XspecT and oxa and not ClAssT:
-         excelv2 = []
-         for i in range(len(files)):
-             if scores_oxa == ["None"]:
-                 excel.append(
-                     files[i]
-                     + spaces[i]
-                     + predictions[i]
-                     + " "
-                     + scores[i]
-                     + " "
-                     + str(scores_oxa[i])
-                     + " "
-                     + str(scores_oxa_ind[i][0])
-                     + " "
-                     + str(scores_oxa_ind[i][1])
-                 )
-             else:
-                 excel.append(
-                     files[i]
-                     + spaces[i]
-                     + predictions[i]
-                     + " "
-                     + scores[i]
-                     + " "
-                     + str(scores_oxa[i])
-                     + " "
-                     + str(scores_oxa_ind[i][0])
-                     + " "
-                     + str(scores_oxa_ind[i][1])
-                 )
-             excelv2.append(
-                 files[i] + "," + predictions[i] + "," + scores[i] + str(scores_oxa[i])
-             )
-         print(
-             header_filename
-             + " Species Score blaOXA-Family blaOXA-Gene Score"
-         )
-         print(
-             underscore
-             + "_______________________________________________________________________________________________________________"
-         )
-         for i in excel:
-             print(i)
-         for i in range(0, len(excelv2)):
-             excelv2[i] = [excelv2[i]]
-         if csv_table:
-             with open(r"Results/XspecT_mini_csv/Results.csv", "w", newline="") as file:
-                 writer = csv.writer(file)
-                 writer.writerows(excelv2)
-         print("")
-         print("")
-     elif ClAssT and oxa and not XspecT:
-         excelv2 = []
-         for i in range(len(files)):
-             if scores_oxa == ["None"]:
-                 excel.append(
-                     files[i]
-                     + spaces[i]
-                     + predictions_ClAssT[i]
-                     + " "
-                     + scores_ClAssT[i]
-                     + " "
-                     + str(scores_oxa[i])
-                     + " "
-                     + str(scores_oxa_ind[i][0])
-                     + " "
-                     + str(scores_oxa_ind[i][1])
-                 )
-             else:
-                 excel.append(
-                     files[i]
-                     + spaces[i]
-                     + predictions_ClAssT[i]
-                     + " "
-                     + scores_ClAssT[i]
-                     + " "
-                     + str(scores_oxa[i])
-                     + " "
-                     + str(scores_oxa_ind[i][0])
-                     + " "
-                     + str(scores_oxa_ind[i][1])
-                 )
-             excelv2.append(
-                 files[i]
-                 + ","
-                 + predictions_ClAssT[i]
-                 + ","
-                 + scores_ClAssT[i]
-                 + ","
-                 + str(scores_oxa[i])
-             )
-         print(
-             header_filename
-             + " Sub-Type Score blaOXA-Family blaOXA-Gene Score"
-         )
-         print(
-             underscore
-             + "______________________________________________________________________________________________________"
-         )
-         for i in excel:
-             print(i)
-         for i in range(0, len(excelv2)):
-             excelv2[i] = [excelv2[i]]
-         if csv_table:
-             with open(r"Results/XspecT_mini_csv/Results.csv", "w", newline="") as file:
-                 writer = csv.writer(file)
-                 writer.writerows(excelv2)
-         print("")
-         print("")
-
-
- def xspecT(BF, BF_1_1, files, paths, file_format, read_amount, metagenome, genus, mode):
-     """performs a BF-lookup for a set of genomes for testing purpose"""
-     print("Starting taxonomic assignment on species-level...")
-     predictions = []
-     scores = []
-     counterx = 0
-     contig_header = []
-     contig_seq = []
-     # Phillip
-     names_path = (
-         Path(os.getcwd()) / "filter" / "species_names" / ("Filter" + genus + ".txt")
-     )
-     with open(names_path, "rb") as fp:
-         names = pickle.load(fp)
-     names = sorted(names)
-     # translation_dict = load_translation_dict(genus)
-     for i in range(len(files)):
-         if (
-             i == int(len(files) / 6)
-             or i == int(len(files) / 3)
-             or i == int(len(files) / 2)
-             or i == int(len(files) / 1.5)
-             or i == int(len(files) / 1.2)
-         ):
-             print("...")
-         BF.number_of_kmeres = 0
-         BF.hits_per_filter = [0] * BF.clonetypes
-         BF_1_1.number_of_kmeres = 0
-         BF_1_1.hits_per_filter = [0]
-         if file_format == "fasta" or file_format == "fna" or file_format == "fa":
-             if metagenome:
-                 contigs = []
-                 contigs_classified = {}
-                 for sequence in SeqIO.parse(paths[i], "fasta"):
-                     contigs = []
-                     contigs_kmers = []
-                     BF_1_1.kmer_hits_single = []
-                     BF_1_1.number_of_kmeres = 0
-                     BF_1_1.hits_per_filter = [0] * BF.clonetypes
-                     # Taking sum of list as reference, if sum has not increased after testing those 3 kmeres,
-                     # then the contigs won't be tested further
-                     hit_sum = sum(BF_1_1.hits_per_filter)
-                     hits_per_filter_copy = BF_1_1.hits_per_filter[:]
-                     sample_size = int(len(str(sequence.seq)) ** 0.5)
-                     threshold_contig = sample_size * 0.7
-                     for i in range(0, len(str(sequence.seq)) - BF_1_1.k, sample_size):
-                         if "N" not in str(sequence.seq[i : i + BF_1_1.k]):
-                             BF_1_1.lookup_canonical(
-                                 str(sequence.seq[i : i + BF_1_1.k]).upper()
-                             )
-
-                     # needs at least 70% hits to continue with the contig
-                     counter = 0
-                     if (sum(BF_1_1.hits_per_filter) - hit_sum) > threshold_contig:
-                         for j in range(len(str(sequence.seq)) - BF_1_1.k):
-                             if "N" not in str(sequence.seq[j : j + BF_1_1.k]):
-                                 contigs_kmers.append(
-                                     str(sequence.seq[j : j + BF_1_1.k]).upper()
-                                 )
-                                 counter += 1
-                                 # how many kmers? to use
-                                 if counter >= 5000:
-                                     break
-                         # contigs_kmers.append(str(reverse_sequence[j: j + BF_1_1.k]))
-                         contigs.append(contigs_kmers)
-                         BF_1_1.hits_per_filter = hits_per_filter_copy
-                     else:
-                         # resetting hit counter
-                         BF_1_1.hits_per_filter = hits_per_filter_copy
-                         continue
-
-                     contigs_filtered = []
-                     counter = 0
-                     # Since we classify individual contigs now, the var contigs only contains one item which makes those loops unnecessary
-                     for i in range(len(contigs)):
-                         threshold = 0
-                         for j in range(len(contigs[i])):
-                             BF_1_1.number_of_kmeres += 1
-                             hits_per_filter_copy = BF_1_1.hits_per_filter[:]
-                             BF_1_1.lookup_canonical(contigs[i][j])
-                             if hits_per_filter_copy != BF_1_1.hits_per_filter:
-                                 threshold += 1
-                         # parameter value needs to be determined
-                         if threshold >= (0.7 * len(contigs[i])):
-                             contigs_filtered += contigs[i]
-                             counter += len(contigs[i])
-                         if counter >= 5000:
-                             break
-
-                     # since we do indv. contig classifications we need to reset the BF vars
-                     BF.kmer_hits_single = []
-                     BF.number_of_kmeres = 0
-                     BF.hits_per_filter = [0] * BF.clonetypes
-                     for kmer in contigs_filtered:
-                         BF.number_of_kmeres += 1
-                         BF.lookup_canonical(kmer)
-                     score = BF.get_score()
-                     score_edit = [str(x) for x in score]
-                     score_edit = ",".join(score_edit)
-
-                     # making prediction
-                     index_result = max(range(len(score)), key=score.__getitem__)
-                     prediction = names[index_result]
-
-                     # skip ambiguous contigs
-                     if max(score) == sorted(score)[-2]:
-                         continue
-
-                     # bootstrapping
-                     bootstrap_n = 100
-                     samples = bs.bootstrap(
-                         BF.kmer_hits_single, BF.number_of_kmeres, bootstrap_n
-                     )
-                     sample_scores = bs.bootstrap_scores(
-                         samples, BF.number_of_kmeres, BF.clonetypes
-                     )
-                     bootstrap_score = 0
-                     bootstrap_predictions = []
-                     for i in range(len(sample_scores)):
-                         # skip ambiguous contigs (species with same score)
-                         if max(sample_scores[i]) != sorted(sample_scores[i])[-2]:
-                             bootstrap_predictions.append(
-                                 names[
-                                     max(
-                                         range(len(sample_scores[i])),
-                                         key=sample_scores[i].__getitem__,
-                                     )
-                                 ]
-                             )
-                             if (
-                                 max(
-                                     range(len(sample_scores[i])),
-                                     key=sample_scores[i].__getitem__,
-                                 )
-                                 == index_result
-                             ):
-                                 bootstrap_score += 1
-                         else:
-                             continue
-                     bootstrap_score = bootstrap_score / bootstrap_n
-
-                     # ---------------------------------------------------------------------------------------------
-                     # Collect results
-                     # change this var to the species you want your contigs saved from
-                     save_contigs = "none"
-
-                     if (genus[0] + ". " + prediction) not in contigs_classified:
-                         contigs_classified[genus[0] + ". " + prediction] = [
-                             [max(score)],
-                             1,
-                             [len(str(sequence.seq))],
-                             sorted(score)[-2] / max(score),
-                             [bootstrap_score],
-                             contigs_filtered,
-                             None,
-                         ]
-                         if prediction == save_contigs:
-                             contig_header += [sequence.description]
-                             contig_seq += [str(sequence.seq)]
-                     else:
-                         contigs_classified[genus[0] + ". " + prediction][0] += [
-                             max(score)
-                         ]
-                         contigs_classified[genus[0] + ". " + prediction][1] += 1
-                         contigs_classified[genus[0] + ". " + prediction][2] += [
-                             len(str(sequence.seq))
-                         ]
-                         contigs_classified[genus[0] + ". " + prediction][3] += sorted(
-                             score
-                         )[-2] / max(score)
-                         contigs_classified[genus[0] + ". " + prediction][4] += [
-                             bootstrap_score
-                         ]
-                         contigs_classified[genus[0] + ". " + prediction][
-                             5
-                         ] += contigs_filtered
-                         if prediction == save_contigs:
-                             contig_header += [sequence.description]
-                             contig_seq += [str(sequence.seq)]
-                     # scores.append(str(max(score)))
-             else:
-                 # Important! Resetting the kmer_hits_single otherwise MEMORY LEAK
-                 BF.kmer_hits_single = []
-                 for sequence in SeqIO.parse(paths[i], "fasta"):
-                     for j in range(0, len(sequence.seq) - BF.k, mode):
-                         BF.number_of_kmeres += 1
-                         kmer = str(sequence.seq[j : j + BF.k])
-                         BF.lookup_canonical(kmer)
-
-                 score = BF.get_score()
-                 # print("Scores: ", score)
-             if metagenome:
-                 # map kmers to genome for HGT detection
-                 # change later to new functions this is OLD
-                 if False:
-                     for prediction in contigs_classified:
-                         kmers = contigs_classified[prediction][5]
-                         # Strip "A."
-                         prediction = prediction[2:]
-                         # kmer mapping to genome, start by loading the kmer_dict in
-                         path_pos = (
-                             "filter\kmer_positions\Acinetobacter\\"
-                             + prediction
-                             + "_positions.txt"
-                         )
-                         # delete later
-                         path_posv2 = (
-                             "filter\kmer_positions\Acinetobacter\\"
-                             + prediction
-                             + "_complete_positions.txt"
-                         )
-                         # cluster kmers to contigs
-                         # delete try later
-                         try:
-                             with open(path_pos, "rb") as fp:
-                                 kmer_dict = pickle.load(fp)
-                         except:
-                             with open(path_posv2, "rb") as fp:
-                                 kmer_dict = pickle.load(fp)
-                         contig_amounts_distances = bs.cluster_kmers(kmers, kmer_dict)
-                         contigs_classified[genus[0] + ". " + prediction][
-                             6
-                         ] = contig_amounts_distances
-                         # del kmer_dict
-                 for key, value in contigs_classified.items():
-                     number_of_contigs = value[1]
-                     # save results
-                     results_clustering = [
-                         [
-                             key
-                             + ","
-                             + str(statistics.median(value[0]))
-                             + ","
-                             + str(number_of_contigs),
-                             str(statistics.median(value[2]))
-                             + ","
-                             + str(round(value[3] / number_of_contigs, 2))
-                             + ","
-                             + str(statistics.median(value[4])),
-                         ]
-                     ]
-                     # with open(r'Results/XspecT_mini_csv/Results_Clustering.csv', 'a', newline='') as file:
-                     #     writer = csv.writer(file)
-                     #     writer.writerows(results_clustering)
-                     value[0] = "Score Median: " + str(statistics.median(value[0]))
-                     value[1] = "Number of Contigs: " + str(number_of_contigs)
-                     value[2] = "Contig-Length Median: " + str(
-                         statistics.median(value[2])
-                     )
-                     value[3] = "Repetitiveness: " + str(
-                         round(value[3] / number_of_contigs, 2)
-                     )
-                     value[4] = "Bootstrap Median: " + str(statistics.median(value[4]))
-                     # value[6] = "Clusters: " + str(value[6])
-                     contigs_classified[key] = value
-                     print("Species: ", key)
-                     print(value[0])
-                     print(value[1])
-                     print(value[2])
-                     print(value[3])
-                     print(value[4])
-                     print(value[6])
-                     print()
-
-                 save_contigs = "none"
-                 if save_contigs != "none":
-                     with open(r"Results/Contigs_saved.fasta", "w") as file:
-                         for j in range(len(contig_header)):
-                             file.write(contig_header[j] + "\n")
-                             file.write(contig_seq[j] + "\n")
-                             file.write("\n")
-         elif file_format == "fastq" or file_format == "fq":
-             if metagenome:
-                 # ---------------------------------------------------------------------------------------------
-                 # initialize variables
-                 BF_1_1.kmer_hits_single = []
-                 BF_1_1.number_of_kmeres = 0
-                 BF_1_1.hits_per_filter = [0] * BF.clonetypes
-                 counter = 0
-                 reads = []
-                 reads_classified = {}
-                 reads_passed = 0
-                 ambiguous_reads = 0
-
-                 # ---------------------------------------------------------------------------------------------
-                 # First prefiltering step: Check if read contains at least 3 kmeres
-                 for sequence in SeqIO.parse(paths[i], "fastq"):
-                     dna_composition = {}
-                     dna_composition = calculate_dna_composition(sequence.seq)
-                     BF_1_1.kmer_hits_single = []
-                     BF_1_1.number_of_kmeres = 0
-                     BF_1_1.hits_per_filter = [0] * BF.clonetypes
-                     # reverse_sequence = sequence.seq.reverse_complement()
-                     read_kmers = []
-                     reads = []
-                     if counter < read_amount:
-                         counter += 1
-                     else:
-                         break
-                     k1 = str(sequence.seq[0 : BF_1_1.k])  # first k-mer
-                     k2 = str(
-                         sequence.seq[len(str(sequence.seq)) - BF_1_1.k :]
-                     )  # last k-mer
-                     mid = len(str(sequence.seq)) // 2
-                     k3 = str(sequence.seq[mid : mid + BF_1_1.k])  # k-mer in middle
-                     k4 = str(sequence.seq[BF_1_1.k : BF_1_1.k * 2])
-                     k5 = str(sequence.seq[mid + BF_1_1.k : mid + BF_1_1.k * 2])
-                     # Taking sum of list as reference, if sum has not increased after testing those 3 kmeres,
-                     # then the read won't be tested further
-                     hit_sum = sum(BF_1_1.hits_per_filter)
-                     hits_per_filter_copy = BF_1_1.hits_per_filter[:]
-                     # sample_size = int(len(str(sequence.seq)) ** 0.5)
-                     # threshold_read = sample_size * 0.7
-                     # for i in range(0, len(str(sequence.seq)) - BF_1_1.k, sample_size):
-                     #     if "N" not in str(sequence.seq[i: i + BF_1_1.k]):
-                     #         BF_1_1.lookup(str(sequence.seq[i: i + BF_1_1.k]))
-                     if "N" not in str(sequence.seq):
-                         BF_1_1.lookup_canonical(k1)
-                         BF_1_1.lookup_canonical(k2)
-                         BF_1_1.lookup_canonical(k3)
-                         BF_1_1.lookup_canonical(k4)
-                         BF_1_1.lookup_canonical(k5)
-                     else:
-                         continue
-                     # needs at least 2 of 3 hits to continue with read
-                     if (sum(BF_1_1.hits_per_filter) - hit_sum) > 3:
-                         for j in range(len(str(sequence.seq)) - BF_1_1.k):
-                             read_kmers.append(str(sequence.seq[j : j + BF_1_1.k]))
-                             # read_kmers.append(str(reverse_sequence[j: j + BF_1_1.k]))
-                         reads.append(read_kmers)
-                         BF_1_1.hits_per_filter = hits_per_filter_copy
-                     else:
-                         # resetting hit counter
-                         BF_1_1.hits_per_filter = hits_per_filter_copy
-                         continue
-
-                     # ---------------------------------------------------------------------------------------------
-                     # Second prefiltering step: Check if read contains at least 80% of kmers from one species
-                     # reads_filtered = set()
-                     reads_filtered = []
-                     for i in range(len(reads)):
-                         threshold = 0
-                         for j in range(len(reads[i])):
-                             BF_1_1.number_of_kmeres += 1
-                             hits_per_filter_copy = BF_1_1.hits_per_filter[:]
-                             if "N" not in reads[i][j]:
-                                 BF_1_1.lookup_canonical(reads[i][j])
-                             if hits_per_filter_copy != BF_1_1.hits_per_filter:
-                                 threshold += 1
-                         if threshold >= 0.7 * len(reads[i]):
-                             reads_filtered += reads[i]
-                     if len(reads_filtered) == 0:
-                         continue
-
-                     # ---------------------------------------------------------------------------------------------
-                     # Start of the actual classification
-                     BF.number_of_kmeres = 0
-                     BF.hits_per_filter = [0] * BF.clonetypes
-                     BF.kmer_hits_single = []
-                     for kmer in reads_filtered:
-                         if "N" not in kmer:
-                             BF.number_of_kmeres += 1
-                             BF.lookup_canonical(kmer)
-                         else:
-                             continue
-                     score = BF.get_score()
-                     score_edit = [str(x) for x in score]
-                     score_edit = ",".join(score_edit)
-
-                     # making prediction
-                     index_result = max(range(len(score)), key=score.__getitem__)
-                     prediction = names[index_result]
-                     if max(score) == sorted(score)[-2]:
-                         ambiguous_reads += 1
-                         # print("Ambiguous read")
-                         # continue
-
-                     # ---------------------------------------------------------------------------------------------
-                     # bootstrapping
-                     bootstrap_n = 100
-                     samples = bs.bootstrap(
-                         BF.kmer_hits_single, BF.number_of_kmeres, bootstrap_n
-                     )
-                     sample_scores = bs.bootstrap_scores(
-                         samples, BF.number_of_kmeres, BF.clonetypes
-                     )
-                     bootstrap_score = 0
-                     bootstrap_predictions = []
-                     for i in range(len(sample_scores)):
-                         if max(sample_scores[i]) != sorted(sample_scores[i])[-2]:
-                             bootstrap_predictions.append(
-                                 names[
-                                     max(
-                                         range(len(sample_scores[i])),
-                                         key=sample_scores[i].__getitem__,
-                                     )
-                                 ]
-                             )
-                             if (
-                                 max(
-                                     range(len(sample_scores[i])),
-                                     key=sample_scores[i].__getitem__,
-                                 )
-                                 == index_result
-                             ):
-                                 bootstrap_score += 1
-                         else:
-                             continue
-                     bootstrap_score = bootstrap_score / bootstrap_n
-
-                     # ---------------------------------------------------------------------------------------------
-                     # HGT identification pipeline start
-
-                     # skip species clear reads
-                     # if max(score) <= 0.9:
-                     # identify split reads from HGT
-                     # split_regions = map.identify_split_reads(score, BF.kmer_hits_single)
-
-                     # split_read contains tuples --> ([first part of list, second part of list], index of species)
-
-                     # check if it is in fact a split read , 0.6 is arbitrary value, it is the threshold for the difference between the two regions
-                     # if abs(sum(split_regions[0][0]) - sum(split_regions[0][1])) > 0.6:
-                     # get the species names
-                     # acceptor_species = names[split_regions[0][1]]
-                     # donor_species = names[split_regions[1][1]]
-                     # donor_acceptor = [donor_species, acceptor_species]
-                     # else:
-                     # donor_acceptor = [None]
-
-                     # ---------------------------------------------------------------------------------------------
-                     # Collect results from classification
-                     if (genus[0] + ". " + prediction) not in reads_classified:
-                         reads_classified[genus[0] + ". " + prediction] = [
-                             max(score),
-                             1,
-                             sorted(score)[-2] / max(score),
-                             BF.number_of_kmeres,
-                             [bootstrap_score],
-                             reads_filtered,
-                             None,
-                         ]
-                     else:
-                         reads_classified[genus[0] + ". " + prediction][1] += 1
-                         reads_classified[genus[0] + ". " + prediction][0] += max(score)
-                         reads_classified[genus[0] + ". " + prediction][2] += sorted(
-                             score
-                         )[-2] / max(score)
-                         reads_classified[genus[0] + ". " + prediction][
-                             3
-                         ] += BF.number_of_kmeres
-                         reads_classified[genus[0] + ". " + prediction][4] += [
-                             bootstrap_score
-                         ]
-                         reads_classified[genus[0] + ". " + prediction][
-                             5
-                         ] += reads_filtered
-                         # reads_classified[genus[0] + ". " + prediction][7] += [dna_composition]
-                         # reads_classified[genus[0] + ". " + prediction][8] += [donor_acceptor]
-
-             else:
-                 # classification for sequence pure reads, check every 10th kmer (or every one for "complete" mode)
-                 counter = 0
-                 # Important! Resetting the kmer_hits_single otherwise MEMORY LEAK
-                 BF.kmer_hits_single = []
-                 for sequence in SeqIO.parse(paths[i], "fastq"):
-                     if counter < read_amount:
-                         counter += 1
-                         for j in range(0, len(sequence.seq) - BF.k + 1, mode):
-                             BF.number_of_kmeres += 1
-                             kmer = str(sequence.seq[j : j + BF.k])
-                             BF.lookup_canonical(kmer)
-                     else:
-                         break
-                 score = BF.get_score()
-
-             if metagenome:
-                 # ---------------------------------------------------------------------------------------------
-                 # map kmers to genome for HGT detection
-                 if False:
-                     # map and cluster single reads to genome
-                     read_clusters = []
-                     for prediction in reads_classified:
-                         # load list of kmers from read
-                         kmers = reads_classified[prediction][5]
-                         # Strip genus name
-                         prediction = prediction[2:]
-
-                         # kmer mapping to genome, start by loading the kmer_dict in
-                         path_pos = (
-                             "filter\kmer_positions\Acinetobacter\\"
-                             + prediction
-                             + "_positions.txt"
-                         )
-                         # delete later
-                         path_posv2 = (
-                             "filter\kmer_positions\Acinetobacter\\"
-                             + prediction
-                             + "_complete_positions.txt"
-                         )
-                         # cluster kmers to reads
-                         # delete try later
-                         try:
-                             with open(path_pos, "rb") as fp:
-                                 kmer_dict = pickle.load(fp)
-                         except:
-                             with open(path_posv2, "rb") as fp:
-                                 kmer_dict = pickle.load(fp)
-                         test = map.map_kmers(kmers, kmer_dict, genus)
-                         clusters = map.cluster_kmers(kmers, kmer_dict)
-                         read_clusters.append(clusters)
-                         reads_classified[genus[0] + ". " + prediction][
-                             6
-                         ] = reads_amounts_distances
-                         # del kmer_dict
-
-                     # now cluster mappings of multiple reads to genome
-                     for cluster in read_clusters:
-                         # TODO
-                         continue
-
-                 # ---------------------------------------------------------------------------------------------
-                 # Collect results from classification
-                 for key, value in reads_classified.items():
-                     if key == "unknown":
-                         continue
-                     value.insert(2, value[0] / value[1])
-                     value.pop(0)
-                     reads_classified[key] = value
-                     print(
-                         key,
-                         value[0],
-                         round(value[1], 2),
-                         round(value[2] / value[0], 2),
-                         round(value[3] / value[0], 2),
-                         statistics.median(value[4]),
-                     )
-         score_edit = [str(x) for x in score]
-         score_edit = ",".join(score_edit)
-         # making prediction
-         if not metagenome:
-             # prediction = Classifier.classify(r'Training_data/Training_data_spec.csv', score, True)
-             # Phillip
-             # file_name = genus + "_Training_data_spec.csv"
-             # path = Path(__file__).parent.absolute() / "Training_data" / file_name
-             # prediction = Classifier.classify(path, score, True)
-             # SVM TURNED OFF for testing!!
-             index_result = max(range(len(score)), key=score.__getitem__)
-             prediction = names[index_result]
-             names_copy = names[:]
-             # sort score by descending order and names_copy accordingly
-             score, names_copy = zip(*sorted(zip(score, names_copy), reverse=True))
-             # print(score[0:3])
-             # print(names_copy[0:3])
-         else:
-             # Phillip
-             # prediction_name = translation_dict[prediction]
-             # predictions.append(prediction_name)
-             index_result = max(range(len(score)), key=score.__getitem__)
-             prediction = names[index_result]
-             translation_dict = load_translation_dict(genus)
-             predictions.append(translation_dict[prediction])
-         scores.append(str(max(score)))
-     print("Taxonomic assignment done...")
-     return predictions, scores
-
-
- def clAssT(BF_2, files, paths, file_format, read_amount):
-     print("Starting strain-typing on sub-type-level...")
-     predictions_ClAssT = []
-     scores_ClAssT = []
-     for i in range(len(files)):
-         if (
-             i == int(len(files) / 6)
-             or i == int(len(files) / 3)
-             or i == int(len(files) / 2)
-             or i == int(len(files) / 1.5)
-             or i == int(len(files) / 1.2)
-         ):
-             print("...")
-         BF_2.number_of_kmeres = 0
-         BF_2.hits_per_filter = [0] * BF_2.clonetypes
-         if file_format == "fasta" or file_format == "fna":
-             for sequence in SeqIO.parse(paths[i], "fasta"):
-                 # Originally 10
-                 for j in range(0, len(sequence.seq) - BF_2.k, 500):
-                     BF_2.number_of_kmeres += 1
-                     BF_2.lookup(str(sequence.seq[j : j + BF_2.k]))
-         elif file_format == "fastq" or file_format == "fq":
-             counter = 0
-             for sequence in SeqIO.parse(paths[i], "fastq"):
-                 if counter < read_amount:
-                     counter += 1
-                     for j in range(0, len(sequence.seq) - BF_2.k + 1, 10):
-                         BF_2.number_of_kmeres += 1
-                         BF_2.lookup(str(sequence.seq[j : j + BF_2.k]))
-                 else:
-                     break
-         score_ClAssT = BF_2.get_score()
-         score_edit_ClAssT = [str(x) for x in score_ClAssT]
-         score_edit_ClAssT = ",".join(score_edit_ClAssT)
-         prediction_ClAssT = Classifier.classify(
-             r"Training_data/Training_data_IC.csv",
-             score_ClAssT,
-             [True, True, True, True, True, True, True, True, False],
-         )
-         predictions_ClAssT.append(prediction_ClAssT)
-         scores_ClAssT.append(str(max(score_ClAssT)))
-
-     print("Strain-typing on sub-type-level done...")
-     return predictions_ClAssT, scores_ClAssT
-
-
- def blaOXA(BF_3, files, paths, file_format, read_amount):
-     start = time.time()
-     print("Start screening for blaOXA-genes...")
-     paths_oxa = sorted(os.listdir(r"filter/OXAs/families"))
-     BF_families = BF_3["OXA-families"]
-     oxas = []
-     scores_oxa = []
-     scores_oxa_ind = []
-     for i in paths_oxa:
-         oxas.append(i[:-4])
-     # print("OXA-families: ", oxas)  # correct
-     for i in range(len(files)):
-         oxa_dic = {}
-         if (
-             i == int(len(files) / 6)
-             or i == int(len(files) / 3)
-             or i == int(len(files) / 2)
-             or i == int(len(files) / 1.5)
-             or i == int(len(files) / 1.2)
-         ):
-             print("...")
-         # Checking file type
-         # if the file is fasta -> concat lines
-         reads = []
-         BF_families.number_of_kmeres = 0
-         BF_families.hits_per_filter = [0] * BF_families.clonetypes
-         BF_families.table = OXATable()
-         BF_families.table.read_dic(r"filter/OXAs_dict/oxa_dict.txt")
-         if file_format == "fasta" or file_format == "fna":
-             for sequence in SeqIO.parse(paths[i], "fasta"):
-                 reads.append(str(sequence.seq))
-             BF_families.lookup_oxa(reads, ".fna")
-         elif file_format == "fastq" or file_format == "fq":
-             counter = 0
-             for sequence in SeqIO.parse(paths[i], "fastq"):
-                 if counter < read_amount:
-                     counter += 1
-                     reads.append(str(sequence.seq))
-                 else:
-                     break
-             BF_families.lookup_oxa(reads, ".fq")
-             # print("Reads used: ", counter)
-         score_oxa = BF_families.get_oxa_score()
-         # print("Score: ", score_oxa)
-         for i in range(len(oxas)):
-             oxa_dic[oxas[i]] = score_oxa[i]
-         for i in range(len(oxa_dic)):
-             if oxa_dic[oxas[i]] < 0.3:
-                 del oxa_dic[oxas[i]]
-         if len(oxa_dic) == 0:
-             oxa_dic = "None"
-         if oxa_dic != "None":
-             oxa_dic = dict(sorted(oxa_dic.items(), key=lambda item: item[1]))
-         scores_oxa.append(oxa_dic)
-         # prepare data for next taxonomic level
-         oxa_names = []
-         # print(oxa_dic)
-         for oxa_family in oxa_dic:
-             oxa_names.append(oxa_family[:-7])
-         for oxa_family in oxa_names:
-             if oxa_dic == "None":
-                 scores_oxa_ind.append(["None", 0])
-                 break
-             # print("blaOXA: ", oxa_dic)
-             oxa_dic_ind = {}
-             ## TODO:
-             # print("blaOXA: ", oxa_family)
-             BF_ind = BF_3[oxa_family]
-             BF_ind.number_of_kmeres = 0
-             BF_ind.hits_per_filter = [0] * BF_ind.clonetypes
-             BF_ind.table = OXATable()
-             BF_ind.table.read_dic(r"filter/OXAs_dict/oxa_dict.txt")
-             paths_oxa = sorted(os.listdir(r"filter/OXAs/individual/" + oxa_family))
-             oxas_ind = []
-             for i in paths_oxa:
-                 oxas_ind.append(i[:-4])
-             if file_format == "fasta" or file_format == "fna":
-                 BF_ind.lookup_oxa(reads, ".fna")
-             elif file_format == "fastq" or file_format == "fq":
-                 BF_ind.lookup_oxa(reads, ".fq")
-             score_oxa = BF_ind.get_oxa_score()
-             # build dict with each oxa gene and its score
-             for i in range(len(oxas_ind)):
-                 oxa_dic_ind[oxas_ind[i]] = score_oxa[i]
-             # filter dict by score
-             if len(oxa_dic_ind) == 0 or max(oxa_dic_ind.values()) < 0.3:
-                 scores_oxa_ind.append("None")
-             else:
-                 scores_oxa_ind.append(
-                     [
-                         max(oxa_dic_ind, key=oxa_dic_ind.get),
-                         oxa_dic_ind[max(oxa_dic_ind, key=oxa_dic_ind.get)],
-                     ]
-                 )
-     end = time.time()
-     needed = round(end - start, 2)
-     print("Time needed: ", needed)
-     print("Screening for blaOXA-genes done...")
-     return scores_oxa, scores_oxa_ind
-
-
- def calculate_dna_composition(sequence):
-     """calculates the DNA composition of a sequence"""
-     composition = {"A": 0, "C": 0, "G": 0, "T": 0}
-
-     total = 0
-     for base in sequence:
-         if base in composition:
-             composition[base] += 1
-             total += 1
-     for base in composition:
-         composition[base] = round(composition[base] / total, 2)
-
-     return composition
-
-
- def main():
-     """Parse CLI arguments and call respective functions"""
-     arg_list = sys.argv
-     # Phillip
-     genus = arg_list[1]
-     genera = search_filter.get_genera_array_sizes()
-     genera = list(genera.keys())
-
-     if genus not in genera:
-         print(f"{genus} is unknown.")
-         quit()
-     if "XspecT" in arg_list or "xspect" in arg_list:
-         xspect = True
-     else:
-         xspect = False
-     if "ClAssT" in arg_list or "classt" in arg_list and genus == "Acinetobacter":
-         classt = True
-     elif "ClAssT" in arg_list or "classt" in arg_list and genus != "Acinetobacter":
-         print(f"ClAssT unavailable for {genus}")
-     else:
-         classt = False
-     if "Oxa" in arg_list or "oxa" in arg_list and genus == "Acinetobacter":
-         oxa = True
-     elif "Oxa" in arg_list or "oxa" in arg_list and genus != "Acinetobacter":
-         print(f"Oxa unavailable for {genus}")
-     else:
-         oxa = False
-     if "Metagenome" in arg_list or "metagenome" in arg_list:
-         metagenome = True
-     else:
-         metagenome = False
-     if ("fasta" in arg_list) or ("fna" in arg_list) or ("fa" in arg_list):
-         file_format = "fasta"
-         read_amount = 342480
-     elif ("fastq" in arg_list) or ("fq" in arg_list):
-         file_format = "fastq"
-         index = arg_list.index("fastq")
-         if arg_list[index + 1].isdigit():
-             read_amount = int(arg_list[index + 1])
-         else:
-             print("Error: Wrong Input, use a number after fastq!")
-             quit()
-     else:
-         print("Error: Wrong Input, use fasta/fna/fa or fastq/fq!")
-         quit()
-     if "save" in arg_list or "Save" in arg_list:
-         csv_table = True
-     else:
-         csv_table = False
-     if "complete" in arg_list or "Complete" in arg_list:
-         mode = 1
-     else:
-         mode = 500
-
-     file_path = arg_list[-1]
-
-     xspecT_mini(
-         file_path,
-         xspect,
-         classt,
-         oxa,
-         file_format,
-         read_amount,
-         csv_table,
-         metagenome,
-         genus,
-         mode,
-     )
-
-
- if __name__ == "__main__":
-     main()
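
For reference, the removed main() above parsed sys.argv by position and keyword rather than with argparse: the genus came first (arg_list[1]), the input directory last (arg_list[-1]), and "fastq" had to be followed by a read count. A hedged reconstruction of an argument vector it would have accepted (the program name is an assumption; the flags and ordering follow the parsing logic shown above):

# Hypothetical invocation of the deleted CLI, reconstructed from main():
import sys
sys.argv = [
    "xspect_mini",      # program name: an assumption, not taken from the diff
    "Acinetobacter",    # genus, arg_list[1]; must be a known genus
    "XspecT",           # enable species-level assignment
    "Oxa",              # enable blaOXA screening (Acinetobacter only)
    "fastq", "2000",    # file format plus the required read count
    "save",             # write CSV results
    "/data/reads",      # input directory, arg_list[-1]
]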