XspecT 0.1.2__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of XspecT might be problematic. Click here for more details.
- {XspecT-0.1.2.dist-info → XspecT-0.2.0.dist-info}/METADATA +23 -29
- XspecT-0.2.0.dist-info/RECORD +30 -0
- {XspecT-0.1.2.dist-info → XspecT-0.2.0.dist-info}/WHEEL +1 -1
- xspect/definitions.py +42 -0
- xspect/download_filters.py +11 -26
- xspect/fastapi.py +101 -0
- xspect/file_io.py +34 -103
- xspect/main.py +70 -66
- xspect/model_management.py +88 -0
- xspect/models/__init__.py +0 -0
- xspect/models/probabilistic_filter_model.py +277 -0
- xspect/models/probabilistic_filter_svm_model.py +169 -0
- xspect/models/probabilistic_single_filter_model.py +109 -0
- xspect/models/result.py +148 -0
- xspect/pipeline.py +201 -0
- xspect/run.py +38 -0
- xspect/train.py +304 -0
- xspect/train_filter/create_svm.py +6 -183
- xspect/train_filter/extract_and_concatenate.py +117 -121
- xspect/train_filter/html_scrap.py +16 -28
- xspect/train_filter/ncbi_api/download_assemblies.py +7 -8
- xspect/train_filter/ncbi_api/ncbi_assembly_metadata.py +9 -17
- xspect/train_filter/ncbi_api/ncbi_children_tree.py +3 -2
- xspect/train_filter/ncbi_api/ncbi_taxon_metadata.py +7 -5
- XspecT-0.1.2.dist-info/RECORD +0 -48
- xspect/BF_v2.py +0 -648
- xspect/Bootstrap.py +0 -29
- xspect/Classifier.py +0 -142
- xspect/OXA_Table.py +0 -53
- xspect/WebApp.py +0 -737
- xspect/XspecT_mini.py +0 -1377
- xspect/XspecT_trainer.py +0 -611
- xspect/map_kmers.py +0 -155
- xspect/search_filter.py +0 -504
- xspect/static/How-To.png +0 -0
- xspect/static/Logo.png +0 -0
- xspect/static/Logo2.png +0 -0
- xspect/static/Workflow_AspecT.png +0 -0
- xspect/static/Workflow_ClAssT.png +0 -0
- xspect/static/js.js +0 -615
- xspect/static/main.css +0 -280
- xspect/templates/400.html +0 -64
- xspect/templates/401.html +0 -62
- xspect/templates/404.html +0 -62
- xspect/templates/500.html +0 -62
- xspect/templates/about.html +0 -544
- xspect/templates/home.html +0 -51
- xspect/templates/layoutabout.html +0 -87
- xspect/templates/layouthome.html +0 -63
- xspect/templates/layoutspecies.html +0 -468
- xspect/templates/species.html +0 -33
- xspect/train_filter/get_paths.py +0 -35
- xspect/train_filter/interface_XspecT.py +0 -204
- xspect/train_filter/k_mer_count.py +0 -162
- {XspecT-0.1.2.dist-info → XspecT-0.2.0.dist-info}/LICENSE +0 -0
- {XspecT-0.1.2.dist-info → XspecT-0.2.0.dist-info}/entry_points.txt +0 -0
- {XspecT-0.1.2.dist-info → XspecT-0.2.0.dist-info}/top_level.txt +0 -0
xspect/XspecT_mini.py
DELETED
|
@@ -1,1377 +0,0 @@
|
|
|
1
|
-
import os
|
|
2
|
-
import warnings
|
|
3
|
-
import time
|
|
4
|
-
import csv
|
|
5
|
-
import pickle
|
|
6
|
-
import statistics
|
|
7
|
-
import sys
|
|
8
|
-
from pathlib import Path
|
|
9
|
-
from Bio import SeqIO, Seq
|
|
10
|
-
from numpy import sum
|
|
11
|
-
import psutil
|
|
12
|
-
import xspect.Classifier as Classifier
|
|
13
|
-
import xspect.search_filter as search_filter
|
|
14
|
-
from xspect.OXA_Table import OXATable
|
|
15
|
-
import xspect.Bootstrap as bs
|
|
16
|
-
from xspect.train_filter.interface_XspecT import load_translation_dict
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
warnings.filterwarnings("ignore")
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
def xspecT_mini(
|
|
23
|
-
file_path,
|
|
24
|
-
XspecT,
|
|
25
|
-
ClAssT,
|
|
26
|
-
oxa,
|
|
27
|
-
file_format,
|
|
28
|
-
read_amount,
|
|
29
|
-
csv_table,
|
|
30
|
-
metagenome,
|
|
31
|
-
genus,
|
|
32
|
-
mode,
|
|
33
|
-
):
|
|
34
|
-
"""performs a BF-lookup for a set of genomes for testing purpose"""
|
|
35
|
-
itemlist = [
|
|
36
|
-
"albensis",
|
|
37
|
-
"apis",
|
|
38
|
-
"baretiae",
|
|
39
|
-
"baumannii",
|
|
40
|
-
"baylyi",
|
|
41
|
-
"beijerinckii",
|
|
42
|
-
"bereziniae",
|
|
43
|
-
"bohemicus",
|
|
44
|
-
"boissieri",
|
|
45
|
-
"bouvetii",
|
|
46
|
-
"brisouii",
|
|
47
|
-
"calcoaceticus",
|
|
48
|
-
"celticus",
|
|
49
|
-
"chengduensis",
|
|
50
|
-
"chinensis",
|
|
51
|
-
"colistiniresistens",
|
|
52
|
-
"courvalinii",
|
|
53
|
-
"cumulans",
|
|
54
|
-
"defluvii",
|
|
55
|
-
"dispersus",
|
|
56
|
-
"equi",
|
|
57
|
-
"gandensis",
|
|
58
|
-
"gerneri",
|
|
59
|
-
"gs06",
|
|
60
|
-
"gs16",
|
|
61
|
-
"guerrae",
|
|
62
|
-
"guillouiae",
|
|
63
|
-
"gyllenbergii",
|
|
64
|
-
"haemolyticus",
|
|
65
|
-
"halotolerans",
|
|
66
|
-
"harbinensis",
|
|
67
|
-
"idrijaensis",
|
|
68
|
-
"indicus",
|
|
69
|
-
"johnsonii",
|
|
70
|
-
"junii",
|
|
71
|
-
"kanungonis",
|
|
72
|
-
"kookii",
|
|
73
|
-
"kyonggiensis",
|
|
74
|
-
"lactucae",
|
|
75
|
-
"lanii",
|
|
76
|
-
"larvae",
|
|
77
|
-
"lwoffii",
|
|
78
|
-
"marinus",
|
|
79
|
-
"modestus",
|
|
80
|
-
"nectaris",
|
|
81
|
-
"nosocomialis",
|
|
82
|
-
"oleivorans",
|
|
83
|
-
"parvus",
|
|
84
|
-
"piscicola",
|
|
85
|
-
"pittii",
|
|
86
|
-
"pollinis",
|
|
87
|
-
"populi",
|
|
88
|
-
"portensis",
|
|
89
|
-
"pseudolwoffii",
|
|
90
|
-
"pullicarnis",
|
|
91
|
-
"pragensis",
|
|
92
|
-
"proteolyticus",
|
|
93
|
-
"puyangensis",
|
|
94
|
-
"qingfengensis",
|
|
95
|
-
"radioresistens",
|
|
96
|
-
"rathckeae",
|
|
97
|
-
"rongchengensis",
|
|
98
|
-
"rudis",
|
|
99
|
-
"schindleri",
|
|
100
|
-
"seifertii",
|
|
101
|
-
"seohaensis",
|
|
102
|
-
"shaoyimingii",
|
|
103
|
-
"sichuanensis",
|
|
104
|
-
"soli",
|
|
105
|
-
"stercoris",
|
|
106
|
-
"tandoii",
|
|
107
|
-
"terrae",
|
|
108
|
-
"terrestris",
|
|
109
|
-
"tianfuensis",
|
|
110
|
-
"tjernbergiae",
|
|
111
|
-
"towneri",
|
|
112
|
-
"ursingii",
|
|
113
|
-
"variabilis",
|
|
114
|
-
"venetianus",
|
|
115
|
-
"vivanii",
|
|
116
|
-
"wanghuae",
|
|
117
|
-
"wuhouensis",
|
|
118
|
-
"sp.",
|
|
119
|
-
]
|
|
120
|
-
print("Preparing Bloomfilter...")
|
|
121
|
-
start = time.time()
|
|
122
|
-
if XspecT:
|
|
123
|
-
# BF = search_filter.pre_processing()
|
|
124
|
-
# Phillip
|
|
125
|
-
# Getting the array sizes for pre processing of all bloomfilters.
|
|
126
|
-
genera = search_filter.get_genera_array_sizes()
|
|
127
|
-
|
|
128
|
-
# Pre processing of the bloomfilters for the species.
|
|
129
|
-
BF = search_filter.pre_process_all(genera, k=21, meta_mode=False, genus=[genus])
|
|
130
|
-
|
|
131
|
-
# aktuelle Speichernutzung auslesen
|
|
132
|
-
process = psutil.Process()
|
|
133
|
-
memory_info = process.memory_info()
|
|
134
|
-
# Ausgabe des Speicherverbrauchs
|
|
135
|
-
print(
|
|
136
|
-
f"Aktueller Speicherverbrauch mit den Spezies BF: {memory_info.rss / 1024 / 1024:.2f} MB"
|
|
137
|
-
)
|
|
138
|
-
|
|
139
|
-
# BF_1 = search_filter.pre_processing_prefilter()
|
|
140
|
-
# BF_1_1 = search_filter.pre_processing_prefilter2()
|
|
141
|
-
# Phillip
|
|
142
|
-
# Pre processing of the bloomfilters for the metagenome mode.
|
|
143
|
-
BF_1_1 = search_filter.pre_process_all(
|
|
144
|
-
genera, k=21, meta_mode=True, genus=[genus]
|
|
145
|
-
)
|
|
146
|
-
|
|
147
|
-
# aktuelle Speichernutzung auslesen
|
|
148
|
-
process = psutil.Process()
|
|
149
|
-
memory_info = process.memory_info()
|
|
150
|
-
# Ausgabe des Speicherverbrauchs
|
|
151
|
-
print(
|
|
152
|
-
f"Aktueller Speicherverbrauch mit dem Master BF: {memory_info.rss / 1024 / 1024:.2f} MB"
|
|
153
|
-
)
|
|
154
|
-
|
|
155
|
-
if ClAssT:
|
|
156
|
-
BF_2 = search_filter.pre_processing_ClAssT()
|
|
157
|
-
if oxa:
|
|
158
|
-
BF_3 = search_filter.pre_processing_oxa()
|
|
159
|
-
end = time.time()
|
|
160
|
-
needed = round(end - start, 2)
|
|
161
|
-
print("Time needed for preprocessing: ", needed)
|
|
162
|
-
try:
|
|
163
|
-
files = sorted(os.listdir(file_path))
|
|
164
|
-
except FileNotFoundError:
|
|
165
|
-
print("Error: Invalid filepath!")
|
|
166
|
-
quit()
|
|
167
|
-
if file_format == "fna" or file_format == "fasta" or file_format == "fa":
|
|
168
|
-
for i in range(len(files) - 1, -1, -1):
|
|
169
|
-
if "fna" in files[i] or "fasta" in files[i]:
|
|
170
|
-
continue
|
|
171
|
-
else:
|
|
172
|
-
del files[i]
|
|
173
|
-
elif file_format == "fastq" or file_format == "fq":
|
|
174
|
-
for i in range(len(files) - 1, -1, -1):
|
|
175
|
-
if "fastq" in files[i] or "fq" in files[i]:
|
|
176
|
-
continue
|
|
177
|
-
else:
|
|
178
|
-
del files[i]
|
|
179
|
-
if len(files) == 0:
|
|
180
|
-
print("Error: No " + str(file_format) + " files in directory!")
|
|
181
|
-
quit()
|
|
182
|
-
paths = files[:]
|
|
183
|
-
file_path2 = file_path[:]
|
|
184
|
-
for i in range(len(file_path2)):
|
|
185
|
-
if file_path2[i] == "\\":
|
|
186
|
-
list_temp = list(file_path2)
|
|
187
|
-
list_temp[i] = "/"
|
|
188
|
-
file_path2 = "".join(list_temp)
|
|
189
|
-
start = time.time()
|
|
190
|
-
for i in range(len(files)):
|
|
191
|
-
paths[i] = file_path2 + "/" + paths[i]
|
|
192
|
-
if XspecT:
|
|
193
|
-
predictions, scores = xspecT(
|
|
194
|
-
BF[genus],
|
|
195
|
-
BF_1_1[genus],
|
|
196
|
-
files,
|
|
197
|
-
paths,
|
|
198
|
-
file_format,
|
|
199
|
-
read_amount,
|
|
200
|
-
metagenome,
|
|
201
|
-
genus,
|
|
202
|
-
mode,
|
|
203
|
-
)
|
|
204
|
-
if ClAssT:
|
|
205
|
-
predictions_ClAssT, scores_ClAssT = clAssT(
|
|
206
|
-
BF_2, files, paths, file_format, read_amount
|
|
207
|
-
)
|
|
208
|
-
if oxa:
|
|
209
|
-
scores_oxa, scores_oxa_ind = blaOXA(
|
|
210
|
-
BF_3, files, paths, file_format, read_amount
|
|
211
|
-
)
|
|
212
|
-
print("Preparing results...")
|
|
213
|
-
print("")
|
|
214
|
-
end = time.time()
|
|
215
|
-
needed = round(end - start, 2)
|
|
216
|
-
print("Time needed: ", needed)
|
|
217
|
-
print("")
|
|
218
|
-
header_filename = "Filename"
|
|
219
|
-
spaces = []
|
|
220
|
-
space = " "
|
|
221
|
-
underscore = "________"
|
|
222
|
-
name_max = len(max(itemlist, key=len))
|
|
223
|
-
if XspecT:
|
|
224
|
-
for i in range(len(predictions)):
|
|
225
|
-
while len(predictions[i]) < name_max:
|
|
226
|
-
predictions[i] += " "
|
|
227
|
-
file_max = len(max(files, key=len))
|
|
228
|
-
while len(header_filename) < file_max:
|
|
229
|
-
header_filename += " "
|
|
230
|
-
underscore += "_"
|
|
231
|
-
for j in range(len(files)):
|
|
232
|
-
for i in range(len(header_filename) - len(files[j])):
|
|
233
|
-
space += " "
|
|
234
|
-
spaces.append(space)
|
|
235
|
-
space = " "
|
|
236
|
-
excel = []
|
|
237
|
-
# formatting
|
|
238
|
-
if ClAssT:
|
|
239
|
-
for i in range(len(predictions_ClAssT)):
|
|
240
|
-
if predictions_ClAssT[i] != "none" and predictions_ClAssT[i] != "None":
|
|
241
|
-
predictions_ClAssT[i] += " "
|
|
242
|
-
if XspecT and ClAssT:
|
|
243
|
-
for i in range(len(scores_ClAssT)):
|
|
244
|
-
if scores[i] == "1.0":
|
|
245
|
-
scores[i] += " "
|
|
246
|
-
|
|
247
|
-
if XspecT and ClAssT and oxa:
|
|
248
|
-
excelv2 = []
|
|
249
|
-
print(scores_oxa)
|
|
250
|
-
print(scores_oxa_ind)
|
|
251
|
-
for i in range(len(files)):
|
|
252
|
-
if scores_oxa == ["None"]:
|
|
253
|
-
excel.append(
|
|
254
|
-
files[i]
|
|
255
|
-
+ spaces[i]
|
|
256
|
-
+ predictions[i]
|
|
257
|
-
+ " "
|
|
258
|
-
+ scores[i]
|
|
259
|
-
+ " "
|
|
260
|
-
+ predictions_ClAssT[i]
|
|
261
|
-
+ " "
|
|
262
|
-
+ scores_ClAssT[i]
|
|
263
|
-
+ " "
|
|
264
|
-
+ str(scores_oxa[i])
|
|
265
|
-
+ " "
|
|
266
|
-
+ str(scores_oxa_ind[i][0])
|
|
267
|
-
+ " "
|
|
268
|
-
+ str(scores_oxa_ind[i][1])
|
|
269
|
-
)
|
|
270
|
-
else:
|
|
271
|
-
excel.append(
|
|
272
|
-
files[i]
|
|
273
|
-
+ spaces[i]
|
|
274
|
-
+ predictions[i]
|
|
275
|
-
+ " "
|
|
276
|
-
+ scores[i]
|
|
277
|
-
+ " "
|
|
278
|
-
+ predictions_ClAssT[i]
|
|
279
|
-
+ " "
|
|
280
|
-
+ scores_ClAssT[i]
|
|
281
|
-
+ " "
|
|
282
|
-
+ str(scores_oxa[i])
|
|
283
|
-
+ " "
|
|
284
|
-
+ str(scores_oxa_ind[i][0])
|
|
285
|
-
+ " "
|
|
286
|
-
+ str(scores_oxa_ind[i][1])
|
|
287
|
-
)
|
|
288
|
-
excelv2.append(
|
|
289
|
-
files[i]
|
|
290
|
-
+ ","
|
|
291
|
-
+ predictions[i]
|
|
292
|
-
+ ","
|
|
293
|
-
+ scores[i]
|
|
294
|
-
+ predictions_ClAssT[i]
|
|
295
|
-
+ ","
|
|
296
|
-
+ scores_ClAssT[i]
|
|
297
|
-
+ ","
|
|
298
|
-
+ str(scores_oxa[i])
|
|
299
|
-
)
|
|
300
|
-
print(
|
|
301
|
-
header_filename
|
|
302
|
-
+ " Species Score Sub-Type Score blaOXA-Family blaOXA-Gene Score"
|
|
303
|
-
)
|
|
304
|
-
print(
|
|
305
|
-
underscore
|
|
306
|
-
+ "___________________________________________________________________________________________________________________________________________"
|
|
307
|
-
)
|
|
308
|
-
for i in excel:
|
|
309
|
-
print(i)
|
|
310
|
-
for i in range(0, len(excelv2)):
|
|
311
|
-
excelv2[i] = [excelv2[i]]
|
|
312
|
-
if csv_table:
|
|
313
|
-
with open(r"Results/XspecT_mini_csv/Results.csv", "w", newline="") as file:
|
|
314
|
-
writer = csv.writer(file)
|
|
315
|
-
writer.writerows(excelv2)
|
|
316
|
-
print("")
|
|
317
|
-
print("")
|
|
318
|
-
elif XspecT and not ClAssT and not oxa:
|
|
319
|
-
excelv2 = []
|
|
320
|
-
for i in range(len(files)):
|
|
321
|
-
excel.append(files[i] + spaces[i] + predictions[i] + " " + scores[i])
|
|
322
|
-
excelv2.append(files[i] + "," + predictions[i] + "," + scores[i])
|
|
323
|
-
print(header_filename + " Species Score")
|
|
324
|
-
print(underscore + "_________________________________________")
|
|
325
|
-
for i in excel:
|
|
326
|
-
print(i)
|
|
327
|
-
for i in range(0, len(excelv2)):
|
|
328
|
-
excelv2[i] = [excelv2[i]]
|
|
329
|
-
if csv_table:
|
|
330
|
-
with open(
|
|
331
|
-
r"Results/XspecT_mini_csv/Results_XspecT.csv", "w", newline=""
|
|
332
|
-
) as file:
|
|
333
|
-
writer = csv.writer(file)
|
|
334
|
-
writer.writerows(excelv2)
|
|
335
|
-
print("")
|
|
336
|
-
print("")
|
|
337
|
-
elif ClAssT and not XspecT and not oxa:
|
|
338
|
-
excelv2 = []
|
|
339
|
-
for i in range(len(files)):
|
|
340
|
-
excel.append(
|
|
341
|
-
files[i]
|
|
342
|
-
+ spaces[i]
|
|
343
|
-
+ predictions_ClAssT[i]
|
|
344
|
-
+ " "
|
|
345
|
-
+ scores_ClAssT[i]
|
|
346
|
-
)
|
|
347
|
-
excelv2.append(
|
|
348
|
-
files[i] + "," + predictions_ClAssT[i] + "," + scores_ClAssT[i]
|
|
349
|
-
)
|
|
350
|
-
print(header_filename + " Sub-Type Score")
|
|
351
|
-
print(underscore + "________________________________")
|
|
352
|
-
for i in excel:
|
|
353
|
-
print(i)
|
|
354
|
-
print("")
|
|
355
|
-
print("")
|
|
356
|
-
for i in range(0, len(excelv2)):
|
|
357
|
-
excelv2[i] = [excelv2[i]]
|
|
358
|
-
if csv_table:
|
|
359
|
-
with open(
|
|
360
|
-
r"Results/XspecT_mini_csv/Results_ClAssT.csv", "w", newline=""
|
|
361
|
-
) as file:
|
|
362
|
-
writer = csv.writer(file)
|
|
363
|
-
writer.writerows(excelv2)
|
|
364
|
-
elif oxa and not ClAssT and not XspecT:
|
|
365
|
-
excelv2 = []
|
|
366
|
-
for i in range(len(files)):
|
|
367
|
-
if scores_oxa == ["None"]:
|
|
368
|
-
excel.append(
|
|
369
|
-
files[i]
|
|
370
|
-
+ spaces[i]
|
|
371
|
-
+ str(scores_oxa[i])
|
|
372
|
-
+ " "
|
|
373
|
-
+ str(scores_oxa_ind[i][0])
|
|
374
|
-
+ " "
|
|
375
|
-
+ str(scores_oxa_ind[i][1])
|
|
376
|
-
)
|
|
377
|
-
else:
|
|
378
|
-
excel.append(
|
|
379
|
-
files[i]
|
|
380
|
-
+ spaces[i]
|
|
381
|
-
+ str(scores_oxa[i])
|
|
382
|
-
+ " "
|
|
383
|
-
+ str(scores_oxa_ind[i][0])
|
|
384
|
-
+ " "
|
|
385
|
-
+ str(scores_oxa_ind[i][1])
|
|
386
|
-
)
|
|
387
|
-
|
|
388
|
-
excelv2.append(files[i] + "," + str(scores_oxa[i]))
|
|
389
|
-
print(
|
|
390
|
-
header_filename
|
|
391
|
-
+ " blaOXA-Family blaOXA-Gene Score"
|
|
392
|
-
)
|
|
393
|
-
print(
|
|
394
|
-
underscore
|
|
395
|
-
+ "_______________________________________________________________________"
|
|
396
|
-
)
|
|
397
|
-
for i in excel:
|
|
398
|
-
print(i)
|
|
399
|
-
print("")
|
|
400
|
-
print("")
|
|
401
|
-
for i in range(0, len(excelv2)):
|
|
402
|
-
excelv2[i] = [excelv2[i]]
|
|
403
|
-
if csv_table:
|
|
404
|
-
with open(
|
|
405
|
-
r"Results/XspecT_mini_csv/Results_Oxa.csv", "w", newline=""
|
|
406
|
-
) as file:
|
|
407
|
-
writer = csv.writer(file)
|
|
408
|
-
writer.writerows(excelv2)
|
|
409
|
-
elif XspecT and ClAssT and not oxa:
|
|
410
|
-
excelv2 = []
|
|
411
|
-
for i in range(len(files)):
|
|
412
|
-
excel.append(
|
|
413
|
-
files[i]
|
|
414
|
-
+ spaces[i]
|
|
415
|
-
+ predictions[i]
|
|
416
|
-
+ " "
|
|
417
|
-
+ scores[i]
|
|
418
|
-
+ " "
|
|
419
|
-
+ predictions_ClAssT[i]
|
|
420
|
-
+ " "
|
|
421
|
-
+ scores_ClAssT[i]
|
|
422
|
-
)
|
|
423
|
-
excelv2.append(
|
|
424
|
-
files[i]
|
|
425
|
-
+ ","
|
|
426
|
-
+ predictions[i]
|
|
427
|
-
+ ","
|
|
428
|
-
+ scores[i]
|
|
429
|
-
+ ","
|
|
430
|
-
+ predictions_ClAssT[i]
|
|
431
|
-
+ ","
|
|
432
|
-
+ scores_ClAssT[i]
|
|
433
|
-
)
|
|
434
|
-
print(
|
|
435
|
-
header_filename
|
|
436
|
-
+ " Species Score Sub-Type Score"
|
|
437
|
-
)
|
|
438
|
-
print(
|
|
439
|
-
underscore
|
|
440
|
-
+ "________________________________________________________________________"
|
|
441
|
-
)
|
|
442
|
-
for i in excel:
|
|
443
|
-
print(i)
|
|
444
|
-
print("")
|
|
445
|
-
print("")
|
|
446
|
-
for i in range(0, len(excelv2)):
|
|
447
|
-
excelv2[i] = [excelv2[i]]
|
|
448
|
-
if csv_table:
|
|
449
|
-
with open(r"Results/XspecT_mini_csv/Results.csv", "w", newline="") as file:
|
|
450
|
-
writer = csv.writer(file)
|
|
451
|
-
writer.writerows(excelv2)
|
|
452
|
-
elif XspecT and oxa and not ClAssT:
|
|
453
|
-
excelv2 = []
|
|
454
|
-
for i in range(len(files)):
|
|
455
|
-
if scores_oxa == ["None"]:
|
|
456
|
-
excel.append(
|
|
457
|
-
files[i]
|
|
458
|
-
+ spaces[i]
|
|
459
|
-
+ predictions[i]
|
|
460
|
-
+ " "
|
|
461
|
-
+ scores[i]
|
|
462
|
-
+ " "
|
|
463
|
-
+ str(scores_oxa[i])
|
|
464
|
-
+ " "
|
|
465
|
-
+ str(scores_oxa_ind[i][0])
|
|
466
|
-
+ " "
|
|
467
|
-
+ str(scores_oxa_ind[i][1])
|
|
468
|
-
)
|
|
469
|
-
else:
|
|
470
|
-
excel.append(
|
|
471
|
-
files[i]
|
|
472
|
-
+ spaces[i]
|
|
473
|
-
+ predictions[i]
|
|
474
|
-
+ " "
|
|
475
|
-
+ scores[i]
|
|
476
|
-
+ " "
|
|
477
|
-
+ str(scores_oxa[i])
|
|
478
|
-
+ " "
|
|
479
|
-
+ str(scores_oxa_ind[i][0])
|
|
480
|
-
+ " "
|
|
481
|
-
+ str(scores_oxa_ind[i][1])
|
|
482
|
-
)
|
|
483
|
-
excelv2.append(
|
|
484
|
-
files[i] + "," + predictions[i] + "," + scores[i] + str(scores_oxa[i])
|
|
485
|
-
)
|
|
486
|
-
print(
|
|
487
|
-
header_filename
|
|
488
|
-
+ " Species Score blaOXA-Family blaOXA-Gene Score"
|
|
489
|
-
)
|
|
490
|
-
print(
|
|
491
|
-
underscore
|
|
492
|
-
+ "_______________________________________________________________________________________________________________"
|
|
493
|
-
)
|
|
494
|
-
for i in excel:
|
|
495
|
-
print(i)
|
|
496
|
-
for i in range(0, len(excelv2)):
|
|
497
|
-
excelv2[i] = [excelv2[i]]
|
|
498
|
-
if csv_table:
|
|
499
|
-
with open(r"Results/XspecT_mini_csv/Results.csv", "w", newline="") as file:
|
|
500
|
-
writer = csv.writer(file)
|
|
501
|
-
writer.writerows(excelv2)
|
|
502
|
-
print("")
|
|
503
|
-
print("")
|
|
504
|
-
elif ClAssT and oxa and not XspecT:
|
|
505
|
-
excelv2 = []
|
|
506
|
-
for i in range(len(files)):
|
|
507
|
-
if scores_oxa == ["None"]:
|
|
508
|
-
excel.append(
|
|
509
|
-
files[i]
|
|
510
|
-
+ spaces[i]
|
|
511
|
-
+ predictions_ClAssT[i]
|
|
512
|
-
+ " "
|
|
513
|
-
+ scores_ClAssT[i]
|
|
514
|
-
+ " "
|
|
515
|
-
+ str(scores_oxa[i])
|
|
516
|
-
+ " "
|
|
517
|
-
+ str(scores_oxa_ind[i][0])
|
|
518
|
-
+ " "
|
|
519
|
-
+ str(scores_oxa_ind[i][1])
|
|
520
|
-
)
|
|
521
|
-
else:
|
|
522
|
-
excel.append(
|
|
523
|
-
files[i]
|
|
524
|
-
+ spaces[i]
|
|
525
|
-
+ predictions_ClAssT[i]
|
|
526
|
-
+ " "
|
|
527
|
-
+ scores_ClAssT[i]
|
|
528
|
-
+ " "
|
|
529
|
-
+ str(scores_oxa[i])
|
|
530
|
-
+ " "
|
|
531
|
-
+ str(scores_oxa_ind[i][0])
|
|
532
|
-
+ " "
|
|
533
|
-
+ str(scores_oxa_ind[i][1])
|
|
534
|
-
)
|
|
535
|
-
excelv2.append(
|
|
536
|
-
files[i]
|
|
537
|
-
+ ","
|
|
538
|
-
+ predictions_ClAssT[i]
|
|
539
|
-
+ ","
|
|
540
|
-
+ scores_ClAssT[i]
|
|
541
|
-
+ ","
|
|
542
|
-
+ str(scores_oxa[i])
|
|
543
|
-
)
|
|
544
|
-
print(
|
|
545
|
-
header_filename
|
|
546
|
-
+ " Sub-Type Score blaOXA-Family blaOXA-Gene Score"
|
|
547
|
-
)
|
|
548
|
-
print(
|
|
549
|
-
underscore
|
|
550
|
-
+ "______________________________________________________________________________________________________"
|
|
551
|
-
)
|
|
552
|
-
for i in excel:
|
|
553
|
-
print(i)
|
|
554
|
-
for i in range(0, len(excelv2)):
|
|
555
|
-
excelv2[i] = [excelv2[i]]
|
|
556
|
-
if csv_table:
|
|
557
|
-
with open(r"Results/XspecT_mini_csv/Results.csv", "w", newline="") as file:
|
|
558
|
-
writer = csv.writer(file)
|
|
559
|
-
writer.writerows(excelv2)
|
|
560
|
-
print("")
|
|
561
|
-
print("")
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
def xspecT(BF, BF_1_1, files, paths, file_format, read_amount, metagenome, genus, mode):
|
|
565
|
-
"""performs a BF-lookup for a set of genomes for testing purpose"""
|
|
566
|
-
print("Starting taxonomic assignment on species-level...")
|
|
567
|
-
predictions = []
|
|
568
|
-
scores = []
|
|
569
|
-
counterx = 0
|
|
570
|
-
contig_header = []
|
|
571
|
-
contig_seq = []
|
|
572
|
-
# Phillip
|
|
573
|
-
names_path = (
|
|
574
|
-
Path(os.getcwd()) / "filter" / "species_names" / ("Filter" + genus + ".txt")
|
|
575
|
-
)
|
|
576
|
-
with open(names_path, "rb") as fp:
|
|
577
|
-
names = pickle.load(fp)
|
|
578
|
-
names = sorted(names)
|
|
579
|
-
# translation_dict = load_translation_dict(genus)
|
|
580
|
-
for i in range(len(files)):
|
|
581
|
-
if (
|
|
582
|
-
i == int(len(files) / 6)
|
|
583
|
-
or i == int(len(files) / 3)
|
|
584
|
-
or i == int(len(files) / 2)
|
|
585
|
-
or i == int(len(files) / 1.5)
|
|
586
|
-
or i == int(len(files) / 1.2)
|
|
587
|
-
):
|
|
588
|
-
print("...")
|
|
589
|
-
BF.number_of_kmeres = 0
|
|
590
|
-
BF.hits_per_filter = [0] * BF.clonetypes
|
|
591
|
-
BF_1_1.number_of_kmeres = 0
|
|
592
|
-
BF_1_1.hits_per_filter = [0]
|
|
593
|
-
if file_format == "fasta" or file_format == "fna" or file_format == "fa":
|
|
594
|
-
if metagenome:
|
|
595
|
-
contigs = []
|
|
596
|
-
contigs_classified = {}
|
|
597
|
-
for sequence in SeqIO.parse(paths[i], "fasta"):
|
|
598
|
-
contigs = []
|
|
599
|
-
contigs_kmers = []
|
|
600
|
-
BF_1_1.kmer_hits_single = []
|
|
601
|
-
BF_1_1.number_of_kmeres = 0
|
|
602
|
-
BF_1_1.hits_per_filter = [0] * BF.clonetypes
|
|
603
|
-
# Taking sum of list as reference, if sum has not increased after testing those 3 kmeres,
|
|
604
|
-
# then the contigs won't be tested further
|
|
605
|
-
hit_sum = sum(BF_1_1.hits_per_filter)
|
|
606
|
-
hits_per_filter_copy = BF_1_1.hits_per_filter[:]
|
|
607
|
-
sample_size = int(len(str(sequence.seq)) ** 0.5)
|
|
608
|
-
threshold_contig = sample_size * 0.7
|
|
609
|
-
for i in range(0, len(str(sequence.seq)) - BF_1_1.k, sample_size):
|
|
610
|
-
if "N" not in str(sequence.seq[i : i + BF_1_1.k]):
|
|
611
|
-
BF_1_1.lookup(str(sequence.seq[i : i + BF_1_1.k]).upper())
|
|
612
|
-
|
|
613
|
-
# needs at least 70% hits to continue with the contig
|
|
614
|
-
counter = 0
|
|
615
|
-
if (sum(BF_1_1.hits_per_filter) - hit_sum) > threshold_contig:
|
|
616
|
-
for j in range(len(str(sequence.seq)) - BF_1_1.k):
|
|
617
|
-
if "N" not in str(sequence.seq[j : j + BF_1_1.k]):
|
|
618
|
-
contigs_kmers.append(
|
|
619
|
-
str(sequence.seq[j : j + BF_1_1.k]).upper()
|
|
620
|
-
)
|
|
621
|
-
counter += 1
|
|
622
|
-
# how many kmers? to use
|
|
623
|
-
if counter >= 5000000:
|
|
624
|
-
break
|
|
625
|
-
# contigs_kmers.append(str(reverse_sequence[j: j + BF_1_1.k]))
|
|
626
|
-
contigs.append(contigs_kmers)
|
|
627
|
-
BF_1_1.hits_per_filter = hits_per_filter_copy
|
|
628
|
-
else:
|
|
629
|
-
# resetting hit counter
|
|
630
|
-
BF_1_1.hits_per_filter = hits_per_filter_copy
|
|
631
|
-
continue
|
|
632
|
-
|
|
633
|
-
contigs_filtered = []
|
|
634
|
-
counter = 0
|
|
635
|
-
# Since we classify individual contigs now, the var contigs only contains one item which makes those loops unneccesary
|
|
636
|
-
for i in range(len(contigs)):
|
|
637
|
-
threshold = 0
|
|
638
|
-
for j in range(len(contigs[i])):
|
|
639
|
-
BF_1_1.number_of_kmeres += 1
|
|
640
|
-
hits_per_filter_copy = BF_1_1.hits_per_filter[:]
|
|
641
|
-
BF_1_1.lookup(contigs[i][j])
|
|
642
|
-
if hits_per_filter_copy != BF_1_1.hits_per_filter:
|
|
643
|
-
threshold += 1
|
|
644
|
-
# parameter value needs to be determined
|
|
645
|
-
if threshold >= (0.7 * len(contigs[i])):
|
|
646
|
-
contigs_filtered += contigs[i]
|
|
647
|
-
counter += len(contigs[i])
|
|
648
|
-
if counter >= 5000:
|
|
649
|
-
break
|
|
650
|
-
|
|
651
|
-
# since we do indv. contig classifications we need to reset the BF vars
|
|
652
|
-
BF.kmer_hits_single = []
|
|
653
|
-
BF.number_of_kmeres = 0
|
|
654
|
-
BF.hits_per_filter = [0] * BF.clonetypes
|
|
655
|
-
for kmer in contigs_filtered:
|
|
656
|
-
BF.number_of_kmeres += 1
|
|
657
|
-
kmer_reversed = str(Seq.Seq(kmer).reverse_complement())
|
|
658
|
-
if kmer > kmer_reversed:
|
|
659
|
-
BF.lookup(kmer)
|
|
660
|
-
else:
|
|
661
|
-
BF.lookup(kmer_reversed)
|
|
662
|
-
score = BF.get_score()
|
|
663
|
-
score_edit = [str(x) for x in score]
|
|
664
|
-
score_edit = ",".join(score_edit)
|
|
665
|
-
|
|
666
|
-
# making prediction
|
|
667
|
-
index_result = max(range(len(score)), key=score.__getitem__)
|
|
668
|
-
prediction = names[index_result]
|
|
669
|
-
|
|
670
|
-
# skip ambiguous contigs
|
|
671
|
-
if max(score) == sorted(score)[-2]:
|
|
672
|
-
continue
|
|
673
|
-
|
|
674
|
-
# bootstrapping
|
|
675
|
-
bootstrap_n = 100
|
|
676
|
-
samples = bs.bootstrap(
|
|
677
|
-
BF.kmer_hits_single, BF.number_of_kmeres, bootstrap_n
|
|
678
|
-
)
|
|
679
|
-
sample_scores = bs.bootstrap_scores(
|
|
680
|
-
samples, BF.number_of_kmeres, BF.clonetypes
|
|
681
|
-
)
|
|
682
|
-
bootstrap_score = 0
|
|
683
|
-
bootstrap_predictions = []
|
|
684
|
-
for i in range(len(sample_scores)):
|
|
685
|
-
# skip ambiguous contigs (species with same score)
|
|
686
|
-
if max(sample_scores[i]) != sorted(sample_scores[i])[-2]:
|
|
687
|
-
bootstrap_predictions.append(
|
|
688
|
-
names[
|
|
689
|
-
max(
|
|
690
|
-
range(len(sample_scores[i])),
|
|
691
|
-
key=sample_scores[i].__getitem__,
|
|
692
|
-
)
|
|
693
|
-
]
|
|
694
|
-
)
|
|
695
|
-
if (
|
|
696
|
-
max(
|
|
697
|
-
range(len(sample_scores[i])),
|
|
698
|
-
key=sample_scores[i].__getitem__,
|
|
699
|
-
)
|
|
700
|
-
== index_result
|
|
701
|
-
):
|
|
702
|
-
bootstrap_score += 1
|
|
703
|
-
else:
|
|
704
|
-
continue
|
|
705
|
-
bootstrap_score = bootstrap_score / bootstrap_n
|
|
706
|
-
|
|
707
|
-
# ---------------------------------------------------------------------------------------------
|
|
708
|
-
# Collect results
|
|
709
|
-
# change this var to the species you want your contigs saved from
|
|
710
|
-
save_contigs = "none"
|
|
711
|
-
|
|
712
|
-
if (genus[0] + ". " + prediction) not in contigs_classified:
|
|
713
|
-
contigs_classified[genus[0] + ". " + prediction] = [
|
|
714
|
-
[max(score)],
|
|
715
|
-
1,
|
|
716
|
-
[len(str(sequence.seq))],
|
|
717
|
-
sorted(score)[-2] / max(score),
|
|
718
|
-
[bootstrap_score],
|
|
719
|
-
contigs_filtered,
|
|
720
|
-
None,
|
|
721
|
-
]
|
|
722
|
-
if prediction == save_contigs:
|
|
723
|
-
contig_header += [sequence.description]
|
|
724
|
-
contig_seq += [str(sequence.seq)]
|
|
725
|
-
else:
|
|
726
|
-
contigs_classified[genus[0] + ". " + prediction][0] += [
|
|
727
|
-
max(score)
|
|
728
|
-
]
|
|
729
|
-
contigs_classified[genus[0] + ". " + prediction][1] += 1
|
|
730
|
-
contigs_classified[genus[0] + ". " + prediction][2] += [
|
|
731
|
-
len(str(sequence.seq))
|
|
732
|
-
]
|
|
733
|
-
contigs_classified[genus[0] + ". " + prediction][3] += sorted(
|
|
734
|
-
score
|
|
735
|
-
)[-2] / max(score)
|
|
736
|
-
contigs_classified[genus[0] + ". " + prediction][4] += [
|
|
737
|
-
bootstrap_score
|
|
738
|
-
]
|
|
739
|
-
contigs_classified[genus[0] + ". " + prediction][
|
|
740
|
-
5
|
|
741
|
-
] += contigs_filtered
|
|
742
|
-
if prediction == save_contigs:
|
|
743
|
-
contig_header += [sequence.description]
|
|
744
|
-
contig_seq += [str(sequence.seq)]
|
|
745
|
-
# scores.append(str(max(score)))
|
|
746
|
-
else:
|
|
747
|
-
# Important! Resetting the kmer_hits_single otherwise MEMORY LEAK
|
|
748
|
-
BF.kmer_hits_single = []
|
|
749
|
-
for sequence in SeqIO.parse(paths[i], "fasta"):
|
|
750
|
-
for j in range(0, len(sequence.seq) - BF.k, mode):
|
|
751
|
-
BF.number_of_kmeres += 1
|
|
752
|
-
kmer = str(sequence.seq[j : j + BF.k])
|
|
753
|
-
kmer_reversed = str(Seq.Seq(kmer).reverse_complement())
|
|
754
|
-
if kmer > kmer_reversed:
|
|
755
|
-
BF.lookup(kmer)
|
|
756
|
-
else:
|
|
757
|
-
BF.lookup(kmer_reversed)
|
|
758
|
-
|
|
759
|
-
score = BF.get_score()
|
|
760
|
-
# print("Scores: ", score)
|
|
761
|
-
if metagenome:
|
|
762
|
-
# map kmers to genome for HGT detection
|
|
763
|
-
# change later to new functions this is OLD
|
|
764
|
-
if False:
|
|
765
|
-
for prediction in contigs_classified:
|
|
766
|
-
kmers = contigs_classified[prediction][5]
|
|
767
|
-
# Strip "A."
|
|
768
|
-
prediction = prediction[2:]
|
|
769
|
-
# kmer mapping to genome, start by loading the kmer_dict in
|
|
770
|
-
path_pos = (
|
|
771
|
-
"filter\kmer_positions\Acinetobacter\\"
|
|
772
|
-
+ prediction
|
|
773
|
-
+ "_positions.txt"
|
|
774
|
-
)
|
|
775
|
-
# delete later
|
|
776
|
-
path_posv2 = (
|
|
777
|
-
"filter\kmer_positions\Acinetobacter\\"
|
|
778
|
-
+ prediction
|
|
779
|
-
+ "_complete_positions.txt"
|
|
780
|
-
)
|
|
781
|
-
# cluster kmers to contigs
|
|
782
|
-
# delete try later
|
|
783
|
-
try:
|
|
784
|
-
with open(path_pos, "rb") as fp:
|
|
785
|
-
kmer_dict = pickle.load(fp)
|
|
786
|
-
except:
|
|
787
|
-
with open(path_posv2, "rb") as fp:
|
|
788
|
-
kmer_dict = pickle.load(fp)
|
|
789
|
-
contig_amounts_distances = bs.cluster_kmers(kmers, kmer_dict)
|
|
790
|
-
contigs_classified[genus[0] + ". " + prediction][
|
|
791
|
-
6
|
|
792
|
-
] = contig_amounts_distances
|
|
793
|
-
# del kmer_dict
|
|
794
|
-
for key, value in contigs_classified.items():
|
|
795
|
-
number_of_contigs = value[1]
|
|
796
|
-
# save results
|
|
797
|
-
results_clustering = [
|
|
798
|
-
[
|
|
799
|
-
key
|
|
800
|
-
+ ","
|
|
801
|
-
+ str(statistics.median(value[0]))
|
|
802
|
-
+ ","
|
|
803
|
-
+ str(number_of_contigs),
|
|
804
|
-
str(statistics.median(value[2]))
|
|
805
|
-
+ ","
|
|
806
|
-
+ str(round(value[3] / number_of_contigs, 2))
|
|
807
|
-
+ ","
|
|
808
|
-
+ str(statistics.median(value[4])),
|
|
809
|
-
]
|
|
810
|
-
]
|
|
811
|
-
# with open(r'Results/XspecT_mini_csv/Results_Clustering.csv', 'a', newline='') as file:
|
|
812
|
-
# writer = csv.writer(file)
|
|
813
|
-
# writer.writerows(results_clustering)
|
|
814
|
-
value[0] = "Score Median: " + str(statistics.median(value[0]))
|
|
815
|
-
value[1] = "Number of Contigs: " + str(number_of_contigs)
|
|
816
|
-
value[2] = "Contig-Length Median: " + str(
|
|
817
|
-
statistics.median(value[2])
|
|
818
|
-
)
|
|
819
|
-
value[3] = "Repetiviness: " + str(
|
|
820
|
-
round(value[3] / number_of_contigs, 2)
|
|
821
|
-
)
|
|
822
|
-
value[4] = "Bootstrap Median: " + str(statistics.median(value[4]))
|
|
823
|
-
# value[6] = "Clusters: " + str(value[6])
|
|
824
|
-
contigs_classified[key] = value
|
|
825
|
-
print("Species: ", key)
|
|
826
|
-
print(value[0])
|
|
827
|
-
print(value[1])
|
|
828
|
-
print(value[2])
|
|
829
|
-
print(value[3])
|
|
830
|
-
print(value[4])
|
|
831
|
-
print(value[6])
|
|
832
|
-
print()
|
|
833
|
-
|
|
834
|
-
save_contigs = "none"
|
|
835
|
-
if save_contigs != "none":
|
|
836
|
-
with open(r"Results/Contigs_saved.fasta", "w") as file:
|
|
837
|
-
for j in range(len(contig_header)):
|
|
838
|
-
file.write(contig_header[j] + "\n")
|
|
839
|
-
file.write(contig_seq[j] + "\n")
|
|
840
|
-
file.write("\n")
|
|
841
|
-
elif file_format == "fastq" or file_format == "fq":
|
|
842
|
-
if metagenome:
|
|
843
|
-
# ---------------------------------------------------------------------------------------------
|
|
844
|
-
# initialize variables
|
|
845
|
-
BF_1_1.kmer_hits_single = []
|
|
846
|
-
BF_1_1.number_of_kmeres = 0
|
|
847
|
-
BF_1_1.hits_per_filter = [0] * BF.clonetypes
|
|
848
|
-
counter = 0
|
|
849
|
-
reads = []
|
|
850
|
-
reads_classified = {}
|
|
851
|
-
reads_passed = 0
|
|
852
|
-
ambiguous_reads = 0
|
|
853
|
-
|
|
854
|
-
# ---------------------------------------------------------------------------------------------
|
|
855
|
-
# First prefiltering step: Check if read contains at least 3 kmeres
|
|
856
|
-
for sequence in SeqIO.parse(paths[i], "fastq"):
|
|
857
|
-
dna_composition = {}
|
|
858
|
-
dna_composition = calculate_dna_composition(sequence.seq)
|
|
859
|
-
BF_1_1.kmer_hits_single = []
|
|
860
|
-
BF_1_1.number_of_kmeres = 0
|
|
861
|
-
BF_1_1.hits_per_filter = [0] * BF.clonetypes
|
|
862
|
-
# reverse_sequence = sequence.seq.reverse_complement()
|
|
863
|
-
read_kmers = []
|
|
864
|
-
reads = []
|
|
865
|
-
if counter < read_amount:
|
|
866
|
-
counter += 1
|
|
867
|
-
else:
|
|
868
|
-
break
|
|
869
|
-
k1 = str(sequence.seq[0 : BF_1_1.k]) # first k-mer
|
|
870
|
-
k2 = str(
|
|
871
|
-
sequence.seq[len(str(sequence.seq)) - BF_1_1.k :]
|
|
872
|
-
) # last k-mer
|
|
873
|
-
mid = len(str(sequence.seq)) // 2
|
|
874
|
-
k3 = str(sequence.seq[mid : mid + BF_1_1.k]) # k-mer in middle
|
|
875
|
-
k4 = str(sequence.seq[BF_1_1.k : BF_1_1.k * 2])
|
|
876
|
-
k5 = str(sequence.seq[mid + BF_1_1.k : mid + BF_1_1.k * 2])
|
|
877
|
-
# Taking sum of list as reference, if sum has not increased after testing those 3 kmeres,
|
|
878
|
-
# then the read won't be tested further
|
|
879
|
-
hit_sum = sum(BF_1_1.hits_per_filter)
|
|
880
|
-
hits_per_filter_copy = BF_1_1.hits_per_filter[:]
|
|
881
|
-
# sample_size = int(len(str(sequence.seq)) ** 0.5)
|
|
882
|
-
# threshold_read = sample_size * 0.7
|
|
883
|
-
# for i in range(0, len(str(sequence.seq)) - BF_1_1.k, sample_size):
|
|
884
|
-
# if "N" not in str(sequence.seq[i: i + BF_1_1.k]):
|
|
885
|
-
# BF_1_1.lookup(str(sequence.seq[i: i + BF_1_1.k]))
|
|
886
|
-
if "N" not in str(sequence.seq):
|
|
887
|
-
BF_1_1.lookup(k1)
|
|
888
|
-
BF_1_1.lookup(k2)
|
|
889
|
-
BF_1_1.lookup(k3)
|
|
890
|
-
BF_1_1.lookup(k4)
|
|
891
|
-
BF_1_1.lookup(k5)
|
|
892
|
-
else:
|
|
893
|
-
continue
|
|
894
|
-
# needs at least 2 of 3 hits to continue with read
|
|
895
|
-
if (sum(BF_1_1.hits_per_filter) - hit_sum) > 3:
|
|
896
|
-
for j in range(len(str(sequence.seq)) - BF_1_1.k):
|
|
897
|
-
read_kmers.append(str(sequence.seq[j : j + BF_1_1.k]))
|
|
898
|
-
# read_kmers.append(str(reverse_sequence[j: j + BF_1_1.k]))
|
|
899
|
-
reads.append(read_kmers)
|
|
900
|
-
BF_1_1.hits_per_filter = hits_per_filter_copy
|
|
901
|
-
else:
|
|
902
|
-
# resetting hit counter
|
|
903
|
-
BF_1_1.hits_per_filter = hits_per_filter_copy
|
|
904
|
-
continue
|
|
905
|
-
|
|
906
|
-
# ---------------------------------------------------------------------------------------------
|
|
907
|
-
# Second prefiltering step: Check if read contains at least 80% of kmers from one species
|
|
908
|
-
# reads_filtered = set()
|
|
909
|
-
reads_filtered = []
|
|
910
|
-
for i in range(len(reads)):
|
|
911
|
-
threshold = 0
|
|
912
|
-
for j in range(len(reads[i])):
|
|
913
|
-
BF_1_1.number_of_kmeres += 1
|
|
914
|
-
hits_per_filter_copy = BF_1_1.hits_per_filter[:]
|
|
915
|
-
if "N" not in reads[i][j]:
|
|
916
|
-
BF_1_1.lookup(reads[i][j])
|
|
917
|
-
if hits_per_filter_copy != BF_1_1.hits_per_filter:
|
|
918
|
-
threshold += 1
|
|
919
|
-
if threshold >= 0.7 * len(reads[i]):
|
|
920
|
-
reads_filtered += reads[i]
|
|
921
|
-
if len(reads_filtered) == 0:
|
|
922
|
-
continue
|
|
923
|
-
|
|
924
|
-
# ---------------------------------------------------------------------------------------------
|
|
925
|
-
# Start of the actual classification
|
|
926
|
-
BF.number_of_kmeres = 0
|
|
927
|
-
BF.hits_per_filter = [0] * BF.clonetypes
|
|
928
|
-
BF.kmer_hits_single = []
|
|
929
|
-
for kmer in reads_filtered:
|
|
930
|
-
if "N" not in kmer:
|
|
931
|
-
BF.number_of_kmeres += 1
|
|
932
|
-
kmer_reversed = str(Seq.Seq(kmer).reverse_complement())
|
|
933
|
-
if kmer > kmer_reversed:
|
|
934
|
-
BF.lookup(kmer)
|
|
935
|
-
else:
|
|
936
|
-
BF.lookup(kmer_reversed)
|
|
937
|
-
else:
|
|
938
|
-
continue
|
|
939
|
-
score = BF.get_score()
|
|
940
|
-
score_edit = [str(x) for x in score]
|
|
941
|
-
score_edit = ",".join(score_edit)
|
|
942
|
-
|
|
943
|
-
# making prediction
|
|
944
|
-
index_result = max(range(len(score)), key=score.__getitem__)
|
|
945
|
-
prediction = names[index_result]
|
|
946
|
-
if max(score) == sorted(score)[-2]:
|
|
947
|
-
ambiguous_reads += 1
|
|
948
|
-
# print("Ambiguous read")
|
|
949
|
-
# continue
|
|
950
|
-
|
|
951
|
-
# ---------------------------------------------------------------------------------------------
|
|
952
|
-
# bootstrapping
|
|
953
|
-
bootstrap_n = 100
|
|
954
|
-
samples = bs.bootstrap(
|
|
955
|
-
BF.kmer_hits_single, BF.number_of_kmeres, bootstrap_n
|
|
956
|
-
)
|
|
957
|
-
sample_scores = bs.bootstrap_scores(
|
|
958
|
-
samples, BF.number_of_kmeres, BF.clonetypes
|
|
959
|
-
)
|
|
960
|
-
bootstrap_score = 0
|
|
961
|
-
bootstrap_predictions = []
|
|
962
|
-
for i in range(len(sample_scores)):
|
|
963
|
-
if max(sample_scores[i]) != sorted(sample_scores[i])[-2]:
|
|
964
|
-
bootstrap_predictions.append(
|
|
965
|
-
names[
|
|
966
|
-
max(
|
|
967
|
-
range(len(sample_scores[i])),
|
|
968
|
-
key=sample_scores[i].__getitem__,
|
|
969
|
-
)
|
|
970
|
-
]
|
|
971
|
-
)
|
|
972
|
-
if (
|
|
973
|
-
max(
|
|
974
|
-
range(len(sample_scores[i])),
|
|
975
|
-
key=sample_scores[i].__getitem__,
|
|
976
|
-
)
|
|
977
|
-
== index_result
|
|
978
|
-
):
|
|
979
|
-
bootstrap_score += 1
|
|
980
|
-
else:
|
|
981
|
-
continue
|
|
982
|
-
bootstrap_score = bootstrap_score / bootstrap_n
|
|
983
|
-
|
|
984
|
-
# ---------------------------------------------------------------------------------------------
|
|
985
|
-
# HGT identification pipeline start
|
|
986
|
-
|
|
987
|
-
# skip species clear reads
|
|
988
|
-
# if max(score) <= 0.9:
|
|
989
|
-
# identify split reads from HGT
|
|
990
|
-
# split_regions = map.identify_split_reads(score, BF.kmer_hits_single)
|
|
991
|
-
|
|
992
|
-
# split_read contains touples --> ([first part of list, second part of list], index of species)
|
|
993
|
-
|
|
994
|
-
# check if it is in fact a split read , 0.6 is arbitrary value, it is the threshold for the difference between the two regions
|
|
995
|
-
# if abs(sum(split_regions[0][0]) - sum(split_regions[0][1])) > 0.6:
|
|
996
|
-
# get the species names
|
|
997
|
-
# acceptor_species = names[split_regions[0][1]]
|
|
998
|
-
# donor_species = names[split_regions[1][1]]
|
|
999
|
-
# donor_acceptor = [donor_species, acceptor_species]
|
|
1000
|
-
# else:
|
|
1001
|
-
# donor_acceptor = [None]
|
|
1002
|
-
|
|
1003
|
-
# ---------------------------------------------------------------------------------------------
|
|
1004
|
-
# Collect results from classification
|
|
1005
|
-
if (genus[0] + ". " + prediction) not in reads_classified:
|
|
1006
|
-
reads_classified[genus[0] + ". " + prediction] = [
|
|
1007
|
-
max(score),
|
|
1008
|
-
1,
|
|
1009
|
-
sorted(score)[-2] / max(score),
|
|
1010
|
-
BF.number_of_kmeres,
|
|
1011
|
-
[bootstrap_score],
|
|
1012
|
-
reads_filtered,
|
|
1013
|
-
None,
|
|
1014
|
-
]
|
|
1015
|
-
else:
|
|
1016
|
-
reads_classified[genus[0] + ". " + prediction][1] += 1
|
|
1017
|
-
reads_classified[genus[0] + ". " + prediction][0] += max(score)
|
|
1018
|
-
reads_classified[genus[0] + ". " + prediction][2] += sorted(
|
|
1019
|
-
score
|
|
1020
|
-
)[-2] / max(score)
|
|
1021
|
-
reads_classified[genus[0] + ". " + prediction][
|
|
1022
|
-
3
|
|
1023
|
-
] += BF.number_of_kmeres
|
|
1024
|
-
reads_classified[genus[0] + ". " + prediction][4] += [
|
|
1025
|
-
bootstrap_score
|
|
1026
|
-
]
|
|
1027
|
-
reads_classified[genus[0] + ". " + prediction][
|
|
1028
|
-
5
|
|
1029
|
-
] += reads_filtered
|
|
1030
|
-
# reads_classified[genus[0] + ". " + prediction][7] += [dna_composition]
|
|
1031
|
-
# reads_classified[genus[0] + ". " + prediction][8] += [donor_acceptor]
|
|
1032
|
-
|
|
1033
|
-
else:
|
|
1034
|
-
# classification for sequence pure reads, check every 10th kmer (or everyone for "complete" mode)
|
|
1035
|
-
counter = 0
|
|
1036
|
-
# Important! Resetting the kmer_hits_single otherwise MEMORY LEAK
|
|
1037
|
-
BF.kmer_hits_single = []
|
|
1038
|
-
for sequence in SeqIO.parse(paths[i], "fastq"):
|
|
1039
|
-
if counter < read_amount:
|
|
1040
|
-
counter += 1
|
|
1041
|
-
for j in range(0, len(sequence.seq) - BF.k + 1, mode):
|
|
1042
|
-
BF.number_of_kmeres += 1
|
|
1043
|
-
kmer = str(sequence.seq[j : j + BF.k])
|
|
1044
|
-
kmer_reversed = str(Seq.Seq(kmer).reverse_complement())
|
|
1045
|
-
if kmer > kmer_reversed:
|
|
1046
|
-
BF.lookup(kmer)
|
|
1047
|
-
else:
|
|
1048
|
-
BF.lookup(kmer_reversed)
|
|
1049
|
-
else:
|
|
1050
|
-
break
|
|
1051
|
-
score = BF.get_score()
|
|
1052
|
-
|
|
1053
|
-
if metagenome:
|
|
1054
|
-
# ---------------------------------------------------------------------------------------------
|
|
1055
|
-
# map kmers to genome for HGT detection
|
|
1056
|
-
if False:
|
|
1057
|
-
# map and cluster single reads to genome
|
|
1058
|
-
read_clusters = []
|
|
1059
|
-
for prediction in reads_classified:
|
|
1060
|
-
# load list of kmers from read
|
|
1061
|
-
kmers = reads_classified[prediction][5]
|
|
1062
|
-
# Strip genus name
|
|
1063
|
-
prediction = prediction[2:]
|
|
1064
|
-
|
|
1065
|
-
# kmer mapping to genome, start by loading the kmer_dict in
|
|
1066
|
-
path_pos = (
|
|
1067
|
-
"filter\kmer_positions\Acinetobacter\\"
|
|
1068
|
-
+ prediction
|
|
1069
|
-
+ "_positions.txt"
|
|
1070
|
-
)
|
|
1071
|
-
# delete later
|
|
1072
|
-
path_posv2 = (
|
|
1073
|
-
"filter\kmer_positions\Acinetobacter\\"
|
|
1074
|
-
+ prediction
|
|
1075
|
-
+ "_complete_positions.txt"
|
|
1076
|
-
)
|
|
1077
|
-
# cluster kmers to reads
|
|
1078
|
-
# delete try later
|
|
1079
|
-
try:
|
|
1080
|
-
with open(path_pos, "rb") as fp:
|
|
1081
|
-
kmer_dict = pickle.load(fp)
|
|
1082
|
-
except:
|
|
1083
|
-
with open(path_posv2, "rb") as fp:
|
|
1084
|
-
kmer_dict = pickle.load(fp)
|
|
1085
|
-
test = map.map_kmers(kmers, kmer_dict, genus)
|
|
1086
|
-
clusters = map.cluster_kmers(kmers, kmer_dict)
|
|
1087
|
-
read_clusters.append(clusters)
|
|
1088
|
-
reads_classified[genus[0] + ". " + prediction][
|
|
1089
|
-
6
|
|
1090
|
-
] = reads_amounts_distances
|
|
1091
|
-
# del kmer_dict
|
|
1092
|
-
|
|
1093
|
-
# now cluster mappings of multiple reads to genome
|
|
1094
|
-
for cluster in read_clusters:
|
|
1095
|
-
# TODO
|
|
1096
|
-
continue
|
|
1097
|
-
|
|
1098
|
-
# ---------------------------------------------------------------------------------------------
|
|
1099
|
-
# Collect results from classification
|
|
1100
|
-
for key, value in reads_classified.items():
|
|
1101
|
-
if key == "unknown":
|
|
1102
|
-
continue
|
|
1103
|
-
value.insert(2, value[0] / value[1])
|
|
1104
|
-
value.pop(0)
|
|
1105
|
-
reads_classified[key] = value
|
|
1106
|
-
print(
|
|
1107
|
-
key,
|
|
1108
|
-
value[0],
|
|
1109
|
-
round(value[1], 2),
|
|
1110
|
-
round(value[2] / value[0], 2),
|
|
1111
|
-
round(value[3] / value[0], 2),
|
|
1112
|
-
statistics.median(value[4]),
|
|
1113
|
-
)
|
|
1114
|
-
score_edit = [str(x) for x in score]
|
|
1115
|
-
score_edit = ",".join(score_edit)
|
|
1116
|
-
# making prediction
|
|
1117
|
-
if not metagenome:
|
|
1118
|
-
# prediction = Classifier.classify(r'Training_data/Training_data_spec.csv', score, True)
|
|
1119
|
-
# Phillip
|
|
1120
|
-
# file_name = genus + "_Training_data_spec.csv"
|
|
1121
|
-
# path = Path(__file__).parent.absolute() / "Training_data" / file_name
|
|
1122
|
-
# prediction = Classifier.classify(path, score, True)
|
|
1123
|
-
# SVM TURNED OFF TEsting!!
|
|
1124
|
-
index_result = max(range(len(score)), key=score.__getitem__)
|
|
1125
|
-
prediction = names[index_result]
|
|
1126
|
-
names_copy = names[:]
|
|
1127
|
-
# sort score by descending order and names_copy accordingly
|
|
1128
|
-
score, names_copy = zip(*sorted(zip(score, names_copy), reverse=True))
|
|
1129
|
-
# print(score[0:3])
|
|
1130
|
-
# print(names_copy[0:3])
|
|
1131
|
-
else:
|
|
1132
|
-
# Phillip
|
|
1133
|
-
# prediction_name = translation_dict[prediction]
|
|
1134
|
-
# predictions.append(prediction_name)
|
|
1135
|
-
index_result = max(range(len(score)), key=score.__getitem__)
|
|
1136
|
-
prediction = names[index_result]
|
|
1137
|
-
translation_dict = load_translation_dict(genus)
|
|
1138
|
-
predictions.append(translation_dict[prediction])
|
|
1139
|
-
scores.append(str(max(score)))
|
|
1140
|
-
print("Taxonomic assignment done...")
|
|
1141
|
-
return predictions, scores
|
|
1142
|
-
|
|
1143
|
-
|
|
1144
|
-
def clAssT(BF_2, files, paths, file_format, read_amount):
    """Strain-type every input file on sub-type (IC) level with the ClAssT filter.

    :param BF_2: Bloom-filter object providing ``k``, ``clonetypes``,
        ``lookup`` and ``get_score``
    :param files: list of input file names (only its length is used for progress)
    :param paths: list of input file paths, parallel to *files*
    :param file_format: "fasta"/"fna" or "fastq"/"fq"
    :param read_amount: maximum number of fastq reads to use per file
    :return: tuple (predictions_ClAssT, scores_ClAssT), one entry per file
    """
    print("Starting strain-typing on sub-type-level...")
    predictions_ClAssT = []
    scores_ClAssT = []
    # progress dots at roughly 1/6, 1/3, 1/2, 2/3 and 5/6 of the inputs
    milestones = {
        int(len(files) / 6),
        int(len(files) / 3),
        int(len(files) / 2),
        int(len(files) / 1.5),
        int(len(files) / 1.2),
    }
    for idx in range(len(files)):
        if idx in milestones:
            print("...")
        # reset per-file filter state
        BF_2.number_of_kmeres = 0
        BF_2.hits_per_filter = [0] * BF_2.clonetypes
        if file_format in ("fasta", "fna"):
            for record in SeqIO.parse(paths[idx], "fasta"):
                # sample every 500th k-mer (originally every 10th)
                for pos in range(0, len(record.seq) - BF_2.k, 500):
                    BF_2.number_of_kmeres += 1
                    BF_2.lookup(str(record.seq[pos : pos + BF_2.k]))
        elif file_format in ("fastq", "fq"):
            used_reads = 0
            for record in SeqIO.parse(paths[idx], "fastq"):
                if used_reads >= read_amount:
                    break
                used_reads += 1
                # sample every 10th k-mer of each read
                for pos in range(0, len(record.seq) - BF_2.k + 1, 10):
                    BF_2.number_of_kmeres += 1
                    BF_2.lookup(str(record.seq[pos : pos + BF_2.k]))
        score_ClAssT = BF_2.get_score()
        score_edit_ClAssT = ",".join(str(x) for x in score_ClAssT)
        prediction_ClAssT = Classifier.classify(
            r"Training_data/Training_data_IC.csv",
            score_ClAssT,
            [True, True, True, True, True, True, True, True, False],
        )
        predictions_ClAssT.append(prediction_ClAssT)
        scores_ClAssT.append(str(max(score_ClAssT)))

    print("Strain-typing on sub-type-level done...")
    return predictions_ClAssT, scores_ClAssT
|
|
1188
|
-
|
|
1189
|
-
|
|
1190
|
-
def blaOXA(BF_3, files, paths, file_format, read_amount):
    """Screen every input file for blaOXA genes.

    Each file is first scored against the OXA-family filters; every family
    whose score reaches the 0.3 cut-off is then screened again with that
    family's individual blaOXA-gene filters.

    :param BF_3: dict mapping "OXA-families" and family names to Bloom-filter objects
    :param files: list of input file names (only its length is used for progress)
    :param paths: list of input file paths, parallel to *files*
    :param file_format: "fasta"/"fna" or "fastq"/"fq"
    :param read_amount: maximum number of fastq reads to screen per file
    :return: tuple (scores_oxa, scores_oxa_ind) -- per-file family score
        dicts (or the string "None" when nothing passed the cut-off) and
        per-family [best individual gene, score] entries
    """
    start = time.time()
    print("Start screening for blaOXA-genes...")
    paths_oxa = sorted(os.listdir(r"filter/OXAs/families"))
    BF_families = BF_3["OXA-families"]
    oxas = []
    scores_oxa = []
    scores_oxa_ind = []
    # family names are the file names minus their 4-character extension
    for i in paths_oxa:
        oxas.append(i[:-4])
    # print("OXA-families: ", oxas) # correct
    for i in range(len(files)):
        oxa_dic = {}
        # progress dots at roughly 1/6, 1/3, 1/2, 2/3 and 5/6 of the inputs
        if (
            i == int(len(files) / 6)
            or i == int(len(files) / 3)
            or i == int(len(files) / 2)
            or i == int(len(files) / 1.5)
            or i == int(len(files) / 1.2)
        ):
            print("...")
        # Checking file type
        # if the file is fasta -> concat lines
        reads = []
        # reset per-file filter state before scoring
        BF_families.number_of_kmeres = 0
        BF_families.hits_per_filter = [0] * BF_families.clonetypes
        BF_families.table = OXATable()
        BF_families.table.read_dic(r"filter/OXAs_dict/oxa_dict.txt")
        if file_format == "fasta" or file_format == "fna":
            for sequence in SeqIO.parse(paths[i], "fasta"):
                reads.append(str(sequence.seq))
            BF_families.lookup_oxa(reads, ".fna")
        elif file_format == "fastq" or file_format == "fq":
            counter = 0
            for sequence in SeqIO.parse(paths[i], "fastq"):
                if counter < read_amount:
                    counter += 1
                    reads.append(str(sequence.seq))
                else:
                    break
            BF_families.lookup_oxa(reads, ".fq")
            # print("Reads used: ", counter)
        score_oxa = BF_families.get_oxa_score()
        # print("Score: ", score_oxa)
        # NOTE(review): the loops below rebind the outer loop variable `i`;
        # harmless in Python (the outer `for` reassigns it each iteration)
        # but easy to misread.
        for i in range(len(oxas)):
            oxa_dic[oxas[i]] = score_oxa[i]
        # drop families below the 0.3 cut-off; range() is evaluated once, so
        # every family is still visited while entries are being deleted
        for i in range(len(oxa_dic)):
            if oxa_dic[oxas[i]] < 0.3:
                del oxa_dic[oxas[i]]
        if len(oxa_dic) == 0:
            # sentinel string marks "no family passed" for this file
            oxa_dic = "None"
        if oxa_dic != "None":
            # sort families by ascending score
            oxa_dic = dict(sorted(oxa_dic.items(), key=lambda item: item[1]))
        scores_oxa.append(oxa_dic)
        # prepare data for next taxonomic level
        oxa_names = []
        # print(oxa_dic)
        # strip the "-family"-style 7-character suffix to get the plain name;
        # iterating the "None" sentinel yields its characters, which produce
        # empty names and are caught by the sentinel check below
        for oxa_family in oxa_dic:
            oxa_names.append(oxa_family[:-7])
        for oxa_family in oxa_names:
            if oxa_dic == "None":
                scores_oxa_ind.append(["None", 0])
                break
            # print("blaOXA: ", oxa_dic)
            oxa_dic_ind = {}
            ## TODO:
            # print("blaOXA: ", oxa_family)
            BF_ind = BF_3[oxa_family]
            BF_ind.number_of_kmeres = 0
            BF_ind.hits_per_filter = [0] * BF_ind.clonetypes
            BF_ind.table = OXATable()
            BF_ind.table.read_dic(r"filter/OXAs_dict/oxa_dict.txt")
            paths_oxa = sorted(os.listdir(r"filter/OXAs/individual/" + oxa_family))
            oxas_ind = []
            for i in paths_oxa:
                oxas_ind.append(i[:-4])
            if file_format == "fasta" or file_format == "fna":
                BF_ind.lookup_oxa(reads, ".fna")
            elif file_format == "fastq" or file_format == "fq":
                BF_ind.lookup_oxa(reads, ".fq")
            score_oxa = BF_ind.get_oxa_score()
            # build dict with oxa-gen and its score
            for i in range(len(oxas_ind)):
                oxa_dic_ind[oxas_ind[i]] = score_oxa[i]
            # filter dict by score
            if len(oxa_dic_ind) == 0 or max(oxa_dic_ind.values()) < 0.3:
                scores_oxa_ind.append("None")
            else:
                scores_oxa_ind.append(
                    [
                        max(oxa_dic_ind, key=oxa_dic_ind.get),
                        oxa_dic_ind[max(oxa_dic_ind, key=oxa_dic_ind.get)],
                    ]
                )
    end = time.time()
    needed = round(end - start, 2)
    print("Time needed: ", needed)
    print("Screening for blaOXA-genes done...")
    return scores_oxa, scores_oxa_ind
|
|
1289
|
-
|
|
1290
|
-
|
|
1291
|
-
def calculate_dna_composition(sequence):
    """Calculate the relative A/C/G/T composition of a DNA sequence.

    Characters other than A, C, G and T (e.g. "N") are ignored and do not
    count towards the total.

    :param sequence: iterable of single-character bases (a str or Bio Seq)
    :return: dict mapping "A", "C", "G", "T" to their fraction of the
        counted bases, rounded to 2 decimals; all zeros when the sequence
        is empty or contains no A/C/G/T bases
    """
    composition = {"A": 0, "C": 0, "G": 0, "T": 0}

    total = 0
    for base in sequence:
        if base in composition:
            composition[base] += 1
            total += 1
    if total == 0:
        # guard against ZeroDivisionError on empty / all-N sequences
        return {base: 0.0 for base in composition}
    for base in composition:
        composition[base] = round(composition[base] / total, 2)

    return composition
|
|
1304
|
-
|
|
1305
|
-
|
|
1306
|
-
def main():
    """Parse CLI arguments and call the XspecT mini pipeline.

    Expected call shape::

        XspecT_mini.py <genus> [XspecT] [ClAssT] [Oxa] [Metagenome]
            <fasta|fna|fa | fastq|fq <read_amount>> [save] [complete] <file_path>

    Exits with a message on unknown genus or malformed format arguments.
    """
    arg_list = sys.argv
    # Phillip
    genus = arg_list[1]
    genera = search_filter.get_genera_array_sizes()
    genera = list(genera.keys())

    if genus not in genera:
        print(f"{genus} is unknown.")
        quit()

    xspect = "XspecT" in arg_list or "xspect" in arg_list

    # NOTE: conditions are parenthesised explicitly -- "and" binds tighter
    # than "or", so the original unparenthesised test enabled ClAssT/Oxa for
    # any genus when the capitalized flag was used, and could leave the flag
    # variable unbound (NameError) in the "unavailable" branch.
    classt = False
    if ("ClAssT" in arg_list) or ("classt" in arg_list):
        if genus == "Acinetobacter":
            classt = True
        else:
            print(f"ClAssT unavailable for {genus}")
    oxa = False
    if ("Oxa" in arg_list) or ("oxa" in arg_list):
        if genus == "Acinetobacter":
            oxa = True
        else:
            print(f"Oxa unavailable for {genus}")

    metagenome = "Metagenome" in arg_list or "metagenome" in arg_list

    if ("fasta" in arg_list) or ("fna" in arg_list) or ("fa" in arg_list):
        file_format = "fasta"
        read_amount = 342480
    elif ("fastq" in arg_list) or ("fq" in arg_list):
        file_format = "fastq"
        # accept either spelling; the original only looked up "fastq" and
        # raised ValueError when the user passed "fq"
        if "fastq" in arg_list:
            index = arg_list.index("fastq")
        else:
            index = arg_list.index("fq")
        # bounds check avoids IndexError when the flag is the last argument
        if index + 1 < len(arg_list) and arg_list[index + 1].isdigit():
            read_amount = int(arg_list[index + 1])
        else:
            print("Error: Wrong Input, use a number after fastq!")
            quit()
    else:
        print("Error: Wrong Input, use fasta/fna/fa or fastq/fq!")
        quit()

    csv_table = "save" in arg_list or "Save" in arg_list
    # "complete" checks every k-mer; default samples every 500th
    mode = 1 if ("complete" in arg_list or "Complete" in arg_list) else 500

    file_path = arg_list[-1]

    xspecT_mini(
        file_path,
        xspect,
        classt,
        oxa,
        file_format,
        read_amount,
        csv_table,
        metagenome,
        genus,
        mode,
    )
|
|
1374
|
-
|
|
1375
|
-
|
|
1376
|
-
# CLI entry point: run the pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|