XspecT 0.1.2__py3-none-any.whl → 0.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of XspecT might be problematic. Click here for more details.
- {XspecT-0.1.2.dist-info → XspecT-0.1.3.dist-info}/METADATA +1 -1
- {XspecT-0.1.2.dist-info → XspecT-0.1.3.dist-info}/RECORD +13 -12
- xspect/BF_v2.py +25 -36
- xspect/WebApp.py +15 -28
- xspect/XspecT_mini.py +15 -29
- xspect/main.py +1 -1
- xspect/search_filter.py +8 -8
- xspect/train_filter/README_XspecT_Erweiterung.md +119 -0
- xspect/train_filter/create_svm.py +1 -1
- {XspecT-0.1.2.dist-info → XspecT-0.1.3.dist-info}/LICENSE +0 -0
- {XspecT-0.1.2.dist-info → XspecT-0.1.3.dist-info}/WHEEL +0 -0
- {XspecT-0.1.2.dist-info → XspecT-0.1.3.dist-info}/entry_points.txt +0 -0
- {XspecT-0.1.2.dist-info → XspecT-0.1.3.dist-info}/top_level.txt +0 -0
|
@@ -1,16 +1,16 @@
|
|
|
1
|
-
xspect/BF_v2.py,sha256=
|
|
1
|
+
xspect/BF_v2.py,sha256=05sp27VzxFtsjt2oyMyhW4aqNmUxGTori49j6lxo1BU,25392
|
|
2
2
|
xspect/Bootstrap.py,sha256=AYyEBo3MoOnPqhPAHe726mX8L9NuXDa5SATxZKLMv3s,830
|
|
3
3
|
xspect/Classifier.py,sha256=BgqpZiMYi2maaccTzJcgH2tjrtDH-U7COc7E4t4cQt8,3602
|
|
4
4
|
xspect/OXA_Table.py,sha256=1GxsyxMpUEgQirY0nJHtR3jl61DoPZh2Rb9L0VdMxD4,1632
|
|
5
|
-
xspect/WebApp.py,sha256=
|
|
6
|
-
xspect/XspecT_mini.py,sha256=
|
|
5
|
+
xspect/WebApp.py,sha256=H4NyfDELrqUSFKOGDLNSJxsNzfLsCX9_BJMln9UXQk0,24941
|
|
6
|
+
xspect/XspecT_mini.py,sha256=OApDXSVIZFK8ZNpNJRPYTlyOLszZbpkJt3jJC51hV8Q,54694
|
|
7
7
|
xspect/XspecT_trainer.py,sha256=6Gj2mltyVyM8Rsh5EU8tSCGMG7niYBLfId664zYaVXI,21703
|
|
8
8
|
xspect/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
9
9
|
xspect/download_filters.py,sha256=wSyX-IucjuKIEcVx-E0ClsA0XL0DI1FgMlO2UULgaXc,1048
|
|
10
10
|
xspect/file_io.py,sha256=IWae7xxAt-EmyEbxo0nDSe3RJHmLkQT5jNS2Z3qLKdg,4807
|
|
11
|
-
xspect/main.py,sha256=
|
|
11
|
+
xspect/main.py,sha256=xZxaXOUvA26OurM9DLsbqDzpUEvbtTZOs4916y2Ifdo,2948
|
|
12
12
|
xspect/map_kmers.py,sha256=63iTQS_GZZBK2DxjEs5xoI4KgfpZOntCKul06rrgi5w,6000
|
|
13
|
-
xspect/search_filter.py,sha256=
|
|
13
|
+
xspect/search_filter.py,sha256=a0n2VHmmfVqXaKwLLb478Lvb46kN2GwNgFOZmee1_xo,17261
|
|
14
14
|
xspect/static/How-To.png,sha256=QO6HydIHcL3oM9feMxmfZcKE8M62fIRl2xs_5S_NL5M,119621
|
|
15
15
|
xspect/static/Logo.png,sha256=bvOWMpqxmBigg9jEvZtIMOsXncbSFwnYu4eYNSf1_Qw,296095
|
|
16
16
|
xspect/static/Logo2.png,sha256=V7hpGb3XYLN5vEQQNJdpNjQX_F2A_f1wKAP8N37NwGs,292730
|
|
@@ -28,8 +28,9 @@ xspect/templates/layoutabout.html,sha256=ICC8g0DP8a7MLNrEYnXBgtnkwMjIktsimmqwqjM
|
|
|
28
28
|
xspect/templates/layouthome.html,sha256=6EtVD-L6xlTc7XGk77f9CARKW7JLpv2iiyUci1BK00A,2870
|
|
29
29
|
xspect/templates/layoutspecies.html,sha256=MNGSDEvuKFvgsyXoRLCu-rma10gntUI9vP_9a2sNl7M,24008
|
|
30
30
|
xspect/templates/species.html,sha256=rD9fCmSgyI8hRcmy56mNQH7VR5jnmtriv9WlvTIJJjE,2412
|
|
31
|
+
xspect/train_filter/README_XspecT_Erweiterung.md,sha256=Gn64Biz32LiUfQVYb43Hez6ihDTSFTLfEMwUJ_l1MGU,2879
|
|
31
32
|
xspect/train_filter/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
32
|
-
xspect/train_filter/create_svm.py,sha256=
|
|
33
|
+
xspect/train_filter/create_svm.py,sha256=sktMp8tmQWjcbWkiEzRizabB4qF9wVr7bds2jAvxx4Y,6830
|
|
33
34
|
xspect/train_filter/extract_and_concatenate.py,sha256=kXGqCrOk3TbOkKLJV8nKC6nL8Zg0TWKDCJu2gq8K_cw,5239
|
|
34
35
|
xspect/train_filter/get_paths.py,sha256=JXPbv_Fx5BKHZQ4bkSIGU7yj5zjkmhsI0Z6U4nU0gug,941
|
|
35
36
|
xspect/train_filter/html_scrap.py,sha256=iQXREhG37SNUx7gHoP8eqayMEIH00QLFMTNmIMogb_M,3799
|
|
@@ -40,9 +41,9 @@ xspect/train_filter/ncbi_api/download_assemblies.py,sha256=iX1qK8R6p2b3RiHPfqVsL
|
|
|
40
41
|
xspect/train_filter/ncbi_api/ncbi_assembly_metadata.py,sha256=RhHvxKiQ8HJgoSb6njYEgO_vPioBqEMPvT3lE2lHXp0,3766
|
|
41
42
|
xspect/train_filter/ncbi_api/ncbi_children_tree.py,sha256=pmzg6-fDGLinNSXNbBRv0v62lRgHxW4aXZ0uV1TJhOE,1793
|
|
42
43
|
xspect/train_filter/ncbi_api/ncbi_taxon_metadata.py,sha256=uhBBGffgL4mcJpyp9KxVyOGUh8FxUTAI4xKzoLDav_Y,1577
|
|
43
|
-
XspecT-0.1.
|
|
44
|
-
XspecT-0.1.
|
|
45
|
-
XspecT-0.1.
|
|
46
|
-
XspecT-0.1.
|
|
47
|
-
XspecT-0.1.
|
|
48
|
-
XspecT-0.1.
|
|
44
|
+
XspecT-0.1.3.dist-info/LICENSE,sha256=bhBGDKIRUVwYIHGOGO5hshzuVHyqFJajvSOA3XXOLKI,1094
|
|
45
|
+
XspecT-0.1.3.dist-info/METADATA,sha256=ulwNLqzESiHPxspAckJm3RkWXg5qf-T6KoNvfTnsH0g,5475
|
|
46
|
+
XspecT-0.1.3.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
|
|
47
|
+
XspecT-0.1.3.dist-info/entry_points.txt,sha256=L7qliX3pIuwupQxpuOSsrBJCSHYPOPNEzH8KZKQGGUw,43
|
|
48
|
+
XspecT-0.1.3.dist-info/top_level.txt,sha256=hdoa4cnBv6OVzpyhMmyxpJxEydH5n2lDciy8urc1paE,7
|
|
49
|
+
XspecT-0.1.3.dist-info/RECORD,,
|
xspect/BF_v2.py
CHANGED
|
@@ -48,6 +48,7 @@ class AbaumanniiBloomfilter:
|
|
|
48
48
|
] # names of the IC's
|
|
49
49
|
number_of_kmeres = 0 # counter of k-meres, will be used to calculate score
|
|
50
50
|
reads = 1000 # standard read number
|
|
51
|
+
kmer_hits_single = [] # kmer hits per filter
|
|
51
52
|
|
|
52
53
|
def __init__(self, arraysize):
|
|
53
54
|
"""creates empty matrix"""
|
|
@@ -181,8 +182,17 @@ class AbaumanniiBloomfilter:
|
|
|
181
182
|
|
|
182
183
|
return positions
|
|
183
184
|
|
|
185
|
+
def lookup_canonical(self, kmer, limit=False):
|
|
186
|
+
"""takes kmer input string and checks all clonetypes if the cononicalized kmer is inside that set of kmers"""
|
|
187
|
+
|
|
188
|
+
# canonicalize
|
|
189
|
+
complement = str(Seq(kmer).reverse_complement())
|
|
190
|
+
kmer = max(kmer, complement)
|
|
191
|
+
|
|
192
|
+
self.lookup(kmer, limit)
|
|
193
|
+
|
|
184
194
|
def lookup(self, kmer, limit=False):
|
|
185
|
-
"""
|
|
195
|
+
"""
|
|
186
196
|
takes kmer input string and checks all clonetypes if the k-mer is inside that set of kmers
|
|
187
197
|
"""
|
|
188
198
|
|
|
@@ -213,6 +223,8 @@ class AbaumanniiBloomfilter:
|
|
|
213
223
|
temp[i] += 1
|
|
214
224
|
self.hit = True
|
|
215
225
|
if limit:
|
|
226
|
+
# reset single kmer kit vector / memory management
|
|
227
|
+
self.kmer_hits_single = []
|
|
216
228
|
if self.table.lookup(self.names[i], kmer):
|
|
217
229
|
self.hits_per_filter[i] += 1
|
|
218
230
|
else:
|
|
@@ -274,11 +286,7 @@ class AbaumanniiBloomfilter:
|
|
|
274
286
|
continue
|
|
275
287
|
self.number_of_kmeres += 1
|
|
276
288
|
kmer = str(single_read[j : j + self.k])
|
|
277
|
-
|
|
278
|
-
if kmer > kmer_reversed:
|
|
279
|
-
self.lookup(kmer)
|
|
280
|
-
else:
|
|
281
|
-
self.lookup(kmer_reversed)
|
|
289
|
+
self.lookup_canonical(kmer)
|
|
282
290
|
# XspecT Sequence-Reads every 10th kmer
|
|
283
291
|
elif quick == 2:
|
|
284
292
|
for single_read in range(0, len(reads)):
|
|
@@ -291,11 +299,7 @@ class AbaumanniiBloomfilter:
|
|
|
291
299
|
# lookup for kmer
|
|
292
300
|
temp = reads[single_read]
|
|
293
301
|
kmer = str(temp[j : j + self.k])
|
|
294
|
-
|
|
295
|
-
if kmer > kmer_reversed:
|
|
296
|
-
self.lookup(kmer)
|
|
297
|
-
else:
|
|
298
|
-
self.lookup(kmer_reversed)
|
|
302
|
+
self.lookup_canonical(kmer)
|
|
299
303
|
if self.hit == True:
|
|
300
304
|
hit_counter += 1
|
|
301
305
|
elif quick == 3:
|
|
@@ -307,11 +311,7 @@ class AbaumanniiBloomfilter:
|
|
|
307
311
|
continue
|
|
308
312
|
self.number_of_kmeres += 1
|
|
309
313
|
kmer = str(single_read[j : j + self.k])
|
|
310
|
-
|
|
311
|
-
if kmer > kmer_reversed:
|
|
312
|
-
self.lookup(kmer)
|
|
313
|
-
else:
|
|
314
|
-
self.lookup(kmer_reversed)
|
|
314
|
+
self.lookup_canonical(kmer)
|
|
315
315
|
# metagenome mode
|
|
316
316
|
elif quick == 4:
|
|
317
317
|
print("Stage 1")
|
|
@@ -332,13 +332,8 @@ class AbaumanniiBloomfilter:
|
|
|
332
332
|
self.hits_per_filter = [0] * self.clonetypes
|
|
333
333
|
for kmer in read:
|
|
334
334
|
counter += 1
|
|
335
|
-
# lookup for kmer, use lexikographical smaller kmer
|
|
336
335
|
self.number_of_kmeres += 1
|
|
337
|
-
|
|
338
|
-
if kmer > kmer_reversed:
|
|
339
|
-
self.lookup(kmer)
|
|
340
|
-
else:
|
|
341
|
-
self.lookup(kmer_reversed)
|
|
336
|
+
self.lookup_canonical(kmer)
|
|
342
337
|
score = self.get_score()
|
|
343
338
|
score_edit = [str(x) for x in score]
|
|
344
339
|
score_edit = ",".join(score_edit)
|
|
@@ -383,9 +378,9 @@ class AbaumanniiBloomfilter:
|
|
|
383
378
|
bootstrap_score = bootstrap_score / bootstrap_n
|
|
384
379
|
# bootstrap_score = 1
|
|
385
380
|
|
|
386
|
-
if
|
|
381
|
+
if prediction not in reads_classified:
|
|
387
382
|
# Value 5 war vohrer = read
|
|
388
|
-
reads_classified[
|
|
383
|
+
reads_classified[prediction] = [
|
|
389
384
|
[max(score)],
|
|
390
385
|
1,
|
|
391
386
|
[len(read)],
|
|
@@ -395,13 +390,11 @@ class AbaumanniiBloomfilter:
|
|
|
395
390
|
None,
|
|
396
391
|
]
|
|
397
392
|
else:
|
|
398
|
-
reads_classified[
|
|
399
|
-
reads_classified[
|
|
400
|
-
reads_classified[
|
|
401
|
-
reads_classified[
|
|
402
|
-
|
|
403
|
-
)
|
|
404
|
-
reads_classified["A." + prediction][4] += [bootstrap_score]
|
|
393
|
+
reads_classified[prediction][0] += [max(score)]
|
|
394
|
+
reads_classified[prediction][1] += 1
|
|
395
|
+
reads_classified[prediction][2] += [len(read)]
|
|
396
|
+
reads_classified[prediction][3] += sorted(score)[-2] / max(score)
|
|
397
|
+
reads_classified[prediction][4] += [bootstrap_score]
|
|
405
398
|
# reads_classified["A." + prediction][5] += None
|
|
406
399
|
# tracker.print_diff()
|
|
407
400
|
# not ready yet
|
|
@@ -475,11 +468,7 @@ class AbaumanniiBloomfilter:
|
|
|
475
468
|
self.number_of_kmeres += 1
|
|
476
469
|
# lookup for kmer
|
|
477
470
|
kmer = str(single_read[j : j + self.k])
|
|
478
|
-
|
|
479
|
-
if kmer > kmer_reversed:
|
|
480
|
-
self.lookup(kmer)
|
|
481
|
-
else:
|
|
482
|
-
self.lookup(kmer_reversed)
|
|
471
|
+
self.lookup_canonical(kmer)
|
|
483
472
|
|
|
484
473
|
def cleanup(self):
|
|
485
474
|
"""deletes matrix"""
|
xspect/WebApp.py
CHANGED
|
@@ -12,7 +12,7 @@ import logging
|
|
|
12
12
|
import pickle
|
|
13
13
|
import secrets
|
|
14
14
|
import pandas as pd
|
|
15
|
-
from Bio import Entrez, Medline
|
|
15
|
+
from Bio import Entrez, Medline, SeqIO
|
|
16
16
|
from flask import (
|
|
17
17
|
Flask,
|
|
18
18
|
render_template,
|
|
@@ -139,35 +139,25 @@ def assignspec():
|
|
|
139
139
|
return redirect("/resultsspec")
|
|
140
140
|
|
|
141
141
|
else:
|
|
142
|
-
|
|
143
|
-
# if the file is fasta -> concat lines
|
|
144
|
-
ext = filename.split(".")[-2]
|
|
142
|
+
ext = filename.split(".")[-1]
|
|
145
143
|
with open(filename) as f:
|
|
146
144
|
reads = f.read().splitlines()
|
|
147
145
|
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
146
|
+
if ext == "fq" or ext == "fastq":
|
|
147
|
+
sequences = SeqIO.parse(filename, "fastq")
|
|
148
|
+
quick = 2
|
|
149
|
+
else:
|
|
152
150
|
if quick:
|
|
153
151
|
quick = 1
|
|
154
152
|
else:
|
|
155
153
|
quick = 0
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
else:
|
|
160
|
-
if metagenome:
|
|
161
|
-
quick = 4
|
|
162
|
-
else:
|
|
163
|
-
quick = 2
|
|
164
|
-
# deleting file
|
|
165
|
-
os.remove(filename)
|
|
154
|
+
sequences = SeqIO.parse(filename, "fasta")
|
|
155
|
+
|
|
156
|
+
reads = [str(sequence.seq).upper() for sequence in sequences]
|
|
166
157
|
|
|
167
|
-
for i in range(len(reads)):
|
|
168
|
-
reads[i] = reads[i].upper()
|
|
169
158
|
# starts the lookup for a given sequence
|
|
170
159
|
if metagenome:
|
|
160
|
+
quick = 4
|
|
171
161
|
start_meta = time.time()
|
|
172
162
|
reads, reads_oxa = read_search_pre(reads, BF_Master_prefilter, ext)
|
|
173
163
|
end_meta = time.time()
|
|
@@ -193,8 +183,8 @@ def assignspec():
|
|
|
193
183
|
# assign reads to species
|
|
194
184
|
species_dict = {}
|
|
195
185
|
predictions_names = set()
|
|
196
|
-
for
|
|
197
|
-
predictions_names.add(
|
|
186
|
+
for prediction in predictions:
|
|
187
|
+
predictions_names.add(prediction)
|
|
198
188
|
for species in predictions_names:
|
|
199
189
|
species_dict[species] = []
|
|
200
190
|
# dict with species as keys and reads as values for oxa search
|
|
@@ -323,11 +313,6 @@ def assignspec():
|
|
|
323
313
|
)
|
|
324
314
|
return redirect("/resultsspec")
|
|
325
315
|
|
|
326
|
-
app.logger.info(
|
|
327
|
-
"Assignment done for " + str(filename) + ", Time needed: " + str(needed)
|
|
328
|
-
)
|
|
329
|
-
return redirect("/resultsspec")
|
|
330
|
-
|
|
331
316
|
|
|
332
317
|
# about page
|
|
333
318
|
@app.route("/about")
|
|
@@ -511,6 +496,7 @@ def resultsspec():
|
|
|
511
496
|
|
|
512
497
|
elif metagenome:
|
|
513
498
|
reads_classified = session.get("reads_classified")
|
|
499
|
+
genus = session.get("genus")
|
|
514
500
|
# sort reads_classified by highest value of the second element
|
|
515
501
|
sorted_reads_classified = dict(
|
|
516
502
|
sorted(reads_classified.items(), key=lambda x: x[1][1], reverse=True)
|
|
@@ -518,8 +504,9 @@ def resultsspec():
|
|
|
518
504
|
# get key of reads_classified with highest value of the second element from the value
|
|
519
505
|
predictions = []
|
|
520
506
|
values = []
|
|
507
|
+
translation_dict = load_translation_dict(genus)
|
|
521
508
|
for key, value in sorted_reads_classified.items():
|
|
522
|
-
predictions.append(key)
|
|
509
|
+
predictions.append(translation_dict[key])
|
|
523
510
|
values.append(value[1])
|
|
524
511
|
clonetypes_sorted = predictions[:12]
|
|
525
512
|
values_sorted = values[:12]
|
xspect/XspecT_mini.py
CHANGED
|
@@ -608,7 +608,9 @@ def xspecT(BF, BF_1_1, files, paths, file_format, read_amount, metagenome, genus
|
|
|
608
608
|
threshold_contig = sample_size * 0.7
|
|
609
609
|
for i in range(0, len(str(sequence.seq)) - BF_1_1.k, sample_size):
|
|
610
610
|
if "N" not in str(sequence.seq[i : i + BF_1_1.k]):
|
|
611
|
-
BF_1_1.
|
|
611
|
+
BF_1_1.lookup_canonical(
|
|
612
|
+
str(sequence.seq[i : i + BF_1_1.k]).upper()
|
|
613
|
+
)
|
|
612
614
|
|
|
613
615
|
# needs at least 70% hits to continue with the contig
|
|
614
616
|
counter = 0
|
|
@@ -620,7 +622,7 @@ def xspecT(BF, BF_1_1, files, paths, file_format, read_amount, metagenome, genus
|
|
|
620
622
|
)
|
|
621
623
|
counter += 1
|
|
622
624
|
# how many kmers? to use
|
|
623
|
-
if counter >=
|
|
625
|
+
if counter >= 5000:
|
|
624
626
|
break
|
|
625
627
|
# contigs_kmers.append(str(reverse_sequence[j: j + BF_1_1.k]))
|
|
626
628
|
contigs.append(contigs_kmers)
|
|
@@ -638,7 +640,7 @@ def xspecT(BF, BF_1_1, files, paths, file_format, read_amount, metagenome, genus
|
|
|
638
640
|
for j in range(len(contigs[i])):
|
|
639
641
|
BF_1_1.number_of_kmeres += 1
|
|
640
642
|
hits_per_filter_copy = BF_1_1.hits_per_filter[:]
|
|
641
|
-
BF_1_1.
|
|
643
|
+
BF_1_1.lookup_canonical(contigs[i][j])
|
|
642
644
|
if hits_per_filter_copy != BF_1_1.hits_per_filter:
|
|
643
645
|
threshold += 1
|
|
644
646
|
# parameter value needs to be determined
|
|
@@ -654,11 +656,7 @@ def xspecT(BF, BF_1_1, files, paths, file_format, read_amount, metagenome, genus
|
|
|
654
656
|
BF.hits_per_filter = [0] * BF.clonetypes
|
|
655
657
|
for kmer in contigs_filtered:
|
|
656
658
|
BF.number_of_kmeres += 1
|
|
657
|
-
|
|
658
|
-
if kmer > kmer_reversed:
|
|
659
|
-
BF.lookup(kmer)
|
|
660
|
-
else:
|
|
661
|
-
BF.lookup(kmer_reversed)
|
|
659
|
+
BF.lookup_canonical(kmer)
|
|
662
660
|
score = BF.get_score()
|
|
663
661
|
score_edit = [str(x) for x in score]
|
|
664
662
|
score_edit = ",".join(score_edit)
|
|
@@ -750,11 +748,7 @@ def xspecT(BF, BF_1_1, files, paths, file_format, read_amount, metagenome, genus
|
|
|
750
748
|
for j in range(0, len(sequence.seq) - BF.k, mode):
|
|
751
749
|
BF.number_of_kmeres += 1
|
|
752
750
|
kmer = str(sequence.seq[j : j + BF.k])
|
|
753
|
-
|
|
754
|
-
if kmer > kmer_reversed:
|
|
755
|
-
BF.lookup(kmer)
|
|
756
|
-
else:
|
|
757
|
-
BF.lookup(kmer_reversed)
|
|
751
|
+
BF.lookup_canonical(kmer)
|
|
758
752
|
|
|
759
753
|
score = BF.get_score()
|
|
760
754
|
# print("Scores: ", score)
|
|
@@ -884,11 +878,11 @@ def xspecT(BF, BF_1_1, files, paths, file_format, read_amount, metagenome, genus
|
|
|
884
878
|
# if "N" not in str(sequence.seq[i: i + BF_1_1.k]):
|
|
885
879
|
# BF_1_1.lookup(str(sequence.seq[i: i + BF_1_1.k]))
|
|
886
880
|
if "N" not in str(sequence.seq):
|
|
887
|
-
BF_1_1.
|
|
888
|
-
BF_1_1.
|
|
889
|
-
BF_1_1.
|
|
890
|
-
BF_1_1.
|
|
891
|
-
BF_1_1.
|
|
881
|
+
BF_1_1.lookup_canonical(k1)
|
|
882
|
+
BF_1_1.lookup_canonical(k2)
|
|
883
|
+
BF_1_1.lookup_canonical(k3)
|
|
884
|
+
BF_1_1.lookup_canonical(k4)
|
|
885
|
+
BF_1_1.lookup_canonical(k5)
|
|
892
886
|
else:
|
|
893
887
|
continue
|
|
894
888
|
# needs at least 2 of 3 hits to continue with read
|
|
@@ -913,7 +907,7 @@ def xspecT(BF, BF_1_1, files, paths, file_format, read_amount, metagenome, genus
|
|
|
913
907
|
BF_1_1.number_of_kmeres += 1
|
|
914
908
|
hits_per_filter_copy = BF_1_1.hits_per_filter[:]
|
|
915
909
|
if "N" not in reads[i][j]:
|
|
916
|
-
BF_1_1.
|
|
910
|
+
BF_1_1.lookup_canonical(reads[i][j])
|
|
917
911
|
if hits_per_filter_copy != BF_1_1.hits_per_filter:
|
|
918
912
|
threshold += 1
|
|
919
913
|
if threshold >= 0.7 * len(reads[i]):
|
|
@@ -929,11 +923,7 @@ def xspecT(BF, BF_1_1, files, paths, file_format, read_amount, metagenome, genus
|
|
|
929
923
|
for kmer in reads_filtered:
|
|
930
924
|
if "N" not in kmer:
|
|
931
925
|
BF.number_of_kmeres += 1
|
|
932
|
-
|
|
933
|
-
if kmer > kmer_reversed:
|
|
934
|
-
BF.lookup(kmer)
|
|
935
|
-
else:
|
|
936
|
-
BF.lookup(kmer_reversed)
|
|
926
|
+
BF.lookup_canonical(kmer)
|
|
937
927
|
else:
|
|
938
928
|
continue
|
|
939
929
|
score = BF.get_score()
|
|
@@ -1041,11 +1031,7 @@ def xspecT(BF, BF_1_1, files, paths, file_format, read_amount, metagenome, genus
|
|
|
1041
1031
|
for j in range(0, len(sequence.seq) - BF.k + 1, mode):
|
|
1042
1032
|
BF.number_of_kmeres += 1
|
|
1043
1033
|
kmer = str(sequence.seq[j : j + BF.k])
|
|
1044
|
-
|
|
1045
|
-
if kmer > kmer_reversed:
|
|
1046
|
-
BF.lookup(kmer)
|
|
1047
|
-
else:
|
|
1048
|
-
BF.lookup(kmer_reversed)
|
|
1034
|
+
BF.lookup_canonical(kmer)
|
|
1049
1035
|
else:
|
|
1050
1036
|
break
|
|
1051
1037
|
score = BF.get_score()
|
xspect/main.py
CHANGED
|
@@ -56,6 +56,7 @@ def classify(genus, path, species, ic, oxa, metagenome, complete, save):
|
|
|
56
56
|
mode = 1
|
|
57
57
|
file_format = "fasta"
|
|
58
58
|
read_amount = 342480
|
|
59
|
+
print(mode)
|
|
59
60
|
|
|
60
61
|
xspecT_mini(
|
|
61
62
|
path,
|
|
@@ -111,7 +112,6 @@ def train(genus, bf_assembly_path, svm_assembly_path, complete, check):
|
|
|
111
112
|
@cli.command()
|
|
112
113
|
def web():
|
|
113
114
|
"""Open the XspecT web app."""
|
|
114
|
-
webbrowser.open("http://localhost:8000")
|
|
115
115
|
app.run(host="0.0.0.0", port=8000, debug=True, threaded=True)
|
|
116
116
|
|
|
117
117
|
|
xspect/search_filter.py
CHANGED
|
@@ -299,7 +299,7 @@ def read_search_pre(reads, BF_pre, ext):
|
|
|
299
299
|
threshold_read = sample_size * 0.7
|
|
300
300
|
for i in range(0, len(single_read) - BF_pre.k, sample_size):
|
|
301
301
|
if "N" not in single_read[i : i + BF_pre.k]:
|
|
302
|
-
BF_pre.
|
|
302
|
+
BF_pre.lookup_canonical(single_read[i : i + BF_pre.k])
|
|
303
303
|
# for reads use a static sample of 5
|
|
304
304
|
# Taking sum of list as reference, if sum has not increased after testing those 3 kmeres,
|
|
305
305
|
# then the read won't be tested further
|
|
@@ -312,11 +312,11 @@ def read_search_pre(reads, BF_pre, ext):
|
|
|
312
312
|
k4 = single_read[BF_pre.k : BF_pre.k * 2]
|
|
313
313
|
k5 = single_read[mid + BF_pre.k : mid + BF_pre.k * 2]
|
|
314
314
|
if "N" not in single_read:
|
|
315
|
-
BF_pre.
|
|
316
|
-
BF_pre.
|
|
317
|
-
BF_pre.
|
|
318
|
-
BF_pre.
|
|
319
|
-
BF_pre.
|
|
315
|
+
BF_pre.lookup_canonical(k1)
|
|
316
|
+
BF_pre.lookup_canonical(k2)
|
|
317
|
+
BF_pre.lookup_canonical(k3)
|
|
318
|
+
BF_pre.lookup_canonical(k4)
|
|
319
|
+
BF_pre.lookup_canonical(k5)
|
|
320
320
|
threshold_read = 3
|
|
321
321
|
# needs at least 2 of 3 hits to continue with read
|
|
322
322
|
counter = 0
|
|
@@ -348,7 +348,7 @@ def read_search_pre(reads, BF_pre, ext):
|
|
|
348
348
|
for j in range(len(reads_new[i])):
|
|
349
349
|
BF_pre.number_of_kmeres += 1
|
|
350
350
|
hits_per_filter_copy = BF_pre.hits_per_filter[:]
|
|
351
|
-
BF_pre.
|
|
351
|
+
BF_pre.lookup_canonical(reads_new[i][j])
|
|
352
352
|
if hits_per_filter_copy != BF_pre.hits_per_filter:
|
|
353
353
|
threshold += 1
|
|
354
354
|
if threshold >= cutoff * len(reads_new[i]):
|
|
@@ -373,7 +373,7 @@ def read_search_spec(reads, quick, BF, ext, genus):
|
|
|
373
373
|
names = [translation_dict[name] for name in names_id]
|
|
374
374
|
return score, names, hits, None
|
|
375
375
|
# Metagenome mode
|
|
376
|
-
elif
|
|
376
|
+
elif quick == 4:
|
|
377
377
|
reads_classified, predictions = BF.lookup_txt(reads, genus, ext, quick)
|
|
378
378
|
hits = None
|
|
379
379
|
names = None
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
# XspecT-Erweiterung
|
|
2
|
+
|
|
3
|
+
Expands XspecT so that new filters for a genus can be trained automatically. Its main
|
|
4
|
+
script is XspecT_trainer.py. The rest of the scripts are inside the python module
|
|
5
|
+
train_filter.
|
|
6
|
+
|
|
7
|
+
## Training new filter
|
|
8
|
+
|
|
9
|
+
XspecT_trainer.py uses command line arguments. The examples for using XspecT_trainer.py
|
|
10
|
+
are using Salmonella since this genus only has two defined species in the NCBI
|
|
11
|
+
databases.
|
|
12
|
+
|
|
13
|
+
### Jellyfish
|
|
14
|
+
|
|
15
|
+
The program jellyfish is used to count distinct k-mers in the assemblies. For
|
|
16
|
+
XspecT_trainer.py to work, jellyfish needs to be installed. It can be installed using bioconda:
|
|
17
|
+
|
|
18
|
+
`
|
|
19
|
+
conda install -c bioconda jellyfish
|
|
20
|
+
`
|
|
21
|
+
|
|
22
|
+
### Training examples
|
|
23
|
+
|
|
24
|
+
New filters with assemblies from NCBI RefSeq can be trained with the following line. The
|
|
25
|
+
Python libraries from [requirements.txt](../requirements.txt) need to be installed.
|
|
26
|
+
|
|
27
|
+
`
|
|
28
|
+
python XspecT_trainer.py Salmonella 1
|
|
29
|
+
`
|
|
30
|
+
|
|
31
|
+
Training filters with custom data can be done using the following line.
|
|
32
|
+
|
|
33
|
+
`
|
|
34
|
+
python XspecT_trainer.py Salmonella 2 -bf /path/to/concate_assemblies -svm
|
|
35
|
+
/path/to/assemblies
|
|
36
|
+
`
|
|
37
|
+
|
|
38
|
+
All command line arguments are explained using the following line.
|
|
39
|
+
|
|
40
|
+
`
|
|
41
|
+
python XspecT_trainer.py -h
|
|
42
|
+
`
|
|
43
|
+
|
|
44
|
+
# Explanation of the scripts
|
|
45
|
+
|
|
46
|
+
## backup_filter.py
|
|
47
|
+
|
|
48
|
+
Creates a backup of all files needed for the species assignment by XspecT for a specific
|
|
49
|
+
genus. The backup is made when new filters are created for a genus that
|
|
50
|
+
already has trained filters.
|
|
51
|
+
|
|
52
|
+
## create_svm.py
|
|
53
|
+
|
|
54
|
+
Downloads the needed assemblies and trains a support-vector-machine for the genus.
|
|
55
|
+
|
|
56
|
+
## extract_and_concatenate.py
|
|
57
|
+
|
|
58
|
+
Unzips the downloaded assemblies. Concatenates assemblies per species that will be used
|
|
59
|
+
to train the bloomfilters.
|
|
60
|
+
|
|
61
|
+
## get_paths.py
|
|
62
|
+
|
|
63
|
+
Functions that get specific paths.
|
|
64
|
+
|
|
65
|
+
## html_scrap.py
|
|
66
|
+
|
|
67
|
+
Updates a list of all NCBI RefSeq assembly accessions that have a taxonomy check result
|
|
68
|
+
of OK. The taxonomy check from NCBI RefSeq uses the ANI (average nucleotide
|
|
69
|
+
identity) to compute a result.
|
|
70
|
+
|
|
71
|
+
## interface_XspecT.py
|
|
72
|
+
|
|
73
|
+
Mostly functions that train new bloomfilters automatically. The functions were
|
|
74
|
+
originally written for XspecT in a non-automatic way and were updated.
|
|
75
|
+
|
|
76
|
+
## k_mer_count.py
|
|
77
|
+
|
|
78
|
+
Uses jellyfish to count distinct k-mers in every concatenated assembly. The highest
|
|
79
|
+
count will be used to compute the size of the bloomfilters.
|
|
80
|
+
|
|
81
|
+
## ncbi_api
|
|
82
|
+
|
|
83
|
+
A module which makes requests to the NCBI Datasets API.
|
|
84
|
+
|
|
85
|
+
### download_assemblies.py
|
|
86
|
+
|
|
87
|
+
The specific function that downloads assemblies from NCBI RefSeq using NCBI
|
|
88
|
+
datasets.
|
|
89
|
+
|
|
90
|
+
### ncbi_assembly_metadata.py
|
|
91
|
+
|
|
92
|
+
Takes a dictionary with species and their taxon ID and asks NCBI for assemblies of
|
|
93
|
+
the species. Saves the collected accessions of the found and selected assemblies.
|
|
94
|
+
|
|
95
|
+
### ncbi_children_tree.py
|
|
96
|
+
|
|
97
|
+
Takes the name or ID of a genus and gives a list with all its species.
|
|
98
|
+
|
|
99
|
+
### ncbi_taxon_metadata.py
|
|
100
|
+
|
|
101
|
+
Takes a list of taxa and collects metadata such as their scientific names and ranks.
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
|
|
@@ -127,7 +127,7 @@ def perform_lookup(bloomfilter, files, file_paths, accessions, names, spacing):
|
|
|
127
127
|
# Dominik: changed sample size to var
|
|
128
128
|
for j in range(0, len(sequence.seq) - BF.k, spacing):
|
|
129
129
|
BF.number_of_kmeres += 1
|
|
130
|
-
BF.
|
|
130
|
+
BF.lookup_canonical(str(sequence.seq[j : j + BF.k]))
|
|
131
131
|
|
|
132
132
|
score = BF.get_score()
|
|
133
133
|
score = [str(x) for x in score]
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|