XspecT 0.1.2__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of XspecT might be problematic. Click here for more details.

@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: XspecT
3
- Version: 0.1.2
3
+ Version: 0.1.3
4
4
  Summary: Tool to monitor and characterize pathogens using Bloom filters.
5
5
  License: MIT License
6
6
 
@@ -1,16 +1,16 @@
1
- xspect/BF_v2.py,sha256=3zJgWY6VxfE-6eSqUTgoOY4Z_mp6IKBKDpuWu34FlKI,26080
1
+ xspect/BF_v2.py,sha256=05sp27VzxFtsjt2oyMyhW4aqNmUxGTori49j6lxo1BU,25392
2
2
  xspect/Bootstrap.py,sha256=AYyEBo3MoOnPqhPAHe726mX8L9NuXDa5SATxZKLMv3s,830
3
3
  xspect/Classifier.py,sha256=BgqpZiMYi2maaccTzJcgH2tjrtDH-U7COc7E4t4cQt8,3602
4
4
  xspect/OXA_Table.py,sha256=1GxsyxMpUEgQirY0nJHtR3jl61DoPZh2Rb9L0VdMxD4,1632
5
- xspect/WebApp.py,sha256=eo1EJOMjW5grCZyvX5g1J4ppwyZb_M9lYGCNuJidM0Q,25224
6
- xspect/XspecT_mini.py,sha256=t_4OlhzLytRXkM0ig9lo0Szfm2QgJhls52TScUxFN1s,55411
5
+ xspect/WebApp.py,sha256=H4NyfDELrqUSFKOGDLNSJxsNzfLsCX9_BJMln9UXQk0,24941
6
+ xspect/XspecT_mini.py,sha256=OApDXSVIZFK8ZNpNJRPYTlyOLszZbpkJt3jJC51hV8Q,54694
7
7
  xspect/XspecT_trainer.py,sha256=6Gj2mltyVyM8Rsh5EU8tSCGMG7niYBLfId664zYaVXI,21703
8
8
  xspect/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
9
9
  xspect/download_filters.py,sha256=wSyX-IucjuKIEcVx-E0ClsA0XL0DI1FgMlO2UULgaXc,1048
10
10
  xspect/file_io.py,sha256=IWae7xxAt-EmyEbxo0nDSe3RJHmLkQT5jNS2Z3qLKdg,4807
11
- xspect/main.py,sha256=bF7ntgy_gR0ZNIB9JVxtXb-a6o0Lt0__tI_zzj03B24,2977
11
+ xspect/main.py,sha256=xZxaXOUvA26OurM9DLsbqDzpUEvbtTZOs4916y2Ifdo,2948
12
12
  xspect/map_kmers.py,sha256=63iTQS_GZZBK2DxjEs5xoI4KgfpZOntCKul06rrgi5w,6000
13
- xspect/search_filter.py,sha256=EZkM2917cjy4Q0zQDC9bJ0S-dyD-MBBmJqrAHQ1P260,17190
13
+ xspect/search_filter.py,sha256=a0n2VHmmfVqXaKwLLb478Lvb46kN2GwNgFOZmee1_xo,17261
14
14
  xspect/static/How-To.png,sha256=QO6HydIHcL3oM9feMxmfZcKE8M62fIRl2xs_5S_NL5M,119621
15
15
  xspect/static/Logo.png,sha256=bvOWMpqxmBigg9jEvZtIMOsXncbSFwnYu4eYNSf1_Qw,296095
16
16
  xspect/static/Logo2.png,sha256=V7hpGb3XYLN5vEQQNJdpNjQX_F2A_f1wKAP8N37NwGs,292730
@@ -28,8 +28,9 @@ xspect/templates/layoutabout.html,sha256=ICC8g0DP8a7MLNrEYnXBgtnkwMjIktsimmqwqjM
28
28
  xspect/templates/layouthome.html,sha256=6EtVD-L6xlTc7XGk77f9CARKW7JLpv2iiyUci1BK00A,2870
29
29
  xspect/templates/layoutspecies.html,sha256=MNGSDEvuKFvgsyXoRLCu-rma10gntUI9vP_9a2sNl7M,24008
30
30
  xspect/templates/species.html,sha256=rD9fCmSgyI8hRcmy56mNQH7VR5jnmtriv9WlvTIJJjE,2412
31
+ xspect/train_filter/README_XspecT_Erweiterung.md,sha256=Gn64Biz32LiUfQVYb43Hez6ihDTSFTLfEMwUJ_l1MGU,2879
31
32
  xspect/train_filter/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
32
- xspect/train_filter/create_svm.py,sha256=E1QwBeUtAlOlKf6QKfmRtKaz_6idv7M8Hb-jbNb_wGk,6820
33
+ xspect/train_filter/create_svm.py,sha256=sktMp8tmQWjcbWkiEzRizabB4qF9wVr7bds2jAvxx4Y,6830
33
34
  xspect/train_filter/extract_and_concatenate.py,sha256=kXGqCrOk3TbOkKLJV8nKC6nL8Zg0TWKDCJu2gq8K_cw,5239
34
35
  xspect/train_filter/get_paths.py,sha256=JXPbv_Fx5BKHZQ4bkSIGU7yj5zjkmhsI0Z6U4nU0gug,941
35
36
  xspect/train_filter/html_scrap.py,sha256=iQXREhG37SNUx7gHoP8eqayMEIH00QLFMTNmIMogb_M,3799
@@ -40,9 +41,9 @@ xspect/train_filter/ncbi_api/download_assemblies.py,sha256=iX1qK8R6p2b3RiHPfqVsL
40
41
  xspect/train_filter/ncbi_api/ncbi_assembly_metadata.py,sha256=RhHvxKiQ8HJgoSb6njYEgO_vPioBqEMPvT3lE2lHXp0,3766
41
42
  xspect/train_filter/ncbi_api/ncbi_children_tree.py,sha256=pmzg6-fDGLinNSXNbBRv0v62lRgHxW4aXZ0uV1TJhOE,1793
42
43
  xspect/train_filter/ncbi_api/ncbi_taxon_metadata.py,sha256=uhBBGffgL4mcJpyp9KxVyOGUh8FxUTAI4xKzoLDav_Y,1577
43
- XspecT-0.1.2.dist-info/LICENSE,sha256=bhBGDKIRUVwYIHGOGO5hshzuVHyqFJajvSOA3XXOLKI,1094
44
- XspecT-0.1.2.dist-info/METADATA,sha256=h4OX8L719oZsPj0Xcab4bx4ZstZiMUuPFpVcbZoGc_w,5475
45
- XspecT-0.1.2.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
46
- XspecT-0.1.2.dist-info/entry_points.txt,sha256=L7qliX3pIuwupQxpuOSsrBJCSHYPOPNEzH8KZKQGGUw,43
47
- XspecT-0.1.2.dist-info/top_level.txt,sha256=hdoa4cnBv6OVzpyhMmyxpJxEydH5n2lDciy8urc1paE,7
48
- XspecT-0.1.2.dist-info/RECORD,,
44
+ XspecT-0.1.3.dist-info/LICENSE,sha256=bhBGDKIRUVwYIHGOGO5hshzuVHyqFJajvSOA3XXOLKI,1094
45
+ XspecT-0.1.3.dist-info/METADATA,sha256=ulwNLqzESiHPxspAckJm3RkWXg5qf-T6KoNvfTnsH0g,5475
46
+ XspecT-0.1.3.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
47
+ XspecT-0.1.3.dist-info/entry_points.txt,sha256=L7qliX3pIuwupQxpuOSsrBJCSHYPOPNEzH8KZKQGGUw,43
48
+ XspecT-0.1.3.dist-info/top_level.txt,sha256=hdoa4cnBv6OVzpyhMmyxpJxEydH5n2lDciy8urc1paE,7
49
+ XspecT-0.1.3.dist-info/RECORD,,
xspect/BF_v2.py CHANGED
@@ -48,6 +48,7 @@ class AbaumanniiBloomfilter:
48
48
  ] # names of the IC's
49
49
  number_of_kmeres = 0 # counter of k-meres, will be used to calculate score
50
50
  reads = 1000 # standard read number
51
+ kmer_hits_single = [] # kmer hits per filter
51
52
 
52
53
  def __init__(self, arraysize):
53
54
  """creates empty matrix"""
@@ -181,8 +182,17 @@ class AbaumanniiBloomfilter:
181
182
 
182
183
  return positions
183
184
 
185
+ def lookup_canonical(self, kmer, limit=False):
186
+ """takes kmer input string and checks all clonetypes if the canonicalized kmer is inside that set of kmers"""
187
+
188
+ # canonicalize
189
+ complement = str(Seq(kmer).reverse_complement())
190
+ kmer = max(kmer, complement)
191
+
192
+ self.lookup(kmer, limit)
193
+
184
194
  def lookup(self, kmer, limit=False):
185
- """checks if an element is in the filters, returns list with True/False,
195
+ """
186
196
  takes kmer input string and checks all clonetypes if the k-mer is inside that set of kmers
187
197
  """
188
198
 
@@ -213,6 +223,8 @@ class AbaumanniiBloomfilter:
213
223
  temp[i] += 1
214
224
  self.hit = True
215
225
  if limit:
226
+ # reset single kmer hit vector / memory management
227
+ self.kmer_hits_single = []
216
228
  if self.table.lookup(self.names[i], kmer):
217
229
  self.hits_per_filter[i] += 1
218
230
  else:
@@ -274,11 +286,7 @@ class AbaumanniiBloomfilter:
274
286
  continue
275
287
  self.number_of_kmeres += 1
276
288
  kmer = str(single_read[j : j + self.k])
277
- kmer_reversed = str(Seq(kmer).reverse_complement())
278
- if kmer > kmer_reversed:
279
- self.lookup(kmer)
280
- else:
281
- self.lookup(kmer_reversed)
289
+ self.lookup_canonical(kmer)
282
290
  # XspecT Sequence-Reads every 10th kmer
283
291
  elif quick == 2:
284
292
  for single_read in range(0, len(reads)):
@@ -291,11 +299,7 @@ class AbaumanniiBloomfilter:
291
299
  # lookup for kmer
292
300
  temp = reads[single_read]
293
301
  kmer = str(temp[j : j + self.k])
294
- kmer_reversed = str(Seq(kmer).reverse_complement())
295
- if kmer > kmer_reversed:
296
- self.lookup(kmer)
297
- else:
298
- self.lookup(kmer_reversed)
302
+ self.lookup_canonical(kmer)
299
303
  if self.hit == True:
300
304
  hit_counter += 1
301
305
  elif quick == 3:
@@ -307,11 +311,7 @@ class AbaumanniiBloomfilter:
307
311
  continue
308
312
  self.number_of_kmeres += 1
309
313
  kmer = str(single_read[j : j + self.k])
310
- kmer_reversed = str(Seq(kmer).reverse_complement())
311
- if kmer > kmer_reversed:
312
- self.lookup(kmer)
313
- else:
314
- self.lookup(kmer_reversed)
314
+ self.lookup_canonical(kmer)
315
315
  # metagenome mode
316
316
  elif quick == 4:
317
317
  print("Stage 1")
@@ -332,13 +332,8 @@ class AbaumanniiBloomfilter:
332
332
  self.hits_per_filter = [0] * self.clonetypes
333
333
  for kmer in read:
334
334
  counter += 1
335
- # lookup for kmer, use lexikographical smaller kmer
336
335
  self.number_of_kmeres += 1
337
- kmer_reversed = str(Seq(kmer).reverse_complement())
338
- if kmer > kmer_reversed:
339
- self.lookup(kmer)
340
- else:
341
- self.lookup(kmer_reversed)
336
+ self.lookup_canonical(kmer)
342
337
  score = self.get_score()
343
338
  score_edit = [str(x) for x in score]
344
339
  score_edit = ",".join(score_edit)
@@ -383,9 +378,9 @@ class AbaumanniiBloomfilter:
383
378
  bootstrap_score = bootstrap_score / bootstrap_n
384
379
  # bootstrap_score = 1
385
380
 
386
- if ("A." + prediction) not in reads_classified:
381
+ if prediction not in reads_classified:
387
382
  # Value 5 was previously = read
388
- reads_classified["A." + prediction] = [
383
+ reads_classified[prediction] = [
389
384
  [max(score)],
390
385
  1,
391
386
  [len(read)],
@@ -395,13 +390,11 @@ class AbaumanniiBloomfilter:
395
390
  None,
396
391
  ]
397
392
  else:
398
- reads_classified["A." + prediction][0] += [max(score)]
399
- reads_classified["A." + prediction][1] += 1
400
- reads_classified["A." + prediction][2] += [len(read)]
401
- reads_classified["A." + prediction][3] += sorted(score)[-2] / max(
402
- score
403
- )
404
- reads_classified["A." + prediction][4] += [bootstrap_score]
393
+ reads_classified[prediction][0] += [max(score)]
394
+ reads_classified[prediction][1] += 1
395
+ reads_classified[prediction][2] += [len(read)]
396
+ reads_classified[prediction][3] += sorted(score)[-2] / max(score)
397
+ reads_classified[prediction][4] += [bootstrap_score]
405
398
  # reads_classified["A." + prediction][5] += None
406
399
  # tracker.print_diff()
407
400
  # not ready yet
@@ -475,11 +468,7 @@ class AbaumanniiBloomfilter:
475
468
  self.number_of_kmeres += 1
476
469
  # lookup for kmer
477
470
  kmer = str(single_read[j : j + self.k])
478
- kmer_reversed = str(Seq(kmer).reverse_complement())
479
- if kmer > kmer_reversed:
480
- self.lookup(kmer)
481
- else:
482
- self.lookup(kmer_reversed)
471
+ self.lookup_canonical(kmer)
483
472
 
484
473
  def cleanup(self):
485
474
  """deletes matrix"""
xspect/WebApp.py CHANGED
@@ -12,7 +12,7 @@ import logging
12
12
  import pickle
13
13
  import secrets
14
14
  import pandas as pd
15
- from Bio import Entrez, Medline
15
+ from Bio import Entrez, Medline, SeqIO
16
16
  from flask import (
17
17
  Flask,
18
18
  render_template,
@@ -139,35 +139,25 @@ def assignspec():
139
139
  return redirect("/resultsspec")
140
140
 
141
141
  else:
142
- # Checking file type
143
- # if the file is fasta -> concat lines
144
- ext = filename.split(".")[-2]
142
+ ext = filename.split(".")[-1]
145
143
  with open(filename) as f:
146
144
  reads = f.read().splitlines()
147
145
 
148
- # Concat Lines if not .fq file
149
- if ext != "fq" and ext != "fastq":
150
- reads = "".join(reads)
151
- reads = reads.split(">")
146
+ if ext == "fq" or ext == "fastq":
147
+ sequences = SeqIO.parse(filename, "fastq")
148
+ quick = 2
149
+ else:
152
150
  if quick:
153
151
  quick = 1
154
152
  else:
155
153
  quick = 0
156
- if metagenome:
157
- quick = 4
158
- reads.pop(0)
159
- else:
160
- if metagenome:
161
- quick = 4
162
- else:
163
- quick = 2
164
- # deleting file
165
- os.remove(filename)
154
+ sequences = SeqIO.parse(filename, "fasta")
155
+
156
+ reads = [str(sequence.seq).upper() for sequence in sequences]
166
157
 
167
- for i in range(len(reads)):
168
- reads[i] = reads[i].upper()
169
158
  # starts the lookup for a given sequence
170
159
  if metagenome:
160
+ quick = 4
171
161
  start_meta = time.time()
172
162
  reads, reads_oxa = read_search_pre(reads, BF_Master_prefilter, ext)
173
163
  end_meta = time.time()
@@ -193,8 +183,8 @@ def assignspec():
193
183
  # assign reads to species
194
184
  species_dict = {}
195
185
  predictions_names = set()
196
- for ele in predictions:
197
- predictions_names.add(ele)
186
+ for prediction in predictions:
187
+ predictions_names.add(prediction)
198
188
  for species in predictions_names:
199
189
  species_dict[species] = []
200
190
  # dict with species as keys and reads as values for oxa search
@@ -323,11 +313,6 @@ def assignspec():
323
313
  )
324
314
  return redirect("/resultsspec")
325
315
 
326
- app.logger.info(
327
- "Assignment done for " + str(filename) + ", Time needed: " + str(needed)
328
- )
329
- return redirect("/resultsspec")
330
-
331
316
 
332
317
  # about page
333
318
  @app.route("/about")
@@ -511,6 +496,7 @@ def resultsspec():
511
496
 
512
497
  elif metagenome:
513
498
  reads_classified = session.get("reads_classified")
499
+ genus = session.get("genus")
514
500
  # sort reads_classified by highest value of the second element
515
501
  sorted_reads_classified = dict(
516
502
  sorted(reads_classified.items(), key=lambda x: x[1][1], reverse=True)
@@ -518,8 +504,9 @@ def resultsspec():
518
504
  # get key of reads_classified with highest value of the second element from the value
519
505
  predictions = []
520
506
  values = []
507
+ translation_dict = load_translation_dict(genus)
521
508
  for key, value in sorted_reads_classified.items():
522
- predictions.append(key)
509
+ predictions.append(translation_dict[key])
523
510
  values.append(value[1])
524
511
  clonetypes_sorted = predictions[:12]
525
512
  values_sorted = values[:12]
xspect/XspecT_mini.py CHANGED
@@ -608,7 +608,9 @@ def xspecT(BF, BF_1_1, files, paths, file_format, read_amount, metagenome, genus
608
608
  threshold_contig = sample_size * 0.7
609
609
  for i in range(0, len(str(sequence.seq)) - BF_1_1.k, sample_size):
610
610
  if "N" not in str(sequence.seq[i : i + BF_1_1.k]):
611
- BF_1_1.lookup(str(sequence.seq[i : i + BF_1_1.k]).upper())
611
+ BF_1_1.lookup_canonical(
612
+ str(sequence.seq[i : i + BF_1_1.k]).upper()
613
+ )
612
614
 
613
615
  # needs at least 70% hits to continue with the contig
614
616
  counter = 0
@@ -620,7 +622,7 @@ def xspecT(BF, BF_1_1, files, paths, file_format, read_amount, metagenome, genus
620
622
  )
621
623
  counter += 1
622
624
  # how many kmers? to use
623
- if counter >= 5000000:
625
+ if counter >= 5000:
624
626
  break
625
627
  # contigs_kmers.append(str(reverse_sequence[j: j + BF_1_1.k]))
626
628
  contigs.append(contigs_kmers)
@@ -638,7 +640,7 @@ def xspecT(BF, BF_1_1, files, paths, file_format, read_amount, metagenome, genus
638
640
  for j in range(len(contigs[i])):
639
641
  BF_1_1.number_of_kmeres += 1
640
642
  hits_per_filter_copy = BF_1_1.hits_per_filter[:]
641
- BF_1_1.lookup(contigs[i][j])
643
+ BF_1_1.lookup_canonical(contigs[i][j])
642
644
  if hits_per_filter_copy != BF_1_1.hits_per_filter:
643
645
  threshold += 1
644
646
  # parameter value needs to be determined
@@ -654,11 +656,7 @@ def xspecT(BF, BF_1_1, files, paths, file_format, read_amount, metagenome, genus
654
656
  BF.hits_per_filter = [0] * BF.clonetypes
655
657
  for kmer in contigs_filtered:
656
658
  BF.number_of_kmeres += 1
657
- kmer_reversed = str(Seq.Seq(kmer).reverse_complement())
658
- if kmer > kmer_reversed:
659
- BF.lookup(kmer)
660
- else:
661
- BF.lookup(kmer_reversed)
659
+ BF.lookup_canonical(kmer)
662
660
  score = BF.get_score()
663
661
  score_edit = [str(x) for x in score]
664
662
  score_edit = ",".join(score_edit)
@@ -750,11 +748,7 @@ def xspecT(BF, BF_1_1, files, paths, file_format, read_amount, metagenome, genus
750
748
  for j in range(0, len(sequence.seq) - BF.k, mode):
751
749
  BF.number_of_kmeres += 1
752
750
  kmer = str(sequence.seq[j : j + BF.k])
753
- kmer_reversed = str(Seq.Seq(kmer).reverse_complement())
754
- if kmer > kmer_reversed:
755
- BF.lookup(kmer)
756
- else:
757
- BF.lookup(kmer_reversed)
751
+ BF.lookup_canonical(kmer)
758
752
 
759
753
  score = BF.get_score()
760
754
  # print("Scores: ", score)
@@ -884,11 +878,11 @@ def xspecT(BF, BF_1_1, files, paths, file_format, read_amount, metagenome, genus
884
878
  # if "N" not in str(sequence.seq[i: i + BF_1_1.k]):
885
879
  # BF_1_1.lookup(str(sequence.seq[i: i + BF_1_1.k]))
886
880
  if "N" not in str(sequence.seq):
887
- BF_1_1.lookup(k1)
888
- BF_1_1.lookup(k2)
889
- BF_1_1.lookup(k3)
890
- BF_1_1.lookup(k4)
891
- BF_1_1.lookup(k5)
881
+ BF_1_1.lookup_canonical(k1)
882
+ BF_1_1.lookup_canonical(k2)
883
+ BF_1_1.lookup_canonical(k3)
884
+ BF_1_1.lookup_canonical(k4)
885
+ BF_1_1.lookup_canonical(k5)
892
886
  else:
893
887
  continue
894
888
  # needs at least 2 of 3 hits to continue with read
@@ -913,7 +907,7 @@ def xspecT(BF, BF_1_1, files, paths, file_format, read_amount, metagenome, genus
913
907
  BF_1_1.number_of_kmeres += 1
914
908
  hits_per_filter_copy = BF_1_1.hits_per_filter[:]
915
909
  if "N" not in reads[i][j]:
916
- BF_1_1.lookup(reads[i][j])
910
+ BF_1_1.lookup_canonical(reads[i][j])
917
911
  if hits_per_filter_copy != BF_1_1.hits_per_filter:
918
912
  threshold += 1
919
913
  if threshold >= 0.7 * len(reads[i]):
@@ -929,11 +923,7 @@ def xspecT(BF, BF_1_1, files, paths, file_format, read_amount, metagenome, genus
929
923
  for kmer in reads_filtered:
930
924
  if "N" not in kmer:
931
925
  BF.number_of_kmeres += 1
932
- kmer_reversed = str(Seq.Seq(kmer).reverse_complement())
933
- if kmer > kmer_reversed:
934
- BF.lookup(kmer)
935
- else:
936
- BF.lookup(kmer_reversed)
926
+ BF.lookup_canonical(kmer)
937
927
  else:
938
928
  continue
939
929
  score = BF.get_score()
@@ -1041,11 +1031,7 @@ def xspecT(BF, BF_1_1, files, paths, file_format, read_amount, metagenome, genus
1041
1031
  for j in range(0, len(sequence.seq) - BF.k + 1, mode):
1042
1032
  BF.number_of_kmeres += 1
1043
1033
  kmer = str(sequence.seq[j : j + BF.k])
1044
- kmer_reversed = str(Seq.Seq(kmer).reverse_complement())
1045
- if kmer > kmer_reversed:
1046
- BF.lookup(kmer)
1047
- else:
1048
- BF.lookup(kmer_reversed)
1034
+ BF.lookup_canonical(kmer)
1049
1035
  else:
1050
1036
  break
1051
1037
  score = BF.get_score()
xspect/main.py CHANGED
@@ -56,6 +56,7 @@ def classify(genus, path, species, ic, oxa, metagenome, complete, save):
56
56
  mode = 1
57
57
  file_format = "fasta"
58
58
  read_amount = 342480
59
+ print(mode)
59
60
 
60
61
  xspecT_mini(
61
62
  path,
@@ -111,7 +112,6 @@ def train(genus, bf_assembly_path, svm_assembly_path, complete, check):
111
112
  @cli.command()
112
113
  def web():
113
114
  """Open the XspecT web app."""
114
- webbrowser.open("http://localhost:8000")
115
115
  app.run(host="0.0.0.0", port=8000, debug=True, threaded=True)
116
116
 
117
117
 
xspect/search_filter.py CHANGED
@@ -299,7 +299,7 @@ def read_search_pre(reads, BF_pre, ext):
299
299
  threshold_read = sample_size * 0.7
300
300
  for i in range(0, len(single_read) - BF_pre.k, sample_size):
301
301
  if "N" not in single_read[i : i + BF_pre.k]:
302
- BF_pre.lookup(single_read[i : i + BF_pre.k])
302
+ BF_pre.lookup_canonical(single_read[i : i + BF_pre.k])
303
303
  # for reads use a static sample of 5
304
304
  # Taking sum of list as reference, if sum has not increased after testing those 3 kmeres,
305
305
  # then the read won't be tested further
@@ -312,11 +312,11 @@ def read_search_pre(reads, BF_pre, ext):
312
312
  k4 = single_read[BF_pre.k : BF_pre.k * 2]
313
313
  k5 = single_read[mid + BF_pre.k : mid + BF_pre.k * 2]
314
314
  if "N" not in single_read:
315
- BF_pre.lookup(k1)
316
- BF_pre.lookup(k2)
317
- BF_pre.lookup(k3)
318
- BF_pre.lookup(k4)
319
- BF_pre.lookup(k5)
315
+ BF_pre.lookup_canonical(k1)
316
+ BF_pre.lookup_canonical(k2)
317
+ BF_pre.lookup_canonical(k3)
318
+ BF_pre.lookup_canonical(k4)
319
+ BF_pre.lookup_canonical(k5)
320
320
  threshold_read = 3
321
321
  # needs at least 2 of 3 hits to continue with read
322
322
  counter = 0
@@ -348,7 +348,7 @@ def read_search_pre(reads, BF_pre, ext):
348
348
  for j in range(len(reads_new[i])):
349
349
  BF_pre.number_of_kmeres += 1
350
350
  hits_per_filter_copy = BF_pre.hits_per_filter[:]
351
- BF_pre.lookup(reads_new[i][j])
351
+ BF_pre.lookup_canonical(reads_new[i][j])
352
352
  if hits_per_filter_copy != BF_pre.hits_per_filter:
353
353
  threshold += 1
354
354
  if threshold >= cutoff * len(reads_new[i]):
@@ -373,7 +373,7 @@ def read_search_spec(reads, quick, BF, ext, genus):
373
373
  names = [translation_dict[name] for name in names_id]
374
374
  return score, names, hits, None
375
375
  # Metagenome mode
376
- elif quick == 4:
376
+ elif quick == 4:
377
377
  reads_classified, predictions = BF.lookup_txt(reads, genus, ext, quick)
378
378
  hits = None
379
379
  names = None
@@ -0,0 +1,119 @@
1
+ # XspecT-Erweiterung
2
+
3
+ Expands XspecT so that new filters for a genus can be trained automatically. Its main
4
+ script is XspecT_trainer.py. The rest of the scripts are inside the python module
5
+ train_filter.
6
+
7
+ ## Training new filter
8
+
9
+ XspecT_trainer.py uses command line arguments. The examples for using XspecT_trainer.py
10
+ are using Salmonella since this genus only has two defined species in the NCBI
11
+ databases.
12
+
13
+ ### Jellyfish
14
+
15
+ The program jellyfish is used to count distinct k-mers in the assemblies. For
16
+ XspecT_trainer.py to work, jellyfish needs to be installed. It can be installed using bioconda:
17
+
18
+ `
19
+ conda install -c bioconda jellyfish
20
+ `
21
+
22
+ ### Training examples
23
+
24
+ New filters with assemblies from NCBI RefSeq can be trained with the following line. The
25
+ Python libraries from [requirements.txt](../requirements.txt) need to be installed.
26
+
27
+ `
28
+ python XspecT_trainer.py Salmonella 1
29
+ `
30
+
31
+ Training filters with custom data can be done using the following line.
32
+
33
+ `
34
+ python XspecT_trainer.py Salmonella 2 -bf /path/to/concate_assemblies -svm
35
+ /path/to/assemblies
36
+ `
37
+
38
+ All command line arguments are explained using the following line.
39
+
40
+ `
41
+ python XspecT_trainer.py -h
42
+ `
43
+
44
+ # Explanation of the scripts
45
+
46
+ ## backup_filter.py
47
+
48
+ Creates a backup of all files needed for the species assignment by XspecT for a specific
49
+ genus. The backup is made whenever new filters are created for a genus which
50
+ already has trained filters.
51
+
52
+ ## create_svm.py
53
+
54
+ Downloads the needed assemblies and trains a support-vector-machine for the genus.
55
+
56
+ ## extract_and_concatenate.py
57
+
58
+ Unzips the downloaded assemblies. Concatenates assemblies per species that will be used
59
+ to train the bloomfilters.
60
+
61
+ ## get_paths.py
62
+
63
+ Functions that get specific paths.
64
+
65
+ ## html_scrap.py
66
+
67
+ Updates a list of all NCBI RefSeq assembly accessions that have a taxonomy check result
68
+ of OK. The taxonomy check from NCBI RefSeq uses the ANI (average nucleotide
69
+ identity) to compute a result.
70
+
71
+ ## interface_XspecT.py
72
+
73
+ Mostly functions that train new bloomfilters automatically. The functions were
74
+ originally written for XspecT in a non-automatic way and were updated.
75
+
76
+ ## k_mer_count.py
77
+
78
+ Uses jellyfish to count distinct k-mers in every concatenated assembly. The highest
79
+ count will be used to compute the size of the bloomfilters.
80
+
81
+ ## ncbi_api
82
+
83
+ A module which makes requests to the NCBI Datasets API.
84
+
85
+ ### download_assemblies.py
86
+
87
+ The specific function that downloads assemblies from NCBI RefSeq using NCBI
88
+ datasets.
89
+
90
+ ### ncbi_assembly_metadata.py
91
+
92
+ Takes a dictionary with species and their taxon ID and asks NCBI for assemblies of
93
+ the species. Saves the collected accessions of the found and selected assemblies.
94
+
95
+ ### ncbi_children_tree.py
96
+
97
+ Takes the name or ID of a genus and gives a list with all its species.
98
+
99
+ ### ncbi_taxon_metadata.py
100
+
101
+ Takes a list of taxa and collects metadata such as their scientific names and ranks.
102
+
103
+
104
+
105
+
106
+
107
+
108
+
109
+
110
+
111
+
112
+
113
+
114
+
115
+
116
+
117
+
118
+
119
+
@@ -127,7 +127,7 @@ def perform_lookup(bloomfilter, files, file_paths, accessions, names, spacing):
127
127
  # Dominik: changed sample size to var
128
128
  for j in range(0, len(sequence.seq) - BF.k, spacing):
129
129
  BF.number_of_kmeres += 1
130
- BF.lookup(str(sequence.seq[j : j + BF.k]))
130
+ BF.lookup_canonical(str(sequence.seq[j : j + BF.k]))
131
131
 
132
132
  score = BF.get_score()
133
133
  score = [str(x) for x in score]
File without changes