PyPI - XspecT - Versions diffs - 0.1.2__py3-none-any.whl → 0.1.3__py3-none-any.whl - Mend

XspecT 0.1.2py3-none-any.whl → 0.1.3py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of XspecT might be problematic. Click here for more details.

Files changed (13) hide show

{XspecT-0.1.2.dist-info → XspecT-0.1.3.dist-info}/METADATA +1 -1
{XspecT-0.1.2.dist-info → XspecT-0.1.3.dist-info}/RECORD +13 -12
xspect/BF_v2.py +25 -36
xspect/WebApp.py +15 -28
xspect/XspecT_mini.py +15 -29
xspect/main.py +1 -1
xspect/search_filter.py +8 -8
xspect/train_filter/README_XspecT_Erweiterung.md +119 -0
xspect/train_filter/create_svm.py +1 -1
{XspecT-0.1.2.dist-info → XspecT-0.1.3.dist-info}/LICENSE +0 -0
{XspecT-0.1.2.dist-info → XspecT-0.1.3.dist-info}/WHEEL +0 -0
{XspecT-0.1.2.dist-info → XspecT-0.1.3.dist-info}/entry_points.txt +0 -0
{XspecT-0.1.2.dist-info → XspecT-0.1.3.dist-info}/top_level.txt +0 -0

{XspecT-0.1.2.dist-info → XspecT-0.1.3.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: XspecT
-Version: 0.1.2
+Version: 0.1.3
 Summary: Tool to monitor and characterize pathogens using Bloom filters.
 License: MIT License

{XspecT-0.1.2.dist-info → XspecT-0.1.3.dist-info}/RECORD RENAMED Viewed

@@ -1,16 +1,16 @@
-xspect/BF_v2.py,sha256=3zJgWY6VxfE-6eSqUTgoOY4Z_mp6IKBKDpuWu34FlKI,26080
+xspect/BF_v2.py,sha256=05sp27VzxFtsjt2oyMyhW4aqNmUxGTori49j6lxo1BU,25392
 xspect/Bootstrap.py,sha256=AYyEBo3MoOnPqhPAHe726mX8L9NuXDa5SATxZKLMv3s,830
 xspect/Classifier.py,sha256=BgqpZiMYi2maaccTzJcgH2tjrtDH-U7COc7E4t4cQt8,3602
 xspect/OXA_Table.py,sha256=1GxsyxMpUEgQirY0nJHtR3jl61DoPZh2Rb9L0VdMxD4,1632
-xspect/WebApp.py,sha256=eo1EJOMjW5grCZyvX5g1J4ppwyZb_M9lYGCNuJidM0Q,25224
-xspect/XspecT_mini.py,sha256=t_4OlhzLytRXkM0ig9lo0Szfm2QgJhls52TScUxFN1s,55411
+xspect/WebApp.py,sha256=H4NyfDELrqUSFKOGDLNSJxsNzfLsCX9_BJMln9UXQk0,24941
+xspect/XspecT_mini.py,sha256=OApDXSVIZFK8ZNpNJRPYTlyOLszZbpkJt3jJC51hV8Q,54694
 xspect/XspecT_trainer.py,sha256=6Gj2mltyVyM8Rsh5EU8tSCGMG7niYBLfId664zYaVXI,21703
 xspect/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 xspect/download_filters.py,sha256=wSyX-IucjuKIEcVx-E0ClsA0XL0DI1FgMlO2UULgaXc,1048
 xspect/file_io.py,sha256=IWae7xxAt-EmyEbxo0nDSe3RJHmLkQT5jNS2Z3qLKdg,4807
-xspect/main.py,sha256=bF7ntgy_gR0ZNIB9JVxtXb-a6o0Lt0__tI_zzj03B24,2977
+xspect/main.py,sha256=xZxaXOUvA26OurM9DLsbqDzpUEvbtTZOs4916y2Ifdo,2948
 xspect/map_kmers.py,sha256=63iTQS_GZZBK2DxjEs5xoI4KgfpZOntCKul06rrgi5w,6000
-xspect/search_filter.py,sha256=EZkM2917cjy4Q0zQDC9bJ0S-dyD-MBBmJqrAHQ1P260,17190
+xspect/search_filter.py,sha256=a0n2VHmmfVqXaKwLLb478Lvb46kN2GwNgFOZmee1_xo,17261
 xspect/static/How-To.png,sha256=QO6HydIHcL3oM9feMxmfZcKE8M62fIRl2xs_5S_NL5M,119621
 xspect/static/Logo.png,sha256=bvOWMpqxmBigg9jEvZtIMOsXncbSFwnYu4eYNSf1_Qw,296095
 xspect/static/Logo2.png,sha256=V7hpGb3XYLN5vEQQNJdpNjQX_F2A_f1wKAP8N37NwGs,292730
@@ -28,8 +28,9 @@ xspect/templates/layoutabout.html,sha256=ICC8g0DP8a7MLNrEYnXBgtnkwMjIktsimmqwqjM
 xspect/templates/layouthome.html,sha256=6EtVD-L6xlTc7XGk77f9CARKW7JLpv2iiyUci1BK00A,2870
 xspect/templates/layoutspecies.html,sha256=MNGSDEvuKFvgsyXoRLCu-rma10gntUI9vP_9a2sNl7M,24008
 xspect/templates/species.html,sha256=rD9fCmSgyI8hRcmy56mNQH7VR5jnmtriv9WlvTIJJjE,2412
+xspect/train_filter/README_XspecT_Erweiterung.md,sha256=Gn64Biz32LiUfQVYb43Hez6ihDTSFTLfEMwUJ_l1MGU,2879
 xspect/train_filter/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-xspect/train_filter/create_svm.py,sha256=E1QwBeUtAlOlKf6QKfmRtKaz_6idv7M8Hb-jbNb_wGk,6820
+xspect/train_filter/create_svm.py,sha256=sktMp8tmQWjcbWkiEzRizabB4qF9wVr7bds2jAvxx4Y,6830
 xspect/train_filter/extract_and_concatenate.py,sha256=kXGqCrOk3TbOkKLJV8nKC6nL8Zg0TWKDCJu2gq8K_cw,5239
 xspect/train_filter/get_paths.py,sha256=JXPbv_Fx5BKHZQ4bkSIGU7yj5zjkmhsI0Z6U4nU0gug,941
 xspect/train_filter/html_scrap.py,sha256=iQXREhG37SNUx7gHoP8eqayMEIH00QLFMTNmIMogb_M,3799
@@ -40,9 +41,9 @@ xspect/train_filter/ncbi_api/download_assemblies.py,sha256=iX1qK8R6p2b3RiHPfqVsL
 xspect/train_filter/ncbi_api/ncbi_assembly_metadata.py,sha256=RhHvxKiQ8HJgoSb6njYEgO_vPioBqEMPvT3lE2lHXp0,3766
 xspect/train_filter/ncbi_api/ncbi_children_tree.py,sha256=pmzg6-fDGLinNSXNbBRv0v62lRgHxW4aXZ0uV1TJhOE,1793
 xspect/train_filter/ncbi_api/ncbi_taxon_metadata.py,sha256=uhBBGffgL4mcJpyp9KxVyOGUh8FxUTAI4xKzoLDav_Y,1577
-XspecT-0.1.2.dist-info/LICENSE,sha256=bhBGDKIRUVwYIHGOGO5hshzuVHyqFJajvSOA3XXOLKI,1094
-XspecT-0.1.2.dist-info/METADATA,sha256=h4OX8L719oZsPj0Xcab4bx4ZstZiMUuPFpVcbZoGc_w,5475
-XspecT-0.1.2.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
-XspecT-0.1.2.dist-info/entry_points.txt,sha256=L7qliX3pIuwupQxpuOSsrBJCSHYPOPNEzH8KZKQGGUw,43
-XspecT-0.1.2.dist-info/top_level.txt,sha256=hdoa4cnBv6OVzpyhMmyxpJxEydH5n2lDciy8urc1paE,7
-XspecT-0.1.2.dist-info/RECORD,,
+XspecT-0.1.3.dist-info/LICENSE,sha256=bhBGDKIRUVwYIHGOGO5hshzuVHyqFJajvSOA3XXOLKI,1094
+XspecT-0.1.3.dist-info/METADATA,sha256=ulwNLqzESiHPxspAckJm3RkWXg5qf-T6KoNvfTnsH0g,5475
+XspecT-0.1.3.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
+XspecT-0.1.3.dist-info/entry_points.txt,sha256=L7qliX3pIuwupQxpuOSsrBJCSHYPOPNEzH8KZKQGGUw,43
+XspecT-0.1.3.dist-info/top_level.txt,sha256=hdoa4cnBv6OVzpyhMmyxpJxEydH5n2lDciy8urc1paE,7
+XspecT-0.1.3.dist-info/RECORD,,

xspect/BF_v2.py CHANGED Viewed

@@ -48,6 +48,7 @@ class AbaumanniiBloomfilter:
     ]  # names of the IC's
     number_of_kmeres = 0  # counter of k-meres, will be used to calculate score
     reads = 1000  # standard read number
+    kmer_hits_single = []  # kmer hits per filter
     def __init__(self, arraysize):
         """creates empty matrix"""
@@ -181,8 +182,17 @@ class AbaumanniiBloomfilter:
         return positions
+    def lookup_canonical(self, kmer, limit=False):
+        """takes kmer input string and checks all clonetypes if the cononicalized kmer is inside that set of kmers"""
+        # canonicalize
+        complement = str(Seq(kmer).reverse_complement())
+        kmer = max(kmer, complement)
+        self.lookup(kmer, limit)
     def lookup(self, kmer, limit=False):
-        """checks if an element is in the filters, returns list with True/False,
+        """
         takes kmer input string and checks all clonetypes if the k-mer is inside that set of kmers
         """
@@ -213,6 +223,8 @@ class AbaumanniiBloomfilter:
                 temp[i] += 1
                 self.hit = True
                 if limit:
+                    # reset single kmer kit vector / memory management
+                    self.kmer_hits_single = []
                     if self.table.lookup(self.names[i], kmer):
                         self.hits_per_filter[i] += 1
                 else:
@@ -274,11 +286,7 @@ class AbaumanniiBloomfilter:
                         continue
                     self.number_of_kmeres += 1
                     kmer = str(single_read[j : j + self.k])
-                    kmer_reversed = str(Seq(kmer).reverse_complement())
-                    if kmer > kmer_reversed:
-                        self.lookup(kmer)
-                    else:
-                        self.lookup(kmer_reversed)
+                    self.lookup_canonical(kmer)
         # XspecT Sequence-Reads every 10th kmer
         elif quick == 2:
             for single_read in range(0, len(reads)):
@@ -291,11 +299,7 @@ class AbaumanniiBloomfilter:
                     # lookup for kmer
                     temp = reads[single_read]
                     kmer = str(temp[j : j + self.k])
-                    kmer_reversed = str(Seq(kmer).reverse_complement())
-                    if kmer > kmer_reversed:
-                        self.lookup(kmer)
-                    else:
-                        self.lookup(kmer_reversed)
+                    self.lookup_canonical(kmer)
                     if self.hit == True:
                         hit_counter += 1
         elif quick == 3:
@@ -307,11 +311,7 @@ class AbaumanniiBloomfilter:
                         continue
                     self.number_of_kmeres += 1
                     kmer = str(single_read[j : j + self.k])
-                    kmer_reversed = str(Seq(kmer).reverse_complement())
-                    if kmer > kmer_reversed:
-                        self.lookup(kmer)
-                    else:
-                        self.lookup(kmer_reversed)
+                    self.lookup_canonical(kmer)
         # metagenome mode
         elif quick == 4:
             print("Stage 1")
@@ -332,13 +332,8 @@ class AbaumanniiBloomfilter:
                 self.hits_per_filter = [0] * self.clonetypes
                 for kmer in read:
                     counter += 1
-                    # lookup for kmer, use lexikographical smaller kmer
                     self.number_of_kmeres += 1
-                    kmer_reversed = str(Seq(kmer).reverse_complement())
-                    if kmer > kmer_reversed:
-                        self.lookup(kmer)
-                    else:
-                        self.lookup(kmer_reversed)
+                    self.lookup_canonical(kmer)
                 score = self.get_score()
                 score_edit = [str(x) for x in score]
                 score_edit = ",".join(score_edit)
@@ -383,9 +378,9 @@ class AbaumanniiBloomfilter:
                 bootstrap_score = bootstrap_score / bootstrap_n
                 # bootstrap_score = 1
-                if ("A." + prediction) not in reads_classified:
+                if prediction not in reads_classified:
                     # Value 5 war vohrer = read
-                    reads_classified["A." + prediction] = [
+                    reads_classified[prediction] = [
                         [max(score)],
                         1,
                         [len(read)],
@@ -395,13 +390,11 @@ class AbaumanniiBloomfilter:
                         None,
                     ]
                 else:
-                    reads_classified["A." + prediction][0] += [max(score)]
-                    reads_classified["A." + prediction][1] += 1
-                    reads_classified["A." + prediction][2] += [len(read)]
-                    reads_classified["A." + prediction][3] += sorted(score)[-2] / max(
-                        score
-                    )
-                    reads_classified["A." + prediction][4] += [bootstrap_score]
+                    reads_classified[prediction][0] += [max(score)]
+                    reads_classified[prediction][1] += 1
+                    reads_classified[prediction][2] += [len(read)]
+                    reads_classified[prediction][3] += sorted(score)[-2] / max(score)
+                    reads_classified[prediction][4] += [bootstrap_score]
                     # reads_classified["A." + prediction][5] += None
                 # tracker.print_diff()
             # not ready yet
@@ -475,11 +468,7 @@ class AbaumanniiBloomfilter:
                     self.number_of_kmeres += 1
                     # lookup for kmer
                     kmer = str(single_read[j : j + self.k])
-                    kmer_reversed = str(Seq(kmer).reverse_complement())
-                    if kmer > kmer_reversed:
-                        self.lookup(kmer)
-                    else:
-                        self.lookup(kmer_reversed)
+                    self.lookup_canonical(kmer)
     def cleanup(self):
         """deletes matrix"""

xspect/WebApp.py CHANGED Viewed

@@ -12,7 +12,7 @@ import logging
 import pickle
 import secrets
 import pandas as pd
-from Bio import Entrez, Medline
+from Bio import Entrez, Medline, SeqIO
 from flask import (
     Flask,
     render_template,
@@ -139,35 +139,25 @@ def assignspec():
         return redirect("/resultsspec")
     else:
-        # Checking file type
-        # if the file is fasta -> concat lines
-        ext = filename.split(".")[-2]
+        ext = filename.split(".")[-1]
         with open(filename) as f:
             reads = f.read().splitlines()
-        # Concat Lines if not .fq file
-        if ext != "fq" and ext != "fastq":
-            reads = "".join(reads)
-            reads = reads.split(">")
+        if ext == "fq" or ext == "fastq":
+            sequences = SeqIO.parse(filename, "fastq")
+            quick = 2
+        else:
             if quick:
                 quick = 1
             else:
                 quick = 0
-            if metagenome:
-                quick = 4
-            reads.pop(0)
-        else:
-            if metagenome:
-                quick = 4
-            else:
-                quick = 2
-        # deleting file
-        os.remove(filename)
+            sequences = SeqIO.parse(filename, "fasta")
+        reads = [str(sequence.seq).upper() for sequence in sequences]
-        for i in range(len(reads)):
-            reads[i] = reads[i].upper()
     # starts the lookup for a given sequence
     if metagenome:
+        quick = 4
         start_meta = time.time()
         reads, reads_oxa = read_search_pre(reads, BF_Master_prefilter, ext)
         end_meta = time.time()
@@ -193,8 +183,8 @@ def assignspec():
         # assign reads to species
         species_dict = {}
         predictions_names = set()
-        for ele in predictions:
-            predictions_names.add(ele)
+        for prediction in predictions:
+            predictions_names.add(prediction)
         for species in predictions_names:
             species_dict[species] = []
         # dict with species as keys and reads as values for oxa search
@@ -323,11 +313,6 @@ def assignspec():
         )
         return redirect("/resultsspec")
-    app.logger.info(
-        "Assignment done for " + str(filename) + ", Time needed: " + str(needed)
-    )
-    return redirect("/resultsspec")
 # about page
 @app.route("/about")
@@ -511,6 +496,7 @@ def resultsspec():
     elif metagenome:
         reads_classified = session.get("reads_classified")
+        genus = session.get("genus")
         # sort reads_classified by highest value of the second element
         sorted_reads_classified = dict(
             sorted(reads_classified.items(), key=lambda x: x[1][1], reverse=True)
@@ -518,8 +504,9 @@ def resultsspec():
         # get key of reads_classified with highest value of the second element from the value
         predictions = []
         values = []
+        translation_dict = load_translation_dict(genus)
         for key, value in sorted_reads_classified.items():
-            predictions.append(key)
+            predictions.append(translation_dict[key])
             values.append(value[1])
         clonetypes_sorted = predictions[:12]
         values_sorted = values[:12]

xspect/XspecT_mini.py CHANGED Viewed

@@ -608,7 +608,9 @@ def xspecT(BF, BF_1_1, files, paths, file_format, read_amount, metagenome, genus
                     threshold_contig = sample_size * 0.7
                     for i in range(0, len(str(sequence.seq)) - BF_1_1.k, sample_size):
                         if "N" not in str(sequence.seq[i : i + BF_1_1.k]):
-                            BF_1_1.lookup(str(sequence.seq[i : i + BF_1_1.k]).upper())
+                            BF_1_1.lookup_canonical(
+                                str(sequence.seq[i : i + BF_1_1.k]).upper()
+                            )
                     # needs at least 70% hits to continue with the contig
                     counter = 0
@@ -620,7 +622,7 @@ def xspecT(BF, BF_1_1, files, paths, file_format, read_amount, metagenome, genus
                                 )
                                 counter += 1
                                 # how many kmers? to use
-                                if counter >= 5000000:
+                                if counter >= 5000:
                                     break
                         # contigs_kmers.append(str(reverse_sequence[j: j + BF_1_1.k]))
                         contigs.append(contigs_kmers)
@@ -638,7 +640,7 @@ def xspecT(BF, BF_1_1, files, paths, file_format, read_amount, metagenome, genus
                         for j in range(len(contigs[i])):
                             BF_1_1.number_of_kmeres += 1
                             hits_per_filter_copy = BF_1_1.hits_per_filter[:]
-                            BF_1_1.lookup(contigs[i][j])
+                            BF_1_1.lookup_canonical(contigs[i][j])
                             if hits_per_filter_copy != BF_1_1.hits_per_filter:
                                 threshold += 1
                         # parameter value needs to be determined
@@ -654,11 +656,7 @@ def xspecT(BF, BF_1_1, files, paths, file_format, read_amount, metagenome, genus
                     BF.hits_per_filter = [0] * BF.clonetypes
                     for kmer in contigs_filtered:
                         BF.number_of_kmeres += 1
-                        kmer_reversed = str(Seq.Seq(kmer).reverse_complement())
-                        if kmer > kmer_reversed:
-                            BF.lookup(kmer)
-                        else:
-                            BF.lookup(kmer_reversed)
+                        BF.lookup_canonical(kmer)
                     score = BF.get_score()
                     score_edit = [str(x) for x in score]
                     score_edit = ",".join(score_edit)
@@ -750,11 +748,7 @@ def xspecT(BF, BF_1_1, files, paths, file_format, read_amount, metagenome, genus
                     for j in range(0, len(sequence.seq) - BF.k, mode):
                         BF.number_of_kmeres += 1
                         kmer = str(sequence.seq[j : j + BF.k])
-                        kmer_reversed = str(Seq.Seq(kmer).reverse_complement())
-                        if kmer > kmer_reversed:
-                            BF.lookup(kmer)
-                        else:
-                            BF.lookup(kmer_reversed)
+                        BF.lookup_canonical(kmer)
             score = BF.get_score()
             # print("Scores: ", score)
@@ -884,11 +878,11 @@ def xspecT(BF, BF_1_1, files, paths, file_format, read_amount, metagenome, genus
                     #    if "N" not in str(sequence.seq[i: i + BF_1_1.k]):
                     #        BF_1_1.lookup(str(sequence.seq[i: i + BF_1_1.k]))
                     if "N" not in str(sequence.seq):
-                        BF_1_1.lookup(k1)
-                        BF_1_1.lookup(k2)
-                        BF_1_1.lookup(k3)
-                        BF_1_1.lookup(k4)
-                        BF_1_1.lookup(k5)
+                        BF_1_1.lookup_canonical(k1)
+                        BF_1_1.lookup_canonical(k2)
+                        BF_1_1.lookup_canonical(k3)
+                        BF_1_1.lookup_canonical(k4)
+                        BF_1_1.lookup_canonical(k5)
                     else:
                         continue
                     # needs at least 2 of 3 hits to continue with read
@@ -913,7 +907,7 @@ def xspecT(BF, BF_1_1, files, paths, file_format, read_amount, metagenome, genus
                             BF_1_1.number_of_kmeres += 1
                             hits_per_filter_copy = BF_1_1.hits_per_filter[:]
                             if "N" not in reads[i][j]:
-                                BF_1_1.lookup(reads[i][j])
+                                BF_1_1.lookup_canonical(reads[i][j])
                             if hits_per_filter_copy != BF_1_1.hits_per_filter:
                                 threshold += 1
                         if threshold >= 0.7 * len(reads[i]):
@@ -929,11 +923,7 @@ def xspecT(BF, BF_1_1, files, paths, file_format, read_amount, metagenome, genus
                     for kmer in reads_filtered:
                         if "N" not in kmer:
                             BF.number_of_kmeres += 1
-                            kmer_reversed = str(Seq.Seq(kmer).reverse_complement())
-                            if kmer > kmer_reversed:
-                                BF.lookup(kmer)
-                            else:
-                                BF.lookup(kmer_reversed)
+                            BF.lookup_canonical(kmer)
                         else:
                             continue
                     score = BF.get_score()
@@ -1041,11 +1031,7 @@ def xspecT(BF, BF_1_1, files, paths, file_format, read_amount, metagenome, genus
                         for j in range(0, len(sequence.seq) - BF.k + 1, mode):
                             BF.number_of_kmeres += 1
                             kmer = str(sequence.seq[j : j + BF.k])
-                            kmer_reversed = str(Seq.Seq(kmer).reverse_complement())
-                            if kmer > kmer_reversed:
-                                BF.lookup(kmer)
-                            else:
-                                BF.lookup(kmer_reversed)
+                            BF.lookup_canonical(kmer)
                     else:
                         break
             score = BF.get_score()

xspect/main.py CHANGED Viewed

@@ -56,6 +56,7 @@ def classify(genus, path, species, ic, oxa, metagenome, complete, save):
         mode = 1
     file_format = "fasta"
     read_amount = 342480
+    print(mode)
     xspecT_mini(
         path,
@@ -111,7 +112,6 @@ def train(genus, bf_assembly_path, svm_assembly_path, complete, check):
 @cli.command()
 def web():
     """Open the XspecT web app."""
-    webbrowser.open("http://localhost:8000")
     app.run(host="0.0.0.0", port=8000, debug=True, threaded=True)

xspect/search_filter.py CHANGED Viewed

@@ -299,7 +299,7 @@ def read_search_pre(reads, BF_pre, ext):
             threshold_read = sample_size * 0.7
             for i in range(0, len(single_read) - BF_pre.k, sample_size):
                 if "N" not in single_read[i : i + BF_pre.k]:
-                    BF_pre.lookup(single_read[i : i + BF_pre.k])
+                    BF_pre.lookup_canonical(single_read[i : i + BF_pre.k])
         # for reads use a static sample of 5
         # Taking sum of list as reference, if sum has not increased after testing those 3 kmeres,
         # then the read won't be tested further
@@ -312,11 +312,11 @@ def read_search_pre(reads, BF_pre, ext):
             k4 = single_read[BF_pre.k : BF_pre.k * 2]
             k5 = single_read[mid + BF_pre.k : mid + BF_pre.k * 2]
             if "N" not in single_read:
-                BF_pre.lookup(k1)
-                BF_pre.lookup(k2)
-                BF_pre.lookup(k3)
-                BF_pre.lookup(k4)
-                BF_pre.lookup(k5)
+                BF_pre.lookup_canonical(k1)
+                BF_pre.lookup_canonical(k2)
+                BF_pre.lookup_canonical(k3)
+                BF_pre.lookup_canonical(k4)
+                BF_pre.lookup_canonical(k5)
             threshold_read = 3
         # needs at least 2 of 3 hits to continue with read
         counter = 0
@@ -348,7 +348,7 @@ def read_search_pre(reads, BF_pre, ext):
         for j in range(len(reads_new[i])):
             BF_pre.number_of_kmeres += 1
             hits_per_filter_copy = BF_pre.hits_per_filter[:]
-            BF_pre.lookup(reads_new[i][j])
+            BF_pre.lookup_canonical(reads_new[i][j])
             if hits_per_filter_copy != BF_pre.hits_per_filter:
                 threshold += 1
         if threshold >= cutoff * len(reads_new[i]):
@@ -373,7 +373,7 @@ def read_search_spec(reads, quick, BF, ext, genus):
         names = [translation_dict[name] for name in names_id]
         return score, names, hits, None
     # Metagenome mode
-    elif quick == 4:
+    elif  quick == 4:
         reads_classified, predictions = BF.lookup_txt(reads, genus, ext, quick)
         hits = None
         names = None

xspect/train_filter/README_XspecT_Erweiterung.md ADDED Viewed

@@ -0,0 +1,119 @@
+# XspecT-Erweiterung
+Extends XspecT so that new filters for a genus can be trained automatically. Its main
+script is XspecT_trainer.py. The rest of the scripts are inside the Python module
+train_filter.
+## Training new filter
+XspecT_trainer.py uses command line arguments. The examples for XspecT_trainer.py
+below use Salmonella, since this genus has only two defined species in the NCBI
+databases.
+### Jellyfish
+The program jellyfish is used to count distinct k-mers in the assemblies. For
+XspecT_trainer.py to work, jellyfish needs to be installed. It can be installed using bioconda:
+`
+conda install -c bioconda jellyfish
+`
+### Training examples
+New filters with assemblies from NCBI RefSeq can be trained with the following line. The
+Python libraries from [requirements.txt](../requirements.txt) need to be installed.
+`
+python XspecT_trainer.py Salmonella 1
+`
+Training filters with custom data can be done using the following line.
+`
+python XspecT_trainer.py Salmonella 2 -bf /path/to/concate_assemblies -svm
+/path/to/assemblies
+`
+All command line arguments are explained using the following line.
+`
+python XspecT_trainer.py -h
+`
+# Explanation of the scripts
+## backup_filter.py
+Creates a backup of all files needed for the species assignment by XspecT for a specific
+genus. The backup is made when new filters are created for a genus that
+already has trained filters.
+## create_svm.py
+Downloads the needed assemblies and trains a support-vector-machine for the genus.
+## extract_and_concatenate.py
+Unzips the downloaded assemblies. Concatenates assemblies per species that will be used
+to train the Bloom filters.
+## get_paths.py
+Functions that get specific paths.
+## html_scrap.py
+Updates a list of all NCBI RefSeq assembly accessions that have a taxonomy check result
+of OK. The taxonomy check from NCBI RefSeq uses the ANI (average nucleotide
+identity) to compute a result.
+## interface_XspecT.py
+Mostly functions that train new Bloom filters automatically. The functions were
+originally written for XspecT in a non-automatic way and were updated.
+## k_mer_count.py
+Uses jellyfish to count distinct k-mers in every concatenated assembly. The highest
+count will be used to compute the size of the Bloom filters.
+## ncbi_api
+A module which makes requests to the NCBI Datasets API.
+### download_assemblies.py
+The specific function that downloads assemblies from NCBI RefSeq using NCBI
+datasets.
+### ncbi_assembly_metadata.py
+Takes a dictionary with species and their taxon ID and asks NCBI for assemblies of
+the species. Saves the collected accessions of the found and selected assemblies.
+### ncbi_children_tree.py
+Takes the name or ID of a genus and gives a list with all its species.
+### ncbi_taxon_metadata.py
+Takes a list of taxa and collects metadata such as their scientific names and ranks.

xspect/train_filter/create_svm.py CHANGED Viewed

@@ -127,7 +127,7 @@ def perform_lookup(bloomfilter, files, file_paths, accessions, names, spacing):
             # Dominik: changed sample size to var
             for j in range(0, len(sequence.seq) - BF.k, spacing):
                 BF.number_of_kmeres += 1
-                BF.lookup(str(sequence.seq[j : j + BF.k]))
+                BF.lookup_canonical(str(sequence.seq[j : j + BF.k]))
         score = BF.get_score()
         score = [str(x) for x in score]

{XspecT-0.1.2.dist-info → XspecT-0.1.3.dist-info}/LICENSE RENAMED Viewed

File without changes

{XspecT-0.1.2.dist-info → XspecT-0.1.3.dist-info}/WHEEL RENAMED Viewed

File without changes

{XspecT-0.1.2.dist-info → XspecT-0.1.3.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{XspecT-0.1.2.dist-info → XspecT-0.1.3.dist-info}/top_level.txt RENAMED Viewed

File without changes

XspecT 0.1.2__py3-none-any.whl → 0.1.3__py3-none-any.whl

Potentially problematic release.

XspecT 0.1.2py3-none-any.whl → 0.1.3py3-none-any.whl