XspecT 0.1.3-py3-none-any.whl → 0.2.0-py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
Potentially problematic release.
This version of XspecT might be problematic.
- {XspecT-0.1.3.dist-info → XspecT-0.2.0.dist-info}/METADATA +23 -29
- XspecT-0.2.0.dist-info/RECORD +30 -0
- {XspecT-0.1.3.dist-info → XspecT-0.2.0.dist-info}/WHEEL +1 -1
- xspect/definitions.py +42 -0
- xspect/download_filters.py +11 -26
- xspect/fastapi.py +101 -0
- xspect/file_io.py +34 -103
- xspect/main.py +70 -66
- xspect/model_management.py +88 -0
- xspect/models/__init__.py +0 -0
- xspect/models/probabilistic_filter_model.py +277 -0
- xspect/models/probabilistic_filter_svm_model.py +169 -0
- xspect/models/probabilistic_single_filter_model.py +109 -0
- xspect/models/result.py +148 -0
- xspect/pipeline.py +201 -0
- xspect/run.py +38 -0
- xspect/train.py +304 -0
- xspect/train_filter/create_svm.py +6 -183
- xspect/train_filter/extract_and_concatenate.py +117 -121
- xspect/train_filter/html_scrap.py +16 -28
- xspect/train_filter/ncbi_api/download_assemblies.py +7 -8
- xspect/train_filter/ncbi_api/ncbi_assembly_metadata.py +9 -17
- xspect/train_filter/ncbi_api/ncbi_children_tree.py +3 -2
- xspect/train_filter/ncbi_api/ncbi_taxon_metadata.py +7 -5
- XspecT-0.1.3.dist-info/RECORD +0 -49
- xspect/BF_v2.py +0 -637
- xspect/Bootstrap.py +0 -29
- xspect/Classifier.py +0 -142
- xspect/OXA_Table.py +0 -53
- xspect/WebApp.py +0 -724
- xspect/XspecT_mini.py +0 -1363
- xspect/XspecT_trainer.py +0 -611
- xspect/map_kmers.py +0 -155
- xspect/search_filter.py +0 -504
- xspect/static/How-To.png +0 -0
- xspect/static/Logo.png +0 -0
- xspect/static/Logo2.png +0 -0
- xspect/static/Workflow_AspecT.png +0 -0
- xspect/static/Workflow_ClAssT.png +0 -0
- xspect/static/js.js +0 -615
- xspect/static/main.css +0 -280
- xspect/templates/400.html +0 -64
- xspect/templates/401.html +0 -62
- xspect/templates/404.html +0 -62
- xspect/templates/500.html +0 -62
- xspect/templates/about.html +0 -544
- xspect/templates/home.html +0 -51
- xspect/templates/layoutabout.html +0 -87
- xspect/templates/layouthome.html +0 -63
- xspect/templates/layoutspecies.html +0 -468
- xspect/templates/species.html +0 -33
- xspect/train_filter/README_XspecT_Erweiterung.md +0 -119
- xspect/train_filter/get_paths.py +0 -35
- xspect/train_filter/interface_XspecT.py +0 -204
- xspect/train_filter/k_mer_count.py +0 -162
- {XspecT-0.1.3.dist-info → XspecT-0.2.0.dist-info}/LICENSE +0 -0
- {XspecT-0.1.3.dist-info → XspecT-0.2.0.dist-info}/entry_points.txt +0 -0
- {XspecT-0.1.3.dist-info → XspecT-0.2.0.dist-info}/top_level.txt +0 -0
xspect/map_kmers.py
DELETED
@@ -1,155 +0,0 @@
import statistics
import pickle
import os
from sklearn.cluster import KMeans
import numpy as np


def identify_split_reads(score, kmer_hits_single):
    """This method identifies if the read is split because of HGT"""

    # notes
    """
    Identify split read regions from the kmer profiles, one side should cluster Ones and the other side should cluster Zeros
    for the respective species.
    1.) Collect the kmer profiles, transforms the kmer_hit vector into a species_hit vector --> shows which positions in the read are covered
    2.) Identify part of kmer profiles with a high number of ones --> shows where the read is covered
    3.) Divide the kmer profiles into two clusters --> One with Zeros and one with Ones
        --> Maximize the number of ones in the first cluster and the number of zeros in the second cluster
    """

    # initialize variables
    kmer_profiles = [] * len(score)
    split_regions = []
    index_result = max(range(len(score)), key=score.__getitem__)

    # Collect the kmer profiles, transforms the kmer_hit vector into a species_hit vector
    for kmer_hits in kmer_hits_single:
        for i in range(len(kmer_hits)):
            kmer_profiles[i].append(kmer_hits[i])

    og_species = None
    novel_species = None
    clusters = cluster_zeros_ones(kmer_profiles[index_result])
    split_regions.append((clusters, i))
    # which part of the read belongs to the original species, 0 = left, 1 = right
    if sum(clusters[0]) > sum(clusters[1]):
        og_species = 0
        novel_species = 1
    else:
        og_species = 1
        novel_species = 0

    for i, kmer_profile in enumerate(kmer_profiles):
        if i == index_result:
            continue
        clusters = cluster_zeros_ones(kmer_profile)
        # 0.3 and 0.6 are arbitrary values, they should be tested etc.
        # Find the complementary species of the split read --> HGT Donor
        if (
            sum(clusters[og_species]) / len(clusters[og_species]) < 0.3
            and sum(clusters[novel_species]) / len(clusters[novel_species]) >= 0.6
        ):
            split_regions.append((clusters, i))
            break

    # split_regions = [([cluster_0, cluster_1], og_index), ([cluster_0, cluster_1], novel_index)]
    return split_regions


def cluster_zeros_ones(input_list, threshold=None):
    """This method divides a list of zeros and ones into two lists,
    maximizing occurrences of zeros and ones in each list"""

    # min. length of a cluster
    if threshold == None:
        threshold = len(input_list) * 0.1

    # copy the input list
    input_list_copy = input_list[:]

    # convert zeros to -1 so that they function as penalty
    input_list[:] = [-1 if x == 0 else 1 for x in input_list]
    cluster_score = 0
    # calculate the score for the cluster for each possible split
    for i in range(threshold, len(input_list) - threshold):
        # goal is to maximize the score --> highest score is the best split, contains the most ones and least zeros (-1)
        score = max(sum(input_list[:i]), sum(input_list[i:]))
        if score > cluster_score:
            cluster_score = score
            split_index = i

    # split the input list into two clusters
    cluster_0 = input_list_copy[:split_index]
    cluster_1 = input_list_copy[split_index:]

    return [cluster_0, cluster_1]


# TODO:
# rename function to map_kmers and split into two functions -> second one to cluster kmers
def cluster_kmers(kmer_list, kmer_dict):
    """Map kmers to their respective genome"""
    clusters = {}
    contig_median_list = []
    # Loop over all k-mers in kmer_list
    for i in range(len(kmer_list)):
        kmer = kmer_list[i]
        # Check whether the k-mer is present in kmer_dict
        kmer_info = kmer_dict.get(kmer)
        if kmer_info is None:
            continue
        # Get the contig ID and k-mer position from kmer_dict
        kmer_id = kmer_dict[kmer][1]
        kmer_pos = kmer_dict[kmer][0]
        # Add the k-mer to the corresponding contig in the dictionary
        # One contig is one cluster
        if kmer_id not in clusters:
            clusters[kmer_id] = []
        clusters[kmer_id].append((kmer, kmer_pos))
    # Loop over all contigs in the clusters dictionary
    for contig in clusters:
        contig_list = clusters[contig]
        contig_len = len(contig_list)
        if contig_len < 2:
            # print("Too few k-mers in the contig!")
            continue
        # Sort the k-mers in the list by position
        sorted_contig_list = sorted(contig_list, key=lambda x: x[1])
        distances = []
        # Loop over the sorted list of k-mers in the contig
        for i in range(1, len(sorted_contig_list)):
            kmer_pos = sorted_contig_list[i][1]
            prev_kmer_pos = sorted_contig_list[i - 1][1]
            # Compute the distance to the previous k-mer
            distance = kmer_pos - prev_kmer_pos
            distances.append(distance)
        # Compute the median distance for the current contig
        # print(distances)
        median_distance = statistics.median(distances)
        # print(median_distance)
        contig_median_list.append((contig, median_distance, contig_len))
    # Total number of contigs in contig_median_list
    num_contigs = len(contig_median_list)
    # List of all contig sizes
    contig_lengths = [x[2] for x in contig_median_list]
    # List of all median distances
    median_distances = [x[1] for x in contig_median_list]
    # Compute the median of the median distances
    if len(median_distances) > 0:
        median_of_medians = statistics.median(median_distances)
    else:
        median_of_medians = None
    # Build the result list
    result = [num_contigs, median_of_medians, contig_lengths]
    return result


def main():
    # Directory containing the genomes
    genome_dir = "path/to/genomes"
    # create_genome_kmer_list(genome_dir, 21, "Acinetobacter")


if __name__ == "__main__":
    main()
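For orientation, here is a minimal, self-contained sketch of the split-scoring idea behind the removed cluster_zeros_ones: zeros become a -1 penalty, every admissible split position is scored, and the best split divides a 0/1 k-mer hit profile into two clusters. The function name split_hit_profile and the int() cast on the minimum cluster length are illustrative assumptions (the removed version passed a float threshold straight to range()); this is a sketch, not the packaged code.

def split_hit_profile(profile, min_cluster_fraction=0.1):
    """Split a 0/1 hit profile into two clusters, maximizing ones on one
    side and zeros on the other (illustrative sketch, not XspecT code)."""
    # minimum cluster length, cast to int so range() accepts it
    min_len = max(1, int(len(profile) * min_cluster_fraction))
    # zeros act as a -1 penalty, ones count as +1
    signed = [-1 if x == 0 else 1 for x in profile]

    best_score, split_index = float("-inf"), min_len
    for i in range(min_len, len(signed) - min_len):
        # the better half of each candidate split determines its score
        score = max(sum(signed[:i]), sum(signed[i:]))
        if score > best_score:
            best_score, split_index = score, i

    return profile[:split_index], profile[split_index:]


# A profile whose left half is covered (ones) and right half is not (zeros)
# splits cleanly in the middle:
left, right = split_hit_profile([1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0])
print(len(left), len(right))  # 6 6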
xspect/search_filter.py
DELETED
@@ -1,504 +0,0 @@
from multiprocessing import Process, Pipe
import pickle
import glob
import os
from collections import Counter
import time
from pathlib import Path
from linecache import getline
from xspect.train_filter.interface_XspecT import load_translation_dict
import xspect.BF_v2 as BF_v2
from multiprocessing import Process, Pipe
import pickle
import glob
import os
from collections import Counter
import time
from pathlib import Path
from linecache import getline
from xspect.train_filter.interface_XspecT import load_translation_dict


def get_added_genomes():
    """Reads in pickled list, returns none if no new genomes have been added"""
    with open(r"filter/FilterClonetypes.txt", "rb") as fp:
        clonetypes = pickle.load(fp)

    # IC1 to IC8 are not deletable. That means if the IC-list is not longer than 8, then
    # there are no new IC's
    if len(clonetypes) == 8:
        added = [None]
    else:
        # gives all added genomes after IC8
        added = clonetypes[8:]

    return added


def read_search(IC_lookup, reads, quick, pipe=None):
    with open(r"filter/FilterClonetypes.txt", "rb") as fp:
        clonetypes = pickle.load(fp)
    # initialising filter with database parameters
    BF = BF_v2.AbaumanniiBloomfilter(123000000)
    BF.set_arraysize(123000000)
    BF.set_hashes(7)
    BF.set_k(20)

    # Array Size 22.000.000
    paths = [
        r"filter/IC1.txt",
        r"filter/IC2.txt",
        r"filter/IC3.txt",
        r"filter/IC4.txt",
        r"filter/IC5.txt",
        r"filter/IC6.txt",
        r"filter/IC7.txt",
        r"filter/IC8.txt",
    ]

    if IC_lookup[8]:
        # added Genomes
        # IC1 to IC8
        # Selecting wanted slices
        for i in [7, 6, 5, 4, 3, 2, 1, 0]:
            if IC_lookup[i]:
                pass
            else:
                del clonetypes[i]
                del paths[i]

        # getting all added files
        temp = glob.glob("filter/added/*.txt")
        added = []
        if len(temp) == 0:
            pass
        else:
            # these for-loops are needed for sorting the paths
            # so they match with the pickle-list order
            for i in range(len(clonetypes)):
                for j in range(len(temp)):
                    if clonetypes[i] in temp[j]:
                        added.append(temp[j])

            paths.extend(added)

        BF.read_clonetypes(paths, clonetypes)

    else:
        # Only IC1 to IC8
        # Selecting wanted slices
        clonetypes = clonetypes[:8]
        for i in [7, 6, 5, 4, 3, 2, 1, 0]:
            if IC_lookup[i]:
                pass
            else:
                del clonetypes[i]
                del paths[i]

        BF.read_clonetypes(paths, clonetypes)
    BF.lookup_txt(reads, False, quick)
    score = BF.get_score()
    hits = BF.get_hits_per_filter()
    names = BF.get_names()
    BF.cleanup()
    del BF

    if pipe is not None:
        pipe.send([score, names, hits])
        pipe.close()
    else:
        return score, names, hits


def pre_processing_ClAssT():
    "Preprocesses the Bloomfilter-Matrix when the program is launched"
    with open(r"filter/FilterClonetypes.txt", "rb") as fp:
        clonetypes = pickle.load(fp)
    # initialising filter with database parameters
    # kmer20 = 115000000
    # kmer31 = 122000000
    BF = BF_v2.AbaumanniiBloomfilter(123000000)
    BF.set_arraysize(123000000)
    BF.set_hashes(7)
    BF.set_k(20)
    # paths = sorted(os.listdir(r"filter/species/"))
    paths = [
        r"filter/IC1.txt",
        r"filter/IC2.txt",
        r"filter/IC3.txt",
        r"filter/IC4.txt",
        r"filter/IC5.txt",
        r"filter/IC6.txt",
        r"filter/IC7.txt",
        r"filter/IC8.txt",
    ]
    BF.read_clonetypes(paths, clonetypes)
    return BF


def get_genera_array_sizes():
    """Searches for all genera that have Bloomfilters.

    :return: A dictionary with the genus name as key and a list of array sizes as value.
    """
    array_size_path = Path(os.getcwd()) / "filter" / "array_sizes"

    # Get a list of all genera with bloomfilters, exclude hidden files such as .DS_Store.
    genera = [
        file.name for file in array_size_path.iterdir() if not file.name.startswith(".")
    ]
    genera_array_sizes = dict()

    # Iterate through all genera.
    for file_name in genera:
        genus_name = str(file_name).split(".")[0]
        file_path = array_size_path / file_name
        sizes = getline(str(file_path), 1).replace("\n", "")

        # Array sizes are saved as a list with the size for species BF as first and meta-mode BF as second entry.
        array_sizes = sizes.split(" ")

        # Genus name is the key and array size list as value.
        genera_array_sizes[genus_name] = array_sizes

    return genera_array_sizes


def pre_process_genus(genus, array_size, k=21, meta_mode=False):
    """Pre processes the bloomfilter for the selected genus.

    :param genus: Name of the genus.
    :type genus: str
    :param array_size: Size of the bloomfilter.
    :type array_size: int
    :param k: K-mer length.
    :type k: int
    :param meta_mode: Decides if metagenome mode was selected.
    :type meta_mode: bool
    :return: The preprocessed bloomfilter.
    """
    # Get the correct path to bloomfilter names.
    if meta_mode:
        file_name = "Filter" + genus + "Complete.txt"
    else:
        file_name = "Filter" + genus + ".txt"
    names_path = Path(os.getcwd()) / "filter" / "species_names" / file_name
    with open(names_path, "rb") as fp:
        names = pickle.load(fp)
    # Set bloomfilter variables.
    BF = BF_v2.AbaumanniiBloomfilter(array_size)
    BF.set_arraysize(array_size)
    BF.set_hashes(7)
    BF.set_k(k)

    # Get paths to the bloomfilters.
    if meta_mode:
        paths = [Path(os.getcwd()) / "filter" / "Metagenomes" / (genus + ".txt")]
    else:
        genus_path = Path(os.getcwd()) / "filter" / genus
        paths = sorted(os.listdir(genus_path))
        for i in range(len(paths)):
            paths[i] = genus_path / paths[i]

    BF.read_clonetypes(paths, names)
    return BF


def pre_process_all(genera, k=21, meta_mode=False, genus=None):
    """Pre process bloomfilters for all genera.

    :param genera: All genera with their array sizes.
    :type genera: dict[str, List[str]]
    :param k: K-mer length.
    :type k: int
    :param meta_mode: Decides if metagenome mode was selected.
    :type meta_mode: bool
    :param genus: Name of genus that will be the only genus to be pre processed.
    :type genus: list
    :return: All genera as keys and their bloomfilters as values.
    """
    bloomfilters = dict()
    # If a genus name is given, only pre process the given genus.
    if genus:
        for current_genus in list(genera.keys()):
            if current_genus not in genus:
                del genera[current_genus]
    for genus in genera.keys():
        if meta_mode:
            BF = pre_process_genus(
                genus, int(genera[genus][1]), k=k, meta_mode=meta_mode
            )
        else:
            BF = pre_process_genus(
                genus, int(genera[genus][0]), k=k, meta_mode=meta_mode
            )
        bloomfilters[genus] = BF
    return bloomfilters


def pre_processing(genus):
    "Preprocesses the Bloomfilter-Matrix when the program is launched"
    filename = "Filter" + genus + ".txt"
    with open(r"filter/species_names/" + filename, "rb") as fp:
        clonetypes = pickle.load(fp)
    # initialising filter with database parameters
    # get BF parameters from file
    with open(r"filter/array_sizes/" + genus + ".txt", "r") as fp:
        array_size = fp.readline()
        array_size = int(array_size.split(" ")[0])
    # 115000000 old
    BF = BF_v2.AbaumanniiBloomfilter(array_size)
    BF.set_arraysize(array_size)
    BF.set_hashes(7)
    BF.set_k(21)
    # paths = sorted(os.listdir(r"filter/species_reversed/"))
    # change to dynamic path
    paths = sorted(os.listdir(r"filter/" + genus + "/"))
    for i in range(len(paths)):
        paths[i] = r"filter/" + genus + "/" + paths[i]
    BF.read_clonetypes(paths, clonetypes)
    print("Preprocessing done for: ", genus)
    return BF


def pre_processing_prefilter2(genus):
    "Preprocesses Acinetobacter Prefilter, collapse with other prefilter after testing"
    filename = "Filter" + genus + "Complete.txt"
    with open(r"filter/species_names/" + filename, "rb") as fp:
        clonetypes = pickle.load(fp)
    # get array size from file
    with open(r"filter/array_sizes/" + genus + ".txt", "r") as fp:
        array_size = fp.readline()
        array_size = int(array_size.split(" ")[1])
    # initialising filter with database parameters
    BF = BF_v2.AbaumanniiBloomfilter(array_size)
    BF.set_arraysize(array_size)
    BF.set_hashes(7)
    BF.set_k(21)
    paths = ["filter/Metagenomes/" + genus + ".txt"]
    BF.read_clonetypes(paths, clonetypes)
    print("Preprocessing done for Metagenome Prefilter: ", genus)
    return BF


def read_search_pre(reads, BF_pre, ext):
    reads_new = []
    counter = 0
    BF_pre.number_of_kmeres = 0
    BF_pre.hits_per_filter = [0]
    read_amount = 0
    reads_oxa_prefilter = []
    reads_oxa_filtered = []
    for single_read in reads:
        read_kmers = []
        hit_sum = sum(BF_pre.hits_per_filter)
        hits_per_filter_copy = BF_pre.hits_per_filter[:]
        # use a scaling sample size for contigs/scaffolds
        if ext == "fasta" or ext == "fna" or ext == "fa":
            sample_size = int(len(single_read) ** 0.5)
            threshold_read = sample_size * 0.7
            for i in range(0, len(single_read) - BF_pre.k, sample_size):
                if "N" not in single_read[i : i + BF_pre.k]:
                    BF_pre.lookup_canonical(single_read[i : i + BF_pre.k])
        # for reads use a static sample of 5
        # Taking sum of list as reference, if sum has not increased after testing those 3 k-mers,
        # then the read won't be tested further
        else:
            # TO-DO implement dynamic sample size
            k1 = single_read[0 : BF_pre.k]  # first k-mer
            k2 = single_read[len(single_read) - BF_pre.k :]  # last k-mer
            mid = len(single_read) // 2
            k3 = single_read[mid : mid + BF_pre.k]  # k-mer in middle
            k4 = single_read[BF_pre.k : BF_pre.k * 2]
            k5 = single_read[mid + BF_pre.k : mid + BF_pre.k * 2]
            if "N" not in single_read:
                BF_pre.lookup_canonical(k1)
                BF_pre.lookup_canonical(k2)
                BF_pre.lookup_canonical(k3)
                BF_pre.lookup_canonical(k4)
                BF_pre.lookup_canonical(k5)
            threshold_read = 3
        # needs at least 2 of 3 hits to continue with read
        counter = 0
        if (sum(BF_pre.hits_per_filter) - hit_sum) > threshold_read:
            read_amount += 1
            for j in range(len(single_read) - BF_pre.k):
                if "N" not in single_read[j : j + BF_pre.k]:
                    read_kmers.append(single_read[j : j + BF_pre.k])
                    if ext == "fasta" or ext == "fna" or ext == "fa":
                        counter += 1
                        # extract up to 5000 k-mers per read/contig
                        if counter >= 5000:
                            break
            reads_oxa_prefilter.append(single_read)
            reads_new.append(read_kmers)
            BF_pre.hits_per_filter = hits_per_filter_copy
        else:
            # resetting hit counter
            BF_pre.hits_per_filter = hits_per_filter_copy
    reads_filtered = []
    threshold_dic = {}
    if ext == "fasta" or ext == "fna" or ext == "fa":
        cutoff = 0.7
    else:
        cutoff = 0.8
    counter = 0
    for i in range(len(reads_new)):
        threshold = 0
        for j in range(len(reads_new[i])):
            BF_pre.number_of_kmeres += 1
            hits_per_filter_copy = BF_pre.hits_per_filter[:]
            BF_pre.lookup_canonical(reads_new[i][j])
            if hits_per_filter_copy != BF_pre.hits_per_filter:
                threshold += 1
        if threshold >= cutoff * len(reads_new[i]):
            reads_filtered.append(reads_new[i])
            reads_oxa_filtered.append(reads_oxa_prefilter[i])
            counter += len(reads_new[i])
        # if ext == "fasta" or ext == "fna" or ext == "fa":
        #     if counter >= 50000:
        #         break
    return reads_filtered, reads_oxa_filtered


def read_search_spec(reads, quick, BF, ext, genus):
    "Searches sequence-data in Bloomfilter and gets kmer-hits"
    if quick < 4:
        BF.lookup_txt(reads, genus, ext, quick)
        score = BF.get_score()
        hits = BF.get_hits_per_filter()
        names_id = BF.get_names()
        # convert ids to names
        translation_dict = load_translation_dict(genus)
        names = [translation_dict[name] for name in names_id]
        return score, names, hits, None
    # Metagenome mode
    elif quick == 4:
        reads_classified, predictions = BF.lookup_txt(reads, genus, ext, quick)
        hits = None
        names = None
        return reads_classified, names, hits, predictions


def pre_processing_oxa():
    # getting filters
    oxa_families = sorted(os.listdir(r"filter/OXAs/families"))
    oxa_family_names = []
    paths = []
    for filter in oxa_families:
        oxa_family_names.append(filter[:-4])
    # get paths for oxa-family BF
    for i in range(len(oxa_families)):
        paths.append(r"filter/OXAs/families/" + oxa_families[i])

    # getting filters of individual filters of oxa-families
    oxas_ind = sorted(os.listdir(r"filter/OXAs/individual"))
    # get paths for individual BF
    paths_ind = []
    oxa_ind_names = []
    for i in range(len(oxas_ind)):
        paths_ind.append(r"filter/OXAs/individual/" + oxas_ind[i])
    # TODO: Rename variables
    paths_ind_ind = {}
    for i in range(len(paths_ind)):
        temp = sorted(os.listdir(paths_ind[i]))
        temp_list = []
        for j in range(len(temp)):
            temp_list.append(paths_ind[i] + "/" + temp[j])
        paths_ind_ind[oxas_ind[i]] = temp_list
    # list of BF-objects
    BF_dict = {}
    # initialising filter with database parameters
    BF = BF_v2.AbaumanniiBloomfilter(80000)
    BF.set_arraysize(80000)
    BF.set_clonetypes(len(paths))
    BF.set_hashes(7)
    BF.set_k(21)
    # User Options
    # reading single OXA filters
    BF.read_clonetypes(paths, oxa_family_names)
    BF_dict["OXA-families"] = BF
    # Add one BF-object for each oxa-family which contains individual oxa-BF
    for name, path_oxa_family in paths_ind_ind.items():
        names = []
        for filter in path_oxa_family:
            temp = filter.split("/")
            names.append(temp[-1][:-4])
        # initialising filter with database parameters
        BF = BF_v2.AbaumanniiBloomfilter(80000)
        BF.set_arraysize(80000)
        BF.set_clonetypes(len(path_oxa_family))
        BF.set_hashes(7)
        BF.set_k(21)
        # User Options
        # reading single OXA filters
        BF.read_clonetypes(path_oxa_family, names)
        BF_dict[name] = BF
    return BF_dict


def single_oxa(reads, ext, pipe=None):
    """Uses the Bloomfilter module to lookup the OXA-genes"""
    # getting filters
    paths = sorted(os.listdir(r"filter/OXAs/families/"))
    oxas = []
    for i in paths:
        oxas.append(i[:-4])

    for i in range(len(paths)):
        paths[i] = r"filter/OXAs/families/" + paths[i]

    # initialising filter with database parameters
    BF = BF_v2.AbaumanniiBloomfilter(80000)
    BF.set_arraysize(80000)
    BF.set_clonetypes(len(paths))
    BF.set_hashes(7)
    BF.set_k(21)
    # User Options

    # reading single OXA filters
    BF.read_clonetypes(paths, oxas)

    # starting Bloomfilter process, depends on filetype
    coordinates_forward, coordinates_reversed = BF.lookup_oxa(reads, ext)

    score = BF.get_oxa_score()
    BF.cleanup()
    del BF

    if pipe is not None:
        pipe.send([score, oxas])
        pipe.close()
    else:
        return score, oxas, coordinates_forward, coordinates_reversed


def oxa_and_IC_multiprocessing(IC_lookup, reads, ext, quick):
    """Uses Multiprocessing to lookup OXA genes and Clonetypes at the same time"""
    # Sources:
    # https://docs.python.org/3/library/multiprocessing.html#sharing-state-between-processes
    # https://stackoverflow.com/questions/7207309/python-how-can-i-run-python-functions-in-parallel
    # using pipes to transfer data between functions
    parent_ic, child_ic = Pipe()
    parent_oxa, child_oxa = Pipe()

    if ext == "fq" or ext == "fastq":
        reads_ct = reads[:2000]
    else:
        reads_ct = reads
    start = time.time()
    p1 = Process(target=read_search, args=(IC_lookup, reads_ct, quick, child_ic))
    p1.start()
    p2 = Process(target=single_oxa, args=(reads, ext, child_oxa))
    p2.start()
    p1.join()
    p2.join()
    end = time.time()
    needed = round(end - start, 2)
    print("Time needed multiprocessing: ", needed)

    # getting results back from pipes
    results_ic = parent_ic.recv()  # has scores and names
    results_oxa = parent_oxa.recv()  # has scores and names

    return results_ic[0], results_ic[1], results_ic[2], results_oxa[0], results_oxa[1]
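For orientation, here is a minimal sketch of the fan-out pattern the removed oxa_and_IC_multiprocessing relied on: each worker receives one end of a multiprocessing.Pipe, the two workers run in parallel Process objects, and the parent collects both results. The worker bodies below are placeholders rather than the XspecT Bloom-filter lookups, and receiving before join() is a deliberate change to avoid blocking on large payloads; treat it as an illustrative sketch only.

from multiprocessing import Pipe, Process


def clonetype_worker(reads, conn):
    # placeholder for the clonetype lookup; send the result through the pipe
    conn.send(["clonetype-score", len(reads)])
    conn.close()


def oxa_worker(reads, conn):
    # placeholder for the OXA-gene lookup
    conn.send(["oxa-score", len(reads)])
    conn.close()


def run_both(reads):
    parent_ct, child_ct = Pipe()
    parent_oxa, child_oxa = Pipe()
    p1 = Process(target=clonetype_worker, args=(reads, child_ct))
    p2 = Process(target=oxa_worker, args=(reads, child_oxa))
    p1.start()
    p2.start()
    # recv() blocks until each worker has sent; draining the pipes before
    # join() avoids a deadlock when a worker sends a large payload
    result_ct = parent_ct.recv()
    result_oxa = parent_oxa.recv()
    p1.join()
    p2.join()
    return result_ct, result_oxa


if __name__ == "__main__":
    print(run_both(["ACGT", "TTGA"]))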
xspect/static/How-To.png
DELETED
Binary file
xspect/static/Logo.png
DELETED
Binary file
xspect/static/Logo2.png
DELETED
Binary file
xspect/static/Workflow_AspecT.png
DELETED
Binary file
xspect/static/Workflow_ClAssT.png
DELETED
Binary file