XspecT 0.1.2__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (57)
  1. {XspecT-0.1.2.dist-info → XspecT-0.2.0.dist-info}/METADATA +23 -29
  2. XspecT-0.2.0.dist-info/RECORD +30 -0
  3. {XspecT-0.1.2.dist-info → XspecT-0.2.0.dist-info}/WHEEL +1 -1
  4. xspect/definitions.py +42 -0
  5. xspect/download_filters.py +11 -26
  6. xspect/fastapi.py +101 -0
  7. xspect/file_io.py +34 -103
  8. xspect/main.py +70 -66
  9. xspect/model_management.py +88 -0
  10. xspect/models/__init__.py +0 -0
  11. xspect/models/probabilistic_filter_model.py +277 -0
  12. xspect/models/probabilistic_filter_svm_model.py +169 -0
  13. xspect/models/probabilistic_single_filter_model.py +109 -0
  14. xspect/models/result.py +148 -0
  15. xspect/pipeline.py +201 -0
  16. xspect/run.py +38 -0
  17. xspect/train.py +304 -0
  18. xspect/train_filter/create_svm.py +6 -183
  19. xspect/train_filter/extract_and_concatenate.py +117 -121
  20. xspect/train_filter/html_scrap.py +16 -28
  21. xspect/train_filter/ncbi_api/download_assemblies.py +7 -8
  22. xspect/train_filter/ncbi_api/ncbi_assembly_metadata.py +9 -17
  23. xspect/train_filter/ncbi_api/ncbi_children_tree.py +3 -2
  24. xspect/train_filter/ncbi_api/ncbi_taxon_metadata.py +7 -5
  25. XspecT-0.1.2.dist-info/RECORD +0 -48
  26. xspect/BF_v2.py +0 -648
  27. xspect/Bootstrap.py +0 -29
  28. xspect/Classifier.py +0 -142
  29. xspect/OXA_Table.py +0 -53
  30. xspect/WebApp.py +0 -737
  31. xspect/XspecT_mini.py +0 -1377
  32. xspect/XspecT_trainer.py +0 -611
  33. xspect/map_kmers.py +0 -155
  34. xspect/search_filter.py +0 -504
  35. xspect/static/How-To.png +0 -0
  36. xspect/static/Logo.png +0 -0
  37. xspect/static/Logo2.png +0 -0
  38. xspect/static/Workflow_AspecT.png +0 -0
  39. xspect/static/Workflow_ClAssT.png +0 -0
  40. xspect/static/js.js +0 -615
  41. xspect/static/main.css +0 -280
  42. xspect/templates/400.html +0 -64
  43. xspect/templates/401.html +0 -62
  44. xspect/templates/404.html +0 -62
  45. xspect/templates/500.html +0 -62
  46. xspect/templates/about.html +0 -544
  47. xspect/templates/home.html +0 -51
  48. xspect/templates/layoutabout.html +0 -87
  49. xspect/templates/layouthome.html +0 -63
  50. xspect/templates/layoutspecies.html +0 -468
  51. xspect/templates/species.html +0 -33
  52. xspect/train_filter/get_paths.py +0 -35
  53. xspect/train_filter/interface_XspecT.py +0 -204
  54. xspect/train_filter/k_mer_count.py +0 -162
  55. {XspecT-0.1.2.dist-info → XspecT-0.2.0.dist-info}/LICENSE +0 -0
  56. {XspecT-0.1.2.dist-info → XspecT-0.2.0.dist-info}/entry_points.txt +0 -0
  57. {XspecT-0.1.2.dist-info → XspecT-0.2.0.dist-info}/top_level.txt +0 -0
xspect/map_kmers.py DELETED
@@ -1,155 +0,0 @@
- import statistics
- import pickle
- import os
- from sklearn.cluster import KMeans
- import numpy as np
-
-
- def identify_split_reads(score, kmer_hits_single):
-     """This method identifies if the read is split because of HGT"""
-
-     # notes
-     """
-     Identify split read regions from the kmer profiles, one side should cluster Ones and the other side should cluster Zeros
-     for the respective species.
-     1.) Collect the kmer profiles, transforms the kmer_hit vector into a species_hit vector --> shows which positions in the read are covered
-     2.) Identify part of kmer profiles with a high number of ones --> shows where the read is covered
-     3.) Divide the kmer profiles into two clusters --> One with Zeros and one with Ones
-     --> Maximize the number of ones in the first cluster and the number of zeros in the second cluster
-     """
-
-     # initialize variables
-     kmer_profiles = [] * len(score)
-     split_regions = []
-     index_result = max(range(len(score)), key=score.__getitem__)
-
-     # Collect the kmer profiles, transforms the kmer_hit vector into a species_hit vector
-     for kmer_hits in kmer_hits_single:
-         for i in range(len(kmer_hits)):
-             kmer_profiles[i].append(kmer_hits[i])
-
-     og_species = None
-     novel_species = None
-     clusters = cluster_zeros_ones(kmer_profiles[index_result])
-     split_regions.append((clusters, i))
-     # which part of the read belongs to the original species, 0 = left, 1 = right
-     if sum(clusters[0]) > sum(clusters[1]):
-         og_species = 0
-         novel_species = 1
-     else:
-         og_species = 1
-         novel_species = 0
-
-     for i, kmer_profile in enumerate(kmer_profiles):
-         if i == index_result:
-             continue
-         clusters = cluster_zeros_ones(kmer_profile)
-         # 0.3 and 0.6 are arbitrary values, they should be tested etc.
-         # Find the complementary species of the split read --> HGT Donor
-         if (
-             sum(clusters[og_species]) / len(clusters[og_species]) < 0.3
-             and sum(clusters[novel_species]) / len(clusters[novel_species]) >= 0.6
-         ):
-             split_regions.append((clusters, i))
-             break
-
-     # split_regions = [([cluster_0, cluster_1], og_index), ([cluster_0, cluster_1], novel_index)]
-     return split_regions
-
-
- def cluster_zeros_ones(input_list, threshold=None):
-     """This method divides a list of zeros and ones into two lists,
-     maximizing occurrences of zeros and ones in each list"""
-
-     # min. length of a cluster
-     if threshold == None:
-         threshold = len(input_list) * 0.1
-
-     # copy the input list
-     input_list_copy = input_list[:]
-
-     # convert zeros to -1 so that they function as penalty
-     input_list[:] = [-1 if x == 0 else 1 for x in input_list]
-     cluster_score = 0
-     # calculate the score for the cluster for each possible split
-     for i in range(threshold, len(input_list) - threshold):
-         # goal is to maximize the score --> highest score is the best split, contains the most ones and least zeros (-1)
-         score = max(sum(input_list[:i]), sum(input_list[i:]))
-         if score > cluster_score:
-             cluster_score = score
-             split_index = i
-
-     # split the input list into two clusters
-     cluster_0 = input_list_copy[:split_index]
-     cluster_1 = input_list_copy[split_index:]
-
-     return [cluster_0, cluster_1]
-
-
- # TODO:
- # rename function to map_kmers and split into two functions -> second one to cluster kmers
- def cluster_kmers(kmer_list, kmer_dict):
-     """Map kmers to their respective genome"""
-     clusters = {}
-     contig_median_list = []
-     # loop over all kmers in kmer_list
-     for i in range(len(kmer_list)):
-         kmer = kmer_list[i]
-         # check whether the kmer is present in kmer_dict
-         kmer_info = kmer_dict.get(kmer)
-         if kmer_info is None:
-             continue
-         # get the contig ID and kmer position from kmer_dict
-         kmer_id = kmer_dict[kmer][1]
-         kmer_pos = kmer_dict[kmer][0]
-         # add the kmer to the corresponding contig in the dictionary
-         # a contig is one cluster
-         if kmer_id not in clusters:
-             clusters[kmer_id] = []
-         clusters[kmer_id].append((kmer, kmer_pos))
-     # loop over all contigs in the clusters dictionary
-     for contig in clusters:
-         contig_list = clusters[contig]
-         contig_len = len(contig_list)
-         if contig_len < 2:
-             # print("Too few kmers in the contig!")
-             continue
-         # sort the kmers in the list by position
-         sorted_contig_list = sorted(contig_list, key=lambda x: x[1])
-         distances = []
-         # loop over the sorted list of kmers in the contig
-         for i in range(1, len(sorted_contig_list)):
-             kmer_pos = sorted_contig_list[i][1]
-             prev_kmer_pos = sorted_contig_list[i - 1][1]
-             # compute the distance to the previous kmer
-             distance = kmer_pos - prev_kmer_pos
-             distances.append(distance)
-         # compute the median distance for the current contig
-         # print(distances)
-         median_distance = statistics.median(distances)
-         # print(median_distance)
-         contig_median_list.append((contig, median_distance, contig_len))
-     # number of contigs in contig_median_list
-     num_contigs = len(contig_median_list)
-     # list of all contig sizes
-     contig_lengths = [x[2] for x in contig_median_list]
-     # list of all median distances
-     median_distances = [x[1] for x in contig_median_list]
-     # compute the median of the median distances
-     if len(median_distances) > 0:
-         median_of_medians = statistics.median(median_distances)
-     else:
-         median_of_medians = None
-     # build the result list
-     result = [num_contigs, median_of_medians, contig_lengths]
-     return result
-
-
- def main():
-     # directory containing the genomes
-     genome_dir = "path/to/genomes"
-     # create_genome_kmer_list(genome_dir, 21, "Acinetobacter")
-
-
- if __name__ == "__main__":
-     main()
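
For reference, the split logic removed above boils down to dividing a 0/1 k-mer profile at the index that best separates ones from zeros, with zeros counted as a penalty. Below is a minimal, self-contained sketch of that idea; the name split_zeros_ones and the min_len parameter are made up for this example and are not part of XspecT.

# Illustrative sketch only (not package code): split-maximization over a 0/1 profile.
def split_zeros_ones(bits, min_len=None):
    """Split a 0/1 list at the index that best separates ones from zeros."""
    if min_len is None:
        min_len = max(1, int(len(bits) * 0.1))  # minimum cluster length
    # zeros become -1 so they act as a penalty when summing a segment
    signed = [1 if b else -1 for b in bits]
    best_score, best_index = None, min_len
    for i in range(min_len, len(bits) - min_len):
        # the best split leaves one side with as many ones (and as few zeros) as possible
        score = max(sum(signed[:i]), sum(signed[i:]))
        if best_score is None or score > best_score:
            best_score, best_index = score, i
    return bits[:best_index], bits[best_index:]


if __name__ == "__main__":
    left, right = split_zeros_ones([1, 1, 1, 1, 0, 1, 0, 0, 0, 0])
    print(left, right)  # [1, 1, 1, 1] [0, 1, 0, 0, 0, 0]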
xspect/search_filter.py DELETED
@@ -1,504 +0,0 @@
- from multiprocessing import Process, Pipe
- import pickle
- import glob
- import os
- from collections import Counter
- import time
- from pathlib import Path
- from linecache import getline
- from xspect.train_filter.interface_XspecT import load_translation_dict
- import xspect.BF_v2 as BF_v2
- from multiprocessing import Process, Pipe
- import pickle
- import glob
- import os
- from collections import Counter
- import time
- from pathlib import Path
- from linecache import getline
- from xspect.train_filter.interface_XspecT import load_translation_dict
-
-
- def get_added_genomes():
-     """Reads in pickled list, returns none if no new genomes have been added"""
-     with open(r"filter/FilterClonetypes.txt", "rb") as fp:
-         clonetypes = pickle.load(fp)
-
-     # IC1 to IC8 are not deletable. That means if the IC-list is not longer than 8, then
-     # there are no new IC's
-     if len(clonetypes) == 8:
-         added = [None]
-     else:
-         # gives all added genomes after IC8
-         added = clonetypes[8:]
-
-     return added
-
-
- def read_search(IC_lookup, reads, quick, pipe=None):
-     with open(r"filter/FilterClonetypes.txt", "rb") as fp:
-         clonetypes = pickle.load(fp)
-     # initialising filter with database parameters
-     BF = BF_v2.AbaumanniiBloomfilter(123000000)
-     BF.set_arraysize(123000000)
-     BF.set_hashes(7)
-     BF.set_k(20)
-
-     # Array Size 22.000.000
-     paths = [
-         r"filter/IC1.txt",
-         r"filter/IC2.txt",
-         r"filter/IC3.txt",
-         r"filter/IC4.txt",
-         r"filter/IC5.txt",
-         r"filter/IC6.txt",
-         r"filter/IC7.txt",
-         r"filter/IC8.txt",
-     ]
-
-     if IC_lookup[8]:
-         # added Genomes
-         # IC1 to IC8
-         # Selecting wanted slices
-         for i in [7, 6, 5, 4, 3, 2, 1, 0]:
-             if IC_lookup[i]:
-                 pass
-             else:
-                 del clonetypes[i]
-                 del paths[i]
-
-         # getting all added files
-         temp = glob.glob("filter/added/*.txt")
-         added = []
-         if len(temp) == 0:
-             pass
-         else:
-             # these for-loops are needed for sorting the paths
-             # so they match with the pickle-list order
-             for i in range(len(clonetypes)):
-                 for j in range(len(temp)):
-                     if clonetypes[i] in temp[j]:
-                         added.append(temp[j])
-
-             paths.extend(added)
-
-         BF.read_clonetypes(paths, clonetypes)
-
-     else:
-         # Only IC1 to IC8
-         # Selecting wanted slices
-         clonetypes = clonetypes[:8]
-         for i in [7, 6, 5, 4, 3, 2, 1, 0]:
-             if IC_lookup[i]:
-                 pass
-             else:
-                 del clonetypes[i]
-                 del paths[i]
-
-         BF.read_clonetypes(paths, clonetypes)
-     BF.lookup_txt(reads, False, quick)
-     score = BF.get_score()
-     hits = BF.get_hits_per_filter()
-     names = BF.get_names()
-     BF.cleanup()
-     del BF
-
-     if pipe is not None:
-         pipe.send([score, names, hits])
-         pipe.close()
-     else:
-         return score, names, hits
-
-
- def pre_processing_ClAssT():
-     "Preprocesses the Bloomfilter-Matrix when the program is launched"
-     with open(r"filter/FilterClonetypes.txt", "rb") as fp:
-         clonetypes = pickle.load(fp)
-     # initialising filter with database parameters
-     # kmer20 = 115000000
-     # kmer31 = 122000000
-     BF = BF_v2.AbaumanniiBloomfilter(123000000)
-     BF.set_arraysize(123000000)
-     BF.set_hashes(7)
-     BF.set_k(20)
-     # paths = sorted(os.listdir(r"filter/species/"))
-     paths = [
-         r"filter/IC1.txt",
-         r"filter/IC2.txt",
-         r"filter/IC3.txt",
-         r"filter/IC4.txt",
-         r"filter/IC5.txt",
-         r"filter/IC6.txt",
-         r"filter/IC7.txt",
-         r"filter/IC8.txt",
-     ]
-     BF.read_clonetypes(paths, clonetypes)
-     return BF
-
-
- def get_genera_array_sizes():
-     """Searches for all genera that have Bloomfilters.
-
-     :return: A dictionary with the genus name as key and a list of array sizes as value.
-     """
-     array_size_path = Path(os.getcwd()) / "filter" / "array_sizes"
-
-     # Get a list of all genera with bloomfilters, exclude hidden files such as .DS_Store.
-     genera = [
-         file.name for file in array_size_path.iterdir() if not file.name.startswith(".")
-     ]
-     genera_array_sizes = dict()
-
-     # Iterate through all genera.
-     for file_name in genera:
-         genus_name = str(file_name).split(".")[0]
-         file_path = array_size_path / file_name
-         sizes = getline(str(file_path), 1).replace("\n", "")
-
-         # Array sizes are saved as a list with the size for species BF as first and meta-mode BF as second entry.
-         array_sizes = sizes.split(" ")
-
-         # Genus name is the key and array size list as value.
-         genera_array_sizes[genus_name] = array_sizes
-
-     return genera_array_sizes
-
-
- def pre_process_genus(genus, array_size, k=21, meta_mode=False):
-     """Pre processes the bloomfilter for the selected genus.
-
-     :param genus: Name of the genus.
-     :type genus: str
-     :param array_size: Size of the bloomfilter.
-     :type array_size: int
-     :param k: K-mer length.
-     :type k: int
-     :param meta_mode: Decides if metagenome mode was selected.
-     :type meta_mode: bool
-     :return: The preprocessed bloomfilter.
-     """
-     # Get the correct path to bloomfilter names.
-     if meta_mode:
-         file_name = "Filter" + genus + "Complete.txt"
-     else:
-         file_name = "Filter" + genus + ".txt"
-     names_path = Path(os.getcwd()) / "filter" / "species_names" / file_name
-     with open(names_path, "rb") as fp:
-         names = pickle.load(fp)
-     # Set bloomfilter variables.
-     BF = BF_v2.AbaumanniiBloomfilter(array_size)
-     BF.set_arraysize(array_size)
-     BF.set_hashes(7)
-     BF.set_k(k)
-
-     # Get paths to the bloomfilters.
-     if meta_mode:
-         paths = [Path(os.getcwd()) / "filter" / "Metagenomes" / (genus + ".txt")]
-     else:
-         genus_path = Path(os.getcwd()) / "filter" / genus
-         paths = sorted(os.listdir(genus_path))
-         for i in range(len(paths)):
-             paths[i] = genus_path / paths[i]
-
-     BF.read_clonetypes(paths, names)
-     return BF
-
-
- def pre_process_all(genera, k=21, meta_mode=False, genus=None):
-     """Pre process bloomfilters for all genera.
-
-     :param genera: All genera with their array sizes.
-     :type genera: dict[str, List[str]]
-     :param k: K-mer length.
-     :type k: int
-     :param meta_mode: Decides if metagenome mode was selected.
-     :type meta_mode: bool
-     :param genus: Name of genus that will be the only genus to be pre processed.
-     :type genus: list
-     :return: All genera as keys and their bloomfilters as values.
-     """
-     bloomfilters = dict()
-     # If a genus name is given, only pre process the given genus.
-     if genus:
-         for current_genus in list(genera.keys()):
-             if current_genus not in genus:
-                 del genera[current_genus]
-     for genus in genera.keys():
-         if meta_mode:
-             BF = pre_process_genus(
-                 genus, int(genera[genus][1]), k=k, meta_mode=meta_mode
-             )
-         else:
-             BF = pre_process_genus(
-                 genus, int(genera[genus][0]), k=k, meta_mode=meta_mode
-             )
-         bloomfilters[genus] = BF
-     return bloomfilters
-
-
- def pre_processing(genus):
-     "Preprocesses the Bloomfilter-Matrix when the program is launched"
-     filename = "Filter" + genus + ".txt"
-     with open(r"filter/species_names/" + filename, "rb") as fp:
-         clonetypes = pickle.load(fp)
-     # initialising filter with database parameters
-     # get BF parameters from file
-     with open(r"filter/array_sizes/" + genus + ".txt", "r") as fp:
-         array_size = fp.readline()
-         array_size = int(array_size.split(" ")[0])
-     # 115000000 old
-     BF = BF_v2.AbaumanniiBloomfilter(array_size)
-     BF.set_arraysize(array_size)
-     BF.set_hashes(7)
-     BF.set_k(21)
-     # paths = sorted(os.listdir(r"filter/species_reversed/"))
-     # change to dynamic path
-     paths = sorted(os.listdir(r"filter/" + genus + "/"))
-     for i in range(len(paths)):
-         paths[i] = r"filter/" + genus + "/" + paths[i]
-     BF.read_clonetypes(paths, clonetypes)
-     print("Preprocessing done for: ", genus)
-     return BF
-
-
- def pre_processing_prefilter2(genus):
-     "Preprocesses Acinetobacter Prefilter, collapse with other prefilter after testing"
-     filename = "Filter" + genus + "Complete.txt"
-     with open(r"filter/species_names/" + filename, "rb") as fp:
-         clonetypes = pickle.load(fp)
-     # get array size from file
-     with open(r"filter/array_sizes/" + genus + ".txt", "r") as fp:
-         array_size = fp.readline()
-         array_size = int(array_size.split(" ")[1])
-     # initialising filter with database parameters
-     BF = BF_v2.AbaumanniiBloomfilter(array_size)
-     BF.set_arraysize(array_size)
-     BF.set_hashes(7)
-     BF.set_k(21)
-     paths = ["filter/Metagenomes/" + genus + ".txt"]
-     BF.read_clonetypes(paths, clonetypes)
-     print("Preprocessing done for Metagenome Prefilter: ", genus)
-     return BF
-
-
- def read_search_pre(reads, BF_pre, ext):
-     reads_new = []
-     counter = 0
-     BF_pre.number_of_kmeres = 0
-     BF_pre.hits_per_filter = [0]
-     read_amount = 0
-     reads_oxa_prefilter = []
-     reads_oxa_filtered = []
-     for single_read in reads:
-         read_kmers = []
-         hit_sum = sum(BF_pre.hits_per_filter)
-         hits_per_filter_copy = BF_pre.hits_per_filter[:]
-         # use a scaling sample size for contigs/scaffolds
-         if ext == "fasta" or ext == "fna" or ext == "fa":
-             sample_size = int(len(single_read) ** 0.5)
-             threshold_read = sample_size * 0.7
-             for i in range(0, len(single_read) - BF_pre.k, sample_size):
-                 if "N" not in single_read[i : i + BF_pre.k]:
-                     BF_pre.lookup(single_read[i : i + BF_pre.k])
-         # for reads use a static sample of 5
-         # Taking sum of list as reference, if sum has not increased after testing those 3 kmeres,
-         # then the read won't be tested further
-         else:
-             # TO-DO implement dynamic sample size
-             k1 = single_read[0 : BF_pre.k]  # first k-mer
-             k2 = single_read[len(single_read) - BF_pre.k :]  # last k-mer
-             mid = len(single_read) // 2
-             k3 = single_read[mid : mid + BF_pre.k]  # k-mer in middle
-             k4 = single_read[BF_pre.k : BF_pre.k * 2]
-             k5 = single_read[mid + BF_pre.k : mid + BF_pre.k * 2]
-             if "N" not in single_read:
-                 BF_pre.lookup(k1)
-                 BF_pre.lookup(k2)
-                 BF_pre.lookup(k3)
-                 BF_pre.lookup(k4)
-                 BF_pre.lookup(k5)
-             threshold_read = 3
-         # needs at least 2 of 3 hits to continue with read
-         counter = 0
-         if (sum(BF_pre.hits_per_filter) - hit_sum) > threshold_read:
-             read_amount += 1
-             for j in range(len(single_read) - BF_pre.k):
-                 if "N" not in single_read[j : j + BF_pre.k]:
-                     read_kmers.append(single_read[j : j + BF_pre.k])
-                     if ext == "fasta" or ext == "fna" or ext == "fa":
-                         counter += 1
-                         # extract up to 5000 kmeres per read/contig
-                         if counter >= 5000:
-                             break
-             reads_oxa_prefilter.append(single_read)
-             reads_new.append(read_kmers)
-             BF_pre.hits_per_filter = hits_per_filter_copy
-         else:
-             # resetting hit counter
-             BF_pre.hits_per_filter = hits_per_filter_copy
-     reads_filtered = []
-     threshold_dic = {}
-     if ext == "fasta" or ext == "fna" or ext == "fa":
-         cutoff = 0.7
-     else:
-         cutoff = 0.8
-     counter = 0
-     for i in range(len(reads_new)):
-         threshold = 0
-         for j in range(len(reads_new[i])):
-             BF_pre.number_of_kmeres += 1
-             hits_per_filter_copy = BF_pre.hits_per_filter[:]
-             BF_pre.lookup(reads_new[i][j])
-             if hits_per_filter_copy != BF_pre.hits_per_filter:
-                 threshold += 1
-         if threshold >= cutoff * len(reads_new[i]):
-             reads_filtered.append(reads_new[i])
-             reads_oxa_filtered.append(reads_oxa_prefilter[i])
-             counter += len(reads_new[i])
-         # if ext == "fasta" or ext == "fna" or ext == "fa":
-         #     if counter >= 50000:
-         #         break
-     return reads_filtered, reads_oxa_filtered
-
-
- def read_search_spec(reads, quick, BF, ext, genus):
-     "Searches sequence-data in Bloomfilter and gets kmer-hits"
-     if quick < 4:
-         BF.lookup_txt(reads, genus, ext, quick)
-         score = BF.get_score()
-         hits = BF.get_hits_per_filter()
-         names_id = BF.get_names()
-         # convert ids to names
-         translation_dict = load_translation_dict(genus)
-         names = [translation_dict[name] for name in names_id]
-         return score, names, hits, None
-     # Metagenome mode
-     elif quick == 4:
-         reads_classified, predictions = BF.lookup_txt(reads, genus, ext, quick)
-         hits = None
-         names = None
-         return reads_classified, names, hits, predictions
-
-
- def pre_processing_oxa():
-     # getting filters
-     oxa_families = sorted(os.listdir(r"filter/OXAs/families"))
-     oxa_family_names = []
-     paths = []
-     for filter in oxa_families:
-         oxa_family_names.append(filter[:-4])
-     # get paths for oxa-family BF
-     for i in range(len(oxa_families)):
-         paths.append(r"filter/OXAs/families/" + oxa_families[i])
-
-     # getting filters of individual filters of oxa-families
-     oxas_ind = sorted(os.listdir(r"filter/OXAs/individual"))
-     # get paths for individual BF
-     paths_ind = []
-     oxa_ind_names = []
-     for i in range(len(oxas_ind)):
-         paths_ind.append(r"filter/OXAs/individual/" + oxas_ind[i])
-     # TODO: Rename variables
-     paths_ind_ind = {}
-     for i in range(len(paths_ind)):
-         temp = sorted(os.listdir(paths_ind[i]))
-         temp_list = []
-         for j in range(len(temp)):
-             temp_list.append(paths_ind[i] + "/" + temp[j])
-         paths_ind_ind[oxas_ind[i]] = temp_list
-     # list of BF-objects
-     BF_dict = {}
-     # initialising filter with database parameters
-     BF = BF_v2.AbaumanniiBloomfilter(80000)
-     BF.set_arraysize(80000)
-     BF.set_clonetypes(len(paths))
-     BF.set_hashes(7)
-     BF.set_k(21)
-     # User Options
-     # reading single OXA filters
-     BF.read_clonetypes(paths, oxa_family_names)
-     BF_dict["OXA-families"] = BF
-     # Add one BF-object for each oxa-family which contains individual oxa-BF
-     for name, path_oxa_family in paths_ind_ind.items():
-         names = []
-         for filter in path_oxa_family:
-             temp = filter.split("/")
-             names.append(temp[-1][:-4])
-         # initialising filter with database parameters
-         BF = BF_v2.AbaumanniiBloomfilter(80000)
-         BF.set_arraysize(80000)
-         BF.set_clonetypes(len(path_oxa_family))
-         BF.set_hashes(7)
-         BF.set_k(21)
-         # User Options
-         # reading single OXA filters
-         BF.read_clonetypes(path_oxa_family, names)
-         BF_dict[name] = BF
-     return BF_dict
-
-
- def single_oxa(reads, ext, pipe=None):
-     """Uses the Bloomfilter module to lookup the OXA-genes"""
-     # getting filters
-     paths = sorted(os.listdir(r"filter/OXAs/families/"))
-     oxas = []
-     for i in paths:
-         oxas.append(i[:-4])
-
-     for i in range(len(paths)):
-         paths[i] = r"filter/OXAs/families/" + paths[i]
-
-     # initialising filter with database parameters
-     BF = BF_v2.AbaumanniiBloomfilter(80000)
-     BF.set_arraysize(80000)
-     BF.set_clonetypes(len(paths))
-     BF.set_hashes(7)
-     BF.set_k(21)
-     # User Options
-
-     # reading single OXA filters
-     BF.read_clonetypes(paths, oxas)
-
-     # starting Bloomfilter process, depends on filetype
-     coordinates_forward, coordinates_reversed = BF.lookup_oxa(reads, ext)
-
-     score = BF.get_oxa_score()
-     BF.cleanup()
-     del BF
-
-     if pipe is not None:
-         pipe.send([score, oxas])
-         pipe.close()
-     else:
-         return score, oxas, coordinates_forward, coordinates_reversed
-
-
- def oxa_and_IC_multiprocessing(IC_lookup, reads, ext, quick):
-     """Uses Multiprocessing to lookup OXA genes and Clonetypes at the same time"""
-     # Sources:
-     # https://docs.python.org/3/library/multiprocessing.html#sharing-state-between-processes
-     # https://stackoverflow.com/questions/7207309/python-how-can-i-run-python-functions-in-parallel
-     # using pipes to transfer data between functions
-     parent_ic, child_ic = Pipe()
-     parent_oxa, child_oxa = Pipe()
-
-     if ext == "fq" or ext == "fastq":
-         reads_ct = reads[:2000]
-     else:
-         reads_ct = reads
-     start = time.time()
-     p1 = Process(target=read_search, args=(IC_lookup, reads_ct, quick, child_ic))
-     p1.start()
-     p2 = Process(target=single_oxa, args=(reads, ext, child_oxa))
-     p2.start()
-     p1.join()
-     p2.join()
-     end = time.time()
-     needed = round(end - start, 2)
-     print("Time needed multiprocessing: ", needed)
-
-     # getting results back from pipes
-     results_ic = parent_ic.recv()  # has scores and names
-     results_oxa = parent_oxa.recv()  # has scores and names
-
-     return results_ic[0], results_ic[1], results_ic[2], results_oxa[0], results_oxa[1]
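
For context, the read-prefiltering strategy in the removed read_search_pre amounts to sampling a few k-mers per read and keeping only reads whose samples hit the filter often enough, before the full k-mer lookup. Below is a minimal, self-contained sketch of that idea under simplifying assumptions: a plain Python set stands in for the Bloom filter, and prefilter_reads with its parameters are illustrative names, not XspecT API.

# Illustrative sketch only (not package code): sample-based read prefiltering.
def prefilter_reads(reads, kmer_index, k=21, min_hits=3):
    """Keep only reads whose sampled k-mers hit the k-mer index often enough."""
    kept = []
    for read in reads:
        if len(read) < k or "N" in read:
            continue
        last = len(read) - k  # last valid k-mer start position
        mid = min(len(read) // 2, last)
        # sample five k-mers spread across the read instead of testing all of them
        positions = [0, last, mid, min(k, last), min(mid + k, last)]
        hits = sum(1 for p in positions if read[p : p + k] in kmer_index)
        if hits >= min_hits:
            kept.append(read)
    return kept


if __name__ == "__main__":
    index = {"ACGTACGTACGTACGTACGTA"}  # toy stand-in for a Bloom filter of reference k-mers
    reads = ["ACGTACGTACGTACGTACGTACGTACGTACGT", "TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT"]
    print(prefilter_reads(reads, index, min_hits=1))  # keeps only the first read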
xspect/static/How-To.png DELETED
Binary file
xspect/static/Logo.png DELETED
Binary file
xspect/static/Logo2.png DELETED
Binary file
xspect/static/Workflow_AspecT.png DELETED
Binary file
xspect/static/Workflow_ClAssT.png DELETED
Binary file