XspecT 0.1.2__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff shows the content of publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.


Files changed (57)
  1. {XspecT-0.1.2.dist-info → XspecT-0.2.0.dist-info}/METADATA +23 -29
  2. XspecT-0.2.0.dist-info/RECORD +30 -0
  3. {XspecT-0.1.2.dist-info → XspecT-0.2.0.dist-info}/WHEEL +1 -1
  4. xspect/definitions.py +42 -0
  5. xspect/download_filters.py +11 -26
  6. xspect/fastapi.py +101 -0
  7. xspect/file_io.py +34 -103
  8. xspect/main.py +70 -66
  9. xspect/model_management.py +88 -0
  10. xspect/models/__init__.py +0 -0
  11. xspect/models/probabilistic_filter_model.py +277 -0
  12. xspect/models/probabilistic_filter_svm_model.py +169 -0
  13. xspect/models/probabilistic_single_filter_model.py +109 -0
  14. xspect/models/result.py +148 -0
  15. xspect/pipeline.py +201 -0
  16. xspect/run.py +38 -0
  17. xspect/train.py +304 -0
  18. xspect/train_filter/create_svm.py +6 -183
  19. xspect/train_filter/extract_and_concatenate.py +117 -121
  20. xspect/train_filter/html_scrap.py +16 -28
  21. xspect/train_filter/ncbi_api/download_assemblies.py +7 -8
  22. xspect/train_filter/ncbi_api/ncbi_assembly_metadata.py +9 -17
  23. xspect/train_filter/ncbi_api/ncbi_children_tree.py +3 -2
  24. xspect/train_filter/ncbi_api/ncbi_taxon_metadata.py +7 -5
  25. XspecT-0.1.2.dist-info/RECORD +0 -48
  26. xspect/BF_v2.py +0 -648
  27. xspect/Bootstrap.py +0 -29
  28. xspect/Classifier.py +0 -142
  29. xspect/OXA_Table.py +0 -53
  30. xspect/WebApp.py +0 -737
  31. xspect/XspecT_mini.py +0 -1377
  32. xspect/XspecT_trainer.py +0 -611
  33. xspect/map_kmers.py +0 -155
  34. xspect/search_filter.py +0 -504
  35. xspect/static/How-To.png +0 -0
  36. xspect/static/Logo.png +0 -0
  37. xspect/static/Logo2.png +0 -0
  38. xspect/static/Workflow_AspecT.png +0 -0
  39. xspect/static/Workflow_ClAssT.png +0 -0
  40. xspect/static/js.js +0 -615
  41. xspect/static/main.css +0 -280
  42. xspect/templates/400.html +0 -64
  43. xspect/templates/401.html +0 -62
  44. xspect/templates/404.html +0 -62
  45. xspect/templates/500.html +0 -62
  46. xspect/templates/about.html +0 -544
  47. xspect/templates/home.html +0 -51
  48. xspect/templates/layoutabout.html +0 -87
  49. xspect/templates/layouthome.html +0 -63
  50. xspect/templates/layoutspecies.html +0 -468
  51. xspect/templates/species.html +0 -33
  52. xspect/train_filter/get_paths.py +0 -35
  53. xspect/train_filter/interface_XspecT.py +0 -204
  54. xspect/train_filter/k_mer_count.py +0 -162
  55. {XspecT-0.1.2.dist-info → XspecT-0.2.0.dist-info}/LICENSE +0 -0
  56. {XspecT-0.1.2.dist-info → XspecT-0.2.0.dist-info}/entry_points.txt +0 -0
  57. {XspecT-0.1.2.dist-info → XspecT-0.2.0.dist-info}/top_level.txt +0 -0
xspect/BF_v2.py DELETED
@@ -1,648 +0,0 @@
- """Bloomfilter implementation"""
-
- import os
- import csv
- from copy import deepcopy
- import pickle
- import statistics
- from pathlib import Path
-
-
- try:
-     # try a fast C implementation ...
-     import mmh3 as mmh3
- except ImportError:
-     # ... otherwise fall back to this module!
-     import pymmh3 as mmh3
-
- from bitarray import bitarray
- from Bio import SeqIO
- from Bio.Seq import Seq
- import h5py
- import xspect.Bootstrap as bs
- from xspect.OXA_Table import OXATable
-
-
- class AbaumanniiBloomfilter:
-     """Bloomfilter that can read FASTA and FASTQ files to assign the given file to a reference genome"""
-
-     # Implementation of the Bloomfilter project for Acinetobacter baumannii.
-     # Also used, with customizations, for the Bloomfilter project for Acinetobacter species assignment.
-     # Variables from the strain typing were reused where possible for the species assignment to not over-complicate the code.
-     # Code partly from https://github.com/Phelimb/BIGSI
-
-     clonetypes = 1  # Number of ICs/species
-     hits_per_filter = [0] * clonetypes  # Hit counter per IC/species
-     array_size = 22000000  # Default array size per IC is 22 million for the core genome
-     hashes = 7  # Number of hash functions used
-     k = 20  # Length of the k-mers
-     names = [
-         "IC1",
-         "IC2",
-         "IC3",
-         "IC4",
-         "IC5",
-         "IC6",
-         "IC7",
-         "IC8",
-     ]  # Names of the ICs
-     number_of_kmeres = 0  # k-mer counter, used to calculate the score
-     reads = 1000  # Default read number
-
-     def __init__(self, arraysize):
-         """creates an empty matrix"""
-         self.matrix = bitarray(arraysize)
-         self.matrix.setall(False)
-         self.array_size = arraysize
-         self.kmeres = []
-         self.hits_per_filter_kmere = []
-         self.kmer_hits_single = []
-         self.coverage = []
-         self.hit = False
-
-     # Setters
-
-     def set_arraysize(self, new):
-         """changes the array size to the new input value; does not recreate the matrix"""
-         self.array_size = new
-
-     def set_clonetypes(self, new):
-         """changes the number of clonetypes"""
-         self.clonetypes = new
-         self.hits_per_filter = [0] * self.clonetypes
-
-     def set_hashes(self, new):
-         """changes the number of hash functions used"""
-         self.hashes = new
-
-     def set_k(self, new):
-         """changes the length of the k-mers"""
-         self.k = new
-
-     def set_names(self, new):
-         """changes the names of the filters; input must be a list of names"""
-         self.names = new
-
-     def reset_counter(self):
-         """resets the counters"""
-         self.number_of_kmeres = 0
-         self.hits_per_filter = [0] * self.clonetypes
-
-     def set_reads(self, new):
-         """changes the number of reads to the new value"""
-         self.reads = new
-
-     # Getters
-
-     def get_score(self):
-         """calculates the score for all clonetypes;
-         the score is #hits / #k-mers"""
-
-         score = []
-
-         # calculates a float for each value in hits_per_filter
-         for i in range(self.clonetypes):
-             if self.hits_per_filter[i] == 0:
-                 score.append(0.0)
-             else:
-                 score.append(
-                     round(
-                         float(self.hits_per_filter[i]) / float(self.number_of_kmeres), 2
-                     )
-                 )
-
-         return score
-
-     def get_reads(self):
-         """gets the number of reads"""
-         return self.reads
-
-     def get_hits_per_filter(self):
-         """gets the hits per filter"""
-         return self.hits_per_filter
-
-     def get_kmeres_per_sequence(self):
-         """gets the k-mer counter"""
-         # returns the number of k-mers per file
-         return self.number_of_kmeres
-
-     def get_names(self):
-         """gets the names of the filters"""
-         return self.names
-
-     def get_coverage(self):
-         """gets the coverage"""
-         return self.coverage
-
-     # File management
-
-     def save_clonetypes(self, path):
-         """saves the matrix as a binary file to the input path"""
-         # saving the filters of the clonetypes
-
-         # creating the file and saving the matrix with the bitarray module
-         with open(path, "wb") as fh:
-             # writing to file with the bitarray command
-             self.matrix.tofile(fh)
-
-     def read_clonetypes(self, paths, names):
-         """reads slices from files and concatenates them to a matrix;
-         paths is a list of paths and names is a list of strings"""
-
-         # Updating parameters
-         self.clonetypes = len(paths)
-         self.names = names
-         self.matrix = bitarray(0)
-         self.number_of_kmeres = 0
-         self.hits_per_filter = [0] * self.clonetypes
-
-         # creating the matrix from single filters
-         for path in paths:
-             temp = bitarray()
-
-             with open(path, "rb") as fh:
-                 temp.fromfile(fh)
-             self.matrix.extend(temp)
-
-     # Bloomfilter
-
-     def hash(self, kmer):
-         """hashes the given string and returns positions for the array"""
-
-         # Empty list for array positions
-         positions = []
-         # Creating hashes for the needed number of hash functions
-         for i in range(self.hashes):
-             # mmh3 takes the string and a seed;
-             # each hash function uses an individual seed.
-             # The hash value is then reduced modulo the array size so that
-             # a valid position in the array is guaranteed
-             positions.append(mmh3.hash(kmer, i) % self.array_size)
-
-         return positions
-
-     def lookup(self, kmer, limit=False):
-         """checks if an element is in the filters and returns a list with True/False;
-         takes a k-mer input string and checks for all clonetypes whether the k-mer is inside that set of k-mers
-         """
-
-         # getting positions
-         positions = self.hash(str(kmer))
-         # check whether the element is in the filter
-         hits = [True] * self.clonetypes
-         self.hit = False
-         # save the individual k-mer hit vector for bootstrapping
-         temp = [0] * self.clonetypes
-
-         for i in range(self.clonetypes):
-             row = i * self.array_size
-             # all 7 positions are hardcoded; the number of hashes is always(!) 7.
-             # If all positions are True, then hits[i] will also stay True;
-             # (i * self.array_size) skips to the same position in the next filter
-             hits[i] = (
-                 self.matrix[positions[0] + row]
-                 and self.matrix[positions[1] + row]
-                 and self.matrix[positions[2] + row]
-                 and self.matrix[positions[3] + row]
-                 and self.matrix[positions[4] + row]
-                 and self.matrix[positions[5] + row]
-                 and self.matrix[positions[6] + row]
-             )
-
-             if hits[i]:
-                 temp[i] += 1
-                 self.hit = True
-                 if limit:
-                     if self.table.lookup(self.names[i], kmer):
-                         self.hits_per_filter[i] += 1
-                 else:
-                     # Update hit counter
-                     self.hits_per_filter[i] += 1
-         self.kmer_hits_single.append(temp)
-
-     def train(self, kmer, clonetype):
-         """trains a k-mer into a specific filter; input is the k-mer and the desired filter"""
-
-         # getting hash values
-         positions = self.hash(kmer)
-         # changing 0s to 1s in the filter
-         for position in positions:
-             # getting the position of the cell
-             self.matrix[self.array_size * clonetype + position] = True
-
-     def train_sequence(self, filepath, clonetype, quick=False):
-         """trains a whole sequence into the filter; takes the path to the file and the desired filter as input"""
-         # for each sequence (in a multi-FASTA file)
-         if quick:
-             for sequence in SeqIO.parse(filepath, "fasta"):
-                 # for each k-mer
-                 for i in range(len(sequence.seq) - self.k):
-                     # trains the k-mer into the filter
-                     self.train(str(sequence.seq[i : i + self.k]), clonetype)
-         else:
-             for sequence in SeqIO.parse(filepath, "fasta"):
-                 # for each k-mer
-                 for i in range(len(sequence.seq) - self.k + 1):
-                     # tests which k-mer is lexicographically greater
-                     kmer = str(sequence.seq[i : i + self.k])
-                     kmer_complement = str(
-                         sequence.seq[i : i + self.k].reverse_complement()
-                     )
-                     # trains the greater k-mer into the filter
-                     if kmer > kmer_complement:
-                         self.train(kmer, clonetype)
-                     else:
-                         self.train(kmer_complement, clonetype)
-                     # trains the k-mer into the filter
-                     # self.train(str(sequence.seq[i: i + self.k]), clonetype)
-                     # testing
-                     # self.train(str(sequence.seq[i: i + self.k].reverse_complement()), clonetype)
-
-     def lookup_txt(self, reads, genus, ext=False, quick=False):
-         """reads extracted fq reads"""
-         self.number_of_kmeres = 0
-         self.hits_per_filter = [0] * self.clonetypes
-
-         if quick == 1:
-             # Quick: non-overlapping k-mers
-             # XspecT quick mode: every 500th k-mer
-             for single_read in reads:
-                 # the range stops early so all k-mers have size k
-                 for j in range(0, len(single_read) - self.k, 500):
-                     if "N" in single_read[j : j + self.k]:
-                         continue
-                     self.number_of_kmeres += 1
-                     kmer = str(single_read[j : j + self.k])
-                     kmer_reversed = str(Seq(kmer).reverse_complement())
-                     if kmer > kmer_reversed:
-                         self.lookup(kmer)
-                     else:
-                         self.lookup(kmer_reversed)
-         # XspecT sequence reads: every 10th k-mer
-         elif quick == 2:
-             for single_read in range(0, len(reads)):
-                 hit_counter = 0
-                 for j in range(0, len(reads[single_read]) - self.k, 10):
-                     if j == 5 and hit_counter == 0:
-                         break
-                     # updating counter
-                     self.number_of_kmeres += 1
-                     # lookup for the k-mer
-                     temp = reads[single_read]
-                     kmer = str(temp[j : j + self.k])
-                     kmer_reversed = str(Seq(kmer).reverse_complement())
-                     if kmer > kmer_reversed:
-                         self.lookup(kmer)
-                     else:
-                         self.lookup(kmer_reversed)
-                     if self.hit:
-                         hit_counter += 1
-         elif quick == 3:
-             # ClAssT quick mode: every 10th k-mer
-             for single_read in reads:
-                 # the range stops early so all k-mers have size k
-                 for j in range(0, len(single_read) - self.k, 10):
-                     if "N" in single_read[j : j + self.k]:
-                         continue
-                     self.number_of_kmeres += 1
-                     kmer = str(single_read[j : j + self.k])
-                     kmer_reversed = str(Seq(kmer).reverse_complement())
-                     if kmer > kmer_reversed:
-                         self.lookup(kmer)
-                     else:
-                         self.lookup(kmer_reversed)
-         # metagenome mode
-         elif quick == 4:
-             print("Stage 1")
-             # tracker = SummaryTracker()
-             counter = 0
-             reads_classified = {}
-             names = []
-             predictions = []
-             file_name = "Filter" + genus + ".txt"
-             names_path = Path(os.getcwd()) / "filter" / "species_names" / file_name
-             with open(names_path, "rb") as fp:
-                 names = pickle.load(fp)
-             print("Stage 2")
-             for read in reads:
-                 # since we do individual contig classifications we need to reset the BF vars
-                 self.kmer_hits_single = []
-                 self.number_of_kmeres = 0
-                 self.hits_per_filter = [0] * self.clonetypes
-                 for kmer in read:
-                     counter += 1
-                     # lookup for the k-mer; use the lexicographically greater k-mer
-                     self.number_of_kmeres += 1
-                     kmer_reversed = str(Seq(kmer).reverse_complement())
-                     if kmer > kmer_reversed:
-                         self.lookup(kmer)
-                     else:
-                         self.lookup(kmer_reversed)
-                 score = self.get_score()
-                 score_edit = [str(x) for x in score]
-                 score_edit = ",".join(score_edit)
-                 # making the prediction
-                 index_result = max(range(len(score)), key=score.__getitem__)
-                 prediction = names[index_result]
-                 predictions.append(prediction)
-                 # skip ambiguous contigs
-                 if max(score) == sorted(score)[-2]:
-                     continue
-                 # bootstrapping
-                 bootstrap_n = 100
-                 samples = bs.bootstrap(
-                     self.kmer_hits_single, self.number_of_kmeres, bootstrap_n
-                 )
-                 sample_scores = bs.bootstrap_scores(
-                     samples, self.number_of_kmeres, self.clonetypes
-                 )
-                 bootstrap_score = 0
-                 bootstrap_predictions = []
-                 for i in range(len(sample_scores)):
-                     # skip ambiguous contigs (species with the same score)
-                     if max(sample_scores[i]) != sorted(sample_scores[i])[-2]:
-                         bootstrap_predictions.append(
-                             names[
-                                 max(
-                                     range(len(sample_scores[i])),
-                                     key=sample_scores[i].__getitem__,
-                                 )
-                             ]
-                         )
-                         if (
-                             max(
-                                 range(len(sample_scores[i])),
-                                 key=sample_scores[i].__getitem__,
-                             )
-                             == index_result
-                         ):
-                             bootstrap_score += 1
-                     else:
-                         continue
-                 bootstrap_score = bootstrap_score / bootstrap_n
-                 # bootstrap_score = 1
-
-                 if ("A." + prediction) not in reads_classified:
-                     # index 5 previously held the read
-                     reads_classified["A." + prediction] = [
-                         [max(score)],
-                         1,
-                         [len(read)],
-                         sorted(score)[-2] / max(score),
-                         [bootstrap_score],
-                         None,
-                         None,
-                     ]
-                 else:
-                     reads_classified["A." + prediction][0] += [max(score)]
-                     reads_classified["A." + prediction][1] += 1
-                     reads_classified["A." + prediction][2] += [len(read)]
-                     reads_classified["A." + prediction][3] += sorted(score)[-2] / max(
-                         score
-                     )
-                     reads_classified["A." + prediction][4] += [bootstrap_score]
-                     # reads_classified["A." + prediction][5] += None
-             # tracker.print_diff()
-             # not ready yet
-             """for prediction in reads_classified:
-                 kmers = reads_classified[prediction][5]
-                 # Strip "A."
-                 prediction = prediction[2:]
-                 # k-mer mapping to the genome; start by loading the kmer_dict
-                 path_pos = "filter\kmer_positions\Acinetobacter\\" + prediction + "_positions.txt"
-                 # delete later
-                 path_posv2 = "filter\kmer_positions\Acinetobacter\\" + prediction + "_complete_positions.txt"
-                 # cluster k-mers to contigs
-                 # delete try later
-                 start_dict = time.time()
-                 try:
-                     with open(path_pos, 'rb') as fp:
-                         kmer_dict = pickle.load(fp)
-                 except:
-                     with open(path_posv2, 'rb') as fp:
-                         kmer_dict = pickle.load(fp)
-                 end_dict = time.time()
-                 needed_dict = round(end_dict - start_dict, 2)
-                 print("Time needed to load the kmer_dict: ", needed_dict)
-                 contig_amounts_distances = bs.cluster_kmers(kmers, kmer_dict)
-                 reads_classified["A." + prediction][6] = contig_amounts_distances"""
-
-             print("Stage 3")
-             # print results
-             for key, value in reads_classified.items():
-                 number_of_contigs = value[1]
-                 # save results
-                 results_clustering = [
-                     [
-                         key
-                         + ","
-                         + str(statistics.median(value[0]))
-                         + ","
-                         + str(number_of_contigs),
-                         str(statistics.median(value[2]))
-                         + ","
-                         + str(round(value[3] / number_of_contigs, 2))
-                         + ","
-                         + str(statistics.median(value[4]))
-                         + ","
-                         + str(value[6]),
-                     ]
-                 ]
-                 # with open(r'Results/XspecT_mini_csv/Results_Clustering.csv', 'a', newline='') as file:
-                 #     writer = csv.writer(file)
-                 #     writer.writerows(results_clustering)
-                 # Score median
-                 value[0] = statistics.median(value[0])
-                 # Number of contigs
-                 value[1] = number_of_contigs
-                 # Contig-length median
-                 value[2] = statistics.median(value[2])
-                 # Uniqueness
-                 value[3] = round(1 - (value[3] / number_of_contigs), 2)
-                 # Bootstrap median
-                 value[4] = statistics.median(value[4])
-                 # value[6] = "Clusters: " + str(value[6])
-                 reads_classified[key] = value
-             print("Stage 4")
-             print("Types of return vars: ", type(reads_classified), type(predictions))
-             return reads_classified, predictions
-
-         else:
-             for single_read in reads:
-                 for j in range(len(single_read) - self.k + 1):
-                     # updating counter
-                     self.number_of_kmeres += 1
-                     # lookup for the k-mer
-                     kmer = str(single_read[j : j + self.k])
-                     kmer_reversed = str(Seq(kmer).reverse_complement())
-                     if kmer > kmer_reversed:
-                         self.lookup(kmer)
-                     else:
-                         self.lookup(kmer_reversed)
-
-     def cleanup(self):
-         """deletes the matrix"""
-         self.matrix = None
-
-     def lookup_oxa(self, reads, ext):
-         """Looks for OXA genes; the extension (ext) selects the fq-search or fasta-search mode"""
-         self.table = OXATable()
-         self.table.read_dic(r"filter/OXAs_dict/oxa_dict.txt")
-         if ext == "fq":
-             # fq mode
-             coordinates_forward = []
-             coordinates_reversed = []
-             for i in range(len(reads)):
-                 # going through all reads, discarding those that don't get any hits with 3 test k-mers
-
-                 # Building 3 test k-mers: first, last, and middle
-                 k1 = reads[i][0 : self.k]  # first k-mer
-                 k2 = reads[i][len(reads[i]) - self.k :]  # last k-mer
-                 mid = len(reads[i]) // 2
-                 k3 = reads[i][mid : mid + self.k]  # k-mer in the middle
-
-                 # Taking the sum of the list as reference; if the sum has not increased after testing those 3 k-mers,
-                 # then the read won't be tested further
-                 hit_sum = sum(self.hits_per_filter)
-                 copy = deepcopy(self.hits_per_filter)
-                 self.lookup(k1, True)
-                 self.lookup(k2, True)
-                 self.lookup(k3, True)
-
-                 # needs at least 2 of 3 hits to continue with the read
-                 if (sum(self.hits_per_filter) - hit_sum) > 1:
-                     for j in range(1, len(reads[i]) - 1 - self.k + 1):
-                         # Skipping the first, last, and middle k-mer
-                         if j != mid:
-                             self.lookup(reads[i][j : j + self.k], True)
-                             self.number_of_kmeres += 1
-
-                 else:
-                     # resetting hit counter
-                     self.hits_per_filter = copy
-
-                 # same, but with the reverse complement
-                 reads[i] = Seq(reads[i])
-                 reads[i] = reads[i].reverse_complement()
-                 k1 = reads[i][0 : self.k]  # first k-mer
-                 k2 = reads[i][len(reads[i]) - self.k :]  # last k-mer
-                 mid = len(reads[i]) // 2
-                 k3 = reads[i][mid : mid + self.k]  # k-mer in the middle
-
-                 # Taking the sum of the list as reference; if the sum has not increased after testing those 3 k-mers,
-                 # then the read won't be tested further
-                 hit_sum = sum(self.hits_per_filter)
-                 copy = deepcopy(self.hits_per_filter)
-                 self.lookup(k1, True)
-                 self.lookup(k2, True)
-                 self.lookup(k3, True)
-
-                 # needs at least 2 of 3 hits to continue with the read
-                 if (sum(self.hits_per_filter) - hit_sum) > 1:
-                     for j in range(1, len(reads[i]) - 1 - self.k + 1):
-                         # Skipping the first, last, and middle k-mer
-                         if j != mid:
-                             self.lookup(reads[i][j : j + self.k], True)
-                             self.number_of_kmeres += 1
-
-                 else:
-                     # resetting hit counter
-                     self.hits_per_filter = copy
-
-         else:
-             # fasta mode
-             # Old approach: test with the genome and output hits per filter
-             # self.oxa_search_genomes(reads)
-             # self.oxa_search_genomes_v2(reads)
-             coordinates_forward = self.oxa_search_genomes_v3(reads)
-             reads_reversed = []
-             for r in range(len(reads)):
-                 # building the reverse complement
-                 reads_reversed.append(Seq(reads[r]))
-                 reads_reversed[r] = reads_reversed[r].reverse_complement()
-             # lookup of the reverse complement
-             # self.oxa_search_genomes(reads)
-             # self.oxa_search_genomes_v2(reads)
-             coordinates_reversed = self.oxa_search_genomes_v3(reads_reversed)
-
-         # cleanup
-         reads = None
-         self.table.cleanup()
-         return coordinates_forward, coordinates_reversed
-
-     def oxa_search_genomes_v3(self, genome):
-         """scans sequences in windows for OXA-gene k-mer hits and collects candidate coordinates"""
-         coordinates = []
-         for i in genome:
-             j = 0
-             success = False
-             while j < len(i):
-                 hits = sum(self.hits_per_filter)
-                 kmer = i[j : j + self.k]
-                 self.lookup(kmer, True)
-                 if not success:
-                     if sum(self.hits_per_filter) > hits:
-                         counter = 0
-                         coordinates.append([j])
-                         # 1024 (longest OXA gene) - 19
-                         for n in range(j - 249, j + 1005, 1):
-                             if 0 <= j < len(i):
-                                 hits_per_filter_copy = self.hits_per_filter[:]
-                                 kmer = i[n : n + self.k]
-                                 self.lookup(kmer, True)
-                                 if hits_per_filter_copy != self.hits_per_filter:
-                                     counter += 1
-                         if counter > 300:
-                             coordinates[-1].append(j + 1005)
-                         else:
-                             coordinates.pop()
-                         j += 1005
-                         success = True
-                     else:
-                         # j += 20
-                         j += 250
-                         success = False
-                 else:
-                     if sum(self.hits_per_filter) > hits:
-                         coordinates.append([j])
-                         counter = 0
-                         for n in range(j, j + 1005, 1):
-                             if 0 <= j < len(i):
-                                 kmer = i[n : n + self.k]
-                                 hits_per_filter_copy = self.hits_per_filter[:]
-                                 self.lookup(kmer, True)
-                                 if hits_per_filter_copy != self.hits_per_filter:
-                                     counter += 1
-                         if counter > 300:
-                             coordinates[-1].append(j + 1005)
-                         else:
-                             coordinates.pop()
-                         j += 1005
-                         success = True
-                     else:
-                         j += 250
-                         success = False
-         # if len(coordinates) > 0:
-         #     print("Coordinates: ", coordinates)
-         return coordinates
-
-     def get_oxa_score(self):
-         """Returns hits per OXA/k-mer in the OXA filter"""
-         table = OXATable()
-         counter = table.get_counter()
-         score = []
-         # calculates a float for each value in hits_per_filter
-         for i in range(self.clonetypes):
-             if self.hits_per_filter[i] == 0:
-                 score.append(0.0)
-             else:
-                 score.append(
-                     round(
-                         float(self.hits_per_filter[i]) / float(counter[self.names[i]]),
-                         2,
-                     )
-                 )
-             # print(self.hits_per_filter[i], counter[self.names[i]])
-         # reset hits per filter
-         self.hits_per_filter = [0] * self.clonetypes
-         return score
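For orientation, the core technique in the deleted BF_v2.py is a row of per-class Bloom filters packed back to back in a single bit array: a k-mer is hashed with seven seeded MurmurHash3 calls, each hash is reduced modulo the per-class array size, and a class reports a hit only if all seven bit positions in its slice are set. Both DNA strands map to one key by always using the lexicographically greater of a k-mer and its reverse complement. The sketch below restates that idea in isolation; it is illustrative only (BloomMatrix and canonical are not part of XspecT's API) and assumes the mmh3, bitarray, and biopython packages are installed.

import mmh3
from bitarray import bitarray
from Bio.Seq import Seq


def canonical(kmer: str) -> str:
    # both strands map to one key: keep the lexicographically greater
    # of the k-mer and its reverse complement, as BF_v2.py does
    rc = str(Seq(kmer).reverse_complement())
    return kmer if kmer > rc else rc


class BloomMatrix:
    """Illustrative restatement of the packed per-class Bloom filters."""

    def __init__(self, array_size: int, num_classes: int, num_hashes: int = 7):
        self.array_size = array_size
        self.num_classes = num_classes
        self.num_hashes = num_hashes
        # class i occupies bits [i * array_size, (i + 1) * array_size)
        self.bits = bitarray(array_size * num_classes)
        self.bits.setall(False)

    def _positions(self, kmer: str) -> list[int]:
        # one seeded MurmurHash3 call per hash function, reduced modulo the
        # per-class slice size; Python's % keeps the result non-negative
        return [mmh3.hash(kmer, seed) % self.array_size for seed in range(self.num_hashes)]

    def add(self, kmer: str, cls: int) -> None:
        offset = cls * self.array_size
        for pos in self._positions(kmer):
            self.bits[offset + pos] = True

    def lookup(self, kmer: str) -> list[bool]:
        # one True/False per class; True may be a Bloom-filter false positive
        positions = self._positions(kmer)
        return [
            all(self.bits[cls * self.array_size + p] for p in positions)
            for cls in range(self.num_classes)
        ]


bf = BloomMatrix(array_size=1_000_003, num_classes=2)
bf.add(canonical("ACGGTCAAGGCTTAACCTGA"), cls=0)
print(bf.lookup(canonical("ACGGTCAAGGCTTAACCTGA")))  # [True, False], barring false positives

Packing the filters into one array is why BF_v2's lookup computes the hash positions once per k-mer: the same positions are reused for every class by adding a per-class offset of i * array_size.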
xspect/Bootstrap.py DELETED
@@ -1,29 +0,0 @@
- import random
- from numpy import array
- from numpy import sum
-
-
- def bootstrap(data, sample_amount, size):
-     samples = []
-     for i in range(size):
-         sample = []
-         for j in range(sample_amount):
-             sample.append(random.choice(data))
-         sample = array(sample)
-         temp = sum(sample, 0)
-         samples.append(list(temp))
-     return samples
-
-
- def bootstrap_scores(samples, number_of_kmeres, number_of_filters):
-     scores = []
-     # calculates a float for each value in [hits per filter]
-     for i in range(len(samples)):
-         score = []
-         for j in range(number_of_filters):
-             if samples[i][j] == 0:
-                 score.append(0.0)
-             else:
-                 score.append(round(float(samples[i][j]) / float(number_of_kmeres), 2))
-         scores.append(score)
-     return scores
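The deleted Bootstrap.py supplied the confidence estimate used in BF_v2.py's metagenome mode (quick == 4): the per-k-mer hit vectors collected in kmer_hits_single are resampled with replacement, scores are recomputed per resample, and the support is the fraction of resamples whose top filter matches the original prediction. A minimal usage sketch under those assumptions follows; the import path is the module as shipped in 0.1.2, and the hit vectors and counts are made-up illustrative values.

from xspect.Bootstrap import bootstrap, bootstrap_scores  # as shipped in 0.1.2

# one per-filter hit vector per inspected k-mer (BF_v2's kmer_hits_single);
# three filters, four k-mers, illustrative values only
kmer_hits_single = [[1, 0, 0], [1, 1, 0], [1, 0, 0], [0, 0, 1]]
number_of_kmeres = len(kmer_hits_single)
num_filters = 3

samples = bootstrap(kmer_hits_single, number_of_kmeres, 100)  # 100 resamples
sample_scores = bootstrap_scores(samples, number_of_kmeres, num_filters)

# support = fraction of resamples whose best filter matches the original
# prediction (filter 0 wins on the raw counts above)
support = sum(
    1 for s in sample_scores if max(range(num_filters), key=s.__getitem__) == 0
) / len(sample_scores)
print(round(support, 2))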