XspecT 0.1.3__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as published.

Potentially problematic release.

This version of XspecT might be problematic.

Files changed (58)
  1. {XspecT-0.1.3.dist-info → XspecT-0.2.0.dist-info}/METADATA +23 -29
  2. XspecT-0.2.0.dist-info/RECORD +30 -0
  3. {XspecT-0.1.3.dist-info → XspecT-0.2.0.dist-info}/WHEEL +1 -1
  4. xspect/definitions.py +42 -0
  5. xspect/download_filters.py +11 -26
  6. xspect/fastapi.py +101 -0
  7. xspect/file_io.py +34 -103
  8. xspect/main.py +70 -66
  9. xspect/model_management.py +88 -0
  10. xspect/models/__init__.py +0 -0
  11. xspect/models/probabilistic_filter_model.py +277 -0
  12. xspect/models/probabilistic_filter_svm_model.py +169 -0
  13. xspect/models/probabilistic_single_filter_model.py +109 -0
  14. xspect/models/result.py +148 -0
  15. xspect/pipeline.py +201 -0
  16. xspect/run.py +38 -0
  17. xspect/train.py +304 -0
  18. xspect/train_filter/create_svm.py +6 -183
  19. xspect/train_filter/extract_and_concatenate.py +117 -121
  20. xspect/train_filter/html_scrap.py +16 -28
  21. xspect/train_filter/ncbi_api/download_assemblies.py +7 -8
  22. xspect/train_filter/ncbi_api/ncbi_assembly_metadata.py +9 -17
  23. xspect/train_filter/ncbi_api/ncbi_children_tree.py +3 -2
  24. xspect/train_filter/ncbi_api/ncbi_taxon_metadata.py +7 -5
  25. XspecT-0.1.3.dist-info/RECORD +0 -49
  26. xspect/BF_v2.py +0 -637
  27. xspect/Bootstrap.py +0 -29
  28. xspect/Classifier.py +0 -142
  29. xspect/OXA_Table.py +0 -53
  30. xspect/WebApp.py +0 -724
  31. xspect/XspecT_mini.py +0 -1363
  32. xspect/XspecT_trainer.py +0 -611
  33. xspect/map_kmers.py +0 -155
  34. xspect/search_filter.py +0 -504
  35. xspect/static/How-To.png +0 -0
  36. xspect/static/Logo.png +0 -0
  37. xspect/static/Logo2.png +0 -0
  38. xspect/static/Workflow_AspecT.png +0 -0
  39. xspect/static/Workflow_ClAssT.png +0 -0
  40. xspect/static/js.js +0 -615
  41. xspect/static/main.css +0 -280
  42. xspect/templates/400.html +0 -64
  43. xspect/templates/401.html +0 -62
  44. xspect/templates/404.html +0 -62
  45. xspect/templates/500.html +0 -62
  46. xspect/templates/about.html +0 -544
  47. xspect/templates/home.html +0 -51
  48. xspect/templates/layoutabout.html +0 -87
  49. xspect/templates/layouthome.html +0 -63
  50. xspect/templates/layoutspecies.html +0 -468
  51. xspect/templates/species.html +0 -33
  52. xspect/train_filter/README_XspecT_Erweiterung.md +0 -119
  53. xspect/train_filter/get_paths.py +0 -35
  54. xspect/train_filter/interface_XspecT.py +0 -204
  55. xspect/train_filter/k_mer_count.py +0 -162
  56. {XspecT-0.1.3.dist-info → XspecT-0.2.0.dist-info}/LICENSE +0 -0
  57. {XspecT-0.1.3.dist-info → XspecT-0.2.0.dist-info}/entry_points.txt +0 -0
  58. {XspecT-0.1.3.dist-info → XspecT-0.2.0.dist-info}/top_level.txt +0 -0
xspect/BF_v2.py DELETED
@@ -1,637 +0,0 @@
- """Bloomfilter implementation"""
-
- import os
- import csv
- from copy import deepcopy
- import pickle
- import statistics
- from pathlib import Path
-
-
- try:
-     # try with a fast c-implementation ...
-     import mmh3 as mmh3
- except ImportError:
-     # ... otherwise fallback to this module!
-     import pymmh3 as mmh3
-
- from bitarray import bitarray
- from Bio import SeqIO
- from Bio.Seq import Seq
- import h5py
- import xspect.Bootstrap as bs
- from xspect.OXA_Table import OXATable
-
-
- class AbaumanniiBloomfilter:
-     """Bloomfilter that can read FASTA and FASTQ files to assign the given file to a reference-genome"""
-
-     # Implementation of the Bloomfilter Project for Acinetobacter baumannii
-     # Used and customized also for the Bloomfilter Project for Acinetobacter Species Assignment
-     # Variables from the Strain-Typing were used if possible for the Species-Assignment to not over-complicate the Code
-     # Code partly from https://github.com/Phelimb/BIGSI
-
-     clonetypes = 1  # Number of IC's/Species
-     hits_per_filter = [0] * clonetypes  # Hit counter per IC/per Species
-     array_size = 22000000  # Standard arraysize per IC is 22mio for Core-genome
-     hashes = 7  # Number of used Hash-functions
-     k = 20  # length of the k-meres
-     names = [
-         "IC1",
-         "IC2",
-         "IC3",
-         "IC4",
-         "IC5",
-         "IC6",
-         "IC7",
-         "IC8",
-     ]  # names of the IC's
-     number_of_kmeres = 0  # counter of k-meres, will be used to calculate score
-     reads = 1000  # standard read number
-     kmer_hits_single = []  # kmer hits per filter
-
-     def __init__(self, arraysize):
-         """creates empty matrix"""
-         self.matrix = bitarray(arraysize)
-         self.matrix.setall(False)
-         self.array_size = arraysize
-         self.kmeres = []
-         self.hits_per_filter_kmere = []
-         self.kmer_hits_single = []
-         self.coverage = []
-         self.hit = False
-
-     # Setter
-
-     def set_arraysize(self, new):
-         """changes Arraysize to new input-value, does not recreate matrix"""
-         self.array_size = new
-
-     def set_clonetypes(self, new):
-         """changes number of Clonetypes"""
-         self.clonetypes = new
-         self.hits_per_filter = [0] * self.clonetypes
-
-     def set_hashes(self, new):
-         """Changes number of used hash-functions"""
-         self.hashes = new
-
-     def set_k(self, new):
-         """Changes length of k-meres"""
-         self.k = new
-
-     def set_names(self, new):
-         """Changes Names of Filters, Input must be a List of names"""
-         self.names = new
-
-     def reset_counter(self):
-         """resets counter"""
-         self.number_of_kmeres = 0
-         self.hits_per_filter = [0] * self.clonetypes
-
-     def set_reads(self, new):
-         """Changes number of reads to new value"""
-         self.reads = new
-
-     # Getter
-
-     def get_score(self):
-         """calculates score for all clonetypes
-         Score is #hits / #kmeres"""
-
-         score = []
-
-         # calculates float for each value in [hits per filter]
-         for i in range(self.clonetypes):
-             if self.hits_per_filter[i] == 0:
-                 score.append(0.0)
-             else:
-                 score.append(
-                     round(
-                         float(self.hits_per_filter[i]) / float(self.number_of_kmeres), 2
-                     )
-                 )
-
-         return score
-
-     def get_reads(self):
-         """gets number of reads"""
-         return self.reads
-
-     def get_hits_per_filter(self):
-         """gets Hits per Filter"""
-         return self.hits_per_filter
-
-     def get_kmeres_per_sequence(self):
-         """gets K-mer counter"""
-         # returns number of k-meres per file
-         return self.number_of_kmeres
-
-     def get_names(self):
-         """gets names of filters"""
-         return self.names
-
-     def get_coverage(self):
-         """gets coverage"""
-         return self.coverage
-
-     # File management
-
-     def save_clonetypes(self, path):
-         """saves matrix as a binary file to the input-path"""
-         # saving filters of clonetypes
-
-         # creating file and saving matrix with the bitarray module
-         with open(path, "wb") as fh:
-             # writing to file with bitarray command
-             self.matrix.tofile(fh)
-
-     def read_clonetypes(self, paths, names):
-         """reads slices from files and concats them to a matrix,
-         paths is a list of paths and names is a string list"""
-
-         # Updating parameters
-         self.clonetypes = len(paths)
-         self.names = names
-         self.matrix = bitarray(0)
-         self.number_of_kmeres = 0
-         self.hits_per_filter = [0] * self.clonetypes
-
-         # creating matrix from single filters
-         for path in paths:
-             temp = bitarray()
-
-             with open(path, "rb") as fh:
-                 temp.fromfile(fh)
-                 self.matrix.extend(temp)
-
-     # Bloomfilter
-
-     def hash(self, kmer):
-         """Hashes given string and returns Positions for the Array"""
-
-         # Empty list for Array positions
-         positions = []
-         # Creating hashes for needed number of hash functions
-         for i in range(self.hashes):
-             # mmh3 takes that string and a seed,
-             # each hash function takes an individual seed
-             # after that, the hash-value will be divided by the array size until
-             # a position in the array is guaranteed
-             positions.append(mmh3.hash(kmer, i) % self.array_size)
-
-         return positions
-
-     def lookup_canonical(self, kmer, limit=False):
-         """takes kmer input string and checks all clonetypes if the canonicalized kmer is inside that set of kmers"""
-
-         # canonicalize
-         complement = str(Seq(kmer).reverse_complement())
-         kmer = max(kmer, complement)
-
-         self.lookup(kmer, limit)
-
-     def lookup(self, kmer, limit=False):
-         """
-         takes kmer input string and checks all clonetypes if the k-mer is inside that set of kmers
-         """
-
-         # getting positions
-         positions = self.hash(str(kmer))
-         # control if element is in filter
-         hits = [True] * self.clonetypes
-         self.hit = False
-         # save the individual kmer-hit vector for bootstrapping
-         temp = [0] * self.clonetypes
-
-         for i in range(self.clonetypes):
-             row = i * self.array_size
-             # all 7 Positions are hardcoded, the number of hashes is always(!) 7
-             # if all positions are True, then hits[i] will also stay True
-             # (i*self.array_size) skips to the same position in the next filter
-             hits[i] = (
-                 self.matrix[positions[0] + row]
-                 and self.matrix[positions[1] + row]
-                 and self.matrix[positions[2] + row]
-                 and self.matrix[positions[3] + row]
-                 and self.matrix[positions[4] + row]
-                 and self.matrix[positions[5] + row]
-                 and self.matrix[positions[6] + row]
-             )
-
-             if hits[i]:
-                 temp[i] += 1
-                 self.hit = True
-                 if limit:
-                     # reset single kmer hit vector / memory management
-                     self.kmer_hits_single = []
-                     if self.table.lookup(self.names[i], kmer):
-                         self.hits_per_filter[i] += 1
-                 else:
-                     # Update hit counter
-                     self.hits_per_filter[i] += 1
-         self.kmer_hits_single.append(temp)
-
-     def train(self, kmer, clonetype):
-         """trains specific filter for a k-mer, input is that kmer and the desired Filter"""
-
-         # getting hash Values
-         positions = self.hash(kmer)
-         # changing 0s to 1 in filter
-         for position in positions:
-             # getting position of cell
-             self.matrix[self.array_size * clonetype + position] = True
-
-     def train_sequence(self, filepath, clonetype, quick=False):
-         """trains whole sequence into filter, takes filepath to file and the desired filter as input"""
-         # for each sequence (in multi-FASTA file)
-         if quick:
-             for sequence in SeqIO.parse(filepath, "fasta"):
-                 # for each k-mere
-                 for i in range(len(sequence.seq) - self.k):
-                     # trains k-mere into filter
-                     self.train(str(sequence.seq[i : i + self.k]), clonetype)
-         else:
-             for sequence in SeqIO.parse(filepath, "fasta"):
-                 # for each k-mere
-                 # for i in range(len(sequence.seq) - self.k + 1):
-                 for i in range(len(sequence.seq) - self.k + 1):
-                     # tests which kmer is lexicographically greater
-                     kmer = str(sequence.seq[i : i + self.k])
-                     kmer_complement = str(
-                         sequence.seq[i : i + self.k].reverse_complement()
-                     )
-                     # trains k-mere into filter
-                     if kmer > kmer_complement:
-                         self.train(kmer, clonetype)
-                     else:
-                         self.train(kmer_complement, clonetype)
-                     # trains k-mere into filter
-                     # self.train(str(sequence.seq[i: i + self.k]), clonetype)
-                     # testing
-                     # self.train(str(sequence.seq[i: i + self.k].reverse_complement()), clonetype)
-
-     def lookup_txt(self, reads, genus, ext=False, quick=False):
-         """Reading extracted fq-reads"""
-         self.number_of_kmeres = 0
-         self.hits_per_filter = [0] * self.clonetypes
-
-         if quick == 1:
-             # Quick: Non-overlapping k-mers
-             # XspecT-Quick-Mode every 500th kmer
-             for single_read in reads:
-                 # r is rest, so all kmers have size k
-                 for j in range(0, len(single_read) - self.k, 500):
-                     if "N" in single_read[j : j + self.k]:
-                         continue
-                     self.number_of_kmeres += 1
-                     kmer = str(single_read[j : j + self.k])
-                     self.lookup_canonical(kmer)
-         # XspecT Sequence-Reads every 10th kmer
-         elif quick == 2:
-             for single_read in range(0, len(reads)):
-                 hit_counter = 0
-                 for j in range(0, len(reads[single_read]) - self.k, 10):
-                     if j == 5 and hit_counter == 0:
-                         break
-                     # updating counter
-                     self.number_of_kmeres += 1
-                     # lookup for kmer
-                     temp = reads[single_read]
-                     kmer = str(temp[j : j + self.k])
-                     self.lookup_canonical(kmer)
-                     if self.hit == True:
-                         hit_counter += 1
-         elif quick == 3:
-             # ClAssT Quick-Mode every 10th kmer
-             for single_read in reads:
-                 # r is rest, so all kmers have size k
-                 for j in range(0, len(single_read) - self.k, 10):
-                     if "N" in single_read[j : j + self.k]:
-                         continue
-                     self.number_of_kmeres += 1
-                     kmer = str(single_read[j : j + self.k])
-                     self.lookup_canonical(kmer)
-         # metagenome mode
-         elif quick == 4:
-             print("Stage 1")
-             # tracker = SummaryTracker()
-             counter = 0
-             reads_classified = {}
-             names = []
-             predictions = []
-             file_name = "Filter" + genus + ".txt"
-             names_path = Path(os.getcwd()) / "filter" / "species_names" / file_name
-             with open(names_path, "rb") as fp:
-                 names = pickle.load(fp)
-             print("Stage 2")
-             for read in reads:
-                 # since we do indv. contig classifications we need to reset the BF vars
-                 self.kmer_hits_single = []
-                 self.number_of_kmeres = 0
-                 self.hits_per_filter = [0] * self.clonetypes
-                 for kmer in read:
-                     counter += 1
-                     self.number_of_kmeres += 1
-                     self.lookup_canonical(kmer)
-                 score = self.get_score()
-                 score_edit = [str(x) for x in score]
-                 score_edit = ",".join(score_edit)
-                 # making prediction
-                 index_result = max(range(len(score)), key=score.__getitem__)
-                 prediction = names[index_result]
-                 predictions.append(prediction)
-                 # skip ambiguous contigs
-                 if max(score) == sorted(score)[-2]:
-                     continue
-                 # bootstrapping
-                 bootstrap_n = 100
-                 samples = bs.bootstrap(
-                     self.kmer_hits_single, self.number_of_kmeres, bootstrap_n
-                 )
-                 sample_scores = bs.bootstrap_scores(
-                     samples, self.number_of_kmeres, self.clonetypes
-                 )
-                 bootstrap_score = 0
-                 bootstrap_predictions = []
-                 for i in range(len(sample_scores)):
-                     # skip ambiguous contigs (species with same score)
-                     if max(sample_scores[i]) != sorted(sample_scores[i])[-2]:
-                         bootstrap_predictions.append(
-                             names[
-                                 max(
-                                     range(len(sample_scores[i])),
-                                     key=sample_scores[i].__getitem__,
-                                 )
-                             ]
-                         )
-                         if (
-                             max(
-                                 range(len(sample_scores[i])),
-                                 key=sample_scores[i].__getitem__,
-                             )
-                             == index_result
-                         ):
-                             bootstrap_score += 1
-                     else:
-                         continue
-                 bootstrap_score = bootstrap_score / bootstrap_n
-                 # bootstrap_score = 1
-
-                 if prediction not in reads_classified:
-                     # Value 5 was previously = read
-                     reads_classified[prediction] = [
-                         [max(score)],
-                         1,
-                         [len(read)],
-                         sorted(score)[-2] / max(score),
-                         [bootstrap_score],
-                         None,
-                         None,
-                     ]
-                 else:
-                     reads_classified[prediction][0] += [max(score)]
-                     reads_classified[prediction][1] += 1
-                     reads_classified[prediction][2] += [len(read)]
-                     reads_classified[prediction][3] += sorted(score)[-2] / max(score)
-                     reads_classified[prediction][4] += [bootstrap_score]
-                     # reads_classified["A." + prediction][5] += None
-                 # tracker.print_diff()
-             # not ready yet
-             """for prediction in reads_classified:
-                 kmers = reads_classified[prediction][5]
-                 # Strip "A."
-                 prediction = prediction[2:]
-                 # kmer mapping to genome, start by loading the kmer_dict in
-                 path_pos = "filter\kmer_positions\Acinetobacter\\" + prediction + "_positions.txt"
-                 # delete later
-                 path_posv2 = "filter\kmer_positions\Acinetobacter\\" + prediction + "_complete_positions.txt"
-                 # cluster kmers to contigs
-                 # delete try later
-                 start_dict = time.time()
-                 try:
-                     with open(path_pos, 'rb') as fp:
-                         kmer_dict = pickle.load(fp)
-                 except:
-                     with open(path_posv2, 'rb') as fp:
-                         kmer_dict = pickle.load(fp)
-                 end_dict = time.time()
-                 needed_dict = round(end_dict - start_dict, 2)
-                 print("Time needed to load kmer_dict in: ", needed_dict)
-                 contig_amounts_distances = bs.cluster_kmers(kmers, kmer_dict)
-                 reads_classified["A." + prediction][6] = contig_amounts_distances"""
-
-             print("Stage 3")
-             # print results
-             for key, value in reads_classified.items():
-                 number_of_contigs = value[1]
-                 # save results
-                 results_clustering = [
-                     [
-                         key
-                         + ","
-                         + str(statistics.median(value[0]))
-                         + ","
-                         + str(number_of_contigs),
-                         str(statistics.median(value[2]))
-                         + ","
-                         + str(round(value[3] / number_of_contigs, 2))
-                         + ","
-                         + str(statistics.median(value[4]))
-                         + ","
-                         + str(value[6]),
-                     ]
-                 ]
-                 # with open(r'Results/XspecT_mini_csv/Results_Clustering.csv', 'a', newline='') as file:
-                 #     writer = csv.writer(file)
-                 #     writer.writerows(results_clustering)
-                 # Score Median
-                 value[0] = statistics.median(value[0])
-                 # Number of Contigs
-                 value[1] = number_of_contigs
-                 # Contig-Length Median
-                 value[2] = statistics.median(value[2])
-                 # Uniqueness
-                 value[3] = round(1 - (value[3] / number_of_contigs), 2)
-                 # Bootstrap Median
-                 value[4] = statistics.median(value[4])
-                 # value[6] = "Clusters: " + str(value[6])
-                 reads_classified[key] = value
-             print("Stage 4")
-             print("Types of return vars: ", type(reads_classified), type(predictions))
-             return reads_classified, predictions
-
-         else:
-             for single_read in reads:
-                 for j in range(len(single_read) - self.k + 1):
-                     # updating counter
-                     self.number_of_kmeres += 1
-                     # lookup for kmer
-                     kmer = str(single_read[j : j + self.k])
-                     self.lookup_canonical(kmer)
-
-     def cleanup(self):
-         """deletes matrix"""
-         self.matrix = None
-
-     def lookup_oxa(self, reads, ext):
-         """Looks for OXA Genes: Extension (ext) selects the fq-search or fasta-search mode"""
-         self.table = OXATable()
-         self.table.read_dic(r"filter/OXAs_dict/oxa_dict.txt")
-         if ext == "fq":
-             # fq mode
-             coordinates_forward = []
-             coordinates_reversed = []
-             for i in range(len(reads)):
-                 # going through all reads, discarding those that don't get any hits with 3 test k-meres
-
-                 # Building 3 test-kmeres: first, last, and middle
-                 k1 = reads[i][0 : self.k]  # first k-mer
-                 k2 = reads[i][len(reads[i]) - self.k :]  # last k-mer
-                 mid = len(reads[i]) // 2
-                 k3 = reads[i][mid : mid + self.k]  # k-mer in middle
-
-                 # Taking sum of list as reference, if sum has not increased after testing those 3 kmeres,
-                 # then the read won't be tested further
-                 hit_sum = sum(self.hits_per_filter)
-                 copy = deepcopy(self.hits_per_filter)
-                 self.lookup(k1, True)
-                 self.lookup(k2, True)
-                 self.lookup(k3, True)
-
-                 # needs at least 2 of 3 hits to continue with read
-                 if (sum(self.hits_per_filter) - hit_sum) > 1:
-                     for j in range(1, len(reads[i]) - 1 - self.k + 1):
-                         # Skipping first, last and middle k-mer
-                         if j != mid:
-                             self.lookup(reads[i][j : j + self.k], True)
-                             self.number_of_kmeres += 1
-
-                 else:
-                     # resetting hit counter
-                     self.hits_per_filter = copy
-
-                 # same, but with reverse complement
-                 reads[i] = Seq(reads[i])
-                 reads[i] = reads[i].reverse_complement()
-                 k1 = reads[i][0 : self.k]  # first k-mer
-                 k2 = reads[i][len(reads[i]) - self.k :]  # last k-mer
-                 mid = len(reads[i]) // 2
-                 k3 = reads[i][mid : mid + self.k]  # k-mer in middle
-
-                 # Taking sum of list as reference, if sum has not increased after testing those 3 kmeres,
-                 # then the read won't be tested further
-                 hit_sum = sum(self.hits_per_filter)
-                 copy = deepcopy(self.hits_per_filter)
-                 self.lookup(k1, True)
-                 self.lookup(k2, True)
-                 self.lookup(k3, True)
-
-                 # needs at least 2 of 3 hits to continue with read
-                 if (sum(self.hits_per_filter) - hit_sum) > 1:
-                     for j in range(1, len(reads[i]) - 1 - self.k + 1):
-                         # Skipping first, last and middle k-mer
-                         if j != mid:
-                             self.lookup(reads[i][j : j + self.k], True)
-                             self.number_of_kmeres += 1
-
-                 else:
-                     # resetting hit counter
-                     self.hits_per_filter = copy
-
-         else:
-             # fasta mode
-             # Old test with the genome, output hits per filter
-             # self.oxa_search_genomes(reads)
-             # self.oxa_search_genomes_v2(reads)
-             coordinates_forward = self.oxa_search_genomes_v3(reads)
-             reads_reversed = []
-             for r in range(len(reads)):
-                 # building reverse complement
-                 reads_reversed.append(Seq(reads[r]))
-                 reads_reversed[r] = reads_reversed[r].reverse_complement()
-             # lookup reverse complement
-             # self.oxa_search_genomes(reads)
-             # self.oxa_search_genomes_v2(reads)
-             coordinates_reversed = self.oxa_search_genomes_v3(reads_reversed)
-
-         # cleanup
-         reads = None
-         self.table.cleanup()
-         return coordinates_forward, coordinates_reversed
-
-     def oxa_search_genomes_v3(self, genome):
-         coordinates = []
-         for i in genome:
-             j = 0
-             success = False
-             while j < len(i):
-                 hits = sum(self.hits_per_filter)
-                 kmer = i[j : j + self.k]
-                 self.lookup(kmer, True)
-                 if success == False:
-                     if sum(self.hits_per_filter) > hits:
-                         counter = 0
-                         coordinates.append([j])
-                         # 1024 (longest oxa-gene) - 19
-                         for n in range(j - 249, j + 1005, 1):
-                             if 0 <= j < len(i):
-                                 hits_per_filter_copy = self.hits_per_filter[:]
-                                 kmer = i[n : n + self.k]
-                                 self.lookup(kmer, True)
-                                 if hits_per_filter_copy != self.hits_per_filter:
-                                     counter += 1
-                         if counter > 300:
-                             coordinates[-1].append(j + 1005)
-                         else:
-                             coordinates.pop()
-                         j += 1005
-                         success = True
-                     else:
-                         # j += 20
-                         j += 250
-                         success = False
-                 else:
-                     if sum(self.hits_per_filter) > hits:
-                         coordinates.append([j])
-                         counter = 0
-                         for n in range(j, j + 1005, 1):
-                             if 0 <= j < len(i):
-                                 kmer = i[n : n + self.k]
-                                 hits_per_filter_copy = self.hits_per_filter[:]
-                                 self.lookup(kmer, True)
-                                 if hits_per_filter_copy != self.hits_per_filter:
-                                     counter += 1
-                         if counter > 300:
-                             coordinates[-1].append(j + 1005)
-                         else:
-                             coordinates.pop()
-                         j += 1005
-                         success = True
-                     else:
-                         j += 250
-                         success = False
-         # if len(coordinates) > 0:
-         #     print("Coordinates: ", coordinates)
-         return coordinates
-
-     def get_oxa_score(self):
-         """Returning hits per OXA/kmere in OXA-filter"""
-         table = OXATable()
-         counter = table.get_counter()
-         score = []
-         # calculates float for each value in [hits per filter]
-         for i in range(self.clonetypes):
-             if self.hits_per_filter[i] == 0:
-                 score.append(0.0)
-             else:
-                 score.append(
-                     round(
-                         float(self.hits_per_filter[i]) / float(counter[self.names[i]]),
-                         2,
-                     )
-                 )
-                 # print(self.hits_per_filter[i], counter[self.names[i]])
-         # reset hits per filter
-         self.hits_per_filter = [0] * self.clonetypes
-         return score
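
Note: the deleted AbaumanniiBloomfilter packs one Bloom filter per clonetype/species into a single bit array and scores a query by the fraction of its canonical k-mers that hit each filter; it is apparently superseded in 0.2.0 by the new xspect/models/probabilistic_filter_model.py. For orientation, here is a minimal sketch of that core technique, assuming the mmh3 and bitarray packages are installed; ToyBloomFilter and its parameters are illustrative names, not part of the XspecT API.

# Minimal sketch of the deleted class's core technique: canonical k-mer
# lookup in a Bloom filter built from seeded MurmurHash3 positions.
# ToyBloomFilter is an illustrative name, not part of XspecT.
import mmh3
from bitarray import bitarray

_COMPLEMENT = str.maketrans("ACGT", "TGCA")


def canonical(kmer):
    # BF_v2.py canonicalized via max(kmer, reverse_complement)
    return max(kmer, kmer.translate(_COMPLEMENT)[::-1])


class ToyBloomFilter:
    def __init__(self, size, hashes=7):
        self.size, self.hashes = size, hashes
        self.bits = bitarray(size)
        self.bits.setall(False)

    def train(self, kmer):
        # set one bit per seeded hash function
        for seed in range(self.hashes):
            self.bits[mmh3.hash(kmer, seed) % self.size] = True

    def lookup(self, kmer):
        # a k-mer is (probably) present only if all positions are set
        return all(
            self.bits[mmh3.hash(kmer, seed) % self.size]
            for seed in range(self.hashes)
        )


bf = ToyBloomFilter(size=22_000_000)         # BF_v2.py used 22 million bits per filter
bf.train(canonical("ACGTACGTACGTACGTACGT"))  # k = 20, as in BF_v2.py
assert bf.lookup(canonical("ACGTACGTACGTACGTACGT"))

Scoring then divides the per-filter hit count by the number of k-mers looked up, exactly as get_score() above does.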
xspect/Bootstrap.py DELETED
@@ -1,29 +0,0 @@
- import random
- from numpy import array
- from numpy import sum
-
-
- def bootstrap(data, sample_amount, size):
-     samples = []
-     for i in range(size):
-         sample = []
-         for j in range(sample_amount):
-             sample.append(random.choice(data))
-         sample = array(sample)
-         temp = sum(sample, 0)
-         samples.append(list(temp))
-     return samples
-
-
- def bootstrap_scores(samples, number_of_kmeres, number_of_filters):
-     scores = []
-     # calculates float for each value in [hits per filter]
-     for i in range(len(samples)):
-         score = []
-         for j in range(number_of_filters):
-             if samples[i][j] == 0:
-                 score.append(0.0)
-             else:
-                 score.append(round(float(samples[i][j]) / float(number_of_kmeres), 2))
-         scores.append(score)
-     return scores
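
For context, a hedged usage sketch of the two helpers above: lookup() records one 0/1 hit vector per k-mer, bootstrap() resamples those vectors with replacement, and bootstrap_scores() turns each resample into per-filter scores. The hit vectors below are made-up illustrative data, and the import only works on a pre-0.2.0 install, since this module is removed here.

# Illustrative only: made-up hit vectors; module removed in 0.2.0.
from xspect.Bootstrap import bootstrap, bootstrap_scores

# three k-mers scored against two filters; each row is a per-filter 0/1 hit vector
kmer_hits = [[1, 0], [1, 1], [0, 0]]
samples = bootstrap(kmer_hits, 3, 100)   # 100 resamples of 3 vectors, summed per filter
scores = bootstrap_scores(samples, 3, 2) # hits / #k-mers per resample, e.g. [0.67, 0.33]
# the spread of the winning filter across the 100 score pairs is what
# lookup_txt() condensed into its per-contig bootstrap confidence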