XspecT 0.1.3-py3-none-any.whl → 0.2.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of XspecT might be problematic.
- {XspecT-0.1.3.dist-info → XspecT-0.2.0.dist-info}/METADATA +23 -29
- XspecT-0.2.0.dist-info/RECORD +30 -0
- {XspecT-0.1.3.dist-info → XspecT-0.2.0.dist-info}/WHEEL +1 -1
- xspect/definitions.py +42 -0
- xspect/download_filters.py +11 -26
- xspect/fastapi.py +101 -0
- xspect/file_io.py +34 -103
- xspect/main.py +70 -66
- xspect/model_management.py +88 -0
- xspect/models/__init__.py +0 -0
- xspect/models/probabilistic_filter_model.py +277 -0
- xspect/models/probabilistic_filter_svm_model.py +169 -0
- xspect/models/probabilistic_single_filter_model.py +109 -0
- xspect/models/result.py +148 -0
- xspect/pipeline.py +201 -0
- xspect/run.py +38 -0
- xspect/train.py +304 -0
- xspect/train_filter/create_svm.py +6 -183
- xspect/train_filter/extract_and_concatenate.py +117 -121
- xspect/train_filter/html_scrap.py +16 -28
- xspect/train_filter/ncbi_api/download_assemblies.py +7 -8
- xspect/train_filter/ncbi_api/ncbi_assembly_metadata.py +9 -17
- xspect/train_filter/ncbi_api/ncbi_children_tree.py +3 -2
- xspect/train_filter/ncbi_api/ncbi_taxon_metadata.py +7 -5
- XspecT-0.1.3.dist-info/RECORD +0 -49
- xspect/BF_v2.py +0 -637
- xspect/Bootstrap.py +0 -29
- xspect/Classifier.py +0 -142
- xspect/OXA_Table.py +0 -53
- xspect/WebApp.py +0 -724
- xspect/XspecT_mini.py +0 -1363
- xspect/XspecT_trainer.py +0 -611
- xspect/map_kmers.py +0 -155
- xspect/search_filter.py +0 -504
- xspect/static/How-To.png +0 -0
- xspect/static/Logo.png +0 -0
- xspect/static/Logo2.png +0 -0
- xspect/static/Workflow_AspecT.png +0 -0
- xspect/static/Workflow_ClAssT.png +0 -0
- xspect/static/js.js +0 -615
- xspect/static/main.css +0 -280
- xspect/templates/400.html +0 -64
- xspect/templates/401.html +0 -62
- xspect/templates/404.html +0 -62
- xspect/templates/500.html +0 -62
- xspect/templates/about.html +0 -544
- xspect/templates/home.html +0 -51
- xspect/templates/layoutabout.html +0 -87
- xspect/templates/layouthome.html +0 -63
- xspect/templates/layoutspecies.html +0 -468
- xspect/templates/species.html +0 -33
- xspect/train_filter/README_XspecT_Erweiterung.md +0 -119
- xspect/train_filter/get_paths.py +0 -35
- xspect/train_filter/interface_XspecT.py +0 -204
- xspect/train_filter/k_mer_count.py +0 -162
- {XspecT-0.1.3.dist-info → XspecT-0.2.0.dist-info}/LICENSE +0 -0
- {XspecT-0.1.3.dist-info → XspecT-0.2.0.dist-info}/entry_points.txt +0 -0
- {XspecT-0.1.3.dist-info → XspecT-0.2.0.dist-info}/top_level.txt +0 -0
xspect/BF_v2.py
DELETED
@@ -1,637 +0,0 @@
-"""Bloomfilter implementation"""
-
-import os
-import csv
-from copy import deepcopy
-import pickle
-import statistics
-from pathlib import Path
-
-
-try:
-    # try with a fast c-implementation ...
-    import mmh3 as mmh3
-except ImportError:
-    # ... otherwise fallback to this module!
-    import pymmh3 as mmh3
-
-from bitarray import bitarray
-from Bio import SeqIO
-from Bio.Seq import Seq
-import h5py
-import xspect.Bootstrap as bs
-from xspect.OXA_Table import OXATable
-
-
-class AbaumanniiBloomfilter:
-    """Bloomfilter that can read FASTA and FASTQ files to assign the given file to a reference-genome"""
-
-    # Implementation of the Bloomfilter Project for Acinetobacter baumannii
-    # Used an customized also for the Bloomfilter Project for Acinetobacter Species Assignment
-    # Variables from the Strain-Typing were used if possible for the Species-Assignment to not over-complicate the Code
-    # Code partly from https://github.com/Phelimb/BIGSI
-
-    clonetypes = 1  # Number of IC's/Species
-    hits_per_filter = [0] * clonetypes  # Hit counter per IC/per Species
-    array_size = 22000000  # Standard arraysize per IC is 22mio for Core-genome
-    hashes = 7  # Number of used Hash-functions
-    k = 20  # length of the k-meres
-    names = [
-        "IC1",
-        "IC2",
-        "IC3",
-        "IC4",
-        "IC5",
-        "IC6",
-        "IC7",
-        "IC8",
-    ]  # names of the IC's
-    number_of_kmeres = 0  # counter of k-meres, will be used to calculate score
-    reads = 1000  # standard read number
-    kmer_hits_single = []  # kmer hits per filter
-
-    def __init__(self, arraysize):
-        """creates empty matrix"""
-        self.matrix = bitarray(arraysize)
-        self.matrix.setall(False)
-        self.array_size = arraysize
-        self.kmeres = []
-        self.hits_per_filter_kmere = []
-        self.kmer_hits_single = []
-        self.coverage = []
-        self.hit = False
-
-    # Setter
-
-    def set_arraysize(self, new):
-        """changes Arraysize to new input-value, does not recreate matrix"""
-        self.array_size = new
-
-    def set_clonetypes(self, new):
-        """changes number of Clonetypes"""
-        self.clonetypes = new
-        self.hits_per_filter = [0] * self.clonetypes
-
-    def set_hashes(self, new):
-        """Changes number of used hash-functions"""
-        self.hashes = new
-
-    def set_k(self, new):
-        """Changes length of k-meres"""
-        self.k = new
-
-    def set_names(self, new):
-        """Changes Names of Filters, Input must be a List of names"""
-        self.names = new
-
-    def reset_counter(self):
-        """resets counter"""
-        self.number_of_kmeres = 0
-        self.hits_per_filter = [0] * self.clonetypes
-
-    def set_reads(self, new):
-        """Changes number of reads to new value"""
-        self.reads = new
-
-    # Getter
-
-    def get_score(self):
-        """calculates score for all clonetypes
-        Score is #hits / #kmeres"""
-
-        score = []
-
-        # calculates float for each value in [hits per filter]
-        for i in range(self.clonetypes):
-            if self.hits_per_filter[i] == 0:
-                score.append(0.0)
-            else:
-                score.append(
-                    round(
-                        float(self.hits_per_filter[i]) / float(self.number_of_kmeres), 2
-                    )
-                )
-
-        return score
-
-    def get_reads(self):
-        """gets number of reads"""
-        return self.reads
-
-    def get_hits_per_filter(self):
-        """gets Hits per Filter"""
-        return self.hits_per_filter
-
-    def get_kmeres_per_sequence(self):
-        """gets K-mer counter"""
-        # returns number of k-meres per file
-        return self.number_of_kmeres
-
-    def get_names(self):
-        """gets names of filters"""
-        return self.names
-
-    def get_coverage(self):
-        """gets coverage"""
-        return self.coverage
-
-    # File management
-
-    def save_clonetypes(self, path):
-        """saves matrix as a binary file to the input-path"""
-        # saving filters of clonetypes
-
-        # creating file and saving matrix with the bitarray modul
-        with open(path, "wb") as fh:
-            # writing to file with bitarray command
-            self.matrix.tofile(fh)
-
-    def read_clonetypes(self, paths, names):
-        """reads slices from files and concats them to a matrix,
-        paths is list of paths and names is a string list"""
-
-        # Updating parameters
-        self.clonetypes = len(paths)
-        self.names = names
-        self.matrix = bitarray(0)
-        self.number_of_kmeres = 0
-        self.hits_per_filter = [0] * self.clonetypes
-
-        # creating matrix from single filters
-        for path in paths:
-            temp = bitarray()
-
-            with open(path, "rb") as fh:
-                temp.fromfile(fh)
-                self.matrix.extend(temp)
-
-    # Bloomfilter
-
-    def hash(self, kmer):
-        """Hashes given string and returns Positions for the Array"""
-
-        # Empty list for Array positions
-        positions = []
-        # Creating hashes for needed number of hash functions
-        for i in range(self.hashes):
-            # mmh3 takes that string and a seed,
-            # each hash function takes an individual seed
-            # after that, the hash-value will me divided by the array size until
-            # a position in the array is guaranteed
-            positions.append(mmh3.hash(kmer, i) % self.array_size)
-
-        return positions
-
-    def lookup_canonical(self, kmer, limit=False):
-        """takes kmer input string and checks all clonetypes if the cononicalized kmer is inside that set of kmers"""
-
-        # canonicalize
-        complement = str(Seq(kmer).reverse_complement())
-        kmer = max(kmer, complement)
-
-        self.lookup(kmer, limit)
-
-    def lookup(self, kmer, limit=False):
-        """
-        takes kmer input string and checks all clonetypes if the k-mer is inside that set of kmers
-        """
-
-        # getting positions
-        positions = self.hash(str(kmer))
-        # control if element is in filter
-        hits = [True] * self.clonetypes
-        self.hit = False
-        # save the individual kmer-hit vector for bootstrapping
-        temp = [0] * self.clonetypes
-
-        for i in range(self.clonetypes):
-            row = i * self.array_size
-            # all 7 Positions are hardcoded, the number of hashes is always(!) 7
-            # if all positions are True, then hits[i] will also stay True
-            # (i*self.array_size) skips to the same position in the next filter
-            hits[i] = (
-                self.matrix[positions[0] + row]
-                and self.matrix[positions[1] + row]
-                and self.matrix[positions[2] + row]
-                and self.matrix[positions[3] + row]
-                and self.matrix[positions[4] + row]
-                and self.matrix[positions[5] + row]
-                and self.matrix[positions[6] + row]
-            )
-
-            if hits[i]:
-                temp[i] += 1
-                self.hit = True
-                if limit:
-                    # reset single kmer kit vector / memory management
-                    self.kmer_hits_single = []
-                    if self.table.lookup(self.names[i], kmer):
-                        self.hits_per_filter[i] += 1
-                else:
-                    # Update hit counter
-                    self.hits_per_filter[i] += 1
-        self.kmer_hits_single.append(temp)
-
-    def train(self, kmer, clonetype):
-        """trains specific filter for a k-mer, input is that kmer and the desired Filter"""
-
-        # getting hash Values
-        positions = self.hash(kmer)
-        # changing 0s to 1 in filter
-        for position in positions:
-            # getting position of cell
-            self.matrix[self.array_size * clonetype + position] = True
-
-    def train_sequence(self, filepath, clonetype, quick=False):
-        """trains whole sequence into filter, takes filepath to file and the desired filter as input"""
-        # for each sequence (in multi-FASTA file)
-        if quick:
-            for sequence in SeqIO.parse(filepath, "fasta"):
-                # for each k-mere
-                for i in range(len(sequence.seq) - self.k):
-                    # trains k-mere into filter
-                    self.train(str(sequence.seq[i : i + self.k]), clonetype)
-        else:
-            for sequence in SeqIO.parse(filepath, "fasta"):
-                # for each k-mere
-                # for i in range(len(sequence.seq) - self.k + 1):
-                for i in range(len(sequence.seq) - self.k + 1):
-                    # tests which kmer ist lexicographic greater
-                    kmer = str(sequence.seq[i : i + self.k])
-                    kmer_complement = str(
-                        sequence.seq[i : i + self.k].reverse_complement()
-                    )
-                    # trains k-mere into filter
-                    if kmer > kmer_complement:
-                        self.train(kmer, clonetype)
-                    else:
-                        self.train(kmer_complement, clonetype)
-                    # trains k-mere into filter
-                    # self.train(str(sequence.seq[i: i + self.k]), clonetype)
-                    # testing
-                    # self.train(str(sequence.seq[i: i + self.k].reverse_complement()), clonetype)
-
-    def lookup_txt(self, reads, genus, ext=False, quick=False):
-        """Reading extracted fq-reads"""
-        self.number_of_kmeres = 0
-        self.hits_per_filter = [0] * self.clonetypes
-
-        if quick == 1:
-            # Quick: Non-overlapping k-mers
-            # XspecT-Quick-Mode every 500th kmer
-            for single_read in reads:
-                # r is rest, so all kmers have size k
-                for j in range(0, len(single_read) - self.k, 500):
-                    if "N" in single_read[j : j + self.k]:
-                        continue
-                    self.number_of_kmeres += 1
-                    kmer = str(single_read[j : j + self.k])
-                    self.lookup_canonical(kmer)
-        # XspecT Sequence-Reads every 10th kmer
-        elif quick == 2:
-            for single_read in range(0, len(reads)):
-                hit_counter = 0
-                for j in range(0, len(reads[single_read]) - self.k, 10):
-                    if j == 5 and hit_counter == 0:
-                        break
-                    # updating counter
-                    self.number_of_kmeres += 1
-                    # lookup for kmer
-                    temp = reads[single_read]
-                    kmer = str(temp[j : j + self.k])
-                    self.lookup_canonical(kmer)
-                    if self.hit == True:
-                        hit_counter += 1
-        elif quick == 3:
-            # ClAssT Quick-Mode every 10th kmer
-            for single_read in reads:
-                # r is rest, so all kmers have size k
-                for j in range(0, len(single_read) - self.k, 10):
-                    if "N" in single_read[j : j + self.k]:
-                        continue
-                    self.number_of_kmeres += 1
-                    kmer = str(single_read[j : j + self.k])
-                    self.lookup_canonical(kmer)
-        # metagenome mode
-        elif quick == 4:
-            print("Stage 1")
-            # tracker = SummaryTracker()
-            counter = 0
-            reads_classified = {}
-            names = []
-            predictions = []
-            file_name = "Filter" + genus + ".txt"
-            names_path = Path(os.getcwd()) / "filter" / "species_names" / file_name
-            with open(names_path, "rb") as fp:
-                names = pickle.load(fp)
-            print("Stage 2")
-            for read in reads:
-                # since we do indv. contig classifications we need to reset the BF vars
-                self.kmer_hits_single = []
-                self.number_of_kmeres = 0
-                self.hits_per_filter = [0] * self.clonetypes
-                for kmer in read:
-                    counter += 1
-                    self.number_of_kmeres += 1
-                    self.lookup_canonical(kmer)
-                score = self.get_score()
-                score_edit = [str(x) for x in score]
-                score_edit = ",".join(score_edit)
-                # making prediction
-                index_result = max(range(len(score)), key=score.__getitem__)
-                prediction = names[index_result]
-                predictions.append(prediction)
-                # skip ambiguous contigs
-                if max(score) == sorted(score)[-2]:
-                    continue
-                # bootstrapping
-                bootstrap_n = 100
-                samples = bs.bootstrap(
-                    self.kmer_hits_single, self.number_of_kmeres, bootstrap_n
-                )
-                sample_scores = bs.bootstrap_scores(
-                    samples, self.number_of_kmeres, self.clonetypes
-                )
-                bootstrap_score = 0
-                bootstrap_predictions = []
-                for i in range(len(sample_scores)):
-                    # skip ambiguous contigs (species with same score)
-                    if max(sample_scores[i]) != sorted(sample_scores[i])[-2]:
-                        bootstrap_predictions.append(
-                            names[
-                                max(
-                                    range(len(sample_scores[i])),
-                                    key=sample_scores[i].__getitem__,
-                                )
-                            ]
-                        )
-                        if (
-                            max(
-                                range(len(sample_scores[i])),
-                                key=sample_scores[i].__getitem__,
-                            )
-                            == index_result
-                        ):
-                            bootstrap_score += 1
-                    else:
-                        continue
-                bootstrap_score = bootstrap_score / bootstrap_n
-                # bootstrap_score = 1
-
-                if prediction not in reads_classified:
-                    # Value 5 war vohrer = read
-                    reads_classified[prediction] = [
-                        [max(score)],
-                        1,
-                        [len(read)],
-                        sorted(score)[-2] / max(score),
-                        [bootstrap_score],
-                        None,
-                        None,
-                    ]
-                else:
-                    reads_classified[prediction][0] += [max(score)]
-                    reads_classified[prediction][1] += 1
-                    reads_classified[prediction][2] += [len(read)]
-                    reads_classified[prediction][3] += sorted(score)[-2] / max(score)
-                    reads_classified[prediction][4] += [bootstrap_score]
-                    # reads_classified["A." + prediction][5] += None
-            # tracker.print_diff()
-            # not ready yet
-            """for prediction in reads_classified:
-                kmers = reads_classified[prediction][5]
-                # Strip "A."
-                prediction = prediction[2:]
-                # kmer mapping to genome, start by loading the kmer_dict in
-                path_pos = "filter\kmer_positions\Acinetobacter\\" + prediction + "_positions.txt"
-                # delete later
-                path_posv2 = "filter\kmer_positions\Acinetobacter\\" + prediction + "_complete_positions.txt"
-                # cluster kmers to contigs
-                # delete try later
-                start_dict = time.time()
-                try:
-                    with open(path_pos, 'rb') as fp:
-                        kmer_dict = pickle.load(fp)
-                except:
-                    with open(path_posv2, 'rb') as fp:
-                        kmer_dict = pickle.load(fp)
-                end_dict = time.time()
-                needed_dict = round(end_dict - start_dict, 2)
-                print("Time needed to load kmer_dict in: ", needed_dict)
-                contig_amounts_distances = bs.cluster_kmers(kmers, kmer_dict)
-                reads_classified["A." + prediction][6] = contig_amounts_distances"""
-
-            print("Stage 3")
-            # print results
-            for key, value in reads_classified.items():
-                number_of_contigs = value[1]
-                # save results
-                results_clustering = [
-                    [
-                        key
-                        + ","
-                        + str(statistics.median(value[0]))
-                        + ","
-                        + str(number_of_contigs),
-                        str(statistics.median(value[2]))
-                        + ","
-                        + str(round(value[3] / number_of_contigs, 2))
-                        + ","
-                        + str(statistics.median(value[4]))
-                        + ","
-                        + str(value[6]),
-                    ]
-                ]
-                # with open(r'Results/XspecT_mini_csv/Results_Clustering.csv', 'a', newline='') as file:
-                # writer = csv.writer(file)
-                # writer.writerows(results_clustering)
-                # Score Median
-                value[0] = statistics.median(value[0])
-                # Number of Contigs
-                value[1] = number_of_contigs
-                # Contig-Length Median
-                value[2] = statistics.median(value[2])
-                # Uniqueness
-                value[3] = round(1 - (value[3] / number_of_contigs), 2)
-                # Bootstrap Median
-                value[4] = statistics.median(value[4])
-                # value[6] = "Clusters: " + str(value[6])
-                reads_classified[key] = value
-            print("Stage 4")
-            print("Types of return vars: ", type(reads_classified), type(predictions))
-            return reads_classified, predictions
-
-        else:
-            for single_read in reads:
-                for j in range(len(single_read) - self.k + 1):
-                    # updating counter
-                    self.number_of_kmeres += 1
-                    # lookup for kmer
-                    kmer = str(single_read[j : j + self.k])
-                    self.lookup_canonical(kmer)
-
-    def cleanup(self):
-        """deletes matrix"""
-        self.matrix = None
-
-    def lookup_oxa(self, reads, ext):
-        """Looks for OXA Genes: Extension (ext) selects the fq-seach or fasta-search mode"""
-        self.table = OXATable()
-        self.table.read_dic(r"filter/OXAs_dict/oxa_dict.txt")
-        if ext == "fq":
-            # fq mode
-            coordinates_forward = []
-            coordinates_reversed = []
-            for i in range(len(reads)):
-                # going through all reads, discarding those who don't get any hits with 3 test k-meres
-
-                # Building 3 test-kmeres: first, last, and middle
-                k1 = reads[i][0 : self.k]  # first k-mer
-                k2 = reads[i][len(reads[i]) - self.k :]  # last k-mer
-                mid = len(reads[i]) // 2
-                k3 = reads[i][mid : mid + self.k]  # k-mer in middle
-
-                # Taking sum of list as reference, if sum has not increased after testing those 3 kmeres,
-                # then the read won't be tested further
-                hit_sum = sum(self.hits_per_filter)
-                copy = deepcopy(self.hits_per_filter)
-                self.lookup(k1, True)
-                self.lookup(k2, True)
-                self.lookup(k3, True)
-
-                # needs at least 2 of 3 hits to continue with read
-                if (sum(self.hits_per_filter) - hit_sum) > 1:
-                    for j in range(1, len(reads[i]) - 1 - self.k + 1):
-                        # Skipping first, last and middle k-mer
-                        if j != mid:
-                            self.lookup(reads[i][j : j + self.k], True)
-                            self.number_of_kmeres += 1
-
-                else:
-                    # resetting hit counter
-                    self.hits_per_filter = copy
-
-                # same, but with reverse complement
-                reads[i] = Seq(reads[i])
-                reads[i] = reads[i].reverse_complement()
-                k1 = reads[i][0 : self.k]  # first k-mer
-                k2 = reads[i][len(reads[i]) - self.k :]  # last k-mer
-                mid = len(reads[i]) // 2
-                k3 = reads[i][mid : mid + self.k]  # k-mer in middle
-
-                # Taking sum of list as reference, if sum has not increased after testing those 3 kmeres,
-                # then the read won't be tested further
-                hit_sum = sum(self.hits_per_filter)
-                copy = deepcopy(self.hits_per_filter)
-                self.lookup(k1, True)
-                self.lookup(k2, True)
-                self.lookup(k3, True)
-
-                # needs at least 2 of 3 hits to continue with read
-                if (sum(self.hits_per_filter) - hit_sum) > 1:
-                    for j in range(1, len(reads[i]) - 1 - self.k + 1):
-                        # Skipping first, last and middle k-mer
-                        if j != mid:
-                            self.lookup(reads[i][j : j + self.k], True)
-                            self.number_of_kmeres += 1
-
-                else:
-                    # resetting hit counter
-                    self.hits_per_filter = copy
-
-        else:
-            # fasta mode
-            # Altes testen mit Genom, hits per filter ausgeben lassen
-            # self.oxa_search_genomes(reads)
-            # self.oxa_search_genomes_v2(reads)
-            coordinates_forward = self.oxa_search_genomes_v3(reads)
-            reads_reversed = []
-            for r in range(len(reads)):
-                # building reverse complement
-                reads_reversed.append(Seq(reads[r]))
-                reads_reversed[r] = reads_reversed[r].reverse_complement()
-                # lookup reverse complement
-            # self.oxa_search_genomes(reads)
-            # self.oxa_search_genomes_v2(reads)
-            coordinates_reversed = self.oxa_search_genomes_v3(reads_reversed)
-
-        # cleanup
-        reads = None
-        self.table.cleanup()
-        return coordinates_forward, coordinates_reversed
-
-    def oxa_search_genomes_v3(self, genome):
-        coordinates = []
-        for i in genome:
-            j = 0
-            success = False
-            while j < len(i):
-                hits = sum(self.hits_per_filter)
-                kmer = i[j : j + self.k]
-                self.lookup(kmer, True)
-                if success == False:
-                    if sum(self.hits_per_filter) > hits:
-                        counter = 0
-                        coordinates.append([j])
-                        # 1024 (longest oxa-gene) - 19
-                        for n in range(j - 249, j + 1005, 1):
-                            if 0 <= j < len(i):
-                                hits_per_filter_copy = self.hits_per_filter[:]
-                                kmer = i[n : n + self.k]
-                                self.lookup(kmer, True)
-                                if hits_per_filter_copy != self.hits_per_filter:
-                                    counter += 1
-                        if counter > 300:
-                            coordinates[-1].append(j + 1005)
-                        else:
-                            coordinates.pop()
-                        j += 1005
-                        success = True
-                    else:
-                        # j += 20
-                        j += 250
-                        success = False
-                else:
-                    if sum(self.hits_per_filter) > hits:
-                        coordinates.append([j])
-                        counter = 0
-                        for n in range(j, j + 1005, 1):
-                            if 0 <= j < len(i):
-                                kmer = i[n : n + self.k]
-                                hits_per_filter_copy = self.hits_per_filter[:]
-                                self.lookup(kmer, True)
-                                if hits_per_filter_copy != self.hits_per_filter:
-                                    counter += 1
-                        if counter > 300:
-                            coordinates[-1].append(j + 1005)
-                        else:
-                            coordinates.pop()
-                        j += 1005
-                        success = True
-                    else:
-                        j += 250
-                        success = False
-        # if len(coordinates) > 0:
-        # print("Coordinates: ", coordinates)
-        return coordinates
-
-    def get_oxa_score(self):
-        """Returning hits per OXA/kmere in OXA-filter"""
-        table = OXATable()
-        counter = table.get_counter()
-        score = []
-        # calculates float for each value in [hits per filter]
-        for i in range(self.clonetypes):
-            if self.hits_per_filter[i] == 0:
-                score.append(0.0)
-            else:
-                score.append(
-                    round(
-                        float(self.hits_per_filter[i]) / float(counter[self.names[i]]),
-                        2,
-                    )
-                )
-        # print(self.hits_per_filter[i], counter[self.names[i]])
-        # reset hits per filter
-        self.hits_per_filter = [0] * self.clonetypes
-        return score
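For context, a minimal sketch (not shipped with the package) of how this now-removed class was typically driven in 0.1.3, inferred only from the methods shown above; the FASTA paths, filter paths, and read string are placeholders. In 0.2.0 this role appears to be taken over by the new modules under xspect/models/.

```python
# Hypothetical driver for the removed AbaumanniiBloomfilter (XspecT 0.1.3);
# paths and reads below are placeholders, not real package data.
from xspect.BF_v2 import AbaumanniiBloomfilter

SLICE = 22_000_000                                 # bits per reference filter slice

# 1) train one single-reference filter and save its bit slice
bf = AbaumanniiBloomfilter(SLICE)
bf.set_k(20)
bf.train_sequence("reference_IC1.fasta", 0)        # placeholder FASTA path
bf.save_clonetypes("IC1.filter")                   # placeholder output path

# 2) later, concatenate several saved slices and score a set of reads
clf = AbaumanniiBloomfilter(SLICE)
clf.read_clonetypes(["IC1.filter", "IC2.filter"], ["IC1", "IC2"])
clf.lookup_txt(["ACGTACGTACGTACGTACGTACGT"], genus="Acinetobacter")  # quick=False: every k-mer
print(dict(zip(clf.get_names(), clf.get_score())))
```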
xspect/Bootstrap.py
DELETED
@@ -1,29 +0,0 @@
-import random
-from numpy import array
-from numpy import sum
-
-
-def bootstrap(data, sample_amount, size):
-    samples = []
-    for i in range(size):
-        sample = []
-        for j in range(sample_amount):
-            sample.append(random.choice(data))
-        sample = array(sample)
-        temp = sum(sample, 0)
-        samples.append(list(temp))
-    return samples
-
-
-def bootstrap_scores(samples, number_of_kmeres, number_of_filters):
-    scores = []
-    # calculates float for each value in [hits per filter]
-    for i in range(len(samples)):
-        score = []
-        for j in range(number_of_filters):
-            if samples[i][j] == 0:
-                score.append(0.0)
-            else:
-                score.append(round(float(samples[i][j]) / float(number_of_kmeres), 2))
-        scores.append(score)
-    return scores
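A small, self-contained illustration (toy data, not from the package) of how these removed helpers were combined with the per-k-mer hit vectors that AbaumanniiBloomfilter.lookup accumulates in kmer_hits_single:

```python
# Toy sketch of the removed xspect.Bootstrap helpers (XspecT 0.1.3); the hit
# vectors below are made up and stand in for AbaumanniiBloomfilter.kmer_hits_single.
from xspect.Bootstrap import bootstrap, bootstrap_scores

# five k-mers scored against two filters: 1 = k-mer found in that filter
kmer_hits = [[1, 0], [1, 1], [0, 1], [1, 0], [1, 0]]

samples = bootstrap(kmer_hits, sample_amount=5, size=100)   # 100 resampled hit sums
scores = bootstrap_scores(samples, number_of_kmeres=5, number_of_filters=2)

# bootstrap support for filter 0 = fraction of resamples in which it scores highest
support = sum(1 for s in scores if s[0] > s[1]) / len(scores)
print(round(support, 2))
```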