NGSpeciesID 0.3.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
modules/__init__.py ADDED
File without changes
@@ -0,0 +1,104 @@
+
+ import edlib
+ import logging
+
+ from modules import help_functions
+
+ def reverse_complement(string):
+     #rev_nuc = {'A':'T', 'C':'G', 'G':'C', 'T':'A', 'N':'N', 'X':'X'}
+     # Modified for Abyss output
+     rev_nuc = {'A':'T', 'C':'G', 'G':'C', 'T':'A', 'a':'t', 'c':'g', 'g':'c', 't':'a', 'N':'N', 'X':'X', 'n':'n', 'Y':'R', 'R':'Y', 'K':'M', 'M':'K', 'S':'S', 'W':'W', 'B':'V', 'V':'B', 'H':'D', 'D':'H', 'y':'r', 'r':'y', 'k':'m', 'm':'k', 's':'s', 'w':'w', 'b':'v', 'v':'b', 'h':'d', 'd':'h'}
+
+     rev_comp = ''.join([rev_nuc[nucl] for nucl in reversed(string)])
+     return rev_comp
+
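For reference, a quick sanity check of the IUPAC-aware reverse complement (toy inputs, not part of the package):

reverse_complement("ACGTN")   # -> 'NACGT'
reverse_complement("aygt")    # -> 'acrt'  (case and ambiguity codes are preserved)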
+ def read_barcodes(primer_file):
+     barcodes = { acc + '_fw' : seq.strip() for acc, (seq, _) in help_functions.readfq(open(primer_file, 'r'))}
+
+     for acc, seq in list(barcodes.items()):
+         logging.debug(f"{acc} {seq} {acc[:-3]}")
+         barcodes[acc[:-3] + '_rc'] = reverse_complement(seq.upper())
+
+     logging.debug(f"{barcodes}")
+     return barcodes
+
+ def get_universal_tails():
+     barcodes = {'1_F_fw' : 'TTTCTGTTGGTGCTGATATTGC',
+                 '2_R_rc' : 'ACTTGCCTGTCGCTCTATCTTC'}
+     barcodes['1_F_rc'] = reverse_complement(barcodes['1_F_fw'])
+     barcodes['2_R_fw'] = reverse_complement(barcodes['2_R_rc'])
+     logging.debug(f"{barcodes}")
+     return barcodes
+
+
+ def find_barcode_locations(center, barcodes, primer_max_ed):
+     """Find barcode locations in a center sequence using edlib."""
+
+     # Create an IUPAC equivalence map so that edlib can match IUPAC codes in primers.
+     # The IUPAC map was generated with:
+     # from Bio.Data import IUPACData
+     # IUPAC_map = [(i, k) for i, j in IUPACData.ambiguous_dna_values.items() for k in j]
+     IUPAC_map = [('A', 'A'), ('C', 'C'), ('G', 'G'), ('T', 'T'), ('M', 'A'), ('M', 'C'),
+                  ('R', 'A'), ('R', 'G'), ('W', 'A'), ('W', 'T'), ('S', 'C'), ('S', 'G'),
+                  ('Y', 'C'), ('Y', 'T'), ('K', 'G'), ('K', 'T'), ('V', 'A'), ('V', 'C'),
+                  ('V', 'G'), ('H', 'A'), ('H', 'C'), ('H', 'T'), ('D', 'A'), ('D', 'G'),
+                  ('D', 'T'), ('B', 'C'), ('B', 'G'), ('B', 'T'), ('X', 'G'), ('X', 'A'),
+                  ('X', 'T'), ('X', 'C'), ('N', 'G'), ('N', 'A'), ('N', 'T'), ('N', 'C')]
+     all_locations = []
+     for primer_acc, primer_seq in barcodes.items():
+         # additionalEqualities=IUPAC_map lets edlib treat IUPAC codes as matches
+         result = edlib.align(primer_seq, center,
+                              mode="HW", task="locations", k=primer_max_ed,
+                              additionalEqualities=IUPAC_map)
+         ed = result["editDistance"]
+         locations = result["locations"]
+         logging.debug(f"{locations} {ed}")
+         if locations:
+             all_locations.append((primer_acc, locations[0][0], locations[0][1], ed))
+     return all_locations
+
+
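As a quick illustration of the IUPAC-aware matching above (toy sequences, not from the package): edlib's additionalEqualities lets an ambiguity code such as Y match either C or T, so a degenerate primer can still hit at zero edit distance.

import edlib

# 'Y' (C or T) in the primer matches the 'C' in the target
result = edlib.align("ACYT", "GGGACCTGGG", mode="HW", task="locations", k=1,
                     additionalEqualities=[('Y', 'C'), ('Y', 'T')])
result["editDistance"]   # -> 0
result["locations"]      # -> [(3, 6)]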
+ def remove_barcodes(centers, barcodes, args):
+     """
+     Modifies consensus sequences by cropping them at detected barcode sites.
+     This mutates the data structure centers in place with the trimmed consensus sequences.
+     """
+
+     centers_updated = False
+     for i, (nr_reads_in_cluster, c_id, center, reads_path_name) in enumerate(centers):
+
+         # if the consensus is shorter than 2*trim_window, set the trim window to half the sequence
+         if 2*args.trim_window > len(center):
+             trim_window = len(center)//2
+         else:
+             trim_window = args.trim_window
+
+         barcode_locations_beginning = find_barcode_locations(center[:trim_window], barcodes, args.primer_max_ed)
+         barcode_locations_end = find_barcode_locations(center[-trim_window:], barcodes, args.primer_max_ed)
+         logging.debug(f"{center}")
+
+         cut_start = 0
+         if barcode_locations_beginning:
+             logging.debug(f"FOUND BARCODE BEGINNING {barcode_locations_beginning}")
+             for bc, start, stop, ed in barcode_locations_beginning:
+                 if stop > cut_start:
+                     cut_start = stop
+
+         cut_end = len(center)
+         if barcode_locations_end:
+             logging.debug(f"FOUND BARCODE END {barcode_locations_end}")
+             earliest_hit = len(center)
+             for bc, start, stop, ed in barcode_locations_end:
+                 if start < earliest_hit:
+                     earliest_hit = start
+             # 'start' is relative to the trailing window; map it back to full-sequence coordinates
+             cut_end = len(center) - (trim_window - earliest_hit)
+
+         if cut_start > 0 or cut_end < len(center):
+             center = center[cut_start: cut_end]
+
+             logging.debug(f"{center} NEW")
+             logging.debug(f"cut start {cut_start} cut end {cut_end}")
+             centers[i][2] = center
+             centers_updated = True
+
+     return centers_updated
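A small worked example of the coordinate arithmetic above (numbers invented for illustration): for a 1000 bp consensus with trim_window = 150, a barcode hit ending at position 25 of the leading window gives cut_start = 25, and a hit starting at position 120 of the trailing window gives cut_end = 1000 - (150 - 120) = 970, so the consensus is trimmed to center[25:970].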
modules/cluster.py ADDED
@@ -0,0 +1,373 @@
+ from __future__ import print_function
+ from functools import reduce
+ import os, sys
+ from collections import defaultdict
+ import math
+ from collections import deque
+ import itertools
+ from operator import mul
+ import logging
+
+ import parasail
+
+ from modules import help_functions
+
+
+ def get_kmer_minimizers(seq, k_size, w_size):
+     # kmers = [seq[i:i+k_size] for i in range(len(seq)-k_size) ]
+     w = w_size - k_size
+     window_kmers = deque([seq[i:i+k_size] for i in range(w + 1)])
+     curr_min = min(window_kmers)
+     minimizers = [ (curr_min, list(window_kmers).index(curr_min)) ]
+
+     for i in range(w + 1, len(seq) - k_size + 1):
+         new_kmer = seq[i:i+k_size]
+         # updating the window
+         discarded_kmer = window_kmers.popleft()
+         window_kmers.append(new_kmer)
+
+         # we discarded the previous window's minimizer, so find the new minimizer by brute force
+         if curr_min == discarded_kmer:
+             curr_min = min(window_kmers)
+             minimizers.append( (curr_min, list(window_kmers).index(curr_min) + i - w ) )
+
+         # the previous minimizer is still in the window; we only need to compare it with the newly added k-mer
+         elif new_kmer < curr_min:
+             curr_min = new_kmer
+             minimizers.append( (curr_min, i) )
+
+     return minimizers
+
+
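A quick trace of the minimizer scheme (toy input, verified by hand; each window holds w_size - k_size + 1 = 3 consecutive k-mers and each tuple is (minimizer, start position)):

# k-mers of "ACGTACGT" with k=3: ACG, CGT, GTA, TAC, ACG, CGT
get_kmer_minimizers("ACGTACGT", 3, 5)
# -> [('ACG', 0), ('CGT', 1), ('ACG', 4)]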
+ def get_all_hits(minimizers, minimizer_database, read_cl_id):
+     """
+     Get all representative IDs that share minimizers with the read.
+     """
+     hit_clusters_ids = defaultdict(int)
+     hit_clusters_hit_index = defaultdict(list)
+     hit_clusters_hit_positions = defaultdict(list)
+     for i, (m, pos) in enumerate(minimizers): # iterating over minimizers from upstream to downstream in the read
+         if m in minimizer_database:
+             for cl_id in minimizer_database[m]:
+                 hit_clusters_ids[cl_id] += 1
+                 hit_clusters_hit_index[cl_id].append(i) # index of the minimizer among the coordinate-sorted minimizers in the read
+                 hit_clusters_hit_positions[cl_id].append(pos) # position of the minimizer among the coordinate-sorted minimizers in the read
+
+     if read_cl_id in hit_clusters_ids:
+         del hit_clusters_ids[read_cl_id]
+         del hit_clusters_hit_index[read_cl_id]
+         del hit_clusters_hit_positions[read_cl_id]
+
+     return hit_clusters_ids, hit_clusters_hit_index, hit_clusters_hit_positions
+
+
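A minimal sketch of the data flow (toy values; assumes get_all_hits from this module). The minimizer database maps a k-mer to the set of cluster IDs whose representatives contain it:

minimizer_database = {"ACG": {0, 3}, "CGT": {3}}
minimizers = [("ACG", 0), ("CGT", 4)]   # (k-mer, position) pairs from the read
hits, hit_index, hit_positions = get_all_hits(minimizers, minimizer_database, read_cl_id=7)
# hits          -> {0: 1, 3: 2}        shared-minimizer counts per candidate cluster
# hit_index     -> {0: [0], 3: [0, 1]}
# hit_positions -> {0: [0], 3: [0, 4]}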
+ def get_best_cluster(read_cl_id, compressed_seq_len, hit_clusters_ids, hit_clusters_hit_positions, minimizers, number_of_minimizers, hit_clusters_hit_index, representatives, p_emp_probs, args):
+     """
+     Tally up the total covered (mapped) region and compare it with the total unmapped region. What counts as a consecutive mapped block
+     depends on the minimizer quality threshold; the ratio mapped-read-length/total-read-length decides whether a read belongs to the cluster.
+
+     Returns: the cluster ID (an integer >= 0) that the read was assigned to, or -1 if it was not assigned to any existing cluster.
+     [The number of shared minimizers and the mapped ratio of the best hit are also returned, for logging purposes.]
+     """
+     best_cluster_id = -1
+     nr_shared_kmers = 0
+     mapped_ratio = 0.0
+     if hit_clusters_ids:
+         top_matches = sorted(hit_clusters_hit_positions.items(), key=lambda x: (len(x[1]), sum(x[1]), representatives[x[0]][2]), reverse=True) #sorted(hit_clusters_ids.items(), key=lambda x: x[1], reverse=True)
+         top_hits = len(top_matches[0][1])
+         nr_shared_kmers = top_hits
+         if top_hits < args.min_shared:
+             pass
+         else:
+             for tm in top_matches:
+                 cl_id = tm[0]
+                 nm_hits = len(tm[1])
+                 if nm_hits < args.min_fraction * top_hits or nm_hits < args.min_shared:
+                     break
+
+                 # cl_size = len(hit_clusters_ids)
+                 minimizer_hit_positions = hit_clusters_hit_positions[cl_id]
+                 minimizer_hit_indices = hit_clusters_hit_index[cl_id]
+                 assert len(minimizer_hit_indices) == len(minimizer_hit_positions)
+                 _, _, _, _, _, _, error_rate_c, rep_compressed_seq = representatives[cl_id]
+                 _, _, _, _, _, _, error_rate_read, _ = representatives[read_cl_id]
+                 p_error_in_kmers_emp = 1.0 - p_shared_minimizer_empirical(error_rate_read, error_rate_c, p_emp_probs)
+                 minimizer_error_probabilities = [p_error_in_kmers_emp]*number_of_minimizers
+                 total_mapped = 0
+                 # prev_mpos = 0
+                 # probability that every minimizer in each gap (before the first hit, between
+                 # consecutive hits, and after the last hit) was destroyed by errors
+                 prob_all_errors_since_last_hit = [reduce(mul, minimizer_error_probabilities[: minimizer_hit_indices[0]], 1)] + [ reduce(mul, minimizer_error_probabilities[hit_idx1+1: hit_idx2], 1) for hit_idx1, hit_idx2 in zip(minimizer_hit_indices[:-1], minimizer_hit_indices[1:]) ] + [reduce(mul, minimizer_error_probabilities[minimizer_hit_indices[-1]+1 : ], 1)]
+
+                 assert len(prob_all_errors_since_last_hit) == len(minimizer_hit_positions) + 1
+                 for i in range(len(minimizer_hit_indices)):
+                     if prob_all_errors_since_last_hit[i] < args.min_prob_no_hits:
+                         pass
+                     else:
+                         if i == 0:
+                             total_mapped += minimizer_hit_positions[i]
+                         else:
+                             total_mapped += minimizer_hit_positions[i] - minimizer_hit_positions[i-1]
+                 if prob_all_errors_since_last_hit[-1] < args.min_prob_no_hits:
+                     pass
+                 else:
+                     total_mapped += compressed_seq_len - minimizer_hit_positions[-1]
+
+                 mapped_ratio = total_mapped / float(compressed_seq_len)
+
+                 # Calculate the ratio of mapped region of the representative
+                 rep_mapped_ratio = total_mapped / float(len(rep_compressed_seq))
+
+                 if args.symmetric_map_align_thresholds and min(mapped_ratio, rep_mapped_ratio) > args.mapped_threshold:
+                     return cl_id, nm_hits, min(mapped_ratio, rep_mapped_ratio)
+                 elif not args.symmetric_map_align_thresholds and mapped_ratio > args.mapped_threshold:
+                     return cl_id, nm_hits, mapped_ratio
+
+     return best_cluster_id, nr_shared_kmers, mapped_ratio
+
+
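The one-line reduce expression in get_best_cluster is dense; here is a minimal equivalent sketch with invented values, computing the probability that every minimizer in each gap (before the first hit, between consecutive hits, after the last hit) was lost to errors:

p = 0.3                   # assumed probability that a given minimizer is destroyed by errors
n = 10                    # total number of minimizers in the read
hit_indices = [2, 3, 7]   # indices of the minimizers shared with a candidate cluster

gaps = [hit_indices[0]] \
     + [j - i - 1 for i, j in zip(hit_indices, hit_indices[1:])] \
     + [n - hit_indices[-1] - 1]        # -> [2, 0, 3, 2]
prob_all_errors = [p**g for g in gaps]  # -> [0.09, 1.0, 0.027, 0.09] (up to float rounding)
# len(prob_all_errors) == len(hit_indices) + 1, matching the assert in get_best_cluster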
+ def parasail_block_alignment(s1, s2, k, match_id, match_score = 2, mismatch_penalty = -2, opening_penalty = 5, gap_ext = 1):
+     user_matrix = parasail.matrix_create("ACGT", match_score, mismatch_penalty)
+     result = parasail.sg_trace_scan_16(s1, s2, opening_penalty, gap_ext, user_matrix)
+     if result.saturated:
+         logging.warning(f"SATURATED! {len(s1)} {len(s2)}")
+         result = parasail.sg_trace_scan_32(s1, s2, opening_penalty, gap_ext, user_matrix)
+         logging.warning("computed 32 bit instead")
+
+     # the way to obtain the CIGAR string from parasail differs between Python 2 and 3
+     if sys.version_info[0] < 3:
+         cigar_string = str(result.cigar.decode).decode('utf-8')
+     else:
+         cigar_string = str(result.cigar.decode, 'utf-8')
+
+     s1_alignment, s2_alignment = help_functions.cigar_to_seq(cigar_string, s1, s2)
+
+     # Rolling window of matching blocks
+     match_vector = [ 1 if n1 == n2 else 0 for n1, n2 in zip(s1_alignment, s2_alignment) ]
+     match_window = deque(match_vector[:k]) # initialization
+     current_match_count = sum(match_window)
+     aligned_region = []
+     if current_match_count >= match_id:
+         aligned_region.append(1)
+     else:
+         aligned_region.append(0)
+
+     for new_m_state in match_vector[k:]:
+         prev_m_state = match_window.popleft()
+         current_match_count = current_match_count - prev_m_state + new_m_state
+         match_window.append(new_m_state)
+
+         if current_match_count >= match_id:
+             aligned_region.append(1)
+         else:
+             aligned_region.append(0)
+
+     alignment_ratio = sum(aligned_region)/float(len(s1))
+     target_alignment_ratio = sum(aligned_region)/float(len(s2))
+     return (s1, s2, (s1_alignment, s2_alignment, alignment_ratio, target_alignment_ratio))
+
+
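A hedged toy invocation of parasail_block_alignment (sequences and error rate invented; assumes parasail and the package's help_functions are importable):

import math

s1 = "ACGTACGTGGTACGT"
s2 = "ACGTACGTCGTACGT"                  # one substitution relative to s1
error_rate_sum = 0.06                   # combined read + representative error rate
match_id = math.floor((1.0 - error_rate_sum) * 6)   # per-window match threshold, as in get_best_cluster_block_align
_, _, (a1, a2, ratio, target_ratio) = parasail_block_alignment(s1, s2, 6, match_id, opening_penalty=3)
# 'ratio' is the number of k-sized alignment windows reaching match_id matches, divided by len(s1)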
+ def get_best_cluster_block_align(read_cl_id, representatives, hit_clusters_ids, hit_clusters_hit_positions, phred_char_to_p, args):
+     best_cluster_id = -1
+     top_matches = sorted(hit_clusters_hit_positions.items(), key=lambda x: (len(x[1]), sum(x[1]), representatives[x[0]][2]), reverse=True) #sorted(hit_clusters_ids.items(), key=lambda x: x[1], reverse=True)
+     _, _, _, seq, r_qual, _, _, _ = representatives[read_cl_id]
+     top_hits = len(top_matches[0][1])
+     alignment_ratio = 0.0
+     for tm in top_matches:
+         cl_id = tm[0]
+         nm_hits = len(tm[1])
+         if nm_hits < top_hits:
+             break
+         _, _, _, c_seq, c_qual, _, _, _ = representatives[cl_id]
+
+         poisson_mean = sum([ r_qual.count(char_) * phred_char_to_p[char_] for char_ in set(r_qual)])
+         poisson_mean2 = sum([ c_qual.count(char_) * phred_char_to_p[char_] for char_ in set(c_qual)])
+
+         error_rate_sum = poisson_mean/float(len(seq)) + poisson_mean2/float(len(c_seq)) # k = max(int(mean_plus_two_stdvs_q2 + mean_plus_two_stdvs_q1) + 1 + int(len(seq)*args.variant_rate) , 40)
+         if error_rate_sum <= 0.01:
+             gap_opening_penalty = 5
+         elif 0.01 < error_rate_sum <= 0.04:
+             gap_opening_penalty = 4
+         elif 0.04 < error_rate_sum <= 0.1:
+             gap_opening_penalty = 3
+         else:
+             gap_opening_penalty = 2
+
+         match_id_tailored = math.floor((1.0 - error_rate_sum) * args.k)
+         (s1, s2, (s1_alignment, s2_alignment, alignment_ratio, target_alignment_ratio)) = parasail_block_alignment(seq, c_seq, args.k, match_id_tailored, opening_penalty = gap_opening_penalty)
+         if args.symmetric_map_align_thresholds and min(alignment_ratio, target_alignment_ratio) >= args.aligned_threshold:
+             return cl_id, nm_hits, error_rate_sum, s1_alignment, s2_alignment, min(alignment_ratio, target_alignment_ratio)
+         elif not args.symmetric_map_align_thresholds and alignment_ratio >= args.aligned_threshold:
+             return cl_id, nm_hits, error_rate_sum, s1_alignment, s2_alignment, alignment_ratio
+
+     return best_cluster_id, 0, -1, -1, -1, alignment_ratio
+
+ def reads_to_clusters(clusters, representatives, sorted_reads, p_emp_probs, minimizer_database, new_batch_index, args):
+     """
+     Iterates through reads in sorted order (w.r.t. score) and:
+         1. Homopolymer-compresses the read and obtains its minimizers.
+         2. Finds the homopolymer-compressed error rate (unless already computed in a previous pass when more than one core is specified).
+         3. Finds all representatives that share minimizers with the read.
+         4. Finds the best of the hits using the mapping approach.
+         5. If no hit is found in 4, tries to align to the representative with the most shared minimizers.
+         6. Adds the current read to a representative, or makes it a new representative of a new cluster.
+         7. If it becomes a new representative: adds its minimizers to the minimizer database.
+         8. Assigns the actual reads to their new cluster and their new cluster representative (all reads were initialized as their own representatives to deal with multiprocessing).
+     """
+
+     ## For multiprocessing only
+     prev_b_indices = [ prev_batch_index for (read_cl_id, prev_batch_index, acc, seq, qual, score) in sorted_reads ]
+     lowest_batch_index = max(1, min(prev_b_indices or [1]))
+     skip_count = prev_b_indices.count(lowest_batch_index)
+     logging.debug("Saved: {0} iterations.".format(skip_count))
+     ###################################
+
+     ## logging counters
+     aln_passed_criteria = 0
+     mapped_passed_criteria = 0
+     aln_called = 0
+     ###################
+
+     phred_char_to_p = {chr(i) : min( 10**( - (ord(chr(i)) - 33)/10.0 ), 0.79433) for i in range(128)} # PHRED-encoded quality character to probability of error. Needed locally when multiprocessing.
+     cluster_to_new_cluster_id = {}
+
+     if args.print_output:
+         logging.debug("Iteration\tNrClusters\tMinDbSize\tCurrReadId\tClusterSizes")
+
+     for i, (read_cl_id, prev_batch_index, acc, seq, qual, score) in enumerate(sorted_reads):
+
+         ## This if statement is only active in the parallelization code,
+         ## to keep track of reads already processed in a previous iteration
+         if prev_batch_index == lowest_batch_index:
+             lst = list(representatives[read_cl_id])
+             lst[1] = new_batch_index
+             t = tuple(lst)
+             representatives[read_cl_id] = t # just update the batch index
+             continue
+         ##############################################################
+
+         ################################################################################
+         ############ Just for development purposes, print some info to stdout ############
+         if i % args.print_output == 0:
+             inv_map = {}
+             for k, v in cluster_to_new_cluster_id.items():
+                 inv_map.setdefault(v, set()).add(k)
+             cl_tmp = sorted( [ 1 + sum([len(clusters[cl_id]) for cl_id in c ]) for c in inv_map.values() ], reverse= True)
+             cl_tmp_nontrivial = [cl_size_tmp for cl_size_tmp in cl_tmp if cl_size_tmp > 1]
+             logging.debug("{0}\t{1}\t{2}\t{3}\t{4}".format(i, len(cl_tmp_nontrivial), len(minimizer_database), "_".join(acc.split("_")[:-1]), ",".join([str(s_) for s_ in sorted(cl_tmp_nontrivial, reverse=True)])))
+         ################################################################################
+         ################################################################################
+
+         # 1. Homopolymer-compress the read and obtain its minimizers
+
+         seq_hpol_comp = ''.join(ch for ch, _ in itertools.groupby(seq))
+         if len(seq_hpol_comp) < args.k:
+             logging.debug( f"skipping read of length: {len(seq)} homopolymer compressed: {len(seq_hpol_comp)} {seq}")
+             continue
+         minimizers = get_kmer_minimizers(seq_hpol_comp, args.k, args.w)
+
+         # 2. Find the homopolymer-compressed error rate (the else branch is the only one active in single-core mode)
+
+         if len(representatives[read_cl_id]) == 8: # the homopolymer-compressed error rate was already computed in a previous iteration (when isONclust is called with multiple cores)
+             lst = list(representatives[read_cl_id])
+             lst[1] = new_batch_index
+             t = tuple(lst)
+             representatives[read_cl_id] = t # just update the batch index
+         else:
+             all_read_hpol_lengths = [len([c for c in g]) for ch, g in itertools.groupby(seq)]
+             qualcomp = []
+             start = 0
+             for h_len in all_read_hpol_lengths:
+                 q_max = min(qual[start: start + h_len], key = lambda x: phred_char_to_p[x])
+                 qualcomp.append(q_max)
+                 start += h_len
+             qualcomp = "".join([q for q in qualcomp])
+             assert len(seq_hpol_comp) == len(qualcomp)
+
+             # compute the average error rate after compression
+             poisson_mean = sum([ qualcomp.count(char_) * phred_char_to_p[char_] for char_ in set(qualcomp)])
+             h_pol_compr_error_rate = poisson_mean/float(len(qualcomp))
+             representatives[read_cl_id] = (read_cl_id, new_batch_index, acc, seq, qual, score, h_pol_compr_error_rate, seq_hpol_comp) # add the homopolymer-compressed error rate to the info tuple of the cluster-origin sequence
+
+         # 3. Find all representatives with shared minimizers (this is the time-consuming function for noisy and large datasets)
+
+         hit_clusters_ids, hit_clusters_hit_index, hit_clusters_hit_positions = get_all_hits(minimizers, minimizer_database, read_cl_id)
+
+         # 4. Find the best of the hits using the mapping approach
+
+         best_cluster_id_m, nr_shared_kmers_m, mapped_ratio = get_best_cluster(read_cl_id, len(seq_hpol_comp), hit_clusters_ids, hit_clusters_hit_positions, minimizers, len(minimizers), hit_clusters_hit_index, representatives, p_emp_probs, args)
+
+         # 5. If step 4 is unsuccessful, try to align the read to the representative(s) with the most shared minimizers.
+
+         if best_cluster_id_m >= 0:
+             mapped_passed_criteria += 1
+
+         if best_cluster_id_m < 0 and nr_shared_kmers_m >= args.min_shared:
+             aln_called += 1
+             best_cluster_id_a, nr_shared_kmers_a, error_rate_sum, s1_alignment, s2_alignment, alignment_ratio = get_best_cluster_block_align(read_cl_id, representatives, hit_clusters_ids, hit_clusters_hit_positions, phred_char_to_p, args)
+             if best_cluster_id_a >= 0:
+                 aln_passed_criteria += 1
+         else:
+             best_cluster_id_a = -1
+
+         # 6. Add the current read to a representative, or make it a new representative of a new cluster.
+
+         best_cluster_id = max(best_cluster_id_m, best_cluster_id_a)
+         if best_cluster_id >= 0:
+             cluster_to_new_cluster_id[read_cl_id] = best_cluster_id
+
+         # 7. If the read becomes a new representative: add its minimizers to the minimizer database.
+
+         else: # the read stays in its own cluster as a new representative; add its minimizers
+             for m, pos in minimizers:
+                 if m in minimizer_database:
+                     minimizer_database[m].add(read_cl_id)
+                 else:
+                     minimizer_database[m] = set()
+                     minimizer_database[m].add(read_cl_id)
+
+     # 8. Since all reads were initialized as their own representatives, reassign reads to their new representative (this approach was implemented to deal with iterative assignment in the multiprocessing version)
+     for read_cl_id in cluster_to_new_cluster_id:
+         new_cl_id = cluster_to_new_cluster_id[read_cl_id]
+         all_reads = clusters[read_cl_id]
+         for read_acc in all_reads:
+             clusters[new_cl_id].append(read_acc)
+         del clusters[read_cl_id]
+         # delete old origins
+         del representatives[read_cl_id]
+     ##########################
+
+     logging.debug("Total number of reads iterated through: {0}".format(len(sorted_reads)))
+     logging.debug("Passed mapping criteria: {0}".format(mapped_passed_criteria))
+     logging.debug("Passed alignment criteria in this process: {0}".format(aln_passed_criteria))
+     logging.debug("Total calls to alignment module in this process: {0}".format(aln_called))
+
+     return { new_batch_index : (clusters, representatives, minimizer_database, new_batch_index)}
+
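To make steps 1 and 2 concrete, a minimal sketch of the homopolymer compression and the Phred-based error-rate estimate used above (toy read, values checked by hand):

import itertools

phred_char_to_p = {chr(i): min(10 ** (-(i - 33) / 10.0), 0.79433) for i in range(128)}

seq, qual = "AACGGG", "I5I+II"     # toy read; 'I' = Q40, '5' = Q20, '+' = Q10
seq_hpol_comp = ''.join(ch for ch, _ in itertools.groupby(seq))   # -> "ACG"

# per homopolymer run, keep the quality character with the lowest error probability
qualcomp, start = [], 0
for _, g in itertools.groupby(seq):
    h_len = len(list(g))
    qualcomp.append(min(qual[start:start + h_len], key=lambda x: phred_char_to_p[x]))
    start += h_len
qualcomp = "".join(qualcomp)       # -> "III"

poisson_mean = sum(qualcomp.count(c) * phred_char_to_p[c] for c in set(qualcomp))
error_rate = poisson_mean / len(qualcomp)   # -> 0.0001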
+ def p_shared_minimizer_empirical(error_rate_read, error_rate_center, p_emp_probs):
+     # clamp both error rates to the [0.01, 0.15] range covered by the empirical table
+     e1 = round(error_rate_read, 2)
+     if e1 > 0.15:
+         e1 = 0.15
+     if e1 < 0.01:
+         e1 = 0.01
+     e2 = round(error_rate_center, 2)
+     if e2 > 0.15:
+         e2 = 0.15
+     if e2 < 0.01:
+         e2 = 0.01
+     p_kmer_shared = p_emp_probs[(e1, e2)]
+     return p_kmer_shared
+
+