NGSpeciesID 0.3.1 (py2.py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- modules/__init__.py +0 -0
- modules/barcode_trimmer.py +104 -0
- modules/cluster.py +373 -0
- modules/consensus.py +278 -0
- modules/get_sorted_fastq_for_cluster.py +218 -0
- modules/help_functions.py +104 -0
- modules/p_minimizers_shared.py +3 -0
- modules/parallelize.py +218 -0
- ngspeciesid-0.3.1.data/scripts/NGSpeciesID +288 -0
- ngspeciesid-0.3.1.dist-info/METADATA +350 -0
- ngspeciesid-0.3.1.dist-info/RECORD +14 -0
- ngspeciesid-0.3.1.dist-info/WHEEL +6 -0
- ngspeciesid-0.3.1.dist-info/licenses/LICENSE.txt +674 -0
- ngspeciesid-0.3.1.dist-info/top_level.txt +1 -0
modules/__init__.py
ADDED
File without changes

modules/barcode_trimmer.py
ADDED
@@ -0,0 +1,104 @@
import edlib
import logging

from modules import help_functions

def reverse_complement(string):
    #rev_nuc = {'A':'T', 'C':'G', 'G':'C', 'T':'A', 'N':'N', 'X':'X'}
    # Modified for Abyss output
    rev_nuc = {'A':'T', 'C':'G', 'G':'C', 'T':'A', 'a':'t', 'c':'g', 'g':'c', 't':'a', 'N':'N', 'X':'X', 'n':'n', 'Y':'R', 'R':'Y', 'K':'M', 'M':'K', 'S':'S', 'W':'W', 'B':'V', 'V':'B', 'H':'D', 'D':'H', 'y':'r', 'r':'y', 'k':'m', 'm':'k', 's':'s', 'w':'w', 'b':'v', 'v':'b', 'h':'d', 'd':'h'}

    rev_comp = ''.join([rev_nuc[nucl] for nucl in reversed(string)])
    return rev_comp

def read_barcodes(primer_file):
    barcodes = { acc + '_fw' : seq.strip() for acc, (seq, _) in help_functions.readfq(open(primer_file, 'r'))}

    for acc, seq in list(barcodes.items()):
        logging.debug(f"{acc} {seq} {acc[:-3]}")
        barcodes[acc[:-3] + '_rc'] = reverse_complement(seq.upper())

    logging.debug(f"{barcodes}")
    return barcodes
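
For orientation, a minimal usage sketch (the primer file name and its contents are hypothetical): readfq yields (accession, (sequence, qualities)) tuples, so each primer is stored once as <acc>_fw and once reverse-complemented as <acc>_rc:

# Hypothetical primers.fa:
#   >ITS1
#   TCCGTAGGTGAACCTGCGG
barcodes = read_barcodes("primers.fa")
assert barcodes == {
    'ITS1_fw': 'TCCGTAGGTGAACCTGCGG',
    'ITS1_rc': 'CCGCAGGTTCACCTACGGA',  # reverse complement of the forward primer
}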

def get_universal_tails():
    barcodes = {'1_F_fw' : 'TTTCTGTTGGTGCTGATATTGC',
                '2_R_rc' : 'ACTTGCCTGTCGCTCTATCTTC'}
    barcodes['1_F_rc'] = reverse_complement(barcodes['1_F_fw'])
    barcodes['2_R_fw'] = reverse_complement(barcodes['2_R_rc'])
    logging.debug(f"{barcodes}")
    return barcodes


def find_barcode_locations(center, barcodes, primer_max_ed):
    "Find barcodes in a center using edlib"

    # Create an IUPAC equivalence map so edlib can handle IUPAC codes in primers.
    # The IUPAC map was created with:
    # from Bio.Data import IUPACData
    # IUPAC_map = [(i, k) for i, j in IUPACData.ambiguous_dna_values.items() for k in j]
    IUPAC_map = [('A', 'A'), ('C', 'C'), ('G', 'G'), ('T', 'T'), ('M', 'A'), ('M', 'C'),
                 ('R', 'A'), ('R', 'G'), ('W', 'A'), ('W', 'T'), ('S', 'C'), ('S', 'G'),
                 ('Y', 'C'), ('Y', 'T'), ('K', 'G'), ('K', 'T'), ('V', 'A'), ('V', 'C'),
                 ('V', 'G'), ('H', 'A'), ('H', 'C'), ('H', 'T'), ('D', 'A'), ('D', 'G'),
                 ('D', 'T'), ('B', 'C'), ('B', 'G'), ('B', 'T'), ('X', 'G'), ('X', 'A'),
                 ('X', 'T'), ('X', 'C'), ('N', 'G'), ('N', 'A'), ('N', 'T'), ('N', 'C')]
    all_locations = []
    for primer_acc, primer_seq in barcodes.items():
        # Passing additionalEqualities=IUPAC_map lets edlib treat IUPAC codes as matches
        result = edlib.align(primer_seq, center,
                             mode="HW", task="locations", k=primer_max_ed,
                             additionalEqualities=IUPAC_map)
        ed = result["editDistance"]
        locations = result["locations"]
        logging.debug(f"{locations} {ed}")
        if locations:
            all_locations.append((primer_acc, locations[0][0], locations[0][1], ed))
    return all_locations
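
A minimal standalone sketch of the edlib call above (toy sequences): in "HW" (infix) mode edlib finds the best match of the primer anywhere within the target, reports start/end coordinates inclusively, and the equivalence pairs let an IUPAC code such as 'Y' match both 'C' and 'T':

import edlib

# Toy example: a primer containing the IUPAC code 'Y' (C or T).
result = edlib.align("ACGYAC", "TTTTACGTACTTTT",
                     mode="HW", task="locations", k=1,
                     additionalEqualities=[('Y', 'C'), ('Y', 'T')])
print(result["editDistance"])  # 0 -- the 'Y' is allowed to match the 'T'
print(result["locations"])     # [(4, 9)] -- inclusive start/end of the hit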


def remove_barcodes(centers, barcodes, args):
    """
    Modifies consensus sequences by chopping them off at barcode sites.
    This implies changing the data structure `centers` with the modified consensus sequences.
    """

    centers_updated = False
    for i, (nr_reads_in_cluster, c_id, center, reads_path_name) in enumerate(centers):

        # if the consensus is shorter than 2*trim_window, set the trim window to half the sequence
        if 2*args.trim_window > len(center):
            trim_window = len(center)//2
        else:
            trim_window = args.trim_window

        barcode_locations_beginning = find_barcode_locations(center[:trim_window], barcodes, args.primer_max_ed)
        barcode_locations_end = find_barcode_locations(center[-trim_window:], barcodes, args.primer_max_ed)
        logging.debug(f"{center}")

        cut_start = 0
        if barcode_locations_beginning:
            logging.debug(f"FOUND BARCODE BEGINNING {barcode_locations_beginning}")
            for bc, start, stop, ed in barcode_locations_beginning:
                if stop > cut_start:
                    cut_start = stop

        cut_end = len(center)
        if barcode_locations_end:
            logging.debug(f"FOUND BARCODE END {barcode_locations_end}")
            earliest_hit = len(center)
            for bc, start, stop, ed in barcode_locations_end:
                if start < earliest_hit:
                    earliest_hit = start
            cut_end = len(center) - (trim_window - earliest_hit)

        if cut_start > 0 or cut_end < len(center):
            center = center[cut_start: cut_end]

            logging.debug(f"{center} NEW")
            logging.debug(f"cut start {cut_start} cut end {cut_end}")
            centers[i][2] = center
            centers_updated = True

    return centers_updated
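To make the cut coordinates concrete, a toy walk-through with invented numbers: a 500 bp consensus, a trim window of 100 bp, a barcode hit ending at position 22 of the first window, and a hit starting at position 60 within the last window:

# Invented numbers illustrating the slicing arithmetic in remove_barcodes.
center_len, trim_window = 500, 100
cut_start = 22                               # stop of the barcode hit near the 5' end
cut_end = center_len - (trim_window - 60)    # hit starts at position 60 of the last window
assert (cut_start, cut_end) == (22, 460)     # trimmed consensus is center[22:460]
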
modules/cluster.py
ADDED
@@ -0,0 +1,373 @@
from __future__ import print_function
from functools import reduce
import os, sys
from collections import defaultdict
import math
from collections import deque
import itertools
from operator import mul
import logging

import parasail

from modules import help_functions


def get_kmer_minimizers(seq, k_size, w_size):
    w = w_size - k_size
    window_kmers = deque([seq[i:i+k_size] for i in range(w + 1)])
    curr_min = min(window_kmers)
    minimizers = [ (curr_min, list(window_kmers).index(curr_min)) ]

    for i in range(w + 1, len(seq) - k_size + 1):
        new_kmer = seq[i:i+k_size]
        # updating the window
        discarded_kmer = window_kmers.popleft()
        window_kmers.append(new_kmer)

        # we have discarded the previous window's minimizer, look for the new minimizer by brute force
        if curr_min == discarded_kmer:
            curr_min = min(window_kmers)
            minimizers.append( (curr_min, list(window_kmers).index(curr_min) + i - w ) )

        # previous minimizer still in the window, we only need to compare it with the recently added k-mer
        elif new_kmer < curr_min:
            curr_min = new_kmer
            minimizers.append( (curr_min, i) )

    return minimizers
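
A quick sanity check of the sliding-window logic (toy sequence; with k_size=3 and w_size=5 each window holds w+1 = 3 consecutive k-mers and the lexicographically smallest one is the minimizer):

minimizers = get_kmer_minimizers("ATGCGTA", k_size=3, w_size=5)
# First window {ATG, TGC, GCG}: minimum is ATG at position 0.
# When ATG slides out, CGT (position 3) becomes the new minimum; GTA never beats it.
assert minimizers == [("ATG", 0), ("CGT", 3)]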


def get_all_hits(minimizers, minimizer_database, read_cl_id):
    """
    Get all representative IDs that share minimizers with the read.
    """
    hit_clusters_ids = defaultdict(int)
    hit_clusters_hit_index = defaultdict(list)
    hit_clusters_hit_positions = defaultdict(list)
    for i, (m, pos) in enumerate(minimizers): # iterating over minimizers from upstream to downstream in the read
        if m in minimizer_database:
            for cl_id in minimizer_database[m]:
                hit_clusters_ids[cl_id] += 1
                hit_clusters_hit_index[cl_id].append(i) # index of the minimizer among the coordinate-sorted minimizers in the read
                hit_clusters_hit_positions[cl_id].append(pos) # position of the minimizer among the coordinate-sorted minimizers in the read

    if read_cl_id in hit_clusters_ids:
        del hit_clusters_ids[read_cl_id]
        del hit_clusters_hit_index[read_cl_id]
        del hit_clusters_hit_positions[read_cl_id]

    return hit_clusters_ids, hit_clusters_hit_index, hit_clusters_hit_positions


def get_best_cluster(read_cl_id, compressed_seq_len, hit_clusters_ids, hit_clusters_hit_positions, minimizers, nummber_of_minimizers, hit_clusters_hit_index, representatives, p_emp_probs, args):
    """
    Tally up the total covered (mapped) region and compare it with the total unmapped region.
    What counts as a consecutive mapped block depends on the minimizer quality threshold,
    since mapped-read-length/total-read-length is used to classify a read as belonging to the cluster.

    Return: An integer >= 0 denoting the cluster ID that this read was assigned to.
    If not assigned to any previous cluster, return -1.
    [Also returns the mapped ratio and the number of minimizers shared with the best read, for logging purposes.]
    """
    best_cluster_id = -1
    nr_shared_kmers = 0
    mapped_ratio = 0.0
    if hit_clusters_ids:
        top_matches = sorted(hit_clusters_hit_positions.items(), key=lambda x: (len(x[1]), sum(x[1]), representatives[x[0]][2]), reverse=True)
        top_hits = len(top_matches[0][1])
        nr_shared_kmers = top_hits
        if top_hits < args.min_shared:
            pass
        else:
            for tm in top_matches:
                cl_id = tm[0]
                nm_hits = len(tm[1])
                if nm_hits < args.min_fraction * top_hits or nm_hits < args.min_shared:
                    break

                minimizer_hit_positions = hit_clusters_hit_positions[cl_id]
                minimizer_hit_indices = hit_clusters_hit_index[cl_id]
                assert len(minimizer_hit_indices) == len(minimizer_hit_positions)
                _, _, _, _, _, _, error_rate_c, rep_compressed_seq = representatives[cl_id]
                _, _, _, _, _, _, error_rate_read, _ = representatives[read_cl_id]
                p_error_in_kmers_emp = 1.0 - p_shared_minimizer_empirical(error_rate_read, error_rate_c, p_emp_probs)
                minimizer_error_probabilities = [p_error_in_kmers_emp]*nummber_of_minimizers
                total_mapped = 0
                # probability that all minimizers before the first hit, between consecutive hits, and after the last hit were destroyed by errors
                prob_all_errors_since_last_hit = [reduce(mul, minimizer_error_probabilities[: minimizer_hit_indices[0]], 1)] + [ reduce(mul, minimizer_error_probabilities[hit_idx1+1: hit_idx2], 1) for hit_idx1, hit_idx2 in zip(minimizer_hit_indices[:-1], minimizer_hit_indices[1:]) ] + [reduce(mul, minimizer_error_probabilities[minimizer_hit_indices[-1]+1 : ], 1)]

                assert len(prob_all_errors_since_last_hit) == len(minimizer_hit_positions) + 1
                for i in range(len(minimizer_hit_indices)):
                    if prob_all_errors_since_last_hit[i] < args.min_prob_no_hits:
                        pass
                    else:
                        if i == 0:
                            total_mapped += minimizer_hit_positions[i]
                        else:
                            total_mapped += minimizer_hit_positions[i] - minimizer_hit_positions[i-1]
                if prob_all_errors_since_last_hit[-1] < args.min_prob_no_hits:
                    pass
                else:
                    total_mapped += compressed_seq_len - minimizer_hit_positions[-1]

                mapped_ratio = total_mapped / float(compressed_seq_len)

                # Calculate the ratio of mapped region of the representative
                rep_mapped_ratio = total_mapped / float(len(rep_compressed_seq))

                if args.symmetric_map_align_thresholds and min(mapped_ratio, rep_mapped_ratio) > args.mapped_threshold:
                    return cl_id, nm_hits, min(mapped_ratio, rep_mapped_ratio)
                elif not args.symmetric_map_align_thresholds and mapped_ratio > args.mapped_threshold:
                    return cl_id, nm_hits, mapped_ratio

    return best_cluster_id, nr_shared_kmers, mapped_ratio
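
The gap test above can be read as follows (invented numbers): if each minimizer is destroyed by errors with empirical probability 0.1, a run of three consecutive missing minimizers between two hits happens by chance with probability 0.1^3 = 0.001; only if that probability is at least args.min_prob_no_hits is the stretch between the hits counted as mapped:

from functools import reduce
from operator import mul

p_error_in_kmers_emp = 0.1                      # invented per-minimizer loss probability
minimizer_error_probabilities = [p_error_in_kmers_emp] * 10
hit_idx1, hit_idx2 = 2, 6                       # hypothetical consecutive hit indices

# Probability that all minimizers strictly between the two hits were lost to errors
gap_prob = reduce(mul, minimizer_error_probabilities[hit_idx1 + 1 : hit_idx2], 1)
assert abs(gap_prob - 0.1**3) < 1e-12           # three missing minimizers in the gap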


def parasail_block_alignment(s1, s2, k, match_id, match_score = 2, mismatch_penalty = -2, opening_penalty = 5, gap_ext = 1):
    user_matrix = parasail.matrix_create("ACGT", match_score, mismatch_penalty)
    result = parasail.sg_trace_scan_16(s1, s2, opening_penalty, gap_ext, user_matrix)
    if result.saturated:
        logging.warning(f"SATURATED! {len(s1)} {len(s2)}")
        result = parasail.sg_trace_scan_32(s1, s2, opening_penalty, gap_ext, user_matrix)
        logging.warning("computed 32 bit instead")

    # parasail differs between Python 2 and 3 in how the cigar string is obtained
    if sys.version_info[0] < 3:
        cigar_string = str(result.cigar.decode).decode('utf-8')
    else:
        cigar_string = str(result.cigar.decode, 'utf-8')

    s1_alignment, s2_alignment = help_functions.cigar_to_seq(cigar_string, s1, s2)

    # Rolling window of matching blocks
    match_vector = [ 1 if n1 == n2 else 0 for n1, n2 in zip(s1_alignment, s2_alignment) ]
    match_window = deque(match_vector[:k]) # initialization
    current_match_count = sum(match_window)
    aligned_region = []
    if current_match_count >= match_id:
        aligned_region.append(1)
    else:
        aligned_region.append(0)

    for new_m_state in match_vector[k:]:
        prev_m_state = match_window.popleft()
        current_match_count = current_match_count - prev_m_state + new_m_state
        match_window.append(new_m_state)

        if current_match_count >= match_id:
            aligned_region.append(1)
        else:
            aligned_region.append(0)

    alignment_ratio = sum(aligned_region)/float(len(s1))
    target_alignment_ratio = sum(aligned_region)/float(len(s2))
    return (s1, s2, (s1_alignment, s2_alignment, alignment_ratio, target_alignment_ratio))
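
A minimal standalone sketch of the parasail calls used above (toy sequences; sg_trace_scan_16 computes a semi-global alignment with traceback using 16-bit scores, and the function above falls back to the 32-bit version on overflow):

import parasail

s1, s2 = "ACGTACGTACGT", "ACGTACGAACGT"              # invented sequences, one mismatch
user_matrix = parasail.matrix_create("ACGT", 2, -2)  # match +2, mismatch -2, as above
result = parasail.sg_trace_scan_16(s1, s2, 5, 1, user_matrix)
cigar_string = str(result.cigar.decode, 'utf-8')     # parasail CIGARs use '='/'X'
print(result.score, cigar_string)                    # expected: 20 7=1X4=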


def get_best_cluster_block_align(read_cl_id, representatives, hit_clusters_ids, hit_clusters_hit_positions, phred_char_to_p, args):
    best_cluster_id = -1
    top_matches = sorted(hit_clusters_hit_positions.items(), key=lambda x: (len(x[1]), sum(x[1]), representatives[x[0]][2]), reverse=True)
    _, _, _, seq, r_qual, _, _, _ = representatives[read_cl_id]
    top_hits = len(top_matches[0][1])
    alignment_ratio = 0.0
    for tm in top_matches:
        cl_id = tm[0]
        nm_hits = len(tm[1])
        if nm_hits < top_hits:
            break
        _, _, _, c_seq, c_qual, _, _, _ = representatives[cl_id]

        poisson_mean = sum([ r_qual.count(char_) * phred_char_to_p[char_] for char_ in set(r_qual)])
        poisson_mean2 = sum([ c_qual.count(char_) * phred_char_to_p[char_] for char_ in set(c_qual)])

        error_rate_sum = poisson_mean/float(len(seq)) + poisson_mean2/float(len(c_seq))
        if error_rate_sum <= 0.01:
            gap_opening_penalty = 5
        elif 0.01 < error_rate_sum <= 0.04:
            gap_opening_penalty = 4
        elif 0.04 < error_rate_sum <= 0.1:
            gap_opening_penalty = 3
        elif 0.1 < error_rate_sum:
            gap_opening_penalty = 2

        match_id_tailored = math.floor((1.0 - error_rate_sum) * args.k)
        (s1, s2, (s1_alignment, s2_alignment, alignment_ratio, target_alignment_ratio)) = parasail_block_alignment(seq, c_seq, args.k, match_id_tailored, opening_penalty=gap_opening_penalty)
        if args.symmetric_map_align_thresholds and min(alignment_ratio, target_alignment_ratio) >= args.aligned_threshold:
            return cl_id, nm_hits, error_rate_sum, s1_alignment, s2_alignment, min(alignment_ratio, target_alignment_ratio)
        elif not args.symmetric_map_align_thresholds and alignment_ratio >= args.aligned_threshold:
            return cl_id, nm_hits, error_rate_sum, s1_alignment, s2_alignment, alignment_ratio

    return best_cluster_id, 0, -1, -1, -1, alignment_ratio

def reads_to_clusters(clusters, representatives, sorted_reads, p_emp_probs, minimizer_database, new_batch_index, args):
    """
    Iterates through reads in sorted order (w.r.t. score) and:
    1. Homopolymer-compresses the read and obtains its minimizers.
    2. Finds the homopolymer-compressed error rate (if not computed in a previous pass, when more than 1 core is specified to the program).
    3. Finds all representatives that share minimizers with the read.
    4. Finds the best of the hits using the mapping approach.
    5. If no hit is found in 4, tries to align to the representative with the most shared minimizers.
    6. Adds the current read to that representative, or makes it a new representative of a new cluster.
    7. If it is a new representative: adds its minimizers to the minimizer database.
    8. Assigns the actual reads to their new cluster and their new cluster representative (all reads were initialized as their own representatives to deal with multiprocessing).
    """

    ## For multiprocessing only
    prev_b_indices = [ prev_batch_index for (read_cl_id, prev_batch_index, acc, seq, qual, score) in sorted_reads ]
    lowest_batch_index = max(1, min(prev_b_indices or [1]))
    skip_count = prev_b_indices.count(lowest_batch_index)
    logging.debug("Saved: {0} iterations.".format(skip_count))
    ###################################

    ## logging counters
    aln_passed_criteria = 0
    mapped_passed_criteria = 0
    aln_called = 0
    ###################

    # PHRED-encoded quality character to probability of error; needed locally if multiprocessing
    phred_char_to_p = {chr(i) : min( 10**( - (ord(chr(i)) - 33)/10.0 ), 0.79433) for i in range(128)}
    cluster_to_new_cluster_id = {}

    if args.print_output:
        logging.debug("Iteration\tNrClusters\tMinDbSize\tCurrReadId\tClusterSizes")

    for i, (read_cl_id, prev_batch_index, acc, seq, qual, score) in enumerate(sorted_reads):

        ## This if statement is only active in the parallelization code,
        ## to keep track of reads already processed in a previous iteration
        if prev_batch_index == lowest_batch_index:
            lst = list(representatives[read_cl_id])
            lst[1] = new_batch_index
            t = tuple(lst)
            representatives[read_cl_id] = t # just update the batch index
            continue
        ##############################################################

        ################################################################################
        ############ Just for development purposes: log some progress info ############
        if i % args.print_output == 0:
            inv_map = {}
            for k, v in cluster_to_new_cluster_id.items():
                inv_map.setdefault(v, set()).add(k)
            cl_tmp = sorted( [ 1 + sum([len(clusters[cl_id]) for cl_id in c ]) for c in inv_map.values() ], reverse= True)
            cl_tmp_nontrivial = [cl_size_tmp for cl_size_tmp in cl_tmp if cl_size_tmp > 1]
            logging.debug("{0}\t{1}\t{2}\t{3}\t{4}".format(i, len(cl_tmp_nontrivial), len(minimizer_database), "_".join(acc.split("_")[:-1]), ",".join([str(s_) for s_ in sorted(cl_tmp_nontrivial, reverse=True)])))
        ################################################################################
        ################################################################################

        # 1. Homopolymer-compress the read and obtain its minimizers

        seq_hpol_comp = ''.join(ch for ch, _ in itertools.groupby(seq))
        if len(seq_hpol_comp) < args.k:
            logging.debug(f"skipping read of length: {len(seq)} homopolymer compressed: {len(seq_hpol_comp)} {seq}")
            continue
        minimizers = get_kmer_minimizers(seq_hpol_comp, args.k, args.w)

        # 2. Find the homopolymer-compressed error rate (the else branch is the only one active in single-core mode)

        if len(representatives[read_cl_id]) == 8: # homopolymer-compressed error rate already computed in a previous iteration (when isONclust is called with multiple cores)
            lst = list(representatives[read_cl_id])
            lst[1] = new_batch_index
            t = tuple(lst)
            representatives[read_cl_id] = t # just update the batch index
        else:
            all_read_hpol_lengths = [len([c for c in g]) for ch, g in itertools.groupby(seq)]
            qualcomp = []
            start = 0
            for h_len in all_read_hpol_lengths:
                q_max = min(qual[start: start + h_len], key = lambda x: phred_char_to_p[x])
                qualcomp.append(q_max)
                start += h_len
            qualcomp = "".join([q for q in qualcomp])
            assert len(seq_hpol_comp) == len(qualcomp)

            # compute the average error rate after compression
            poisson_mean = sum([ qualcomp.count(char_) * phred_char_to_p[char_] for char_ in set(qualcomp)])
            h_pol_compr_error_rate = poisson_mean/float(len(qualcomp))
            representatives[read_cl_id] = (read_cl_id, new_batch_index, acc, seq, qual, score, h_pol_compr_error_rate, seq_hpol_comp) # add the homopolymer-compressed error rate to the info tuple of the cluster-origin sequence

        # 3. Find all representatives with shared minimizers (this is the time-consuming function for noisy and large datasets)

        hit_clusters_ids, hit_clusters_hit_index, hit_clusters_hit_positions = get_all_hits(minimizers, minimizer_database, read_cl_id)

        # 4. Find the best of the hits using the mapping approach

        best_cluster_id_m, nr_shared_kmers_m, mapped_ratio = get_best_cluster(read_cl_id, len(seq_hpol_comp), hit_clusters_ids, hit_clusters_hit_positions, minimizers, len(minimizers), hit_clusters_hit_index, representatives, p_emp_probs, args)

        # 5. If step 4 is unsuccessful, try to align the read to the representative(s) with the most shared minimizers

        if best_cluster_id_m >= 0:
            mapped_passed_criteria += 1

        if best_cluster_id_m < 0 and nr_shared_kmers_m >= args.min_shared:
            aln_called += 1
            best_cluster_id_a, nr_shared_kmers_a, error_rate_sum, s1_alignment, s2_alignment, alignment_ratio = get_best_cluster_block_align(read_cl_id, representatives, hit_clusters_ids, hit_clusters_hit_positions, phred_char_to_p, args)
            if best_cluster_id_a >= 0:
                aln_passed_criteria += 1

        else:
            best_cluster_id_a = -1

        # 6. Add the current read to that representative's cluster, or make it a new representative of a new cluster

        best_cluster_id = max(best_cluster_id_m, best_cluster_id_a)
        if best_cluster_id >= 0:
            cluster_to_new_cluster_id[read_cl_id] = best_cluster_id

        # 7. If it is a new representative: add its minimizers to the minimizer database

        else: # the read stays in its own cluster (becomes a new representative); add its minimizers
            for m, pos in minimizers:
                if m in minimizer_database:
                    minimizer_database[m].add(read_cl_id)
                else:
                    minimizer_database[m] = set()
                    minimizer_database[m].add(read_cl_id)

    # 8. Since all reads were initialized as their own representatives, reassign reads to their new representative (this approach was implemented to deal with iterative assignment in the multiprocessing version)
    for read_cl_id in cluster_to_new_cluster_id:
        new_cl_id = cluster_to_new_cluster_id[read_cl_id]
        all_reads = clusters[read_cl_id]
        for read_acc in all_reads:
            clusters[new_cl_id].append(read_acc)
        del clusters[read_cl_id]
        # delete old origins
        del representatives[read_cl_id]
    ##########################

    logging.debug("Total number of reads iterated through: {0}".format(len(sorted_reads)))
    logging.debug("Passed mapping criteria: {0}".format(mapped_passed_criteria))
    logging.debug("Passed alignment criteria in this process: {0}".format(aln_passed_criteria))
    logging.debug("Total calls to alignment module in this process: {0}".format(aln_called))

    return { new_batch_index : (clusters, representatives, minimizer_database, new_batch_index)}
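
A standalone illustration of steps 1 and 2 (the read and its qualities are invented): homopolymer compression keeps one character per run, the retained quality is the most reliable one within each run, and a PHRED+33 character q encodes error probability 10^(-(ord(q)-33)/10):

import itertools

seq, qual = "AAACGGT", "!!5I=I?"   # invented read with PHRED+33 qualities
phred_char_to_p = {chr(i): min(10**(-(ord(chr(i)) - 33)/10.0), 0.79433) for i in range(128)}

seq_hpol_comp = ''.join(ch for ch, _ in itertools.groupby(seq))
assert seq_hpol_comp == "ACGT"     # one character per homopolymer run

qualcomp, start = [], 0
for h_len in (len(list(g)) for _, g in itertools.groupby(seq)):
    qualcomp.append(min(qual[start:start + h_len], key=lambda x: phred_char_to_p[x]))
    start += h_len
assert "".join(qualcomp) == "5II?" # most reliable quality kept for each run

p_err = sum(phred_char_to_p[c] for c in qualcomp) / len(qualcomp)  # ~0.0028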


def p_shared_minimizer_empirical(error_rate_read, error_rate_center, p_emp_probs):
    e1 = round(error_rate_read, 2)
    if e1 > 0.15:
        e1 = 0.15
    if e1 < 0.01:
        e1 = 0.01
    e2 = round(error_rate_center, 2)
    if e2 > 0.15:
        e2 = 0.15
    if e2 < 0.01:
        e2 = 0.01
    p_kmer_shared = p_emp_probs[(e1,e2)]
    return p_kmer_shared
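
Both error rates are clamped to the [0.01, 0.15] grid covered by the empirical table; a toy sketch (the probability value is invented, the real p_emp_probs table is estimated elsewhere in the pipeline):

p_emp_probs = {(0.15, 0.15): 0.40}   # invented single-entry table

# 0.16 exceeds the cap and 0.151 rounds to 0.15: both clamp to the table edge.
assert p_shared_minimizer_empirical(0.16, 0.151, p_emp_probs) == 0.40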