NGSpeciesID 0.3.1__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- modules/__init__.py +0 -0
- modules/barcode_trimmer.py +104 -0
- modules/cluster.py +373 -0
- modules/consensus.py +278 -0
- modules/get_sorted_fastq_for_cluster.py +218 -0
- modules/help_functions.py +104 -0
- modules/p_minimizers_shared.py +3 -0
- modules/parallelize.py +218 -0
- ngspeciesid-0.3.1.data/scripts/NGSpeciesID +288 -0
- ngspeciesid-0.3.1.dist-info/METADATA +350 -0
- ngspeciesid-0.3.1.dist-info/RECORD +14 -0
- ngspeciesid-0.3.1.dist-info/WHEEL +6 -0
- ngspeciesid-0.3.1.dist-info/licenses/LICENSE.txt +674 -0
- ngspeciesid-0.3.1.dist-info/top_level.txt +1 -0
modules/consensus.py
ADDED
@@ -0,0 +1,278 @@
from __future__ import print_function
import subprocess
import sys, os
from sys import stdout
import re
import shutil
import parasail
import glob
import logging

from modules import help_functions


def cigar_to_seq(cigar, query, ref):
    # Expand an extended CIGAR string into gap-padded alignment strings for query and ref.
    cigar_tuples = []
    result = re.split(r'[=DXSMI]+', cigar)
    cig_pos = 0
    for length in result[:-1]:
        cig_pos += len(length)
        type_ = cigar[cig_pos]
        cig_pos += 1
        cigar_tuples.append((int(length), type_))

    r_index = 0
    q_index = 0
    q_aln = []
    r_aln = []
    for length_, type_ in cigar_tuples:
        if type_ == "=" or type_ == "X":
            q_aln.append(query[q_index : q_index + length_])
            r_aln.append(ref[r_index : r_index + length_])

            r_index += length_
            q_index += length_

        elif type_ == "I":
            # insertion w.r.t. reference
            r_aln.append('-' * length_)
            q_aln.append(query[q_index : q_index + length_])
            # only the query index changes
            q_index += length_

        elif type_ == 'D':
            # deletion w.r.t. reference
            r_aln.append(ref[r_index : r_index + length_])
            q_aln.append('-' * length_)
            # only the ref index changes
            r_index += length_

        else:
            logging.error("Error processing cigar")
            logging.error(cigar)
            sys.exit()

    return "".join(q_aln), "".join(r_aln), cigar_tuples

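For orientation, a small worked example of cigar_to_seq (the CIGAR string and sequences are invented for illustration):

    q_aln, r_aln, tuples = cigar_to_seq("3=1I2=", "ACGTAC", "ACGAC")
    # q_aln == "ACGTAC", r_aln == "ACG-AC", tuples == [(3, '='), (1, 'I'), (2, '=')]
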
def parasail_alignment(s1, s2, match_score = 2, mismatch_penalty = -2, opening_penalty = 3, gap_ext = 1):
    user_matrix = parasail.matrix_create("ACGT", match_score, mismatch_penalty)
    result = parasail.sg_trace_scan_16(s1, s2, opening_penalty, gap_ext, user_matrix)
    if result.saturated:
        logging.warning(f"SATURATED! {len(s1)} {len(s2)}")
        result = parasail.sg_trace_scan_32(s1, s2, opening_penalty, gap_ext, user_matrix)
        logging.warning("computed 32 bit instead")

    # difference in how to obtain the string from parasail between Python v2 and v3
    if sys.version_info[0] < 3:
        cigar_string = str(result.cigar.decode).decode('utf-8')
    else:
        cigar_string = str(result.cigar.decode, 'utf-8')
    s1_alignment, s2_alignment, cigar_tuples = cigar_to_seq(cigar_string, s1, s2)

    return s1_alignment, s2_alignment, cigar_string, cigar_tuples, result.score

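A minimal usage sketch (sequences invented); the heavy lifting is parasail's semi-global sg_trace_scan alignment shown above:

    s1_aln, s2_aln, cigar, tuples, score = parasail_alignment("ACGTACGT", "ACGTCGT")
    # s1_aln and s2_aln are gap-padded to equal length; score is parasail's alignment score
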
def reverse_complement(string):
    #rev_nuc = {'A':'T', 'C':'G', 'G':'C', 'T':'A', 'N':'N', 'X':'X'}
    # Modified for Abyss output: also handles lowercase and IUPAC ambiguity codes
    rev_nuc = {'A':'T', 'C':'G', 'G':'C', 'T':'A', 'a':'t', 'c':'g', 'g':'c', 't':'a', 'N':'N', 'X':'X', 'n':'n', 'Y':'R', 'R':'Y', 'K':'M', 'M':'K', 'S':'S', 'W':'W', 'B':'V', 'V':'B', 'H':'D', 'D':'H', 'y':'r', 'r':'y', 'k':'m', 'm':'k', 's':'s', 'w':'w', 'b':'v', 'v':'b', 'h':'d', 'd':'h'}

    rev_comp = ''.join([rev_nuc[nucl] for nucl in reversed(string)])
    return rev_comp

def run_spoa(reads, spoa_out_file, spoa_path):
    with open(spoa_out_file, "w") as output_file:
        stdout.flush()
        with open("/dev/null", "w") as null:
            subprocess.check_call([spoa_path, reads, "-l", "0", "-r", "0", "-g", "-2"], stdout=output_file, stderr=null)
        stdout.flush()
    with open(spoa_out_file, "r") as sof:
        l = sof.readlines()
    consensus = l[1].strip()  # spoa prints a FASTA header first; the consensus is on the second line
    return consensus

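The subprocess call is roughly equivalent to the shell command below (input path illustrative); per spoa's CLI, -l 0 selects local alignment, -r 0 prints only the consensus, and -g -2 sets the gap penalty:

    spoa reads_c_id_0.fq -l 0 -r 0 -g -2 > spoa_tmp.fa
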
def run_medaka(reads_to_center, center_file, outfolder, cores, medaka_model, outfastq=False):
    medaka_stdout = os.path.join(outfolder, "stdout.txt")
    with open(medaka_stdout, "w") as output_file:
        stdout.flush()
        with open(os.path.join(outfolder, "stderr.txt"), "w") as medaka_stderr:
            cmd_args = ['medaka_consensus', '-i', reads_to_center, "-d", center_file, "-o", outfolder, "-t", cores]
            if medaka_model:
                cmd_args += ["-m", medaka_model]
            if outfastq:
                cmd_args += ["-q"]
            subprocess.check_call(cmd_args, stdout=output_file, stderr=medaka_stderr)
        stdout.flush()

def run_racon(reads_to_center, center_file, outfolder, cores, racon_iter):
    racon_stdout = os.path.join(outfolder, "stdout.txt")
    with open(racon_stdout, "w") as output_file:
        stdout.flush()
        for i in range(racon_iter):
            with open(
                os.path.join(outfolder, "read_alignments_it_{0}.paf".format(i)), 'w'
            ) as read_alignments, open(
                os.path.join(outfolder, "mm2_stderr_it_{0}.txt".format(i)), "w"
            ) as mm2_stderr, open(
                os.path.join(outfolder, "racon_stderr_it_{0}.txt".format(i)), "w"
            ) as racon_stderr, open(
                os.path.join(outfolder, "racon_polished_it_{0}.fasta".format(i)), 'w'
            ) as racon_polished:
                subprocess.check_call(['minimap2', '-x', 'map-ont', center_file, reads_to_center], stdout=read_alignments, stderr=mm2_stderr)
                subprocess.check_call(['racon', reads_to_center, read_alignments.name, center_file], stdout=racon_polished, stderr=racon_stderr)
                center_file = racon_polished.name

        shutil.copyfile(center_file, os.path.join(outfolder, "consensus.fasta"))
        stdout.flush()

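Each racon iteration is roughly equivalent to the following shell pipeline (paths illustrative):

    minimap2 -x map-ont center.fasta reads.fq > read_alignments_it_0.paf
    racon reads.fq read_alignments_it_0.paf center.fasta > racon_polished_it_0.fasta
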
def highest_aln_identity(seq, seq2):
    # Identity is computed over all alignment columns, so gap columns count as mismatches.
    # RC
    seq2_rc = reverse_complement(seq2)
    seq_aln_rc, seq2_aln_rc, cigar_string_rc, cigar_tuples_rc, alignment_score_rc = parasail_alignment(seq, seq2_rc)
    nr_mismatching_pos = len([1 for n1, n2 in zip(seq_aln_rc, seq2_aln_rc) if n1 != n2])
    total_pos_rc = len(seq_aln_rc)
    aln_identity_rc = (total_pos_rc - nr_mismatching_pos) / float(total_pos_rc)
    logging.debug(f"Rev comp orientation identity %: {aln_identity_rc}")

    # FW
    seq_aln, seq2_aln, cigar_string, cigar_tuples, alignment_score = parasail_alignment(seq, seq2)
    nr_mismatching_pos = len([1 for n1, n2 in zip(seq_aln, seq2_aln) if n1 != n2])
    total_pos = len(seq_aln)
    aln_identity_fw = (total_pos - nr_mismatching_pos) / float(total_pos)
    logging.debug(f"Forward orientation identity %: {aln_identity_fw}")
    aln_identity = max([aln_identity_fw, aln_identity_rc])
    return aln_identity


def detect_reverse_complements(centers, rc_identity_threshold):
    filtered_centers = []
    already_removed = set()
    for i, (nr_reads_in_cl, c_id, seq, reads_path) in enumerate(centers):
        if not isinstance(reads_path, list):
            all_reads = [reads_path]
        else:
            all_reads = reads_path

        merged_nr_reads = nr_reads_in_cl
        if c_id in already_removed:
            logging.debug("has already been merged, skipping")
            continue

        elif i == len(centers) - 1:  # last sequence and it is not in already_removed
            filtered_centers.append([merged_nr_reads, c_id, seq, all_reads])

        else:
            for j, (nr_reads_in_cl2, c_id2, seq2, reads_path2) in enumerate(centers[i+1:]):
                aln_identity = highest_aln_identity(seq, seq2)
                if aln_identity >= rc_identity_threshold:
                    logging.debug("Detected two consensus sequences with alignment identity above the threshold (from either reverse complements or split clusters). Keeping the center with the most read support and merging reads.")
                    merged_nr_reads += nr_reads_in_cl2
                    already_removed.add(c_id2)

                    if not isinstance(reads_path2, list):
                        all_reads.append(reads_path2)
                    else:
                        for rp in reads_path2:
                            all_reads.append(rp)

            filtered_centers.append([merged_nr_reads, c_id, seq, all_reads])

    logging.debug(f"{len(filtered_centers)} consensus sequences formed.")
    return filtered_centers


def polish_sequences(centers, args):
    spoa_ref_location = os.path.join(args.outfolder, "consensus_reference_X.fasta")
    logging.debug(f"Saving spoa references to files: {spoa_ref_location}")
    # remove stale polishing output and spoa references, then group reads per center
    polishing_pattern = None
    if args.medaka:
        polishing_pattern = os.path.join(args.outfolder, "medaka_cl_id_*")
    elif args.racon:
        polishing_pattern = os.path.join(args.outfolder, "racon_cl_id_*")

    if polishing_pattern:
        for folder in glob.glob(polishing_pattern):
            shutil.rmtree(folder)

    spoa_pattern = os.path.join(args.outfolder, "consensus_reference_*")
    for file in glob.glob(spoa_pattern):
        os.remove(file)

    for i, (nr_reads_in_cluster, c_id, center, all_reads) in enumerate(centers):
        spoa_center_file = os.path.join(args.outfolder, "consensus_reference_{0}.fasta".format(c_id))
        with open(spoa_center_file, "w") as f:
            f.write(">{0}\n{1}\n".format("consensus_cl_id_{0}_total_supporting_reads_{1}".format(c_id, nr_reads_in_cluster), center))

        nr_reads_used = 0
        all_reads_file = os.path.join(args.outfolder, "reads_to_consensus_{0}.fastq".format(c_id))
        with open(all_reads_file, "w") as f:
            for fasta_file in all_reads:
                reads = {acc: (seq, qual) for acc, (seq, qual) in help_functions.readfq(open(fasta_file, 'r'))}
                for acc, (seq, qual) in reads.items():
                    acc_tmp = acc.split()[0]
                    f.write("@{0}\n{1}\n{2}\n{3}\n".format(acc_tmp, seq, "+", qual))
                    nr_reads_used += 1

        if args.medaka:
            logging.debug("running medaka on spoa reference {0} using {1} reads for polishing.".format(c_id, nr_reads_used))
            polishing_outfolder = os.path.join(args.outfolder, "medaka_cl_id_{0}".format(c_id))
            outfiles = [  # consider both output formats for compatibility with all medaka versions
                os.path.join(polishing_outfolder, "consensus.fasta"),
                os.path.join(polishing_outfolder, "consensus.fastq")
            ]
            help_functions.mkdir_p(polishing_outfolder)
            run_medaka(all_reads_file, spoa_center_file, polishing_outfolder, "1", args.medaka_model, outfastq=args.medaka_fastq)
            medaka_ref_location = os.path.join(polishing_outfolder, "consensus.fasta")  # or consensus.fastq with --medaka_fastq
            logging.debug(f"Saving medaka reference to file: {medaka_ref_location}")
            for f in outfiles:
                if os.path.isfile(f):
                    with open(f, 'r') as cf:
                        centers[i][2] = cf.readlines()[1].strip()  # the second line is the nucleotide sequence
                    break
            assert centers[i][2], "Medaka consensus sequence not found"
        elif args.racon:
            logging.debug("running racon on spoa reference {0} using {1} reads for polishing.".format(c_id, nr_reads_used))
            polishing_outfolder = os.path.join(args.outfolder, "racon_cl_id_{0}".format(c_id))
            help_functions.mkdir_p(polishing_outfolder)
            run_racon(all_reads_file, spoa_center_file, polishing_outfolder, "1", args.racon_iter)
            racon_ref_location = os.path.join(polishing_outfolder, "consensus.fasta")
            logging.debug(f"Saving racon reference to file: {racon_ref_location}")
            with open(racon_ref_location, 'r') as cf:
                l = cf.readlines()
            center_polished = l[1].strip()
            centers[i][2] = center_polished

    return centers


def form_draft_consensus(clusters, representatives, sorted_reads_fastq_file, work_dir, abundance_cutoff, args):
    centers = []
    singletons = 0
    discarded_clusters = []
    reads = {acc: (seq, qual) for acc, (seq, qual) in help_functions.readfq(open(sorted_reads_fastq_file, 'r'))}
    for c_id, all_read_acc in sorted(clusters.items(), key=lambda x: (len(x[1]), representatives[x[0]][5]), reverse=True):
        nr_reads_in_cluster = len(all_read_acc)
        if nr_reads_in_cluster >= abundance_cutoff:
            reads_path_name = os.path.join(work_dir, "reads_c_id_{0}.fq".format(c_id))
            with open(reads_path_name, "w") as reads_file:
                for i, acc in enumerate(all_read_acc):
                    if args.max_seqs_for_consensus >= 0 and i >= args.max_seqs_for_consensus:
                        break
                    seq, qual = reads[acc]
                    reads_file.write("@{0}\n{1}\n{2}\n{3}\n".format(acc, seq, "+", qual))
            tmp_param = args.max_seqs_for_consensus if args.max_seqs_for_consensus > 0 else 2**32
            logging.debug("creating center of {0} sequences.".format(min(nr_reads_in_cluster, tmp_param)))
            center = run_spoa(reads_path_name, os.path.join(work_dir, "spoa_tmp.fa"), "spoa")
            centers.append([nr_reads_in_cluster, c_id, center, reads_path_name])
        elif nr_reads_in_cluster == 1:
            singletons += 1
        elif nr_reads_in_cluster > 1:
            discarded_clusters.append(nr_reads_in_cluster)
    logging.debug(f"{singletons} singletons were discarded")
    logging.debug(
        f"{len(discarded_clusters)} clusters were discarded due to not passing the abundance_cutoff: "
        f"a total of {sum(discarded_clusters)} reads were discarded. "
        f"Highest abundance among them: {max(discarded_clusters or [0])} reads."
    )
    return centers

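Each entry of the returned centers list has the shape [nr_reads_in_cluster, cluster_id, consensus_sequence, reads_fastq_path], for example (values invented):

    [42, 0, "ACGT...", "work_dir/reads_c_id_0.fq"]

detect_reverse_complements later widens the last field into a list of read files when clusters are merged, and polish_sequences overwrites the third field with the polished consensus.
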
modules/get_sorted_fastq_for_cluster.py
ADDED
@@ -0,0 +1,218 @@
from __future__ import print_function
import os, sys
import argparse

import signal
from multiprocessing import Pool
import multiprocessing as mp

import operator
import functools
from time import time
from collections import deque
import itertools
import math
import logging

from modules import help_functions

# Phred+33 error probability per quality character, capped at 0.79433 (the error
# probability of Q1); D_no_min is the same table without the cap.
D = {chr(i): min(10 ** (-(ord(chr(i)) - 33) / 10.0), 0.79433) for i in range(128)}
D_no_min = {chr(i): 10 ** (-(ord(chr(i)) - 33) / 10.0) for i in range(128)}

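A quick sanity check of the table, following directly from the definition:

    # 'I' is ASCII 73 and encodes Q40 under Phred+33, so:
    # D['I'] == min(10 ** (-(73 - 33) / 10.0), 0.79433) == 1e-4
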
def expected_number_of_erroneous_kmers(quality_string, k):
    prob_error = [D[char_] for char_ in quality_string]
    window = deque([(1.0 - p_e) for p_e in prob_error[:k]])
    current_prob_no_error = functools.reduce(operator.mul, window, 1)
    sum_of_expectations = current_prob_no_error  # initialization
    for p_e in prob_error[k:]:
        # slide the window one position: divide out the leaving base, multiply in the entering one
        p_to_leave = window.popleft()
        current_prob_no_error *= (1.0 - p_e) / p_to_leave
        sum_of_expectations += current_prob_no_error
        window.append(1.0 - p_e)
    return len(quality_string) - k + 1 - sum_of_expectations

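In other words, for a read of length L with per-base error probabilities p_j, the function returns

    (L - k + 1) - sum_{i=1..L-k+1} prod_{j=i..i+k-1} (1 - p_j)

i.e. the total number of k-mers minus the expected number of error-free k-mers; maintaining the inner product over a sliding window keeps the whole computation O(L).
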
def reverse_complement(string):
    rev_nuc = {'A':'T', 'C':'G', 'G':'C', 'T':'A', 'a':'t', 'c':'g', 'g':'c', 't':'a', 'N':'N', 'X':'X', 'n':'n', 'Y':'R', 'R':'Y', 'K':'M', 'M':'K', 'S':'S', 'W':'W', 'B':'V', 'V':'B', 'H':'D', 'D':'H', 'y':'r', 'r':'y', 'k':'m', 'm':'k', 's':'s', 'w':'w', 'b':'v', 'v':'b', 'h':'d', 'd':'h'}
    rev_comp = ''.join([rev_nuc[nucl] for nucl in reversed(string)])
    return rev_comp

def batch(iterable, n=1):
    l = len(iterable)
    for ndx in range(0, l, n):
        yield iterable[ndx:min(ndx + n, l)]


def calc_score_new(d):
    # d holds a single entry: {batch_index : (reads, k, q_threshold)}
    for key, value in d.items():
        l, k, q_threshold = value

        read_array = []
        error_rates = []
        for i, (acc, seq, qual) in enumerate(l):
            if i % 10000 == 0:
                logging.debug(f"{i} reads processed.")

            # skip very short reads or degenerate reads
            seq_hpol_comp = ''.join(ch for ch, _ in itertools.groupby(seq))
            if len(seq) < 2*k or len(seq_hpol_comp) < k:
                continue

            poisson_mean = sum([qual.count(char_) * D_no_min[char_] for char_ in set(qual)])
            error_rate = poisson_mean / float(len(qual))
            if 10 * -math.log(error_rate, 10) <= q_threshold:  # average quality below threshold
                continue

            error_rates.append(error_rate)
            exp_errors_in_kmers = expected_number_of_erroneous_kmers(qual, k)
            p_no_error_in_kmers = 1.0 - exp_errors_in_kmers / float(len(seq) - k + 1)
            score = p_no_error_in_kmers * (len(seq) - k + 1)
            read_array.append((acc, seq, qual, score))
        return {key: (read_array, error_rates)}

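A toy call illustrating the expected shapes (read data invented):

    calc_score_new({0: ([("read1", "ACGT" * 10, "I" * 40)], 15, 7.0)})
    # -> {0: ([("read1", "ACGTACGT...", "IIII...", score)], [0.0001])}
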
def fastq_parallel(args):
    k = args.k
    q_threshold = args.quality_threshold
    reads = [(acc, seq, qual) for acc, (seq, qual) in help_functions.readfq(open(args.fastq, 'r'))]
    read_chunk_size = int(len(reads) / args.nr_cores) + 1
    read_batches = [b for b in batch(reads, read_chunk_size)]
    del reads
    ####### parallelize read scoring #########
    original_sigint_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)
    signal.signal(signal.SIGINT, original_sigint_handler)
    mp.set_start_method('spawn')
    logging.debug(f"Environment set: {mp.get_context()}")
    logging.debug(f"Using {args.nr_cores} cores.")
    start_multi = time()
    pool = Pool(processes=int(args.nr_cores))
    try:
        batch_lengths = [len(b) for b in read_batches]
        logging.debug(f"{batch_lengths}")
        data = [{i: (b, k, q_threshold)} for i, b in enumerate(read_batches)]
        res = pool.map_async(calc_score_new, data)
        score_results = res.get(999999999)  # Without the timeout this blocking call ignores all signals.
    except KeyboardInterrupt:
        logging.warning("Caught KeyboardInterrupt, terminating workers")
        pool.terminate()
        sys.exit()
    else:
        pool.close()
        pool.join()

    logging.debug(f"Time elapsed multiprocessing: {time() - start_multi}")
    read_array, error_rates = [], []

    for output_dict in score_results:
        for batch_index, (r_a, err_rates) in output_dict.items():
            logging.debug(f"Batch index {batch_index}")
            for item in r_a:
                read_array.append(item)
            for item2 in err_rates:
                error_rates.append(item2)

    read_array.sort(key=lambda x: x[3], reverse=True)
    error_rates.sort()
    return read_array, error_rates

def fastq_single_core(args):
    k = args.k
    q_threshold = args.quality_threshold
    error_rates = []
    read_array = []
    for i, (acc, (seq, qual)) in enumerate(help_functions.readfq(open(args.fastq, 'r'))):
        if i % 10000 == 0:
            logging.debug(f"{i} reads processed.")

        # skip very short reads or degenerate reads
        seq_hpol_comp = ''.join(ch for ch, _ in itertools.groupby(seq))
        if len(seq) < 2*k or len(seq_hpol_comp) < k:
            continue

        exp_errors_in_kmers = expected_number_of_erroneous_kmers(qual, k)
        p_no_error_in_kmers = 1.0 - exp_errors_in_kmers / float(len(seq) - k + 1)
        score = p_no_error_in_kmers * (len(seq) - k + 1)

        ## For (inferred) average error rate only, based on quality values.
        ## These values are used in evaluations in the paper only, and are not used in clustering.
        poisson_mean = sum([qual.count(char_) * D_no_min[char_] for char_ in set(qual)])
        error_rate = poisson_mean / float(len(qual))
        if 10 * -math.log(error_rate, 10) <= q_threshold:
            continue
        error_rates.append(error_rate)

        read_array.append((acc, seq, qual, score))

    read_array.sort(key=lambda x: x[3], reverse=True)
    return read_array, error_rates

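Note that score = p_no_error_in_kmers * (len(seq) - k + 1) is simply the expected number of error-free k-mers in the read, i.e. (L - k + 1) - expected_number_of_erroneous_kmers(qual, k); reads are sorted by this score in decreasing order before being written out.
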
def main(args):
    start = time()
    if os.path.isfile(args.outfile) and args.use_old_sorted_file:
        logging.warning("Using already existing sorted file in specified directory. If not intended, specify a different outfolder or delete the current file.")
        return args.outfile

    elif args.fastq:
        if args.nr_cores > 1:
            read_array, error_rates = fastq_parallel(args)
        else:
            read_array, error_rates = fastq_single_core(args)

    reads_sorted_outfile = open(args.outfile, "w")
    for i, (acc, seq, qual, score) in enumerate(read_array):
        reads_sorted_outfile.write("@{0}\n{1}\n+\n{2}\n".format(acc + "_{0}".format(score), seq, qual))
    reads_sorted_outfile.close()
    logging.debug(f"{len(read_array)} reads passed the quality criteria (avg phred Q val over {args.quality_threshold} and length > 2*k) and will be clustered.")
    error_rates.sort()
    min_e = error_rates[0]
    max_e = error_rates[-1]
    median_e = error_rates[int(len(error_rates) / 2)]
    mean_e = sum(error_rates) / len(error_rates)
    with open(os.path.join(args.outfolder, "logfile.txt"), 'w') as logfile:
        logfile.write("Lowest read error rate:{0}\n".format(min_e))
        logfile.write("Highest read error rate:{0}\n".format(max_e))
        logfile.write("Median read error rate:{0}\n".format(median_e))
        logfile.write("Mean read error rate:{0}\n".format(mean_e))
        logfile.write("\n")
    logging.debug("Sorted all reads in {0} seconds.".format(time() - start))
    return reads_sorted_outfile.name


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Sort fastq reads by an expected-accuracy score for clustering.")
    reads_file = parser.add_mutually_exclusive_group(required=True)
    reads_file.add_argument('--fastq', type=str, help='Path to input fastq file(s)')
    reads_file.add_argument('--use_old_sorted_file', action='store_true', help='Use an already existing sorted file if present in the specified output directory.')
    parser.add_argument('--outfile', type=str, default=None, help='Path to the sorted output fastq file.')
    parser.add_argument('--k', type=int, default=15, help='kmer size')
    # main() also reads the following three parameters; the defaults here are illustrative.
    parser.add_argument('--outfolder', type=str, default=None, help='Folder for the logfile written by main().')
    parser.add_argument('--quality_threshold', type=float, default=7.0, help='Minimum average phred quality of a read')
    parser.add_argument('--nr_cores', type=int, default=1, help='Number of cores to use')
    parser.add_argument('--debug', action='store_true', help='Enable debug logging')

    args = parser.parse_args()

    loglevel = logging.DEBUG if args.debug else logging.INFO

    logging.basicConfig(
        level=loglevel,
        format='%(message)s'
    )

    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit()
    path_, file_prefix = os.path.split(args.outfile)
    help_functions.mkdir_p(path_)

    main(args)
modules/help_functions.py
ADDED
@@ -0,0 +1,104 @@
import os
import errno
import re
import logging
import sys


'''
Below code taken from https://github.com/lh3/readfq/blob/master/readfq.py
'''

def readfq(fp): # this is a generator function
    last = None # this is a buffer keeping the last unprocessed line
    while True: # mimic closure; is it a bad idea?
        if not last: # the first record or a record following a fastq
            for l in fp: # search for the start of the next record
                if l[0] in '>@': # fasta/q header line
                    last = l[:-1] # save this line
                    break
        if not last: break
        name, seqs, last = last[1:], [], None
        for l in fp: # read the sequence
            if l[0] in '@+>':
                last = l[:-1]
                break
            seqs.append(l[:-1])
        if not last or last[0] != '+': # this is a fasta record
            yield name, (''.join(seqs), None) # yield a fasta record
            if not last: break
        else: # this is a fastq record
            seq, leng, seqs = ''.join(seqs), 0, []
            for l in fp: # read the quality
                seqs.append(l[:-1])
                leng += len(l) - 1
                if leng >= len(seq): # have read enough quality
                    last = None
                    yield name, (seq, ''.join(seqs)) # yield a fastq record
                    break
            if last: # reach EOF before reading enough quality
                yield name, (seq, None) # yield a fasta record instead
                break

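Elsewhere in the package the generator is consumed like this (file name illustrative), with qual being None for fasta input:

    reads = {acc: (seq, qual) for acc, (seq, qual) in readfq(open("reads.fastq", 'r'))}
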
def mkdir_p(path):
    try:
        os.makedirs(path)
        logging.debug(f"creating {path}")
    except OSError as exc:  # Python >2.5; os.makedirs(path, exist_ok=True) is the modern equivalent
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise


def cigar_to_seq(cigar, query, ref):
    # Near-duplicate of modules.consensus.cigar_to_seq, but returns only the aligned strings.
    cigar_tuples = []
    result = re.split(r'[=DXSMI]+', cigar)
    i = 0
    for length in result[:-1]:
        i += len(length)
        type_ = cigar[i]
        i += 1
        cigar_tuples.append((int(length), type_))

    r_index = 0
    q_index = 0
    q_aln = []
    r_aln = []
    for length_, type_ in cigar_tuples:
        if type_ == "=" or type_ == "X":
            q_aln.append(query[q_index : q_index + length_])
            r_aln.append(ref[r_index : r_index + length_])

            r_index += length_
            q_index += length_

        elif type_ == "I":
            # insertion w.r.t. reference
            r_aln.append('-' * length_)
            q_aln.append(query[q_index : q_index + length_])
            # only the query index changes
            q_index += length_

        elif type_ == 'D':
            # deletion w.r.t. reference
            r_aln.append(ref[r_index : r_index + length_])
            q_aln.append('-' * length_)
            # only the ref index changes
            r_index += length_

        else:
            logging.error("Error processing cigar")
            logging.error(cigar)
            sys.exit()

    return "".join(q_aln), "".join(r_aln)