NGSpeciesID 0.3.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
modules/consensus.py ADDED
@@ -0,0 +1,278 @@
+ from __future__ import print_function
+ import subprocess
+ import sys, os
+ from sys import stdout
+ import re
+ import shutil
+ import parasail
+ import glob
+ import logging
+
+ from modules import help_functions
+
+ def cigar_to_seq(cigar, query, ref):
+     cigar_tuples = []
+     result = re.split(r'[=DXSMI]+', cigar)
+     cig_pos = 0
+     for length in result[:-1]:
+         cig_pos += len(length)
+         type_ = cigar[cig_pos]
+         cig_pos += 1
+         cigar_tuples.append((int(length), type_))
+
+     r_index = 0
+     q_index = 0
+     q_aln = []
+     r_aln = []
+     for length_, type_ in cigar_tuples:
+         if type_ == "=" or type_ == "X":
+             q_aln.append(query[q_index : q_index + length_])
+             r_aln.append(ref[r_index : r_index + length_])
+             r_index += length_
+             q_index += length_
+
+         elif type_ == "I":
+             # insertion w.r.t. reference: gap in ref, only query index advances
+             r_aln.append('-' * length_)
+             q_aln.append(query[q_index : q_index + length_])
+             q_index += length_
+
+         elif type_ == 'D':
+             # deletion w.r.t. reference: gap in query, only ref index advances
+             r_aln.append(ref[r_index : r_index + length_])
+             q_aln.append('-' * length_)
+             r_index += length_
+
+         else:
+             logging.error("Error processing cigar")
+             logging.error(cigar)
+             sys.exit()
+
+     return "".join(q_aln), "".join(r_aln), cigar_tuples
+
+
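Note: the following is a hypothetical usage sketch, not part of the package; the toy CIGAR string and sequences are invented to illustrate what cigar_to_seq returns.

# "3=1I2=" means 3 matches, 1 insertion in the query, then 2 more matches.
q_aln, r_aln, tuples = cigar_to_seq("3=1I2=", query="ACGTAC", ref="ACGAC")
# q_aln == "ACGTAC", r_aln == "ACG-AC", tuples == [(3, '='), (1, 'I'), (2, '=')]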
+ def parasail_alignment(s1, s2, match_score=2, mismatch_penalty=-2, opening_penalty=3, gap_ext=1):
+     user_matrix = parasail.matrix_create("ACGT", match_score, mismatch_penalty)
+     result = parasail.sg_trace_scan_16(s1, s2, opening_penalty, gap_ext, user_matrix)
+     if result.saturated:
+         logging.warning(f"SATURATED! {len(s1)} {len(s2)}")
+         result = parasail.sg_trace_scan_32(s1, s2, opening_penalty, gap_ext, user_matrix)
+         logging.warning("computed 32 bit instead")
+
+     # difference in how to obtain the string from parasail between python v2 and v3
+     if sys.version_info[0] < 3:
+         cigar_string = str(result.cigar.decode).decode('utf-8')
+     else:
+         cigar_string = str(result.cigar.decode, 'utf-8')
+     s1_alignment, s2_alignment, cigar_tuples = cigar_to_seq(cigar_string, s1, s2)
+
+     return s1_alignment, s2_alignment, cigar_string, cigar_tuples, result.score
+
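A hypothetical call, assuming the parasail package is installed: semi-global alignment of two short sequences with the default scoring parameters.

a, b, cig, tuples, score = parasail_alignment("ACGTACGT", "ACGTCGT")
# 'a' and 'b' are the gapped alignment rows; 'cig' is the extended CIGAR string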
+ def reverse_complement(string):
+     # Extended with lowercase and IUPAC ambiguity codes for Abyss output
+     rev_nuc = {'A':'T', 'C':'G', 'G':'C', 'T':'A', 'a':'t', 'c':'g', 'g':'c', 't':'a', 'N':'N', 'X':'X', 'n':'n', 'Y':'R', 'R':'Y', 'K':'M', 'M':'K', 'S':'S', 'W':'W', 'B':'V', 'V':'B', 'H':'D', 'D':'H', 'y':'r', 'r':'y', 'k':'m', 'm':'k', 's':'s', 'w':'w', 'b':'v', 'v':'b', 'h':'d', 'd':'h'}
+
+     rev_comp = ''.join([rev_nuc[nucl] for nucl in reversed(string)])
+     return rev_comp
+
+ def run_spoa(reads, spoa_out_file, spoa_path):
+     with open(spoa_out_file, "w") as output_file:
+         stdout.flush()
+         with open(os.devnull, "w") as null:
+             subprocess.check_call([spoa_path, reads, "-l", "0", "-r", "0", "-g", "-2"], stdout=output_file, stderr=null)
+         stdout.flush()
+     with open(spoa_out_file, "r") as sof:
+         l = sof.readlines()
+     consensus = l[1].strip()  # the second line of the output holds the consensus sequence
+     return consensus
+
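A hypothetical invocation, assuming a spoa binary is on PATH and reads_c_id_0.fq exists; it writes the partial-order-alignment output to spoa_tmp.fa and returns the consensus string.

consensus = run_spoa("reads_c_id_0.fq", "spoa_tmp.fa", "spoa")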
+ def run_medaka(reads_to_center, center_file, outfolder, cores, medaka_model, outfastq=False):
+     medaka_stdout = os.path.join(outfolder, "stdout.txt")
+     with open(medaka_stdout, "w") as output_file:
+         stdout.flush()
+         with open(os.path.join(outfolder, "stderr.txt"), "w") as medaka_stderr:
+             cmd_args = ['medaka_consensus', '-i', reads_to_center, "-d", center_file, "-o", outfolder, "-t", cores]
+             if medaka_model:
+                 cmd_args += ["-m", medaka_model]
+             if outfastq:
+                 cmd_args += ["-q"]
+             subprocess.check_call(cmd_args, stdout=output_file, stderr=medaka_stderr)
+         stdout.flush()
+
+ def run_racon(reads_to_center, center_file, outfolder, cores, racon_iter):
+     # note: 'cores' is accepted for API symmetry but is not passed to minimap2/racon here
+     racon_stdout = os.path.join(outfolder, "stdout.txt")
+     with open(racon_stdout, "w") as output_file:
+         stdout.flush()
+         for i in range(racon_iter):
+             with open(os.path.join(outfolder, "read_alignments_it_{0}.paf".format(i)), 'w') as read_alignments, \
+                  open(os.path.join(outfolder, "mm2_stderr_it_{0}.txt".format(i)), "w") as mm2_stderr, \
+                  open(os.path.join(outfolder, "racon_stderr_it_{0}.txt".format(i)), "w") as racon_stderr, \
+                  open(os.path.join(outfolder, "racon_polished_it_{0}.fasta".format(i)), 'w') as racon_polished:
+                 subprocess.check_call(['minimap2', '-x', 'map-ont', center_file, reads_to_center], stdout=read_alignments, stderr=mm2_stderr)
+                 subprocess.check_call(['racon', reads_to_center, read_alignments.name, center_file], stdout=racon_polished, stderr=racon_stderr)
+             # each iteration polishes the output of the previous one
+             center_file = racon_polished.name
+
+         shutil.copyfile(center_file, os.path.join(outfolder, "consensus.fasta"))
+         stdout.flush()
+
+
+ def highest_aln_identity(seq, seq2):
+     # Reverse complement orientation
+     seq2_rc = reverse_complement(seq2)
+     seq_aln_rc, seq2_aln_rc, cigar_string_rc, cigar_tuples_rc, alignment_score_rc = parasail_alignment(seq, seq2_rc)
+     nr_mismatching_pos = len([1 for n1, n2 in zip(seq_aln_rc, seq2_aln_rc) if n1 != n2])
+     total_pos_rc = len(seq_aln_rc)
+     aln_identity_rc = (total_pos_rc - nr_mismatching_pos) / float(total_pos_rc)
+     logging.debug(f"Reverse complement orientation identity: {aln_identity_rc}")
+
+     # Forward orientation
+     seq_aln, seq2_aln, cigar_string, cigar_tuples, alignment_score = parasail_alignment(seq, seq2)
+     nr_mismatching_pos = len([1 for n1, n2 in zip(seq_aln, seq2_aln) if n1 != n2])
+     total_pos = len(seq_aln)
+     aln_identity_fw = (total_pos - nr_mismatching_pos) / float(total_pos)
+     logging.debug(f"Forward orientation identity: {aln_identity_fw}")
+     aln_identity = max(aln_identity_fw, aln_identity_rc)
+     return aln_identity
+
+
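A hypothetical sanity check: a sequence aligned against its own reverse complement should reach identity 1.0 in the reverse-complement orientation, so the maximum over both orientations is 1.0.

identity = highest_aln_identity("ACGTTGCA", reverse_complement("ACGTTGCA"))
# identity == 1.0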
+ def detect_reverse_complements(centers, rc_identity_threshold):
+     filtered_centers = []
+     already_removed = set()
+     for i, (nr_reads_in_cl, c_id, seq, reads_path) in enumerate(centers):
+         if type(reads_path) != list:
+             all_reads = [reads_path]
+         else:
+             all_reads = reads_path
+
+         merged_cluster_id = c_id
+         merged_nr_reads = nr_reads_in_cl
+         if c_id in already_removed:
+             logging.debug("Cluster has already been merged, skipping")
+             continue
+
+         elif i == len(centers) - 1:  # last sequence and it is not in already_removed
+             filtered_centers.append([merged_nr_reads, c_id, seq, all_reads])
+
+         else:
+             for j, (nr_reads_in_cl2, c_id2, seq2, reads_path2) in enumerate(centers[i+1:]):
+                 aln_identity = highest_aln_identity(seq, seq2)
+                 if aln_identity >= rc_identity_threshold:
+                     logging.debug("Detected two consensus sequences with alignment identity above the threshold (from either reverse complements or split clusters). Keeping the center with the most read support and merging reads.")
+                     merged_nr_reads += nr_reads_in_cl2
+                     already_removed.add(c_id2)
+
+                     if type(reads_path2) != list:
+                         all_reads.append(reads_path2)
+                     else:
+                         for rp in reads_path2:
+                             all_reads.append(rp)
+
+             filtered_centers.append([merged_nr_reads, c_id, seq, all_reads])
+
+     logging.debug(f"{len(filtered_centers)} consensus sequences formed.")
+     return filtered_centers
+
+
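A hypothetical toy input, assuming centers are sorted by read support: two centers that are reverse complements of each other collapse into one, with read counts and read files merged.

centers = [[10, 0, "ACGTACGTAC", "reads_c_id_0.fq"],
           [4, 1, reverse_complement("ACGTACGTAC"), "reads_c_id_1.fq"]]
merged = detect_reverse_complements(centers, rc_identity_threshold=0.9)
# merged == [[14, 0, "ACGTACGTAC", ["reads_c_id_0.fq", "reads_c_id_1.fq"]]]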
+ def polish_sequences(centers, args):
+     spoa_ref_location = os.path.join(args.outfolder, "consensus_reference_X.fasta")
+     logging.debug(f"Saving spoa references to files: {spoa_ref_location}")
+     # remove stale polishing output and spoa references, then group reads per cluster
+     polishing_pattern = None
+     if args.medaka:
+         polishing_pattern = os.path.join(args.outfolder, "medaka_cl_id_*")
+     elif args.racon:
+         polishing_pattern = os.path.join(args.outfolder, "racon_cl_id_*")
+
+     if polishing_pattern:
+         for folder in glob.glob(polishing_pattern):
+             shutil.rmtree(folder)
+
+     spoa_pattern = os.path.join(args.outfolder, "consensus_reference_*")
+     for file in glob.glob(spoa_pattern):
+         os.remove(file)
+
+     for i, (nr_reads_in_cluster, c_id, center, all_reads) in enumerate(centers):
+         spoa_center_file = os.path.join(args.outfolder, "consensus_reference_{0}.fasta".format(c_id))
+         with open(spoa_center_file, "w") as f:
+             f.write(">{0}\n{1}\n".format("consensus_cl_id_{0}_total_supporting_reads_{1}".format(c_id, nr_reads_in_cluster), center))
+
+         nr_reads_used = 0
+         all_reads_file = os.path.join(args.outfolder, "reads_to_consensus_{0}.fastq".format(c_id))
+         with open(all_reads_file, "w") as f:
+             for fasta_file in all_reads:
+                 reads = {acc: (seq, qual) for acc, (seq, qual) in help_functions.readfq(open(fasta_file, 'r'))}
+                 for acc, (seq, qual) in reads.items():
+                     acc_tmp = acc.split()[0]
+                     f.write("@{0}\n{1}\n{2}\n{3}\n".format(acc_tmp, seq, "+", qual))
+                     nr_reads_used += 1
+
+         if args.medaka:
+             logging.debug("running medaka on spoa reference {0} using {1} reads for polishing.".format(c_id, nr_reads_used))
+             polishing_outfolder = os.path.join(args.outfolder, "medaka_cl_id_{0}".format(c_id))
+             outfiles = [  # consider all output formats for compatibility with all Medaka versions
+                 os.path.join(polishing_outfolder, "consensus.fasta"),
+                 os.path.join(polishing_outfolder, "consensus.fastq")
+             ]
+             help_functions.mkdir_p(polishing_outfolder)
+             run_medaka(all_reads_file, spoa_center_file, polishing_outfolder, "1", args.medaka_model, outfastq=args.medaka_fastq)
+             logging.debug(f"Saving medaka reference to file: {os.path.join(polishing_outfolder, 'consensus.fasta')} (or .fastq)")
+             for f in outfiles:
+                 if os.path.isfile(f):
+                     with open(f, 'r') as cf:
+                         centers[i][2] = cf.readlines()[1].strip()  # the second line is the nucleotide sequence
+                     break
+             assert centers[i][2], "Medaka consensus sequence not found"
+         elif args.racon:
+             logging.debug("running racon on spoa reference {0} using {1} reads for polishing.".format(c_id, nr_reads_used))
+             polishing_outfolder = os.path.join(args.outfolder, "racon_cl_id_{0}".format(c_id))
+             help_functions.mkdir_p(polishing_outfolder)
+             run_racon(all_reads_file, spoa_center_file, polishing_outfolder, "1", args.racon_iter)
+             racon_ref_location = os.path.join(polishing_outfolder, "consensus.fasta")
+             logging.debug(f"Saving racon reference to file: {racon_ref_location}")
+             with open(racon_ref_location, 'r') as cf:
+                 l = cf.readlines()
+             center_polished = l[1].strip()
+             centers[i][2] = center_polished
+
+     return centers
+
+
+ def form_draft_consensus(clusters, representatives, sorted_reads_fastq_file, work_dir, abundance_cutoff, args):
+     centers = []
+     singletons = 0
+     discarded_clusters = []
+     reads = {acc: (seq, qual) for acc, (seq, qual) in help_functions.readfq(open(sorted_reads_fastq_file, 'r'))}
+     for c_id, all_read_acc in sorted(clusters.items(), key=lambda x: (len(x[1]), representatives[x[0]][5]), reverse=True):
+         nr_reads_in_cluster = len(all_read_acc)
+         if nr_reads_in_cluster >= abundance_cutoff:
+             reads_path_name = os.path.join(work_dir, "reads_c_id_{0}.fq".format(c_id))
+             with open(reads_path_name, "w") as reads_file:
+                 for i, acc in enumerate(all_read_acc):
+                     if args.max_seqs_for_consensus >= 0 and i >= args.max_seqs_for_consensus:
+                         break
+                     seq, qual = reads[acc]
+                     reads_file.write("@{0}\n{1}\n{2}\n{3}\n".format(acc, seq, "+", qual))
+             tmp_param = args.max_seqs_for_consensus if args.max_seqs_for_consensus > 0 else 2**32
+             logging.debug("creating center of {0} sequences.".format(min(nr_reads_in_cluster, tmp_param)))
+             center = run_spoa(reads_path_name, os.path.join(work_dir, "spoa_tmp.fa"), "spoa")
+             centers.append([nr_reads_in_cluster, c_id, center, reads_path_name])
+         elif nr_reads_in_cluster == 1:
+             singletons += 1
+         elif nr_reads_in_cluster > 1:
+             discarded_clusters.append(nr_reads_in_cluster)
+     logging.debug(f"{singletons} singletons were discarded")
+     logging.debug(
+         f"{len(discarded_clusters)} clusters were discarded due to not passing the abundance_cutoff: "
+         f"a total of {sum(discarded_clusters)} reads were discarded. "
+         f"Highest abundance among them: {max(discarded_clusters or [0])} reads."
+     )
+     return centers
@@ -0,0 +1,218 @@
+ from __future__ import print_function
+ import os, sys
+ import argparse
+
+ import signal
+ from multiprocessing import Pool
+ import multiprocessing as mp
+
+ import operator
+ import functools
+ from time import time
+ from collections import deque
+ import itertools
+ import math
+ import logging
+
+ from modules import help_functions
+
+ # Phred quality char -> error probability, P = 10**(-(ascii - 33)/10);
+ # D caps the probability at 0.79433 (i.e. Q1), D_no_min does not.
+ D = {chr(i): min(10**(-(ord(chr(i)) - 33) / 10.0), 0.79433) for i in range(128)}
+ D_no_min = {chr(i): 10**(-(ord(chr(i)) - 33) / 10.0) for i in range(128)}
+
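A minimal sanity check of the Phred conversion (not part of the package): the character '5' has ASCII 53, so Q = 20 and the error probability is 10**(-20/10) = 0.01.

assert abs(D_no_min['5'] - 0.01) < 1e-12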
+ def expected_number_of_erroneous_kmers(quality_string, k):
+     # maintain a sliding-window product of per-base no-error probabilities;
+     # by linearity of expectation, E[#erroneous k-mers] = #k-mers - sum of P(k-mer error free)
+     prob_error = [D[char_] for char_ in quality_string]
+     window = deque([(1.0 - p_e) for p_e in prob_error[:k]])
+     current_prob_no_error = functools.reduce(operator.mul, window, 1)
+     sum_of_expectations = current_prob_no_error  # initialization
+     for p_e in prob_error[k:]:
+         p_to_leave = window.popleft()
+         current_prob_no_error *= (1.0 - p_e) / p_to_leave
+         sum_of_expectations += current_prob_no_error
+         window.append(1.0 - p_e)
+     return len(quality_string) - k + 1 - sum_of_expectations
+
+
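A hypothetical worked example: with k = 3 and five bases all at Q20 (char '5', p_error = 0.01), each of the 3 k-mers is error free with probability 0.99**3, so the expectation is 3 - 3 * 0.99**3 ≈ 0.0891.

exp_err = expected_number_of_erroneous_kmers("55555", 3)
assert abs(exp_err - (3 - 3 * 0.99**3)) < 1e-9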
+ def reverse_complement(string):
+     rev_nuc = {'A':'T', 'C':'G', 'G':'C', 'T':'A', 'a':'t', 'c':'g', 'g':'c', 't':'a', 'N':'N', 'X':'X', 'n':'n', 'Y':'R', 'R':'Y', 'K':'M', 'M':'K', 'S':'S', 'W':'W', 'B':'V', 'V':'B', 'H':'D', 'D':'H', 'y':'r', 'r':'y', 'k':'m', 'm':'k', 's':'s', 'w':'w', 'b':'v', 'v':'b', 'h':'d', 'd':'h'}
+     rev_comp = ''.join([rev_nuc[nucl] for nucl in reversed(string)])
+     return rev_comp
+
+ def batch(iterable, n=1):
+     l = len(iterable)
+     for ndx in range(0, l, n):
+         yield iterable[ndx:min(ndx + n, l)]
+
+
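A hypothetical usage example: split a list into chunks of at most 3 items.

chunks = list(batch([1, 2, 3, 4, 5, 6, 7], n=3))
# chunks == [[1, 2, 3], [4, 5, 6], [7]]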
+ def calc_score_new(d):
+     for key, value in d.items():
+         l, k, q_threshold = value
+
+         read_array = []
+         error_rates = []
+         for i, (acc, seq, qual) in enumerate(l):
+             if i % 10000 == 0:
+                 logging.debug(f"{i} reads processed.")
+
+             # skip very short reads or degenerate reads
+             seq_hpol_comp = ''.join(ch for ch, _ in itertools.groupby(seq))
+             if len(seq) < 2 * k or len(seq_hpol_comp) < k:
+                 continue
+
+             poisson_mean = sum([qual.count(char_) * D_no_min[char_] for char_ in set(qual)])
+             error_rate = poisson_mean / float(len(qual))
+             if 10 * -math.log(error_rate, 10) <= q_threshold:
+                 continue
+
+             error_rates.append(error_rate)
+             exp_errors_in_kmers = expected_number_of_erroneous_kmers(qual, k)
+             p_no_error_in_kmers = 1.0 - exp_errors_in_kmers / float(len(seq) - k + 1)
+             score = p_no_error_in_kmers * (len(seq) - k + 1)
+             read_array.append((acc, seq, qual, score))
+         # each work item holds exactly one batch, so return after the first key
+         return {key: (read_array, error_rates)}
+
+
+ def fastq_parallel(args):
+     k = args.k
+     q_threshold = args.quality_threshold
+     error_rates = []
+     reads = [(acc, seq, qual) for acc, (seq, qual) in help_functions.readfq(open(args.fastq, 'r'))]
+     start = time()
+     read_chunk_size = int(len(reads) / args.nr_cores) + 1
+     read_batches = [b for b in batch(reads, read_chunk_size)]
+     del reads
+     ####### parallelize score computation #########
+     # ignore SIGINT while the worker pool is created, then restore the handler
+     # so Ctrl-C is caught by the try/except below instead of by the workers
+     original_sigint_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)
+     mp.set_start_method('spawn')
+     logging.debug(f"Environment set: {mp.get_context()}")
+     logging.debug(f"Using {args.nr_cores} cores.")
+     start_multi = time()
+     pool = Pool(processes=int(args.nr_cores))
+     signal.signal(signal.SIGINT, original_sigint_handler)
+     try:
+         batch_lengths = [len(b) for b in read_batches]
+         logging.debug(f"{batch_lengths}")
+         data = [{i: (b, k, q_threshold)} for i, b in enumerate(read_batches)]
+         res = pool.map_async(calc_score_new, data)
+         score_results = res.get(999999999)  # without the timeout this blocking call ignores all signals
+     except KeyboardInterrupt:
+         logging.warning("Caught KeyboardInterrupt, terminating workers")
+         pool.terminate()
+         sys.exit()
+     else:
+         pool.close()
+         pool.join()
+
+     logging.debug(f"Time elapsed multiprocessing: {time() - start_multi}")
+     read_array, error_rates = [], []
+
+     for output_dict in score_results:
+         for batch_index, v in output_dict.items():
+             r_a, err_rates = v
+             logging.debug(f"Batch index {batch_index}")
+             for item in r_a:
+                 read_array.append(item)
+             for item2 in err_rates:
+                 error_rates.append(item2)
+
+     read_array.sort(key=lambda x: x[3], reverse=True)
+     error_rates.sort()
+     return read_array, error_rates
+
+
+ def fastq_single_core(args):
+     k = args.k
+     q_threshold = args.quality_threshold
+     error_rates = []
+     read_array = []
+     for i, (acc, (seq, qual)) in enumerate(help_functions.readfq(open(args.fastq, 'r'))):
+         if i % 10000 == 0:
+             logging.debug(f"{i} reads processed.")
+
+         # skip very short reads or degenerate reads
+         seq_hpol_comp = ''.join(ch for ch, _ in itertools.groupby(seq))
+         if len(seq) < 2 * k or len(seq_hpol_comp) < k:
+             continue
+
+         ## (Inferred) average error rate, based on quality values only.
+         ## These values are used in evaluations in the paper and not in clustering.
+         poisson_mean = sum([qual.count(char_) * D_no_min[char_] for char_ in set(qual)])
+         error_rate = poisson_mean / float(len(qual))
+         if 10 * -math.log(error_rate, 10) <= q_threshold:
+             continue
+         error_rates.append(error_rate)
+
+         exp_errors_in_kmers = expected_number_of_erroneous_kmers(qual, k)
+         p_no_error_in_kmers = 1.0 - exp_errors_in_kmers / float(len(seq) - k + 1)
+         score = p_no_error_in_kmers * (len(seq) - k + 1)
+         read_array.append((acc, seq, qual, score))
+
+     read_array.sort(key=lambda x: x[3], reverse=True)
+     return read_array, error_rates
+
+
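A hypothetical illustration of the read score, which is the expected number of error-free k-mers in a read given its quality string: for a 5 bp read at uniform Q20 with k = 3, there are 3 k-mers, each error free with probability 0.99**3, so the score is about 2.91.

k = 3
qual = "55555"
score = (1.0 - expected_number_of_erroneous_kmers(qual, k) / 3.0) * 3
assert abs(score - 3 * 0.99**3) < 1e-9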
+ def main(args):
+     start = time()
+     logfile = open(os.path.join(args.outfolder, "logfile.txt"), 'w')
+     if os.path.isfile(args.outfile) and args.use_old_sorted_file:
+         logging.warning("Using already existing sorted file in the specified directory. If not intended, specify a different outfolder or delete the current file.")
+         return args.outfile
+
+     elif args.fastq:
+         if args.nr_cores > 1:
+             read_array, error_rates = fastq_parallel(args)
+         else:
+             read_array, error_rates = fastq_single_core(args)
+
+         reads_sorted_outfile = open(args.outfile, "w")
+         for i, (acc, seq, qual, score) in enumerate(read_array):
+             reads_sorted_outfile.write("@{0}\n{1}\n+\n{2}\n".format(acc + "_{0}".format(score), seq, qual))
+         reads_sorted_outfile.close()
+         logging.debug(f"{len(read_array)} reads passed quality criteria (avg phred Q value over {args.quality_threshold} and length > 2*k) and will be clustered.")
+         error_rates.sort()
+         min_e = error_rates[0]
+         max_e = error_rates[-1]
+         median_e = error_rates[int(len(error_rates) / 2)]
+         mean_e = sum(error_rates) / len(error_rates)
+         logfile.write("Lowest read error rate:{0}\n".format(min_e))
+         logfile.write("Highest read error rate:{0}\n".format(max_e))
+         logfile.write("Median read error rate:{0}\n".format(median_e))
+         logfile.write("Mean read error rate:{0}\n".format(mean_e))
+         logfile.write("\n")
+         logfile.close()
+         logging.debug("Sorted all reads in {0} seconds.".format(time() - start))
+         return reads_sorted_outfile.name
+
+
+ if __name__ == '__main__':
+     parser = argparse.ArgumentParser(description="Evaluate pacbio IsoSeq transcripts.")
+     reads_file = parser.add_mutually_exclusive_group(required=True)
+     reads_file.add_argument('--fastq', type=str, help='Path to consensus fastq file(s)')
+     reads_file.add_argument('--use_old_sorted_file', action='store_true', help='Use an already existing sorted file if present in the specified output directory.')
+     parser.add_argument('--outfile', type=str, default=None, help='Output fastq file with reads sorted by score.')
+     parser.add_argument('--k', type=int, default=15, help='kmer size')
+     # the following arguments are referenced in main() but were missing from the
+     # parser; the defaults here are assumptions, not upstream values
+     parser.add_argument('--outfolder', type=str, default=".", help='Folder for logfile.txt')
+     parser.add_argument('--nr_cores', type=int, default=1, help='Number of cores to use')
+     parser.add_argument('--quality_threshold', type=float, default=7.0, help='Minimum average phred quality')
+     parser.add_argument('--debug', action='store_true', help='Enable debug logging')
+
+     args = parser.parse_args()
+
+     loglevel = logging.DEBUG if args.debug else logging.INFO
+
+     logging.basicConfig(
+         level=loglevel,
+         format='%(message)s'
+     )
+
+     if len(sys.argv) == 1:
+         parser.print_help()
+         sys.exit()
+     path_, file_prefix = os.path.split(args.outfile)
+     help_functions.mkdir_p(path_)
+
+     main(args)
@@ -0,0 +1,104 @@
+ import os
+ import errno
+ import re
+ import logging
+ import sys
+
+
+ '''
+ Below code taken from https://github.com/lh3/readfq/blob/master/readfq.py
+ '''
+
+ def readfq(fp):  # this is a generator function
+     last = None  # this is a buffer keeping the last unprocessed line
+     while True:  # mimic closure; is it a bad idea?
+         if not last:  # the first record or a record following a fastq
+             for l in fp:  # search for the start of the next record
+                 if l[0] in '>@':  # fasta/q header line
+                     last = l[:-1]  # save this line
+                     break
+         if not last: break
+         name, seqs, last = last[1:], [], None
+         for l in fp:  # read the sequence
+             if l[0] in '@+>':
+                 last = l[:-1]
+                 break
+             seqs.append(l[:-1])
+         if not last or last[0] != '+':  # this is a fasta record
+             yield name, (''.join(seqs), None)  # yield a fasta record
+             if not last: break
+         else:  # this is a fastq record
+             seq, leng, seqs = ''.join(seqs), 0, []
+             for l in fp:  # read the quality
+                 seqs.append(l[:-1])
+                 leng += len(l) - 1
+                 if leng >= len(seq):  # have read enough quality
+                     last = None
+                     yield name, (seq, ''.join(seqs))  # yield a fastq record
+                     break
+             if last:  # reach EOF before reading enough quality
+                 yield name, (seq, None)  # yield a fasta record instead
+                 break
+
+
+ def mkdir_p(path):
46
+ try:
47
+ os.makedirs(path)
48
+ logging.debug(f"creating {path}")
49
+ except OSError as exc: # Python >2.5
50
+ if exc.errno == errno.EEXIST and os.path.isdir(path):
51
+ pass
52
+ else:
53
+ raise
54
+
55
+
56
+ def cigar_to_seq(cigar, query, ref):
57
+ cigar_tuples = []
58
+ result = re.split(r'[=DXSMI]+', cigar)
59
+ i = 0
60
+ for length in result[:-1]:
61
+ i += len(length)
62
+ type_ = cigar[i]
63
+ i += 1
64
+ cigar_tuples.append((int(length), type_ ))
65
+
66
+ r_index = 0
67
+ q_index = 0
68
+ q_aln = []
69
+ r_aln = []
70
+ for length_ , type_ in cigar_tuples:
71
+ if type_ == "=" or type_ == "X":
72
+ q_aln.append(query[q_index : q_index + length_])
73
+ r_aln.append(ref[r_index : r_index + length_])
74
+
75
+ r_index += length_
76
+ q_index += length_
77
+
78
+ elif type_ == "I":
79
+ # insertion w.r.t. reference
80
+ r_aln.append('-' * length_)
81
+ q_aln.append(query[q_index: q_index + length_])
82
+ # only query index change
83
+ q_index += length_
84
+
85
+ elif type_ == 'D':
86
+ # deletion w.r.t. reference
87
+ r_aln.append(ref[r_index: r_index + length_])
88
+ q_aln.append('-' * length_)
89
+ # only ref index change
90
+ r_index += length_
91
+
92
+ else:
93
+ logging.error("Error processing cigar")
94
+ logging.error(cigar)
95
+ sys.exit()
96
+
97
+ return "".join([s for s in q_aln]), "".join([s for s in r_aln])
98
+
99
+
100
+
101
+
102
+
103
+
104
+