NGSpeciesID 0.3.1 (py2.py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- modules/__init__.py +0 -0
- modules/barcode_trimmer.py +104 -0
- modules/cluster.py +373 -0
- modules/consensus.py +278 -0
- modules/get_sorted_fastq_for_cluster.py +218 -0
- modules/help_functions.py +104 -0
- modules/p_minimizers_shared.py +3 -0
- modules/parallelize.py +218 -0
- ngspeciesid-0.3.1.data/scripts/NGSpeciesID +288 -0
- ngspeciesid-0.3.1.dist-info/METADATA +350 -0
- ngspeciesid-0.3.1.dist-info/RECORD +14 -0
- ngspeciesid-0.3.1.dist-info/WHEEL +6 -0
- ngspeciesid-0.3.1.dist-info/licenses/LICENSE.txt +674 -0
- ngspeciesid-0.3.1.dist-info/top_level.txt +1 -0
modules/parallelize.py
ADDED
@@ -0,0 +1,218 @@
import os
import math
import signal
from multiprocessing import Pool
import multiprocessing as mp
from time import time
import logging
import sys

from modules import help_functions
from modules import cluster


def reads_to_clusters_helper(arguments):
    for k, v in arguments.items():
        args, kwargs = v
    return cluster.reads_to_clusters(*args, **kwargs)


def merge_dicts(*dict_args):
    """
    Given any number of dicts, shallow copy and merge into a new dict,
    precedence goes to key value pairs in latter dicts.
    """
    result = {}
    for dictionary in dict_args:
        result.update(dictionary)
    return result


def batch_list(lst, nr_cores=1, batch_type="nr_reads", merge_consecutive=False):
    if merge_consecutive:
        batch_id = 2
        batch = []
        for info in lst:
            if info[1] <= batch_id:
                batch.append(info)
            else:  # first sequence in new batch
                yield batch
                batch_id += 2
                batch = []
                batch.append(info)
        yield batch

    else:
        if batch_type == "nr_reads":
            l = len(lst)
            chunk_size = int(l / nr_cores) + 1
            for ndx in range(0, l, chunk_size):
                yield lst[ndx:min(ndx + chunk_size, l)]

        elif batch_type == "total_nt":
            tot_length = sum([len(seq) for i, b_i, acc, seq, qual, score in lst])
            nt_chunk_size = int(tot_length / nr_cores) + 1

            batch = []
            curr_size = 0
            for info in lst:
                curr_size += len(info[3])
                batch.append(info)
                if curr_size >= nt_chunk_size:
                    yield batch
                    batch = []
                    curr_size = 0
            yield batch

        elif batch_type == "read_lengths_squared":
            tot_length = sum([math.pow(len(seq), 2) for i, b_i, acc, seq, qual, score in lst])
            nt_chunk_size = int(tot_length / nr_cores) + 1
            batch = []
            curr_size = 0
            for info in lst:
                curr_size += math.pow(len(info[3]), 2)
                batch.append(info)
                if curr_size >= nt_chunk_size:
                    yield batch
                    batch = []
                    curr_size = 0
            yield batch


def print_intermediate_results(clusters, cluster_seq_origin, args, iter_nr):
    path = args.outfolder + "/{0}".format(iter_nr)
    help_functions.mkdir_p(path)
    outfile = open(os.path.join(path, "pre_clusters.csv"), "w")
    nontrivial_cluster_index = 0
    for c_id, all_read_acc in sorted(clusters.items(), key=lambda x: len(x[1]), reverse=True):
        for r_acc in all_read_acc:
            outfile.write("{0}\t{1}\n".format(c_id, "_".join([item for item in r_acc.split("_")[:-1]])))
        if len(all_read_acc) > 1:
            nontrivial_cluster_index += 1
    logging.debug(f"Nr clusters larger than 1: {nontrivial_cluster_index}")
    logging.debug(f"Nr clusters (all): {len(clusters)}")

    origins_outfile = open(os.path.join(path, "cluster_origins.csv"), "w")
    for cl_id, all_read_acc in sorted(clusters.items(), key=lambda x: len(x[1]), reverse=True):
        read_cl_id, b_i, acc, c_seq, c_qual, score, error_rate, _ = cluster_seq_origin[cl_id]
        origins_outfile.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\n".format(read_cl_id, acc, c_seq, c_qual, score, error_rate))
    outfile.close()
    origins_outfile.close()


def parallel_clustering(read_array, p_emp_probs, args):
    num_batches = args.nr_cores
    read_batches = [batch for batch in batch_list(read_array, num_batches, batch_type=args.batch_type)]
    batch_sizes = [sum([len(seq) for i, b_i, acc, seq, qual, score in b]) for b in read_batches]
    logging.debug(f"Using total nucleotide batch sizes: {batch_sizes}")
    batch_lengths = [len(b) for b in read_batches]
    logging.debug(f"Nr reads in batches: {batch_lengths}")
    cluster_batches = []
    cluster_seq_origin_batches = []
    lowest_batch_index_db = []
    for batch in read_batches:
        tmp_clust = {}
        tmp_clust_origin = {}
        for i, b_i, acc, seq, qual, score in batch:
            tmp_clust[i] = [acc]
            tmp_clust_origin[i] = (i, b_i, acc, seq, qual, score)
        cluster_batches.append(tmp_clust)
        cluster_seq_origin_batches.append(tmp_clust_origin)
        lowest_batch_index_db.append({})
    del read_array

    ####### parallelize alignment #########
    original_sigint_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)
    signal.signal(signal.SIGINT, original_sigint_handler)
    try:
        mp.set_start_method('spawn')
        logging.debug(f"Environment set: {mp.get_context()}")
    except RuntimeError:
        logging.debug(f"Environment already set: {mp.get_context()}")
    it = 1
    while True:
        # Structure up batches
        logging.debug(f"\nITERATION {it}")
        logging.debug("Using {0} batches.".format(num_batches))

        if len(read_batches) == 1:
            start_cluster = time()

            data = {i + 1: ((cluster_batches[0], cluster_seq_origin_batches[0], read_batches[0], p_emp_probs, lowest_batch_index_db[0], 1, args), {})}
            result = reads_to_clusters_helper(data)  # { new_batch_index : (Cluster, cluster_seq_origin, H, new_batch_index)}
            Cluster, cluster_seq_origin, _, _ = result[1]
            logging.debug(f"Time elapsed clustering last iteration single core: {time() - start_cluster}")
            return Cluster, cluster_seq_origin

        start_multi = time()
        pool = Pool(processes=int(num_batches))
        try:
            data = [{i + 1: ((cluster_batches[i], cluster_seq_origin_batches[i], read_batches[i], p_emp_probs, lowest_batch_index_db[i], i + 1, args), {})} for i in range(len(read_batches))]
            res = pool.map_async(reads_to_clusters_helper, data)
            cluster_results = res.get(999999999)  # Without the timeout this blocking call ignores all signals.
        except KeyboardInterrupt:
            logging.warning("Caught KeyboardInterrupt, terminating workers")
            pool.terminate()
            sys.exit()
        else:
            pool.close()
        pool.join()

        logging.debug(f"Time elapsed multiprocessing: {time() - start_multi}")

        start_joining = time()
        all_repr = []  # all_repr = [top_new_seq_origins]
        all_cl = []
        all_minimizer_databases = {}
        for output_dict in cluster_results:
            logging.debug("New batch")
            for k, v in output_dict.items():
                new_clusters, new_representatives, minimizer_database_new, batch_index = v
                logging.debug(f"Batch index {k}")
                all_cl.append(new_clusters)
                all_repr.append(new_representatives)
                all_minimizer_databases[batch_index] = minimizer_database_new

        all_clusters = merge_dicts(*all_cl)
        all_representatives = merge_dicts(*all_repr)
        read_array = [(i, b_index, acc, seq, qual, score) for i, (i, b_index, acc, seq, qual, score, error_rate, _) in sorted(all_representatives.items(), key=lambda x: x[1][5], reverse=True)]
        new_nr_repr = len(read_array)
        logging.debug(f"number of representatives left to cluster: {new_nr_repr}")
        logging.debug(f"Time elapsed joining clusters: {time() - start_joining}")

        # Determine new number of batches
        if num_batches == 1:
            return all_clusters, all_representatives
        else:
            print_intermediate_results(all_clusters, all_representatives, args, it)

        it += 1
        read_batches = [batch for batch in batch_list(read_array, num_batches, batch_type=args.batch_type, merge_consecutive=True)]
        num_batches = len(read_batches)
        logging.debug(f"Batches after pairwise consecutive merge: {num_batches}")
        batch_sizes = [sum([len(seq) for i, b_i, acc, seq, qual, score in b]) for b in read_batches]
        logging.debug(f"Using total nucleotide batch sizes: {batch_sizes}")
        batch_lengths = [len(b) for b in read_batches]
        logging.debug(f"Using nr reads batch sizes: {batch_lengths}")
        cluster_batches = []
        cluster_seq_origin_batches = []
        lowest_batch_index_db = []
        for batch in read_batches:
            tmp_clust = {}
            tmp_clust_origin = {}
            lowest_batch_index = min([prev_batch_index for (read_cl_id, prev_batch_index, acc, seq, qual, score) in batch])
            for i, b_i, acc, seq, qual, score in batch:
                tmp_clust[i] = all_clusters[i]
                tmp_clust_origin[i] = all_representatives[i]
            cluster_batches.append(tmp_clust)
            cluster_seq_origin_batches.append(tmp_clust_origin)
            lowest_batch_index_db.append(all_minimizer_databases[lowest_batch_index])

        del all_minimizer_databases
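For orientation, a minimal usage sketch of batch_list above (illustrative only, not part of the package): it splits the (read_cl_id, batch_index, accession, sequence, quality, score) read tuples into roughly equal chunks under the "total_nt" policy. The toy read tuples below are invented, and the sketch assumes the package's modules are importable.

from modules.parallelize import batch_list

# Toy read tuples: (read_cl_id, batch_index, accession, sequence, quality, score)
reads = [
    (0, 0, "read1_12.0", "ACGT" * 50, "I" * 200, 12.0),
    (1, 0, "read2_11.5", "ACGT" * 30, "I" * 120, 11.5),
    (2, 0, "read3_10.0", "ACGT" * 40, "I" * 160, 10.0),
]

# Split into ~2 chunks by total nucleotide count; a batch is yielded once its
# cumulative sequence length reaches tot_length / nr_cores + 1.
for batch in batch_list(reads, nr_cores=2, batch_type="total_nt"):
    print(len(batch), sum(len(r[3]) for r in batch))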
ngspeciesid-0.3.1.data/scripts/NGSpeciesID
ADDED
@@ -0,0 +1,288 @@
#!python

from __future__ import print_function
import os, sys
import argparse
import tempfile
from time import time
import shutil
import random
import logging

from modules import get_sorted_fastq_for_cluster
from modules import p_minimizers_shared
from modules import help_functions
from modules import parallelize
from modules import cluster
from modules import consensus
from modules import barcode_trimmer


def single_clustering(read_array, p_emp_probs, args):
    start_cluster = time()
    clusters = {}  # initialize every read as belonging to its own cluster
    representatives = {}  # initialize every read as its own representative
    for i, b_i, acc, seq, qual, score in read_array:
        clusters[i] = [acc]
        representatives[i] = (i, b_i, acc, seq, qual, score)

    # These data structures are used in multiprocessing mode. With one core everything is
    # processed in a single "batch", so the minimizer database does not need to be passed
    # on to later iterations.
    minimizer_database, new_batch_index = {}, 1
    result_dict = cluster.reads_to_clusters(clusters, representatives, read_array, p_emp_probs, minimizer_database, new_batch_index, args)
    # Unpack result. The result dictionary structure is convenient for multiprocessing return but clumsy in single core mode.
    clusters, representatives, _, _ = list(result_dict.values())[0]
    logging.debug(f"Time elapsed clustering: {time() - start_cluster}")
    return clusters, representatives


def main(args):
    """
    Code in the main function is structured into 4 steps:
    1. Sort all reads according to expected error-free kmers.
    2. Import precalculated probabilities of minimizer matching given the error rates of reads, kmer length, and window length.
       This is used for calculating whether a read matches a representative.
    3. Cluster the reads.
    4. Write output.
    """
    ##### Sort all reads according to expected error-free kmers #####
    args.outfile = os.path.join(args.outfolder, "sorted.fastq")
    logging.debug("started sorting seqs")
    start = time()
    sorted_reads_fastq_file = get_sorted_fastq_for_cluster.main(args)
    logging.debug(f"elapsed time sorting: {time() - start}")
    #################################################################

    ##### Filter and subsample #####
    if args.target_length > 0 and args.target_deviation > 0:
        read_array = [(i, 0, acc, seq, qual, float(acc.split("_")[-1])) for i, (acc, (seq, qual)) in enumerate(help_functions.readfq(open(sorted_reads_fastq_file, 'r'))) if args.target_length - args.target_deviation <= len(seq) <= args.target_length + args.target_deviation]
        logging.debug("Number of reads with read length in interval [{0},{1}]: {2}".format(args.target_length - args.target_deviation, args.target_length + args.target_deviation, len(read_array)))
    else:
        read_array = [(i, 0, acc, seq, qual, float(acc.split("_")[-1])) for i, (acc, (seq, qual)) in enumerate(help_functions.readfq(open(sorted_reads_fastq_file, 'r')))]

    if args.top_reads:
        read_array = read_array[:args.sample_size]
    elif 0 < args.sample_size < len(read_array):
        read_array = [read_array[i] for i in sorted(random.sample(range(len(read_array)), args.sample_size))]

    abundance_cutoff = int(args.abundance_ratio * len(read_array))
    #################################################################

    ##### Import precalculated probabilities of minimizer matching given the error rates of reads, kmer length, and window length #####
    logging.debug("Started importing empirical error probabilities of minimizers shared:")
    start = time()
    p_min_shared = p_minimizers_shared.read_empirical_p()
    p_emp_probs = {}
    for k, w, p, e1, e2 in p_min_shared:
        if int(k) == args.k and abs(int(w) - args.w) <= 2:
            p_emp_probs[(float(e1), float(e2))] = float(p)
            p_emp_probs[(float(e2), float(e1))] = float(p)

    logging.debug(f"{p_emp_probs}")
    logging.debug(f"{len(p_emp_probs)}")
    logging.debug(f"elapsed time importing empirical error probabilities of minimizers shared: {time() - start}")
    ##################################################################################################################################

    logging.info(f"Starting Clustering: {len(read_array)} reads")
    ##### Cluster reads, bulk of code base is here #####
    logging.debug("started clustering")
    start = time()
    if args.nr_cores > 1:
        clusters, representatives = parallelize.parallel_clustering(read_array, p_emp_probs, args)
    else:
        logging.debug("Using 1 core.")
        clusters, representatives = single_clustering(read_array, p_emp_probs, args)
    logging.debug(f"Time elapsed clustering: {time() - start}")
    ####################################################

    ##### Write output in sorted quality order! #####
    outfile = open(os.path.join(args.outfolder, "final_clusters.tsv"), "w")
    origins_outfile = open(os.path.join(args.outfolder, "final_cluster_origins.tsv"), "w")
    nontrivial_cluster_index = 0
    output_cl_id = 0
    for c_id, all_read_acc in sorted(clusters.items(), key=lambda x: (len(x[1]), representatives[x[0]][5]), reverse=True):
        read_cl_id, b_i, acc, c_seq, c_qual, score, error_rate, _ = representatives[c_id]
        origins_outfile.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\n".format(output_cl_id, "_".join([item for item in acc.split("_")[:-1]]), c_seq, c_qual, score, error_rate))

        for r_acc in sorted(all_read_acc, key=lambda x: float(x.split("_")[-1]), reverse=True):
            outfile.write("{0}\t{1}\n".format(output_cl_id, "_".join([item for item in r_acc.split("_")[:-1]])))
        if len(all_read_acc) > 1:
            nontrivial_cluster_index += 1

        output_cl_id += 1

    logging.debug(f"Nr clusters larger than 1: {nontrivial_cluster_index}")
    logging.debug(f"Nr clusters (all): {len(clusters)}")
    outfile.close()
    origins_outfile.close()
    ############################

    logging.info(f"Finished Clustering: {nontrivial_cluster_index} clusters formed")

    if args.consensus:
        logging.info("Starting Consensus creation and polishing")
        work_dir = tempfile.mkdtemp()
        logging.debug(f"Temporary workdirectory for consensus and polishing: {work_dir}")
        logging.debug(
            f"Forming draft consensus with abundance_cutoff >= {abundance_cutoff} "
            f"({args.abundance_ratio * 100}% of {len(read_array)} reads)"
        )
        centers = consensus.form_draft_consensus(clusters, representatives, sorted_reads_fastq_file, work_dir, abundance_cutoff, args)

        if args.primer_file or args.remove_universal_tails:
            if args.remove_universal_tails:
                logging.debug("Detecting and removing universal tails")
                barcodes = barcode_trimmer.get_universal_tails()
            else:
                logging.debug("Detecting and removing primers")
                barcodes = barcode_trimmer.read_barcodes(args.primer_file)

            barcode_trimmer.remove_barcodes(centers, barcodes, args)

        logging.debug("{0} centers formed".format(len(centers)))
        centers_filtered = consensus.detect_reverse_complements(centers, args.rc_identity_threshold)
        centers_polished = consensus.polish_sequences(centers_filtered, args)

        if args.primer_file or args.remove_universal_tails:  # check if barcode is found after polishing with medaka
            centers_updated = barcode_trimmer.remove_barcodes(centers_polished, barcodes, args)
            if centers_updated:
                centers_filtered = consensus.detect_reverse_complements(centers_polished, args.rc_identity_threshold)
                centers_polished = consensus.polish_sequences(centers_filtered, args)

        logging.debug("removing temporary workdir")
        shutil.rmtree(work_dir)

        logging.info(f"Finished Consensus creation: {len(centers_filtered)} created")


def write_fastq(args):
    from collections import defaultdict
    clusters = defaultdict(list)

    with open(args.clusters) as f:
        for line in f:
            items = line.strip().split()
            cl_id, acc = items[0], items[1]
            clusters[cl_id].append(acc)

    help_functions.mkdir_p(args.outfolder)
    reads = {acc: (seq, qual) for acc, (seq, qual) in help_functions.readfq(open(args.fastq, 'r'))}

    for cl_id in clusters:
        r = clusters[cl_id]

        if len(r) >= args.N:
            curr_file = open(os.path.join(args.outfolder, str(cl_id) + ".fastq"), "w")
            for acc in r:
                seq, qual = reads[acc]
                curr_file.write("@{0}\n{1}\n{2}\n{3}\n".format(acc, seq, "+", qual))
            curr_file.close()


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Reference-free clustering and consensus forming of targeted ONT or PacBio reads", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--version', action='version', version='%(prog)s 0.3.1')
    parser.add_argument('--debug', action='store_true', help='Enable debug logging')
    reads_file = parser.add_mutually_exclusive_group(required=True)
    reads_file.add_argument('--fastq', type=str, help='Path to consensus fastq file(s)')
    reads_file.add_argument('--use_old_sorted_file', action='store_true', help='Use an already existing sorted file if present in the specified output directory.')
    parser.add_argument('--t', dest="nr_cores", type=int, default=8, help='Number of cores allocated for clustering')
    parser.add_argument('--d', dest="print_output", type=int, default=10000, help='For debugging, prints status of clustering and minimizer database every d reads processed.')
    parser.add_argument('--q', dest="quality_threshold", type=float, default=7.0, help='Filters reads with average phred quality value under this number (default = 7.0).')

    parser.add_argument('--ont', action="store_true", help='Clustering of ONT transcript reads.')
    parser.add_argument('--isoseq', action="store_true", help='Clustering of PacBio Iso-Seq reads.')

    parser.add_argument('--consensus', action="store_true", help='After clustering, (1) run spoa on all clusters, (2) detect reverse complements, (3) run medaka.')
    parser.add_argument('--abundance_ratio', type=float, default=0.1, help='Threshold for --consensus algorithm. Consider only clusters larger than this fraction of the total number of reads (default 0.1)')
    parser.add_argument('--rc_identity_threshold', type=float, default=0.9, help='Threshold for --consensus algorithm. Define a reverse complement if identity is over this threshold (default 0.9)')
    parser.add_argument('--max_seqs_for_consensus', type=int, default=-1, help='Maximum number of seqs to form consensus with spoa [INT] (default = -1, which means to use all sequences available regardless of cluster size).')

    group = parser.add_mutually_exclusive_group()
    group.add_argument('--medaka', action="store_true", help='Run final medaka polishing algorithm.')
    group.add_argument('--racon', action="store_true", help='Run final racon polishing algorithm.')

    parser.add_argument('--medaka_model', type=str, default="", help='Set specific medaka model.')
    parser.add_argument('--medaka_fastq', action="store_true", help='Request Medaka to output a FASTQ file, instead of FASTA')
    parser.add_argument('--racon_iter', type=int, default=2, help='Number of times to run racon iteratively')

    group2 = parser.add_mutually_exclusive_group()
    group2.add_argument('--remove_universal_tails', action="store_true", help='Remove universal tails "TTTCTGTTGGTGCTGATATTGC" and "ACTTGCCTGTCGCTCTATCTTC" after the spoa consensus step and before the reverse complement detection.')
    group2.add_argument('--primer_file', type=str, default="", help='Path to file with primers. Primers are removed after the spoa consensus step and before the reverse complement detection.')
    parser.add_argument('--primer_max_ed', type=int, default=2, help='Threshold edit distance for finding barcodes in the spoa consensus')
    parser.add_argument('--trim_window', type=int, default=150, help='Window size of how many bases to look for barcodes (default 150 bases in beginning and end of consensus).')
    parser.add_argument('--m', dest="target_length", type=int, default=0, help='Intended amplicon length. Invoked to filter out reads with length greater than m + s or smaller than m - s (default = 0 which means no filtering)')
    parser.add_argument('--s', dest="target_deviation", type=int, default=0, help='Maximum allowed amplicon-length deviation. Invoked to filter out reads with length greater than m + s or smaller than m - s (default = 0 which means no filtering)')
    parser.add_argument('--sample_size', type=int, default=0, help='Use sample_size reads in the NGSpeciesID pipeline (default = 0 which means all reads considered). If sample size is larger than the actual number of reads, all reads will be used.')
    parser.add_argument('--top_reads', action='store_true', help='Use the top --sample_size reads instead of a random selection (default = false, which means random reads considered).')

    parser.add_argument('--k', type=int, default=13, help='Kmer size')
    parser.add_argument('--w', type=int, default=20, help='Window size')
    parser.add_argument('--min_shared', type=int, default=5, help='Minimum number of minimizers shared between read and cluster')
    parser.add_argument('--mapped_threshold', type=float, default=0.7, help='Minimum mapped fraction of read to be included in cluster. The density of minimizers to classify a region as mapped depends on quality of the read.')
    parser.add_argument('--aligned_threshold', type=float, default=0.4, help='Minimum aligned fraction of read to be included in cluster. Aligned identity depends on the quality of the read.')
    parser.add_argument('--symmetric_map_align_thresholds', action='store_true', help='Apply mapped threshold and aligned threshold to the fraction of the cluster representative that maps onto the read')
    parser.add_argument('--batch_type', type=str, default='total_nt', help='In parallel mode, how to split the reads into chunks: "total_nt", "nr_reads", or "weighted" (default: total_nt)')
    parser.add_argument('--min_fraction', type=float, default=0.8, help='Minimum fraction of minimizers shared compared to best hit, in order to continue mapping.')
    parser.add_argument('--min_prob_no_hits', type=float, default=0.1, help='Minimum probability for i consecutive minimizers to be different between read and representative and still considered as mapped region, under the assumption that they come from the same transcript (depends on read quality).')
    parser.add_argument('--outfolder', type=str, default=None, help='Output folder for clustering and consensus results.')
    parser.set_defaults(which='main')

    subparsers = parser.add_subparsers(help='sub-command help')
    write_fastq_parser = subparsers.add_parser('write_fastq', help='Write the reads of each cluster to a separate fastq file')
    write_fastq_parser.add_argument('--clusters', type=str, help='The file "final_clusters.tsv" created by the clustering step.')
    write_fastq_parser.add_argument('--fastq', type=str, help='Input fastq file')
    write_fastq_parser.add_argument('--outfolder', type=str, help='Output folder')
    write_fastq_parser.add_argument('--N', type=int, default=0, help='Write out clusters with N or more reads')
    write_fastq_parser.set_defaults(which='write_fastq')

    args = parser.parse_args()

    loglevel = logging.DEBUG if args.debug else logging.INFO

    logging.basicConfig(
        level=loglevel,
        format='%(message)s'
    )

    if args.which == 'write_fastq':
        write_fastq(args)
        logging.info("Wrote clusters to separate fastq files.")
        sys.exit(0)

    if args.ont and args.isoseq:
        logging.error("Arguments mutually exclusive, specify either --isoseq or --ont.")
        sys.exit()
    elif args.isoseq:
        args.k = 15
        args.w = 50
    elif args.ont:
        args.k = 13
        args.w = 20

    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit()

    if args.outfolder and not os.path.exists(args.outfolder):
        os.makedirs(args.outfolder)

    parasail_module = 'parasail'
    if parasail_module not in sys.modules:
        logging.error('You have not imported the {0} module. Only performing clustering with mapping, i.e., no alignment!'.format(parasail_module))
        sys.exit(1)
    if 100 < args.w or args.w < args.k:
        logging.error('Please specify a window size larger than or equal to k, and smaller than 100.')
        sys.exit(1)

    main(args)
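For reference, a minimal invocation sketch (illustrative only, not part of the package) showing how the installed NGSpeciesID entry point above can be driven from Python. The input path and output folder are placeholders; the flags are taken from the argparse definitions in the script.

import subprocess

# Placeholder paths. --ont sets the k=13/w=20 presets, --consensus triggers the
# spoa + reverse-complement detection step, and --medaka selects the final polisher.
subprocess.run(
    [
        "NGSpeciesID",
        "--ont",
        "--consensus",
        "--medaka",
        "--fastq", "reads.fastq",
        "--outfolder", "ngspeciesid_out",
    ],
    check=True,
)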