mgnify-pipelines-toolkit 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: the registry flagged this version of mgnify-pipelines-toolkit as potentially problematic.
- mgnify_pipelines_toolkit/analysis/amplicon/amplicon_utils.py +74 -54
- mgnify_pipelines_toolkit/analysis/amplicon/are_there_primers.py +69 -42
- mgnify_pipelines_toolkit/analysis/amplicon/assess_inflection_point_mcp.py +120 -66
- mgnify_pipelines_toolkit/analysis/amplicon/assess_mcp_proportions.py +74 -45
- mgnify_pipelines_toolkit/analysis/amplicon/classify_var_regions.py +277 -148
- mgnify_pipelines_toolkit/analysis/amplicon/find_mcp_inflection_points.py +45 -28
- mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py +53 -32
- mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py +54 -16
- mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py +29 -12
- mgnify_pipelines_toolkit/analysis/amplicon/remove_ambiguous_reads.py +29 -19
- mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py +23 -13
- mgnify_pipelines_toolkit/analysis/amplicon/standard_primer_matching.py +127 -89
- mgnify_pipelines_toolkit/analysis/shared/fastq_suffix_header_check.py +138 -0
- mgnify_pipelines_toolkit/analysis/shared/get_subunits.py +55 -26
- mgnify_pipelines_toolkit/analysis/shared/get_subunits_coords.py +19 -13
- mgnify_pipelines_toolkit/analysis/shared/library_strategy_check.py +66 -0
- mgnify_pipelines_toolkit/analysis/shared/mapseq2biom.py +2 -2
- mgnify_pipelines_toolkit/constants/regex_ambiguous_bases.py +3 -5
- mgnify_pipelines_toolkit/constants/regex_fasta_header.py +20 -0
- mgnify_pipelines_toolkit/constants/tax_ranks.py +21 -2
- mgnify_pipelines_toolkit/constants/thresholds.py +4 -1
- mgnify_pipelines_toolkit/constants/var_region_coordinates.py +4 -4
- mgnify_pipelines_toolkit/utils/__init__.py +0 -0
- mgnify_pipelines_toolkit/utils/fasta_to_delimited.py +144 -0
- mgnify_pipelines_toolkit/utils/get_mpt_version.py +26 -0
- {mgnify_pipelines_toolkit-0.1.4.dist-info → mgnify_pipelines_toolkit-0.1.6.dist-info}/METADATA +18 -1
- mgnify_pipelines_toolkit-0.1.6.dist-info/RECORD +34 -0
- {mgnify_pipelines_toolkit-0.1.4.dist-info → mgnify_pipelines_toolkit-0.1.6.dist-info}/WHEEL +1 -1
- {mgnify_pipelines_toolkit-0.1.4.dist-info → mgnify_pipelines_toolkit-0.1.6.dist-info}/entry_points.txt +4 -0
- mgnify_pipelines_toolkit-0.1.4.dist-info/RECORD +0 -28
- {mgnify_pipelines_toolkit-0.1.4.dist-info → mgnify_pipelines_toolkit-0.1.6.dist-info}/LICENSE +0 -0
- {mgnify_pipelines_toolkit-0.1.4.dist-info → mgnify_pipelines_toolkit-0.1.6.dist-info}/top_level.txt +0 -0
mgnify_pipelines_toolkit/analysis/amplicon/classify_var_regions.py
@@ -15,23 +15,33 @@
 # limitations under the License.
 
 import argparse
-from collections import Counter
+from collections import Counter, defaultdict
 import gzip
 import re
 import os
 import logging
-import sys
 import json
 import time
 
-from mgnify_pipelines_toolkit.constants.thresholds import
-
+from mgnify_pipelines_toolkit.constants.thresholds import (
+    MIN_OVERLAP,
+    MIN_SEQ_COUNT,
+    MAX_ERROR_PROPORTION,
+    MAX_INTERNAL_PRIMER_PROPORTION,
+)
+from mgnify_pipelines_toolkit.constants.var_region_coordinates import (
+    REGIONS_16S_BACTERIA,
+    REGIONS_16S_ARCHAEA,
+    REGIONS_18S,
+)
 
 raw_f_regex = re.compile(
-    "([A-z0-9\.\-\:]+)\s+-\s+(\w+)\s+(\w+)\s+(\w+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+([-+])\s+([-+])\s+(\d+)\s+(\d+[\.\d]*)\s+(\d+[\.\d]*)\s+(\d+[\.\d]*)\s+(.+)\s!\s+.*")
+    r"([A-z0-9\.\-\:]+)\s+-\s+(\w+)\s+(\w+)\s+(\w+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+([-+])\s+([-+])\s+(\d+)\s+(\d+[\.\d]*)\s+(\d+[\.\d]*)\s+(\d+[\.\d]*)\s+(.+)\s!\s+.*"  # noqa: E501
+)
 
 logging.basicConfig(level=logging.DEBUG)
 
+
 def calc_overlap(read, reg):
     read_s, read_f = read
     reg_s, reg_f = reg
@@ -55,14 +65,17 @@ def get_multiregion(raw_sequence_coords, regions):
 
     """
     # check if any of the coords are inside the region
-    matched_regions = [
-
+    matched_regions = [
+        region
+        for region, limits in regions.items()
+        if calc_overlap(raw_sequence_coords, limits) >= MIN_OVERLAP
+    ]
     if len(matched_regions) > 1:
-        amplified_region = '{}-{}'.format(min(matched_regions), max(matched_regions))
+        amplified_region = "{}-{}".format(min(matched_regions), max(matched_regions))
     elif len(matched_regions) == 1:
         amplified_region = matched_regions[0]
     else:
-        amplified_region = ''
+        amplified_region = ""
     return amplified_region
 
 
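
The rewritten get_multiregion gathers every variable region that a read overlaps by at least MIN_OVERLAP and joins multi-region hits into a single span. A self-contained sketch of that logic; the calc_overlap body, the MIN_OVERLAP value, and the coordinates are assumptions for illustration (the diff shows only the comprehension and the join):

MIN_OVERLAP = 0.95  # hypothetical threshold; the real one comes from constants.thresholds


def calc_overlap(read, reg):
    # Assumed implementation: fraction of the region covered by the read.
    read_s, read_f = read
    reg_s, reg_f = reg
    return (min(read_f, reg_f) - max(read_s, reg_s)) / (reg_f - reg_s)


def get_multiregion(raw_sequence_coords, regions):
    # Collect every sufficiently overlapped region, then report one region
    # or a "Vx-Vy" span, mirroring the refactored code above.
    matched_regions = [
        region
        for region, limits in regions.items()
        if calc_overlap(raw_sequence_coords, limits) >= MIN_OVERLAP
    ]
    if len(matched_regions) > 1:
        return "{}-{}".format(min(matched_regions), max(matched_regions))
    return matched_regions[0] if matched_regions else ""


regions_16s = {"V3": (341, 515), "V4": (515, 806)}  # illustrative coordinates
print(get_multiregion((330, 820), regions_16s))  # V3-V4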
@@ -87,21 +100,26 @@ def check_primer_position(raw_sequence_coords, regions):
 
 # Parse, filter empty lines and unpack into 2D array
 def load_data(filename):
-    read_function = gzip.open if filename.endswith('.gz') else open
-    with read_function(filename, 'rt') as f:
+    read_function = gzip.open if filename.endswith(".gz") else open
+    with read_function(filename, "rt") as f:
         return [l[0] for l in [raw_f_regex.findall(l) for l in f] if bool(l)]
 
 
 def unsplit_region(long_region):
-    interval = [int(var_reg.replace('V', '')) for var_reg in long_region.split('-')]
+    interval = [int(var_reg.replace("V", "")) for var_reg in long_region.split("-")]
     if len(interval) == 1:
         interval = interval * 2
     return interval
 
 
 def check_inclusiveness(more_frequent, less_frequent):
-    unsplit_more_frequent, unsplit_less_frequent = [unsplit_region(region) for region in [more_frequent, less_frequent]]
-    return unsplit_more_frequent[0] <= unsplit_less_frequent[0] and unsplit_more_frequent[1] >= unsplit_less_frequent[1]
+    unsplit_more_frequent, unsplit_less_frequent = [
+        unsplit_region(region) for region in [more_frequent, less_frequent]
+    ]
+    return (
+        unsplit_more_frequent[0] <= unsplit_less_frequent[0]
+        and unsplit_more_frequent[1] >= unsplit_less_frequent[1]
+    )
 
 
 def normalise_results(region_matches):
@@ -121,10 +139,12 @@ def normalise_results(region_matches):
         # [region, round(count / len(region_matches), 4)]
         [region, count / len(region_matches)]
         for region, count in counter.items()
-        if count/len(region_matches) >= MAX_ERROR_PROPORTION and region != ''
+        if count / len(region_matches) >= MAX_ERROR_PROPORTION and region != ""
     ]
     # sort by frequency in reverse order
-    var_region_proportions = sorted(var_region_proportions, key=lambda x: x[1], reverse=True)
+    var_region_proportions = sorted(
+        var_region_proportions, key=lambda x: x[1], reverse=True
+    )
 
     if len(var_region_proportions) == 1:
         return dict(var_region_proportions)
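
Behind the reflowed comprehension, normalise_results counts each region call, keeps calls at or above MAX_ERROR_PROPORTION (discarding empty calls), and sorts by frequency. A trimmed, runnable sketch of just that filter-and-sort step, renamed to avoid confusion with the fuller package function; the threshold value is hypothetical:

from collections import Counter

MAX_ERROR_PROPORTION = 0.05  # hypothetical; imported from constants.thresholds in the real code


def frequent_regions(region_matches):
    # Keep only regions seen above the noise threshold, then sort by frequency.
    counter = Counter(region_matches)
    proportions = [
        [region, count / len(region_matches)]
        for region, count in counter.items()
        if count / len(region_matches) >= MAX_ERROR_PROPORTION and region != ""
    ]
    return sorted(proportions, key=lambda x: x[1], reverse=True)


print(frequent_regions(["V3-V4"] * 90 + ["V4"] * 8 + [""] * 2))
# [['V3-V4', 0.9], ['V4', 0.08]]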
@@ -137,8 +157,9 @@ def normalise_results(region_matches):
         else:
             return None
     else:
-        if min(more_frequent[1], less_frequent[1]) > 0.1 and not check_inclusiveness(
-                less_frequent[0], more_frequent[0]):
+        if min(
+            more_frequent[1], less_frequent[1]
+        ) > 0.1 and not check_inclusiveness(less_frequent[0], more_frequent[0]):
             return dict(var_region_proportions)
         else:
             return None
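
check_inclusiveness asks whether the less frequent region call is contained within the more frequent one, after unsplit_region turns names like "V3-V4" into numeric intervals. The two helpers from this and the previous hunks, reassembled into runnable form with toy inputs:

def unsplit_region(long_region):
    # "V3-V4" -> [3, 4]; "V4" -> [4, 4]
    interval = [int(var_reg.replace("V", "")) for var_reg in long_region.split("-")]
    return interval * 2 if len(interval) == 1 else interval


def check_inclusiveness(more_frequent, less_frequent):
    # True when the less frequent call sits inside the more frequent one.
    a, b = unsplit_region(more_frequent), unsplit_region(less_frequent)
    return a[0] <= b[0] and a[1] >= b[1]


assert check_inclusiveness("V3-V5", "V4")  # V4 sits inside V3-V5
assert not check_inclusiveness("V3-V4", "V5")  # V5 extends past V3-V4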
@@ -153,7 +174,7 @@ def identify_run(infile_name):
     Return:
         run: Run ID ERR*|SRR*
     """
-    run = os.path.basename(infile_name).split('_')[0]
+    run = os.path.basename(infile_name).split("_")[0]
     return run
 
 
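
identify_run takes the run accession to be everything before the first underscore of the basename, for example (hypothetical path):

import os

print(os.path.basename("/data/ERR4110125_1.cmsearch.tblout").split("_")[0])  # ERR4110125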
@@ -165,63 +186,79 @@ def determine_cm(cm_detected):
     Returns:
         model: A dictionary containing the coordinates of the variable regions for the matched model.
     """
-    if cm_detected == 'RF00177':
+    if cm_detected == "RF00177":
         model = REGIONS_16S_BACTERIA
-    elif cm_detected == 'RF01959':
+    elif cm_detected == "RF01959":
         model = REGIONS_16S_ARCHAEA
-    elif cm_detected == 'RF01960':
+    elif cm_detected == "RF01960":
         model = REGIONS_18S
     else:
-        model = 'unsupported'
+        model = "unsupported"
     return model
 
 
 def determine_domain(cm_detected):
-    if cm_detected == 'RF00177':
-        return 'Bacteria'
-    elif cm_detected == 'RF01959':
-        return 'Archaea'
-    elif cm_detected == 'RF01960':
-        return 'Eukaryotes'
+    if cm_detected == "RF00177":
+        return "Bacteria"
+    elif cm_detected == "RF01959":
+        return "Archaea"
+    elif cm_detected == "RF01960":
+        return "Eukaryotes"
 
 
 def determine_marker_gene(domain):
-    if domain in ['Bacteria', 'Archaea']:
-        return '16S'
-    elif domain == 'Eukaryotes':
-        return '18S'
+    if domain in ["Bacteria", "Archaea"]:
+        return "16S"
+    elif domain == "Eukaryotes":
+        return "18S"
 
 
-def print_stats(run_id, num_sequences, num_unsupported, num_inside_vr, run_result, stats_out):
+def print_stats(
+    run_id, num_sequences, num_unsupported, num_inside_vr, run_result, stats_out
+):
     summary_num = dict()
     for cm in run_result:
         summary_num[cm] = dict()
         stats = Counter(run_result[cm])
-        summary_num[cm]['empty'] = stats['']
-        summary_num[cm]['total regions'] = len(stats)
-        del stats['']
-        summary_num[cm]['regions'] = ', '.join(stats.keys())
-        summary_num[cm]['freqs'] = ', '.join(
-
-
-
-
-
-
+        summary_num[cm]["empty"] = stats[""]
+        summary_num[cm]["total regions"] = len(stats)
+        del stats[""]
+        summary_num[cm]["regions"] = ", ".join(stats.keys())
+        summary_num[cm]["freqs"] = ", ".join(
+            [
+                (
+                    "{0:.4f}".format(val / len(run_result[cm]))
+                    if len(run_result[cm]) > 0
+                    else "0"
+                )
+                for val in stats.values()
+            ]
+        )
+
+    print_str = ""
+    models = ["RF00177", "RF01959", "RF01960"]
     for model in models:
         if model in summary_num:
-            print_str += ('{}\t' * 3).format(summary_num[model].get('empty', 0),
-                summary_num[model].get('regions', ''), summary_num[model].get('freqs', 0))
+            print_str += ("{}\t" * 3).format(
+                summary_num[model].get("empty", 0),
+                summary_num[model].get("regions", ""),
+                summary_num[model].get("freqs", 0),
+            )
         else:
-            print_str += ' \t \t \t'
+            print_str += " \t \t \t"
     if num_sequences > 0:
-        stats_out.write(
-
-
-
-
-
-
+        stats_out.write(
+            ("{}\t" * 7 + "{}\n").format(
+                run_id,
+                num_sequences,
+                "{0:.3f}".format(num_unsupported / num_sequences),
+                "{0:.3f}".format(num_inside_vr / num_sequences),
+                "{0:.3f}".format(len(run_result.get("RF00177", [])) / num_sequences),
+                "{0:.3f}".format(len(run_result.get("RF01959", [])) / num_sequences),
+                "{0:.3f}".format(len(run_result.get("RF01960", [])) / num_sequences),
+                print_str,
+            )
+        )
 
 
 def print_to_table(tsv_out, results, per_read_info):
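
The three lookup helpers above are if/elif chains keyed on Rfam covariance-model accessions: RF00177 (bacterial SSU) and RF01959 (archaeal SSU) map to 16S, RF01960 (eukaryotic SSU) to 18S. A dict-based equivalent, shown as a possible simplification rather than package code; the accession-to-domain pairs are read straight off the diff:

CM_TO_DOMAIN = {"RF00177": "Bacteria", "RF01959": "Archaea", "RF01960": "Eukaryotes"}
DOMAIN_TO_MARKER = {"Bacteria": "16S", "Archaea": "16S", "Eukaryotes": "18S"}


def determine_domain(cm_detected):
    # Returns None for unsupported models, like the fall-through in the original.
    return CM_TO_DOMAIN.get(cm_detected)


def determine_marker_gene(domain):
    return DOMAIN_TO_MARKER.get(domain)


assert determine_marker_gene(determine_domain("RF01960")) == "18S"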
@@ -230,14 +267,14 @@ def print_to_table(tsv_out, results, per_read_info):
     tsv_out: The name of the tsv outfile.
     results: The dictionary that contains a list of variable regions for a run and their match proportions.
     """
-    #logging.info(results)
+    # logging.info(results)
 
-    prefix = tsv_out.split('.tsv')[0]
+    prefix = tsv_out.split(".tsv")[0]
 
-    f = open(tsv_out, 'w')
-    fw = open(f'{prefix}_regions.txt', 'w')
+    f = open(tsv_out, "w")
+    fw = open(f"{prefix}_regions.txt", "w")
     # print the table header to file
-    f.write('Run\tAssertionEvidence\tAssertionMethod\tMarker gene\tVariable region\n')
+    f.write("Run\tAssertionEvidence\tAssertionMethod\tMarker gene\tVariable region\n")
     gene_hv_to_write = []
     for run, amplified_region_dict in results.items():
         records = set()
@@ -245,103 +282,157 @@ def print_to_table(tsv_out, results, per_read_info):
         for domain, amplified_regions in amplified_region_dict.items():
             marker_gene = determine_marker_gene(domain)
             for vr in amplified_regions.keys():
-                if not vr == '':
-                    record = '{}\tECO_0000363\tautomatic assertion\t{}\t{}\n'.format(
-                        run, determine_marker_gene(domain), vr)
+                if not vr == "":
+                    record = "{}\tECO_0000363\tautomatic assertion\t{}\t{}\n".format(
+                        run, determine_marker_gene(domain), vr
+                    )
                     records.add(record)
-                    records_regions.add(f'{marker_gene}.{vr}\n')
+                    records_regions.add(f"{marker_gene}.{vr}\n")
                     gene_hv_to_write.append(f"{marker_gene}.{vr}")
         for record_to_print in records:
             f.write(record_to_print)
-
+
         for record_to_print in records_regions:
             fw.write(record_to_print)
 
     for key in per_read_info.keys():
         if key in gene_hv_to_write:
-            per_read_filename = '{}.{}.txt'.format(prefix, key)
-            with open(per_read_filename, 'w') as f_hv:
-                f_hv.write('\n'.join(per_read_info[key]))
+            per_read_filename = "{}.{}.txt".format(prefix, key)
+            with open(per_read_filename, "w") as f_hv:
+                f_hv.write("\n".join(per_read_info[key]))
 
     f.close()
     fw.close()
 
-def retrieve_regions(tblout_file_list, outfile_prefix, stats_out, condensed_out, missing_out, seq_count_out):
+
+def retrieve_regions(
+    tblout_file_list,
+    outfile_prefix,
+    stats_out,
+    condensed_out,
+    missing_out,
+    seq_count_out,
+):
     file_counter = 0  # count how many files were analyzed
     sequence_counter_total = 0  # count how many sequences in total were analyzed
     sequence_counter_useful = 0  # count how many sequences an output was generated for
     normalised_matches = dict()  # dictionary that will contain results for all runs
-    failed_run_counter = 0  # total number of excluded runs for any reason (except non-existing files)
-    run_counters = {k: 0 for k in ['one', 'two', 'ambiguous']}  # counters
+    failed_run_counter = (
+        0  # total number of excluded runs for any reason (except non-existing files)
+    )
+    run_counters = {k: 0 for k in ["one", "two", "ambiguous"]}  # counters
     seq_per_variable_region_count = dict()
 
     for tblout_file in tblout_file_list:
         if not os.path.isfile(tblout_file):
-            unzipped_filename = tblout_file.replace('.gz', '')
+            unzipped_filename = tblout_file.replace(".gz", "")
             if os.path.isfile(unzipped_filename):
                 tblout_file = unzipped_filename
             else:
-                logging.info('File {} does not exist'.format(tblout_file))
-                missing_out.write('{}\n'.format(tblout_file))
+                logging.info("File {} does not exist".format(tblout_file))
+                missing_out.write("{}\n".format(tblout_file))
                 continue
         data = load_data(tblout_file)
         run_id = identify_run(tblout_file)
         multiregion_matches = dict()
-        unsupported_matches = 0  # tracks the number of sequences that map to unsupported models
+        unsupported_matches = (
+            0  # tracks the number of sequences that map to unsupported models
+        )
         primer_inside_vr = 0  # tracks the number of sequences that start and/or end inside a variable region
-        per_read_info = dict()  # dictionary will contain read names for each variable region
+        per_read_info = (
+            dict()
+        )  # dictionary will contain read names for each variable region
         for read in data:
             regions = determine_cm(read[2])
             sequence_counter_total += 1
             limits = list(map(int, read[4:6]))
             domain = determine_domain(read[2])
             marker_gene = determine_marker_gene(domain)
-            if not regions == 'unsupported':
-                multiregion_matches.setdefault(read[2], []).append(get_multiregion(limits, regions))
+            if not regions == "unsupported":
+                multiregion_matches.setdefault(read[2], []).append(
+                    get_multiregion(limits, regions)
+                )
                 if check_primer_position(limits, regions):
                     primer_inside_vr += 1
                 sequence_counter_useful += 1
-                per_read_info.setdefault(marker_gene + '.' + get_multiregion(limits, regions), []).append(read[0])
+                per_read_info.setdefault(
+                    marker_gene + "." + get_multiregion(limits, regions), []
+                ).append(read[0])
             else:
                 unsupported_matches += 1
 
-        print_stats(run_id, len(data), unsupported_matches, primer_inside_vr, multiregion_matches, stats_out)
+        print_stats(
+            run_id,
+            len(data),
+            unsupported_matches,
+            primer_inside_vr,
+            multiregion_matches,
+            stats_out,
+        )
         if not data:
             failed_run_counter += 1
-            logging.info('No output will be produced - no data')
+            logging.info("No output will be produced - no data")
            continue
 
-        unsupported_fract = unsupported_matches/len(data)
+        unsupported_fract = unsupported_matches / len(data)
         if unsupported_fract >= MAX_ERROR_PROPORTION:
             failed_run_counter += 1
-            logging.info('No output will be produced - too many unsupported models')
-            logging.info('Excluded\t{}\t{}\t{}\n'.format(
-                tblout_file, '{0:.2f}'.format(unsupported_fract), len(data)))
+            logging.info("No output will be produced - too many unsupported models")
+            logging.info(
+                "Excluded\t{}\t{}\t{}\n".format(
+                    tblout_file, "{0:.2f}".format(unsupported_fract), len(data)
+                )
+            )
             continue
 
         # filter out runs with too many sequences starting/ending inside variable regions
-        internal_seq_fract = primer_inside_vr/len(data)
+        internal_seq_fract = primer_inside_vr / len(data)
         if internal_seq_fract > MAX_INTERNAL_PRIMER_PROPORTION:
             failed_run_counter += 1
-            logging.info('No output will be produced - too many internal mappings')
-            logging.info('Excluded due to high proportion of internal primers:\t{}\t{}\n'.format(
-                tblout_file, '{0:.2f}'.format(internal_seq_fract)))
+            logging.info("No output will be produced - too many internal mappings")
+            logging.info(
+                "Excluded due to high proportion of internal primers:\t{}\t{}\n".format(
+                    tblout_file, "{0:.2f}".format(internal_seq_fract)
+                )
+            )
             continue
 
         normalised_matches[run_id] = dict()
+        region_counter = defaultdict(int)
 
-
-        multiregion_matches = {d: v for d, v in multiregion_matches.items() if len(v)/len(data) >= 0.01}
+        regions_to_remove = []
 
-        run_ok = True
         for model, value in multiregion_matches.items():
-
-
-
+            marker_gene = determine_marker_gene(determine_domain(model))
+            for region in value:
+                region_counter[f"{marker_gene}.{region}"] += 1
+
+        for region, count in region_counter.items():
+            if count < MIN_SEQ_COUNT:
+                regions_to_remove.append(region)
+
+        if len(regions_to_remove) == len(region_counter.keys()):
             failed_run_counter += 1
-            logging.info('No output will be produced - too few sequences in a domain')
+            logging.info("No output will be produced - too few sequences in a domain")
             continue
 
-        run_status = 'one'
+        models_to_remove = []
+
+        for model, value in multiregion_matches.items():
+            new_value = []
+            for region in value:
+                marker_gene = determine_marker_gene(determine_domain(model))
+                full_region = f"{marker_gene}.{region}"
+                if full_region not in regions_to_remove:
+                    new_value.append(region)
+            if not new_value:
+                models_to_remove.append(model)
+            multiregion_matches[model] = new_value
+
+        [multiregion_matches.pop(model) for model in models_to_remove]
+        print(multiregion_matches)
+
+        run_status = "one"
         run_result = dict()
         total_useful_sequences = 0.0
         temp_seq_counter = dict()
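
This hunk swaps the old one-line 1% filter for explicit counting: tally reads per qualified region with a defaultdict, mark regions below MIN_SEQ_COUNT, then drop any model whose regions were all removed. A condensed sketch of the same three steps; the MIN_SEQ_COUNT value and the toy data are assumptions, and keys here use the model accession where the diff uses the marker gene:

from collections import defaultdict

MIN_SEQ_COUNT = 5  # hypothetical; imported from constants.thresholds in the real code

multiregion_matches = {
    "RF00177": ["V3-V4"] * 10 + ["V9"] * 2,  # V9 is under-supported
    "RF01960": ["V4"],                       # whole model is under-supported
}

# 1. Count reads per qualified region.
region_counter = defaultdict(int)
for model, value in multiregion_matches.items():
    for region in value:
        region_counter[f"{model}.{region}"] += 1

# 2. Mark regions below the support threshold.
regions_to_remove = [r for r, c in region_counter.items() if c < MIN_SEQ_COUNT]

# 3. Drop marked regions, then drop models left empty.
for model, value in list(multiregion_matches.items()):
    kept = [r for r in value if f"{model}.{r}" not in regions_to_remove]
    if kept:
        multiregion_matches[model] = kept
    else:
        del multiregion_matches[model]

print(multiregion_matches)  # RF00177 keeps its ten V3-V4 calls; V9 and RF01960 are gone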
@@ -349,62 +440,86 @@ def retrieve_regions(tblout_file_list, outfile_prefix, stats_out, condensed_out,
             print(model)
             result = normalise_results(model_regions)
             if result is None:
-                run_status = 'ambiguous'
+                run_status = "ambiguous"
                 break
             elif len(result) == 2:
-                run_status = 'two'
+                run_status = "two"
             run_result[determine_domain(model)] = result
             for reg, freq in result.items():
                 total_useful_sequences += len(model_regions) * freq
-                temp_seq_counter[determine_domain(model) + ' ' + reg] = len(model_regions) * freq
-        if total_useful_sequences/len(data) < 0.95 and run_status != 'ambiguous':
+                temp_seq_counter[determine_domain(model) + " " + reg] = (
+                    len(model_regions) * freq
+                )
+        if total_useful_sequences / len(data) < 0.95 and run_status != "ambiguous":
             failed_run_counter += 1
-            logging.info('No output will be produced - too few useful sequences')
+            logging.info("No output will be produced - too few useful sequences")
             continue
 
         file_counter += 1
         run_counters[run_status] += 1
 
-        if run_status != 'ambiguous':
+        if run_status != "ambiguous":
             normalised_matches[run_id] = run_result
             for key, value in temp_seq_counter.items():
                 seq_per_variable_region_count.setdefault(key, 0)
                 seq_per_variable_region_count[key] += value
 
-    json_outfile = '{}.json'.format(outfile_prefix)
-    tsv_outfile = '{}.tsv'.format(outfile_prefix)
-    with open(json_outfile, 'w') as f:
+    json_outfile = "{}.json".format(outfile_prefix)
+    tsv_outfile = "{}.tsv".format(outfile_prefix)
+    with open(json_outfile, "w") as f:
         json.dump(normalised_matches, f)
     print_to_table(tsv_outfile, normalised_matches, per_read_info)
-    condensed_out.write(
-
-
-
-
-
-
-
-
-
-
-
+    condensed_out.write(
+        "\t".join(
+            [
+                "Total number of files failed",
+                "Total number of files analyzed",
+                "Number of runs with one region",
+                "Number of runs with two regions",
+                "Number of runs with too many regions or unbalanced 2 region runs",
+            ]
+        )
+        + "\n"
+    )
+    condensed_out.write(
+        "{}\t{}\t{}\t{}\t{}\n".format(
+            failed_run_counter,
+            file_counter,
+            run_counters["one"],
+            run_counters["two"],
+            run_counters["ambiguous"],
+        )
+    )
     for key, value in seq_per_variable_region_count.items():
-        seq_count_out.write('{}\t{}\n'.format(key, int(value)))
+        seq_count_out.write("{}\t{}\n".format(key, int(value)))
 
-    logging.info('Analyzed {} files and {} sequences. Output generated for {} sequences'.format(
-        file_counter, sequence_counter_total, sequence_counter_useful))
+    logging.info(
+        "Analyzed {} files and {} sequences. Output generated for {} sequences".format(
+            file_counter, sequence_counter_total, sequence_counter_useful
+        )
+    )
 
 
 def parse_args(argv):
-    parser = argparse.ArgumentParser(
-
-
-    parser.add_argument(
-
-
-
-
-
+    parser = argparse.ArgumentParser(
+        description="Tool to determine which regions were amplified in 16S data"
+    )
+    parser.add_argument("files", nargs="+", help="A list of overlapped tblout files")
+    parser.add_argument(
+        "-d",
+        "--output_dir",
+        default="variable-region-inference",
+        help="Directory to which results will be saved",
+    )
+    parser.add_argument(
+        "-o",
+        "--output_prefix",
+        default="amplified_regions",
+        help="Prefix for all outputs",
+    )
+    parser.add_argument(
+        "--statistics", action="store_true", help="Print statistics files"
+    )
     return parser.parse_args(argv)
 
 
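
The rebuilt parser keeps the same CLI surface. A self-contained sketch mirroring the add_argument calls visible above and exercising the defaults:

import argparse


def parse_args(argv):
    # Mirrors the options shown in the hunk; help strings trimmed for brevity.
    parser = argparse.ArgumentParser(
        description="Tool to determine which regions were amplified in 16S data"
    )
    parser.add_argument("files", nargs="+", help="A list of overlapped tblout files")
    parser.add_argument("-d", "--output_dir", default="variable-region-inference")
    parser.add_argument("-o", "--output_prefix", default="amplified_regions")
    parser.add_argument("--statistics", action="store_true")
    return parser.parse_args(argv)


args = parse_args(["run1.tblout", "run2.tblout", "-d", "out", "--statistics"])
assert args.files == ["run1.tblout", "run2.tblout"]
assert args.output_dir == "out" and args.output_prefix == "amplified_regions"
assert args.statistics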
@@ -414,33 +529,47 @@ def main(argv=None):
     if not os.path.isdir(args.output_dir):
         os.mkdir(args.output_dir)
     prefix = os.path.join(args.output_dir, args.output_prefix)
-    stats_file = '{}.stats'.format(prefix)  # detailed stats for each run before filtration steps
-
-
-
-
-
-
-
-
-
-
-
-
+    stats_file = "{}.stats".format(
+        prefix
+    )  # detailed stats for each run before filtration steps
+    condensed_stats_file = "{}.condensed_stats".format(
+        prefix
+    )  # basic stats for the batch of runs
+    missing_files_log = "{}.missing_files.txt".format(
+        prefix
+    )  # the names of non-existent files
+    seq_count_log = "{}.seq_count.txt".format(
+        prefix
+    )  # the number of sequences per domain/VR in the batch
+    stats_out = open(stats_file, "w")
+    condensed_out = open(condensed_stats_file, "w")
+    missing_out = open(missing_files_log, "w")
+    seq_count_out = open(seq_count_log, "w")
+    stats_out.write(
+        "Run ID\tTotal # sequences\tFraction unsupported seq (map unsupported CM)\t"
+        "Fraction of sequences with start and/or end inside a VR\tFraction bacteria\t"
+        "Fraction archaea\tFraction eukaryotes\tUnidentified bact\tRegions bact\tFreqs bact\t"
+        "Unidentified arch\tRegions arch\tFreqs arch\tUnidentified euk\tRegions euk\tFreqs euk\n"
+    )
+    retrieve_regions(
+        args.files, prefix, stats_out, condensed_out, missing_out, seq_count_out
+    )
     stats_out.close()
     condensed_out.close()
     missing_out.close()
     seq_count_out.close()
     if not args.statistics:
-        for s_file in (stats_file, condensed_stats_file, missing_files_log, seq_count_log):
+        for s_file in (
+            stats_file,
+            condensed_stats_file,
+            missing_files_log,
+            seq_count_log,
+        ):
             os.remove(s_file)
     t_stop = time.perf_counter()
     t_fact = t_stop - t_start
-    logging.info('Elapsed time: {0:.2f} seconds'.format(t_fact))
+    logging.info("Elapsed time: {0:.2f} seconds".format(t_fact))
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
-
-    # don't print json
-    # name the tsv file better