mgnify-pipelines-toolkit 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff shows the content of publicly available package versions as released to their respective public registries, and is provided for informational purposes only.

Potentially problematic release.


Files changed (31)
  1. mgnify_pipelines_toolkit/analysis/amplicon/amplicon_utils.py +74 -54
  2. mgnify_pipelines_toolkit/analysis/amplicon/are_there_primers.py +69 -42
  3. mgnify_pipelines_toolkit/analysis/amplicon/assess_inflection_point_mcp.py +120 -66
  4. mgnify_pipelines_toolkit/analysis/amplicon/assess_mcp_proportions.py +74 -45
  5. mgnify_pipelines_toolkit/analysis/amplicon/classify_var_regions.py +277 -148
  6. mgnify_pipelines_toolkit/analysis/amplicon/find_mcp_inflection_points.py +45 -28
  7. mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py +53 -32
  8. mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py +54 -16
  9. mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py +29 -12
  10. mgnify_pipelines_toolkit/analysis/amplicon/remove_ambiguous_reads.py +29 -19
  11. mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py +23 -13
  12. mgnify_pipelines_toolkit/analysis/amplicon/standard_primer_matching.py +127 -89
  13. mgnify_pipelines_toolkit/analysis/shared/fastq_suffix_header_check.py +140 -0
  14. mgnify_pipelines_toolkit/analysis/shared/get_subunits.py +55 -26
  15. mgnify_pipelines_toolkit/analysis/shared/get_subunits_coords.py +19 -13
  16. mgnify_pipelines_toolkit/analysis/shared/library_strategy_check.py +66 -0
  17. mgnify_pipelines_toolkit/analysis/shared/mapseq2biom.py +2 -2
  18. mgnify_pipelines_toolkit/constants/regex_ambiguous_bases.py +3 -5
  19. mgnify_pipelines_toolkit/constants/regex_fasta_header.py +20 -0
  20. mgnify_pipelines_toolkit/constants/tax_ranks.py +21 -2
  21. mgnify_pipelines_toolkit/constants/thresholds.py +4 -1
  22. mgnify_pipelines_toolkit/constants/var_region_coordinates.py +4 -4
  23. mgnify_pipelines_toolkit/utils/__init__.py +0 -0
  24. mgnify_pipelines_toolkit/utils/fasta_to_delimited.py +144 -0
  25. {mgnify_pipelines_toolkit-0.1.4.dist-info → mgnify_pipelines_toolkit-0.1.5.dist-info}/METADATA +18 -1
  26. mgnify_pipelines_toolkit-0.1.5.dist-info/RECORD +33 -0
  27. {mgnify_pipelines_toolkit-0.1.4.dist-info → mgnify_pipelines_toolkit-0.1.5.dist-info}/WHEEL +1 -1
  28. {mgnify_pipelines_toolkit-0.1.4.dist-info → mgnify_pipelines_toolkit-0.1.5.dist-info}/entry_points.txt +3 -0
  29. mgnify_pipelines_toolkit-0.1.4.dist-info/RECORD +0 -28
  30. {mgnify_pipelines_toolkit-0.1.4.dist-info → mgnify_pipelines_toolkit-0.1.5.dist-info}/LICENSE +0 -0
  31. {mgnify_pipelines_toolkit-0.1.4.dist-info → mgnify_pipelines_toolkit-0.1.5.dist-info}/top_level.txt +0 -0
@@ -15,23 +15,33 @@
  # limitations under the License.

  import argparse
- from collections import Counter
+ from collections import Counter, defaultdict
  import gzip
  import re
  import os
  import logging
- import sys
  import json
  import time

- from mgnify_pipelines_toolkit.constants.thresholds import MIN_OVERLAP, MIN_SEQ_COUNT, MAX_ERROR_PROPORTION,MAX_INTERNAL_PRIMER_PROPORTION
- from mgnify_pipelines_toolkit.constants.var_region_coordinates import REGIONS_16S_BACTERIA, REGIONS_16S_ARCHAEA, REGIONS_18S
+ from mgnify_pipelines_toolkit.constants.thresholds import (
+ MIN_OVERLAP,
+ MIN_SEQ_COUNT,
+ MAX_ERROR_PROPORTION,
+ MAX_INTERNAL_PRIMER_PROPORTION,
+ )
+ from mgnify_pipelines_toolkit.constants.var_region_coordinates import (
+ REGIONS_16S_BACTERIA,
+ REGIONS_16S_ARCHAEA,
+ REGIONS_18S,
+ )

  raw_f_regex = re.compile(
- "([A-z0-9\.\-\:]+)\s+-\s+(\w+)\s+(\w+)\s+(\w+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+([-+])\s+([-+])\s+(\d+)\s+(\d+[\.\d]*)\s+(\d+[\.\d]*)\s+(\d+[\.\d]*)\s+(.+)\s!\s+.*")
+ r"([A-z0-9\.\-\:]+)\s+-\s+(\w+)\s+(\w+)\s+(\w+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+([-+])\s+([-+])\s+(\d+)\s+(\d+[\.\d]*)\s+(\d+[\.\d]*)\s+(\d+[\.\d]*)\s+(.+)\s!\s+.*" # noqa: E501
+ )

  logging.basicConfig(level=logging.DEBUG)

+
  def calc_overlap(read, reg):
  read_s, read_f = read
  reg_s, reg_f = reg
@@ -55,14 +65,17 @@ def get_multiregion(raw_sequence_coords, regions):

  """
  # check if any of the coords are inside the region
- matched_regions = [region for region, limits in regions.items()
- if calc_overlap(raw_sequence_coords, limits) >= MIN_OVERLAP]
+ matched_regions = [
+ region
+ for region, limits in regions.items()
+ if calc_overlap(raw_sequence_coords, limits) >= MIN_OVERLAP
+ ]
  if len(matched_regions) > 1:
- amplified_region = '{}-{}'.format(min(matched_regions), max(matched_regions))
+ amplified_region = "{}-{}".format(min(matched_regions), max(matched_regions))
  elif len(matched_regions) == 1:
  amplified_region = matched_regions[0]
  else:
- amplified_region = ''
+ amplified_region = ""
  return amplified_region


@@ -87,21 +100,26 @@ def check_primer_position(raw_sequence_coords, regions):

  # Parse, filter empty lines and unpack into 2D array
  def load_data(filename):
- read_function = gzip.open if filename.endswith('.gz') else open
- with read_function(filename, 'rt') as f:
+ read_function = gzip.open if filename.endswith(".gz") else open
+ with read_function(filename, "rt") as f:
  return [l[0] for l in [raw_f_regex.findall(l) for l in f] if bool(l)]


  def unsplit_region(long_region):
- interval = [int(var_reg.replace('V', '')) for var_reg in long_region.split('-')]
+ interval = [int(var_reg.replace("V", "")) for var_reg in long_region.split("-")]
  if len(interval) == 1:
  interval = interval * 2
  return interval


  def check_inclusiveness(more_frequent, less_frequent):
- unsplit_more_frequent, unsplit_less_frequent = [unsplit_region(region) for region in [more_frequent, less_frequent]]
- return unsplit_more_frequent[0] <= unsplit_less_frequent[0] and unsplit_more_frequent[1] >= unsplit_less_frequent[1]
+ unsplit_more_frequent, unsplit_less_frequent = [
+ unsplit_region(region) for region in [more_frequent, less_frequent]
+ ]
+ return (
+ unsplit_more_frequent[0] <= unsplit_less_frequent[0]
+ and unsplit_more_frequent[1] >= unsplit_less_frequent[1]
+ )


  def normalise_results(region_matches):
@@ -121,10 +139,12 @@ def normalise_results(region_matches):
  # [region, round(count / len(region_matches), 4)]
  [region, count / len(region_matches)]
  for region, count in counter.items()
- if count/len(region_matches) >= MAX_ERROR_PROPORTION and region != ''
+ if count / len(region_matches) >= MAX_ERROR_PROPORTION and region != ""
  ]
  # sort by frequency in reverse order
- var_region_proportions = sorted(var_region_proportions, key=lambda x: x[1], reverse=True)
+ var_region_proportions = sorted(
+ var_region_proportions, key=lambda x: x[1], reverse=True
+ )

  if len(var_region_proportions) == 1:
  return dict(var_region_proportions)
@@ -137,8 +157,9 @@ def normalise_results(region_matches):
  else:
  return None
  else:
- if min(more_frequent[1], less_frequent[1]) > 0.1 and not \
- check_inclusiveness(less_frequent[0], more_frequent[0]):
+ if min(
+ more_frequent[1], less_frequent[1]
+ ) > 0.1 and not check_inclusiveness(less_frequent[0], more_frequent[0]):
  return dict(var_region_proportions)
  else:
  return None
@@ -153,7 +174,7 @@ def identify_run(infile_name):
  Return:
  run: Run ID ERR*|SRR*
  """
- run = os.path.basename(infile_name).split('_')[0]
+ run = os.path.basename(infile_name).split("_")[0]
  return run


@@ -165,63 +186,79 @@ def determine_cm(cm_detected):
  Returns:
  model: A dictionary containing the coordinates of the variable regions for the matched model.
  """
- if cm_detected == 'RF00177':
+ if cm_detected == "RF00177":
  model = REGIONS_16S_BACTERIA
- elif cm_detected == 'RF01959':
+ elif cm_detected == "RF01959":
  model = REGIONS_16S_ARCHAEA
- elif cm_detected == 'RF01960':
+ elif cm_detected == "RF01960":
  model = REGIONS_18S
  else:
- model = 'unsupported'
+ model = "unsupported"
  return model


  def determine_domain(cm_detected):
- if cm_detected == 'RF00177':
- return 'Bacteria'
- elif cm_detected == 'RF01959':
- return 'Archaea'
- elif cm_detected == 'RF01960':
- return 'Eukaryotes'
+ if cm_detected == "RF00177":
+ return "Bacteria"
+ elif cm_detected == "RF01959":
+ return "Archaea"
+ elif cm_detected == "RF01960":
+ return "Eukaryotes"


  def determine_marker_gene(domain):
- if domain in ['Bacteria', 'Archaea']:
- return '16S'
- elif domain == 'Eukaryotes':
- return '18S'
+ if domain in ["Bacteria", "Archaea"]:
+ return "16S"
+ elif domain == "Eukaryotes":
+ return "18S"


- def print_stats(run_id, num_sequences, num_unsupported, num_inside_vr, run_result, stats_out):
+ def print_stats(
+ run_id, num_sequences, num_unsupported, num_inside_vr, run_result, stats_out
+ ):
  summary_num = dict()
  for cm in run_result:
  summary_num[cm] = dict()
  stats = Counter(run_result[cm])
- summary_num[cm]['empty'] = stats['']
- summary_num[cm]['total regions'] = len(stats)
- del stats['']
- summary_num[cm]['regions'] = ', '.join(stats.keys())
- summary_num[cm]['freqs'] = ', '.join([
- '{0:.4f}'.format(val/len(run_result[cm])) if len(run_result[cm]) > 0 else '0'
- for val in stats.values()
- ])
-
- print_str = ''
- models = ['RF00177', 'RF01959', 'RF01960']
+ summary_num[cm]["empty"] = stats[""]
+ summary_num[cm]["total regions"] = len(stats)
+ del stats[""]
+ summary_num[cm]["regions"] = ", ".join(stats.keys())
+ summary_num[cm]["freqs"] = ", ".join(
+ [
+ (
+ "{0:.4f}".format(val / len(run_result[cm]))
+ if len(run_result[cm]) > 0
+ else "0"
+ )
+ for val in stats.values()
+ ]
+ )
+
+ print_str = ""
+ models = ["RF00177", "RF01959", "RF01960"]
  for model in models:
  if model in summary_num:
- print_str += ('{}\t' * 3).format(summary_num[model].get('empty', 0), summary_num[model].get('regions', ''),
- summary_num[model].get('freqs', 0))
+ print_str += ("{}\t" * 3).format(
+ summary_num[model].get("empty", 0),
+ summary_num[model].get("regions", ""),
+ summary_num[model].get("freqs", 0),
+ )
  else:
- print_str += ' \t \t \t'
+ print_str += " \t \t \t"
  if num_sequences > 0:
- stats_out.write(('{}\t' * 7 + '{}\n').format(run_id, num_sequences,
- '{0:.3f}'.format(num_unsupported / num_sequences),
- '{0:.3f}'.format(num_inside_vr / num_sequences),
- '{0:.3f}'.format(len(run_result.get('RF00177', [])) / num_sequences),
- '{0:.3f}'.format(len(run_result.get('RF01959', [])) / num_sequences),
- '{0:.3f}'.format(len(run_result.get('RF01960', [])) / num_sequences),
- print_str))
+ stats_out.write(
+ ("{}\t" * 7 + "{}\n").format(
+ run_id,
+ num_sequences,
+ "{0:.3f}".format(num_unsupported / num_sequences),
+ "{0:.3f}".format(num_inside_vr / num_sequences),
+ "{0:.3f}".format(len(run_result.get("RF00177", [])) / num_sequences),
+ "{0:.3f}".format(len(run_result.get("RF01959", [])) / num_sequences),
+ "{0:.3f}".format(len(run_result.get("RF01960", [])) / num_sequences),
+ print_str,
+ )
+ )


  def print_to_table(tsv_out, results, per_read_info):
@@ -230,14 +267,14 @@ def print_to_table(tsv_out, results, per_read_info):
  tsv_out: The name of the tsv outfile.
  results: The dictionary that contains a list of variable regions for a run and their match proportions.
  """
- #logging.info(results)
+ # logging.info(results)

- prefix = tsv_out.split('.tsv')[0]
+ prefix = tsv_out.split(".tsv")[0]

- f = open(tsv_out, 'w')
- fw = open(f'{prefix}_regions.txt', 'w')
+ f = open(tsv_out, "w")
+ fw = open(f"{prefix}_regions.txt", "w")
  # print the table header to file
- f.write('Run\tAssertionEvidence\tAssertionMethod\tMarker gene\tVariable region\n')
+ f.write("Run\tAssertionEvidence\tAssertionMethod\tMarker gene\tVariable region\n")
  gene_hv_to_write = []
  for run, amplified_region_dict in results.items():
  records = set()
@@ -245,103 +282,157 @@ def print_to_table(tsv_out, results, per_read_info):
  for domain, amplified_regions in amplified_region_dict.items():
  marker_gene = determine_marker_gene(domain)
  for vr in amplified_regions.keys():
- if not vr == '':
- record = '{}\tECO_0000363\tautomatic assertion\t{}\t{}\n'.format(run, determine_marker_gene(domain),
- vr)
+ if not vr == "":
+ record = "{}\tECO_0000363\tautomatic assertion\t{}\t{}\n".format(
+ run, determine_marker_gene(domain), vr
+ )
  records.add(record)
- records_regions.add(f'{marker_gene}.{vr}\n')
+ records_regions.add(f"{marker_gene}.{vr}\n")
  gene_hv_to_write.append(f"{marker_gene}.{vr}")
  for record_to_print in records:
  f.write(record_to_print)
-
+
  for record_to_print in records_regions:
  fw.write(record_to_print)

  for key in per_read_info.keys():
  if key in gene_hv_to_write:
- per_read_filename = '{}.{}.txt'.format(prefix, key)
- with open(per_read_filename, 'w') as f_hv:
- f_hv.write('\n'.join(per_read_info[key]))
+ per_read_filename = "{}.{}.txt".format(prefix, key)
+ with open(per_read_filename, "w") as f_hv:
+ f_hv.write("\n".join(per_read_info[key]))

  f.close()
  fw.close()

- def retrieve_regions(tblout_file_list, outfile_prefix, stats_out, condensed_out, missing_out, seq_count_out):
+
+ def retrieve_regions(
+ tblout_file_list,
+ outfile_prefix,
+ stats_out,
+ condensed_out,
+ missing_out,
+ seq_count_out,
+ ):
  file_counter = 0 # count how many files were analyzed
  sequence_counter_total = 0 # count how many sequences in total were analyzed
  sequence_counter_useful = 0 # count how many sequences an output was generated for
  normalised_matches = dict() # dictionary that will contain results for all runs
- failed_run_counter = 0 # total number of excluded runs for any reason (except non-existing files)
- run_counters = {k: 0 for k in ['one', 'two', 'ambiguous']} # counters
+ failed_run_counter = (
+ 0 # total number of excluded runs for any reason (except non-existing files)
+ )
+ run_counters = {k: 0 for k in ["one", "two", "ambiguous"]} # counters
  seq_per_variable_region_count = dict()

  for tblout_file in tblout_file_list:
  if not os.path.isfile(tblout_file):
- unzipped_filename = tblout_file.replace('.gz', '')
+ unzipped_filename = tblout_file.replace(".gz", "")
  if os.path.isfile(unzipped_filename):
  tblout_file = unzipped_filename
  else:
- logging.info('File {} does not exist'.format(tblout_file))
- missing_out.write('{}\n'.format(tblout_file))
+ logging.info("File {} does not exist".format(tblout_file))
+ missing_out.write("{}\n".format(tblout_file))
  continue
  data = load_data(tblout_file)
  run_id = identify_run(tblout_file)
  multiregion_matches = dict()
- unsupported_matches = 0 # tracks the number of sequences that map to unsupported models
+ unsupported_matches = (
+ 0 # tracks the number of sequences that map to unsupported models
+ )
  primer_inside_vr = 0 # tracks the number of sequences that start and/or end inside a variable region
- per_read_info = dict() # dictionary will contain read names for each variable region
+ per_read_info = (
+ dict()
+ ) # dictionary will contain read names for each variable region
  for read in data:
  regions = determine_cm(read[2])
  sequence_counter_total += 1
  limits = list(map(int, read[4:6]))
  domain = determine_domain(read[2])
  marker_gene = determine_marker_gene(domain)
- if not regions == 'unsupported':
- multiregion_matches.setdefault(read[2], []).append(get_multiregion(limits, regions))
+ if not regions == "unsupported":
+ multiregion_matches.setdefault(read[2], []).append(
+ get_multiregion(limits, regions)
+ )
  if check_primer_position(limits, regions):
  primer_inside_vr += 1
  sequence_counter_useful += 1
- per_read_info.setdefault(marker_gene + "." + get_multiregion(limits, regions), []).append(read[0])
+ per_read_info.setdefault(
+ marker_gene + "." + get_multiregion(limits, regions), []
+ ).append(read[0])
  else:
  unsupported_matches += 1

- print_stats(run_id, len(data), unsupported_matches, primer_inside_vr, multiregion_matches, stats_out)
+ print_stats(
+ run_id,
+ len(data),
+ unsupported_matches,
+ primer_inside_vr,
+ multiregion_matches,
+ stats_out,
+ )
  if not data:
  failed_run_counter += 1
- logging.info('No output will be produced - no data')
+ logging.info("No output will be produced - no data")
  continue

- unsupported_fract = unsupported_matches/len(data)
+ unsupported_fract = unsupported_matches / len(data)
  if unsupported_fract >= MAX_ERROR_PROPORTION:
  failed_run_counter += 1
- logging.info('No output will be produced - too many unsupported models')
- logging.info("Excluded\t{}\t{}\t{}\n".format(tblout_file, '{0:.2f}'.format(unsupported_fract), len(data)))
+ logging.info("No output will be produced - too many unsupported models")
+ logging.info(
+ "Excluded\t{}\t{}\t{}\n".format(
+ tblout_file, "{0:.2f}".format(unsupported_fract), len(data)
+ )
+ )
  continue

  # filter out runs with too many sequences starting/ending inside variable regions
- internal_seq_fract = primer_inside_vr/len(data)
+ internal_seq_fract = primer_inside_vr / len(data)
  if internal_seq_fract > MAX_INTERNAL_PRIMER_PROPORTION:
  failed_run_counter += 1
- logging.info('No output will be produced - too many internal mappings')
- logging.info("Excluded due to high proportion of internal primers:\t{}\t{}\n".format(
- tblout_file, '{0:.2f}'.format(internal_seq_fract)))
+ logging.info("No output will be produced - too many internal mappings")
+ logging.info(
+ "Excluded due to high proportion of internal primers:\t{}\t{}\n".format(
+ tblout_file, "{0:.2f}".format(internal_seq_fract)
+ )
+ )
  continue

  normalised_matches[run_id] = dict()
+ region_counter = defaultdict(int)

- # filter out domains with <1%
- multiregion_matches = {d: v for d, v in multiregion_matches.items() if len(v)/len(data) >= 0.01}
+ regions_to_remove = []

- run_ok = True
  for model, value in multiregion_matches.items():
- if len(value) < MIN_SEQ_COUNT:
- run_ok = False
- if not run_ok:
+ marker_gene = determine_marker_gene(determine_domain(model))
+ for region in value:
+ region_counter[f"{marker_gene}.{region}"] += 1
+
+ for region, count in region_counter.items():
+ if count < MIN_SEQ_COUNT:
+ regions_to_remove.append(region)
+
+ if len(regions_to_remove) == len(region_counter.keys()):
  failed_run_counter += 1
- logging.info('No output will be produced - too few sequences in a domain')
+ logging.info("No output will be produced - too few sequences in a domain")
  continue

- run_status = 'one'
+ models_to_remove = []
+
+ for model, value in multiregion_matches.items():
+ new_value = []
+ for region in value:
+ marker_gene = determine_marker_gene(determine_domain(model))
+ full_region = f"{marker_gene}.{region}"
+ if full_region not in regions_to_remove:
+ new_value.append(region)
+ if not new_value:
+ models_to_remove.append(model)
+ multiregion_matches[model] = new_value
+
+ [multiregion_matches.pop(model) for model in models_to_remove]
+ print(multiregion_matches)
+
+ run_status = "one"
  run_result = dict()
  total_useful_sequences = 0.0
  temp_seq_counter = dict()
@@ -349,62 +440,86 @@ def retrieve_regions(tblout_file_list, outfile_prefix, stats_out, condensed_out,
  print(model)
  result = normalise_results(model_regions)
  if result is None:
- run_status = 'ambiguous'
+ run_status = "ambiguous"
  break
  elif len(result) == 2:
- run_status = 'two'
+ run_status = "two"
  run_result[determine_domain(model)] = result
  for reg, freq in result.items():
  total_useful_sequences += len(model_regions) * freq
- temp_seq_counter[determine_domain(model) + ' ' + reg] = len(model_regions) * freq
- if total_useful_sequences/len(data) < 0.95 and run_status != 'ambiguous':
+ temp_seq_counter[determine_domain(model) + " " + reg] = (
+ len(model_regions) * freq
+ )
+ if total_useful_sequences / len(data) < 0.95 and run_status != "ambiguous":
  failed_run_counter += 1
- logging.info('No output will be produced - too few useful sequences')
+ logging.info("No output will be produced - too few useful sequences")
  continue

  file_counter += 1
  run_counters[run_status] += 1

- if run_status != 'ambiguous':
+ if run_status != "ambiguous":
  normalised_matches[run_id] = run_result
  for key, value in temp_seq_counter.items():
  seq_per_variable_region_count.setdefault(key, 0)
  seq_per_variable_region_count[key] += value

- json_outfile = '{}.json'.format(outfile_prefix)
- tsv_outfile = '{}.tsv'.format(outfile_prefix)
- with open(json_outfile, 'w') as f:
+ json_outfile = "{}.json".format(outfile_prefix)
+ tsv_outfile = "{}.tsv".format(outfile_prefix)
+ with open(json_outfile, "w") as f:
  json.dump(normalised_matches, f)
  print_to_table(tsv_outfile, normalised_matches, per_read_info)
- condensed_out.write('\t'.join([
- 'Total number of files failed',
- 'Total number of files analyzed',
- 'Number of runs with one region',
- 'Number of runs with two regions',
- 'Number of runs with too many regions or unbalanced 2 region runs']) + '\n')
- condensed_out.write('{}\t{}\t{}\t{}\t{}\n'.format(
- failed_run_counter,
- file_counter,
- run_counters['one'],
- run_counters['two'],
- run_counters['ambiguous']))
+ condensed_out.write(
+ "\t".join(
+ [
+ "Total number of files failed",
+ "Total number of files analyzed",
+ "Number of runs with one region",
+ "Number of runs with two regions",
+ "Number of runs with too many regions or unbalanced 2 region runs",
+ ]
+ )
+ + "\n"
+ )
+ condensed_out.write(
+ "{}\t{}\t{}\t{}\t{}\n".format(
+ failed_run_counter,
+ file_counter,
+ run_counters["one"],
+ run_counters["two"],
+ run_counters["ambiguous"],
+ )
+ )
  for key, value in seq_per_variable_region_count.items():
- seq_count_out.write('{}\t{}\n'.format(key, int(value)))
+ seq_count_out.write("{}\t{}\n".format(key, int(value)))

- logging.info('Analyzed {} files and {} sequences. Output generated for {} sequences'.format(
- file_counter, sequence_counter_total, sequence_counter_useful))
+ logging.info(
+ "Analyzed {} files and {} sequences. Output generated for {} sequences".format(
+ file_counter, sequence_counter_total, sequence_counter_useful
+ )
+ )


  def parse_args(argv):
- parser = argparse.ArgumentParser(description='Tool to determine which regions were amplified in 16S data')
- parser.add_argument('files', nargs='+',
- help='A list of overlapped tblout files')
- parser.add_argument('-d', '--output_dir', default='variable-region-inference',
- help='Directory to which results will be saved')
- parser.add_argument('-o', '--output_prefix', default='amplified_regions',
- help='Prefix for all outputs')
- parser.add_argument('--statistics', action='store_true',
- help='Print statistics files')
+ parser = argparse.ArgumentParser(
+ description="Tool to determine which regions were amplified in 16S data"
+ )
+ parser.add_argument("files", nargs="+", help="A list of overlapped tblout files")
+ parser.add_argument(
+ "-d",
+ "--output_dir",
+ default="variable-region-inference",
+ help="Directory to which results will be saved",
+ )
+ parser.add_argument(
+ "-o",
+ "--output_prefix",
+ default="amplified_regions",
+ help="Prefix for all outputs",
+ )
+ parser.add_argument(
+ "--statistics", action="store_true", help="Print statistics files"
+ )
  return parser.parse_args(argv)


@@ -414,33 +529,47 @@ def main(argv=None):
  if not os.path.isdir(args.output_dir):
  os.mkdir(args.output_dir)
  prefix = os.path.join(args.output_dir, args.output_prefix)
- stats_file = '{}.stats'.format(prefix) # detailed stats for each run before filtration steps
- condensed_stats_file = '{}.condensed_stats'.format(prefix) # basic stats for the batch of runs
- missing_files_log = '{}.missing_files.txt'.format(prefix) # the names of non-existent files
- seq_count_log = '{}.seq_count.txt'.format(prefix) # the number of sequences per domain/VR in the batch
- stats_out = open(stats_file, 'w')
- condensed_out = open(condensed_stats_file, 'w')
- missing_out = open(missing_files_log, 'w')
- seq_count_out = open(seq_count_log, 'w')
- stats_out.write('Run ID\tTotal # sequences\tFraction unsupported seq (map unsupported CM)\t'
- 'Fraction of sequences with start and/or end inside a VR\tFraction bacteria\t'
- 'Fraction archaea\tFraction eukaryotes\tUnidentified bact\tRegions bact\tFreqs bact\t'
- 'Unidentified arch\tRegions arch\tFreqs arch\tUnidentified euk\tRegions euk\tFreqs euk\n')
- retrieve_regions(args.files, prefix, stats_out, condensed_out, missing_out, seq_count_out)
+ stats_file = "{}.stats".format(
+ prefix
+ ) # detailed stats for each run before filtration steps
+ condensed_stats_file = "{}.condensed_stats".format(
+ prefix
+ ) # basic stats for the batch of runs
+ missing_files_log = "{}.missing_files.txt".format(
+ prefix
+ ) # the names of non-existent files
+ seq_count_log = "{}.seq_count.txt".format(
+ prefix
+ ) # the number of sequences per domain/VR in the batch
+ stats_out = open(stats_file, "w")
+ condensed_out = open(condensed_stats_file, "w")
+ missing_out = open(missing_files_log, "w")
+ seq_count_out = open(seq_count_log, "w")
+ stats_out.write(
+ "Run ID\tTotal # sequences\tFraction unsupported seq (map unsupported CM)\t"
+ "Fraction of sequences with start and/or end inside a VR\tFraction bacteria\t"
+ "Fraction archaea\tFraction eukaryotes\tUnidentified bact\tRegions bact\tFreqs bact\t"
+ "Unidentified arch\tRegions arch\tFreqs arch\tUnidentified euk\tRegions euk\tFreqs euk\n"
+ )
+ retrieve_regions(
+ args.files, prefix, stats_out, condensed_out, missing_out, seq_count_out
+ )
  stats_out.close()
  condensed_out.close()
  missing_out.close()
  seq_count_out.close()
  if not args.statistics:
- for s_file in (stats_file, condensed_stats_file, missing_files_log, seq_count_log):
+ for s_file in (
+ stats_file,
+ condensed_stats_file,
+ missing_files_log,
+ seq_count_log,
+ ):
  os.remove(s_file)
  t_stop = time.perf_counter()
  t_fact = t_stop - t_start
- logging.info('Elapsed time: {0:.2f} seconds'.format(t_fact))
+ logging.info("Elapsed time: {0:.2f} seconds".format(t_fact))


- if __name__ == '__main__':
+ if __name__ == "__main__":
  main()
-
- # don't print json
- # name the tsv file better