mgnify-pipelines-toolkit 1.4.1__tar.gz → 1.4.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mgnify-pipelines-toolkit might be problematic.

Files changed (60)
  1. {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/PKG-INFO +1 -1
  2. {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/amplicon/classify_var_regions.py +23 -80
  3. {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py +4 -12
  4. {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py +61 -21
  5. {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/amplicon/remove_ambiguous_reads.py +1 -3
  6. {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py +1 -3
  7. {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/assembly/add_rhea_chebi_annotation.py +4 -12
  8. {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/assembly/antismash_gff_builder.py +12 -37
  9. {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/assembly/combined_gene_caller_merge.py +12 -37
  10. {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/assembly/generate_gaf.py +2 -6
  11. {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/assembly/gff_annotation_utils.py +33 -91
  12. {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/assembly/gff_file_utils.py +6 -18
  13. {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/assembly/gff_toolkit.py +3 -9
  14. {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/assembly/go_utils.py +1 -3
  15. {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/assembly/krona_txt_from_cat_classification.py +5 -15
  16. {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/assembly/process_dbcan_result_cazys.py +3 -16
  17. {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/assembly/process_dbcan_result_clusters.py +5 -19
  18. {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/assembly/study_summary_generator.py +1 -1
  19. {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/assembly/summarise_antismash_bgcs.py +11 -22
  20. {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/assembly/summarise_goslims.py +5 -15
  21. {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/assembly/summarise_sanntis_bgcs.py +9 -29
  22. {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/shared/convert_cmscan_to_cmsearch_tblout.py +14 -24
  23. {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/shared/dwc_summary_generator.py +19 -72
  24. {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/shared/fastq_suffix_header_check.py +4 -12
  25. {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/shared/get_subunits.py +4 -12
  26. {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/shared/get_subunits_coords.py +2 -6
  27. {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/shared/mapseq2biom.py +3 -10
  28. {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/shared/markergene_study_summary.py +22 -85
  29. mgnify_pipelines_toolkit-1.4.4/mgnify_pipelines_toolkit/ena/webin_cli_handler.py +741 -0
  30. mgnify_pipelines_toolkit-1.4.4/mgnify_pipelines_toolkit/utils/__init__.py +0 -0
  31. {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/utils/fasta_to_delimited.py +5 -15
  32. {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit.egg-info/PKG-INFO +1 -1
  33. {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit.egg-info/SOURCES.txt +2 -0
  34. {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit.egg-info/entry_points.txt +1 -0
  35. {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/pyproject.toml +5 -1
  36. {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/LICENSE +0 -0
  37. {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/README.md +0 -0
  38. {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/__init__.py +0 -0
  39. {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/__init__.py +0 -0
  40. {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py +0 -0
  41. {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/amplicon/permute_primers.py +0 -0
  42. {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/amplicon/study_summary_generator.py +0 -0
  43. {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/genomes/__init__.py +0 -0
  44. {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/rawreads/study_summary_generator.py +0 -0
  45. {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/shared/__init__.py +0 -0
  46. {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/shared/amrintegrator.py +0 -0
  47. {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/shared/library_strategy_check.py +0 -0
  48. {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/constants/db_labels.py +0 -0
  49. {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/constants/ncrna.py +0 -0
  50. {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/constants/regex_fasta_header.py +0 -0
  51. {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/constants/tax_ranks.py +0 -0
  52. {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/constants/thresholds.py +0 -0
  53. {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/constants/var_region_coordinates.py +0 -0
  54. {mgnify_pipelines_toolkit-1.4.1/mgnify_pipelines_toolkit/utils → mgnify_pipelines_toolkit-1.4.4/mgnify_pipelines_toolkit/ena}/__init__.py +0 -0
  55. {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/schemas/dataframes.py +0 -0
  56. {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/utils/get_mpt_version.py +0 -0
  57. {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit.egg-info/dependency_links.txt +0 -0
  58. {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit.egg-info/requires.txt +0 -0
  59. {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit.egg-info/top_level.txt +0 -0
  60. {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/setup.cfg +0 -0

PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: mgnify_pipelines_toolkit
- Version: 1.4.1
+ Version: 1.4.4
  Summary: Collection of scripts and tools for MGnify pipelines
  Author-email: MGnify team <metagenomics-help@ebi.ac.uk>
  License: Apache Software License 2.0

mgnify_pipelines_toolkit/analysis/amplicon/classify_var_regions.py

@@ -73,11 +73,7 @@ def get_multiregion(raw_sequence_coords, regions):
  region_coverages[region] = overlap

  # check if any of the coords are inside the region
- matched_regions = [
- region
- for region, limits in regions.items()
- if calc_overlap(raw_sequence_coords, limits) >= MIN_OVERLAP
- ]
+ matched_regions = [region for region, limits in regions.items() if calc_overlap(raw_sequence_coords, limits) >= MIN_OVERLAP]
  if len(matched_regions) > 1:
  amplified_region = "{}-{}".format(min(matched_regions), max(matched_regions))
  elif len(matched_regions) == 1:
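
A minimal, hypothetical sketch of the selection pattern kept by this hunk: a read's coordinates are compared against every variable region, and any region covered above a threshold is collected. The calc_overlap body, the MIN_OVERLAP value and the coordinates below are illustrative assumptions, not values taken from the package.

    # Illustrative sketch only; calc_overlap here is a stand-in, the real helper lives in classify_var_regions.py.
    MIN_OVERLAP = 0.95  # assumed threshold

    def calc_overlap(coords, limits):
        """Fraction of the region covered by the read (assumed definition)."""
        beg, end = coords
        r_beg, r_end = limits
        covered = max(0, min(end, r_end) - max(beg, r_beg))
        return covered / (r_end - r_beg)

    regions = {"V3": (338, 533), "V4": (576, 682)}  # toy 16S coordinates
    raw_sequence_coords = (340, 680)

    matched_regions = [region for region, limits in regions.items() if calc_overlap(raw_sequence_coords, limits) >= MIN_OVERLAP]

    if len(matched_regions) > 1:
        amplified_region = "{}-{}".format(min(matched_regions), max(matched_regions))
    elif len(matched_regions) == 1:
        amplified_region = matched_regions[0]
    else:
        amplified_region = ""

    print(amplified_region)  # prints "V3-V4" for this toy read
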
@@ -121,13 +117,8 @@ def unsplit_region(long_region):


  def check_inclusiveness(more_frequent, less_frequent):
- unsplit_more_frequent, unsplit_less_frequent = [
- unsplit_region(region) for region in [more_frequent, less_frequent]
- ]
- return (
- unsplit_more_frequent[0] <= unsplit_less_frequent[0]
- and unsplit_more_frequent[1] >= unsplit_less_frequent[1]
- )
+ unsplit_more_frequent, unsplit_less_frequent = [unsplit_region(region) for region in [more_frequent, less_frequent]]
+ return unsplit_more_frequent[0] <= unsplit_less_frequent[0] and unsplit_more_frequent[1] >= unsplit_less_frequent[1]


  def normalise_results(region_matches):
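
The containment test above says that the more frequent region's span must fully contain the less frequent one's. unsplit_region itself is not shown in this diff, so the helper below is a hypothetical stand-in that turns a name like "V3-V4" into a (first, last) pair; only the comparison mirrors the code in the hunk.

    # Hypothetical stand-in for unsplit_region; only the containment test mirrors the diff.
    def unsplit_region(long_region):
        parts = long_region.replace("V", "").split("-")
        return int(parts[0]), int(parts[-1])

    def check_inclusiveness(more_frequent, less_frequent):
        unsplit_more_frequent, unsplit_less_frequent = [unsplit_region(region) for region in [more_frequent, less_frequent]]
        return unsplit_more_frequent[0] <= unsplit_less_frequent[0] and unsplit_more_frequent[1] >= unsplit_less_frequent[1]

    print(check_inclusiveness("V3-V5", "V4"))  # True: V4 lies inside V3-V5
    print(check_inclusiveness("V3-V4", "V5"))  # False: V5 falls outside V3-V4
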
@@ -150,9 +141,7 @@ def normalise_results(region_matches):
  if count / len(region_matches) >= MAX_ERROR_PROPORTION and region != ""
  ]
  # sort by frequency in reverse order
- var_region_proportions = sorted(
- var_region_proportions, key=lambda x: x[1], reverse=True
- )
+ var_region_proportions = sorted(var_region_proportions, key=lambda x: x[1], reverse=True)

  if len(var_region_proportions) == 1:
  return dict(var_region_proportions)
@@ -165,9 +154,7 @@ def normalise_results(region_matches):
  else:
  return None
  else:
- if min(
- more_frequent[1], less_frequent[1]
- ) > 0.1 and not check_inclusiveness(less_frequent[0], more_frequent[0]):
+ if min(more_frequent[1], less_frequent[1]) > 0.1 and not check_inclusiveness(less_frequent[0], more_frequent[0]):
  return dict(var_region_proportions)
  else:
  return None
@@ -221,9 +208,7 @@ def determine_marker_gene(domain):
  return "18S"


- def print_stats(
- run_id, num_sequences, num_unsupported, num_inside_vr, run_result, stats_out
- ):
+ def print_stats(run_id, num_sequences, num_unsupported, num_inside_vr, run_result, stats_out):
  summary_num = dict()
  for cm in run_result:
  summary_num[cm] = dict()
@@ -233,14 +218,7 @@ def print_stats(
  del stats[""]
  summary_num[cm]["regions"] = ", ".join(stats.keys())
  summary_num[cm]["freqs"] = ", ".join(
- [
- (
- "{0:.4f}".format(val / len(run_result[cm]))
- if len(run_result[cm]) > 0
- else "0"
- )
- for val in stats.values()
- ]
+ [("{0:.4f}".format(val / len(run_result[cm])) if len(run_result[cm]) > 0 else "0") for val in stats.values()]
  )

  print_str = ""
@@ -291,9 +269,7 @@ def print_to_table(tsv_out, results, per_read_info):
  marker_gene = determine_marker_gene(domain)
  for vr in amplified_regions.keys():
  if not vr == "":
- record = "{}\tECO_0000363\tautomatic assertion\t{}\t{}\n".format(
- run, determine_marker_gene(domain), vr
- )
+ record = "{}\tECO_0000363\tautomatic assertion\t{}\t{}\n".format(run, determine_marker_gene(domain), vr)
  records.add(record)
  records_regions.add(f"{marker_gene}.{vr}\n")
  gene_hv_to_write.append(f"{marker_gene}.{vr}")
@@ -325,9 +301,7 @@ def retrieve_regions(
  sequence_counter_total = 0  # count how many sequences in total were analyzed
  sequence_counter_useful = 0  # count how many sequences an output was generated for
  normalised_matches = dict()  # dictionary that will contain results for all runs
- failed_run_counter = (
- 0  # total number of excluded runs for any reason (except non-existing files)
- )
+ failed_run_counter = 0  # total number of excluded runs for any reason (except non-existing files)
  run_counters = {k: 0 for k in ["one", "two", "ambiguous"]}  # counters
  seq_per_variable_region_count = dict()

@@ -343,13 +317,9 @@ def retrieve_regions(
  data = load_data(tblout_file)
  run_id = identify_run(tblout_file)
  multiregion_matches = dict()
- unsupported_matches = (
- 0  # tracks the number of sequences that map to unsupported models
- )
+ unsupported_matches = 0  # tracks the number of sequences that map to unsupported models
  primer_inside_vr = 0  # tracks the number of sequences that start and/or end inside a variable region
- per_read_info = (
- dict()
- )  # dictionary will contain read names for each variable region
+ per_read_info = dict()  # dictionary will contain read names for each variable region
  all_region_coverages = defaultdict(lambda: defaultdict(list))
  for read in data:
  # Example structure of `read`
@@ -362,18 +332,13 @@ def retrieve_regions(
  if not regions == "unsupported":
  matches, coverages = get_multiregion(limits, regions)

- [
- all_region_coverages[domain][region].append(coverage)
- for region, coverage in coverages.items()
- ]
+ [all_region_coverages[domain][region].append(coverage) for region, coverage in coverages.items()]

  multiregion_matches.setdefault(read[2], []).append(matches)
  if check_primer_position(limits, regions):
  primer_inside_vr += 1
  sequence_counter_useful += 1
- per_read_info.setdefault(marker_gene + "." + matches, []).append(
- read[0]
- )
+ per_read_info.setdefault(marker_gene + "." + matches, []).append(read[0])
  else:
  unsupported_matches += 1

@@ -394,11 +359,7 @@ def retrieve_regions(
  if unsupported_fract >= MAX_ERROR_PROPORTION:
  failed_run_counter += 1
  logging.info("No output will be produced - too many unsupported models")
- logging.info(
- "Excluded\t{}\t{}\t{}\n".format(
- tblout_file, "{0:.2f}".format(unsupported_fract), len(data)
- )
- )
+ logging.info("Excluded\t{}\t{}\t{}\n".format(tblout_file, "{0:.2f}".format(unsupported_fract), len(data)))
  continue

  normalised_matches[run_id] = dict()
@@ -451,9 +412,7 @@ def retrieve_regions(
  run_result[determine_domain(model)] = result
  for reg, freq in result.items():
  total_useful_sequences += len(model_regions) * freq
- temp_seq_counter[determine_domain(model) + " " + reg] = (
- len(model_regions) * freq
- )
+ temp_seq_counter[determine_domain(model) + " " + reg] = len(model_regions) * freq
  if total_useful_sequences / len(data) < 0.75 and run_status != "ambiguous":
  failed_run_counter += 1
  logging.info("No output will be produced - too few useful sequences")
@@ -511,16 +470,12 @@ def retrieve_regions(
  seq_count_out.write("{}\t{}\n".format(key, int(value)))

  logging.info(
- "Analyzed {} files and {} sequences. Output generated for {} sequences".format(
- file_counter, sequence_counter_total, sequence_counter_useful
- )
+ "Analyzed {} files and {} sequences. Output generated for {} sequences".format(file_counter, sequence_counter_total, sequence_counter_useful)
  )


  def parse_args(argv):
- parser = argparse.ArgumentParser(
- description="Tool to determine which regions were amplified in 16S data"
- )
+ parser = argparse.ArgumentParser(description="Tool to determine which regions were amplified in 16S data")
  parser.add_argument("files", nargs="+", help="A list of overlapped tblout files")
  parser.add_argument(
  "-d",
@@ -534,9 +489,7 @@ def parse_args(argv):
  default="amplified_regions",
  help="Prefix for all outputs",
  )
- parser.add_argument(
- "--statistics", action="store_true", help="Print statistics files"
- )
+ parser.add_argument("--statistics", action="store_true", help="Print statistics files")
  return parser.parse_args(argv)


@@ -546,18 +499,10 @@ def main(argv=None):
  if not os.path.isdir(args.output_dir):
  os.mkdir(args.output_dir)
  prefix = os.path.join(args.output_dir, args.output_prefix)
- stats_file = "{}.stats".format(
- prefix
- )  # detailed stats for each run before filtration steps
- condensed_stats_file = "{}.condensed_stats".format(
- prefix
- )  # basic stats for the batch of runs
- missing_files_log = "{}.missing_files.txt".format(
- prefix
- )  # the names of non-existent files
- seq_count_log = "{}.seq_count.txt".format(
- prefix
- )  # the number of sequences per domain/VR in the batch
+ stats_file = "{}.stats".format(prefix)  # detailed stats for each run before filtration steps
+ condensed_stats_file = "{}.condensed_stats".format(prefix)  # basic stats for the batch of runs
+ missing_files_log = "{}.missing_files.txt".format(prefix)  # the names of non-existent files
+ seq_count_log = "{}.seq_count.txt".format(prefix)  # the number of sequences per domain/VR in the batch
  stats_out = open(stats_file, "w")
  condensed_out = open(condensed_stats_file, "w")
  missing_out = open(missing_files_log, "w")
@@ -568,9 +513,7 @@ def main(argv=None):
  "Fraction archaea\tFraction eukaryotes\tUnidentified bact\tRegions bact\tFreqs bact\t"
  "Unidentified arch\tRegions arch\tFreqs arch\tUnidentified euk\tRegions euk\tFreqs euk\n"
  )
- retrieve_regions(
- args.files, prefix, stats_out, condensed_out, missing_out, seq_count_out
- )
+ retrieve_regions(args.files, prefix, stats_out, condensed_out, missing_out, seq_count_out)
  stats_out.close()
  condensed_out.close()
  missing_out.close()

mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py

@@ -25,9 +25,7 @@ logging.basicConfig(level=logging.DEBUG)

  def parse_args():
  parser = argparse.ArgumentParser()
- parser.add_argument(
- "-i", "--input", required=True, type=str, help="Input from MAPseq output"
- )
+ parser.add_argument("-i", "--input", required=True, type=str, help="Input from MAPseq output")
  parser.add_argument(
  "-l",
  "--label",
@@ -135,19 +133,13 @@ def process_blank_tax_ends(res_df, ranks):
  for i in range(len(res_df)):
  last_empty_rank = ""
  currently_empty = False
- for j in reversed(
- range(len(ranks))
- ):  # Parse an assignment backwards, from Species all the way to Superkingdom/Domain
+ for j in reversed(range(len(ranks))):  # Parse an assignment backwards, from Species all the way to Superkingdom/Domain
  curr_rank = res_df.iloc[i, j + 1]
  if curr_rank in ranks:
- if (
- last_empty_rank == ""
- ):  # Last rank is empty, start window of consecutive blanks
+ if last_empty_rank == "":  # Last rank is empty, start window of consecutive blanks
  last_empty_rank = j + 1
  currently_empty = True
- elif (
- currently_empty
- ):  # If we're in a window of consecutive blank assignments that started at the beginning
+ elif currently_empty:  # If we're in a window of consecutive blank assignments that started at the beginning
  last_empty_rank = j + 1
  else:
  break
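
A toy illustration of the backwards scan above: walk an assignment from the most specific rank towards Superkingdom/Domain and record where the trailing run of blank ranks begins. The column layout (ASV id followed by one column per rank) and the use of the bare rank prefix as the "blank" value are assumptions made only for this sketch.

    import pandas as pd

    ranks = ["sk__", "p__", "c__", "o__", "f__", "g__", "s__"]
    res_df = pd.DataFrame(
        [["asv_1", "sk__Bacteria", "p__Bacillota", "c__Bacilli", "o__", "f__", "g__", "s__"]],
        columns=["asv"] + ranks,
    )

    for i in range(len(res_df)):
        last_empty_rank = ""
        currently_empty = False
        for j in reversed(range(len(ranks))):  # from Species back towards Superkingdom
            curr_rank = res_df.iloc[i, j + 1]
            if curr_rank in ranks:  # still inside the trailing run of blank assignments
                if last_empty_rank == "":
                    last_empty_rank = j + 1
                    currently_empty = True
                elif currently_empty:
                    last_empty_rank = j + 1
            else:
                break
        print(last_empty_rank)  # 4: the first blank column of the trailing run ("o__")
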

mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py

@@ -15,22 +15,25 @@
  # limitations under the License.

  import argparse
- from collections import defaultdict
+ import logging
  import re
+ from collections import defaultdict

+ import pandas as pd
  from Bio import SeqIO
  from Bio.Seq import Seq
- import pandas as pd

  from mgnify_pipelines_toolkit.constants.var_region_coordinates import (
- REGIONS_16S_BACTERIA,
  REGIONS_16S_ARCHAEA,
+ REGIONS_16S_BACTERIA,
  REGIONS_18S,
  )

  STRAND_FWD = "fwd"
  STRAND_REV = "rev"

+ logging.basicConfig(level=logging.INFO)
+

  def parse_args():
  parser = argparse.ArgumentParser()
@@ -65,23 +68,44 @@ def parse_args():
  return input, fasta, sample, single_end


- def get_amp_region(beg, end, strand, model):
+ def get_amp_region(primer_beg: float, primer_end: float, strand: str, model: dict) -> str:
  prev_region = ""

+ # some valid primers go inside HV regions a little bit, this margin is to account for that
  margin = -10

  for region, region_coords in model.items():
-
+ # get current region start and end coordinates
  region_beg = region_coords[0]
- beg_diff = region_beg - beg
- end_diff = region_beg - end
-
- if strand == STRAND_FWD:
- if beg_diff >= margin and end_diff >= margin:
+ region_end = region_coords[1]
+
+ # compute where primer beginning is in relation to current region
+ region_beg_primer_beg_diff = region_beg - primer_beg
+ region_beg_primer_end_diff = region_beg - primer_end
+ primer_beg_near_region_start = region_beg_primer_beg_diff >= margin
+ primer_end_near_region_start = region_beg_primer_end_diff >= margin
+
+ # compute where primer end is in relation to current region
+ region_end_primer_beg_diff = region_end - primer_beg
+ region_end_primer_end_diff = region_end - primer_end
+ primer_beg_before_region_end = region_end_primer_beg_diff >= margin
+ primer_end_before_region_end = region_end_primer_end_diff >= margin
+
+ if primer_beg_near_region_start and primer_end_near_region_start:
+ # if both these statements are true then primer is before a HV region
+ # i.e. validation = true
+ if strand == STRAND_FWD:
  return region
- else:
- if beg_diff >= margin and end_diff >= margin:
+ else:
+ # if primer strand is REV then we return the previous region
  return prev_region
+ elif primer_beg_before_region_end and primer_end_before_region_end:
+ # if the previous if statement is FALSE
+ # AND if both these statements are true then primer is within a HV region
+ # i.e. validation = false
+ logging.warning(f"This primer is within HV region {region}: {str(int(primer_beg))}-{str(int(primer_end))} vs {region_beg}-{region_end}")
+ return ""
+ # keep iterating through HV regions otherwise

  prev_region = region

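
The rewritten get_amp_region returns the region downstream of a forward primer, the region upstream of a reverse primer, or an empty string when the primer sits inside a variable region. A hypothetical call, with a toy model dict that only imitates the shape of the REGIONS_* constants (region name mapped to start/end coordinates):

    toy_model = {"V3": (338.0, 533.0), "V4": (576.0, 682.0)}  # invented coordinates, not the real table

    get_amp_region(300.0, 320.0, "fwd", toy_model)  # "V3": forward primer ends before V3 starts
    get_amp_region(540.0, 560.0, "rev", toy_model)  # "V3": reverse primer binds just upstream of V4, so the previous region is returned
    get_amp_region(400.0, 420.0, "fwd", toy_model)  # "": primer falls inside V3, validation fails and a warning is logged
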
@@ -89,10 +113,11 @@


  def main():
-
  input, fasta, sample, single_end = parse_args()
  res_dict = defaultdict(list)
+
  fasta_dict = SeqIO.to_dict(SeqIO.parse(fasta, "fasta"))
+ logging.info(f"Total primers read (including permutations): {len(fasta_dict)}")

  fwd_primers_fw = open("./fwd_primers.fasta", "w")
  rev_primers_fw = open("./rev_primers.fasta", "w")
@@ -100,6 +125,7 @@ def main():
  matched_primers_list = []

  with open(input, "r") as fr:
+ logging.info(f"Reading deoverlap file: {input}")
  for line in fr:
  line = line.strip()
  line = re.sub("[ \t]+", "\t", line)
@@ -133,10 +159,6 @@ def main():
  amp_region = "Unknown"
  model = ""

- res_dict["Run"].append(sample)
- res_dict["AssertionEvidence"].append("ECO_0000363")
- res_dict["AssertionMethod"].append("automatic assertion")
-
  strand = ""

  if primer_name[-1] == "F":
@@ -144,18 +166,26 @@ def main():
  strand = STRAND_FWD
  elif primer_name[-1] == "R":
  strand = STRAND_REV
- print(f"Not sure what strand this is, exiting: {primer_name}")
+ logging.warning(f"Not sure what strand this is, skipping: {primer_name}")
+ continue

  if model:
+ logging.info(f"Checking match coordinates for primer {primer_name}")
  amp_region = get_amp_region(beg, end, strand, model)

+ if not amp_region:
+ logging.warning(f"Primer validation failed for {primer_name}, skipping")
+ continue
+
  primer_seq = str(fasta_dict[cleaned_primer_name].seq)

+ res_dict["Run"].append(sample)
+ res_dict["AssertionEvidence"].append("ECO_0000363")
+ res_dict["AssertionMethod"].append("automatic assertion")
  res_dict["Gene"].append(gene)
  res_dict["VariableRegion"].append(amp_region)
  res_dict["PrimerName"].append(cleaned_primer_name)
  res_dict["PrimerStrand"].append(strand)
- res_dict["PrimerSeq"].append(primer_seq)

  if strand == STRAND_FWD:
  fwd_primers_fw.write(f">{cleaned_primer_name}\n{primer_seq}\n")
@@ -164,11 +194,21 @@ def main():
  primer_seq = Seq(primer_seq).reverse_complement()
  rev_primers_fw.write(f">{cleaned_primer_name}\n{primer_seq}\n")

+ res_dict["PrimerSeq"].append(primer_seq)
+
  matched_primers_list.append(cleaned_primer_name)
+ logging.info(f"Added {cleaned_primer_name} to list of matched primers")

- res_df = pd.DataFrame.from_dict(res_dict)
  res_tsv_name = f"./{sample}_primer_validation.tsv"
- res_df.to_csv(res_tsv_name, sep="\t", index=False) if not res_df.empty else open(res_tsv_name, "w").close()
+ if res_dict:
+ res_df = pd.DataFrame.from_dict(res_dict)
+ res_df.to_csv(res_tsv_name, sep="\t", index=False) if not res_df.empty else open(res_tsv_name, "w").close()
+ logging.info(f"{len(res_df)} primers validated, generating output")
+
+ else:
+ logging.warning("No primers were successfully validated, generating empty outputs")
+ primer_val_fw = open(res_tsv_name, "w")
+ primer_val_fw.close()

  fwd_primers_fw.close()
  rev_primers_fw.close()
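
Moving the res_dict appends after the validation checks keeps every column list the same length, which is what pandas.DataFrame.from_dict needs when it builds a frame from a dict of lists. A minimal illustration with toy data (not the real primer table):

    from collections import defaultdict
    import pandas as pd

    res_dict = defaultdict(list)
    for name, region in [("27F", "V1"), ("bad_primer", ""), ("338R", "V2")]:
        if not region:  # validation failed: skip before touching any column
            continue
        res_dict["PrimerName"].append(name)
        res_dict["VariableRegion"].append(region)

    df = pd.DataFrame.from_dict(res_dict)  # columns stay equal-length, so this succeeds
    print(df)
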

mgnify_pipelines_toolkit/analysis/amplicon/remove_ambiguous_reads.py

@@ -33,9 +33,7 @@ def parse_args():
  type=str,
  help="Path to forward (or single-end) fastq file",
  )
- parser.add_argument(
- "-r", "--rev", required=False, type=str, help="Path to reverse fastq file"
- )
+ parser.add_argument("-r", "--rev", required=False, type=str, help="Path to reverse fastq file")
  parser.add_argument("-s", "--sample", required=True, type=str, help="Sample ID")
  args = parser.parse_args()


mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py

@@ -55,9 +55,7 @@ def main():
  if "R" in primer_name:
  primers_dict[primer_key].seq = primer.seq.reverse_complement()

- SeqIO.write(
- primers_dict.values(), f"{output}/{sample}_rev_comp_se_primers.fasta", "fasta"
- )
+ SeqIO.write(primers_dict.values(), f"{output}/{sample}_rev_comp_se_primers.fasta", "fasta")


  if __name__ == "__main__":
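
The hunk above only re-wraps a Biopython call; for reference, a minimal standalone sketch of the same pattern, reverse-complementing the records whose name marks them as reverse primers and writing everything back out as FASTA. The record names and output path are made up for this example.

    from Bio import SeqIO
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord

    primers_dict = {
        "27F": SeqRecord(Seq("AGAGTTTGATCMTGGCTCAG"), id="27F", description=""),
        "806R": SeqRecord(Seq("GGACTACNVGGGTWTCTAAT"), id="806R", description=""),
    }

    for primer_key, primer in primers_dict.items():
        if "R" in primer_key:
            primers_dict[primer_key].seq = primer.seq.reverse_complement()

    SeqIO.write(primers_dict.values(), "example_rev_comp_se_primers.fasta", "fasta")
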

mgnify_pipelines_toolkit/analysis/assembly/add_rhea_chebi_annotation.py

@@ -63,9 +63,7 @@ def process_lines(lines, output_handler, rhea2reaction_dict, protein_hashes):


  def main():
- parser = argparse.ArgumentParser(
- "Use diamond output file to create a table with Rhea and CHEBI reaction annotation for every protein."
- )
+ parser = argparse.ArgumentParser("Use diamond output file to create a table with Rhea and CHEBI reaction annotation for every protein.")
  parser.add_argument(
  "-d",
  "--diamond_hits",
@@ -105,9 +103,7 @@ def main():
  proteins = args.proteins
  rhea2chebi = args.rhea2chebi

- logging.info(
- f"Step 1/3: Parse protein fasta and calculating SHA256 hash from {proteins.resolve()}"
- )
+ logging.info(f"Step 1/3: Parse protein fasta and calculating SHA256 hash from {proteins.resolve()}")
  protein_hashes = {}
  with open(proteins, "r") as fasta_file:
  for record in SeqIO.parse(fasta_file, "fasta"):
@@ -118,17 +114,13 @@ def main():
  df = pd.read_csv(rhea2chebi, delimiter="\t")
  rhea2reaction_dict = dict(zip(df["ENTRY"], zip(df["EQUATION"], df["DEFINITION"])))

- logging.info(
- f"Step 3/3: Read DIAMOND results from {'STDIN' if diamond_hits == '-' else Path(diamond_hits).resolve()} and write output"
- )
+ logging.info(f"Step 3/3: Read DIAMOND results from {'STDIN' if diamond_hits == '-' else Path(diamond_hits).resolve()} and write output")
  with open(output, "w") as output_handler:
  if diamond_hits == "-":
  process_lines(sys.stdin, output_handler, rhea2reaction_dict, protein_hashes)
  else:
  with open(diamond_hits, "r") as input_file:
- process_lines(
- input_file, output_handler, rhea2reaction_dict, protein_hashes
- )
+ process_lines(input_file, output_handler, rhea2reaction_dict, protein_hashes)

  logging.info("Processed successfully. Exiting.")

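
The last hunk keeps the convention that "-" on the command line means "read DIAMOND hits from STDIN", while any other value is opened as a file; either handle is fed to the same line processor. A stripped-down sketch of that dispatch, in which process_lines is a placeholder rather than the real Rhea/ChEBI annotator:

    import sys

    def process_lines(lines, output_handler):
        for line in lines:
            output_handler.write(line.split("\t")[0] + "\n")  # placeholder transformation

    def run(diamond_hits: str, output: str) -> None:
        with open(output, "w") as output_handler:
            if diamond_hits == "-":
                process_lines(sys.stdin, output_handler)
            else:
                with open(diamond_hits, "r") as input_file:
                    process_lines(input_file, output_handler)

    # run("-", "out.tsv")          # stream hits piped from DIAMOND
    # run("hits.tsv", "out.tsv")   # or read a file written earlier
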

mgnify_pipelines_toolkit/analysis/assembly/antismash_gff_builder.py

@@ -23,12 +23,8 @@ import pandas as pd

  def parse_args():
  parser = argparse.ArgumentParser()
- parser.add_argument(
- "-i", "--input", required=True, type=str, help="Input JSON from antiSMASH"
- )
- parser.add_argument(
- "-o", "--output", required=True, type=str, help="Output GFF3 file name"
- )
+ parser.add_argument("-i", "--input", required=True, type=str, help="Input JSON from antiSMASH")
+ parser.add_argument("-o", "--output", required=True, type=str, help="Output GFF3 file name")
  parser.add_argument(
  "--cds_tag",
  default="ID",
@@ -57,17 +53,13 @@ def main():
  for record in antismash_analysis["records"]:
  record_id = record["id"]

- iter_cds = (
- "antismash.detection.genefunctions" in record["modules"].keys()
- )  # Flag to iterate CDS
+ iter_cds = "antismash.detection.genefunctions" in record["modules"].keys()  # Flag to iterate CDS
  region_name = None

  for feature in record["features"]:
  if feature["type"] == "region":
  # Annotate region features
- region_name = (
- f"{record_id}_region{feature['qualifiers']['region_number'][0]}"
- )
+ region_name = f"{record_id}_region{feature['qualifiers']['region_number'][0]}"
  region_start = int(feature["location"].split(":")[0].split("[")[1])
  region_end = int(feature["location"].split(":")[1].split("]")[0])

@@ -82,9 +74,7 @@ def main():

  product = ",".join(feature["qualifiers"].get("product", []))

- attributes_dict[region_name].update(
- {"ID": region_name, "product": product}
- )
+ attributes_dict[region_name].update({"ID": region_name, "product": product})

  if iter_cds and feature["type"] == "CDS":
  # Annotate CDS features
@@ -111,12 +101,8 @@ def main():
  attributes_dict[locus_tag].update(
  {
  "ID": locus_tag,
- "as_type": ",".join(
- feature["qualifiers"].get("gene_kind", ["other"])
- ),
- "gene_functions": ",".join(
- feature["qualifiers"].get("gene_functions", [])
- )
+ "as_type": ",".join(feature["qualifiers"].get("gene_kind", ["other"])),
+ "gene_functions": ",".join(feature["qualifiers"].get("gene_functions", []))
  .replace(" ", "_")
  .replace(":_", ":")
  .replace(";_", "%3B"),
@@ -126,9 +112,7 @@ def main():

  # Extended CDS attributes
  if "antismash.detection.hmm_detection" in record["modules"].keys():
- cds_by_protocluster = record["modules"][
- "antismash.detection.hmm_detection"
- ]["rule_results"]["cds_by_protocluster"]
+ cds_by_protocluster = record["modules"]["antismash.detection.hmm_detection"]["rule_results"]["cds_by_protocluster"]

  if not cds_by_protocluster:
  continue
@@ -137,14 +121,10 @@ def main():
  if locus_tag := feature.get("cds_name"):
  as_clusters = ",".join(list(feature["definition_domains"].keys()))
  if locus_tag in attributes_dict:
- attributes_dict[locus_tag].update(
- {"as_gene_clusters": as_clusters}
- )
+ attributes_dict[locus_tag].update({"as_gene_clusters": as_clusters})

  if "antismash.detection.genefunctions" in record["modules"].keys():
- gene_function_tools = record["modules"][
- "antismash.detection.genefunctions"
- ]["tools"]
+ gene_function_tools = record["modules"]["antismash.detection.genefunctions"]["tools"]
  if tool_data := gene_function_tools.get("smcogs"):

  for locus_tag in tool_data["best_hits"]:
@@ -158,18 +138,13 @@ def main():
  if locus_tag in attributes_dict.keys():
  attributes_dict[locus_tag].update({"as_notes": smcog_note})

- attributes = [
- ";".join(f"{k}={v}" for k, v in attrib_data.items() if v)
- for attrib_data in attributes_dict.values()
- ]
+ attributes = [";".join(f"{k}={v}" for k, v in attrib_data.items() if v) for attrib_data in attributes_dict.values()]
  res_dict["attributes"] = attributes

  res_df = pd.DataFrame.from_dict(res_dict)

  with open(output_file, "w") as f_out:
- f_out.write(
- "##gff-version 3\n"
- )  # Save data to the GFF3 file with the proper header
+ f_out.write("##gff-version 3\n")  # Save data to the GFF3 file with the proper header
  res_df.to_csv(f_out, header=False, index=False, sep="\t")

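
The final hunk keeps the one-liner that turns each feature's attribute dict into a GFF3 column-9 string, silently dropping empty values. A small standalone illustration with toy attribute dicts (not real antiSMASH output):

    attributes_dict = {
        "contig1_region1": {"ID": "contig1_region1", "product": "NRPS", "as_notes": ""},
        "gene_0001": {"ID": "gene_0001", "as_type": "biosynthetic", "as_gene_clusters": "NRPS"},
    }

    attributes = [";".join(f"{k}={v}" for k, v in attrib_data.items() if v) for attrib_data in attributes_dict.values()]

    print(attributes[0])  # ID=contig1_region1;product=NRPS  (empty as_notes is dropped)
    print(attributes[1])  # ID=gene_0001;as_type=biosynthetic;as_gene_clusters=NRPS
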