mgnify-pipelines-toolkit 1.4.1__tar.gz → 1.4.4__tar.gz
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release: this version of mgnify-pipelines-toolkit has been flagged as potentially problematic.
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/PKG-INFO +1 -1
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/amplicon/classify_var_regions.py +23 -80
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py +4 -12
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py +61 -21
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/amplicon/remove_ambiguous_reads.py +1 -3
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py +1 -3
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/assembly/add_rhea_chebi_annotation.py +4 -12
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/assembly/antismash_gff_builder.py +12 -37
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/assembly/combined_gene_caller_merge.py +12 -37
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/assembly/generate_gaf.py +2 -6
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/assembly/gff_annotation_utils.py +33 -91
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/assembly/gff_file_utils.py +6 -18
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/assembly/gff_toolkit.py +3 -9
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/assembly/go_utils.py +1 -3
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/assembly/krona_txt_from_cat_classification.py +5 -15
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/assembly/process_dbcan_result_cazys.py +3 -16
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/assembly/process_dbcan_result_clusters.py +5 -19
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/assembly/study_summary_generator.py +1 -1
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/assembly/summarise_antismash_bgcs.py +11 -22
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/assembly/summarise_goslims.py +5 -15
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/assembly/summarise_sanntis_bgcs.py +9 -29
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/shared/convert_cmscan_to_cmsearch_tblout.py +14 -24
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/shared/dwc_summary_generator.py +19 -72
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/shared/fastq_suffix_header_check.py +4 -12
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/shared/get_subunits.py +4 -12
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/shared/get_subunits_coords.py +2 -6
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/shared/mapseq2biom.py +3 -10
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/shared/markergene_study_summary.py +22 -85
- mgnify_pipelines_toolkit-1.4.4/mgnify_pipelines_toolkit/ena/webin_cli_handler.py +741 -0
- mgnify_pipelines_toolkit-1.4.4/mgnify_pipelines_toolkit/utils/__init__.py +0 -0
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/utils/fasta_to_delimited.py +5 -15
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit.egg-info/PKG-INFO +1 -1
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit.egg-info/SOURCES.txt +2 -0
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit.egg-info/entry_points.txt +1 -0
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/pyproject.toml +5 -1
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/LICENSE +0 -0
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/README.md +0 -0
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/__init__.py +0 -0
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/__init__.py +0 -0
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py +0 -0
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/amplicon/permute_primers.py +0 -0
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/amplicon/study_summary_generator.py +0 -0
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/genomes/__init__.py +0 -0
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/rawreads/study_summary_generator.py +0 -0
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/shared/__init__.py +0 -0
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/shared/amrintegrator.py +0 -0
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/shared/library_strategy_check.py +0 -0
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/constants/db_labels.py +0 -0
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/constants/ncrna.py +0 -0
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/constants/regex_fasta_header.py +0 -0
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/constants/tax_ranks.py +0 -0
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/constants/thresholds.py +0 -0
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/constants/var_region_coordinates.py +0 -0
- {mgnify_pipelines_toolkit-1.4.1/mgnify_pipelines_toolkit/utils → mgnify_pipelines_toolkit-1.4.4/mgnify_pipelines_toolkit/ena}/__init__.py +0 -0
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/schemas/dataframes.py +0 -0
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/utils/get_mpt_version.py +0 -0
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit.egg-info/dependency_links.txt +0 -0
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit.egg-info/requires.txt +0 -0
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit.egg-info/top_level.txt +0 -0
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/setup.cfg +0 -0
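
The per-file hunks are reproduced below. To spot-check them against the registry yourself, you can fetch both sdists and compare them locally; a minimal sketch in Python (assuming `pip` is on `PATH`, network access, and that both versions are still published on PyPI):

```python
import subprocess

# Fetch both source distributions from the registry
# (source only, no dependency resolution).
for version in ("1.4.1", "1.4.4"):
    subprocess.run(
        [
            "pip", "download", "--no-deps", "--no-binary", ":all:",
            f"mgnify-pipelines-toolkit=={version}",
        ],
        check=True,
    )

# Then unpack the two tarballs and diff the trees, e.g.:
#   tar xf mgnify_pipelines_toolkit-1.4.1.tar.gz
#   tar xf mgnify_pipelines_toolkit-1.4.4.tar.gz
#   diff -ruN mgnify_pipelines_toolkit-1.4.1 mgnify_pipelines_toolkit-1.4.4
```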
mgnify_pipelines_toolkit/analysis/amplicon/classify_var_regions.py (+23 -80):

```diff
@@ -73,11 +73,7 @@ def get_multiregion(raw_sequence_coords, regions):
         region_coverages[region] = overlap
 
     # check if any of the coords are inside the region
-    matched_regions = [
-        region
-        for region, limits in regions.items()
-        if calc_overlap(raw_sequence_coords, limits) >= MIN_OVERLAP
-    ]
+    matched_regions = [region for region, limits in regions.items() if calc_overlap(raw_sequence_coords, limits) >= MIN_OVERLAP]
     if len(matched_regions) > 1:
         amplified_region = "{}-{}".format(min(matched_regions), max(matched_regions))
     elif len(matched_regions) == 1:
@@ -121,13 +117,8 @@ def unsplit_region(long_region):
 
 
 def check_inclusiveness(more_frequent, less_frequent):
-    unsplit_more_frequent, unsplit_less_frequent = [
-        unsplit_region(region) for region in [more_frequent, less_frequent]
-    ]
-    return (
-        unsplit_more_frequent[0] <= unsplit_less_frequent[0]
-        and unsplit_more_frequent[1] >= unsplit_less_frequent[1]
-    )
+    unsplit_more_frequent, unsplit_less_frequent = [unsplit_region(region) for region in [more_frequent, less_frequent]]
+    return unsplit_more_frequent[0] <= unsplit_less_frequent[0] and unsplit_more_frequent[1] >= unsplit_less_frequent[1]
 
 
 def normalise_results(region_matches):
@@ -150,9 +141,7 @@ def normalise_results(region_matches):
         if count / len(region_matches) >= MAX_ERROR_PROPORTION and region != ""
     ]
     # sort by frequency in reverse order
-    var_region_proportions = sorted(
-        var_region_proportions, key=lambda x: x[1], reverse=True
-    )
+    var_region_proportions = sorted(var_region_proportions, key=lambda x: x[1], reverse=True)
 
     if len(var_region_proportions) == 1:
         return dict(var_region_proportions)
@@ -165,9 +154,7 @@ def normalise_results(region_matches):
         else:
             return None
     else:
-        if min(
-            more_frequent[1], less_frequent[1]
-        ) > 0.1 and not check_inclusiveness(less_frequent[0], more_frequent[0]):
+        if min(more_frequent[1], less_frequent[1]) > 0.1 and not check_inclusiveness(less_frequent[0], more_frequent[0]):
             return dict(var_region_proportions)
         else:
             return None
@@ -221,9 +208,7 @@ def determine_marker_gene(domain):
         return "18S"
 
 
-def print_stats(
-    run_id, num_sequences, num_unsupported, num_inside_vr, run_result, stats_out
-):
+def print_stats(run_id, num_sequences, num_unsupported, num_inside_vr, run_result, stats_out):
     summary_num = dict()
     for cm in run_result:
         summary_num[cm] = dict()
@@ -233,14 +218,7 @@ def print_stats(
             del stats[""]
         summary_num[cm]["regions"] = ", ".join(stats.keys())
         summary_num[cm]["freqs"] = ", ".join(
-            [
-                (
-                    "{0:.4f}".format(val / len(run_result[cm]))
-                    if len(run_result[cm]) > 0
-                    else "0"
-                )
-                for val in stats.values()
-            ]
+            [("{0:.4f}".format(val / len(run_result[cm])) if len(run_result[cm]) > 0 else "0") for val in stats.values()]
         )
 
     print_str = ""
@@ -291,9 +269,7 @@ def print_to_table(tsv_out, results, per_read_info):
         marker_gene = determine_marker_gene(domain)
         for vr in amplified_regions.keys():
             if not vr == "":
-                record = "{}\tECO_0000363\tautomatic assertion\t{}\t{}\n".format(
-                    run, determine_marker_gene(domain), vr
-                )
+                record = "{}\tECO_0000363\tautomatic assertion\t{}\t{}\n".format(run, determine_marker_gene(domain), vr)
                 records.add(record)
                 records_regions.add(f"{marker_gene}.{vr}\n")
                 gene_hv_to_write.append(f"{marker_gene}.{vr}")
@@ -325,9 +301,7 @@ def retrieve_regions(
     sequence_counter_total = 0  # count how many sequences in total were analyzed
     sequence_counter_useful = 0  # count how many sequences an output was generated for
     normalised_matches = dict()  # dictionary that will contain results for all runs
-    failed_run_counter = (
-        0  # total number of excluded runs for any reason (except non-existing files)
-    )
+    failed_run_counter = 0  # total number of excluded runs for any reason (except non-existing files)
     run_counters = {k: 0 for k in ["one", "two", "ambiguous"]}  # counters
     seq_per_variable_region_count = dict()
 
@@ -343,13 +317,9 @@ def retrieve_regions(
         data = load_data(tblout_file)
         run_id = identify_run(tblout_file)
         multiregion_matches = dict()
-        unsupported_matches = (
-            0  # tracks the number of sequences that map to unsupported models
-        )
+        unsupported_matches = 0  # tracks the number of sequences that map to unsupported models
         primer_inside_vr = 0  # tracks the number of sequences that start and/or end inside a variable region
-        per_read_info = (
-            dict()
-        )  # dictionary will contain read names for each variable region
+        per_read_info = dict()  # dictionary will contain read names for each variable region
         all_region_coverages = defaultdict(lambda: defaultdict(list))
         for read in data:
             # Example structure of `read`
@@ -362,18 +332,13 @@ def retrieve_regions(
             if not regions == "unsupported":
                 matches, coverages = get_multiregion(limits, regions)
 
-                [
-                    all_region_coverages[domain][region].append(coverage)
-                    for region, coverage in coverages.items()
-                ]
+                [all_region_coverages[domain][region].append(coverage) for region, coverage in coverages.items()]
 
                 multiregion_matches.setdefault(read[2], []).append(matches)
                 if check_primer_position(limits, regions):
                     primer_inside_vr += 1
                 sequence_counter_useful += 1
-                per_read_info.setdefault(marker_gene + "." + matches, []).append(
-                    read[0]
-                )
+                per_read_info.setdefault(marker_gene + "." + matches, []).append(read[0])
             else:
                 unsupported_matches += 1
 
@@ -394,11 +359,7 @@ def retrieve_regions(
         if unsupported_fract >= MAX_ERROR_PROPORTION:
             failed_run_counter += 1
             logging.info("No output will be produced - too many unsupported models")
-            logging.info(
-                "Excluded\t{}\t{}\t{}\n".format(
-                    tblout_file, "{0:.2f}".format(unsupported_fract), len(data)
-                )
-            )
+            logging.info("Excluded\t{}\t{}\t{}\n".format(tblout_file, "{0:.2f}".format(unsupported_fract), len(data)))
            continue
 
         normalised_matches[run_id] = dict()
@@ -451,9 +412,7 @@ def retrieve_regions(
             run_result[determine_domain(model)] = result
             for reg, freq in result.items():
                 total_useful_sequences += len(model_regions) * freq
-                temp_seq_counter[determine_domain(model) + " " + reg] = (
-                    len(model_regions) * freq
-                )
+                temp_seq_counter[determine_domain(model) + " " + reg] = len(model_regions) * freq
         if total_useful_sequences / len(data) < 0.75 and run_status != "ambiguous":
             failed_run_counter += 1
             logging.info("No output will be produced - too few useful sequences")
@@ -511,16 +470,12 @@ def retrieve_regions(
             seq_count_out.write("{}\t{}\n".format(key, int(value)))
 
     logging.info(
-        "Analyzed {} files and {} sequences. Output generated for {} sequences".format(
-            file_counter, sequence_counter_total, sequence_counter_useful
-        )
+        "Analyzed {} files and {} sequences. Output generated for {} sequences".format(file_counter, sequence_counter_total, sequence_counter_useful)
    )
 
 
 def parse_args(argv):
-    parser = argparse.ArgumentParser(
-        description="Tool to determine which regions were amplified in 16S data"
-    )
+    parser = argparse.ArgumentParser(description="Tool to determine which regions were amplified in 16S data")
     parser.add_argument("files", nargs="+", help="A list of overlapped tblout files")
     parser.add_argument(
         "-d",
@@ -534,9 +489,7 @@ def parse_args(argv):
         default="amplified_regions",
         help="Prefix for all outputs",
     )
-    parser.add_argument(
-        "--statistics", action="store_true", help="Print statistics files"
-    )
+    parser.add_argument("--statistics", action="store_true", help="Print statistics files")
     return parser.parse_args(argv)
 
 
@@ -546,18 +499,10 @@ def main(argv=None):
     if not os.path.isdir(args.output_dir):
         os.mkdir(args.output_dir)
     prefix = os.path.join(args.output_dir, args.output_prefix)
-    stats_file = "{}.stats".format(
-        prefix
-    )  # detailed stats for each run before filtration steps
-    condensed_stats_file = "{}.condensed_stats".format(
-        prefix
-    )  # basic stats for the batch of runs
-    missing_files_log = "{}.missing_files.txt".format(
-        prefix
-    )  # the names of non-existent files
-    seq_count_log = "{}.seq_count.txt".format(
-        prefix
-    )  # the number of sequences per domain/VR in the batch
+    stats_file = "{}.stats".format(prefix)  # detailed stats for each run before filtration steps
+    condensed_stats_file = "{}.condensed_stats".format(prefix)  # basic stats for the batch of runs
+    missing_files_log = "{}.missing_files.txt".format(prefix)  # the names of non-existent files
+    seq_count_log = "{}.seq_count.txt".format(prefix)  # the number of sequences per domain/VR in the batch
     stats_out = open(stats_file, "w")
     condensed_out = open(condensed_stats_file, "w")
     missing_out = open(missing_files_log, "w")
@@ -568,9 +513,7 @@ def main(argv=None):
         "Fraction archaea\tFraction eukaryotes\tUnidentified bact\tRegions bact\tFreqs bact\t"
         "Unidentified arch\tRegions arch\tFreqs arch\tUnidentified euk\tRegions euk\tFreqs euk\n"
     )
-    retrieve_regions(
-        args.files, prefix, stats_out, condensed_out, missing_out, seq_count_out
-    )
+    retrieve_regions(args.files, prefix, stats_out, condensed_out, missing_out, seq_count_out)
     stats_out.close()
     condensed_out.close()
     missing_out.close()
```
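
Almost all of the classify_var_regions.py hunks above are pure reformatting: statements previously wrapped across several lines are collapsed onto one line with no change in behaviour. The one piece of logic worth understanding is the matched_regions comprehension in the first hunk, which assigns a read to every variable region it covers sufficiently. A minimal standalone sketch, assuming calc_overlap returns the covered fraction of a region (its real definition, the real MIN_OVERLAP value, and the real region coordinates are not part of this diff):

```python
MIN_OVERLAP = 0.95  # assumed threshold; the real value lives in the toolkit's constants


def calc_overlap(read_coords, region_limits):
    # Fraction of the variable region covered by the read (assumed semantics).
    read_beg, read_end = read_coords
    region_beg, region_end = region_limits
    overlap = min(read_end, region_end) - max(read_beg, region_beg)
    return max(overlap, 0) / (region_end - region_beg)


regions = {"V3": (338, 533), "V4": (576, 682)}  # hypothetical 16S coordinates
read = (330, 700)
matched_regions = [region for region, limits in regions.items() if calc_overlap(read, limits) >= MIN_OVERLAP]
print(matched_regions)  # ['V3', 'V4'] -> reported as "V3-V4" via min/max
```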
mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py (+4 -12):

```diff
@@ -25,9 +25,7 @@ logging.basicConfig(level=logging.DEBUG)
 
 def parse_args():
     parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "-i", "--input", required=True, type=str, help="Input from MAPseq output"
-    )
+    parser.add_argument("-i", "--input", required=True, type=str, help="Input from MAPseq output")
     parser.add_argument(
         "-l",
         "--label",
@@ -135,19 +133,13 @@ def process_blank_tax_ends(res_df, ranks):
     for i in range(len(res_df)):
         last_empty_rank = ""
         currently_empty = False
-        for j in reversed(
-            range(len(ranks))
-        ):  # Parse an assignment backwards, from Species all the way to Superkingdom/Domain
+        for j in reversed(range(len(ranks))):  # Parse an assignment backwards, from Species all the way to Superkingdom/Domain
             curr_rank = res_df.iloc[i, j + 1]
             if curr_rank in ranks:
-                if (
-                    last_empty_rank == ""
-                ):  # Last rank is empty, start window of consecutive blanks
+                if last_empty_rank == "":  # Last rank is empty, start window of consecutive blanks
                     last_empty_rank = j + 1
                     currently_empty = True
-                elif (
-                    currently_empty
-                ):  # If we're in a window of consecutive blank assignments that started at the beginning
+                elif currently_empty:  # If we're in a window of consecutive blank assignments that started at the beginning
                     last_empty_rank = j + 1
             else:
                 break
```
mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py (+61 -21; a few removed lines were lost in extraction and are shown as bare `-` markers):

```diff
@@ -15,22 +15,25 @@
 # limitations under the License.
 
 import argparse
-
+import logging
 import re
+from collections import defaultdict
 
+import pandas as pd
 from Bio import SeqIO
 from Bio.Seq import Seq
-import pandas as pd
 
 from mgnify_pipelines_toolkit.constants.var_region_coordinates import (
-    REGIONS_16S_BACTERIA,
     REGIONS_16S_ARCHAEA,
+    REGIONS_16S_BACTERIA,
     REGIONS_18S,
 )
 
 STRAND_FWD = "fwd"
 STRAND_REV = "rev"
 
+logging.basicConfig(level=logging.INFO)
+
 
 def parse_args():
     parser = argparse.ArgumentParser()
@@ -65,23 +68,44 @@ def parse_args():
     return input, fasta, sample, single_end
 
 
-def get_amp_region(beg, end, strand, model):
+def get_amp_region(primer_beg: float, primer_end: float, strand: str, model: dict) -> str:
     prev_region = ""
 
+    # some valid primers go inside HV regions a little bit, this margin is to account for that
     margin = -10
 
     for region, region_coords in model.items():
-
+        # get current region start and end coordinates
         region_beg = region_coords[0]
-
-
-
-
-
+        region_end = region_coords[1]
+
+        # compute where primer beginning is in relation to current region
+        region_beg_primer_beg_diff = region_beg - primer_beg
+        region_beg_primer_end_diff = region_beg - primer_end
+        primer_beg_near_region_start = region_beg_primer_beg_diff >= margin
+        primer_end_near_region_start = region_beg_primer_end_diff >= margin
+
+        # compute where primer end is in relation to current region
+        region_end_primer_beg_diff = region_end - primer_beg
+        region_end_primer_end_diff = region_end - primer_end
+        primer_beg_before_region_end = region_end_primer_beg_diff >= margin
+        primer_end_before_region_end = region_end_primer_end_diff >= margin
+
+        if primer_beg_near_region_start and primer_end_near_region_start:
+            # if both these statements are true then primer is before a HV region
+            # i.e. validation = true
+            if strand == STRAND_FWD:
                 return region
-
-
+            else:
+                # if primer strand is REV then we return the previous region
                 return prev_region
+        elif primer_beg_before_region_end and primer_end_before_region_end:
+            # if the previous if statement is FALSE
+            # AND if both these statements are true then primer is within a HV region
+            # i.e. validation = false
+            logging.warning(f"This primer is within HV region {region}: {str(int(primer_beg))}-{str(int(primer_end))} vs {region_beg}-{region_end}")
+            return ""
+        # keep iterating through HV regions otherwise
 
         prev_region = region
 
@@ -89,10 +113,11 @@ def get_amp_region(beg, end, strand, model):
 
 
 def main():
-
     input, fasta, sample, single_end = parse_args()
     res_dict = defaultdict(list)
+
     fasta_dict = SeqIO.to_dict(SeqIO.parse(fasta, "fasta"))
+    logging.info(f"Total primers read (including permutations): {len(fasta_dict)}")
 
     fwd_primers_fw = open("./fwd_primers.fasta", "w")
     rev_primers_fw = open("./rev_primers.fasta", "w")
@@ -100,6 +125,7 @@ def main():
     matched_primers_list = []
 
     with open(input, "r") as fr:
+        logging.info(f"Reading deoverlap file: {input}")
         for line in fr:
             line = line.strip()
             line = re.sub("[ \t]+", "\t", line)
@@ -133,10 +159,6 @@ def main():
             amp_region = "Unknown"
             model = ""
 
-            res_dict["Run"].append(sample)
-            res_dict["AssertionEvidence"].append("ECO_0000363")
-            res_dict["AssertionMethod"].append("automatic assertion")
-
             strand = ""
 
             if primer_name[-1] == "F":
@@ -144,18 +166,26 @@ def main():
             elif primer_name[-1] == "R":
                 strand = STRAND_REV
             else:
-
+                logging.warning(f"Not sure what strand this is, skipping: {primer_name}")
+                continue
 
             if model:
+                logging.info(f"Checking match coordinates for primer {primer_name}")
                 amp_region = get_amp_region(beg, end, strand, model)
 
+                if not amp_region:
+                    logging.warning(f"Primer validation failed for {primer_name}, skipping")
+                    continue
+
             primer_seq = str(fasta_dict[cleaned_primer_name].seq)
 
+            res_dict["Run"].append(sample)
+            res_dict["AssertionEvidence"].append("ECO_0000363")
+            res_dict["AssertionMethod"].append("automatic assertion")
             res_dict["Gene"].append(gene)
             res_dict["VariableRegion"].append(amp_region)
             res_dict["PrimerName"].append(cleaned_primer_name)
             res_dict["PrimerStrand"].append(strand)
-            res_dict["PrimerSeq"].append(primer_seq)
 
             if strand == STRAND_FWD:
                 fwd_primers_fw.write(f">{cleaned_primer_name}\n{primer_seq}\n")
@@ -164,11 +194,21 @@ def main():
                 primer_seq = Seq(primer_seq).reverse_complement()
                 rev_primers_fw.write(f">{cleaned_primer_name}\n{primer_seq}\n")
 
+            res_dict["PrimerSeq"].append(primer_seq)
+
             matched_primers_list.append(cleaned_primer_name)
+            logging.info(f"Added {cleaned_primer_name} to list of matched primers")
 
-    res_df = pd.DataFrame.from_dict(res_dict)
     res_tsv_name = f"./{sample}_primer_validation.tsv"
-
+    if res_dict:
+        res_df = pd.DataFrame.from_dict(res_dict)
+        res_df.to_csv(res_tsv_name, sep="\t", index=False) if not res_df.empty else open(res_tsv_name, "w").close()
+        logging.info(f"{len(res_df)} primers validated, generating output")
+
+    else:
+        logging.warning("No primers were successfully validated, generating empty outputs")
+        primer_val_fw = open(res_tsv_name, "w")
+        primer_val_fw.close()
 
     fwd_primers_fw.close()
     rev_primers_fw.close()
```
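
primer_val_classification.py is the one file here with substantive behavioural changes: get_amp_region gains an annotated signature and a commented margin check, primers whose strand cannot be determined or whose validation fails are now skipped with a logged warning instead of producing partially filled rows, the PrimerSeq column is appended only after any reverse-complementing, and an empty TSV is written when nothing validates. The margin logic can be illustrated in isolation; the function below is a simplified re-derivation for a forward primer only, and the coordinates are hypothetical rather than the real REGIONS_16S_BACTERIA values:

```python
# Hypothetical V-region model; the toolkit's real coordinates live in
# mgnify_pipelines_toolkit.constants.var_region_coordinates.
MODEL = {"V1": (69, 92), "V2": (131, 239)}
MARGIN = -10  # tolerate primers that reach slightly into a region


def classify_fwd_primer(primer_beg, primer_end):
    # Simplified sketch of the diffed logic, forward strand only.
    for region, (region_beg, region_end) in MODEL.items():
        # Primer entirely upstream of this region (within the margin)?
        if region_beg - primer_beg >= MARGIN and region_beg - primer_end >= MARGIN:
            return region  # valid: amplification starts at this region
        # Otherwise, a primer still ahead of the region's end sits inside it.
        if region_end - primer_beg >= MARGIN and region_end - primer_end >= MARGIN:
            return ""  # invalid: mirrors the new warn-and-skip path
    return ""


print(classify_fwd_primer(100, 118))  # -> "V2": primer sits between V1 and V2
print(classify_fwd_primer(150, 168))  # -> "": primer lands inside V2
```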
mgnify_pipelines_toolkit/analysis/amplicon/remove_ambiguous_reads.py (+1 -3):

```diff
@@ -33,9 +33,7 @@ def parse_args():
         type=str,
         help="Path to forward (or single-end) fastq file",
     )
-    parser.add_argument(
-        "-r", "--rev", required=False, type=str, help="Path to reverse fastq file"
-    )
+    parser.add_argument("-r", "--rev", required=False, type=str, help="Path to reverse fastq file")
     parser.add_argument("-s", "--sample", required=True, type=str, help="Sample ID")
     args = parser.parse_args()
 
```
mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py (+1 -3):

```diff
@@ -55,9 +55,7 @@ def main():
         if "R" in primer_name:
             primers_dict[primer_key].seq = primer.seq.reverse_complement()
 
-    SeqIO.write(
-        primers_dict.values(), f"{output}/{sample}_rev_comp_se_primers.fasta", "fasta"
-    )
+    SeqIO.write(primers_dict.values(), f"{output}/{sample}_rev_comp_se_primers.fasta", "fasta")
 
 
 if __name__ == "__main__":
```
mgnify_pipelines_toolkit/analysis/assembly/add_rhea_chebi_annotation.py (+4 -12):

```diff
@@ -63,9 +63,7 @@ def process_lines(lines, output_handler, rhea2reaction_dict, protein_hashes):
 
 
 def main():
-    parser = argparse.ArgumentParser(
-        "Use diamond output file to create a table with Rhea and CHEBI reaction annotation for every protein."
-    )
+    parser = argparse.ArgumentParser("Use diamond output file to create a table with Rhea and CHEBI reaction annotation for every protein.")
     parser.add_argument(
         "-d",
         "--diamond_hits",
@@ -105,9 +103,7 @@ def main():
     proteins = args.proteins
     rhea2chebi = args.rhea2chebi
 
-    logging.info(
-        f"Step 1/3: Parse protein fasta and calculating SHA256 hash from {proteins.resolve()}"
-    )
+    logging.info(f"Step 1/3: Parse protein fasta and calculating SHA256 hash from {proteins.resolve()}")
     protein_hashes = {}
     with open(proteins, "r") as fasta_file:
         for record in SeqIO.parse(fasta_file, "fasta"):
@@ -118,17 +114,13 @@ def main():
     df = pd.read_csv(rhea2chebi, delimiter="\t")
     rhea2reaction_dict = dict(zip(df["ENTRY"], zip(df["EQUATION"], df["DEFINITION"])))
 
-    logging.info(
-        f"Step 3/3: Read DIAMOND results from {'STDIN' if diamond_hits == '-' else Path(diamond_hits).resolve()} and write output"
-    )
+    logging.info(f"Step 3/3: Read DIAMOND results from {'STDIN' if diamond_hits == '-' else Path(diamond_hits).resolve()} and write output")
     with open(output, "w") as output_handler:
         if diamond_hits == "-":
             process_lines(sys.stdin, output_handler, rhea2reaction_dict, protein_hashes)
         else:
             with open(diamond_hits, "r") as input_file:
-                process_lines(
-                    input_file, output_handler, rhea2reaction_dict, protein_hashes
-                )
+                process_lines(input_file, output_handler, rhea2reaction_dict, protein_hashes)
 
     logging.info("Processed successfully. Exiting.")
 
```
mgnify_pipelines_toolkit/analysis/assembly/antismash_gff_builder.py (+12 -37):

```diff
@@ -23,12 +23,8 @@ import pandas as pd
 
 def parse_args():
     parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "-i", "--input", required=True, type=str, help="Input JSON from antiSMASH"
-    )
-    parser.add_argument(
-        "-o", "--output", required=True, type=str, help="Output GFF3 file name"
-    )
+    parser.add_argument("-i", "--input", required=True, type=str, help="Input JSON from antiSMASH")
+    parser.add_argument("-o", "--output", required=True, type=str, help="Output GFF3 file name")
     parser.add_argument(
         "--cds_tag",
         default="ID",
@@ -57,17 +53,13 @@ def main():
     for record in antismash_analysis["records"]:
         record_id = record["id"]
 
-        iter_cds = (
-            "antismash.detection.genefunctions" in record["modules"].keys()
-        )  # Flag to iterate CDS
+        iter_cds = "antismash.detection.genefunctions" in record["modules"].keys()  # Flag to iterate CDS
         region_name = None
 
         for feature in record["features"]:
             if feature["type"] == "region":
                 # Annotate region features
-                region_name = (
-                    f"{record_id}_region{feature['qualifiers']['region_number'][0]}"
-                )
+                region_name = f"{record_id}_region{feature['qualifiers']['region_number'][0]}"
                 region_start = int(feature["location"].split(":")[0].split("[")[1])
                 region_end = int(feature["location"].split(":")[1].split("]")[0])
 
@@ -82,9 +74,7 @@ def main():
 
                 product = ",".join(feature["qualifiers"].get("product", []))
 
-                attributes_dict[region_name].update(
-                    {"ID": region_name, "product": product}
-                )
+                attributes_dict[region_name].update({"ID": region_name, "product": product})
 
             if iter_cds and feature["type"] == "CDS":
                 # Annotate CDS features
@@ -111,12 +101,8 @@ def main():
                 attributes_dict[locus_tag].update(
                     {
                         "ID": locus_tag,
-                        "as_type": ",".join(
-                            feature["qualifiers"].get("gene_kind", ["other"])
-                        ),
-                        "gene_functions": ",".join(
-                            feature["qualifiers"].get("gene_functions", [])
-                        )
+                        "as_type": ",".join(feature["qualifiers"].get("gene_kind", ["other"])),
+                        "gene_functions": ",".join(feature["qualifiers"].get("gene_functions", []))
                         .replace(" ", "_")
                         .replace(":_", ":")
                         .replace(";_", "%3B"),
@@ -126,9 +112,7 @@ def main():
 
         # Extended CDS attributes
         if "antismash.detection.hmm_detection" in record["modules"].keys():
-            cds_by_protocluster = record["modules"][
-                "antismash.detection.hmm_detection"
-            ]["rule_results"]["cds_by_protocluster"]
+            cds_by_protocluster = record["modules"]["antismash.detection.hmm_detection"]["rule_results"]["cds_by_protocluster"]
 
             if not cds_by_protocluster:
                 continue
@@ -137,14 +121,10 @@ def main():
                 if locus_tag := feature.get("cds_name"):
                     as_clusters = ",".join(list(feature["definition_domains"].keys()))
                     if locus_tag in attributes_dict:
-                        attributes_dict[locus_tag].update(
-                            {"as_gene_clusters": as_clusters}
-                        )
+                        attributes_dict[locus_tag].update({"as_gene_clusters": as_clusters})
 
         if "antismash.detection.genefunctions" in record["modules"].keys():
-            gene_function_tools = record["modules"][
-                "antismash.detection.genefunctions"
-            ]["tools"]
+            gene_function_tools = record["modules"]["antismash.detection.genefunctions"]["tools"]
             if tool_data := gene_function_tools.get("smcogs"):
 
                 for locus_tag in tool_data["best_hits"]:
@@ -158,18 +138,13 @@ def main():
                     if locus_tag in attributes_dict.keys():
                         attributes_dict[locus_tag].update({"as_notes": smcog_note})
 
-    attributes = [
-        ";".join(f"{k}={v}" for k, v in attrib_data.items() if v)
-        for attrib_data in attributes_dict.values()
-    ]
+    attributes = [";".join(f"{k}={v}" for k, v in attrib_data.items() if v) for attrib_data in attributes_dict.values()]
     res_dict["attributes"] = attributes
 
     res_df = pd.DataFrame.from_dict(res_dict)
 
     with open(output_file, "w") as f_out:
-        f_out.write(
-            "##gff-version 3\n"
-        )  # Save data to the GFF3 file with the proper header
+        f_out.write("##gff-version 3\n")  # Save data to the GFF3 file with the proper header
         res_df.to_csv(f_out, header=False, index=False, sep="\t")
 
```