mgnify-pipelines-toolkit 1.2.2.tar.gz → 1.2.4.tar.gz
This diff represents the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
Potentially problematic release: this version of mgnify-pipelines-toolkit might be problematic.
- {mgnify_pipelines_toolkit-1.2.2 → mgnify_pipelines_toolkit-1.2.4}/PKG-INFO +1 -1
- {mgnify_pipelines_toolkit-1.2.2 → mgnify_pipelines_toolkit-1.2.4}/mgnify_pipelines_toolkit/analysis/amplicon/classify_var_regions.py +35 -23
- mgnify_pipelines_toolkit-1.2.4/mgnify_pipelines_toolkit/analysis/amplicon/permute_primers.py +87 -0
- {mgnify_pipelines_toolkit-1.2.2 → mgnify_pipelines_toolkit-1.2.4}/mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py +50 -11
- {mgnify_pipelines_toolkit-1.2.2 → mgnify_pipelines_toolkit-1.2.4}/mgnify_pipelines_toolkit.egg-info/PKG-INFO +1 -1
- {mgnify_pipelines_toolkit-1.2.2 → mgnify_pipelines_toolkit-1.2.4}/mgnify_pipelines_toolkit.egg-info/SOURCES.txt +1 -0
- {mgnify_pipelines_toolkit-1.2.2 → mgnify_pipelines_toolkit-1.2.4}/mgnify_pipelines_toolkit.egg-info/entry_points.txt +1 -0
- {mgnify_pipelines_toolkit-1.2.2 → mgnify_pipelines_toolkit-1.2.4}/pyproject.toml +2 -1
- {mgnify_pipelines_toolkit-1.2.2 → mgnify_pipelines_toolkit-1.2.4}/LICENSE +0 -0
- {mgnify_pipelines_toolkit-1.2.2 → mgnify_pipelines_toolkit-1.2.4}/README.md +0 -0
- {mgnify_pipelines_toolkit-1.2.2 → mgnify_pipelines_toolkit-1.2.4}/mgnify_pipelines_toolkit/__init__.py +0 -0
- {mgnify_pipelines_toolkit-1.2.2 → mgnify_pipelines_toolkit-1.2.4}/mgnify_pipelines_toolkit/analysis/__init__.py +0 -0
- {mgnify_pipelines_toolkit-1.2.2 → mgnify_pipelines_toolkit-1.2.4}/mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py +0 -0
- {mgnify_pipelines_toolkit-1.2.2 → mgnify_pipelines_toolkit-1.2.4}/mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py +0 -0
- {mgnify_pipelines_toolkit-1.2.2 → mgnify_pipelines_toolkit-1.2.4}/mgnify_pipelines_toolkit/analysis/amplicon/remove_ambiguous_reads.py +0 -0
- {mgnify_pipelines_toolkit-1.2.2 → mgnify_pipelines_toolkit-1.2.4}/mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py +0 -0
- {mgnify_pipelines_toolkit-1.2.2 → mgnify_pipelines_toolkit-1.2.4}/mgnify_pipelines_toolkit/analysis/amplicon/study_summary_generator.py +0 -0
- {mgnify_pipelines_toolkit-1.2.2 → mgnify_pipelines_toolkit-1.2.4}/mgnify_pipelines_toolkit/analysis/assembly/add_rhea_chebi_annotation.py +0 -0
- {mgnify_pipelines_toolkit-1.2.2 → mgnify_pipelines_toolkit-1.2.4}/mgnify_pipelines_toolkit/analysis/assembly/antismash_gff_builder.py +0 -0
- {mgnify_pipelines_toolkit-1.2.2 → mgnify_pipelines_toolkit-1.2.4}/mgnify_pipelines_toolkit/analysis/assembly/combined_gene_caller_merge.py +0 -0
- {mgnify_pipelines_toolkit-1.2.2 → mgnify_pipelines_toolkit-1.2.4}/mgnify_pipelines_toolkit/analysis/assembly/generate_gaf.py +0 -0
- {mgnify_pipelines_toolkit-1.2.2 → mgnify_pipelines_toolkit-1.2.4}/mgnify_pipelines_toolkit/analysis/assembly/gff_annotation_utils.py +0 -0
- {mgnify_pipelines_toolkit-1.2.2 → mgnify_pipelines_toolkit-1.2.4}/mgnify_pipelines_toolkit/analysis/assembly/gff_file_utils.py +0 -0
- {mgnify_pipelines_toolkit-1.2.2 → mgnify_pipelines_toolkit-1.2.4}/mgnify_pipelines_toolkit/analysis/assembly/gff_toolkit.py +0 -0
- {mgnify_pipelines_toolkit-1.2.2 → mgnify_pipelines_toolkit-1.2.4}/mgnify_pipelines_toolkit/analysis/assembly/go_utils.py +0 -0
- {mgnify_pipelines_toolkit-1.2.2 → mgnify_pipelines_toolkit-1.2.4}/mgnify_pipelines_toolkit/analysis/assembly/krona_txt_from_cat_classification.py +0 -0
- {mgnify_pipelines_toolkit-1.2.2 → mgnify_pipelines_toolkit-1.2.4}/mgnify_pipelines_toolkit/analysis/assembly/process_dbcan_result_cazys.py +0 -0
- {mgnify_pipelines_toolkit-1.2.2 → mgnify_pipelines_toolkit-1.2.4}/mgnify_pipelines_toolkit/analysis/assembly/process_dbcan_result_clusters.py +0 -0
- {mgnify_pipelines_toolkit-1.2.2 → mgnify_pipelines_toolkit-1.2.4}/mgnify_pipelines_toolkit/analysis/assembly/study_summary_generator.py +0 -0
- {mgnify_pipelines_toolkit-1.2.2 → mgnify_pipelines_toolkit-1.2.4}/mgnify_pipelines_toolkit/analysis/assembly/summarise_antismash_bgcs.py +0 -0
- {mgnify_pipelines_toolkit-1.2.2 → mgnify_pipelines_toolkit-1.2.4}/mgnify_pipelines_toolkit/analysis/assembly/summarise_goslims.py +0 -0
- {mgnify_pipelines_toolkit-1.2.2 → mgnify_pipelines_toolkit-1.2.4}/mgnify_pipelines_toolkit/analysis/assembly/summarise_sanntis_bgcs.py +0 -0
- {mgnify_pipelines_toolkit-1.2.2 → mgnify_pipelines_toolkit-1.2.4}/mgnify_pipelines_toolkit/analysis/genomes/__init__.py +0 -0
- {mgnify_pipelines_toolkit-1.2.2 → mgnify_pipelines_toolkit-1.2.4}/mgnify_pipelines_toolkit/analysis/shared/__init__.py +0 -0
- {mgnify_pipelines_toolkit-1.2.2 → mgnify_pipelines_toolkit-1.2.4}/mgnify_pipelines_toolkit/analysis/shared/convert_cmscan_to_cmsearch_tblout.py +0 -0
- {mgnify_pipelines_toolkit-1.2.2 → mgnify_pipelines_toolkit-1.2.4}/mgnify_pipelines_toolkit/analysis/shared/dwc_summary_generator.py +0 -0
- {mgnify_pipelines_toolkit-1.2.2 → mgnify_pipelines_toolkit-1.2.4}/mgnify_pipelines_toolkit/analysis/shared/fastq_suffix_header_check.py +0 -0
- {mgnify_pipelines_toolkit-1.2.2 → mgnify_pipelines_toolkit-1.2.4}/mgnify_pipelines_toolkit/analysis/shared/get_subunits.py +0 -0
- {mgnify_pipelines_toolkit-1.2.2 → mgnify_pipelines_toolkit-1.2.4}/mgnify_pipelines_toolkit/analysis/shared/get_subunits_coords.py +0 -0
- {mgnify_pipelines_toolkit-1.2.2 → mgnify_pipelines_toolkit-1.2.4}/mgnify_pipelines_toolkit/analysis/shared/library_strategy_check.py +0 -0
- {mgnify_pipelines_toolkit-1.2.2 → mgnify_pipelines_toolkit-1.2.4}/mgnify_pipelines_toolkit/analysis/shared/mapseq2biom.py +0 -0
- {mgnify_pipelines_toolkit-1.2.2 → mgnify_pipelines_toolkit-1.2.4}/mgnify_pipelines_toolkit/analysis/shared/markergene_study_summary.py +0 -0
- {mgnify_pipelines_toolkit-1.2.2 → mgnify_pipelines_toolkit-1.2.4}/mgnify_pipelines_toolkit/constants/db_labels.py +0 -0
- {mgnify_pipelines_toolkit-1.2.2 → mgnify_pipelines_toolkit-1.2.4}/mgnify_pipelines_toolkit/constants/ncrna.py +0 -0
- {mgnify_pipelines_toolkit-1.2.2 → mgnify_pipelines_toolkit-1.2.4}/mgnify_pipelines_toolkit/constants/regex_fasta_header.py +0 -0
- {mgnify_pipelines_toolkit-1.2.2 → mgnify_pipelines_toolkit-1.2.4}/mgnify_pipelines_toolkit/constants/tax_ranks.py +0 -0
- {mgnify_pipelines_toolkit-1.2.2 → mgnify_pipelines_toolkit-1.2.4}/mgnify_pipelines_toolkit/constants/thresholds.py +0 -0
- {mgnify_pipelines_toolkit-1.2.2 → mgnify_pipelines_toolkit-1.2.4}/mgnify_pipelines_toolkit/constants/var_region_coordinates.py +0 -0
- {mgnify_pipelines_toolkit-1.2.2 → mgnify_pipelines_toolkit-1.2.4}/mgnify_pipelines_toolkit/schemas/schemas.py +0 -0
- {mgnify_pipelines_toolkit-1.2.2 → mgnify_pipelines_toolkit-1.2.4}/mgnify_pipelines_toolkit/utils/__init__.py +0 -0
- {mgnify_pipelines_toolkit-1.2.2 → mgnify_pipelines_toolkit-1.2.4}/mgnify_pipelines_toolkit/utils/fasta_to_delimited.py +0 -0
- {mgnify_pipelines_toolkit-1.2.2 → mgnify_pipelines_toolkit-1.2.4}/mgnify_pipelines_toolkit/utils/get_mpt_version.py +0 -0
- {mgnify_pipelines_toolkit-1.2.2 → mgnify_pipelines_toolkit-1.2.4}/mgnify_pipelines_toolkit.egg-info/dependency_links.txt +0 -0
- {mgnify_pipelines_toolkit-1.2.2 → mgnify_pipelines_toolkit-1.2.4}/mgnify_pipelines_toolkit.egg-info/requires.txt +0 -0
- {mgnify_pipelines_toolkit-1.2.2 → mgnify_pipelines_toolkit-1.2.4}/mgnify_pipelines_toolkit.egg-info/top_level.txt +0 -0
- {mgnify_pipelines_toolkit-1.2.2 → mgnify_pipelines_toolkit-1.2.4}/setup.cfg +0 -0
mgnify_pipelines_toolkit/analysis/amplicon/classify_var_regions.py

@@ -22,12 +22,12 @@ import os
import logging
import json
import time
+import numpy as np

from mgnify_pipelines_toolkit.constants.thresholds import (
    MIN_OVERLAP,
    MIN_SEQ_COUNT,
    MAX_ERROR_PROPORTION,
-    MAX_INTERNAL_PRIMER_PROPORTION,
)
from mgnify_pipelines_toolkit.constants.var_region_coordinates import (
    REGIONS_16S_BACTERIA,
@@ -62,8 +62,16 @@ def get_multiregion(raw_sequence_coords, regions):

    Returns:
        amplified_region: Amplified variable regions.
+        region_coverages: Coverage of all detected variable regions

    """
+
+    region_coverages = defaultdict(float)
+
+    for region, limits in regions.items():
+        overlap = calc_overlap(raw_sequence_coords, limits)
+        region_coverages[region] = overlap
+
    # check if any of the coords are inside the region
    matched_regions = [
        region
@@ -76,7 +84,7 @@ def get_multiregion(raw_sequence_coords, regions):
        amplified_region = matched_regions[0]
    else:
        amplified_region = ""
-    return amplified_region
+    return amplified_region, region_coverages


def check_primer_position(raw_sequence_coords, regions):
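For context, get_multiregion() now also returns per-region coverage values produced by calc_overlap(). The real calc_overlap() lives elsewhere in classify_var_regions.py and is not shown in this diff; the snippet below is only a hypothetical sketch of an overlap-fraction calculation of this kind, with made-up coordinates.

def overlap_fraction(read_coords, region_limits):
    # Hypothetical helper, not the toolkit's calc_overlap(): fraction of the
    # variable region that the read coordinates cover, clamped to [0.0, 1.0].
    read_start, read_end = sorted(read_coords)
    region_start, region_end = region_limits
    overlap = min(read_end, region_end) - max(read_start, region_start)
    return max(overlap, 0) / (region_end - region_start)

# A read spanning positions 30-520 fully covers a region defined as (341, 515):
print(overlap_fraction((30, 520), (341, 515)))  # 1.0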
@@ -90,7 +98,7 @@ def check_primer_position(raw_sequence_coords, regions):

    """
    result_flag = False
-    margin =
+    margin = 10  # allowed margin of error
    for coord in raw_sequence_coords:
        for region in regions.values():
            if coord in range(region[0] + margin, region[1] - margin):
@@ -342,22 +350,30 @@ def retrieve_regions(
        per_read_info = (
            dict()
        )  # dictionary will contain read names for each variable region
+        all_region_coverages = defaultdict(lambda: defaultdict(list))
        for read in data:
+            # Example structure of `read`
+            # ('ERR14650515.1', 'SSU_rRNA_archaea', 'RF01959', 'hmm', '3', '525', '1', '518', '+', '-', '6', '0.55', '0.6', '363.6', '7.8e-107')
            regions = determine_cm(read[2])
            sequence_counter_total += 1
            limits = list(map(int, read[4:6]))
            domain = determine_domain(read[2])
            marker_gene = determine_marker_gene(domain)
            if not regions == "unsupported":
-
-
-
+                matches, coverages = get_multiregion(limits, regions)
+
+                [
+                    all_region_coverages[domain][region].append(coverage)
+                    for region, coverage in coverages.items()
+                ]
+
+                multiregion_matches.setdefault(read[2], []).append(matches)
                if check_primer_position(limits, regions):
                    primer_inside_vr += 1
                    sequence_counter_useful += 1
-                    per_read_info.setdefault(
-
-                    )
+                    per_read_info.setdefault(marker_gene + "." + matches, []).append(
+                        read[0]
+                    )
            else:
                unsupported_matches += 1

@@ -385,18 +401,6 @@ def retrieve_regions(
            )
            continue

-        # filter out runs with too many sequences starting/ending inside variable regions
-        internal_seq_fract = primer_inside_vr / len(data)
-        if internal_seq_fract > MAX_INTERNAL_PRIMER_PROPORTION:
-            failed_run_counter += 1
-            logging.info("No output will be produced - too many internal mappings")
-            logging.info(
-                "Excluded due to high proportion of internal primers:\t{}\t{}\n".format(
-                    tblout_file, "{0:.2f}".format(internal_seq_fract)
-                )
-            )
-            continue
-
        normalised_matches[run_id] = dict()
        region_counter = defaultdict(int)

@@ -432,14 +436,12 @@ def retrieve_regions(
                multiregion_matches[model] = new_value

        [multiregion_matches.pop(model) for model in models_to_remove]
-        print(multiregion_matches)

        run_status = "one"
        run_result = dict()
        total_useful_sequences = 0.0
        temp_seq_counter = dict()
        for model, model_regions in multiregion_matches.items():
-            print(model)
            result = normalise_results(model_regions)
            if result is None:
                run_status = "ambiguous"
@@ -469,6 +471,16 @@ def retrieve_regions(
            logging.info("No output will be produced - the run is ambiguous.")
            continue

+        coverage_fw = open(f"{outfile_prefix}_all_coverages.txt", "w")
+
+        for domain, regions in all_region_coverages.items():
+            for region in regions:
+                if len(regions[region]) < MIN_SEQ_COUNT:
+                    continue
+                region_coverage = float(np.mean(regions[region]))
+                if region_coverage > 0:
+                    coverage_fw.write(f"{domain}:{region}: {region_coverage}\n")
+
        json_outfile = "{}.json".format(outfile_prefix)
        tsv_outfile = "{}.tsv".format(outfile_prefix)
        with open(json_outfile, "w") as f:
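The new block above writes a small text report of mean per-region coverages. A rough, self-contained sketch of that logic follows; MIN_SEQ_COUNT is really imported from the toolkit's thresholds module, so the value 20 below is assumed purely for illustration, as are the coverage numbers and file name.

from collections import defaultdict

import numpy as np

MIN_SEQ_COUNT = 20  # assumed value for this sketch only

all_region_coverages = defaultdict(lambda: defaultdict(list))
all_region_coverages["Bacteria"]["V3-V4"] = [0.92] * 30 + [0.88] * 10  # made-up coverages
all_region_coverages["Bacteria"]["V9"] = [0.05] * 5  # fewer reads than MIN_SEQ_COUNT, skipped

with open("EXAMPLE_all_coverages.txt", "w") as coverage_fw:
    for domain, regions in all_region_coverages.items():
        for region in regions:
            if len(regions[region]) < MIN_SEQ_COUNT:
                continue
            region_coverage = float(np.mean(regions[region]))
            if region_coverage > 0:
                coverage_fw.write(f"{domain}:{region}: {region_coverage}\n")
# EXAMPLE_all_coverages.txt then contains a line like "Bacteria:V3-V4: 0.91"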
mgnify_pipelines_toolkit/analysis/amplicon/permute_primers.py (new file)

@@ -0,0 +1,87 @@
+import argparse
+from itertools import product
+from pathlib import Path
+
+from Bio import SeqIO
+
+
+def parse_args():
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-i",
+        "--input_primers",
+        required=True,
+        type=str,
+        help="Input primers to generate permutations for due to IUPAC ambiguous codes",
+    )
+    parser.add_argument("-p", "--prefix", required=True, type=str, help="Output prefix")
+
+    args = parser.parse_args()
+
+    input_path = args.input_primers
+    prefix = args.prefix
+
+    return input_path, prefix
+
+
+def permute_seq(seq):
+
+    ambiguous_bases_dict = {
+        "R": ["A", "G"],
+        "Y": ["C", "T"],
+        "S": ["G", "C"],
+        "W": ["A", "T"],
+        "K": ["G", "T"],
+        "M": ["A", "C"],
+        "B": ["C", "G", "T"],
+        "D": ["A", "G", "T"],
+        "H": ["A", "C", "T"],
+        "V": ["A", "C", "G"],
+        "N": ["A", "C", "T", "G"],
+    }
+
+    seq_template = []
+
+    for base in seq:
+        if base in ["A", "C", "T", "G"]:
+            seq_template.append(base)
+        else:
+            seq_template.append(ambiguous_bases_dict[base])
+
+    seq_permutations = []
+    for combo in product(*seq_template):
+        seq_permutations.append("".join(combo))
+
+    return seq_permutations
+
+
+def make_primer_permutations(primers_dict, prefix):
+
+    with open(f"{prefix}_permuted_primers.fasta", "w") as fw:
+        for primer_name, seq in primers_dict.items():
+
+            primer_seq = seq.seq
+            fw.write(f">{primer_name}\n{primer_seq}\n")
+
+            if primer_name == "F_auto" or primer_name[-1] == "F":
+                strand = "F"
+            elif primer_name == "R_auto" or primer_name[-1] == "R":
+                strand = "R"
+
+            seq_permutations = permute_seq(primer_seq)
+
+            for counter, permuted_seq in enumerate(seq_permutations, 1):
+                variant_name = f"{primer_name}_variant_{counter}_{strand}"
+                fw.write(f">{variant_name}\n{permuted_seq}\n")
+
+
+def main():
+
+    input_path, prefix = parse_args()
+    primers_dict = SeqIO.to_dict(SeqIO.parse(Path(input_path), "fasta"))
+    make_primer_permutations(primers_dict, prefix)
+
+
+if __name__ == "__main__":
+    main()
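The new permute_primers.py expands every IUPAC ambiguity code in a primer into all concrete A/C/G/T variants via itertools.product. A quick illustration of permute_seq(), using the commonly used 515F 16S primer sequence as an example input (not taken from this diff):

from mgnify_pipelines_toolkit.analysis.amplicon.permute_primers import permute_seq

# "GTGYCAGCMGCCGCGGTAA" contains Y (C/T) and M (A/C), so 2 x 2 = 4 variants:
for variant in permute_seq("GTGYCAGCMGCCGCGGTAA"):
    print(variant)
# GTGCCAGCAGCCGCGGTAA
# GTGCCAGCCGCCGCGGTAA
# GTGTCAGCAGCCGCGGTAA
# GTGTCAGCCGCCGCGGTAA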
mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py

@@ -19,6 +19,7 @@ from collections import defaultdict
import re

from Bio import SeqIO
+from Bio.Seq import Seq
import pandas as pd

from mgnify_pipelines_toolkit.constants.var_region_coordinates import (
@@ -49,18 +50,26 @@ def parse_args():
        help="Path to concatenated primers fasta file",
    )
    parser.add_argument("-s", "--sample", required=True, type=str, help="Sample ID")
+    parser.add_argument(
+        "--se",
+        action=argparse.BooleanOptionalAction,
+        help="Flag for if run is single-end",
+    )
    args = parser.parse_args()

    input = args.input
    fasta = args.fasta
    sample = args.sample
+    single_end = args.se

-    return input, fasta, sample
+    return input, fasta, sample, single_end


def get_amp_region(beg, end, strand, model):
    prev_region = ""

+    margin = -10
+
    for region, region_coords in model.items():

        region_beg = region_coords[0]
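The new --se option uses argparse.BooleanOptionalAction (available since Python 3.9), which registers both --se and --no-se and leaves the value as None when neither is passed. A minimal standalone sketch of that behaviour:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--se",
    action=argparse.BooleanOptionalAction,
    help="Flag for if run is single-end",
)

print(parser.parse_args(["--se"]).se)     # True
print(parser.parse_args(["--no-se"]).se)  # False
print(parser.parse_args([]).se)           # None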
@@ -68,10 +77,10 @@ def get_amp_region(beg, end, strand, model):
        end_diff = region_beg - end

        if strand == STRAND_FWD:
-            if beg_diff
+            if beg_diff >= margin and end_diff >= margin:
                return region
        else:
-            if beg_diff
+            if beg_diff >= margin and end_diff >= margin:
                return prev_region

        prev_region = region
@@ -80,10 +89,16 @@ def get_amp_region(beg, end, strand, model):


def main():
-
+
+    input, fasta, sample, single_end = parse_args()
    res_dict = defaultdict(list)
    fasta_dict = SeqIO.to_dict(SeqIO.parse(fasta, "fasta"))

+    fwd_primers_fw = open("./fwd_primers.fasta", "w")
+    rev_primers_fw = open("./rev_primers.fasta", "w")
+
+    matched_primers_list = []
+
    with open(input, "r") as fr:
        for line in fr:
            line = line.strip()
@@ -95,6 +110,10 @@ def main():
            beg = float(line_lst[5])
            end = float(line_lst[6])

+            cleaned_primer_name = "_".join(primer_name.split("_")[0:-3])
+            if cleaned_primer_name in matched_primers_list:
+                continue
+
            if rfam == "RF00177":
                gene = "16S"
                model = REGIONS_16S_BACTERIA
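The name clean-up above strips the "_variant_<n>_<strand>" suffix that permute_primers.py appends to each permuted primer, so all variants of one primer collapse back to a single entry. A small worked example with a hypothetical primer name:

primer_name = "341F_variant_2_F"  # hypothetical hit name for a permuted primer
cleaned_primer_name = "_".join(primer_name.split("_")[0:-3])
print(cleaned_primer_name)  # 341F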
@@ -104,8 +123,12 @@ def main():
            elif rfam == "RF01960":
                gene = "18S"
                model = REGIONS_18S
-            else:
-
+            else:  # For cases when it's a std primer but for some reason hasn't matched the model
+                if cleaned_primer_name == "F_auto" or cleaned_primer_name == "R_auto":
+                    continue
+                gene = "Unknown"
+                amp_region = "Unknown"
+                model = ""

            res_dict["Run"].append(sample)
            res_dict["AssertionEvidence"].append("ECO_0000363")
@@ -113,23 +136,39 @@ def main():

            strand = ""

-            if "F"
+            if primer_name[-1] == "F":
                strand = STRAND_FWD
-            elif "R"
+            elif primer_name[-1] == "R":
                strand = STRAND_REV
+            else:
+                print(f"Not sure what strand this is, exiting: {primer_name}")
+
+            if model:
+                amp_region = get_amp_region(beg, end, strand, model)

-
-            primer_seq = str(fasta_dict[primer_name].seq)
+            primer_seq = str(fasta_dict[cleaned_primer_name].seq)

            res_dict["Gene"].append(gene)
            res_dict["VariableRegion"].append(amp_region)
-            res_dict["PrimerName"].append(
+            res_dict["PrimerName"].append(cleaned_primer_name)
            res_dict["PrimerStrand"].append(strand)
            res_dict["PrimerSeq"].append(primer_seq)

+            if strand == STRAND_FWD:
+                fwd_primers_fw.write(f">{cleaned_primer_name}\n{primer_seq}\n")
+            elif strand == STRAND_REV:
+                if single_end:
+                    primer_seq = Seq(primer_seq).reverse_complement()
+                rev_primers_fw.write(f">{cleaned_primer_name}\n{primer_seq}\n")
+
+            matched_primers_list.append(cleaned_primer_name)
+
    res_df = pd.DataFrame.from_dict(res_dict)
    res_df.to_csv(f"./{sample}_primer_validation.tsv", sep="\t", index=False)

+    fwd_primers_fw.close()
+    rev_primers_fw.close()
+

if __name__ == "__main__":
    main()
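For single-end runs (--se), the reverse primer is written to rev_primers.fasta as its reverse complement via Bio.Seq, as shown above. A brief illustration with a made-up reverse primer containing IUPAC codes (Biopython complements ambiguity codes as well):

from Bio.Seq import Seq

primer_seq = "GGACTACNVGGGTWTCTAAT"  # example reverse primer, not from this diff
print(Seq(primer_seq).reverse_complement())  # ATTAGAWACCCBNGTAGTCC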
mgnify_pipelines_toolkit.egg-info/SOURCES.txt

@@ -12,6 +12,7 @@ mgnify_pipelines_toolkit/analysis/__init__.py
mgnify_pipelines_toolkit/analysis/amplicon/classify_var_regions.py
mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py
mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py
+mgnify_pipelines_toolkit/analysis/amplicon/permute_primers.py
mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py
mgnify_pipelines_toolkit/analysis/amplicon/remove_ambiguous_reads.py
mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py
mgnify_pipelines_toolkit.egg-info/entry_points.txt

@@ -23,6 +23,7 @@ make_asv_count_table = mgnify_pipelines_toolkit.analysis.amplicon.make_asv_count
mapseq2biom = mgnify_pipelines_toolkit.analysis.shared.mapseq2biom:main
mapseq_to_asv_table = mgnify_pipelines_toolkit.analysis.amplicon.mapseq_to_asv_table:main
markergene_study_summary = mgnify_pipelines_toolkit.analysis.shared.markergene_study_summary:main
+permute_primers = mgnify_pipelines_toolkit.analysis.amplicon.permute_primers:main
primer_val_classification = mgnify_pipelines_toolkit.analysis.amplicon.primer_val_classification:main
process_dbcan_cazys = mgnify_pipelines_toolkit.analysis.assembly.process_dbcan_result_cazys:main
process_dbcan_clusters = mgnify_pipelines_toolkit.analysis.assembly.process_dbcan_result_clusters:main
pyproject.toml

@@ -1,6 +1,6 @@
[project]
name = "mgnify_pipelines_toolkit"
-version = "1.2.2"
+version = "1.2.4"
readme = "README.md"
license = { text = "Apache Software License 2.0" }
authors = [
@@ -58,6 +58,7 @@ make_asv_count_table = "mgnify_pipelines_toolkit.analysis.amplicon.make_asv_coun
remove_ambiguous_reads = "mgnify_pipelines_toolkit.analysis.amplicon.remove_ambiguous_reads:main"
rev_comp_se_primers = "mgnify_pipelines_toolkit.analysis.amplicon.rev_comp_se_primers:main"
mapseq_to_asv_table = "mgnify_pipelines_toolkit.analysis.amplicon.mapseq_to_asv_table:main"
+permute_primers = "mgnify_pipelines_toolkit.analysis.amplicon.permute_primers:main"
primer_val_classification = "mgnify_pipelines_toolkit.analysis.amplicon.primer_val_classification:main"
amplicon_study_summary_generator = "mgnify_pipelines_toolkit.analysis.amplicon.study_summary_generator:cli"
# analysis.assembly #
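With the new console script registered in both entry_points.txt and pyproject.toml, the permuter is installed as the `permute_primers` command in 1.2.4. A hedged sketch of the equivalent programmatic use, where "std_primers.fasta" and the prefix are hypothetical examples (the function names come from the new module above):

from pathlib import Path

from Bio import SeqIO

from mgnify_pipelines_toolkit.analysis.amplicon.permute_primers import make_primer_permutations

# "std_primers.fasta" is a hypothetical input file of (possibly ambiguous) primers.
primers = SeqIO.to_dict(SeqIO.parse(Path("std_primers.fasta"), "fasta"))
make_primer_permutations(primers, "ERR0000001")
# Writes ERR0000001_permuted_primers.fasta containing each original primer
# followed by all of its concrete variants, equivalent to running:
#   permute_primers -i std_primers.fasta -p ERR0000001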
All remaining files listed above are unchanged between 1.2.2 and 1.2.4.