mgnify-pipelines-toolkit 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl
- mgnify_pipelines_toolkit/analysis/amplicon/amplicon_utils.py +74 -54
- mgnify_pipelines_toolkit/analysis/amplicon/are_there_primers.py +69 -42
- mgnify_pipelines_toolkit/analysis/amplicon/assess_inflection_point_mcp.py +120 -66
- mgnify_pipelines_toolkit/analysis/amplicon/assess_mcp_proportions.py +74 -45
- mgnify_pipelines_toolkit/analysis/amplicon/classify_var_regions.py +277 -148
- mgnify_pipelines_toolkit/analysis/amplicon/find_mcp_inflection_points.py +45 -28
- mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py +71 -40
- mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py +54 -16
- mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py +29 -12
- mgnify_pipelines_toolkit/analysis/amplicon/remove_ambiguous_reads.py +29 -19
- mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py +23 -13
- mgnify_pipelines_toolkit/analysis/amplicon/standard_primer_matching.py +127 -89
- mgnify_pipelines_toolkit/analysis/shared/fastq_suffix_header_check.py +140 -0
- mgnify_pipelines_toolkit/analysis/shared/get_subunits.py +55 -26
- mgnify_pipelines_toolkit/analysis/shared/get_subunits_coords.py +19 -13
- mgnify_pipelines_toolkit/analysis/shared/library_strategy_check.py +66 -0
- mgnify_pipelines_toolkit/analysis/shared/mapseq2biom.py +2 -2
- mgnify_pipelines_toolkit/constants/regex_ambiguous_bases.py +3 -5
- mgnify_pipelines_toolkit/constants/regex_fasta_header.py +20 -0
- mgnify_pipelines_toolkit/constants/tax_ranks.py +21 -2
- mgnify_pipelines_toolkit/constants/thresholds.py +4 -1
- mgnify_pipelines_toolkit/constants/var_region_coordinates.py +4 -4
- mgnify_pipelines_toolkit/utils/__init__.py +0 -0
- mgnify_pipelines_toolkit/utils/fasta_to_delimited.py +144 -0
- {mgnify_pipelines_toolkit-0.1.3.dist-info → mgnify_pipelines_toolkit-0.1.5.dist-info}/METADATA +18 -1
- mgnify_pipelines_toolkit-0.1.5.dist-info/RECORD +33 -0
- {mgnify_pipelines_toolkit-0.1.3.dist-info → mgnify_pipelines_toolkit-0.1.5.dist-info}/WHEEL +1 -1
- {mgnify_pipelines_toolkit-0.1.3.dist-info → mgnify_pipelines_toolkit-0.1.5.dist-info}/entry_points.txt +3 -0
- mgnify_pipelines_toolkit-0.1.3.dist-info/RECORD +0 -28
- {mgnify_pipelines_toolkit-0.1.3.dist-info → mgnify_pipelines_toolkit-0.1.5.dist-info}/LICENSE +0 -0
- {mgnify_pipelines_toolkit-0.1.3.dist-info → mgnify_pipelines_toolkit-0.1.5.dist-info}/top_level.txt +0 -0
--- a/mgnify_pipelines_toolkit/analysis/amplicon/find_mcp_inflection_points.py
+++ b/mgnify_pipelines_toolkit/analysis/amplicon/find_mcp_inflection_points.py
@@ -20,21 +20,29 @@ import argparse
 import pandas as pd
 import numpy as np
 
+
 def parse_args():
 
     parser = argparse.ArgumentParser()
 
-    parser.add_argument(
+    parser.add_argument(
+        "-i",
+        "--input",
+        required=True,
+        type=str,
+        help="Path to mcp tsv file to find inflection points",
+    )
     parser.add_argument("-s", "--sample", required=True, type=str, help="Sample ID")
     parser.add_argument("-o", "--output", required=True, type=str, help="Output path")
 
     args = parser.parse_args()
-
-    _PATH = args.input
-    _SAMPLE = args.sample
-    _OUTPUT = args.output
 
-
+    path = args.input
+    sample = args.sample
+    output = args.output
+
+    return path, sample, output
+
 
 def find_mcp_inf_points(mcp_df):
     """
@@ -50,45 +58,54 @@ def find_mcp_inf_points(mcp_df):
     """
 
     inf_point_dict = defaultdict(list)
-    start_indices = [
+    start_indices = [int(i) for i in mcp_df.columns.tolist()]
 
-    for i in range(len(mcp_df)):
+    for i in range(len(mcp_df)):  # Loop through both possible strands of the mcp_df
         strand = mcp_df.index[i]
         props = mcp_df.iloc[i].tolist()
-        props = [
+        props = [-val for val in props]
 
-        prop_diff = np.diff(props)/np.diff(start_indices)
-        infl_points = np.where(prop_diff > np.percentile(prop_diff, 80))[
+        prop_diff = np.diff(props) / np.diff(start_indices)  # Get the derivative
+        infl_points = np.where(prop_diff > np.percentile(prop_diff, 80))[
+            0
+        ]  # Grab points above 80th percentile
 
         for ind in infl_points:
             inf_point = start_indices[ind]
 
-            if
-
-
-
-
+            if (
+                inf_point < 10 or inf_point > 20
+            ):  # Rule to facilitate results - won't accept
+                continue  # points below index 10 or above index 20
+                # 10 means a cutoff of 15 and 20 a cutoff of 25
+                # literature points to no primers existing that are
+                # shorter or bigger than these lengths
+
+            inf_point_dict["strand"].append(strand)
+            inf_point_dict["inf_point"].append(inf_point)
 
-            inf_point_dict['strand'].append(strand)
-            inf_point_dict['inf_point'].append(inf_point)
-
     return inf_point_dict
 
+
 def main():
 
-
+    path, sample, output = parse_args()
 
-    mcp_df = pd.read_csv(
-    inf_point_dict = find_mcp_inf_points(mcp_df)
+    mcp_df = pd.read_csv(path, sep="\t", index_col=0)  # Read mcp_df
+    inf_point_dict = find_mcp_inf_points(mcp_df)  # Generate inflection points dict
 
-    if len(inf_point_dict) > 0:
-        inf_point_df = pd.DataFrame.from_dict(
-
+    if len(inf_point_dict) > 0:  # If the inf_point_dict isn't empty..
+        inf_point_df = pd.DataFrame.from_dict(
+            inf_point_dict
+        )  # .. turn it into a dataframe
+        inf_point_df.to_csv(
+            f"{output}/{sample}_inf_points.tsv", sep="\t", index=False
+        )  # ..save it to a .tsv file
 
-    else:
-        fw = open(f
+    else:  # If it is empty..
+        fw = open(f"{output}/{sample}_inf_points.tsv", "w")  # ..make an empty file
         fw.close()
 
 
 if __name__ == "__main__":
-    main()
+    main()
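Note on the reworked find_mcp_inf_points: it negates the MCP proportions, takes the discrete derivative against the start indices, and keeps positions whose slope exceeds the 80th percentile. A minimal sketch of that core step on made-up data (the start indices and proportions below are invented for illustration):

import numpy as np

# Invented MCP proportions for one strand; keys are start indices
start_indices = [5, 10, 15, 20, 25, 30]
props = [0.95, 0.94, 0.93, 0.60, 0.58, 0.57]

neg_props = [-v for v in props]  # Negate so a sharp drop becomes a sharp rise
prop_diff = np.diff(neg_props) / np.diff(start_indices)  # Discrete derivative

# Positions whose slope is above the 80th percentile are candidate inflection points
infl = np.where(prop_diff > np.percentile(prop_diff, 80))[0]
print([start_indices[i] for i in infl])  # [15] -> falls inside the 10..20 acceptance window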
--- a/mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py
+++ b/mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py
@@ -20,32 +20,48 @@ import logging
 
 import pandas as pd
 
-from mgnify_pipelines_toolkit.constants.tax_ranks import
+from mgnify_pipelines_toolkit.constants.tax_ranks import (
+    _SILVA_TAX_RANKS,
+    _PR2_TAX_RANKS,
+)
 
 logging.basicConfig(level=logging.DEBUG)
 
+
 def parse_args():
     parser = argparse.ArgumentParser()
 
-    parser.add_argument("-t", "--taxa", required=True, type=str, help="Path to taxa file")
-    parser.add_argument("-f", "--fwd", required=True, type=str, help="Path to DADA2 forward map file")
-    parser.add_argument("-r", "--rev", required=False, type=str, help="Path to DADA2 reverse map file")
     parser.add_argument(
-        "-
+        "-t", "--taxa", required=True, type=str, help="Path to taxa file"
+    )
+    parser.add_argument(
+        "-f", "--fwd", required=True, type=str, help="Path to DADA2 forward map file"
+    )
+    parser.add_argument(
+        "-r", "--rev", required=False, type=str, help="Path to DADA2 reverse map file"
+    )
+    parser.add_argument(
+        "-a",
+        "--amp",
+        required=True,
+        type=str,
+        help="Path to extracted amp_region reads from inference subworkflow",
+    )
+    parser.add_argument(
+        "-hd", "--headers", required=True, type=str, help="Path to fastq headers"
     )
-    parser.add_argument("-hd", "--headers", required=True, type=str, help="Path to fastq headers")
     parser.add_argument("-s", "--sample", required=True, type=str, help="Sample ID")
 
     args = parser.parse_args()
-
-    _TAXA = args.taxa
-    _FWD = args.fwd
-    _REV = args.rev
-    _AMP = args.amp
-    _HEADERS = args.headers
-    _SAMPLE = args.sample
 
-
+    taxa = args.taxa
+    fwd = args.fwd
+    rev = args.rev
+    amp = args.amp
+    headers = args.headers
+    sample = args.sample
+
+    return taxa, fwd, rev, amp, headers, sample
 
 
 def order_df(taxa_df):
@@ -59,6 +75,7 @@ def order_df(taxa_df):
 
     return taxa_df
 
+
 def make_tax_assignment_dict_silva(taxa_df, asv_dict):
     tax_assignment_dict = defaultdict(int)
 
@@ -93,7 +110,7 @@ def make_tax_assignment_dict_silva(taxa_df, asv_dict):
             k = "_".join(k.split(" "))
             tax_assignment += f"\t{k}"
         elif sk != "0":
-            tax_assignment +=
+            tax_assignment += "\tk__"
         else:
             break
 
@@ -136,9 +153,10 @@ def make_tax_assignment_dict_silva(taxa_df, asv_dict):
             continue
 
         tax_assignment_dict[tax_assignment] += asv_count
-
+
     return tax_assignment_dict
 
+
 def make_tax_assignment_dict_pr2(taxa_df, asv_dict):
     tax_assignment_dict = defaultdict(int)
 
@@ -223,26 +241,45 @@ def make_tax_assignment_dict_pr2(taxa_df, asv_dict):
 
     return tax_assignment_dict
 
+
+def generate_asv_count_dict(asv_dict):
+
+    res_dict = defaultdict(list)
+
+    for asv_id, count in asv_dict.items():
+
+        if count == 0:
+            continue
+
+        res_dict["asv"].append(asv_id)
+        res_dict["count"].append(count)
+
+    res_df = pd.DataFrame.from_dict(res_dict)
+    res_df = res_df.sort_values(by="asv", ascending=True)
+    res_df = res_df.sort_values(by="count", ascending=False)
+
+    return res_df
+
+
 def main():
-
+    taxa, fwd, rev, amp, headers, sample = parse_args()
 
-    fwd_fr = open(
+    fwd_fr = open(fwd, "r")
     paired_end = True
 
-    if
+    if rev is None:
         paired_end = False
         rev_fr = [True]
     else:
-        rev_fr = open(
+        rev_fr = open(rev, "r")
 
-    taxa_df = pd.read_csv(
+    taxa_df = pd.read_csv(taxa, sep="\t", dtype=str)
     taxa_df = taxa_df.fillna("0")
     taxa_df = order_df(taxa_df)
 
-    amp_reads = [read.strip() for read in list(open(
-    headers = [read.split(" ")[0][1:] for read in
-
-    amp_region = ".".join(_AMP.split(".")[1:3])
+    amp_reads = [read.strip() for read in list(open(amp, "r"))]
+    headers = [read.split(" ")[0][1:] for read in list(open(headers, "r"))]
+    amp_region = ".".join(amp.split(".")[1:3])
 
     asv_dict = defaultdict(int)
 
@@ -250,23 +287,12 @@ def main():
     for line_fwd in fwd_fr:
         counter += 1
         line_fwd = line_fwd.strip()
-        fwd_asvs = line_fwd.split(",")
-
-        if paired_end:
-            line_rev = next(rev_fr).strip()
-            rev_asvs = line_rev.split(",")
-            asv_intersection = list(set(fwd_asvs).intersection(rev_asvs))
-
-            if len(asv_intersection) == 0:
-                continue
 
-
-
-        else:
-            asv_intersection = fwd_asvs
+        if line_fwd == "0":
+            continue
 
         if headers[counter] in amp_reads:
-            asv_dict[f"seq_{
+            asv_dict[f"seq_{line_fwd}"] += 1
 
     fwd_fr.close()
     if paired_end:
@@ -281,10 +307,15 @@ def main():
         tax_assignment_dict = make_tax_assignment_dict_pr2(taxa_df, asv_dict)
         ref_db = "pr2"
 
-    with open(f"./{
+    with open(f"./{sample}_{amp_region}_{ref_db}_asv_krona_counts.txt", "w") as fw:
         for tax_assignment, count in tax_assignment_dict.items():
             fw.write(f"{count}\t{tax_assignment}\n")
 
+    asv_count_df = generate_asv_count_dict(asv_dict)
+    asv_count_df.to_csv(
+        f"./{sample}_{amp_region}_asv_read_counts.tsv", sep="\t", index=False
+    )
+
 
 if __name__ == "__main__":
     main()
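A note on the new generate_asv_count_dict: the two consecutive sort_values calls are meant to give counts in descending order with ASV IDs breaking ties, but chained sorts only guarantee that ordering with a stable sort kind. A small sketch with invented IDs and counts showing the equivalent single-call form:

import pandas as pd

# Invented ASV counts for illustration
df = pd.DataFrame({"asv": ["seq_3", "seq_1", "seq_2"], "count": [5, 9, 5]})

# Count descending, ASV ID ascending as the tiebreaker; a single sort avoids
# relying on sort stability (chained sorts would need kind="stable")
df = df.sort_values(by=["count", "asv"], ascending=[False, True])
print(df.to_string(index=False))  # seq_1 9, then seq_2 5, then seq_3 5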
--- a/mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py
+++ b/mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py
@@ -22,9 +22,12 @@ import pandas as pd
 
 logging.basicConfig(level=logging.DEBUG)
 
+
 def parse_args():
     parser = argparse.ArgumentParser()
-    parser.add_argument(
+    parser.add_argument(
+        "-i", "--input", required=True, type=str, help="Input from MAPseq output"
+    )
     parser.add_argument(
         "-l",
         "--label",
@@ -37,18 +40,48 @@ def parse_args():
 
     args = parser.parse_args()
 
-
-
-
+    input = args.input
+    label = args.label
+    sample = args.sample
+
+    return input, label, sample
 
-    return _INPUT, _LABEL, _SAMPLE
 
 def parse_label(label):
     silva_short_ranks = ["sk__", "k__", "p__", "c__", "o__", "f__", "g__", "s__"]
-    pr2_short_ranks = [
-
-
-
+    pr2_short_ranks = [
+        "d__",
+        "sg__",
+        "dv__",
+        "sdv__",
+        "c__",
+        "o__",
+        "f__",
+        "g__",
+        "s__",
+    ]
+
+    silva_long_ranks = [
+        "Superkingdom",
+        "Kingdom",
+        "Phylum",
+        "Class",
+        "Order",
+        "Family",
+        "Genus",
+        "Species",
+    ]
+    pr2_long_ranks = [
+        "Domain",
+        "Supergroup",
+        "Division",
+        "Subdivision",
+        "Class",
+        "Order",
+        "Family",
+        "Genus",
+        "Species",
+    ]
 
     chosen_short_ranks = ""
     chosen_long_ranks = ""
@@ -65,6 +98,7 @@ def parse_label(label):
 
     return chosen_short_ranks, chosen_long_ranks
 
+
 def parse_mapseq(mseq_df, short_ranks, long_ranks):
     res_dict = defaultdict(list)
 
@@ -91,7 +125,8 @@ def parse_mapseq(mseq_df, short_ranks, long_ranks):
             res_dict[curr_rank].append(curr_tax)
     res_df = pd.DataFrame.from_dict(res_dict)
 
-    return
+    return res_df
+
 
 def process_blank_tax_ends(res_df, ranks):
     # Necessary function as we want to replace consecutive blank assignments that start at the last rank as NAs
@@ -105,7 +140,9 @@ def process_blank_tax_ends(res_df, ranks):
         ):  # Parse an assignment backwards, from Species all the way to Superkingdom/Domain
             curr_rank = res_df.iloc[i, j + 1]
             if curr_rank in ranks:
-                if
+                if (
+                    last_empty_rank == ""
+                ):  # Last rank is empty, start window of consecutive blanks
                     last_empty_rank = j + 1
                     currently_empty = True
             elif (
@@ -124,16 +161,17 @@ def process_blank_tax_ends(res_df, ranks):
     return res_df
 
 
-def main():
-
+def main():
+    input, label, sample = parse_args()
 
-    mseq_df = pd.read_csv(
+    mseq_df = pd.read_csv(input, header=0, delim_whitespace=True, usecols=[0, 12])
 
-    short_ranks, long_ranks = parse_label(
+    short_ranks, long_ranks = parse_label(label)
     res_df = parse_mapseq(mseq_df, short_ranks, long_ranks)
     final_res_df = process_blank_tax_ends(res_df, short_ranks)
 
-    final_res_df.to_csv(f"./{
+    final_res_df.to_csv(f"./{sample}_{label}_asv_taxa.tsv", sep="\t", index=False)
+
 
 if __name__ == "__main__":
     main()
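For reference, parse_label chooses between the SILVA and PR2 rank vocabularies listed above; pairing the short prefixes with the long names is a plain zip. A sketch using the PR2 lists from this diff:

# Short PR2 rank prefixes and their long names, as listed in parse_label
pr2_short_ranks = ["d__", "sg__", "dv__", "sdv__", "c__", "o__", "f__", "g__", "s__"]
pr2_long_ranks = [
    "Domain", "Supergroup", "Division", "Subdivision",
    "Class", "Order", "Family", "Genus", "Species",
]

rank_map = dict(zip(pr2_short_ranks, pr2_long_ranks))
print(rank_map["sdv__"])  # Subdivision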
--- a/mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py
+++ b/mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py
@@ -21,24 +21,41 @@ import re
 from Bio import SeqIO
 import pandas as pd
 
-from mgnify_pipelines_toolkit.constants.var_region_coordinates import
+from mgnify_pipelines_toolkit.constants.var_region_coordinates import (
+    REGIONS_16S_BACTERIA,
+    REGIONS_16S_ARCHAEA,
+    REGIONS_18S,
+)
 
 STRAND_FWD = "fwd"
 STRAND_REV = "rev"
 
+
 def parse_args():
     parser = argparse.ArgumentParser()
 
-    parser.add_argument(
-
+    parser.add_argument(
+        "-i",
+        "--input",
+        required=True,
+        type=str,
+        help="Path to cmsearch_deoverlap_tblout file",
+    )
+    parser.add_argument(
+        "-f",
+        "--fasta",
+        required=True,
+        type=str,
+        help="Path to concatenated primers fasta file",
+    )
     parser.add_argument("-s", "--sample", required=True, type=str, help="Sample ID")
     args = parser.parse_args()
 
-
-
-
+    input = args.input
+    fasta = args.fasta
+    sample = args.sample
 
-    return
+    return input, fasta, sample
 
 
 def get_amp_region(beg, strand, model):
@@ -62,11 +79,11 @@ def get_amp_region(beg, strand, model):
 
 
 def main():
-
+    input, fasta, sample = parse_args()
     res_dict = defaultdict(list)
-    fasta_dict = SeqIO.to_dict(SeqIO.parse(
+    fasta_dict = SeqIO.to_dict(SeqIO.parse(fasta, "fasta"))
 
-    with open(
+    with open(input, "r") as fr:
         for line in fr:
             line = line.strip()
             line = re.sub("[ \t]+", "\t", line)
@@ -88,7 +105,7 @@ def main():
             else:
                 continue
 
-            res_dict["Run"].append(
+            res_dict["Run"].append(sample)
             res_dict["AssertionEvidence"].append("ECO_0000363")
             res_dict["AssertionMethod"].append("automatic assertion")
 
@@ -109,7 +126,7 @@ def main():
             res_dict["PrimerSeq"].append(primer_seq)
 
     res_df = pd.DataFrame.from_dict(res_dict)
-    res_df.to_csv(f"./{
+    res_df.to_csv(f"./{sample}_primer_validation.tsv", sep="\t", index=False)
 
 
 if __name__ == "__main__":
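Context for the re.sub call above: cmsearch tblout output is aligned with variable runs of spaces rather than tabs, so the loop collapses any run of spaces/tabs into a single tab before splitting into fields. A minimal sketch (the tblout-style line below is fabricated):

import re

# Fabricated, space-aligned tblout-style line
line = "read_1   -   SSU_rRNA_bacteria   RF00177   hmm   1   90"
fields = re.sub("[ \t]+", "\t", line.strip()).split("\t")
print(fields[2])  # SSU_rRNA_bacteria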
--- a/mgnify_pipelines_toolkit/analysis/amplicon/remove_ambiguous_reads.py
+++ b/mgnify_pipelines_toolkit/analysis/amplicon/remove_ambiguous_reads.py
@@ -21,39 +21,48 @@ import gzip
 
 from Bio import SeqIO, bgzf
 
+
 def parse_args():
 
     parser = argparse.ArgumentParser()
 
-    parser.add_argument(
-
+    parser.add_argument(
+        "-f",
+        "--fwd",
+        required=True,
+        type=str,
+        help="Path to forward (or single-end) fastq file",
+    )
+    parser.add_argument(
+        "-r", "--rev", required=False, type=str, help="Path to reverse fastq file"
+    )
     parser.add_argument("-s", "--sample", required=True, type=str, help="Sample ID")
     args = parser.parse_args()
-
-    _FWD = args.fwd
-    _REV = args.rev
-    _SAMPLE = args.sample
 
-
+    fwd = args.fwd
+    rev = args.rev
+    sample = args.sample
+
+    return fwd, rev, sample
 
 
 def main():
 
-
+    fwd, rev, sample = parse_args()
 
-    fwd_handle = gzip.open(
+    fwd_handle = gzip.open(fwd, "rt")
     fwd_reads = SeqIO.to_dict(SeqIO.parse(fwd_handle, "fastq"))
     fwd_handle.close()
 
     paired_end = True
 
-    if
+    if rev is None:
         paired_end = False
     else:
-        rev_handle = gzip.open(
+        rev_handle = gzip.open(rev, "rt")
         rev_reads = SeqIO.to_dict(SeqIO.parse(rev_handle, "fastq"))
         rev_handle.close()
-
+
     remove_set = set()
 
     for read_id in fwd_reads.keys():
@@ -78,23 +87,24 @@ def main():
             remove_set.add(read_id)
             continue
 
-    [
+    [fwd_reads.pop(read_id) for read_id in remove_set]
     if paired_end:
-        [
+        [rev_reads.pop(read_id) for read_id in remove_set]
 
     if paired_end:
-        fwd_handle = bgzf.BgzfWriter(f"./{
-        rev_handle = bgzf.BgzfWriter(f"./{
-
+        fwd_handle = bgzf.BgzfWriter(f"./{sample}_noambig_1.fastq.gz", "wb")
+        rev_handle = bgzf.BgzfWriter(f"./{sample}_noambig_2.fastq.gz", "wb")
+
         SeqIO.write(sequences=fwd_reads.values(), handle=fwd_handle, format="fastq")
         SeqIO.write(sequences=rev_reads.values(), handle=rev_handle, format="fastq")
 
         fwd_handle.close()
         rev_handle.close()
     else:
-        fwd_handle = bgzf.BgzfWriter(f"./{
+        fwd_handle = bgzf.BgzfWriter(f"./{sample}_noambig.fastq.gz", "wb")
        SeqIO.write(sequences=fwd_reads.values(), handle=fwd_handle, format="fastq")
        fwd_handle.close()
 
+
 if __name__ == "__main__":
-    main()
+    main()
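The ambiguity test itself sits outside this hunk; given that the package ships constants/regex_ambiguous_bases.py, the check is presumably a regex over each read's sequence. A hypothetical sketch of such a filter (this pattern is my assumption, not the packaged constant):

import re

# Assumed pattern: anything outside the unambiguous DNA alphabet counts as ambiguous
AMBIGUOUS_BASES_RE = re.compile("[^ACGTacgt]")

def is_ambiguous(seq: str) -> bool:
    # True if the read contains an ambiguous base call (N, R, Y, ...)
    return AMBIGUOUS_BASES_RE.search(seq) is not None

print(is_ambiguous("ACGTACGT"))  # False
print(is_ambiguous("ACGTNCGT"))  # True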
--- a/mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py
+++ b/mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py
@@ -16,29 +16,37 @@
 
 import argparse
 
-from Bio import
+from Bio import SeqIO
+
 
 def parse_args():
 
     parser = argparse.ArgumentParser()
 
-    parser.add_argument(
+    parser.add_argument(
+        "-i",
+        "--input",
+        required=True,
+        type=str,
+        help="Path to finalised primer list fasta file",
+    )
     parser.add_argument("-s", "--sample", required=True, type=str, help="Sample ID")
     parser.add_argument("-o", "--output", required=True, type=str, help="Output path")
     args = parser.parse_args()
-
-    _INPUT = args.input
-    _SAMPLE = args.sample
-    _OUTPUT = args.output
 
-
+    input = args.input
+    sample = args.sample
+    output = args.output
+
+    return input, sample, output
+
 
 def main():
-
-    _INPUT, _SAMPLE, _OUTPUT = parse_args()
 
-
-
+    input, sample, output = parse_args()
+
+    primers_dict = SeqIO.to_dict(SeqIO.parse(input, "fasta"))
+
     for primer_key in primers_dict.keys():
 
         primer = primers_dict[primer_key]
@@ -47,8 +55,10 @@ def main():
         if "R" in primer_name:
             primers_dict[primer_key].seq = primer.seq.reverse_complement()
 
-    SeqIO.write(
+    SeqIO.write(
+        primers_dict.values(), f"{output}/{sample}_rev_comp_se_primers.fasta", "fasta"
+    )
 
 
 if __name__ == "__main__":
-    main()
+    main()
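The core operation here, reverse-complementing primers whose names contain "R", goes through Biopython's Seq API, which handles IUPAC ambiguity codes. A toy demonstration (the primer name and sequence are illustrative, not taken from this package):

from Bio.Seq import Seq

primer_name = "806R"  # Illustrative reverse-primer name
primer_seq = Seq("GGACTACNVGGGTWTCTAAT")

if "R" in primer_name:  # Same rule the script applies to primer names
    primer_seq = primer_seq.reverse_complement()

print(primer_seq)  # ATTAGAWACCCBNGTAGTCC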