mgnify-pipelines-toolkit 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl
This diff compares the contents of two publicly released versions of this package, as they appear in their respective public registries, and is provided for informational purposes only.
Potentially problematic release: this version of mgnify-pipelines-toolkit might be problematic.
- mgnify_pipelines_toolkit/analysis/amplicon/amplicon_utils.py +74 -54
- mgnify_pipelines_toolkit/analysis/amplicon/are_there_primers.py +69 -42
- mgnify_pipelines_toolkit/analysis/amplicon/assess_inflection_point_mcp.py +120 -66
- mgnify_pipelines_toolkit/analysis/amplicon/assess_mcp_proportions.py +74 -45
- mgnify_pipelines_toolkit/analysis/amplicon/classify_var_regions.py +277 -148
- mgnify_pipelines_toolkit/analysis/amplicon/find_mcp_inflection_points.py +45 -28
- mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py +53 -32
- mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py +54 -16
- mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py +29 -12
- mgnify_pipelines_toolkit/analysis/amplicon/remove_ambiguous_reads.py +29 -19
- mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py +23 -13
- mgnify_pipelines_toolkit/analysis/amplicon/standard_primer_matching.py +127 -89
- mgnify_pipelines_toolkit/analysis/shared/fastq_suffix_header_check.py +140 -0
- mgnify_pipelines_toolkit/analysis/shared/get_subunits.py +55 -26
- mgnify_pipelines_toolkit/analysis/shared/get_subunits_coords.py +19 -13
- mgnify_pipelines_toolkit/analysis/shared/library_strategy_check.py +66 -0
- mgnify_pipelines_toolkit/analysis/shared/mapseq2biom.py +2 -2
- mgnify_pipelines_toolkit/constants/regex_ambiguous_bases.py +3 -5
- mgnify_pipelines_toolkit/constants/regex_fasta_header.py +20 -0
- mgnify_pipelines_toolkit/constants/tax_ranks.py +21 -2
- mgnify_pipelines_toolkit/constants/thresholds.py +4 -1
- mgnify_pipelines_toolkit/constants/var_region_coordinates.py +4 -4
- mgnify_pipelines_toolkit/utils/__init__.py +0 -0
- mgnify_pipelines_toolkit/utils/fasta_to_delimited.py +144 -0
- {mgnify_pipelines_toolkit-0.1.4.dist-info → mgnify_pipelines_toolkit-0.1.5.dist-info}/METADATA +18 -1
- mgnify_pipelines_toolkit-0.1.5.dist-info/RECORD +33 -0
- {mgnify_pipelines_toolkit-0.1.4.dist-info → mgnify_pipelines_toolkit-0.1.5.dist-info}/WHEEL +1 -1
- {mgnify_pipelines_toolkit-0.1.4.dist-info → mgnify_pipelines_toolkit-0.1.5.dist-info}/entry_points.txt +3 -0
- mgnify_pipelines_toolkit-0.1.4.dist-info/RECORD +0 -28
- {mgnify_pipelines_toolkit-0.1.4.dist-info → mgnify_pipelines_toolkit-0.1.5.dist-info}/LICENSE +0 -0
- {mgnify_pipelines_toolkit-0.1.4.dist-info → mgnify_pipelines_toolkit-0.1.5.dist-info}/top_level.txt +0 -0
mgnify_pipelines_toolkit/analysis/amplicon/find_mcp_inflection_points.py
@@ -20,21 +20,29 @@ import argparse
 import pandas as pd
 import numpy as np
 
+
 def parse_args():
 
     parser = argparse.ArgumentParser()
 
-    parser.add_argument(
+    parser.add_argument(
+        "-i",
+        "--input",
+        required=True,
+        type=str,
+        help="Path to mcp tsv file to find inflection points",
+    )
     parser.add_argument("-s", "--sample", required=True, type=str, help="Sample ID")
     parser.add_argument("-o", "--output", required=True, type=str, help="Output path")
 
     args = parser.parse_args()
-
-    _PATH = args.input
-    _SAMPLE = args.sample
-    _OUTPUT = args.output
 
-
+    path = args.input
+    sample = args.sample
+    output = args.output
+
+    return path, sample, output
+
 
 def find_mcp_inf_points(mcp_df):
     """
@@ -50,45 +58,54 @@ def find_mcp_inf_points(mcp_df):
     """
 
     inf_point_dict = defaultdict(list)
-    start_indices = [
+    start_indices = [int(i) for i in mcp_df.columns.tolist()]
 
-    for i in range(len(mcp_df)):
+    for i in range(len(mcp_df)):  # Loop through both possible strands of the mcp_df
         strand = mcp_df.index[i]
         props = mcp_df.iloc[i].tolist()
-        props = [
+        props = [-val for val in props]
 
-        prop_diff = np.diff(props)/np.diff(start_indices)
-        infl_points = np.where(prop_diff > np.percentile(prop_diff, 80))[
+        prop_diff = np.diff(props) / np.diff(start_indices)  # Get the derivative
+        infl_points = np.where(prop_diff > np.percentile(prop_diff, 80))[
+            0
+        ]  # Grab points above 80th percentile
 
         for ind in infl_points:
             inf_point = start_indices[ind]
 
-            if
-
-
-
-
+            if (
+                inf_point < 10 or inf_point > 20
+            ):  # Rule to facilitate results - won't accept
+                continue  # points below index 10 or above index 20
+                # 10 means a cutoff of 15 and 20 a cutoff of 25
+                # literature points to no primers existing that are
+                # shorter or bigger than these lengths
+
+            inf_point_dict["strand"].append(strand)
+            inf_point_dict["inf_point"].append(inf_point)
 
-            inf_point_dict['strand'].append(strand)
-            inf_point_dict['inf_point'].append(inf_point)
-
     return inf_point_dict
 
+
 def main():
 
-
+    path, sample, output = parse_args()
 
-    mcp_df = pd.read_csv(
-    inf_point_dict = find_mcp_inf_points(mcp_df)
+    mcp_df = pd.read_csv(path, sep="\t", index_col=0)  # Read mcp_df
+    inf_point_dict = find_mcp_inf_points(mcp_df)  # Generate inflection points dict
 
-    if len(inf_point_dict) > 0:
-        inf_point_df = pd.DataFrame.from_dict(
-
+    if len(inf_point_dict) > 0:  # If the inf_point_dict isn't empty..
+        inf_point_df = pd.DataFrame.from_dict(
+            inf_point_dict
+        )  # .. turn it into a dataframe
+        inf_point_df.to_csv(
+            f"{output}/{sample}_inf_points.tsv", sep="\t", index=False
+        )  # ..save it to a .tsv file
 
-    else:
-        fw = open(f
+    else:  # If it is empty..
+        fw = open(f"{output}/{sample}_inf_points.tsv", "w")  # ..make an empty file
        fw.close()
 
 
 if __name__ == "__main__":
-    main()
+    main()
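The rewritten find_mcp_inf_points() boils down to a simple rule: negate the MCP proportions, take the discrete derivative across consecutive start indices, and keep the points whose slope clears the 80th percentile, restricted to the 10-20 index window. A minimal standalone sketch of that rule on a hypothetical single-strand MCP table (toy values, not package data):

import numpy as np
import pandas as pd

# Hypothetical single-strand MCP table: columns are MCP start indices
mcp_df = pd.DataFrame(
    [[0.9, 0.9, 0.85, 0.5, 0.45, 0.4]],
    index=["fwd"],
    columns=["11", "12", "13", "14", "15", "16"],
)

start_indices = [int(i) for i in mcp_df.columns.tolist()]
props = [-val for val in mcp_df.iloc[0].tolist()]      # negate, as the script does
prop_diff = np.diff(props) / np.diff(start_indices)    # discrete derivative
candidates = np.where(prop_diff > np.percentile(prop_diff, 80))[0]
inf_points = [start_indices[i] for i in candidates if 10 <= start_indices[i] <= 20]
print(inf_points)  # [13]: the sharpest proportion drop starts at index 13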
mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py
@@ -20,32 +20,48 @@ import logging
 
 import pandas as pd
 
-from mgnify_pipelines_toolkit.constants.tax_ranks import
+from mgnify_pipelines_toolkit.constants.tax_ranks import (
+    _SILVA_TAX_RANKS,
+    _PR2_TAX_RANKS,
+)
 
 logging.basicConfig(level=logging.DEBUG)
 
+
 def parse_args():
     parser = argparse.ArgumentParser()
 
-    parser.add_argument("-t", "--taxa", required=True, type=str, help="Path to taxa file")
-    parser.add_argument("-f", "--fwd", required=True, type=str, help="Path to DADA2 forward map file")
-    parser.add_argument("-r", "--rev", required=False, type=str, help="Path to DADA2 reverse map file")
     parser.add_argument(
-        "-
+        "-t", "--taxa", required=True, type=str, help="Path to taxa file"
+    )
+    parser.add_argument(
+        "-f", "--fwd", required=True, type=str, help="Path to DADA2 forward map file"
+    )
+    parser.add_argument(
+        "-r", "--rev", required=False, type=str, help="Path to DADA2 reverse map file"
+    )
+    parser.add_argument(
+        "-a",
+        "--amp",
+        required=True,
+        type=str,
+        help="Path to extracted amp_region reads from inference subworkflow",
+    )
+    parser.add_argument(
+        "-hd", "--headers", required=True, type=str, help="Path to fastq headers"
     )
-    parser.add_argument("-hd", "--headers", required=True, type=str, help="Path to fastq headers")
     parser.add_argument("-s", "--sample", required=True, type=str, help="Sample ID")
 
     args = parser.parse_args()
-
-    _TAXA = args.taxa
-    _FWD = args.fwd
-    _REV = args.rev
-    _AMP = args.amp
-    _HEADERS = args.headers
-    _SAMPLE = args.sample
 
-
+    taxa = args.taxa
+    fwd = args.fwd
+    rev = args.rev
+    amp = args.amp
+    headers = args.headers
+    sample = args.sample
+
+    return taxa, fwd, rev, amp, headers, sample
 
 
 def order_df(taxa_df):
@@ -59,6 +75,7 @@ def order_df(taxa_df):
 
     return taxa_df
 
+
 def make_tax_assignment_dict_silva(taxa_df, asv_dict):
     tax_assignment_dict = defaultdict(int)
 
@@ -93,7 +110,7 @@ def make_tax_assignment_dict_silva(taxa_df, asv_dict):
             k = "_".join(k.split(" "))
             tax_assignment += f"\t{k}"
         elif sk != "0":
-            tax_assignment +=
+            tax_assignment += "\tk__"
         else:
             break
 
@@ -136,9 +153,10 @@ def make_tax_assignment_dict_silva(taxa_df, asv_dict):
             continue
 
         tax_assignment_dict[tax_assignment] += asv_count
-
+
     return tax_assignment_dict
 
+
 def make_tax_assignment_dict_pr2(taxa_df, asv_dict):
     tax_assignment_dict = defaultdict(int)
 
@@ -223,6 +241,7 @@ def make_tax_assignment_dict_pr2(taxa_df, asv_dict):
 
     return tax_assignment_dict
 
+
 def generate_asv_count_dict(asv_dict):
 
     res_dict = defaultdict(list)
@@ -232,36 +251,35 @@ def generate_asv_count_dict(asv_dict):
         if count == 0:
             continue
 
-        res_dict[
-        res_dict[
+        res_dict["asv"].append(asv_id)
+        res_dict["count"].append(count)
 
     res_df = pd.DataFrame.from_dict(res_dict)
-    res_df = res_df.sort_values(by=
-    res_df = res_df.sort_values(by=
+    res_df = res_df.sort_values(by="asv", ascending=True)
+    res_df = res_df.sort_values(by="count", ascending=False)
 
     return res_df
 
 
 def main():
-
+    taxa, fwd, rev, amp, headers, sample = parse_args()
 
-    fwd_fr = open(
+    fwd_fr = open(fwd, "r")
     paired_end = True
 
-    if
+    if rev is None:
         paired_end = False
         rev_fr = [True]
     else:
-        rev_fr = open(
+        rev_fr = open(rev, "r")
 
-    taxa_df = pd.read_csv(
+    taxa_df = pd.read_csv(taxa, sep="\t", dtype=str)
     taxa_df = taxa_df.fillna("0")
     taxa_df = order_df(taxa_df)
 
-    amp_reads = [read.strip() for read in list(open(
-    headers = [read.split(" ")[0][1:] for read in
-
-    amp_region = ".".join(_AMP.split(".")[1:3])
+    amp_reads = [read.strip() for read in list(open(amp, "r"))]
+    headers = [read.split(" ")[0][1:] for read in list(open(headers, "r"))]
+    amp_region = ".".join(amp.split(".")[1:3])
 
     asv_dict = defaultdict(int)
 
@@ -270,7 +288,7 @@ def main():
         counter += 1
         line_fwd = line_fwd.strip()
 
-        if line_fwd ==
+        if line_fwd == "0":
             continue
 
         if headers[counter] in amp_reads:
@@ -289,12 +307,15 @@ def main():
         tax_assignment_dict = make_tax_assignment_dict_pr2(taxa_df, asv_dict)
         ref_db = "pr2"
 
-    with open(f"./{
+    with open(f"./{sample}_{amp_region}_{ref_db}_asv_krona_counts.txt", "w") as fw:
         for tax_assignment, count in tax_assignment_dict.items():
             fw.write(f"{count}\t{tax_assignment}\n")
 
     asv_count_df = generate_asv_count_dict(asv_dict)
-    asv_count_df.to_csv(
+    asv_count_df.to_csv(
+        f"./{sample}_{amp_region}_asv_read_counts.tsv", sep="\t", index=False
+    )
+
 
 if __name__ == "__main__":
     main()
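The krona counts block at the end of make_asv_count_table.py accumulates one integer per tab-joined taxonomy string, summing the counts of every ASV that resolved to the same assignment. A hedged toy reproduction of that accumulation (invented ASVs and lineages, not real pipeline inputs):

from collections import defaultdict

# Hypothetical ASV counts and tab-joined taxonomy strings
asv_counts = {"asv_1": 10, "asv_2": 5, "asv_3": 2}
assignments = {
    "asv_1": "sk__Bacteria\tk__\tp__Firmicutes",
    "asv_2": "sk__Bacteria\tk__\tp__Firmicutes",
    "asv_3": "sk__Archaea",
}

tax_assignment_dict = defaultdict(int)
for asv, tax_assignment in assignments.items():
    tax_assignment_dict[tax_assignment] += asv_counts[asv]  # same taxonomy, same bucket

for tax_assignment, count in tax_assignment_dict.items():
    print(f"{count}\t{tax_assignment}")  # krona-style row: count, then tab-joined ranks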
mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py
@@ -22,9 +22,12 @@ import pandas as pd
 
 logging.basicConfig(level=logging.DEBUG)
 
+
 def parse_args():
     parser = argparse.ArgumentParser()
-    parser.add_argument(
+    parser.add_argument(
+        "-i", "--input", required=True, type=str, help="Input from MAPseq output"
+    )
     parser.add_argument(
         "-l",
         "--label",
@@ -37,18 +40,48 @@ def parse_args():
 
     args = parser.parse_args()
 
-
-
-
+    input = args.input
+    label = args.label
+    sample = args.sample
+
+    return input, label, sample
 
-    return _INPUT, _LABEL, _SAMPLE
 
 def parse_label(label):
     silva_short_ranks = ["sk__", "k__", "p__", "c__", "o__", "f__", "g__", "s__"]
-    pr2_short_ranks = [
-
-
-
+    pr2_short_ranks = [
+        "d__",
+        "sg__",
+        "dv__",
+        "sdv__",
+        "c__",
+        "o__",
+        "f__",
+        "g__",
+        "s__",
+    ]
+
+    silva_long_ranks = [
+        "Superkingdom",
+        "Kingdom",
+        "Phylum",
+        "Class",
+        "Order",
+        "Family",
+        "Genus",
+        "Species",
+    ]
+    pr2_long_ranks = [
+        "Domain",
+        "Supergroup",
+        "Division",
+        "Subdivision",
+        "Class",
+        "Order",
+        "Family",
+        "Genus",
+        "Species",
+    ]
 
     chosen_short_ranks = ""
     chosen_long_ranks = ""
@@ -65,6 +98,7 @@ def parse_label(label):
 
     return chosen_short_ranks, chosen_long_ranks
 
+
 def parse_mapseq(mseq_df, short_ranks, long_ranks):
     res_dict = defaultdict(list)
 
@@ -91,7 +125,8 @@ def parse_mapseq(mseq_df, short_ranks, long_ranks):
         res_dict[curr_rank].append(curr_tax)
     res_df = pd.DataFrame.from_dict(res_dict)
 
-    return
+    return res_df
+
 
 def process_blank_tax_ends(res_df, ranks):
     # Necessary function as we want to replace consecutive blank assignments that start at the last rank as NAs
@@ -105,7 +140,9 @@ def process_blank_tax_ends(res_df, ranks):
         ):  # Parse an assignment backwards, from Species all the way to Superkingdom/Domain
             curr_rank = res_df.iloc[i, j + 1]
             if curr_rank in ranks:
-                if
+                if (
+                    last_empty_rank == ""
+                ):  # Last rank is empty, start window of consecutive blanks
                     last_empty_rank = j + 1
                     currently_empty = True
                 elif (
@@ -124,16 +161,17 @@ def process_blank_tax_ends(res_df, ranks):
     return res_df
 
 
-def main():
-
+def main():
+    input, label, sample = parse_args()
 
-    mseq_df = pd.read_csv(
+    mseq_df = pd.read_csv(input, header=0, delim_whitespace=True, usecols=[0, 12])
 
-    short_ranks, long_ranks = parse_label(
+    short_ranks, long_ranks = parse_label(label)
     res_df = parse_mapseq(mseq_df, short_ranks, long_ranks)
     final_res_df = process_blank_tax_ends(res_df, short_ranks)
 
-    final_res_df.to_csv(f"./{
+    final_res_df.to_csv(f"./{sample}_{label}_asv_taxa.tsv", sep="\t", index=False)
+
 
 if __name__ == "__main__":
     main()
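parse_label() in mapseq_to_asv_table.py simply selects which prefix set (sk__/k__/... for SILVA, d__/sg__/... for PR2) and which column names apply to the chosen reference database; parse_mapseq() then buckets each lineage token under its rank column. A rough sketch of that bucketing for a SILVA-style assignment; the semicolon-delimited lineage string below is an invented stand-in for the MAPseq taxonomy column, not its exact format:

silva_short_ranks = ["sk__", "k__", "p__", "c__", "o__", "f__", "g__", "s__"]
silva_long_ranks = [
    "Superkingdom", "Kingdom", "Phylum", "Class",
    "Order", "Family", "Genus", "Species",
]

lineage = "sk__Bacteria;k__;p__Bacillota;c__Bacilli"  # hypothetical assignment
row = dict.fromkeys(silva_long_ranks, "")
for token in lineage.split(";"):
    for short_rank, long_rank in zip(silva_short_ranks, silva_long_ranks):
        if token.startswith(short_rank):
            row[long_rank] = token  # bucket the token under its rank column
print(row)  # blank ranks stay as bare prefixes (e.g. "k__"), as in the script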
mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py
@@ -21,24 +21,41 @@ import re
 from Bio import SeqIO
 import pandas as pd
 
-from mgnify_pipelines_toolkit.constants.var_region_coordinates import
+from mgnify_pipelines_toolkit.constants.var_region_coordinates import (
+    REGIONS_16S_BACTERIA,
+    REGIONS_16S_ARCHAEA,
+    REGIONS_18S,
+)
 
 STRAND_FWD = "fwd"
 STRAND_REV = "rev"
 
+
 def parse_args():
     parser = argparse.ArgumentParser()
 
-    parser.add_argument(
-
+    parser.add_argument(
+        "-i",
+        "--input",
+        required=True,
+        type=str,
+        help="Path to cmsearch_deoverlap_tblout file",
+    )
+    parser.add_argument(
+        "-f",
+        "--fasta",
+        required=True,
+        type=str,
+        help="Path to concatenated primers fasta file",
+    )
     parser.add_argument("-s", "--sample", required=True, type=str, help="Sample ID")
     args = parser.parse_args()
 
-
-
-
+    input = args.input
+    fasta = args.fasta
+    sample = args.sample
 
-    return
+    return input, fasta, sample
 
 
 def get_amp_region(beg, strand, model):
@@ -62,11 +79,11 @@ def get_amp_region(beg, strand, model):
 
 
 def main():
-
+    input, fasta, sample = parse_args()
     res_dict = defaultdict(list)
-    fasta_dict = SeqIO.to_dict(SeqIO.parse(
+    fasta_dict = SeqIO.to_dict(SeqIO.parse(fasta, "fasta"))
 
-    with open(
+    with open(input, "r") as fr:
         for line in fr:
             line = line.strip()
             line = re.sub("[ \t]+", "\t", line)
@@ -88,7 +105,7 @@ def main():
             else:
                 continue
 
-            res_dict["Run"].append(
+            res_dict["Run"].append(sample)
             res_dict["AssertionEvidence"].append("ECO_0000363")
             res_dict["AssertionMethod"].append("automatic assertion")
 
@@ -109,7 +126,7 @@ def main():
             res_dict["PrimerSeq"].append(primer_seq)
 
     res_df = pd.DataFrame.from_dict(res_dict)
-    res_df.to_csv(f"./{
+    res_df.to_csv(f"./{sample}_primer_validation.tsv", sep="\t", index=False)
 
 
 if __name__ == "__main__":
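primer_val_classification.py reads the cmsearch_deoverlap tblout line by line and collapses runs of spaces/tabs into single tabs before splitting, since tblout columns are aligned with variable whitespace. A small sketch of that normalisation with a made-up row (real cmsearch output has more columns):

import re

line = "primer_F   RF00177   16S_rRNA_bacteria    5    22"  # hypothetical tblout-like row
line = re.sub("[ \t]+", "\t", line.strip())  # collapse aligned whitespace to tabs
fields = line.split("\t")
print(fields)  # ['primer_F', 'RF00177', '16S_rRNA_bacteria', '5', '22']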
mgnify_pipelines_toolkit/analysis/amplicon/remove_ambiguous_reads.py
@@ -21,39 +21,48 @@ import gzip
 
 from Bio import SeqIO, bgzf
 
+
 def parse_args():
 
     parser = argparse.ArgumentParser()
 
-    parser.add_argument(
-
+    parser.add_argument(
+        "-f",
+        "--fwd",
+        required=True,
+        type=str,
+        help="Path to forward (or single-end) fastq file",
+    )
+    parser.add_argument(
+        "-r", "--rev", required=False, type=str, help="Path to reverse fastq file"
+    )
     parser.add_argument("-s", "--sample", required=True, type=str, help="Sample ID")
     args = parser.parse_args()
-
-    _FWD = args.fwd
-    _REV = args.rev
-    _SAMPLE = args.sample
 
-
+    fwd = args.fwd
+    rev = args.rev
+    sample = args.sample
+
+    return fwd, rev, sample
 
 
 def main():
 
-
+    fwd, rev, sample = parse_args()
 
-    fwd_handle = gzip.open(
+    fwd_handle = gzip.open(fwd, "rt")
     fwd_reads = SeqIO.to_dict(SeqIO.parse(fwd_handle, "fastq"))
     fwd_handle.close()
 
     paired_end = True
 
-    if
+    if rev is None:
         paired_end = False
     else:
-        rev_handle = gzip.open(
+        rev_handle = gzip.open(rev, "rt")
         rev_reads = SeqIO.to_dict(SeqIO.parse(rev_handle, "fastq"))
         rev_handle.close()
-
+
     remove_set = set()
 
     for read_id in fwd_reads.keys():
@@ -78,23 +87,24 @@ def main():
             remove_set.add(read_id)
             continue
 
-    [
+    [fwd_reads.pop(read_id) for read_id in remove_set]
     if paired_end:
-        [
+        [rev_reads.pop(read_id) for read_id in remove_set]
 
     if paired_end:
-        fwd_handle = bgzf.BgzfWriter(f"./{
-        rev_handle = bgzf.BgzfWriter(f"./{
-
+        fwd_handle = bgzf.BgzfWriter(f"./{sample}_noambig_1.fastq.gz", "wb")
+        rev_handle = bgzf.BgzfWriter(f"./{sample}_noambig_2.fastq.gz", "wb")
+
         SeqIO.write(sequences=fwd_reads.values(), handle=fwd_handle, format="fastq")
         SeqIO.write(sequences=rev_reads.values(), handle=rev_handle, format="fastq")
 
         fwd_handle.close()
         rev_handle.close()
     else:
-        fwd_handle = bgzf.BgzfWriter(f"./{
+        fwd_handle = bgzf.BgzfWriter(f"./{sample}_noambig.fastq.gz", "wb")
        SeqIO.write(sequences=fwd_reads.values(), handle=fwd_handle, format="fastq")
         fwd_handle.close()
 
+
 if __name__ == "__main__":
-    main()
+    main()
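remove_ambiguous_reads.py follows a load-mark-pop pattern: read both strands into dicts with SeqIO.to_dict, collect offending read IDs in a set, then pop those IDs from both dicts so the pairing stays intact. A toy version of that pattern; the ambiguity test here (any base outside ACGT) is a stand-in assumption for the toolkit's regex_ambiguous_bases constant:

from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

# Two invented read pairs; r2 carries an ambiguous base (N)
fwd_reads = {
    "r1": SeqRecord(Seq("ACGTACGT"), id="r1"),
    "r2": SeqRecord(Seq("ACNTACGT"), id="r2"),
}
rev_reads = {
    "r1": SeqRecord(Seq("TGCATGCA"), id="r1"),
    "r2": SeqRecord(Seq("TGCATGCA"), id="r2"),
}

# Stand-in ambiguity test: any base outside ACGT in the forward read
remove_set = {rid for rid, rec in fwd_reads.items() if set(str(rec.seq)) - set("ACGT")}

[fwd_reads.pop(read_id) for read_id in remove_set]  # drop from both strands
[rev_reads.pop(read_id) for read_id in remove_set]  # so pairing stays in sync
print(sorted(fwd_reads), sorted(rev_reads))  # ['r1'] ['r1']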
mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py
@@ -16,29 +16,37 @@
 
 import argparse
 
-from Bio import
+from Bio import SeqIO
+
 
 def parse_args():
 
     parser = argparse.ArgumentParser()
 
-    parser.add_argument(
+    parser.add_argument(
+        "-i",
+        "--input",
+        required=True,
+        type=str,
+        help="Path to finalised primer list fasta file",
+    )
     parser.add_argument("-s", "--sample", required=True, type=str, help="Sample ID")
     parser.add_argument("-o", "--output", required=True, type=str, help="Output path")
     args = parser.parse_args()
-
-    _INPUT = args.input
-    _SAMPLE = args.sample
-    _OUTPUT = args.output
 
-
+    input = args.input
+    sample = args.sample
+    output = args.output
+
+    return input, sample, output
+
 
 def main():
-
-    _INPUT, _SAMPLE, _OUTPUT = parse_args()
 
-
-
+    input, sample, output = parse_args()
+
+    primers_dict = SeqIO.to_dict(SeqIO.parse(input, "fasta"))
+
     for primer_key in primers_dict.keys():
 
         primer = primers_dict[primer_key]
@@ -47,8 +55,10 @@ def main():
         if "R" in primer_name:
             primers_dict[primer_key].seq = primer.seq.reverse_complement()
 
-    SeqIO.write(
+    SeqIO.write(
+        primers_dict.values(), f"{output}/{sample}_rev_comp_se_primers.fasta", "fasta"
+    )
 
 
 if __name__ == "__main__":
-    main()
+    main()
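rev_comp_se_primers.py leaves forward primers untouched and replaces each reverse primer's sequence ("R" in its name) with its reverse complement; Biopython's reverse_complement() also complements IUPAC ambiguity codes (e.g. V↔B, H↔D). A minimal sketch with the standard 341F/805R 16S primer pair used purely as example data:

from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

# Example primer records; only the reverse one ("R" in its name) is flipped
primers_dict = {
    "341F": SeqRecord(Seq("CCTACGGGNGGCWGCAG"), id="341F"),
    "805R": SeqRecord(Seq("GACTACHVGGGTATCTAATCC"), id="805R"),
}

for primer_key in primers_dict:
    primer = primers_dict[primer_key]
    if "R" in primer.id:
        primer.seq = primer.seq.reverse_complement()

print(primers_dict["805R"].seq)  # GGATTAGATACCCBDGTAGTC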