mgnify-pipelines-toolkit 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl
This diff shows the changes between publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release: this version of mgnify-pipelines-toolkit might be problematic.
- mgnify_pipelines_toolkit/analysis/amplicon/amplicon_utils.py +4 -8
- mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py +64 -51
- mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py +139 -0
- mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py +116 -0
- mgnify_pipelines_toolkit/constants/tax_ranks.py +1 -1
- mgnify_pipelines_toolkit/constants/var_region_coordinates.py +26 -26
- {mgnify_pipelines_toolkit-0.1.1.dist-info → mgnify_pipelines_toolkit-0.1.3.dist-info}/METADATA +5 -3
- {mgnify_pipelines_toolkit-0.1.1.dist-info → mgnify_pipelines_toolkit-0.1.3.dist-info}/RECORD +12 -10
- {mgnify_pipelines_toolkit-0.1.1.dist-info → mgnify_pipelines_toolkit-0.1.3.dist-info}/WHEEL +1 -1
- {mgnify_pipelines_toolkit-0.1.1.dist-info → mgnify_pipelines_toolkit-0.1.3.dist-info}/entry_points.txt +2 -0
- {mgnify_pipelines_toolkit-0.1.1.dist-info → mgnify_pipelines_toolkit-0.1.3.dist-info}/LICENSE +0 -0
- {mgnify_pipelines_toolkit-0.1.1.dist-info → mgnify_pipelines_toolkit-0.1.3.dist-info}/top_level.txt +0 -0
mgnify_pipelines_toolkit/analysis/amplicon/amplicon_utils.py

@@ -46,18 +46,11 @@ def get_read_count(read_path, type='fastq'):
         ]
         zcat_proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

-        cmd = [
-            'sed',
-            '-n',
-            '1~4p',
-        ]
-        sed_proc = subprocess.Popen(cmd, stdin=zcat_proc.stdout, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-
         cmd = [
             'wc',
             '-l'
         ]
-        wc_proc = subprocess.Popen(cmd, stdin=
+        wc_proc = subprocess.Popen(cmd, stdin=zcat_proc.stdout, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
         stdout, stderr = wc_proc.communicate()

     elif type == 'fasta':

@@ -78,6 +71,9 @@ def get_read_count(read_path, type='fastq'):

     read_count = int(read_count)

+    if type == 'fastq':
+        read_count /= 4
+
     return read_count

 def build_cons_seq(cons_list, read_count, cons_threshold=0.80, do_not_include=None, counter=1, max_line_count=None):
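Net effect of these two hunks: `get_read_count` no longer pre-filters fastq files with `sed -n '1~4p'` (keep every fourth line); it now pipes `zcat` straight into `wc -l` and divides the final line count by four. A minimal sketch of the same counting logic, independent of the package (the helper name `count_fastq_reads` is hypothetical):

import gzip

def count_fastq_reads(path):
    # A FASTQ record is exactly four lines (header, sequence, '+', quality),
    # so total lines // 4 gives the read count -- the same arithmetic the
    # patched get_read_count() applies to the `wc -l` output.
    with gzip.open(path, "rt") as handle:
        n_lines = sum(1 for _ in handle)
    return n_lines // 4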
mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py

@@ -25,13 +25,14 @@ from mgnify_pipelines_toolkit.constants.tax_ranks import _SILVA_TAX_RANKS, _PR2_TAX_RANKS
 logging.basicConfig(level=logging.DEBUG)

 def parse_args():
-
     parser = argparse.ArgumentParser()

-    parser.add_argument("-t", "--taxa", required=True, type=str, help="Path to
+    parser.add_argument("-t", "--taxa", required=True, type=str, help="Path to taxa file")
     parser.add_argument("-f", "--fwd", required=True, type=str, help="Path to DADA2 forward map file")
     parser.add_argument("-r", "--rev", required=False, type=str, help="Path to DADA2 reverse map file")
-    parser.add_argument(
+    parser.add_argument(
+        "-a", "--amp", required=True, type=str, help="Path to extracted amp_region reads from inference subworkflow"
+    )
     parser.add_argument("-hd", "--headers", required=True, type=str, help="Path to fastq headers")
     parser.add_argument("-s", "--sample", required=True, type=str, help="Sample ID")

@@ -48,8 +49,7 @@ def parse_args():


 def order_df(taxa_df):
-
-    if len(taxa_df.columns) == 8:
+    if len(taxa_df.columns) == 9:
         taxa_df = taxa_df.sort_values(_SILVA_TAX_RANKS, ascending=True)
     elif len(taxa_df.columns) == 10:
         taxa_df = taxa_df.sort_values(_PR2_TAX_RANKS, ascending=True)
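The dispatch in `order_df()` changed from 8 to 9 columns because `_SILVA_TAX_RANKS` gained a `Superkingdom` rank in this release (see the tax_ranks.py hunk below): the taxonomy table holds one ASV column plus one column per rank. A sanity check against the 0.1.3 constants, assuming the package is installed:

from mgnify_pipelines_toolkit.constants.tax_ranks import _SILVA_TAX_RANKS, _PR2_TAX_RANKS

# One ASV id column + one column per rank:
assert 1 + len(_SILVA_TAX_RANKS) == 9   # SILVA tables -> the new `== 9` branch
assert 1 + len(_PR2_TAX_RANKS) == 10    # PR2 tables   -> unchanged `== 10` branch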
@@ -60,70 +60,76 @@ def order_df(taxa_df):
     return taxa_df

 def make_tax_assignment_dict_silva(taxa_df, asv_dict):
-
     tax_assignment_dict = defaultdict(int)

     for i in range(len(taxa_df)):
-
         sorted_index = taxa_df.index[i]
-
+        asv_num = taxa_df.iloc[i, 0]
+        asv_count = asv_dict[asv_num]

         if asv_count == 0:
             continue

+        sk = taxa_df.loc[sorted_index, "Superkingdom"]
         k = taxa_df.loc[sorted_index, "Kingdom"]
         p = taxa_df.loc[sorted_index, "Phylum"]
         c = taxa_df.loc[sorted_index, "Class"]
         o = taxa_df.loc[sorted_index, "Order"]
         f = taxa_df.loc[sorted_index, "Family"]
-        g = taxa_df.loc[sorted_index, "Genus"]
+        g = taxa_df.loc[sorted_index, "Genus"]
         s = taxa_df.loc[sorted_index, "Species"]

         tax_assignment = ""

         while True:

+            if sk != "0":
+                sk = "_".join(sk.split(" "))
+                tax_assignment += sk
+            else:
+                break
+
             if k != "0":
                 k = "_".join(k.split(" "))
-
-
-
-                tax_assignment += f"sk__Eukaryota"
-            else:
-                tax_assignment += f"sk__Eukaryota\tk__{k}"
+                tax_assignment += f"\t{k}"
+            elif sk != "0":
+                tax_assignment += f"\tk__"
             else:
                 break

             if p != "0":
-                if k == "Archaea" or k == "Bacteria":
-                    tax_assignment += f"\tk__"
                 p = "_".join(p.split(" "))
-                tax_assignment += f"\
+                tax_assignment += f"\t{p}"
             else:
                 break
+
             if c != "0":
                 c = "_".join(c.split(" "))
-                tax_assignment += f"\
+                tax_assignment += f"\t{c}"
             else:
                 break
+
             if o != "0":
                 o = "_".join(o.split(" "))
-                tax_assignment += f"\
+                tax_assignment += f"\t{o}"
             else:
                 break
+
             if f != "0":
                 f = "_".join(f.split(" "))
-                tax_assignment += f"\
+                tax_assignment += f"\t{f}"
             else:
                 break
+
             if g != "0":
                 g = "_".join(g.split(" "))
-                tax_assignment += f"\
+                tax_assignment += f"\t{g}"
             else:
                 break
+
             if s != "0":
                 s = "_".join(s.split(" "))
-                tax_assignment += f"\
+                tax_assignment += f"\t{s}"
             break

         if tax_assignment == "":
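The rewritten SILVA loop now emits plain tab-separated rank names rather than the hardcoded `sk__Eukaryota` prefixing visible in the removed lines, and it still stops at the first unassigned rank. A simplified sketch of that truncation behaviour, assuming unassigned ranks are stored as the string "0" as in the diff (the real code additionally back-fills a blank `k__` when the superkingdom is known but the kingdom is not):

def join_until_unassigned(ranks):
    # Keep ranks up to (not including) the first "0" placeholder,
    # replacing spaces with underscores -- the core behaviour of the
    # while-loop above.
    kept = []
    for rank in ranks:
        if rank == "0":
            break
        kept.append("_".join(rank.split(" ")))
    return "\t".join(kept)

# join_until_unassigned(["Bacteria", "0", "Proteobacteria"]) -> "Bacteria"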
@@ -134,13 +140,12 @@ def make_tax_assignment_dict_silva(taxa_df, asv_dict):
     return tax_assignment_dict

 def make_tax_assignment_dict_pr2(taxa_df, asv_dict):
-
     tax_assignment_dict = defaultdict(int)

     for i in range(len(taxa_df)):
-
         sorted_index = taxa_df.index[i]
-
+        asv_num = taxa_df.iloc[i, 0]
+        asv_count = asv_dict[asv_num]

         if asv_count == 0:
             continue

@@ -153,53 +158,62 @@ def make_tax_assignment_dict_pr2(taxa_df, asv_dict):
         o = taxa_df.loc[sorted_index, "Order"]
         f = taxa_df.loc[sorted_index, "Family"]
         g = taxa_df.loc[sorted_index, "Genus"]
-        s = taxa_df.loc[sorted_index, "Species"]
+        s = taxa_df.loc[sorted_index, "Species"]

         tax_assignment = ""

         while True:
-
             if d != "0":
                 d = "_".join(d.split(" "))
-                tax_assignment +=
+                tax_assignment += d
             else:
                 break

             if sg != "0":
                 sg = "_".join(sg.split(" "))
-                tax_assignment += f"\
+                tax_assignment += f"\t{sg}"
             else:
                 break
+
             if dv != "0":
                 dv = "_".join(dv.split(" "))
-                tax_assignment += f"\
+                tax_assignment += f"\t{dv}"
+            else:
+                break

             if sdv != "0":
                 sdv = "_".join(sdv.split(" "))
-                tax_assignment += f"\
+                tax_assignment += f"\t{sdv}"
+            else:
+                break
+
             if c != "0":
                 c = "_".join(c.split(" "))
-                tax_assignment += f"\
+                tax_assignment += f"\t{c}"
             else:
                 break
+
             if o != "0":
                 o = "_".join(o.split(" "))
-                tax_assignment += f"\
+                tax_assignment += f"\t{o}"
             else:
                 break
+
             if f != "0":
                 f = "_".join(f.split(" "))
-                tax_assignment += f"\
+                tax_assignment += f"\t{f}"
             else:
                 break
+
             if g != "0":
                 g = "_".join(g.split(" "))
-                tax_assignment += f"\
+                tax_assignment += f"\t{g}"
             else:
                 break
+
             if s != "0":
                 s = "_".join(s.split(" "))
-                tax_assignment += f"\
+                tax_assignment += f"\t{s}"
             break

         if tax_assignment == "":
@@ -210,9 +224,8 @@ def make_tax_assignment_dict_pr2(taxa_df, asv_dict):
     return tax_assignment_dict

 def main():
-
     _TAXA, _FWD, _REV, _AMP, _HEADERS, _SAMPLE = parse_args()
-
+
     fwd_fr = open(_FWD, "r")
     paired_end = True

@@ -225,43 +238,43 @@ def main():
     taxa_df = pd.read_csv(_TAXA, sep="\t", dtype=str)
     taxa_df = taxa_df.fillna("0")
     taxa_df = order_df(taxa_df)
-
-    amp_reads = [
-    headers = [
+
+    amp_reads = [read.strip() for read in list(open(_AMP, "r"))]
+    headers = [read.split(" ")[0][1:] for read in
+               list(open(_HEADERS, "r"))]
     amp_region = ".".join(_AMP.split(".")[1:3])

     asv_dict = defaultdict(int)

     counter = -1
     for line_fwd in fwd_fr:
-
         counter += 1
         line_fwd = line_fwd.strip()
         fwd_asvs = line_fwd.split(",")

         if paired_end:
             line_rev = next(rev_fr).strip()
-            rev_asvs = line_rev.split(",")
+            rev_asvs = line_rev.split(",")
             asv_intersection = list(set(fwd_asvs).intersection(rev_asvs))
-
+
             if len(asv_intersection) == 0:
                 continue
-
+
             if len(asv_intersection) == 1 and asv_intersection[0] == "0":
                 continue
         else:
             asv_intersection = fwd_asvs

         if headers[counter] in amp_reads:
-            asv_dict[int(asv_intersection[0]) - 1] += 1
-
+            asv_dict[f"seq_{int(asv_intersection[0]) - 1}"] += 1
+
     fwd_fr.close()
     if paired_end:
         rev_fr.close()

     ref_db = ""

-    if len(taxa_df.columns) ==
+    if len(taxa_df.columns) == 9:
         tax_assignment_dict = make_tax_assignment_dict_silva(taxa_df, asv_dict)
         ref_db = "silva"
     elif len(taxa_df.columns) == 10:

@@ -271,7 +284,7 @@
     with open(f"./{_SAMPLE}_{amp_region}_{ref_db}_asv_krona_counts.txt", "w") as fw:
         for tax_assignment, count in tax_assignment_dict.items():
             fw.write(f"{count}\t{tax_assignment}\n")
-
+

 if __name__ == "__main__":
-    main()
+    main()
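Two behavioural changes in `main()` are worth noting: a paired-end read is only counted when the forward and reverse DADA2 maps agree on an ASV, and counts are now keyed as `seq_<n>` strings (matching the first column of the taxonomy table) instead of bare integers. A toy illustration with made-up map values:

from collections import defaultdict

fwd_asvs = ["1", "3"]          # ASV candidates from the forward map (made-up)
rev_asvs = ["3", "7"]          # ASV candidates from the reverse map (made-up)
asv_dict = defaultdict(int)

# Count the read only if both strands agree on an ASV (and it isn't the
# "0" no-assignment sentinel); key it as "seq_<n>" as the new code does.
asv_intersection = list(set(fwd_asvs).intersection(rev_asvs))
if asv_intersection and asv_intersection[0] != "0":
    asv_dict[f"seq_{int(asv_intersection[0]) - 1}"] += 1

print(dict(asv_dict))  # {'seq_2': 1}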
mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py

@@ -0,0 +1,139 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# Copyright 2024 EMBL - European Bioinformatics Institute
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+from collections import defaultdict
+import logging
+
+import pandas as pd
+
+logging.basicConfig(level=logging.DEBUG)
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-i", "--input", required=True, type=str, help="Input from MAPseq output")
+    parser.add_argument(
+        "-l",
+        "--label",
+        choices=["DADA2-SILVA", "DADA2-PR2"],
+        required=True,
+        type=str,
+        help="Database label - either DADA2-SILVA or DADA2-PR2",
+    )
+    parser.add_argument("-s", "--sample", required=True, type=str, help="Sample ID")
+
+    args = parser.parse_args()
+
+    _INPUT = args.input
+    _LABEL = args.label
+    _SAMPLE = args.sample
+
+    return _INPUT, _LABEL, _SAMPLE
+
+def parse_label(label):
+    silva_short_ranks = ["sk__", "k__", "p__", "c__", "o__", "f__", "g__", "s__"]
+    pr2_short_ranks = ["d__", "sg__", "dv__", "sdv__", "c__", "o__", "f__", "g__", "s__"]
+
+    silva_long_ranks = ["Superkingdom", "Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species"]
+    pr2_long_ranks = ["Domain", "Supergroup", "Division", "Subdivision", "Class", "Order", "Family", "Genus", "Species"]
+
+    chosen_short_ranks = ""
+    chosen_long_ranks = ""
+
+    if label == "DADA2-SILVA":
+        chosen_short_ranks = silva_short_ranks
+        chosen_long_ranks = silva_long_ranks
+    elif label == "DADA2-PR2":
+        chosen_short_ranks = pr2_short_ranks
+        chosen_long_ranks = pr2_long_ranks
+    else:
+        logging.error("Incorrect database label - exiting.")
+        exit(1)
+
+    return chosen_short_ranks, chosen_long_ranks
+
+def parse_mapseq(mseq_df, short_ranks, long_ranks):
+    res_dict = defaultdict(list)
+
+    for i in range(len(mseq_df)):
+        asv_id = mseq_df.iloc[i, 0]
+
+        if pd.isna(mseq_df.iloc[i, 1]):
+            tax_ass = [short_ranks[0]]
+        else:
+            tax_ass = mseq_df.iloc[i, 1].split(";")
+
+        res_dict["ASV"].append(asv_id)
+
+        for j in range(len(short_ranks)):
+            curr_rank = long_ranks[j]
+
+            if j >= len(tax_ass):
+                # This would only be true if the assigned taxonomy is shorter than the total reference database taxononmy
+                # so fill each remaining rank with its respective short rank blank
+                curr_tax = short_ranks[j]
+            else:
+                curr_tax = tax_ass[j]
+
+            res_dict[curr_rank].append(curr_tax)
+    res_df = pd.DataFrame.from_dict(res_dict)
+
+    return(res_df)
+
+def process_blank_tax_ends(res_df, ranks):
+    # Necessary function as we want to replace consecutive blank assignments that start at the last rank as NAs
+    # while avoiding making blanks in the middle as NAs
+
+    for i in range(len(res_df)):
+        last_empty_rank = ""
+        currently_empty = False
+        for j in reversed(
+            range(len(ranks))
+        ):  # Parse an assignment backwards, from Species all the way to Superkingdom/Domain
+            curr_rank = res_df.iloc[i, j + 1]
+            if curr_rank in ranks:
+                if last_empty_rank == "":  # Last rank is empty, start window of consecutive blanks
+                    last_empty_rank = j + 1
+                    currently_empty = True
+                elif (
+                    currently_empty
+                ):  # If we're in a window of consecutive blank assignments that started at the beginning
+                    last_empty_rank = j + 1
+                else:
+                    break
+            else:
+                break
+        if last_empty_rank != "":
+            res_df.iloc[i, last_empty_rank:] = "NA"
+            if last_empty_rank == 1:
+                res_df.iloc[i, 1] = ranks[0]
+
+    return res_df
+
+
+def main():
+    _INPUT, _LABEL, _SAMPLE = parse_args()
+
+    mseq_df = pd.read_csv(_INPUT, header=1, delim_whitespace=True, usecols=[0, 12])
+
+    short_ranks, long_ranks = parse_label(_LABEL)
+    res_df = parse_mapseq(mseq_df, short_ranks, long_ranks)
+    final_res_df = process_blank_tax_ends(res_df, short_ranks)
+
+    final_res_df.to_csv(f"./{_SAMPLE}_{_LABEL}_asv_taxa.tsv", sep="\t", index=False)
+
+if __name__ == "__main__":
+    main()
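The trickiest part of this new script is `process_blank_tax_ends()`: only a run of blank assignments that reaches the species end of the row is converted to NA, while blanks sandwiched between named ranks are kept. A pure-Python sketch of that rule for a single row (helper name and example values are illustrative):

SILVA_BLANKS = ["sk__", "k__", "p__", "c__", "o__", "f__", "g__", "s__"]

def trim_trailing_blanks(assignment, blanks=SILVA_BLANKS):
    # Walk backwards from Species and replace only the *trailing* run of
    # blank placeholders with "NA"; blanks in the middle are untouched.
    # (The real function additionally restores the first rank's placeholder
    # when every rank in the row is blank.)
    trimmed = list(assignment)
    j = len(trimmed) - 1
    while j >= 0 and trimmed[j] in blanks:
        trimmed[j] = "NA"
        j -= 1
    return trimmed

# trim_trailing_blanks(["sk__Bacteria", "k__", "p__Proteobacteria", "g__", "s__"])
# -> ["sk__Bacteria", "k__", "p__Proteobacteria", "NA", "NA"]

With the new `mapseq_to_asv_table` entry point (registered in entry_points.txt below), the script runs from the command line, e.g. `mapseq_to_asv_table -i mapseq_output.tsv -l DADA2-SILVA -s SAMPLE1` (file and sample names illustrative), which writes `./SAMPLE1_DADA2-SILVA_asv_taxa.tsv`.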
mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py

@@ -0,0 +1,116 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# Copyright 2024 EMBL - European Bioinformatics Institute
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+from collections import defaultdict
+import re
+
+from Bio import SeqIO
+import pandas as pd
+
+from mgnify_pipelines_toolkit.constants.var_region_coordinates import REGIONS_16S_BACTERIA, REGIONS_16S_ARCHAEA, REGIONS_18S
+
+STRAND_FWD = "fwd"
+STRAND_REV = "rev"
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("-i", "--input", required=True, type=str, help="Path to cmsearch_deoverlap_tblout file")
+    parser.add_argument("-f", "--fasta", required=True, type=str, help="Path to concatenated primers fasta file")
+    parser.add_argument("-s", "--sample", required=True, type=str, help="Sample ID")
+    args = parser.parse_args()
+
+    _INPUT = args.input
+    _FASTA = args.fasta
+    _SAMPLE = args.sample
+
+    return _INPUT, _FASTA, _SAMPLE
+
+
+def get_amp_region(beg, strand, model):
+    prev_region = ""
+
+    for region, region_coords in model.items():
+
+        region_beg = region_coords[0]
+        beg_diff = region_beg - beg
+
+        if strand == STRAND_FWD:
+            if beg_diff > 0:
+                return region
+        else:
+            if beg_diff > 0:
+                return prev_region
+
+        prev_region = region
+
+    return prev_region
+
+
+def main():
+    _INPUT, _FASTA, _SAMPLE = parse_args()
+    res_dict = defaultdict(list)
+    fasta_dict = SeqIO.to_dict(SeqIO.parse(_FASTA, "fasta"))
+
+    with open(_INPUT, "r") as fr:
+        for line in fr:
+            line = line.strip()
+            line = re.sub("[ \t]+", "\t", line)
+            line_lst = line.split("\t")
+
+            primer_name = line_lst[0]
+            rfam = line_lst[3]
+            beg = float(line_lst[5])
+
+            if rfam == "RF00177":
+                gene = "16S"
+                model = REGIONS_16S_BACTERIA
+            elif rfam == "RF01959":
+                gene = "16S"
+                model = REGIONS_16S_ARCHAEA
+            elif rfam == "RF01960":
+                gene = "18S"
+                model = REGIONS_18S
+            else:
+                continue
+
+            res_dict["Run"].append(_SAMPLE)
+            res_dict["AssertionEvidence"].append("ECO_0000363")
+            res_dict["AssertionMethod"].append("automatic assertion")
+
+            strand = ""
+
+            if "F" in primer_name:
+                strand = STRAND_FWD
+            elif "R" in primer_name:
+                strand = STRAND_REV
+
+            amp_region = get_amp_region(beg, strand, model)
+            primer_seq = str(fasta_dict[primer_name].seq)
+
+            res_dict["Gene"].append(gene)
+            res_dict["VariableRegion"].append(amp_region)
+            res_dict["PrimerName"].append(primer_name)
+            res_dict["PrimerStrand"].append(strand)
+            res_dict["PrimerSeq"].append(primer_seq)
+
+    res_df = pd.DataFrame.from_dict(res_dict)
+    res_df.to_csv(f"./{_SAMPLE}_primer_validation.tsv", sep="\t", index=False)
+
+
+if __name__ == "__main__":
+    main()
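`get_amp_region()` walks the ordered variable-region table: for a forward primer it returns the first region starting downstream of the cmsearch hit position, for a reverse primer the region just upstream. A quick check against the bacterial 16S coordinates, assuming the 0.1.3 package and its Biopython dependency are installed (the module imports `Bio.SeqIO` at the top level):

from mgnify_pipelines_toolkit.analysis.amplicon.primer_val_classification import (
    get_amp_region, STRAND_FWD, STRAND_REV,
)
from mgnify_pipelines_toolkit.constants.var_region_coordinates import REGIONS_16S_BACTERIA

# A hit beginning at model position 300 lies between V2 (131-239) and
# V3 (430-487): a forward primer there amplifies from V3 onwards, while
# a reverse primer closes the amplicon at V2.
print(get_amp_region(300, STRAND_FWD, REGIONS_16S_BACTERIA))  # V3
print(get_amp_region(300, STRAND_REV, REGIONS_16S_BACTERIA))  # V2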
mgnify_pipelines_toolkit/constants/tax_ranks.py

@@ -14,5 +14,5 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-_SILVA_TAX_RANKS = ["Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species"]
+_SILVA_TAX_RANKS = ["Superkingdom", "Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species"]
 _PR2_TAX_RANKS = ["Domain", "Supergroup", "Division", "Subdivision", "Class", "Order", "Family", "Genus", "Species"]
mgnify_pipelines_toolkit/constants/var_region_coordinates.py

@@ -15,36 +15,36 @@
 # limitations under the License.

 REGIONS_16S_BACTERIA = {
-
-
-
-
-
-
-
-
-
+    "V1": [69, 92],
+    "V2": [131, 239],
+    "V3": [430, 487],
+    "V4": [566, 672],
+    "V5": [812, 869],
+    "V6": [976, 1033],
+    "V7": [1107, 1164],
+    "V8": [1234, 1285],
+    "V9": [1426, 1456]
 }

 REGIONS_16S_ARCHAEA = {
-
-
-
-
-
-
-
-
-
+    "V1": [61, 79],
+    "V2": [114, 223],
+    "V3": [397, 436],
+    "V4": [516, 623],
+    "V5": [763, 824],
+    "V6": [932, 982],
+    "V7": [1056, 1119],
+    "V8": [1189, 1240],
+    "V9": [1372, 1410]
 }

 REGIONS_18S = {
-
-
-
-
-
-
-
-
+    "V1": [69, 109],
+    "V2": [136, 298],
+    "V3": [474, 545],
+    "V4": [627, 873],
+    "V5": [1059, 1102],
+    "V7": [1366, 1454],
+    "V8": [1526, 1608],
+    "V9": [1728, 1795]
 }
{mgnify_pipelines_toolkit-0.1.1.dist-info → mgnify_pipelines_toolkit-0.1.3.dist-info}/METADATA
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: mgnify_pipelines_toolkit
-Version: 0.1.1
+Version: 0.1.3
 Summary: Collection of scripts and tools for MGnify pipelines
 Author-email: MGnify team <metagenomics-help@ebi.ac.uk>
 License: Apache Software License 2.0

@@ -55,13 +56,14 @@ You should then be able to run the packages from the command-line. For example t
 ### New script requirements

 There are a few requirements for your script:
+
 - It needs to have a named main function of some kind. See `mgnify_pipelines_toolkit/analysis/shared/get_subunits.py` and the `main()` function for an example
 - Because this package is meant to be run from the command-line, make sure your script can easily pass arguments using tools like `argparse` or `click`
 - A small amount of dependencies. This requirement is subjective, but for example if your script only requires a handful of basic packages like `Biopython`, `numpy`, `pandas`, etc., then it's fine. However if the script has a more extensive list of dependencies, a container is probably a better fit.

 ### How to add a new script

-To add a new Python script, first copy it over to the `mgnify_pipelines_toolkit` directory in this repository, specifically to the subdirectory that makes the most sense. If none of the subdirectories make sense for your script, create a new one. If your script doesn't have a `main()` type function yet, write one.
+To add a new Python script, first copy it over to the `mgnify_pipelines_toolkit` directory in this repository, specifically to the subdirectory that makes the most sense. If none of the subdirectories make sense for your script, create a new one. If your script doesn't have a `main()` type function yet, write one.

 Then, open `pyproject.toml` as you will need to add some bits. First, add any missing dependencies (include the version) to the `dependencies` field.

@@ -73,7 +74,7 @@ Then, scroll down to the `[project.scripts]` line. Here, you will create an alias

 - `get_subunits` is the alias
 - `mgnify_pipelines_toolkit.analysis.shared.get_subunits` will link the alias to the script with the path `mgnify_pipelines_toolkit/analysis/shared/get_subunits.py`
-- `:main` will specifically call the function named `main()` when the alias is run.
+- `:main` will specifically call the function named `main()` when the alias is run.

 When you have setup this command, executing `get_subunits` on the command-line will be the equivalent of doing:

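For reference, an alias line under `[project.scripts]` has the shape `get_subunits = "mgnify_pipelines_toolkit.analysis.shared.get_subunits:main"` (the same mapping that appears in entry_points.txt below), and invoking the alias is equivalent to the following Python, assuming `main()` reads its arguments from `sys.argv` via `argparse`:

from mgnify_pipelines_toolkit.analysis.shared.get_subunits import main

main()  # what the console script generated for the `get_subunits` alias executes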
@@ -86,4 +87,5 @@ Finally, you will need to bump up the version in the `version` line.
 At the moment, these should be the only steps required to setup your script in this package (which is subject to change).

 ### Building and uploading to PyPi
+
 The building and pushing of the package is automated by GitHub Actions, which will activate only on a new release. Bioconda should then automatically pick up the new PyPi release and push it to their recipes, though it's worth keeping an eye on their automated pull requests just in case [here](https://github.com/bioconda/bioconda-recipes/pulls).
{mgnify_pipelines_toolkit-0.1.1.dist-info → mgnify_pipelines_toolkit-0.1.3.dist-info}/RECORD
RENAMED
@@ -1,12 +1,14 @@
 mgnify_pipelines_toolkit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 mgnify_pipelines_toolkit/analysis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-mgnify_pipelines_toolkit/analysis/amplicon/amplicon_utils.py,sha256=
+mgnify_pipelines_toolkit/analysis/amplicon/amplicon_utils.py,sha256=EvWTFV4gGn0SkrjwC2hzvNGSXFLeyFDmVj2QDa5DmtE,6402
 mgnify_pipelines_toolkit/analysis/amplicon/are_there_primers.py,sha256=fNjzWpXjqpQNRyWG6CoROyHAvA4lZsvPh8sxDpjyMkY,5141
 mgnify_pipelines_toolkit/analysis/amplicon/assess_inflection_point_mcp.py,sha256=pBNpqHFb0zzWgTD1mY3Q5MslQ5nmT99-pSHpyngVEuo,7159
 mgnify_pipelines_toolkit/analysis/amplicon/assess_mcp_proportions.py,sha256=BCzLlfvRqiPC-YwzET901f_d0anYt1zpf5y0iOCQnvs,5191
 mgnify_pipelines_toolkit/analysis/amplicon/classify_var_regions.py,sha256=89LKH4rvqRydAEGvfaWqIClcitJ1Vbu7b5d4FApzGp4,18392
 mgnify_pipelines_toolkit/analysis/amplicon/find_mcp_inflection_points.py,sha256=5emeZjk8891IgyL5ygVcr8wMP-hGEJoEs2rcBbseWj0,3536
-mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py,sha256=
+mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py,sha256=dZIygTbMZvVbSVBmFFAZz7x24oQEpvdEOTpTcnYAyoM,8444
+mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py,sha256=lWVIhDxfsTuDzWjjUlMGx3RL7iD_Yy8m9Ppc9wjfCFg,4765
+mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py,sha256=F4ALSuH8N-0hHUqPCFwHgoAnteb2Ft3tUN9j6DaD5h8,3539
 mgnify_pipelines_toolkit/analysis/amplicon/remove_ambiguous_reads.py,sha256=yhDJFGD3t3TMfUlBCJGwzlK4IjFwm7Bporwp-aIM8uU,3139
 mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py,sha256=VoSbou3BAZL4bJes4FsYJvmd45_PjKj8F2sQDIyLDoI,1680
 mgnify_pipelines_toolkit/analysis/amplicon/standard_primer_matching.py,sha256=odGVde7Ct9dS2aqsySWgdgVLCOqfr_ZGeHFcXcuukxs,10846

@@ -15,12 +17,12 @@ mgnify_pipelines_toolkit/analysis/shared/get_subunits.py,sha256=vvhn8O9t1zzD8rIv
 mgnify_pipelines_toolkit/analysis/shared/get_subunits_coords.py,sha256=hFXUTZb-etmJS7Si3mCCVCXV5ZYN0tP6FSbeiVxG1jo,1879
 mgnify_pipelines_toolkit/analysis/shared/mapseq2biom.py,sha256=glvql2Y-BTyA1hTIZz2nlmST3SE6LJbep9sKdMH-vaI,5565
 mgnify_pipelines_toolkit/constants/regex_ambiguous_bases.py,sha256=oVeeCy33bY1WJ-rffOULZ3ogi48Jz0FfTS73MPTur-A,1095
-mgnify_pipelines_toolkit/constants/tax_ranks.py,sha256=
+mgnify_pipelines_toolkit/constants/tax_ranks.py,sha256=fP97JvlUdxJBakJ694VresIY8-N3pcU99m7kZ9buKys,867
 mgnify_pipelines_toolkit/constants/thresholds.py,sha256=7J3caCikkEcLdKF4zSR0z8qMQw4-h9aSkSbFbS0LNg4,873
-mgnify_pipelines_toolkit/constants/var_region_coordinates.py,sha256=
-mgnify_pipelines_toolkit-0.1.
-mgnify_pipelines_toolkit-0.1.
-mgnify_pipelines_toolkit-0.1.
-mgnify_pipelines_toolkit-0.1.
-mgnify_pipelines_toolkit-0.1.
-mgnify_pipelines_toolkit-0.1.
+mgnify_pipelines_toolkit/constants/var_region_coordinates.py,sha256=E8Cd3A1Hj9M95zw9Ut-2x8sE6_PlH6RJJEoikyZUMaQ,1303
+mgnify_pipelines_toolkit-0.1.3.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+mgnify_pipelines_toolkit-0.1.3.dist-info/METADATA,sha256=D8bYOR2kQZzJPdqtFkHj_Xd4axEHjzJPJXKAHtFj8L0,4950
+mgnify_pipelines_toolkit-0.1.3.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+mgnify_pipelines_toolkit-0.1.3.dist-info/entry_points.txt,sha256=K8nqnyAQG9jqHGgIfMIaCIe20u5a0FFCCqJWi4DoD2U,1306
+mgnify_pipelines_toolkit-0.1.3.dist-info/top_level.txt,sha256=xA_wC7C01V3VwuDnqwRM2QYeJJ45WtvF6LVav4tYxuE,25
+mgnify_pipelines_toolkit-0.1.3.dist-info/RECORD,,
{mgnify_pipelines_toolkit-0.1.1.dist-info → mgnify_pipelines_toolkit-0.1.3.dist-info}/entry_points.txt
RENAMED

@@ -8,6 +8,8 @@ get_subunits = mgnify_pipelines_toolkit.analysis.shared.get_subunits:main
 get_subunits_coords = mgnify_pipelines_toolkit.analysis.shared.get_subunits_coords:main
 make_asv_count_table = mgnify_pipelines_toolkit.analysis.amplicon.make_asv_count_table:main
 mapseq2biom = mgnify_pipelines_toolkit.analysis.shared.mapseq2biom:main
+mapseq_to_asv_table = mgnify_pipelines_toolkit.analysis.amplicon.mapseq_to_asv_table:main
+primer_val_classification = mgnify_pipelines_toolkit.analysis.amplicon.primer_val_classification:main
 remove_ambiguous_reads = mgnify_pipelines_toolkit.analysis.amplicon.remove_ambiguous_reads:main
 rev_comp_se_primers = mgnify_pipelines_toolkit.analysis.amplicon.rev_comp_se_primers:main
 standard_primer_matching = mgnify_pipelines_toolkit.analysis.amplicon.standard_primer_matching:main
{mgnify_pipelines_toolkit-0.1.1.dist-info → mgnify_pipelines_toolkit-0.1.3.dist-info}/LICENSE
RENAMED
File without changes

{mgnify_pipelines_toolkit-0.1.1.dist-info → mgnify_pipelines_toolkit-0.1.3.dist-info}/top_level.txt
RENAMED

File without changes