mgnify-pipelines-toolkit 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: the registry has flagged this version of mgnify-pipelines-toolkit as possibly problematic.
- mgnify_pipelines_toolkit/analysis/amplicon/amplicon_utils.py +4 -8
- mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py +38 -34
- mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py +39 -26
- mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py +116 -0
- mgnify_pipelines_toolkit/constants/var_region_coordinates.py +26 -26
- {mgnify_pipelines_toolkit-0.1.2.dist-info → mgnify_pipelines_toolkit-0.1.4.dist-info}/METADATA +1 -1
- {mgnify_pipelines_toolkit-0.1.2.dist-info → mgnify_pipelines_toolkit-0.1.4.dist-info}/RECORD +11 -10
- {mgnify_pipelines_toolkit-0.1.2.dist-info → mgnify_pipelines_toolkit-0.1.4.dist-info}/WHEEL +1 -1
- {mgnify_pipelines_toolkit-0.1.2.dist-info → mgnify_pipelines_toolkit-0.1.4.dist-info}/entry_points.txt +1 -0
- {mgnify_pipelines_toolkit-0.1.2.dist-info → mgnify_pipelines_toolkit-0.1.4.dist-info}/LICENSE +0 -0
- {mgnify_pipelines_toolkit-0.1.2.dist-info → mgnify_pipelines_toolkit-0.1.4.dist-info}/top_level.txt +0 -0
mgnify_pipelines_toolkit/analysis/amplicon/amplicon_utils.py
CHANGED

@@ -46,18 +46,11 @@ def get_read_count(read_path, type='fastq'):
         ]
         zcat_proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
 
-        cmd = [
-            'sed',
-            '-n',
-            '1~4p',
-        ]
-        sed_proc = subprocess.Popen(cmd, stdin=zcat_proc.stdout, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-
         cmd = [
             'wc',
             '-l'
         ]
-        wc_proc = subprocess.Popen(cmd, stdin=sed_proc.stdout, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        wc_proc = subprocess.Popen(cmd, stdin=zcat_proc.stdout, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
         stdout, stderr = wc_proc.communicate()
 
     elif type == 'fasta':

@@ -78,6 +71,9 @@ def get_read_count(read_path, type='fastq'):
 
     read_count = int(read_count)
 
+    if type == 'fastq':
+        read_count /= 4
+
     return read_count
 
 def build_cons_seq(cons_list, read_count, cons_threshold=0.80, do_not_include=None, counter=1, max_line_count=None):
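The net effect of this change: get_read_count no longer pre-filters FASTQ header lines with sed -n 1~4p; it pipes zcat straight into wc -l and divides the total line count by four in the second hunk. A minimal pure-Python sketch of the same counting logic (count_fastq_reads is a hypothetical helper, not part of the package):

import gzip

def count_fastq_reads(read_path):
    # Count every line of the gzipped FASTQ, then divide by four:
    # each FASTQ record spans exactly four lines.
    with gzip.open(read_path, "rt") as handle:
        line_count = sum(1 for _ in handle)
    return line_count // 4

print(count_fastq_reads("reads.fastq.gz"))  # e.g. 25000 for a 100000-line file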
mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py
CHANGED

@@ -25,13 +25,14 @@ from mgnify_pipelines_toolkit.constants.tax_ranks import _SILVA_TAX_RANKS, _PR2_TAX_RANKS
 logging.basicConfig(level=logging.DEBUG)
 
 def parse_args():
-
     parser = argparse.ArgumentParser()
 
     parser.add_argument("-t", "--taxa", required=True, type=str, help="Path to taxa file")
     parser.add_argument("-f", "--fwd", required=True, type=str, help="Path to DADA2 forward map file")
     parser.add_argument("-r", "--rev", required=False, type=str, help="Path to DADA2 reverse map file")
-    parser.add_argument("-a", "--amp", required=True, type=str, help="Path to extracted amp_region reads from inference subworkflow")
+    parser.add_argument(
+        "-a", "--amp", required=True, type=str, help="Path to extracted amp_region reads from inference subworkflow"
+    )
     parser.add_argument("-hd", "--headers", required=True, type=str, help="Path to fastq headers")
     parser.add_argument("-s", "--sample", required=True, type=str, help="Sample ID")
 

@@ -48,7 +49,6 @@ def parse_args():
 
 
 def order_df(taxa_df):
-
     if len(taxa_df.columns) == 9:
         taxa_df = taxa_df.sort_values(_SILVA_TAX_RANKS, ascending=True)
     elif len(taxa_df.columns) == 10:

@@ -60,11 +60,9 @@ def order_df(taxa_df):
     return taxa_df
 
 def make_tax_assignment_dict_silva(taxa_df, asv_dict):
-
     tax_assignment_dict = defaultdict(int)
 
     for i in range(len(taxa_df)):
-
         sorted_index = taxa_df.index[i]
         asv_num = taxa_df.iloc[i, 0]
         asv_count = asv_dict[asv_num]

@@ -78,7 +76,7 @@ def make_tax_assignment_dict_silva(taxa_df, asv_dict):
         c = taxa_df.loc[sorted_index, "Class"]
         o = taxa_df.loc[sorted_index, "Order"]
         f = taxa_df.loc[sorted_index, "Family"]
-        g = taxa_df.loc[sorted_index, "Genus"]
+        g = taxa_df.loc[sorted_index, "Genus"]
         s = taxa_df.loc[sorted_index, "Species"]
 
         tax_assignment = ""

@@ -142,11 +140,9 @@ def make_tax_assignment_dict_silva(taxa_df, asv_dict):
     return tax_assignment_dict
 
 def make_tax_assignment_dict_pr2(taxa_df, asv_dict):
-
     tax_assignment_dict = defaultdict(int)
 
     for i in range(len(taxa_df)):
-
         sorted_index = taxa_df.index[i]
         asv_num = taxa_df.iloc[i, 0]
         asv_count = asv_dict[asv_num]

@@ -162,12 +158,11 @@ def make_tax_assignment_dict_pr2(taxa_df, asv_dict):
         o = taxa_df.loc[sorted_index, "Order"]
         f = taxa_df.loc[sorted_index, "Family"]
         g = taxa_df.loc[sorted_index, "Genus"]
-        s = taxa_df.loc[sorted_index, "Species"]
+        s = taxa_df.loc[sorted_index, "Species"]
 
         tax_assignment = ""
 
         while True:
-
             if d != "0":
                 d = "_".join(d.split(" "))
                 tax_assignment += d

@@ -228,10 +223,28 @@ def make_tax_assignment_dict_pr2(taxa_df, asv_dict):
 
     return tax_assignment_dict
 
-def main():
+def generate_asv_count_dict(asv_dict):
+
+    res_dict = defaultdict(list)
+
+    for asv_id, count in asv_dict.items():
+
+        if count == 0:
+            continue
+
+        res_dict['asv'].append(asv_id)
+        res_dict['count'].append(count)
+
+    res_df = pd.DataFrame.from_dict(res_dict)
+    res_df = res_df.sort_values(by='asv', ascending=True)
+    res_df = res_df.sort_values(by='count', ascending=False)
 
+    return res_df
+
+
+def main():
     _TAXA, _FWD, _REV, _AMP, _HEADERS, _SAMPLE = parse_args()
-
+
     fwd_fr = open(_FWD, "r")
     paired_end = True
 

@@ -244,36 +257,25 @@ def main():
     taxa_df = pd.read_csv(_TAXA, sep="\t", dtype=str)
     taxa_df = taxa_df.fillna("0")
     taxa_df = order_df(taxa_df)
-
-    amp_reads = [
-    headers = [
+
+    amp_reads = [read.strip() for read in list(open(_AMP, "r"))]
+    headers = [read.split(" ")[0][1:] for read in
+               list(open(_HEADERS, "r"))]
     amp_region = ".".join(_AMP.split(".")[1:3])
 
     asv_dict = defaultdict(int)
 
     counter = -1
     for line_fwd in fwd_fr:
-
         counter += 1
         line_fwd = line_fwd.strip()
-
-
-
-        line_rev = next(rev_fr).strip()
-        rev_asvs = line_rev.split(",")
-        asv_intersection = list(set(fwd_asvs).intersection(rev_asvs))
-
-        if len(asv_intersection) == 0:
-            continue
-
-        if len(asv_intersection) == 1 and asv_intersection[0] == "0":
-            continue
-        else:
-            asv_intersection = fwd_asvs
+
+        if line_fwd == '0':
+            continue
 
         if headers[counter] in amp_reads:
-            asv_dict[f"seq_{
-
+            asv_dict[f"seq_{line_fwd}"] += 1
+
     fwd_fr.close()
     if paired_end:
         rev_fr.close()

@@ -290,7 +292,9 @@ def main():
     with open(f"./{_SAMPLE}_{amp_region}_{ref_db}_asv_krona_counts.txt", "w") as fw:
         for tax_assignment, count in tax_assignment_dict.items():
             fw.write(f"{count}\t{tax_assignment}\n")
-
+
+    asv_count_df = generate_asv_count_dict(asv_dict)
+    asv_count_df.to_csv(f'./{_SAMPLE}_{amp_region}_asv_read_counts.tsv', sep='\t', index=False)
 
 if __name__ == "__main__":
-    main()
+    main()
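Two behavioural changes stand out here. First, main() no longer intersects forward and reverse ASV maps per read; it simply skips forward-map lines equal to '0' and counts the remaining ASV IDs directly. Second, the new generate_asv_count_dict() builds a per-ASV count table, which main() writes to ./{sample}_{amp_region}_asv_read_counts.tsv. A toy illustration of what that function produces (made-up counts, logic copied from the diff):

from collections import defaultdict
import pandas as pd

asv_dict = defaultdict(int, {"seq_1": 120, "seq_2": 0, "seq_3": 45})

res_dict = defaultdict(list)
for asv_id, count in asv_dict.items():
    if count == 0:
        continue  # zero-count ASVs are dropped from the table
    res_dict["asv"].append(asv_id)
    res_dict["count"].append(count)

res_df = pd.DataFrame.from_dict(res_dict)
res_df = res_df.sort_values(by="asv", ascending=True)
res_df = res_df.sort_values(by="count", ascending=False)
print(res_df)
#      asv  count
# 0  seq_1    120
# 1  seq_3     45

Note that the second sort_values call overrides the first unless a stable sort kind is requested, so the table effectively comes out sorted by count, descending.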
mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py
CHANGED

@@ -23,10 +23,16 @@ import pandas as pd
 logging.basicConfig(level=logging.DEBUG)
 
 def parse_args():
-
     parser = argparse.ArgumentParser()
     parser.add_argument("-i", "--input", required=True, type=str, help="Input from MAPseq output")
-    parser.add_argument(
+    parser.add_argument(
+        "-l",
+        "--label",
+        choices=["DADA2-SILVA", "DADA2-PR2"],
+        required=True,
+        type=str,
+        help="Database label - either DADA2-SILVA or DADA2-PR2",
+    )
     parser.add_argument("-s", "--sample", required=True, type=str, help="Sample ID")
 
     args = parser.parse_args()

@@ -38,20 +44,19 @@ def parse_args():
     return _INPUT, _LABEL, _SAMPLE
 
 def parse_label(label):
-
     silva_short_ranks = ["sk__", "k__", "p__", "c__", "o__", "f__", "g__", "s__"]
     pr2_short_ranks = ["d__", "sg__", "dv__", "sdv__", "c__", "o__", "f__", "g__", "s__"]
 
     silva_long_ranks = ["Superkingdom", "Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species"]
     pr2_long_ranks = ["Domain", "Supergroup", "Division", "Subdivision", "Class", "Order", "Family", "Genus", "Species"]
 
-    chosen_short_ranks = ''
-    chosen_long_ranks = ''
+    chosen_short_ranks = ""
+    chosen_long_ranks = ""
 
-    if label == 'DADA2-SILVA':
+    if label == "DADA2-SILVA":
         chosen_short_ranks = silva_short_ranks
         chosen_long_ranks = silva_long_ranks
-    elif label == 'DADA2-PR2':
+    elif label == "DADA2-PR2":
         chosen_short_ranks = pr2_short_ranks
         chosen_long_ranks = pr2_long_ranks
     else:

@@ -61,26 +66,28 @@ def parse_label(label):
     return chosen_short_ranks, chosen_long_ranks
 
 def parse_mapseq(mseq_df, short_ranks, long_ranks):
-
     res_dict = defaultdict(list)
 
     for i in range(len(mseq_df)):
         asv_id = mseq_df.iloc[i, 0]
-        tax_ass = mseq_df.iloc[i, 1].split(';')
 
-
-
-
+        if pd.isna(mseq_df.iloc[i, 1]):
+            tax_ass = [short_ranks[0]]
+        else:
+            tax_ass = mseq_df.iloc[i, 1].split(";")
+
+        res_dict["ASV"].append(asv_id)
 
+        for j in range(len(short_ranks)):
             curr_rank = long_ranks[j]
-
+
             if j >= len(tax_ass):
                 # This would only be true if the assigned taxonomy is shorter than the total reference database taxononmy
                 # so fill each remaining rank with its respective short rank blank
                 curr_tax = short_ranks[j]
             else:
                 curr_tax = tax_ass[j]
-
+
             res_dict[curr_rank].append(curr_tax)
     res_df = pd.DataFrame.from_dict(res_dict)
 

@@ -91,27 +98,33 @@ def process_blank_tax_ends(res_df, ranks):
     # while avoiding making blanks in the middle as NAs
 
     for i in range(len(res_df)):
-        last_empty_rank = ''
+        last_empty_rank = ""
         currently_empty = False
-        for j in reversed(range(len(ranks))): # Parse an assignment backwards, from Species all the way to Superkingdom/Domain
-            curr_rank = res_df.iloc[i, j+1]
+        for j in reversed(
+            range(len(ranks))
+        ):  # Parse an assignment backwards, from Species all the way to Superkingdom/Domain
+            curr_rank = res_df.iloc[i, j + 1]
             if curr_rank in ranks:
-                if last_empty_rank == '': # Last rank is empty, start window of consecutive blanks
-                    last_empty_rank = j+1
+                if last_empty_rank == "":  # Last rank is empty, start window of consecutive blanks
+                    last_empty_rank = j + 1
                     currently_empty = True
-                elif currently_empty: # If we're in a window of consecutive blank assignments that started at the beginning
-                    last_empty_rank = j+1
+                elif (
+                    currently_empty
+                ):  # If we're in a window of consecutive blank assignments that started at the beginning
+                    last_empty_rank = j + 1
                 else:
                     break
             else:
                 break
-        if last_empty_rank != '':
-            res_df.iloc[i, last_empty_rank:] = 'NA'
+        if last_empty_rank != "":
+            res_df.iloc[i, last_empty_rank:] = "NA"
+            if last_empty_rank == 1:
+                res_df.iloc[i, 1] = ranks[0]
 
     return res_df
 
-
-def main():
+
+def main():
     _INPUT, _LABEL, _SAMPLE = parse_args()
 
     mseq_df = pd.read_csv(_INPUT, header=1, delim_whitespace=True, usecols=[0, 12])

@@ -123,4 +136,4 @@ def main():
     final_res_df.to_csv(f"./{_SAMPLE}_{_LABEL}_asv_taxa.tsv", sep="\t", index=False)
 
 if __name__ == "__main__":
-    main()
+    main()
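The key fix in parse_mapseq() is the pd.isna() guard: pd.read_csv parses an empty taxonomy column as NaN (a float), so the old unconditional .split(';') raised AttributeError on unassigned rows; such rows now fall back to the blank top rank. A hedged sketch of the guard in isolation (toy cell value, rank list copied from parse_label):

import pandas as pd

short_ranks = ["sk__", "k__", "p__", "c__", "o__", "f__", "g__", "s__"]
cell = float("nan")  # what an empty taxonomy cell parses to

if pd.isna(cell):
    tax_ass = [short_ranks[0]]  # fall back to a blank Superkingdom
else:
    tax_ass = cell.split(";")

print(tax_ass)  # ['sk__']

process_blank_tax_ends() also gains a special case: when every rank in a row is blank (last_empty_rank == 1), the first rank keeps its blank prefix (ranks[0]) rather than becoming NA.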
mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py
ADDED

@@ -0,0 +1,116 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# Copyright 2024 EMBL - European Bioinformatics Institute
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+from collections import defaultdict
+import re
+
+from Bio import SeqIO
+import pandas as pd
+
+from mgnify_pipelines_toolkit.constants.var_region_coordinates import REGIONS_16S_BACTERIA, REGIONS_16S_ARCHAEA, REGIONS_18S
+
+STRAND_FWD = "fwd"
+STRAND_REV = "rev"
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("-i", "--input", required=True, type=str, help="Path to cmsearch_deoverlap_tblout file")
+    parser.add_argument("-f", "--fasta", required=True, type=str, help="Path to concatenated primers fasta file")
+    parser.add_argument("-s", "--sample", required=True, type=str, help="Sample ID")
+    args = parser.parse_args()
+
+    _INPUT = args.input
+    _FASTA = args.fasta
+    _SAMPLE = args.sample
+
+    return _INPUT, _FASTA, _SAMPLE
+
+
+def get_amp_region(beg, strand, model):
+    prev_region = ""
+
+    for region, region_coords in model.items():
+
+        region_beg = region_coords[0]
+        beg_diff = region_beg - beg
+
+        if strand == STRAND_FWD:
+            if beg_diff > 0:
+                return region
+        else:
+            if beg_diff > 0:
+                return prev_region
+
+        prev_region = region
+
+    return prev_region
+
+
+def main():
+    _INPUT, _FASTA, _SAMPLE = parse_args()
+    res_dict = defaultdict(list)
+    fasta_dict = SeqIO.to_dict(SeqIO.parse(_FASTA, "fasta"))
+
+    with open(_INPUT, "r") as fr:
+        for line in fr:
+            line = line.strip()
+            line = re.sub("[ \t]+", "\t", line)
+            line_lst = line.split("\t")
+
+            primer_name = line_lst[0]
+            rfam = line_lst[3]
+            beg = float(line_lst[5])
+
+            if rfam == "RF00177":
+                gene = "16S"
+                model = REGIONS_16S_BACTERIA
+            elif rfam == "RF01959":
+                gene = "16S"
+                model = REGIONS_16S_ARCHAEA
+            elif rfam == "RF01960":
+                gene = "18S"
+                model = REGIONS_18S
+            else:
+                continue
+
+            res_dict["Run"].append(_SAMPLE)
+            res_dict["AssertionEvidence"].append("ECO_0000363")
+            res_dict["AssertionMethod"].append("automatic assertion")
+
+            strand = ""
+
+            if "F" in primer_name:
+                strand = STRAND_FWD
+            elif "R" in primer_name:
+                strand = STRAND_REV
+
+            amp_region = get_amp_region(beg, strand, model)
+            primer_seq = str(fasta_dict[primer_name].seq)
+
+            res_dict["Gene"].append(gene)
+            res_dict["VariableRegion"].append(amp_region)
+            res_dict["PrimerName"].append(primer_name)
+            res_dict["PrimerStrand"].append(strand)
+            res_dict["PrimerSeq"].append(primer_seq)
+
+    res_df = pd.DataFrame.from_dict(res_dict)
+    res_df.to_csv(f"./{_SAMPLE}_primer_validation.tsv", sep="\t", index=False)
+
+
+if __name__ == "__main__":
+    main()
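get_amp_region() maps a primer's start coordinate onto the variable-region tables: for a forward primer it returns the first region whose start lies downstream of the primer, and for a reverse primer the last region upstream of it. A worked example using the bacterial 16S coordinates from var_region_coordinates.py below (the primer positions 500 and 900 are invented):

REGIONS_16S_BACTERIA = {
    "V1": [69, 92], "V2": [131, 239], "V3": [430, 487],
    "V4": [566, 672], "V5": [812, 869], "V6": [976, 1033],
    "V7": [1107, 1164], "V8": [1234, 1285], "V9": [1426, 1456],
}

def get_amp_region(beg, strand, model):
    prev_region = ""
    for region, region_coords in model.items():
        beg_diff = region_coords[0] - beg
        if strand == "fwd":
            if beg_diff > 0:
                return region  # first region starting after the primer
        else:
            if beg_diff > 0:
                return prev_region  # last region before the reverse primer
        prev_region = region
    return prev_region

print(get_amp_region(500, "fwd", REGIONS_16S_BACTERIA))  # V4
print(get_amp_region(900, "rev", REGIONS_16S_BACTERIA))  # V5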
mgnify_pipelines_toolkit/constants/var_region_coordinates.py
CHANGED

@@ -15,36 +15,36 @@
 # limitations under the License.
 
 REGIONS_16S_BACTERIA = {
-
-
-
-
-
-
-
-
-
+    "V1": [69, 92],
+    "V2": [131, 239],
+    "V3": [430, 487],
+    "V4": [566, 672],
+    "V5": [812, 869],
+    "V6": [976, 1033],
+    "V7": [1107, 1164],
+    "V8": [1234, 1285],
+    "V9": [1426, 1456]
 }
 
 REGIONS_16S_ARCHAEA = {
-
-
-
-
-
-
-
-
-
+    "V1": [61, 79],
+    "V2": [114, 223],
+    "V3": [397, 436],
+    "V4": [516, 623],
+    "V5": [763, 824],
+    "V6": [932, 982],
+    "V7": [1056, 1119],
+    "V8": [1189, 1240],
+    "V9": [1372, 1410]
 }
 
 REGIONS_18S = {
-
-
-
-
-
-
-
-
+    "V1": [69, 109],
+    "V2": [136, 298],
+    "V3": [474, 545],
+    "V4": [627, 873],
+    "V5": [1059, 1102],
+    "V7": [1366, 1454],
+    "V8": [1526, 1608],
+    "V9": [1728, 1795]
 }
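Each entry now pairs a variable region with its [start, end] coordinates; note that REGIONS_18S carries no V6 entry. A quick ordering check one could run over the new values (a sketch, not package code):

REGIONS_18S = {
    "V1": [69, 109], "V2": [136, 298], "V3": [474, 545],
    "V4": [627, 873], "V5": [1059, 1102], "V7": [1366, 1454],
    "V8": [1526, 1608], "V9": [1728, 1795],
}

prev_end = 0
for region, (beg, end) in REGIONS_18S.items():
    assert beg < end, f"{region} has an inverted span"
    assert beg > prev_end, f"{region} overlaps the previous region"
    prev_end = end
print("18S spans are ordered and non-overlapping")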
{mgnify_pipelines_toolkit-0.1.2.dist-info → mgnify_pipelines_toolkit-0.1.4.dist-info}/RECORD
RENAMED

@@ -1,13 +1,14 @@
 mgnify_pipelines_toolkit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 mgnify_pipelines_toolkit/analysis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-mgnify_pipelines_toolkit/analysis/amplicon/amplicon_utils.py,sha256=
+mgnify_pipelines_toolkit/analysis/amplicon/amplicon_utils.py,sha256=EvWTFV4gGn0SkrjwC2hzvNGSXFLeyFDmVj2QDa5DmtE,6402
 mgnify_pipelines_toolkit/analysis/amplicon/are_there_primers.py,sha256=fNjzWpXjqpQNRyWG6CoROyHAvA4lZsvPh8sxDpjyMkY,5141
 mgnify_pipelines_toolkit/analysis/amplicon/assess_inflection_point_mcp.py,sha256=pBNpqHFb0zzWgTD1mY3Q5MslQ5nmT99-pSHpyngVEuo,7159
 mgnify_pipelines_toolkit/analysis/amplicon/assess_mcp_proportions.py,sha256=BCzLlfvRqiPC-YwzET901f_d0anYt1zpf5y0iOCQnvs,5191
 mgnify_pipelines_toolkit/analysis/amplicon/classify_var_regions.py,sha256=89LKH4rvqRydAEGvfaWqIClcitJ1Vbu7b5d4FApzGp4,18392
 mgnify_pipelines_toolkit/analysis/amplicon/find_mcp_inflection_points.py,sha256=5emeZjk8891IgyL5ygVcr8wMP-hGEJoEs2rcBbseWj0,3536
-mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py,sha256=
-mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py,sha256=
+mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py,sha256=-Kh9KlEuWWFAWUaNjkCxl-L3IdEMURLM3UKb2Tf81CM,8605
+mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py,sha256=lWVIhDxfsTuDzWjjUlMGx3RL7iD_Yy8m9Ppc9wjfCFg,4765
+mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py,sha256=F4ALSuH8N-0hHUqPCFwHgoAnteb2Ft3tUN9j6DaD5h8,3539
 mgnify_pipelines_toolkit/analysis/amplicon/remove_ambiguous_reads.py,sha256=yhDJFGD3t3TMfUlBCJGwzlK4IjFwm7Bporwp-aIM8uU,3139
 mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py,sha256=VoSbou3BAZL4bJes4FsYJvmd45_PjKj8F2sQDIyLDoI,1680
 mgnify_pipelines_toolkit/analysis/amplicon/standard_primer_matching.py,sha256=odGVde7Ct9dS2aqsySWgdgVLCOqfr_ZGeHFcXcuukxs,10846

@@ -18,10 +19,10 @@ mgnify_pipelines_toolkit/analysis/shared/mapseq2biom.py,sha256=glvql2Y-BTyA1hTIZ
 mgnify_pipelines_toolkit/constants/regex_ambiguous_bases.py,sha256=oVeeCy33bY1WJ-rffOULZ3ogi48Jz0FfTS73MPTur-A,1095
 mgnify_pipelines_toolkit/constants/tax_ranks.py,sha256=fP97JvlUdxJBakJ694VresIY8-N3pcU99m7kZ9buKys,867
 mgnify_pipelines_toolkit/constants/thresholds.py,sha256=7J3caCikkEcLdKF4zSR0z8qMQw4-h9aSkSbFbS0LNg4,873
-mgnify_pipelines_toolkit/constants/var_region_coordinates.py,sha256=
-mgnify_pipelines_toolkit-0.1.2.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-mgnify_pipelines_toolkit-0.1.2.dist-info/METADATA,sha256=
-mgnify_pipelines_toolkit-0.1.2.dist-info/WHEEL,sha256=
-mgnify_pipelines_toolkit-0.1.2.dist-info/entry_points.txt,sha256=
-mgnify_pipelines_toolkit-0.1.2.dist-info/top_level.txt,sha256=xA_wC7C01V3VwuDnqwRM2QYeJJ45WtvF6LVav4tYxuE,25
+mgnify_pipelines_toolkit/constants/var_region_coordinates.py,sha256=E8Cd3A1Hj9M95zw9Ut-2x8sE6_PlH6RJJEoikyZUMaQ,1303
+mgnify_pipelines_toolkit-0.1.4.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+mgnify_pipelines_toolkit-0.1.4.dist-info/METADATA,sha256=1s--DpsRsfAyM2HomRYTAfLF3shjuZ5o0CF-FoceZmw,4950
+mgnify_pipelines_toolkit-0.1.4.dist-info/WHEEL,sha256=Z4pYXqR_rTB7OWNDYFOm1qRk0RX6GFP2o8LgvP453Hk,91
+mgnify_pipelines_toolkit-0.1.4.dist-info/entry_points.txt,sha256=K8nqnyAQG9jqHGgIfMIaCIe20u5a0FFCCqJWi4DoD2U,1306
+mgnify_pipelines_toolkit-0.1.4.dist-info/top_level.txt,sha256=xA_wC7C01V3VwuDnqwRM2QYeJJ45WtvF6LVav4tYxuE,25
+mgnify_pipelines_toolkit-0.1.4.dist-info/RECORD,,
{mgnify_pipelines_toolkit-0.1.2.dist-info → mgnify_pipelines_toolkit-0.1.4.dist-info}/entry_points.txt
RENAMED

@@ -9,6 +9,7 @@ get_subunits_coords = mgnify_pipelines_toolkit.analysis.shared.get_subunits_coords:main
 make_asv_count_table = mgnify_pipelines_toolkit.analysis.amplicon.make_asv_count_table:main
 mapseq2biom = mgnify_pipelines_toolkit.analysis.shared.mapseq2biom:main
 mapseq_to_asv_table = mgnify_pipelines_toolkit.analysis.amplicon.mapseq_to_asv_table:main
+primer_val_classification = mgnify_pipelines_toolkit.analysis.amplicon.primer_val_classification:main
 remove_ambiguous_reads = mgnify_pipelines_toolkit.analysis.amplicon.remove_ambiguous_reads:main
 rev_comp_se_primers = mgnify_pipelines_toolkit.analysis.amplicon.rev_comp_se_primers:main
 standard_primer_matching = mgnify_pipelines_toolkit.analysis.amplicon.standard_primer_matching:main
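With this entry point, installing the 0.1.4 wheel exposes a primer_val_classification console script. Going by the argparse definition above, an invocation would look roughly like primer_val_classification -i sample.tblout.deoverlapped -f sample_primers.fasta -s SAMPLE1 (the file names here are illustrative), producing ./SAMPLE1_primer_validation.tsv.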
{mgnify_pipelines_toolkit-0.1.2.dist-info → mgnify_pipelines_toolkit-0.1.4.dist-info}/LICENSE
RENAMED
File without changes

{mgnify_pipelines_toolkit-0.1.2.dist-info → mgnify_pipelines_toolkit-0.1.4.dist-info}/top_level.txt
RENAMED
File without changes