mgnify-pipelines-toolkit 0.1.2__py3-none-any.whl → 0.1.3__py3-none-any.whl
This diff reflects the changes between publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
Potentially problematic release: this version of mgnify-pipelines-toolkit has been flagged as possibly problematic.
- mgnify_pipelines_toolkit/analysis/amplicon/amplicon_utils.py +4 -8
- mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py +16 -22
- mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py +39 -26
- mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py +116 -0
- mgnify_pipelines_toolkit/constants/var_region_coordinates.py +26 -26
- {mgnify_pipelines_toolkit-0.1.2.dist-info → mgnify_pipelines_toolkit-0.1.3.dist-info}/METADATA +1 -1
- {mgnify_pipelines_toolkit-0.1.2.dist-info → mgnify_pipelines_toolkit-0.1.3.dist-info}/RECORD +11 -10
- {mgnify_pipelines_toolkit-0.1.2.dist-info → mgnify_pipelines_toolkit-0.1.3.dist-info}/entry_points.txt +1 -0
- {mgnify_pipelines_toolkit-0.1.2.dist-info → mgnify_pipelines_toolkit-0.1.3.dist-info}/LICENSE +0 -0
- {mgnify_pipelines_toolkit-0.1.2.dist-info → mgnify_pipelines_toolkit-0.1.3.dist-info}/WHEEL +0 -0
- {mgnify_pipelines_toolkit-0.1.2.dist-info → mgnify_pipelines_toolkit-0.1.3.dist-info}/top_level.txt +0 -0
mgnify_pipelines_toolkit/analysis/amplicon/amplicon_utils.py

@@ -46,18 +46,11 @@ def get_read_count(read_path, type='fastq'):
         ]
         zcat_proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
 
-        cmd = [
-            'sed',
-            '-n',
-            '1~4p',
-        ]
-        sed_proc = subprocess.Popen(cmd, stdin=zcat_proc.stdout, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-
         cmd = [
             'wc',
             '-l'
         ]
-        wc_proc = subprocess.Popen(cmd, stdin=sed_proc.stdout, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        wc_proc = subprocess.Popen(cmd, stdin=zcat_proc.stdout, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
         stdout, stderr = wc_proc.communicate()
 
     elif type == 'fasta':
@@ -78,6 +71,9 @@ def get_read_count(read_path, type='fastq'):
 
     read_count = int(read_count)
 
+    if type == 'fastq':
+        read_count /= 4
+
     return read_count
 
 
 def build_cons_seq(cons_list, read_count, cons_threshold=0.80, do_not_include=None, counter=1, max_line_count=None):
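The net effect of these two hunks: 0.1.2 counted FASTQ reads by pulling every fourth line with `sed -n 1~4p` before `wc -l`, while 0.1.3 drops the sed stage, counts all lines, and divides by four afterwards. A minimal standalone sketch of the new behaviour (an illustrative re-implementation, not the toolkit's exact function):

import subprocess

def count_fastq_reads(read_path):
    # zcat <file> | wc -l, then divide by 4: each FASTQ record
    # spans four lines (header, sequence, "+", qualities).
    zcat_proc = subprocess.Popen(["zcat", read_path], stdout=subprocess.PIPE)
    wc_proc = subprocess.Popen(["wc", "-l"], stdin=zcat_proc.stdout, stdout=subprocess.PIPE)
    stdout, _ = wc_proc.communicate()
    return int(stdout) / 4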
mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py

@@ -25,13 +25,14 @@ from mgnify_pipelines_toolkit.constants.tax_ranks import _SILVA_TAX_RANKS, _PR2_
 logging.basicConfig(level=logging.DEBUG)
 
 def parse_args():
-
     parser = argparse.ArgumentParser()
 
     parser.add_argument("-t", "--taxa", required=True, type=str, help="Path to taxa file")
     parser.add_argument("-f", "--fwd", required=True, type=str, help="Path to DADA2 forward map file")
     parser.add_argument("-r", "--rev", required=False, type=str, help="Path to DADA2 reverse map file")
-    parser.add_argument("-a", "--amp", required=True, type=str, help="Path to extracted amp_region reads from inference subworkflow")
+    parser.add_argument(
+        "-a", "--amp", required=True, type=str, help="Path to extracted amp_region reads from inference subworkflow"
+    )
     parser.add_argument("-hd", "--headers", required=True, type=str, help="Path to fastq headers")
     parser.add_argument("-s", "--sample", required=True, type=str, help="Sample ID")
 
@@ -48,7 +49,6 @@ def parse_args():
 
 
 def order_df(taxa_df):
-
     if len(taxa_df.columns) == 9:
         taxa_df = taxa_df.sort_values(_SILVA_TAX_RANKS, ascending=True)
     elif len(taxa_df.columns) == 10:
@@ -60,11 +60,9 @@ def order_df(taxa_df):
     return taxa_df
 
 def make_tax_assignment_dict_silva(taxa_df, asv_dict):
-
     tax_assignment_dict = defaultdict(int)
 
     for i in range(len(taxa_df)):
-
         sorted_index = taxa_df.index[i]
         asv_num = taxa_df.iloc[i, 0]
         asv_count = asv_dict[asv_num]
@@ -78,7 +76,7 @@ def make_tax_assignment_dict_silva(taxa_df, asv_dict):
         c = taxa_df.loc[sorted_index, "Class"]
         o = taxa_df.loc[sorted_index, "Order"]
         f = taxa_df.loc[sorted_index, "Family"]
-        g = taxa_df.loc[sorted_index, "Genus"]
+        g = taxa_df.loc[sorted_index, "Genus"]
         s = taxa_df.loc[sorted_index, "Species"]
 
         tax_assignment = ""
@@ -142,11 +140,9 @@ def make_tax_assignment_dict_silva(taxa_df, asv_dict):
     return tax_assignment_dict
 
 def make_tax_assignment_dict_pr2(taxa_df, asv_dict):
-
     tax_assignment_dict = defaultdict(int)
 
     for i in range(len(taxa_df)):
-
         sorted_index = taxa_df.index[i]
         asv_num = taxa_df.iloc[i, 0]
         asv_count = asv_dict[asv_num]
@@ -162,12 +158,11 @@ def make_tax_assignment_dict_pr2(taxa_df, asv_dict):
         o = taxa_df.loc[sorted_index, "Order"]
         f = taxa_df.loc[sorted_index, "Family"]
         g = taxa_df.loc[sorted_index, "Genus"]
-        s = taxa_df.loc[sorted_index, "Species"]
+        s = taxa_df.loc[sorted_index, "Species"]
 
         tax_assignment = ""
 
         while True:
-
             if d != "0":
                 d = "_".join(d.split(" "))
                 tax_assignment += d
@@ -229,9 +224,8 @@ def make_tax_assignment_dict_pr2(taxa_df, asv_dict):
     return tax_assignment_dict
 
 def main():
-
     _TAXA, _FWD, _REV, _AMP, _HEADERS, _SAMPLE = parse_args()
-
+
     fwd_fr = open(_FWD, "r")
     paired_end = True
 
@@ -244,28 +238,28 @@ def main():
     taxa_df = pd.read_csv(_TAXA, sep="\t", dtype=str)
     taxa_df = taxa_df.fillna("0")
     taxa_df = order_df(taxa_df)
-
-    amp_reads = [
-    headers = [
+
+    amp_reads = [read.strip() for read in list(open(_AMP, "r"))]
+    headers = [read.split(" ")[0][1:] for read in
+               list(open(_HEADERS, "r"))]
     amp_region = ".".join(_AMP.split(".")[1:3])
 
     asv_dict = defaultdict(int)
 
     counter = -1
     for line_fwd in fwd_fr:
-
         counter += 1
         line_fwd = line_fwd.strip()
         fwd_asvs = line_fwd.split(",")
 
         if paired_end:
             line_rev = next(rev_fr).strip()
-            rev_asvs = line_rev.split(",")
+            rev_asvs = line_rev.split(",")
             asv_intersection = list(set(fwd_asvs).intersection(rev_asvs))
-
+
             if len(asv_intersection) == 0:
                 continue
-
+
             if len(asv_intersection) == 1 and asv_intersection[0] == "0":
                 continue
             else:
@@ -273,7 +267,7 @@ def main():
 
         if headers[counter] in amp_reads:
             asv_dict[f"seq_{int(asv_intersection[0]) - 1}"] += 1
-
+
     fwd_fr.close()
     if paired_end:
         rev_fr.close()
@@ -290,7 +284,7 @@ def main():
     with open(f"./{_SAMPLE}_{amp_region}_{ref_db}_asv_krona_counts.txt", "w") as fw:
         for tax_assignment, count in tax_assignment_dict.items():
             fw.write(f"{count}\t{tax_assignment}\n")
-
+
 
 if __name__ == "__main__":
-    main()
+    main()
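Aside from the black-style reflow, the functional core that survives this diff is the paired-end matching: a read contributes to an ASV count only when the forward and reverse DADA2 maps agree on a single non-zero ASV number. A toy illustration with made-up map lines (variable names mirror the script, the data does not):

fwd_asvs = "3,7".split(",")  # one line of the forward (--fwd) map
rev_asvs = "7,9".split(",")  # the corresponding reverse (--rev) line

asv_intersection = list(set(fwd_asvs).intersection(rev_asvs))  # ["7"]
# "0" means no assignment and is skipped; ASV numbers are 1-based,
# while the output seq_ labels are 0-based.
if asv_intersection and asv_intersection != ["0"]:
    print(f"seq_{int(asv_intersection[0]) - 1}")  # -> seq_6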
mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py

@@ -23,10 +23,16 @@ import pandas as pd
 logging.basicConfig(level=logging.DEBUG)
 
 def parse_args():
-
     parser = argparse.ArgumentParser()
     parser.add_argument("-i", "--input", required=True, type=str, help="Input from MAPseq output")
-    parser.add_argument("-l", "--label", choices=["DADA2-SILVA", "DADA2-PR2"], required=True, type=str, help="Database label - either DADA2-SILVA or DADA2-PR2")
+    parser.add_argument(
+        "-l",
+        "--label",
+        choices=["DADA2-SILVA", "DADA2-PR2"],
+        required=True,
+        type=str,
+        help="Database label - either DADA2-SILVA or DADA2-PR2",
+    )
     parser.add_argument("-s", "--sample", required=True, type=str, help="Sample ID")
 
     args = parser.parse_args()
@@ -38,20 +44,19 @@ def parse_args():
     return _INPUT, _LABEL, _SAMPLE
 
 def parse_label(label):
-
     silva_short_ranks = ["sk__", "k__", "p__", "c__", "o__", "f__", "g__", "s__"]
     pr2_short_ranks = ["d__", "sg__", "dv__", "sdv__", "c__", "o__", "f__", "g__", "s__"]
 
     silva_long_ranks = ["Superkingdom", "Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species"]
     pr2_long_ranks = ["Domain", "Supergroup", "Division", "Subdivision", "Class", "Order", "Family", "Genus", "Species"]
 
-    chosen_short_ranks = ''
-    chosen_long_ranks = ''
+    chosen_short_ranks = ""
+    chosen_long_ranks = ""
 
-    if label == 'DADA2-SILVA':
+    if label == "DADA2-SILVA":
         chosen_short_ranks = silva_short_ranks
         chosen_long_ranks = silva_long_ranks
-    elif label == 'DADA2-PR2':
+    elif label == "DADA2-PR2":
         chosen_short_ranks = pr2_short_ranks
         chosen_long_ranks = pr2_long_ranks
     else:
@@ -61,26 +66,28 @@ def parse_label(label):
     return chosen_short_ranks, chosen_long_ranks
 
 def parse_mapseq(mseq_df, short_ranks, long_ranks):
-
     res_dict = defaultdict(list)
 
     for i in range(len(mseq_df)):
         asv_id = mseq_df.iloc[i, 0]
-        tax_ass = mseq_df.iloc[i, 1].split(';')
 
-        res_dict["ASV"].append(asv_id)
-
-        for j in range(len(short_ranks)):
+        if pd.isna(mseq_df.iloc[i, 1]):
+            tax_ass = [short_ranks[0]]
+        else:
+            tax_ass = mseq_df.iloc[i, 1].split(";")
+
+        res_dict["ASV"].append(asv_id)
 
+        for j in range(len(short_ranks)):
             curr_rank = long_ranks[j]
-
+
             if j >= len(tax_ass):
                 # This would only be true if the assigned taxonomy is shorter than the total reference database taxonomy
                 # so fill each remaining rank with its respective short rank blank
                 curr_tax = short_ranks[j]
             else:
                 curr_tax = tax_ass[j]
-
+
             res_dict[curr_rank].append(curr_tax)
     res_df = pd.DataFrame.from_dict(res_dict)
 
@@ -91,27 +98,33 @@ def process_blank_tax_ends(res_df, ranks):
     # while avoiding making blanks in the middle as NAs
 
     for i in range(len(res_df)):
-        last_empty_rank = ''
+        last_empty_rank = ""
         currently_empty = False
-        for j in reversed(range(len(ranks))): # Parse an assignment backwards, from Species all the way to Superkingdom/Domain
-            curr_rank = res_df.iloc[i, j+1]
+        for j in reversed(
+            range(len(ranks))
+        ):  # Parse an assignment backwards, from Species all the way to Superkingdom/Domain
+            curr_rank = res_df.iloc[i, j + 1]
             if curr_rank in ranks:
-                if last_empty_rank == '': # Last rank is empty, start window of consecutive blanks
-                    last_empty_rank = j+1
+                if last_empty_rank == "":  # Last rank is empty, start window of consecutive blanks
+                    last_empty_rank = j + 1
                     currently_empty = True
-                elif currently_empty: # If we're in a window of consecutive blank assignments that started at the beginning
-                    last_empty_rank = j+1
+                elif (
+                    currently_empty
+                ):  # If we're in a window of consecutive blank assignments that started at the beginning
+                    last_empty_rank = j + 1
                 else:
                     break
             else:
                 break
-        if last_empty_rank != '':
-            res_df.iloc[i, last_empty_rank:] = 'NA'
+        if last_empty_rank != "":
+            res_df.iloc[i, last_empty_rank:] = "NA"
+            if last_empty_rank == 1:
+                res_df.iloc[i, 1] = ranks[0]
 
     return res_df
 
-
-def main():
+
+def main():
     _INPUT, _LABEL, _SAMPLE = parse_args()
 
     mseq_df = pd.read_csv(_INPUT, header=1, delim_whitespace=True, usecols=[0, 12])
@@ -123,4 +136,4 @@ def main():
     final_res_df.to_csv(f"./{_SAMPLE}_{_LABEL}_asv_taxa.tsv", sep="\t", index=False)
 
 if __name__ == "__main__":
-    main()
+    main()
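Two behavioural fixes ride along with the reformatting here: MAPseq rows with an empty taxonomy column (NaN) now fall back to the top short rank instead of failing on .split(), and a fully blank assignment keeps its root rank rather than turning entirely into NA. A toy sketch of the second fix, using a made-up frame rather than real MAPseq output:

import pandas as pd

ranks = ["sk__", "k__", "p__"]  # SILVA-style short-rank blanks, truncated for brevity
res_df = pd.DataFrame({"ASV": ["asv_1"], "Superkingdom": ["sk__"], "Kingdom": ["k__"], "Phylum": ["p__"]})

# Every rank is blank, so the backwards scan ends with last_empty_rank == 1:
res_df.iloc[0, 1:] = "NA"
res_df.iloc[0, 1] = ranks[0]  # new in 0.1.3: the root rank stays "sk__" instead of NA
print(res_df.to_dict("records"))
# [{'ASV': 'asv_1', 'Superkingdom': 'sk__', 'Kingdom': 'NA', 'Phylum': 'NA'}]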
mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py

@@ -0,0 +1,116 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# Copyright 2024 EMBL - European Bioinformatics Institute
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+from collections import defaultdict
+import re
+
+from Bio import SeqIO
+import pandas as pd
+
+from mgnify_pipelines_toolkit.constants.var_region_coordinates import REGIONS_16S_BACTERIA, REGIONS_16S_ARCHAEA, REGIONS_18S
+
+STRAND_FWD = "fwd"
+STRAND_REV = "rev"
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("-i", "--input", required=True, type=str, help="Path to cmsearch_deoverlap_tblout file")
+    parser.add_argument("-f", "--fasta", required=True, type=str, help="Path to concatenated primers fasta file")
+    parser.add_argument("-s", "--sample", required=True, type=str, help="Sample ID")
+    args = parser.parse_args()
+
+    _INPUT = args.input
+    _FASTA = args.fasta
+    _SAMPLE = args.sample
+
+    return _INPUT, _FASTA, _SAMPLE
+
+
+def get_amp_region(beg, strand, model):
+    prev_region = ""
+
+    for region, region_coords in model.items():
+
+        region_beg = region_coords[0]
+        beg_diff = region_beg - beg
+
+        if strand == STRAND_FWD:
+            if beg_diff > 0:
+                return region
+        else:
+            if beg_diff > 0:
+                return prev_region
+
+        prev_region = region
+
+    return prev_region
+
+
+def main():
+    _INPUT, _FASTA, _SAMPLE = parse_args()
+    res_dict = defaultdict(list)
+    fasta_dict = SeqIO.to_dict(SeqIO.parse(_FASTA, "fasta"))
+
+    with open(_INPUT, "r") as fr:
+        for line in fr:
+            line = line.strip()
+            line = re.sub("[ \t]+", "\t", line)
+            line_lst = line.split("\t")
+
+            primer_name = line_lst[0]
+            rfam = line_lst[3]
+            beg = float(line_lst[5])
+
+            if rfam == "RF00177":
+                gene = "16S"
+                model = REGIONS_16S_BACTERIA
+            elif rfam == "RF01959":
+                gene = "16S"
+                model = REGIONS_16S_ARCHAEA
+            elif rfam == "RF01960":
+                gene = "18S"
+                model = REGIONS_18S
+            else:
+                continue
+
+            res_dict["Run"].append(_SAMPLE)
+            res_dict["AssertionEvidence"].append("ECO_0000363")
+            res_dict["AssertionMethod"].append("automatic assertion")
+
+            strand = ""
+
+            if "F" in primer_name:
+                strand = STRAND_FWD
+            elif "R" in primer_name:
+                strand = STRAND_REV
+
+            amp_region = get_amp_region(beg, strand, model)
+            primer_seq = str(fasta_dict[primer_name].seq)
+
+            res_dict["Gene"].append(gene)
+            res_dict["VariableRegion"].append(amp_region)
+            res_dict["PrimerName"].append(primer_name)
+            res_dict["PrimerStrand"].append(strand)
+            res_dict["PrimerSeq"].append(primer_seq)
+
+    res_df = pd.DataFrame.from_dict(res_dict)
+    res_df.to_csv(f"./{_SAMPLE}_primer_validation.tsv", sep="\t", index=False)
+
+
+if __name__ == "__main__":
+    main()
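The heart of the new script is get_amp_region: for a forward primer it returns the first variable region that starts after the primer's hit position; for a reverse primer, the last region that starts before it. A condensed, self-contained check of that logic (coordinates excerpted from var_region_coordinates.py below; the compact loop is an equivalent rewrite, not the script's verbatim code):

REGIONS_16S_BACTERIA = {"V3": [430, 487], "V4": [566, 672], "V5": [812, 869]}  # excerpt

def get_amp_region(beg, strand, model):
    prev_region = ""
    for region, (region_beg, _region_end) in model.items():
        if region_beg - beg > 0:  # first region starting downstream of the hit
            return region if strand == "fwd" else prev_region
        prev_region = region
    return prev_region

print(get_amp_region(500.0, "fwd", REGIONS_16S_BACTERIA))  # V4: next region downstream
print(get_amp_region(900.0, "rev", REGIONS_16S_BACTERIA))  # V5: last region upstream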
mgnify_pipelines_toolkit/constants/var_region_coordinates.py

@@ -15,36 +15,36 @@
 # limitations under the License.
 
 REGIONS_16S_BACTERIA = {
-
-
-
-
-
-
-
-
-
+    "V1": [69, 92],
+    "V2": [131, 239],
+    "V3": [430, 487],
+    "V4": [566, 672],
+    "V5": [812, 869],
+    "V6": [976, 1033],
+    "V7": [1107, 1164],
+    "V8": [1234, 1285],
+    "V9": [1426, 1456]
 }
 
 REGIONS_16S_ARCHAEA = {
-
-
-
-
-
-
-
-
-
+    "V1": [61, 79],
+    "V2": [114, 223],
+    "V3": [397, 436],
+    "V4": [516, 623],
+    "V5": [763, 824],
+    "V6": [932, 982],
+    "V7": [1056, 1119],
+    "V8": [1189, 1240],
+    "V9": [1372, 1410]
 }
 
 REGIONS_18S = {
-
-
-
-
-
-
-
-
+    "V1": [69, 109],
+    "V2": [136, 298],
+    "V3": [474, 545],
+    "V4": [627, 873],
+    "V5": [1059, 1102],
+    "V7": [1366, 1454],
+    "V8": [1526, 1608],
+    "V9": [1728, 1795]
 }
{mgnify_pipelines_toolkit-0.1.2.dist-info → mgnify_pipelines_toolkit-0.1.3.dist-info}/RECORD
RENAMED
@@ -1,13 +1,14 @@
 mgnify_pipelines_toolkit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 mgnify_pipelines_toolkit/analysis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-mgnify_pipelines_toolkit/analysis/amplicon/amplicon_utils.py,sha256=
+mgnify_pipelines_toolkit/analysis/amplicon/amplicon_utils.py,sha256=EvWTFV4gGn0SkrjwC2hzvNGSXFLeyFDmVj2QDa5DmtE,6402
 mgnify_pipelines_toolkit/analysis/amplicon/are_there_primers.py,sha256=fNjzWpXjqpQNRyWG6CoROyHAvA4lZsvPh8sxDpjyMkY,5141
 mgnify_pipelines_toolkit/analysis/amplicon/assess_inflection_point_mcp.py,sha256=pBNpqHFb0zzWgTD1mY3Q5MslQ5nmT99-pSHpyngVEuo,7159
 mgnify_pipelines_toolkit/analysis/amplicon/assess_mcp_proportions.py,sha256=BCzLlfvRqiPC-YwzET901f_d0anYt1zpf5y0iOCQnvs,5191
 mgnify_pipelines_toolkit/analysis/amplicon/classify_var_regions.py,sha256=89LKH4rvqRydAEGvfaWqIClcitJ1Vbu7b5d4FApzGp4,18392
 mgnify_pipelines_toolkit/analysis/amplicon/find_mcp_inflection_points.py,sha256=5emeZjk8891IgyL5ygVcr8wMP-hGEJoEs2rcBbseWj0,3536
-mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py,sha256=
-mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py,sha256=
+mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py,sha256=dZIygTbMZvVbSVBmFFAZz7x24oQEpvdEOTpTcnYAyoM,8444
+mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py,sha256=lWVIhDxfsTuDzWjjUlMGx3RL7iD_Yy8m9Ppc9wjfCFg,4765
+mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py,sha256=F4ALSuH8N-0hHUqPCFwHgoAnteb2Ft3tUN9j6DaD5h8,3539
 mgnify_pipelines_toolkit/analysis/amplicon/remove_ambiguous_reads.py,sha256=yhDJFGD3t3TMfUlBCJGwzlK4IjFwm7Bporwp-aIM8uU,3139
 mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py,sha256=VoSbou3BAZL4bJes4FsYJvmd45_PjKj8F2sQDIyLDoI,1680
 mgnify_pipelines_toolkit/analysis/amplicon/standard_primer_matching.py,sha256=odGVde7Ct9dS2aqsySWgdgVLCOqfr_ZGeHFcXcuukxs,10846
@@ -18,10 +19,10 @@ mgnify_pipelines_toolkit/analysis/shared/mapseq2biom.py,sha256=glvql2Y-BTyA1hTIZ
 mgnify_pipelines_toolkit/constants/regex_ambiguous_bases.py,sha256=oVeeCy33bY1WJ-rffOULZ3ogi48Jz0FfTS73MPTur-A,1095
 mgnify_pipelines_toolkit/constants/tax_ranks.py,sha256=fP97JvlUdxJBakJ694VresIY8-N3pcU99m7kZ9buKys,867
 mgnify_pipelines_toolkit/constants/thresholds.py,sha256=7J3caCikkEcLdKF4zSR0z8qMQw4-h9aSkSbFbS0LNg4,873
-mgnify_pipelines_toolkit/constants/var_region_coordinates.py,sha256=
-mgnify_pipelines_toolkit-0.1.
-mgnify_pipelines_toolkit-0.1.
-mgnify_pipelines_toolkit-0.1.
-mgnify_pipelines_toolkit-0.1.
-mgnify_pipelines_toolkit-0.1.
-mgnify_pipelines_toolkit-0.1.
+mgnify_pipelines_toolkit/constants/var_region_coordinates.py,sha256=E8Cd3A1Hj9M95zw9Ut-2x8sE6_PlH6RJJEoikyZUMaQ,1303
+mgnify_pipelines_toolkit-0.1.3.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+mgnify_pipelines_toolkit-0.1.3.dist-info/METADATA,sha256=D8bYOR2kQZzJPdqtFkHj_Xd4axEHjzJPJXKAHtFj8L0,4950
+mgnify_pipelines_toolkit-0.1.3.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+mgnify_pipelines_toolkit-0.1.3.dist-info/entry_points.txt,sha256=K8nqnyAQG9jqHGgIfMIaCIe20u5a0FFCCqJWi4DoD2U,1306
+mgnify_pipelines_toolkit-0.1.3.dist-info/top_level.txt,sha256=xA_wC7C01V3VwuDnqwRM2QYeJJ45WtvF6LVav4tYxuE,25
+mgnify_pipelines_toolkit-0.1.3.dist-info/RECORD,,
{mgnify_pipelines_toolkit-0.1.2.dist-info → mgnify_pipelines_toolkit-0.1.3.dist-info}/entry_points.txt
RENAMED

@@ -9,6 +9,7 @@ get_subunits_coords = mgnify_pipelines_toolkit.analysis.shared.get_subunits_coor
 make_asv_count_table = mgnify_pipelines_toolkit.analysis.amplicon.make_asv_count_table:main
 mapseq2biom = mgnify_pipelines_toolkit.analysis.shared.mapseq2biom:main
 mapseq_to_asv_table = mgnify_pipelines_toolkit.analysis.amplicon.mapseq_to_asv_table:main
+primer_val_classification = mgnify_pipelines_toolkit.analysis.amplicon.primer_val_classification:main
 remove_ambiguous_reads = mgnify_pipelines_toolkit.analysis.amplicon.remove_ambiguous_reads:main
 rev_comp_se_primers = mgnify_pipelines_toolkit.analysis.amplicon.rev_comp_se_primers:main
 standard_primer_matching = mgnify_pipelines_toolkit.analysis.amplicon.standard_primer_matching:main
{mgnify_pipelines_toolkit-0.1.2.dist-info → mgnify_pipelines_toolkit-0.1.3.dist-info}/LICENSE
RENAMED
File without changes

{mgnify_pipelines_toolkit-0.1.2.dist-info → mgnify_pipelines_toolkit-0.1.3.dist-info}/WHEEL
RENAMED
File without changes

{mgnify_pipelines_toolkit-0.1.2.dist-info → mgnify_pipelines_toolkit-0.1.3.dist-info}/top_level.txt
RENAMED
File without changes