mgnify-pipelines-toolkit 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of mgnify-pipelines-toolkit might be problematic.

@@ -46,18 +46,11 @@ def get_read_count(read_path, type='fastq'):
  ]
  zcat_proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

- cmd = [
- 'sed',
- '-n',
- '1~4p',
- ]
- sed_proc = subprocess.Popen(cmd, stdin=zcat_proc.stdout, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-
  cmd = [
  'wc',
  '-l'
  ]
- wc_proc = subprocess.Popen(cmd, stdin=sed_proc.stdout, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+ wc_proc = subprocess.Popen(cmd, stdin=zcat_proc.stdout, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
  stdout, stderr = wc_proc.communicate()

  elif type == 'fasta':
@@ -78,6 +71,9 @@ def get_read_count(read_path, type='fastq'):

  read_count = int(read_count)

+ if type == 'fastq':
+ read_count /= 4
+
  return read_count

  def build_cons_seq(cons_list, read_count, cons_threshold=0.80, do_not_include=None, counter=1, max_line_count=None):
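In plain terms, this change drops the intermediate `sed -n 1~4p` step: the gzipped FASTQ is now streamed straight from `zcat` into `wc -l`, and the resulting line total is divided by four, since one FASTQ record spans four lines. A minimal standalone sketch of the resulting counting logic; the function name and the integer division below are illustrative, not the package's API:

    import subprocess

    def count_fastq_reads(read_path):
        # Stream the gzipped FASTQ and count every line, then divide by four,
        # since each FASTQ record occupies exactly four lines.
        zcat_proc = subprocess.Popen(["zcat", read_path], stdout=subprocess.PIPE)
        wc_proc = subprocess.Popen(["wc", "-l"], stdin=zcat_proc.stdout, stdout=subprocess.PIPE)
        stdout, _ = wc_proc.communicate()
        return int(stdout.decode().strip()) // 4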
@@ -25,13 +25,14 @@ from mgnify_pipelines_toolkit.constants.tax_ranks import _SILVA_TAX_RANKS, _PR2_
  logging.basicConfig(level=logging.DEBUG)

  def parse_args():
-
  parser = argparse.ArgumentParser()

- parser.add_argument("-t", "--taxa", required=True, type=str, help="Path to DADA2 taxa file")
+ parser.add_argument("-t", "--taxa", required=True, type=str, help="Path to taxa file")
  parser.add_argument("-f", "--fwd", required=True, type=str, help="Path to DADA2 forward map file")
  parser.add_argument("-r", "--rev", required=False, type=str, help="Path to DADA2 reverse map file")
- parser.add_argument("-a", "--amp", required=True, type=str, help="Path to extracted amp_region reads from inference subworkflow")
+ parser.add_argument(
+ "-a", "--amp", required=True, type=str, help="Path to extracted amp_region reads from inference subworkflow"
+ )
  parser.add_argument("-hd", "--headers", required=True, type=str, help="Path to fastq headers")
  parser.add_argument("-s", "--sample", required=True, type=str, help="Sample ID")

@@ -48,8 +49,7 @@ def parse_args():


  def order_df(taxa_df):
-
- if len(taxa_df.columns) == 8:
+ if len(taxa_df.columns) == 9:
  taxa_df = taxa_df.sort_values(_SILVA_TAX_RANKS, ascending=True)
  elif len(taxa_df.columns) == 10:
  taxa_df = taxa_df.sort_values(_PR2_TAX_RANKS, ascending=True)
@@ -60,70 +60,76 @@ def order_df(taxa_df):
  return taxa_df

  def make_tax_assignment_dict_silva(taxa_df, asv_dict):
-
  tax_assignment_dict = defaultdict(int)

  for i in range(len(taxa_df)):
-
  sorted_index = taxa_df.index[i]
- asv_count = asv_dict[sorted_index]
+ asv_num = taxa_df.iloc[i, 0]
+ asv_count = asv_dict[asv_num]

  if asv_count == 0:
  continue

+ sk = taxa_df.loc[sorted_index, "Superkingdom"]
  k = taxa_df.loc[sorted_index, "Kingdom"]
  p = taxa_df.loc[sorted_index, "Phylum"]
  c = taxa_df.loc[sorted_index, "Class"]
  o = taxa_df.loc[sorted_index, "Order"]
  f = taxa_df.loc[sorted_index, "Family"]
- g = taxa_df.loc[sorted_index, "Genus"]
+ g = taxa_df.loc[sorted_index, "Genus"]
  s = taxa_df.loc[sorted_index, "Species"]

  tax_assignment = ""

  while True:

+ if sk != "0":
+ sk = "_".join(sk.split(" "))
+ tax_assignment += sk
+ else:
+ break
+
  if k != "0":
  k = "_".join(k.split(" "))
- if k == "Archaea" or k == "Bacteria":
- tax_assignment += f"sk__{k}"
- elif k == "Eukaryota":
- tax_assignment += f"sk__Eukaryota"
- else:
- tax_assignment += f"sk__Eukaryota\tk__{k}"
+ tax_assignment += f"\t{k}"
+ elif sk != "0":
+ tax_assignment += f"\tk__"
  else:
  break

  if p != "0":
- if k == "Archaea" or k == "Bacteria":
- tax_assignment += f"\tk__"
  p = "_".join(p.split(" "))
- tax_assignment += f"\tp__{p}"
+ tax_assignment += f"\t{p}"
  else:
  break
+
  if c != "0":
  c = "_".join(c.split(" "))
- tax_assignment += f"\tc__{c}"
+ tax_assignment += f"\t{c}"
  else:
  break
+
  if o != "0":
  o = "_".join(o.split(" "))
- tax_assignment += f"\to__{o}"
+ tax_assignment += f"\t{o}"
  else:
  break
+
  if f != "0":
  f = "_".join(f.split(" "))
- tax_assignment += f"\tf__{f}"
+ tax_assignment += f"\t{f}"
  else:
  break
+
  if g != "0":
  g = "_".join(g.split(" "))
- tax_assignment += f"\tg__{g}"
+ tax_assignment += f"\t{g}"
  else:
  break
+
  if s != "0":
  s = "_".join(s.split(" "))
- tax_assignment += f"\ts__{s}"
+ tax_assignment += f"\t{s}"
  break

  if tax_assignment == "":
@@ -134,13 +140,12 @@ def make_tax_assignment_dict_silva(taxa_df, asv_dict):
  return tax_assignment_dict

  def make_tax_assignment_dict_pr2(taxa_df, asv_dict):
-
  tax_assignment_dict = defaultdict(int)

  for i in range(len(taxa_df)):
-
  sorted_index = taxa_df.index[i]
- asv_count = asv_dict[sorted_index]
+ asv_num = taxa_df.iloc[i, 0]
+ asv_count = asv_dict[asv_num]

  if asv_count == 0:
  continue
@@ -153,53 +158,62 @@ def make_tax_assignment_dict_pr2(taxa_df, asv_dict):
  o = taxa_df.loc[sorted_index, "Order"]
  f = taxa_df.loc[sorted_index, "Family"]
  g = taxa_df.loc[sorted_index, "Genus"]
- s = taxa_df.loc[sorted_index, "Species"]
+ s = taxa_df.loc[sorted_index, "Species"]

  tax_assignment = ""

  while True:
-
  if d != "0":
  d = "_".join(d.split(" "))
- tax_assignment += f"d__{d}"
+ tax_assignment += d
  else:
  break

  if sg != "0":
  sg = "_".join(sg.split(" "))
- tax_assignment += f"\tsg__{sg}"
+ tax_assignment += f"\t{sg}"
  else:
  break
+
  if dv != "0":
  dv = "_".join(dv.split(" "))
- tax_assignment += f"\tdv__{dv}"
+ tax_assignment += f"\t{dv}"
+ else:
+ break

  if sdv != "0":
  sdv = "_".join(sdv.split(" "))
- tax_assignment += f"\tsdv__{sdv}"
+ tax_assignment += f"\t{sdv}"
+ else:
+ break
+
  if c != "0":
  c = "_".join(c.split(" "))
- tax_assignment += f"\tc__{c}"
+ tax_assignment += f"\t{c}"
  else:
  break
+
  if o != "0":
  o = "_".join(o.split(" "))
- tax_assignment += f"\to__{o}"
+ tax_assignment += f"\t{o}"
  else:
  break
+
  if f != "0":
  f = "_".join(f.split(" "))
- tax_assignment += f"\tf__{f}"
+ tax_assignment += f"\t{f}"
  else:
  break
+
  if g != "0":
  g = "_".join(g.split(" "))
- tax_assignment += f"\tg__{g}"
+ tax_assignment += f"\t{g}"
  else:
  break
+
  if s != "0":
  s = "_".join(s.split(" "))
- tax_assignment += f"\ts__{s}"
+ tax_assignment += f"\t{s}"
  break

  if tax_assignment == "":
@@ -210,9 +224,8 @@ def make_tax_assignment_dict_pr2(taxa_df, asv_dict):
  return tax_assignment_dict

  def main():
-
  _TAXA, _FWD, _REV, _AMP, _HEADERS, _SAMPLE = parse_args()
-
+
  fwd_fr = open(_FWD, "r")
  paired_end = True

@@ -225,43 +238,43 @@ def main():
  taxa_df = pd.read_csv(_TAXA, sep="\t", dtype=str)
  taxa_df = taxa_df.fillna("0")
  taxa_df = order_df(taxa_df)
-
- amp_reads = [ read.strip() for read in list(open(_AMP, "r")) ]
- headers = [ read.split(" ")[0][1:] for read in list(open(_HEADERS, "r")) ]
+
+ amp_reads = [read.strip() for read in list(open(_AMP, "r"))]
+ headers = [read.split(" ")[0][1:] for read in
+ list(open(_HEADERS, "r"))]
  amp_region = ".".join(_AMP.split(".")[1:3])

  asv_dict = defaultdict(int)

  counter = -1
  for line_fwd in fwd_fr:
-
  counter += 1
  line_fwd = line_fwd.strip()
  fwd_asvs = line_fwd.split(",")

  if paired_end:
  line_rev = next(rev_fr).strip()
- rev_asvs = line_rev.split(",")
+ rev_asvs = line_rev.split(",")
  asv_intersection = list(set(fwd_asvs).intersection(rev_asvs))
-
+
  if len(asv_intersection) == 0:
  continue
-
+
  if len(asv_intersection) == 1 and asv_intersection[0] == "0":
  continue
  else:
  asv_intersection = fwd_asvs

  if headers[counter] in amp_reads:
- asv_dict[int(asv_intersection[0]) - 1] += 1
-
+ asv_dict[f"seq_{int(asv_intersection[0]) - 1}"] += 1
+
  fwd_fr.close()
  if paired_end:
  rev_fr.close()

  ref_db = ""

- if len(taxa_df.columns) == 8:
+ if len(taxa_df.columns) == 9:
  tax_assignment_dict = make_tax_assignment_dict_silva(taxa_df, asv_dict)
  ref_db = "silva"
  elif len(taxa_df.columns) == 10:
@@ -271,7 +284,7 @@ def main():
  with open(f"./{_SAMPLE}_{amp_region}_{ref_db}_asv_krona_counts.txt", "w") as fw:
  for tax_assignment, count in tax_assignment_dict.items():
  fw.write(f"{count}\t{tax_assignment}\n")
-
+

  if __name__ == "__main__":
- main()
+ main()
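The net effect of these edits is that ASV counts are now keyed by the ASV name held in the first column of the taxa table (for example seq_0) rather than by the row position of the sorted DataFrame, and a SILVA table is now expected to carry nine columns including Superkingdom. A small illustrative sketch of the new lookup, using made-up data:

    from collections import defaultdict

    import pandas as pd

    # Hypothetical fragment of a taxa table; the real SILVA table has nine columns, PR2 has ten.
    taxa_df = pd.DataFrame({
        "ASV": ["seq_0", "seq_1"],
        "Superkingdom": ["Bacteria", "0"],
        "Kingdom": ["0", "0"],
    })

    asv_dict = defaultdict(int)
    # Mirrors asv_dict[f"seq_{int(asv_intersection[0]) - 1}"] += 1 in main().
    asv_dict["seq_0"] += 1

    for i in range(len(taxa_df)):
        asv_num = taxa_df.iloc[i, 0]   # the first column holds the ASV name
        asv_count = asv_dict[asv_num]  # counts now join on that name, not on the row index
        print(asv_num, asv_count)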
@@ -0,0 +1,139 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+
+ # Copyright 2024 EMBL - European Bioinformatics Institute
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import argparse
+ from collections import defaultdict
+ import logging
+
+ import pandas as pd
+
+ logging.basicConfig(level=logging.DEBUG)
+
+ def parse_args():
+ parser = argparse.ArgumentParser()
+ parser.add_argument("-i", "--input", required=True, type=str, help="Input from MAPseq output")
+ parser.add_argument(
+ "-l",
+ "--label",
+ choices=["DADA2-SILVA", "DADA2-PR2"],
+ required=True,
+ type=str,
+ help="Database label - either DADA2-SILVA or DADA2-PR2",
+ )
+ parser.add_argument("-s", "--sample", required=True, type=str, help="Sample ID")
+
+ args = parser.parse_args()
+
+ _INPUT = args.input
+ _LABEL = args.label
+ _SAMPLE = args.sample
+
+ return _INPUT, _LABEL, _SAMPLE
+
+ def parse_label(label):
+ silva_short_ranks = ["sk__", "k__", "p__", "c__", "o__", "f__", "g__", "s__"]
+ pr2_short_ranks = ["d__", "sg__", "dv__", "sdv__", "c__", "o__", "f__", "g__", "s__"]
+
+ silva_long_ranks = ["Superkingdom", "Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species"]
+ pr2_long_ranks = ["Domain", "Supergroup", "Division", "Subdivision", "Class", "Order", "Family", "Genus", "Species"]
+
+ chosen_short_ranks = ""
+ chosen_long_ranks = ""
+
+ if label == "DADA2-SILVA":
+ chosen_short_ranks = silva_short_ranks
+ chosen_long_ranks = silva_long_ranks
+ elif label == "DADA2-PR2":
+ chosen_short_ranks = pr2_short_ranks
+ chosen_long_ranks = pr2_long_ranks
+ else:
+ logging.error("Incorrect database label - exiting.")
+ exit(1)
+
+ return chosen_short_ranks, chosen_long_ranks
+
+ def parse_mapseq(mseq_df, short_ranks, long_ranks):
+ res_dict = defaultdict(list)
+
+ for i in range(len(mseq_df)):
+ asv_id = mseq_df.iloc[i, 0]
+
+ if pd.isna(mseq_df.iloc[i, 1]):
+ tax_ass = [short_ranks[0]]
+ else:
+ tax_ass = mseq_df.iloc[i, 1].split(";")
+
+ res_dict["ASV"].append(asv_id)
+
+ for j in range(len(short_ranks)):
+ curr_rank = long_ranks[j]
+
+ if j >= len(tax_ass):
+ # This would only be true if the assigned taxonomy is shorter than the total reference database taxononmy
+ # so fill each remaining rank with its respective short rank blank
+ curr_tax = short_ranks[j]
+ else:
+ curr_tax = tax_ass[j]
+
+ res_dict[curr_rank].append(curr_tax)
+ res_df = pd.DataFrame.from_dict(res_dict)
+
+ return(res_df)
+
+ def process_blank_tax_ends(res_df, ranks):
+ # Necessary function as we want to replace consecutive blank assignments that start at the last rank as NAs
+ # while avoiding making blanks in the middle as NAs
+
+ for i in range(len(res_df)):
+ last_empty_rank = ""
+ currently_empty = False
+ for j in reversed(
+ range(len(ranks))
+ ): # Parse an assignment backwards, from Species all the way to Superkingdom/Domain
+ curr_rank = res_df.iloc[i, j + 1]
+ if curr_rank in ranks:
+ if last_empty_rank == "": # Last rank is empty, start window of consecutive blanks
+ last_empty_rank = j + 1
+ currently_empty = True
+ elif (
+ currently_empty
+ ): # If we're in a window of consecutive blank assignments that started at the beginning
+ last_empty_rank = j + 1
+ else:
+ break
+ else:
+ break
+ if last_empty_rank != "":
+ res_df.iloc[i, last_empty_rank:] = "NA"
+ if last_empty_rank == 1:
+ res_df.iloc[i, 1] = ranks[0]
+
+ return res_df
+
+
+ def main():
+ _INPUT, _LABEL, _SAMPLE = parse_args()
+
+ mseq_df = pd.read_csv(_INPUT, header=1, delim_whitespace=True, usecols=[0, 12])
+
+ short_ranks, long_ranks = parse_label(_LABEL)
+ res_df = parse_mapseq(mseq_df, short_ranks, long_ranks)
+ final_res_df = process_blank_tax_ends(res_df, short_ranks)
+
+ final_res_df.to_csv(f"./{_SAMPLE}_{_LABEL}_asv_taxa.tsv", sep="\t", index=False)
+
+ if __name__ == "__main__":
+ main()
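To show what the padding logic in parse_mapseq amounts to, here is a self-contained sketch with made-up input rows (the taxonomy strings and ASV ids are placeholders; only the behaviour mirrors the function above): an assignment shorter than the reference ranks is padded out with the bare rank prefixes, and a missing assignment collapses to the top-level prefix alone.

    import pandas as pd

    silva_short_ranks = ["sk__", "k__", "p__", "c__", "o__", "f__", "g__", "s__"]
    silva_long_ranks = ["Superkingdom", "Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species"]

    rows = [
        ("seq_0", "sk__Bacteria;k__;p__Bacillota;c__Bacilli"),  # assignment stops at Class
        ("seq_1", None),                                         # no assignment at all
    ]

    records = []
    for asv_id, tax in rows:
        tax_ass = [silva_short_ranks[0]] if tax is None else tax.split(";")
        # Pad a short assignment with the bare rank prefixes, as parse_mapseq does.
        padded = [tax_ass[j] if j < len(tax_ass) else silva_short_ranks[j] for j in range(len(silva_short_ranks))]
        records.append([asv_id] + padded)

    res_df = pd.DataFrame(records, columns=["ASV"] + silva_long_ranks)
    print(res_df)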
@@ -0,0 +1,116 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+
+ # Copyright 2024 EMBL - European Bioinformatics Institute
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import argparse
+ from collections import defaultdict
+ import re
+
+ from Bio import SeqIO
+ import pandas as pd
+
+ from mgnify_pipelines_toolkit.constants.var_region_coordinates import REGIONS_16S_BACTERIA, REGIONS_16S_ARCHAEA, REGIONS_18S
+
+ STRAND_FWD = "fwd"
+ STRAND_REV = "rev"
+
+ def parse_args():
+ parser = argparse.ArgumentParser()
+
+ parser.add_argument("-i", "--input", required=True, type=str, help="Path to cmsearch_deoverlap_tblout file")
+ parser.add_argument("-f", "--fasta", required=True, type=str, help="Path to concatenated primers fasta file")
+ parser.add_argument("-s", "--sample", required=True, type=str, help="Sample ID")
+ args = parser.parse_args()
+
+ _INPUT = args.input
+ _FASTA = args.fasta
+ _SAMPLE = args.sample
+
+ return _INPUT, _FASTA, _SAMPLE
+
+
+ def get_amp_region(beg, strand, model):
+ prev_region = ""
+
+ for region, region_coords in model.items():
+
+ region_beg = region_coords[0]
+ beg_diff = region_beg - beg
+
+ if strand == STRAND_FWD:
+ if beg_diff > 0:
+ return region
+ else:
+ if beg_diff > 0:
+ return prev_region
+
+ prev_region = region
+
+ return prev_region
+
+
+ def main():
+ _INPUT, _FASTA, _SAMPLE = parse_args()
+ res_dict = defaultdict(list)
+ fasta_dict = SeqIO.to_dict(SeqIO.parse(_FASTA, "fasta"))
+
+ with open(_INPUT, "r") as fr:
+ for line in fr:
+ line = line.strip()
+ line = re.sub("[ \t]+", "\t", line)
+ line_lst = line.split("\t")
+
+ primer_name = line_lst[0]
+ rfam = line_lst[3]
+ beg = float(line_lst[5])
+
+ if rfam == "RF00177":
+ gene = "16S"
+ model = REGIONS_16S_BACTERIA
+ elif rfam == "RF01959":
+ gene = "16S"
+ model = REGIONS_16S_ARCHAEA
+ elif rfam == "RF01960":
+ gene = "18S"
+ model = REGIONS_18S
+ else:
+ continue
+
+ res_dict["Run"].append(_SAMPLE)
+ res_dict["AssertionEvidence"].append("ECO_0000363")
+ res_dict["AssertionMethod"].append("automatic assertion")
+
+ strand = ""
+
+ if "F" in primer_name:
+ strand = STRAND_FWD
+ elif "R" in primer_name:
+ strand = STRAND_REV
+
+ amp_region = get_amp_region(beg, strand, model)
+ primer_seq = str(fasta_dict[primer_name].seq)
+
+ res_dict["Gene"].append(gene)
+ res_dict["VariableRegion"].append(amp_region)
+ res_dict["PrimerName"].append(primer_name)
+ res_dict["PrimerStrand"].append(strand)
+ res_dict["PrimerSeq"].append(primer_seq)
+
+ res_df = pd.DataFrame.from_dict(res_dict)
+ res_df.to_csv(f"./{_SAMPLE}_primer_validation.tsv", sep="\t", index=False)
+
+
+ if __name__ == "__main__":
+ main()
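As a worked example of get_amp_region, using the REGIONS_16S_BACTERIA coordinates that appear later in this diff: for a forward primer the function returns the first region that starts after the hit position, and for a reverse primer it returns the last region that starts at or before it. A short sketch, assuming the 0.1.3 module paths listed in this wheel's RECORD are importable; the hit positions are made up:

    from mgnify_pipelines_toolkit.analysis.amplicon.primer_val_classification import get_amp_region
    from mgnify_pipelines_toolkit.constants.var_region_coordinates import REGIONS_16S_BACTERIA

    # A forward primer hit starting at 100 lands before V2 (which begins at 131), so V2 is returned.
    print(get_amp_region(100, "fwd", REGIONS_16S_BACTERIA))  # V2
    # A reverse primer hit starting at 500 returns the last region beginning at or before it, V3 (430).
    print(get_amp_region(500, "rev", REGIONS_16S_BACTERIA))  # V3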
@@ -14,5 +14,5 @@
  # See the License for the specific language governing permissions and
  # limitations under the License.

- _SILVA_TAX_RANKS = ["Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species"]
+ _SILVA_TAX_RANKS = ["Superkingdom", "Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species"]
  _PR2_TAX_RANKS = ["Domain", "Supergroup", "Division", "Subdivision", "Class", "Order", "Family", "Genus", "Species"]
@@ -15,36 +15,36 @@
  # limitations under the License.

  REGIONS_16S_BACTERIA = {
- 'V1': [69, 92],
- 'V2': [131, 239],
- 'V3': [430, 487],
- 'V4': [566, 672],
- 'V5': [812, 869],
- 'V6': [976, 1033],
- 'V7': [1107, 1164],
- 'V8': [1234, 1285],
- 'V9': [1426, 1456]
+ "V1": [69, 92],
+ "V2": [131, 239],
+ "V3": [430, 487],
+ "V4": [566, 672],
+ "V5": [812, 869],
+ "V6": [976, 1033],
+ "V7": [1107, 1164],
+ "V8": [1234, 1285],
+ "V9": [1426, 1456]
  }

  REGIONS_16S_ARCHAEA = {
- 'V1': [61, 79],
- 'V2': [114, 223],
- 'V3': [397, 436],
- 'V4': [516, 623],
- 'V5': [763, 824],
- 'V6': [932, 982],
- 'V7': [1056, 1119],
- 'V8': [1189, 1240],
- 'V9': [1372, 1410]
+ "V1": [61, 79],
+ "V2": [114, 223],
+ "V3": [397, 436],
+ "V4": [516, 623],
+ "V5": [763, 824],
+ "V6": [932, 982],
+ "V7": [1056, 1119],
+ "V8": [1189, 1240],
+ "V9": [1372, 1410]
  }

  REGIONS_18S = {
- 'V1': [69, 109],
- 'V2': [136, 298],
- 'V3': [474, 545],
- 'V4': [627, 873],
- 'V5': [1059, 1102],
- 'V7': [1366, 1454],
- 'V8': [1526, 1608],
- 'V9': [1728, 1795]
+ "V1": [69, 109],
+ "V2": [136, 298],
+ "V3": [474, 545],
+ "V4": [627, 873],
+ "V5": [1059, 1102],
+ "V7": [1366, 1454],
+ "V8": [1526, 1608],
+ "V9": [1728, 1795]
  }
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: mgnify_pipelines_toolkit
- Version: 0.1.1
+ Version: 0.1.3
  Summary: Collection of scripts and tools for MGnify pipelines
  Author-email: MGnify team <metagenomics-help@ebi.ac.uk>
  License: Apache Software License 2.0
@@ -55,13 +55,14 @@ You should then be able to run the packages from the command-line. For example t
  ### New script requirements

  There are a few requirements for your script:
+
  - It needs to have a named main function of some kind. See `mgnify_pipelines_toolkit/analysis/shared/get_subunits.py` and the `main()` function for an example
  - Because this package is meant to be run from the command-line, make sure your script can easily pass arguments using tools like `argparse` or `click`
  - A small amount of dependencies. This requirement is subjective, but for example if your script only requires a handful of basic packages like `Biopython`, `numpy`, `pandas`, etc., then it's fine. However if the script has a more extensive list of dependencies, a container is probably a better fit.

  ### How to add a new script

- To add a new Python script, first copy it over to the `mgnify_pipelines_toolkit` directory in this repository, specifically to the subdirectory that makes the most sense. If none of the subdirectories make sense for your script, create a new one. If your script doesn't have a `main()` type function yet, write one.
+ To add a new Python script, first copy it over to the `mgnify_pipelines_toolkit` directory in this repository, specifically to the subdirectory that makes the most sense. If none of the subdirectories make sense for your script, create a new one. If your script doesn't have a `main()` type function yet, write one.

  Then, open `pyproject.toml` as you will need to add some bits. First, add any missing dependencies (include the version) to the `dependencies` field.

@@ -73,7 +74,7 @@ Then, scroll down to the `[project.scripts]` line. Here, you will create an alia

  - `get_subunits` is the alias
  - `mgnify_pipelines_toolkit.analysis.shared.get_subunits` will link the alias to the script with the path `mgnify_pipelines_toolkit/analysis/shared/get_subunits.py`
- - `:main` will specifically call the function named `main()` when the alias is run.
+ - `:main` will specifically call the function named `main()` when the alias is run.

  When you have setup this command, executing `get_subunits` on the command-line will be the equivalent of doing:

@@ -86,4 +87,5 @@ Finally, you will need to bump up the version in the `version` line.
  At the moment, these should be the only steps required to setup your script in this package (which is subject to change).

  ### Building and uploading to PyPi
+
  The building and pushing of the package is automated by GitHub Actions, which will activate only on a new release. Bioconda should then automatically pick up the new PyPi release and push it to their recipes, though it's worth keeping an eye on their automated pull requests just in case [here](https://github.com/bioconda/bioconda-recipes/pulls).
@@ -1,12 +1,14 @@
  mgnify_pipelines_toolkit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  mgnify_pipelines_toolkit/analysis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- mgnify_pipelines_toolkit/analysis/amplicon/amplicon_utils.py,sha256=OkQsiVTh6Y-bYPZMHOnD9xn0dd9bENySFwgibK6x6CU,6549
+ mgnify_pipelines_toolkit/analysis/amplicon/amplicon_utils.py,sha256=EvWTFV4gGn0SkrjwC2hzvNGSXFLeyFDmVj2QDa5DmtE,6402
  mgnify_pipelines_toolkit/analysis/amplicon/are_there_primers.py,sha256=fNjzWpXjqpQNRyWG6CoROyHAvA4lZsvPh8sxDpjyMkY,5141
  mgnify_pipelines_toolkit/analysis/amplicon/assess_inflection_point_mcp.py,sha256=pBNpqHFb0zzWgTD1mY3Q5MslQ5nmT99-pSHpyngVEuo,7159
  mgnify_pipelines_toolkit/analysis/amplicon/assess_mcp_proportions.py,sha256=BCzLlfvRqiPC-YwzET901f_d0anYt1zpf5y0iOCQnvs,5191
  mgnify_pipelines_toolkit/analysis/amplicon/classify_var_regions.py,sha256=89LKH4rvqRydAEGvfaWqIClcitJ1Vbu7b5d4FApzGp4,18392
  mgnify_pipelines_toolkit/analysis/amplicon/find_mcp_inflection_points.py,sha256=5emeZjk8891IgyL5ygVcr8wMP-hGEJoEs2rcBbseWj0,3536
- mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py,sha256=3ODVM7YxYrPiM95UnbYiKmtN93PdlF_JmjZYTEyoCL8,8468
+ mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py,sha256=dZIygTbMZvVbSVBmFFAZz7x24oQEpvdEOTpTcnYAyoM,8444
+ mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py,sha256=lWVIhDxfsTuDzWjjUlMGx3RL7iD_Yy8m9Ppc9wjfCFg,4765
+ mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py,sha256=F4ALSuH8N-0hHUqPCFwHgoAnteb2Ft3tUN9j6DaD5h8,3539
  mgnify_pipelines_toolkit/analysis/amplicon/remove_ambiguous_reads.py,sha256=yhDJFGD3t3TMfUlBCJGwzlK4IjFwm7Bporwp-aIM8uU,3139
  mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py,sha256=VoSbou3BAZL4bJes4FsYJvmd45_PjKj8F2sQDIyLDoI,1680
  mgnify_pipelines_toolkit/analysis/amplicon/standard_primer_matching.py,sha256=odGVde7Ct9dS2aqsySWgdgVLCOqfr_ZGeHFcXcuukxs,10846
@@ -15,12 +17,12 @@ mgnify_pipelines_toolkit/analysis/shared/get_subunits.py,sha256=vvhn8O9t1zzD8rIv
  mgnify_pipelines_toolkit/analysis/shared/get_subunits_coords.py,sha256=hFXUTZb-etmJS7Si3mCCVCXV5ZYN0tP6FSbeiVxG1jo,1879
  mgnify_pipelines_toolkit/analysis/shared/mapseq2biom.py,sha256=glvql2Y-BTyA1hTIZz2nlmST3SE6LJbep9sKdMH-vaI,5565
  mgnify_pipelines_toolkit/constants/regex_ambiguous_bases.py,sha256=oVeeCy33bY1WJ-rffOULZ3ogi48Jz0FfTS73MPTur-A,1095
- mgnify_pipelines_toolkit/constants/tax_ranks.py,sha256=QOVkxZjMTYwIxCN7L4CLYFQjWoxsuZ4WKlKiTblD4tM,851
+ mgnify_pipelines_toolkit/constants/tax_ranks.py,sha256=fP97JvlUdxJBakJ694VresIY8-N3pcU99m7kZ9buKys,867
  mgnify_pipelines_toolkit/constants/thresholds.py,sha256=7J3caCikkEcLdKF4zSR0z8qMQw4-h9aSkSbFbS0LNg4,873
- mgnify_pipelines_toolkit/constants/var_region_coordinates.py,sha256=GAW1DdQ9IM3dCBMVMS6ZNtxVFEGwEGdJaDNorVAzR1g,1303
- mgnify_pipelines_toolkit-0.1.1.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- mgnify_pipelines_toolkit-0.1.1.dist-info/METADATA,sha256=D1u8X2ougLo4WNGCmxy_0NZDOyfN5UF5_N65zF2dhR8,4950
- mgnify_pipelines_toolkit-0.1.1.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
- mgnify_pipelines_toolkit-0.1.1.dist-info/entry_points.txt,sha256=GH2Vn239CpRUjapHA555c4YQ7aKsxBlaFu-XvloJb4I,1114
- mgnify_pipelines_toolkit-0.1.1.dist-info/top_level.txt,sha256=xA_wC7C01V3VwuDnqwRM2QYeJJ45WtvF6LVav4tYxuE,25
- mgnify_pipelines_toolkit-0.1.1.dist-info/RECORD,,
+ mgnify_pipelines_toolkit/constants/var_region_coordinates.py,sha256=E8Cd3A1Hj9M95zw9Ut-2x8sE6_PlH6RJJEoikyZUMaQ,1303
+ mgnify_pipelines_toolkit-0.1.3.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ mgnify_pipelines_toolkit-0.1.3.dist-info/METADATA,sha256=D8bYOR2kQZzJPdqtFkHj_Xd4axEHjzJPJXKAHtFj8L0,4950
+ mgnify_pipelines_toolkit-0.1.3.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+ mgnify_pipelines_toolkit-0.1.3.dist-info/entry_points.txt,sha256=K8nqnyAQG9jqHGgIfMIaCIe20u5a0FFCCqJWi4DoD2U,1306
+ mgnify_pipelines_toolkit-0.1.3.dist-info/top_level.txt,sha256=xA_wC7C01V3VwuDnqwRM2QYeJJ45WtvF6LVav4tYxuE,25
+ mgnify_pipelines_toolkit-0.1.3.dist-info/RECORD,,
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: bdist_wheel (0.42.0)
+ Generator: bdist_wheel (0.43.0)
  Root-Is-Purelib: true
  Tag: py3-none-any

@@ -8,6 +8,8 @@ get_subunits = mgnify_pipelines_toolkit.analysis.shared.get_subunits:main
  get_subunits_coords = mgnify_pipelines_toolkit.analysis.shared.get_subunits_coords:main
  make_asv_count_table = mgnify_pipelines_toolkit.analysis.amplicon.make_asv_count_table:main
  mapseq2biom = mgnify_pipelines_toolkit.analysis.shared.mapseq2biom:main
+ mapseq_to_asv_table = mgnify_pipelines_toolkit.analysis.amplicon.mapseq_to_asv_table:main
+ primer_val_classification = mgnify_pipelines_toolkit.analysis.amplicon.primer_val_classification:main
  remove_ambiguous_reads = mgnify_pipelines_toolkit.analysis.amplicon.remove_ambiguous_reads:main
  rev_comp_se_primers = mgnify_pipelines_toolkit.analysis.amplicon.rev_comp_se_primers:main
  standard_primer_matching = mgnify_pipelines_toolkit.analysis.amplicon.standard_primer_matching:main
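Each of these entries maps a console command onto a module-level main() function, so installing 0.1.3 exposes the two new scripts as the mapseq_to_asv_table and primer_val_classification commands. A sketch of what the new mapseq_to_asv_table command resolves to once the package is installed; the input path and sample ID below are placeholders:

    import sys

    from mgnify_pipelines_toolkit.analysis.amplicon.mapseq_to_asv_table import main

    # Equivalent of running: mapseq_to_asv_table -i sample.mseq -l DADA2-SILVA -s SAMPLE1
    sys.argv = ["mapseq_to_asv_table", "-i", "sample.mseq", "-l", "DADA2-SILVA", "-s", "SAMPLE1"]
    main()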