mgnify-pipelines-toolkit 0.1.1__tar.gz → 0.1.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mgnify-pipelines-toolkit has been flagged as potentially problematic.

Files changed (32)
  1. {mgnify_pipelines_toolkit-0.1.1 → mgnify_pipelines_toolkit-0.1.2}/PKG-INFO +5 -3
  2. {mgnify_pipelines_toolkit-0.1.1 → mgnify_pipelines_toolkit-0.1.2}/README.md +5 -3
  3. {mgnify_pipelines_toolkit-0.1.1 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py +48 -29
  4. mgnify_pipelines_toolkit-0.1.2/mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py +126 -0
  5. {mgnify_pipelines_toolkit-0.1.1 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit/constants/tax_ranks.py +1 -1
  6. {mgnify_pipelines_toolkit-0.1.1 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit.egg-info/PKG-INFO +5 -3
  7. {mgnify_pipelines_toolkit-0.1.1 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit.egg-info/SOURCES.txt +1 -0
  8. {mgnify_pipelines_toolkit-0.1.1 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit.egg-info/entry_points.txt +1 -0
  9. {mgnify_pipelines_toolkit-0.1.1 → mgnify_pipelines_toolkit-0.1.2}/pyproject.toml +2 -1
  10. {mgnify_pipelines_toolkit-0.1.1 → mgnify_pipelines_toolkit-0.1.2}/LICENSE +0 -0
  11. {mgnify_pipelines_toolkit-0.1.1 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit/__init__.py +0 -0
  12. {mgnify_pipelines_toolkit-0.1.1 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit/analysis/__init__.py +0 -0
  13. {mgnify_pipelines_toolkit-0.1.1 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit/analysis/amplicon/amplicon_utils.py +0 -0
  14. {mgnify_pipelines_toolkit-0.1.1 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit/analysis/amplicon/are_there_primers.py +0 -0
  15. {mgnify_pipelines_toolkit-0.1.1 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit/analysis/amplicon/assess_inflection_point_mcp.py +0 -0
  16. {mgnify_pipelines_toolkit-0.1.1 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit/analysis/amplicon/assess_mcp_proportions.py +0 -0
  17. {mgnify_pipelines_toolkit-0.1.1 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit/analysis/amplicon/classify_var_regions.py +0 -0
  18. {mgnify_pipelines_toolkit-0.1.1 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit/analysis/amplicon/find_mcp_inflection_points.py +0 -0
  19. {mgnify_pipelines_toolkit-0.1.1 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit/analysis/amplicon/remove_ambiguous_reads.py +0 -0
  20. {mgnify_pipelines_toolkit-0.1.1 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py +0 -0
  21. {mgnify_pipelines_toolkit-0.1.1 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit/analysis/amplicon/standard_primer_matching.py +0 -0
  22. {mgnify_pipelines_toolkit-0.1.1 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit/analysis/shared/__init__.py +0 -0
  23. {mgnify_pipelines_toolkit-0.1.1 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit/analysis/shared/get_subunits.py +0 -0
  24. {mgnify_pipelines_toolkit-0.1.1 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit/analysis/shared/get_subunits_coords.py +0 -0
  25. {mgnify_pipelines_toolkit-0.1.1 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit/analysis/shared/mapseq2biom.py +0 -0
  26. {mgnify_pipelines_toolkit-0.1.1 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit/constants/regex_ambiguous_bases.py +0 -0
  27. {mgnify_pipelines_toolkit-0.1.1 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit/constants/thresholds.py +0 -0
  28. {mgnify_pipelines_toolkit-0.1.1 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit/constants/var_region_coordinates.py +0 -0
  29. {mgnify_pipelines_toolkit-0.1.1 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit.egg-info/dependency_links.txt +0 -0
  30. {mgnify_pipelines_toolkit-0.1.1 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit.egg-info/requires.txt +0 -0
  31. {mgnify_pipelines_toolkit-0.1.1 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit.egg-info/top_level.txt +0 -0
  32. {mgnify_pipelines_toolkit-0.1.1 → mgnify_pipelines_toolkit-0.1.2}/setup.cfg +0 -0
--- mgnify_pipelines_toolkit-0.1.1/PKG-INFO
+++ mgnify_pipelines_toolkit-0.1.2/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: mgnify_pipelines_toolkit
-Version: 0.1.1
+Version: 0.1.2
 Summary: Collection of scripts and tools for MGnify pipelines
 Author-email: MGnify team <metagenomics-help@ebi.ac.uk>
 License: Apache Software License 2.0
@@ -55,13 +55,14 @@ You should then be able to run the packages from the command-line. For example t
 ### New script requirements
 
 There are a few requirements for your script:
+
 - It needs to have a named main function of some kind. See `mgnify_pipelines_toolkit/analysis/shared/get_subunits.py` and the `main()` function for an example
 - Because this package is meant to be run from the command-line, make sure your script can easily pass arguments using tools like `argparse` or `click`
 - A small amount of dependencies. This requirement is subjective, but for example if your script only requires a handful of basic packages like `Biopython`, `numpy`, `pandas`, etc., then it's fine. However if the script has a more extensive list of dependencies, a container is probably a better fit.
 
 ### How to add a new script
 
-To add a new Python script, first copy it over to the `mgnify_pipelines_toolkit` directory in this repository, specifically to the subdirectory that makes the most sense. If none of the subdirectories make sense for your script, create a new one. If your script doesn't have a `main()` type function yet, write one.
+To add a new Python script, first copy it over to the `mgnify_pipelines_toolkit` directory in this repository, specifically to the subdirectory that makes the most sense. If none of the subdirectories make sense for your script, create a new one. If your script doesn't have a `main()` type function yet, write one.
 
 Then, open `pyproject.toml` as you will need to add some bits. First, add any missing dependencies (include the version) to the `dependencies` field.
 
@@ -73,7 +74,7 @@ Then, scroll down to the `[project.scripts]` line. Here, you will create an alia
 
 - `get_subunits` is the alias
 - `mgnify_pipelines_toolkit.analysis.shared.get_subunits` will link the alias to the script with the path `mgnify_pipelines_toolkit/analysis/shared/get_subunits.py`
-- `:main` will specifically call the function named `main()` when the alias is run.
+- `:main` will specifically call the function named `main()` when the alias is run.
 
 When you have setup this command, executing `get_subunits` on the command-line will be the equivalent of doing:
 
@@ -86,4 +87,5 @@ Finally, you will need to bump up the version in the `version` line.
 At the moment, these should be the only steps required to setup your script in this package (which is subject to change).
 
 ### Building and uploading to PyPi
+
 The building and pushing of the package is automated by GitHub Actions, which will activate only on a new release. Bioconda should then automatically pick up the new PyPi release and push it to their recipes, though it's worth keeping an eye on their automated pull requests just in case [here](https://github.com/bioconda/bioconda-recipes/pulls).
--- mgnify_pipelines_toolkit-0.1.1/README.md
+++ mgnify_pipelines_toolkit-0.1.2/README.md
@@ -30,13 +30,14 @@ You should then be able to run the packages from the command-line. For example t
 ### New script requirements
 
 There are a few requirements for your script:
+
 - It needs to have a named main function of some kind. See `mgnify_pipelines_toolkit/analysis/shared/get_subunits.py` and the `main()` function for an example
 - Because this package is meant to be run from the command-line, make sure your script can easily pass arguments using tools like `argparse` or `click`
 - A small amount of dependencies. This requirement is subjective, but for example if your script only requires a handful of basic packages like `Biopython`, `numpy`, `pandas`, etc., then it's fine. However if the script has a more extensive list of dependencies, a container is probably a better fit.
 
 ### How to add a new script
 
-To add a new Python script, first copy it over to the `mgnify_pipelines_toolkit` directory in this repository, specifically to the subdirectory that makes the most sense. If none of the subdirectories make sense for your script, create a new one. If your script doesn't have a `main()` type function yet, write one.
+To add a new Python script, first copy it over to the `mgnify_pipelines_toolkit` directory in this repository, specifically to the subdirectory that makes the most sense. If none of the subdirectories make sense for your script, create a new one. If your script doesn't have a `main()` type function yet, write one.
 
 Then, open `pyproject.toml` as you will need to add some bits. First, add any missing dependencies (include the version) to the `dependencies` field.
 
@@ -48,7 +49,7 @@ Then, scroll down to the `[project.scripts]` line. Here, you will create an alia
 
 - `get_subunits` is the alias
 - `mgnify_pipelines_toolkit.analysis.shared.get_subunits` will link the alias to the script with the path `mgnify_pipelines_toolkit/analysis/shared/get_subunits.py`
-- `:main` will specifically call the function named `main()` when the alias is run.
+- `:main` will specifically call the function named `main()` when the alias is run.
 
 When you have setup this command, executing `get_subunits` on the command-line will be the equivalent of doing:
 
@@ -61,4 +62,5 @@ Finally, you will need to bump up the version in the `version` line.
 At the moment, these should be the only steps required to setup your script in this package (which is subject to change).
 
 ### Building and uploading to PyPi
-The building and pushing of the package is automated by GitHub Actions, which will activate only on a new release. Bioconda should then automatically pick up the new PyPi release and push it to their recipes, though it's worth keeping an eye on their automated pull requests just in case [here](https://github.com/bioconda/bioconda-recipes/pulls).
+
+The building and pushing of the package is automated by GitHub Actions, which will activate only on a new release. Bioconda should then automatically pick up the new PyPi release and push it to their recipes, though it's worth keeping an eye on their automated pull requests just in case [here](https://github.com/bioconda/bioconda-recipes/pulls).
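The README excerpt above describes the console-script aliasing without showing the end result. As a rough illustration only (this snippet is not part of the diff), an alias declared under `[project.scripts]` simply imports the named module and calls its `main()` function, so running `get_subunits` from a shell behaves roughly like:

```python
# Hedged sketch of what the `get_subunits` console-script alias resolves to;
# the packaging machinery generates an equivalent wrapper automatically.
from mgnify_pipelines_toolkit.analysis.shared.get_subunits import main

main()  # same effect as invoking `get_subunits` on the command line
```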
--- mgnify_pipelines_toolkit-0.1.1/mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py
+++ mgnify_pipelines_toolkit-0.1.2/mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py
@@ -28,7 +28,7 @@ def parse_args():
 
     parser = argparse.ArgumentParser()
 
-    parser.add_argument("-t", "--taxa", required=True, type=str, help="Path to DADA2 taxa file")
+    parser.add_argument("-t", "--taxa", required=True, type=str, help="Path to taxa file")
     parser.add_argument("-f", "--fwd", required=True, type=str, help="Path to DADA2 forward map file")
     parser.add_argument("-r", "--rev", required=False, type=str, help="Path to DADA2 reverse map file")
     parser.add_argument("-a", "--amp", required=True, type=str, help="Path to extracted amp_region reads from inference subworkflow")
@@ -49,7 +49,7 @@ def parse_args():
 
 def order_df(taxa_df):
 
-    if len(taxa_df.columns) == 8:
+    if len(taxa_df.columns) == 9:
         taxa_df = taxa_df.sort_values(_SILVA_TAX_RANKS, ascending=True)
     elif len(taxa_df.columns) == 10:
         taxa_df = taxa_df.sort_values(_PR2_TAX_RANKS, ascending=True)
@@ -66,11 +66,13 @@ def make_tax_assignment_dict_silva(taxa_df, asv_dict):
     for i in range(len(taxa_df)):
 
         sorted_index = taxa_df.index[i]
-        asv_count = asv_dict[sorted_index]
+        asv_num = taxa_df.iloc[i, 0]
+        asv_count = asv_dict[asv_num]
 
         if asv_count == 0:
             continue
 
+        sk = taxa_df.loc[sorted_index, "Superkingdom"]
         k = taxa_df.loc[sorted_index, "Kingdom"]
         p = taxa_df.loc[sorted_index, "Phylum"]
         c = taxa_df.loc[sorted_index, "Class"]
@@ -83,47 +85,53 @@ def make_tax_assignment_dict_silva(taxa_df, asv_dict):
 
         while True:
 
+            if sk != "0":
+                sk = "_".join(sk.split(" "))
+                tax_assignment += sk
+            else:
+                break
+
             if k != "0":
                 k = "_".join(k.split(" "))
-                if k == "Archaea" or k == "Bacteria":
-                    tax_assignment += f"sk__{k}"
-                elif k == "Eukaryota":
-                    tax_assignment += f"sk__Eukaryota"
-                else:
-                    tax_assignment += f"sk__Eukaryota\tk__{k}"
+                tax_assignment += f"\t{k}"
+            elif sk != "0":
+                tax_assignment += f"\tk__"
             else:
                 break
 
             if p != "0":
-                if k == "Archaea" or k == "Bacteria":
-                    tax_assignment += f"\tk__"
                 p = "_".join(p.split(" "))
-                tax_assignment += f"\tp__{p}"
+                tax_assignment += f"\t{p}"
             else:
                 break
+
             if c != "0":
                 c = "_".join(c.split(" "))
-                tax_assignment += f"\tc__{c}"
+                tax_assignment += f"\t{c}"
             else:
                 break
+
             if o != "0":
                 o = "_".join(o.split(" "))
-                tax_assignment += f"\to__{o}"
+                tax_assignment += f"\t{o}"
             else:
                 break
+
             if f != "0":
                 f = "_".join(f.split(" "))
-                tax_assignment += f"\tf__{f}"
+                tax_assignment += f"\t{f}"
             else:
                 break
+
             if g != "0":
                 g = "_".join(g.split(" "))
-                tax_assignment += f"\tg__{g}"
+                tax_assignment += f"\t{g}"
             else:
                 break
+
             if s != "0":
                 s = "_".join(s.split(" "))
-                tax_assignment += f"\ts__{s}"
+                tax_assignment += f"\t{s}"
                 break
 
         if tax_assignment == "":
@@ -140,7 +148,8 @@ def make_tax_assignment_dict_pr2(taxa_df, asv_dict):
     for i in range(len(taxa_df)):
 
         sorted_index = taxa_df.index[i]
-        asv_count = asv_dict[sorted_index]
+        asv_num = taxa_df.iloc[i, 0]
+        asv_count = asv_dict[asv_num]
 
         if asv_count == 0:
             continue
@@ -161,45 +170,55 @@ def make_tax_assignment_dict_pr2(taxa_df, asv_dict):
 
             if d != "0":
                 d = "_".join(d.split(" "))
-                tax_assignment += f"d__{d}"
+                tax_assignment += d
             else:
                 break
 
             if sg != "0":
                 sg = "_".join(sg.split(" "))
-                tax_assignment += f"\tsg__{sg}"
+                tax_assignment += f"\t{sg}"
             else:
                 break
+
             if dv != "0":
                 dv = "_".join(dv.split(" "))
-                tax_assignment += f"\tdv__{dv}"
+                tax_assignment += f"\t{dv}"
+            else:
+                break
 
             if sdv != "0":
                 sdv = "_".join(sdv.split(" "))
-                tax_assignment += f"\tsdv__{sdv}"
+                tax_assignment += f"\t{sdv}"
+            else:
+                break
+
             if c != "0":
                 c = "_".join(c.split(" "))
-                tax_assignment += f"\tc__{c}"
+                tax_assignment += f"\t{c}"
             else:
                 break
+
             if o != "0":
                 o = "_".join(o.split(" "))
-                tax_assignment += f"\to__{o}"
+                tax_assignment += f"\t{o}"
             else:
                 break
+
             if f != "0":
                 f = "_".join(f.split(" "))
-                tax_assignment += f"\tf__{f}"
+                tax_assignment += f"\t{f}"
             else:
                 break
+
             if g != "0":
                 g = "_".join(g.split(" "))
-                tax_assignment += f"\tg__{g}"
+                tax_assignment += f"\t{g}"
             else:
                 break
+
             if s != "0":
                 s = "_".join(s.split(" "))
-                tax_assignment += f"\ts__{s}"
+                tax_assignment += f"\t{s}"
                 break
 
         if tax_assignment == "":
@@ -253,7 +272,7 @@ def main():
             asv_intersection = fwd_asvs
 
         if headers[counter] in amp_reads:
-            asv_dict[int(asv_intersection[0]) - 1] += 1
+            asv_dict[f"seq_{int(asv_intersection[0]) - 1}"] += 1
 
     fwd_fr.close()
     if paired_end:
@@ -261,7 +280,7 @@ def main():
 
     ref_db = ""
 
-    if len(taxa_df.columns) == 8:
+    if len(taxa_df.columns) == 9:
         tax_assignment_dict = make_tax_assignment_dict_silva(taxa_df, asv_dict)
         ref_db = "silva"
     elif len(taxa_df.columns) == 10:
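The changes to `make_asv_count_table.py` above amount to two things: SILVA taxonomy tables are now expected to carry nine columns (a `Superkingdom` column ahead of `Kingdom`, matching the updated `_SILVA_TAX_RANKS` further down), and ASV counts are keyed by the `seq_<N>` identifiers found in the first column of the taxonomy table rather than by bare integer indices. A minimal sketch of the new keying, with made-up values:

```python
from collections import defaultdict

# Hypothetical ASV hit taken from the forward map file (1-based ASV number "5").
asv_intersection = ["5"]

asv_dict = defaultdict(int)
# 0.1.2 keys counts by a "seq_<N>" string; 0.1.1 used int(asv_intersection[0]) - 1.
asv_dict[f"seq_{int(asv_intersection[0]) - 1}"] += 1

print(asv_dict)  # defaultdict(<class 'int'>, {'seq_4': 1})
```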
--- /dev/null
+++ mgnify_pipelines_toolkit-0.1.2/mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py
@@ -0,0 +1,126 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# Copyright 2024 EMBL - European Bioinformatics Institute
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+from collections import defaultdict
+import logging
+
+import pandas as pd
+
+logging.basicConfig(level=logging.DEBUG)
+
+def parse_args():
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-i", "--input", required=True, type=str, help="Input from MAPseq output")
+    parser.add_argument("-l", "--label", choices=['DADA2-SILVA', 'DADA2-PR2'], required=True, type=str, help="Database label - either DADA2-SILVA or DADA2-PR2")
+    parser.add_argument("-s", "--sample", required=True, type=str, help="Sample ID")
+
+    args = parser.parse_args()
+
+    _INPUT = args.input
+    _LABEL = args.label
+    _SAMPLE = args.sample
+
+    return _INPUT, _LABEL, _SAMPLE
+
+def parse_label(label):
+
+    silva_short_ranks = ["sk__", "k__", "p__", "c__", "o__", "f__", "g__", "s__"]
+    pr2_short_ranks = ["d__", "sg__", "dv__", "sdv__", "c__", "o__", "f__", "g__", "s__"]
+
+    silva_long_ranks = ["Superkingdom", "Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species"]
+    pr2_long_ranks = ["Domain", "Supergroup", "Division", "Subdivision", "Class", "Order", "Family", "Genus", "Species"]
+
+    chosen_short_ranks = ''
+    chosen_long_ranks = ''
+
+    if label == 'DADA2-SILVA':
+        chosen_short_ranks = silva_short_ranks
+        chosen_long_ranks = silva_long_ranks
+    elif label == 'DADA2-PR2':
+        chosen_short_ranks = pr2_short_ranks
+        chosen_long_ranks = pr2_long_ranks
+    else:
+        logging.error("Incorrect database label - exiting.")
+        exit(1)
+
+    return chosen_short_ranks, chosen_long_ranks
+
+def parse_mapseq(mseq_df, short_ranks, long_ranks):
+
+    res_dict = defaultdict(list)
+
+    for i in range(len(mseq_df)):
+        asv_id = mseq_df.iloc[i, 0]
+        tax_ass = mseq_df.iloc[i, 1].split(';')
+
+        res_dict['ASV'].append(asv_id)
+
+        for j in range(len(short_ranks)):
+
+            curr_rank = long_ranks[j]
+
+            if j >= len(tax_ass):
+                # This would only be true if the assigned taxonomy is shorter than the total reference database taxononmy
+                # so fill each remaining rank with its respective short rank blank
+                curr_tax = short_ranks[j]
+            else:
+                curr_tax = tax_ass[j]
+
+            res_dict[curr_rank].append(curr_tax)
+    res_df = pd.DataFrame.from_dict(res_dict)
+
+    return(res_df)
+
+def process_blank_tax_ends(res_df, ranks):
+    # Necessary function as we want to replace consecutive blank assignments that start at the last rank as NAs
+    # while avoiding making blanks in the middle as NAs
+
+    for i in range(len(res_df)):
+        last_empty_rank = ''
+        currently_empty = False
+        for j in reversed(range(len(ranks))):  # Parse an assignment backwards, from Species all the way to Superkingdom/Domain
+            curr_rank = res_df.iloc[i, j+1]
+            if curr_rank in ranks:
+                if last_empty_rank == '':  # Last rank is empty, start window of consecutive blanks
+                    last_empty_rank = j+1
+                    currently_empty = True
+                elif currently_empty:  # If we're in a window of consecutive blank assignments that started at the beginning
+                    last_empty_rank = j+1
+                else:
+                    break
+            else:
+                break
+        if last_empty_rank != '':
+            res_df.iloc[i, last_empty_rank:] = 'NA'
+
+    return res_df
+
+def main():
+
+    _INPUT, _LABEL, _SAMPLE = parse_args()
+
+    mseq_df = pd.read_csv(_INPUT, header=1, delim_whitespace=True, usecols=[0, 12])
+
+    short_ranks, long_ranks = parse_label(_LABEL)
+    res_df = parse_mapseq(mseq_df, short_ranks, long_ranks)
+    final_res_df = process_blank_tax_ends(res_df, short_ranks)
+
+    final_res_df.to_csv(f"./{_SAMPLE}_{_LABEL}_asv_taxa.tsv", sep="\t", index=False)
+
+if __name__ == "__main__":
+    main()
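The trickiest part of the new `mapseq_to_asv_table.py` script is `process_blank_tax_ends`, which walks each assignment backwards and converts a run of blank ranks at the tail of the lineage to `NA` while leaving blanks in the middle untouched. A toy illustration (not shipped with the package; the row values are invented):

```python
import pandas as pd

silva_long_ranks = ["Superkingdom", "Kingdom", "Phylum", "Class",
                    "Order", "Family", "Genus", "Species"]

# One hypothetical ASV row as produced by parse_mapseq: Kingdom is blank ("k__")
# mid-lineage, and Family/Genus/Species are blank at the tail end.
row = ["seq_0", "sk__Bacteria", "k__", "p__Firmicutes", "c__Bacilli",
       "o__Lactobacillales", "f__", "g__", "s__"]
res_df = pd.DataFrame([row], columns=["ASV"] + silva_long_ranks)

# process_blank_tax_ends(res_df, silva_short_ranks) would rewrite only the
# trailing blanks, giving:
#   ASV    Superkingdom  Kingdom  Phylum         ...  Family  Genus  Species
#   seq_0  sk__Bacteria  k__      p__Firmicutes  ...  NA      NA     NA
```

Once installed, the script is exposed as the `mapseq_to_asv_table` console command (see `entry_points.txt` and `pyproject.toml` below); based on its argument parser, an invocation would look roughly like `mapseq_to_asv_table -i <mapseq_output> -l DADA2-SILVA -s <sample_id>`, where the input path and sample ID are placeholders.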
--- mgnify_pipelines_toolkit-0.1.1/mgnify_pipelines_toolkit/constants/tax_ranks.py
+++ mgnify_pipelines_toolkit-0.1.2/mgnify_pipelines_toolkit/constants/tax_ranks.py
@@ -14,5 +14,5 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-_SILVA_TAX_RANKS = ["Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species"]
+_SILVA_TAX_RANKS = ["Superkingdom", "Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species"]
 _PR2_TAX_RANKS = ["Domain", "Supergroup", "Division", "Subdivision", "Class", "Order", "Family", "Genus", "Species"]
--- mgnify_pipelines_toolkit-0.1.1/mgnify_pipelines_toolkit.egg-info/PKG-INFO
+++ mgnify_pipelines_toolkit-0.1.2/mgnify_pipelines_toolkit.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: mgnify_pipelines_toolkit
-Version: 0.1.1
+Version: 0.1.2
 Summary: Collection of scripts and tools for MGnify pipelines
 Author-email: MGnify team <metagenomics-help@ebi.ac.uk>
 License: Apache Software License 2.0
@@ -55,13 +55,14 @@ You should then be able to run the packages from the command-line. For example t
 ### New script requirements
 
 There are a few requirements for your script:
+
 - It needs to have a named main function of some kind. See `mgnify_pipelines_toolkit/analysis/shared/get_subunits.py` and the `main()` function for an example
 - Because this package is meant to be run from the command-line, make sure your script can easily pass arguments using tools like `argparse` or `click`
 - A small amount of dependencies. This requirement is subjective, but for example if your script only requires a handful of basic packages like `Biopython`, `numpy`, `pandas`, etc., then it's fine. However if the script has a more extensive list of dependencies, a container is probably a better fit.
 
 ### How to add a new script
 
-To add a new Python script, first copy it over to the `mgnify_pipelines_toolkit` directory in this repository, specifically to the subdirectory that makes the most sense. If none of the subdirectories make sense for your script, create a new one. If your script doesn't have a `main()` type function yet, write one.
+To add a new Python script, first copy it over to the `mgnify_pipelines_toolkit` directory in this repository, specifically to the subdirectory that makes the most sense. If none of the subdirectories make sense for your script, create a new one. If your script doesn't have a `main()` type function yet, write one.
 
 Then, open `pyproject.toml` as you will need to add some bits. First, add any missing dependencies (include the version) to the `dependencies` field.
 
@@ -73,7 +74,7 @@ Then, scroll down to the `[project.scripts]` line. Here, you will create an alia
 
 - `get_subunits` is the alias
 - `mgnify_pipelines_toolkit.analysis.shared.get_subunits` will link the alias to the script with the path `mgnify_pipelines_toolkit/analysis/shared/get_subunits.py`
-- `:main` will specifically call the function named `main()` when the alias is run.
+- `:main` will specifically call the function named `main()` when the alias is run.
 
 When you have setup this command, executing `get_subunits` on the command-line will be the equivalent of doing:
 
@@ -86,4 +87,5 @@ Finally, you will need to bump up the version in the `version` line.
 At the moment, these should be the only steps required to setup your script in this package (which is subject to change).
 
 ### Building and uploading to PyPi
+
 The building and pushing of the package is automated by GitHub Actions, which will activate only on a new release. Bioconda should then automatically pick up the new PyPi release and push it to their recipes, though it's worth keeping an eye on their automated pull requests just in case [here](https://github.com/bioconda/bioconda-recipes/pulls).
--- mgnify_pipelines_toolkit-0.1.1/mgnify_pipelines_toolkit.egg-info/SOURCES.txt
+++ mgnify_pipelines_toolkit-0.1.2/mgnify_pipelines_toolkit.egg-info/SOURCES.txt
@@ -16,6 +16,7 @@ mgnify_pipelines_toolkit/analysis/amplicon/assess_mcp_proportions.py
 mgnify_pipelines_toolkit/analysis/amplicon/classify_var_regions.py
 mgnify_pipelines_toolkit/analysis/amplicon/find_mcp_inflection_points.py
 mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py
+mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py
 mgnify_pipelines_toolkit/analysis/amplicon/remove_ambiguous_reads.py
 mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py
 mgnify_pipelines_toolkit/analysis/amplicon/standard_primer_matching.py
--- mgnify_pipelines_toolkit-0.1.1/mgnify_pipelines_toolkit.egg-info/entry_points.txt
+++ mgnify_pipelines_toolkit-0.1.2/mgnify_pipelines_toolkit.egg-info/entry_points.txt
@@ -8,6 +8,7 @@ get_subunits = mgnify_pipelines_toolkit.analysis.shared.get_subunits:main
 get_subunits_coords = mgnify_pipelines_toolkit.analysis.shared.get_subunits_coords:main
 make_asv_count_table = mgnify_pipelines_toolkit.analysis.amplicon.make_asv_count_table:main
 mapseq2biom = mgnify_pipelines_toolkit.analysis.shared.mapseq2biom:main
+mapseq_to_asv_table = mgnify_pipelines_toolkit.analysis.amplicon.mapseq_to_asv_table:main
 remove_ambiguous_reads = mgnify_pipelines_toolkit.analysis.amplicon.remove_ambiguous_reads:main
 rev_comp_se_primers = mgnify_pipelines_toolkit.analysis.amplicon.rev_comp_se_primers:main
 standard_primer_matching = mgnify_pipelines_toolkit.analysis.amplicon.standard_primer_matching:main
--- mgnify_pipelines_toolkit-0.1.1/pyproject.toml
+++ mgnify_pipelines_toolkit-0.1.2/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "mgnify_pipelines_toolkit"
-version = "0.1.1"
+version = "0.1.2"
 readme = "README.md"
 license = {text = "Apache Software License 2.0"}
 authors = [
@@ -49,6 +49,7 @@ make_asv_count_table = "mgnify_pipelines_toolkit.analysis.amplicon.make_asv_coun
 remove_ambiguous_reads = "mgnify_pipelines_toolkit.analysis.amplicon.remove_ambiguous_reads:main"
 rev_comp_se_primers = "mgnify_pipelines_toolkit.analysis.amplicon.rev_comp_se_primers:main"
 standard_primer_matching = "mgnify_pipelines_toolkit.analysis.amplicon.standard_primer_matching:main"
+mapseq_to_asv_table = "mgnify_pipelines_toolkit.analysis.amplicon.mapseq_to_asv_table:main"
 
 [project.optional-dependencies]
 tests = [