mgnify-pipelines-toolkit 0.1.0__tar.gz → 0.1.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mgnify-pipelines-toolkit might be problematic.

Files changed (32)
  1. {mgnify_pipelines_toolkit-0.1.0 → mgnify_pipelines_toolkit-0.1.2}/PKG-INFO +19 -22
  2. {mgnify_pipelines_toolkit-0.1.0 → mgnify_pipelines_toolkit-0.1.2}/README.md +19 -22
  3. {mgnify_pipelines_toolkit-0.1.0 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit/analysis/amplicon/amplicon_utils.py +37 -9
  4. {mgnify_pipelines_toolkit-0.1.0 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit/analysis/amplicon/assess_inflection_point_mcp.py +9 -4
  5. {mgnify_pipelines_toolkit-0.1.0 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit/analysis/amplicon/assess_mcp_proportions.py +8 -2
  6. {mgnify_pipelines_toolkit-0.1.0 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit/analysis/amplicon/classify_var_regions.py +6 -47
  7. {mgnify_pipelines_toolkit-0.1.0 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py +48 -29
  8. mgnify_pipelines_toolkit-0.1.2/mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py +126 -0
  9. {mgnify_pipelines_toolkit-0.1.0 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit/constants/tax_ranks.py +1 -1
  10. mgnify_pipelines_toolkit-0.1.2/mgnify_pipelines_toolkit/constants/thresholds.py +24 -0
  11. mgnify_pipelines_toolkit-0.1.2/mgnify_pipelines_toolkit/constants/var_region_coordinates.py +50 -0
  12. {mgnify_pipelines_toolkit-0.1.0 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit.egg-info/PKG-INFO +19 -22
  13. {mgnify_pipelines_toolkit-0.1.0 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit.egg-info/SOURCES.txt +4 -1
  14. {mgnify_pipelines_toolkit-0.1.0 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit.egg-info/entry_points.txt +1 -0
  15. {mgnify_pipelines_toolkit-0.1.0 → mgnify_pipelines_toolkit-0.1.2}/pyproject.toml +2 -1
  16. {mgnify_pipelines_toolkit-0.1.0 → mgnify_pipelines_toolkit-0.1.2}/LICENSE +0 -0
  17. {mgnify_pipelines_toolkit-0.1.0 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit/__init__.py +0 -0
  18. {mgnify_pipelines_toolkit-0.1.0 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit/analysis/__init__.py +0 -0
  19. {mgnify_pipelines_toolkit-0.1.0 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit/analysis/amplicon/are_there_primers.py +0 -0
  20. {mgnify_pipelines_toolkit-0.1.0 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit/analysis/amplicon/find_mcp_inflection_points.py +0 -0
  21. {mgnify_pipelines_toolkit-0.1.0 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit/analysis/amplicon/remove_ambiguous_reads.py +0 -0
  22. {mgnify_pipelines_toolkit-0.1.0 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py +0 -0
  23. {mgnify_pipelines_toolkit-0.1.0 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit/analysis/amplicon/standard_primer_matching.py +0 -0
  24. {mgnify_pipelines_toolkit-0.1.0 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit/analysis/shared/__init__.py +0 -0
  25. {mgnify_pipelines_toolkit-0.1.0 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit/analysis/shared/get_subunits.py +0 -0
  26. {mgnify_pipelines_toolkit-0.1.0 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit/analysis/shared/get_subunits_coords.py +0 -0
  27. {mgnify_pipelines_toolkit-0.1.0 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit/analysis/shared/mapseq2biom.py +0 -0
  28. {mgnify_pipelines_toolkit-0.1.0 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit/constants/regex_ambiguous_bases.py +0 -0
  29. {mgnify_pipelines_toolkit-0.1.0 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit.egg-info/dependency_links.txt +0 -0
  30. {mgnify_pipelines_toolkit-0.1.0 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit.egg-info/requires.txt +0 -0
  31. {mgnify_pipelines_toolkit-0.1.0 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit.egg-info/top_level.txt +0 -0
  32. {mgnify_pipelines_toolkit-0.1.0 → mgnify_pipelines_toolkit-0.1.2}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: mgnify_pipelines_toolkit
- Version: 0.1.0
+ Version: 0.1.2
  Summary: Collection of scripts and tools for MGnify pipelines
  Author-email: MGnify team <metagenomics-help@ebi.ac.uk>
  License: Apache Software License 2.0
@@ -32,46 +32,37 @@ This Python package contains a collection of scripts and tools for including in
  - Scripts that don't have existing containers built to run them
  - Scripts for which building an entire container would be too bulky of a solution to deploy in pipelines

- This package can be built and uploaded to PyPi, to be installed using pip. The package bundles scripts and makes them executable from the command-line when this package is installed.
-
- > **Soon: this repository will be made available on bioconda for even easier integration in nextflow/nf-core pipelines**.
+ This package is built and uploaded to PyPi and bioconda. The package bundles scripts and makes them executable from the command-line when this package is installed.

  ## How to install

- Currently this package is only available on TestPyPi and is installed like this:
-
- `pip install -i https://test.pypi.org/simple/ --no-deps mgnify-pipelines-toolkit`
-
- You should then be able to run the packages from the command-line. For example to run the `get_subunits.py` script:
-
- `get_subunits -i ${easel_coords} -n ${meta.id}`
-
+ This package is available both on [PyPi](https://pypi.org/project/mgnify-pipelines-toolkit/) and bioconda.

- ## Building and uploading to PyPi
- This command this build the package:
+ To install from PyPi with pip:

- `python3 -m build`
+ `pip install mgnify-pipelines-toolkit`

- Then this command will upload it to Test-PyPi (you will need to generate an API token)
+ To install from bioconda with conda/mamba:

- `python3 -m twine upload --repository testpypi dist/mgnify_pipelines_toolkit-0.0.x*`
+ `conda install -c bioconda mgnify-pipelines-toolkit`

- To upload it to actual PyPi:
+ You should then be able to run the packages from the command-line. For example to run the `get_subunits.py` script:

- `python3 -m twine upload dist/mgnify_pipelines_toolkit-0.0.x*`
+ `get_subunits -i ${easel_coords} -n ${meta.id}`

  ## Adding a new script to the package

  ### New script requirements

  There are a few requirements for your script:
+
  - It needs to have a named main function of some kind. See `mgnify_pipelines_toolkit/analysis/shared/get_subunits.py` and the `main()` function for an example
  - Because this package is meant to be run from the command-line, make sure your script can easily pass arguments using tools like `argparse` or `click`
  - A small amount of dependencies. This requirement is subjective, but for example if your script only requires a handful of basic packages like `Biopython`, `numpy`, `pandas`, etc., then it's fine. However if the script has a more extensive list of dependencies, a container is probably a better fit.

  ### How to add a new script

- To add a new Python script, first copy it over to the `mgnify_pipelines_toolkit` directory in this repository, specifically to the subdirectory that makes the most sense. If none of the subdirectories make sense for your script, create a new one. If your script doesn't have a `main()` type function yet, write one.
+ To add a new Python script, first copy it over to the `mgnify_pipelines_toolkit` directory in this repository, specifically to the subdirectory that makes the most sense. If none of the subdirectories make sense for your script, create a new one. If your script doesn't have a `main()` type function yet, write one.

  Then, open `pyproject.toml` as you will need to add some bits. First, add any missing dependencies (include the version) to the `dependencies` field.

@@ -83,12 +74,18 @@ Then, scroll down to the `[project.scripts]` line. Here, you will create an alia

  - `get_subunits` is the alias
  - `mgnify_pipelines_toolkit.analysis.shared.get_subunits` will link the alias to the script with the path `mgnify_pipelines_toolkit/analysis/shared/get_subunits.py`
- - `:main` will specifically call the function named `main()` when the alias is run.
+ - `:main` will specifically call the function named `main()` when the alias is run.

  When you have setup this command, executing `get_subunits` on the command-line will be the equivalent of doing:

  `from mgnify_pipelines_toolkit.analysis.shared.get_subunits import main; main()`

- Finally, you will need to bump up the version in the `version` line. How/when we bump versions is to be determined.
+ You should then write at least one unit test for your addition. This package uses `pytest` at the moment for this purpose. A GitHub Action workflow will run all of the unit tests whenever a commit is pushed to any branch.
+
+ Finally, you will need to bump up the version in the `version` line.

  At the moment, these should be the only steps required to setup your script in this package (which is subject to change).
+
+ ### Building and uploading to PyPi
+
+ The building and pushing of the package is automated by GitHub Actions, which will activate only on a new release. Bioconda should then automatically pick up the new PyPi release and push it to their recipes, though it's worth keeping an eye on their automated pull requests just in case [here](https://github.com/bioconda/bioconda-recipes/pulls).
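Note: the README text above (shown here in the PKG-INFO diff) lists the requirements for a new script (a named `main()` entry function, `argparse`/`click` argument parsing, few dependencies) and explains how a `[project.scripts]` alias maps to `module:function`. As a hedged illustration only, a script meeting those requirements could be shaped like the sketch below; the script name and options are hypothetical and not part of the package.

```python
#!/usr/bin/env python
# Minimal sketch of a script shaped for a [project.scripts] entry point, following
# the requirements described in the README above. The script name and arguments
# are hypothetical examples, not part of mgnify-pipelines-toolkit.

import argparse


def parse_args():
    parser = argparse.ArgumentParser(description="Example toolkit-style script")
    parser.add_argument("-i", "--input", required=True, type=str, help="Path to input file")
    parser.add_argument("-s", "--sample", required=True, type=str, help="Sample ID")
    return parser.parse_args()


def main():
    # A pyproject.toml line such as
    #   my_script = "mgnify_pipelines_toolkit.analysis.shared.my_script:main"
    # would make running `my_script` on the command line call this function.
    args = parse_args()
    print(f"Processing {args.input} for sample {args.sample}")


if __name__ == "__main__":
    main()
```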
@@ -7,46 +7,37 @@ This Python package contains a collection of scripts and tools for including in
  - Scripts that don't have existing containers built to run them
  - Scripts for which building an entire container would be too bulky of a solution to deploy in pipelines

- This package can be built and uploaded to PyPi, to be installed using pip. The package bundles scripts and makes them executable from the command-line when this package is installed.
-
- > **Soon: this repository will be made available on bioconda for even easier integration in nextflow/nf-core pipelines**.
+ This package is built and uploaded to PyPi and bioconda. The package bundles scripts and makes them executable from the command-line when this package is installed.

  ## How to install

- Currently this package is only available on TestPyPi and is installed like this:
-
- `pip install -i https://test.pypi.org/simple/ --no-deps mgnify-pipelines-toolkit`
-
- You should then be able to run the packages from the command-line. For example to run the `get_subunits.py` script:
-
- `get_subunits -i ${easel_coords} -n ${meta.id}`
-
+ This package is available both on [PyPi](https://pypi.org/project/mgnify-pipelines-toolkit/) and bioconda.

- ## Building and uploading to PyPi
- This command this build the package:
+ To install from PyPi with pip:

- `python3 -m build`
+ `pip install mgnify-pipelines-toolkit`

- Then this command will upload it to Test-PyPi (you will need to generate an API token)
+ To install from bioconda with conda/mamba:

- `python3 -m twine upload --repository testpypi dist/mgnify_pipelines_toolkit-0.0.x*`
+ `conda install -c bioconda mgnify-pipelines-toolkit`

- To upload it to actual PyPi:
+ You should then be able to run the packages from the command-line. For example to run the `get_subunits.py` script:

- `python3 -m twine upload dist/mgnify_pipelines_toolkit-0.0.x*`
+ `get_subunits -i ${easel_coords} -n ${meta.id}`

  ## Adding a new script to the package

  ### New script requirements

  There are a few requirements for your script:
+
  - It needs to have a named main function of some kind. See `mgnify_pipelines_toolkit/analysis/shared/get_subunits.py` and the `main()` function for an example
  - Because this package is meant to be run from the command-line, make sure your script can easily pass arguments using tools like `argparse` or `click`
  - A small amount of dependencies. This requirement is subjective, but for example if your script only requires a handful of basic packages like `Biopython`, `numpy`, `pandas`, etc., then it's fine. However if the script has a more extensive list of dependencies, a container is probably a better fit.

  ### How to add a new script

- To add a new Python script, first copy it over to the `mgnify_pipelines_toolkit` directory in this repository, specifically to the subdirectory that makes the most sense. If none of the subdirectories make sense for your script, create a new one. If your script doesn't have a `main()` type function yet, write one.
+ To add a new Python script, first copy it over to the `mgnify_pipelines_toolkit` directory in this repository, specifically to the subdirectory that makes the most sense. If none of the subdirectories make sense for your script, create a new one. If your script doesn't have a `main()` type function yet, write one.

  Then, open `pyproject.toml` as you will need to add some bits. First, add any missing dependencies (include the version) to the `dependencies` field.

@@ -58,12 +49,18 @@ Then, scroll down to the `[project.scripts]` line. Here, you will create an alia

  - `get_subunits` is the alias
  - `mgnify_pipelines_toolkit.analysis.shared.get_subunits` will link the alias to the script with the path `mgnify_pipelines_toolkit/analysis/shared/get_subunits.py`
- - `:main` will specifically call the function named `main()` when the alias is run.
+ - `:main` will specifically call the function named `main()` when the alias is run.

  When you have setup this command, executing `get_subunits` on the command-line will be the equivalent of doing:

  `from mgnify_pipelines_toolkit.analysis.shared.get_subunits import main; main()`

- Finally, you will need to bump up the version in the `version` line. How/when we bump versions is to be determined.
+ You should then write at least one unit test for your addition. This package uses `pytest` at the moment for this purpose. A GitHub Action workflow will run all of the unit tests whenever a commit is pushed to any branch.
+
+ Finally, you will need to bump up the version in the `version` line.
+
+ At the moment, these should be the only steps required to setup your script in this package (which is subject to change).
+
+ ### Building and uploading to PyPi

- At the moment, these should be the only steps required to setup your script in this package (which is subject to change).
+ The building and pushing of the package is automated by GitHub Actions, which will activate only on a new release. Bioconda should then automatically pick up the new PyPi release and push it to their recipes, though it's worth keeping an eye on their automated pull requests just in case [here](https://github.com/bioconda/bioconda-recipes/pulls).
@@ -15,12 +15,15 @@
  # limitations under the License.

  from collections import defaultdict, Counter
+ import logging
  import gzip
  import os
  import subprocess

  from mgnify_pipelines_toolkit.constants.regex_ambiguous_bases import _AMBIGUOUS_BASES_DICT, _AMBIGUOUS_BASES_DICT_REV

+ logging.basicConfig(level=logging.DEBUG)
+
  def split_dir_into_sample_paths(_DIR):

  file_list = os.listdir(_DIR)
@@ -34,14 +37,28 @@ def split_dir_into_sample_paths(_DIR):
  def get_read_count(read_path, type='fastq'):

  cmd = []
+ stdout = ''

  if type == 'fastq':
  cmd = [
- 'zgrep',
- '-c',
- '^@',
+ 'zcat',
  read_path
  ]
+ zcat_proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+ cmd = [
+ 'sed',
+ '-n',
+ '1~4p',
+ ]
+ sed_proc = subprocess.Popen(cmd, stdin=zcat_proc.stdout, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+ cmd = [
+ 'wc',
+ '-l'
+ ]
+ wc_proc = subprocess.Popen(cmd, stdin=sed_proc.stdout, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+ stdout, stderr = wc_proc.communicate()

  elif type == 'fasta':
  cmd = [
@@ -50,15 +67,20 @@ def get_read_count(read_path, type='fastq'):
  '^>',
  read_path
  ]
+ grep_proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+ stdout, stderr = grep_proc.communicate()
+
+ read_count = stdout.strip() if stdout is not None else ""

- proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
- stdout, stderr = proc.communicate()
+ if not read_count.isdigit():
+ logging.error(f"Read count is not a digit, something is wrong. stdout: '{stdout}', stderr: '{stderr}'")
+ exit(1)

- read_count = int(stdout.strip())
+ read_count = int(read_count)

  return read_count

- def build_cons_seq(cons_list, read_count, cons_threshold=0.80, do_not_include=None, counter=1):
+ def build_cons_seq(cons_list, read_count, cons_threshold=0.80, do_not_include=None, counter=1, max_line_count=None):
  """
  Generate consensus sequence using a list of base conservation dictionaries most likely
  generated by the `build_mcp_cons_dict_list()` function.
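Note on the `get_read_count()` change above: the FASTQ branch now pipes `zcat` through `sed -n 1~4p` (keep every fourth line starting at line 1, i.e. the header lines) and `wc -l` instead of `zgrep -c '^@'`, presumably because quality lines can also begin with `@` and inflate the count. Below is a self-contained sketch of that piped-subprocess pattern, under the assumption that `zcat`, `sed`, and `wc` are available; the file name is hypothetical, and note that `Popen` returns bytes unless `text=True` is used.

```python
# Standalone sketch (not the package's exact code) of counting reads in a
# gzipped FASTQ via: zcat <file> | sed -n '1~4p' | wc -l
import subprocess


def count_fastq_reads(read_path: str) -> int:
    zcat_proc = subprocess.Popen(["zcat", read_path], stdout=subprocess.PIPE)
    sed_proc = subprocess.Popen(["sed", "-n", "1~4p"], stdin=zcat_proc.stdout, stdout=subprocess.PIPE)
    wc_proc = subprocess.Popen(["wc", "-l"], stdin=sed_proc.stdout, stdout=subprocess.PIPE)

    # Close the parent's copies of the intermediate pipes so upstream processes
    # receive SIGPIPE if a downstream process exits early.
    zcat_proc.stdout.close()
    sed_proc.stdout.close()

    stdout, _ = wc_proc.communicate()
    count = stdout.decode().strip()
    if not count.isdigit():
        raise ValueError(f"Unexpected wc output: {stdout!r}")
    return int(count)


if __name__ == "__main__":
    print(count_fastq_reads("example.fastq.gz"))  # hypothetical file
```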
@@ -85,7 +107,10 @@ def build_cons_seq(cons_list, read_count, cons_threshold=0.80, do_not_include=No
  if base not in ('A', 'T', 'C', 'G'):
  continue

- cons_dict[base] = count/read_count
+ if max_line_count is None:
+ cons_dict[base] = count/read_count
+ else:
+ cons_dict[base] = count/max_line_count

  if count > max_count:
  max_count = count
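Note on the new `max_line_count` parameter above: when the callers cap the MCP scan at `MCP_MAX_LINE_COUNT` reads (see `fetch_mcp()` and the changes in `assess_inflection_point_mcp.py` and `assess_mcp_proportions.py` further down), the per-base conservation is divided by that cap rather than by the full read count, so the proportions stay consistent with the number of lines actually scanned. A tiny illustration (toy numbers, not package code) of the denominator switch:

```python
from typing import Optional

# Toy illustration of the denominator switch: conservation is count / read_count
# normally, but count / max_line_count when the MCP scan was capped.

def base_conservation(count: int, read_count: int, max_line_count: Optional[int] = None) -> float:
    denominator = read_count if max_line_count is None else max_line_count
    return count / denominator


# e.g. 250_000 'A's at a position: against 1_000_000 reads this is 0.25, but if
# only the first 300_000 reads were scanned the proportion is taken over the cap.
assert base_conservation(250_000, 1_000_000) == 0.25
assert round(base_conservation(250_000, 1_000_000, max_line_count=300_000), 4) == 0.8333
```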
@@ -160,7 +185,7 @@ def build_mcp_cons_dict_list(mcp_count_dict, mcp_len):

  return mcp_cons_list

- def fetch_mcp(fastq, prefix_len, start=1, rev=False):
+ def fetch_mcp(fastq, prefix_len, start=1, rev=False, max_line_count=None):
  """
  Generates the most common prefix sequences along with their counts in a fastq file.
  Outputs dictionary containing counts for each generated MCP in the fastq.
@@ -177,6 +202,9 @@ def fetch_mcp(fastq, prefix_len, start=1, rev=False):
  else:
  rev_line = line[::-1]
  selected_lines.append(rev_line[start-1:start+prefix_len-1])
+ if max_line_count != None:
+ if len(selected_lines) > max_line_count:
+ break

  sequence_counts = Counter(selected_lines)
  mcp_count_dict = dict(sorted(sequence_counts.items(), key=lambda x: x[1], reverse=True))
@@ -21,6 +21,7 @@ import numpy as np
  import pandas as pd

  from mgnify_pipelines_toolkit.analysis.amplicon.amplicon_utils import get_read_count, build_cons_seq, build_mcp_cons_dict_list, fetch_mcp
+ from mgnify_pipelines_toolkit.constants.thresholds import MCP_MAX_LINE_COUNT

  def parse_args():

@@ -65,14 +66,18 @@ def assess_inflection_point_mcp_for_sample(_PATH, inf_point_list, rev=False):

  read_count = get_read_count(_PATH) # get readcount from fastq

+ max_line_count = None
+ if read_count > MCP_MAX_LINE_COUNT:
+ max_line_count = MCP_MAX_LINE_COUNT
+
  n_prop = 0.8

  for start in inf_point_list: # Looping through the pre-inflection point mcps
  mcp_len = start + 4 # length of pre-inf mcps is inflection point + 4

- mcp_count_dict = fetch_mcp(_PATH, mcp_len, rev=rev) # get MCP count dict
+ mcp_count_dict = fetch_mcp(_PATH, mcp_len, rev=rev, max_line_count=max_line_count) # get MCP count dict
  mcp_cons_list = build_mcp_cons_dict_list(mcp_count_dict, mcp_len) # list of base conservation dicts for mcps
- cons_seq, cons_confs = build_cons_seq(mcp_cons_list, read_count, n_prop, do_not_include_list) # get list of max base conservations for each index
+ cons_seq, cons_confs = build_cons_seq(mcp_cons_list, read_count, n_prop, do_not_include_list, max_line_count=max_line_count) # get list of max base conservations for each index
  # also get consensus sequence
  cons_seq_list.append(cons_seq)
  start_confs.append(np.mean(cons_confs))
@@ -83,9 +88,9 @@
  subs_len = start_cons_lens[i] # length of respective pre-inf point sequence
  l = mcp_len + subs_len - 1 # final index of MCP

- mcp_count_dict = fetch_mcp(_PATH, l, mcp_len, rev=rev)
+ mcp_count_dict = fetch_mcp(_PATH, l, mcp_len, rev=rev, max_line_count=max_line_count)
  mcp_cons_list = build_mcp_cons_dict_list(mcp_count_dict, subs_len)
- cons_seq, cons_confs = build_cons_seq(mcp_cons_list, read_count, n_prop, do_not_include_list, subs_len)
+ cons_seq, cons_confs = build_cons_seq(mcp_cons_list, read_count, n_prop, do_not_include_list, subs_len, max_line_count=max_line_count)

  end_confs.append(np.mean(cons_confs))

@@ -22,6 +22,7 @@ import pandas as pd
  import numpy as np

  from mgnify_pipelines_toolkit.analysis.amplicon.amplicon_utils import get_read_count, build_cons_seq, build_mcp_cons_dict_list, fetch_mcp
+ from mgnify_pipelines_toolkit.constants.thresholds import MCP_MAX_LINE_COUNT

  def parse_args():

@@ -67,9 +68,14 @@ def find_mcp_props_for_sample(_PATH, rev=False):
  end = start+mcp_len-1 # compute the final index for the mcp (inclusive). Indices are of base 1 not 0.

  read_count = get_read_count(_PATH, type='fastq') # get read count for fastq file
- mcp_count_dict = fetch_mcp(_PATH, end, start, rev) # get MCP count dict
+
+ max_line_count = None
+ if read_count > MCP_MAX_LINE_COUNT:
+ max_line_count = MCP_MAX_LINE_COUNT
+
+ mcp_count_dict = fetch_mcp(_PATH, end, start, rev, max_line_count) # get MCP count dict
  mcp_cons_list = build_mcp_cons_dict_list(mcp_count_dict, mcp_len) # list of base conservation dicts for mcps
- cons_seq, cons_conf = build_cons_seq(mcp_cons_list, read_count) # get list of max base conservations for each index
+ cons_seq, cons_conf = build_cons_seq(mcp_cons_list, read_count, max_line_count=max_line_count) # get list of max base conservations for each index

  res_dict[start] = np.mean(cons_conf) # compute the mean

@@ -24,55 +24,14 @@ import sys
  import json
  import time

+ from mgnify_pipelines_toolkit.constants.thresholds import MIN_OVERLAP, MIN_SEQ_COUNT, MAX_ERROR_PROPORTION,MAX_INTERNAL_PRIMER_PROPORTION
+ from mgnify_pipelines_toolkit.constants.var_region_coordinates import REGIONS_16S_BACTERIA, REGIONS_16S_ARCHAEA, REGIONS_18S
+
  raw_f_regex = re.compile(
  "([A-z0-9\.\-\:]+)\s+-\s+(\w+)\s+(\w+)\s+(\w+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+([-+])\s+([-+])\s+(\d+)\s+(\d+[\.\d]*)\s+(\d+[\.\d]*)\s+(\d+[\.\d]*)\s+(.+)\s!\s+.*")

- MIN_OVERLAP = 0.95
-
- MIN_SEQ_COUNT = 5000
-
- MAX_ERROR_PROPORTION = 0.01
-
- MAX_INTERNAL_PRIMER_PROPORTION = 0.2
-
- regions_16S_bacteria = {
- 'V1': [69, 92],
- 'V2': [131, 239],
- 'V3': [430, 487],
- 'V4': [566, 672],
- 'V5': [812, 869],
- 'V6': [976, 1033],
- 'V7': [1107, 1164],
- 'V8': [1234, 1285],
- 'V9': [1426, 1456]
- }
-
- regions_16S_archaea = {
- 'V1': [61, 79],
- 'V2': [114, 223],
- 'V3': [397, 436],
- 'V4': [516, 623],
- 'V5': [763, 824],
- 'V6': [932, 982],
- 'V7': [1056, 1119],
- 'V8': [1189, 1240],
- 'V9': [1372, 1410]
- }
-
- regions_18S = {
- 'V1': [69, 109],
- 'V2': [136, 298],
- 'V3': [474, 545],
- 'V4': [627, 873],
- 'V5': [1059, 1102],
- 'V7': [1366, 1454],
- 'V8': [1526, 1608],
- 'V9': [1728, 1795]
- }
-
  logging.basicConfig(level=logging.DEBUG)

-
  def calc_overlap(read, reg):
  read_s, read_f = read
  reg_s, reg_f = reg
@@ -207,11 +166,11 @@ def determine_cm(cm_detected):
  model: A dictionary containing the coordinates of the variable regions for the matched model.
  """
  if cm_detected == 'RF00177':
- model = regions_16S_bacteria
+ model = REGIONS_16S_BACTERIA
  elif cm_detected == 'RF01959':
- model = regions_16S_archaea
+ model = REGIONS_16S_ARCHAEA
  elif cm_detected == 'RF01960':
- model = regions_18S
+ model = REGIONS_18S
  else:
  model = 'unsupported'
  return model
@@ -28,7 +28,7 @@ def parse_args():

  parser = argparse.ArgumentParser()

- parser.add_argument("-t", "--taxa", required=True, type=str, help="Path to DADA2 taxa file")
+ parser.add_argument("-t", "--taxa", required=True, type=str, help="Path to taxa file")
  parser.add_argument("-f", "--fwd", required=True, type=str, help="Path to DADA2 forward map file")
  parser.add_argument("-r", "--rev", required=False, type=str, help="Path to DADA2 reverse map file")
  parser.add_argument("-a", "--amp", required=True, type=str, help="Path to extracted amp_region reads from inference subworkflow")
@@ -49,7 +49,7 @@ def parse_args():

  def order_df(taxa_df):

- if len(taxa_df.columns) == 8:
+ if len(taxa_df.columns) == 9:
  taxa_df = taxa_df.sort_values(_SILVA_TAX_RANKS, ascending=True)
  elif len(taxa_df.columns) == 10:
  taxa_df = taxa_df.sort_values(_PR2_TAX_RANKS, ascending=True)
@@ -66,11 +66,13 @@ def make_tax_assignment_dict_silva(taxa_df, asv_dict):
  for i in range(len(taxa_df)):

  sorted_index = taxa_df.index[i]
- asv_count = asv_dict[sorted_index]
+ asv_num = taxa_df.iloc[i, 0]
+ asv_count = asv_dict[asv_num]

  if asv_count == 0:
  continue

+ sk = taxa_df.loc[sorted_index, "Superkingdom"]
  k = taxa_df.loc[sorted_index, "Kingdom"]
  p = taxa_df.loc[sorted_index, "Phylum"]
  c = taxa_df.loc[sorted_index, "Class"]
@@ -83,47 +85,53 @@ def make_tax_assignment_dict_silva(taxa_df, asv_dict):

  while True:

+ if sk != "0":
+ sk = "_".join(sk.split(" "))
+ tax_assignment += sk
+ else:
+ break
+
  if k != "0":
  k = "_".join(k.split(" "))
- if k == "Archaea" or k == "Bacteria":
- tax_assignment += f"sk__{k}"
- elif k == "Eukaryota":
- tax_assignment += f"sk__Eukaryota"
- else:
- tax_assignment += f"sk__Eukaryota\tk__{k}"
+ tax_assignment += f"\t{k}"
+ elif sk != "0":
+ tax_assignment += f"\tk__"
  else:
  break

  if p != "0":
- if k == "Archaea" or k == "Bacteria":
- tax_assignment += f"\tk__"
  p = "_".join(p.split(" "))
- tax_assignment += f"\tp__{p}"
+ tax_assignment += f"\t{p}"
  else:
  break
+
  if c != "0":
  c = "_".join(c.split(" "))
- tax_assignment += f"\tc__{c}"
+ tax_assignment += f"\t{c}"
  else:
  break
+
  if o != "0":
  o = "_".join(o.split(" "))
- tax_assignment += f"\to__{o}"
+ tax_assignment += f"\t{o}"
  else:
  break
+
  if f != "0":
  f = "_".join(f.split(" "))
- tax_assignment += f"\tf__{f}"
+ tax_assignment += f"\t{f}"
  else:
  break
+
  if g != "0":
  g = "_".join(g.split(" "))
- tax_assignment += f"\tg__{g}"
+ tax_assignment += f"\t{g}"
  else:
  break
+
  if s != "0":
  s = "_".join(s.split(" "))
- tax_assignment += f"\ts__{s}"
+ tax_assignment += f"\t{s}"
  break

  if tax_assignment == "":
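Note on the rewritten SILVA block above: the lineage is now built as plain tab-separated rank values (with a `Superkingdom` value first), stopping at the first unassigned rank, and the one special case is that a missing Kingdom under a non-empty Superkingdom is written as a bare `k__` placeholder rather than truncating the lineage. A compact illustration (toy values, not package code) of the truncate-at-first-missing-rank pattern:

```python
# Toy illustration of building a tab-separated lineage that stops at the first
# unassigned rank ("0"), mirroring the spaces-to-underscores handling above.
# (The real code additionally handles the Superkingdom/Kingdom "k__" special case.)

def build_lineage(ranks):
    lineage = []
    for value in ranks:
        if value == "0":
            break
        lineage.append("_".join(value.split(" ")))
    return "\t".join(lineage)


print(build_lineage(["Eukaryota", "Fungi", "0", "Eurotiomycetes"]))
# -> "Eukaryota<TAB>Fungi": everything after the first missing rank is dropped
print(build_lineage(["Eukaryota", "Fungi", "Ascomycota", "Eurotiomycetes"]))
# -> all four ranks joined with tabs
```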
@@ -140,7 +148,8 @@ def make_tax_assignment_dict_pr2(taxa_df, asv_dict):
  for i in range(len(taxa_df)):

  sorted_index = taxa_df.index[i]
- asv_count = asv_dict[sorted_index]
+ asv_num = taxa_df.iloc[i, 0]
+ asv_count = asv_dict[asv_num]

  if asv_count == 0:
  continue
@@ -161,45 +170,55 @@

  if d != "0":
  d = "_".join(d.split(" "))
- tax_assignment += f"d__{d}"
+ tax_assignment += d
  else:
  break

  if sg != "0":
  sg = "_".join(sg.split(" "))
- tax_assignment += f"\tsg__{sg}"
+ tax_assignment += f"\t{sg}"
  else:
  break
+
  if dv != "0":
  dv = "_".join(dv.split(" "))
- tax_assignment += f"\tdv__{dv}"
+ tax_assignment += f"\t{dv}"
+ else:
+ break

  if sdv != "0":
  sdv = "_".join(sdv.split(" "))
- tax_assignment += f"\tsdv__{sdv}"
+ tax_assignment += f"\t{sdv}"
+ else:
+ break
+
  if c != "0":
  c = "_".join(c.split(" "))
- tax_assignment += f"\tc__{c}"
+ tax_assignment += f"\t{c}"
  else:
  break
+
  if o != "0":
  o = "_".join(o.split(" "))
- tax_assignment += f"\to__{o}"
+ tax_assignment += f"\t{o}"
  else:
  break
+
  if f != "0":
  f = "_".join(f.split(" "))
- tax_assignment += f"\tf__{f}"
+ tax_assignment += f"\t{f}"
  else:
  break
+
  if g != "0":
  g = "_".join(g.split(" "))
- tax_assignment += f"\tg__{g}"
+ tax_assignment += f"\t{g}"
  else:
  break
+
  if s != "0":
  s = "_".join(s.split(" "))
- tax_assignment += f"\ts__{s}"
+ tax_assignment += f"\t{s}"
  break

  if tax_assignment == "":
@@ -253,7 +272,7 @@ def main():
  asv_intersection = fwd_asvs

  if headers[counter] in amp_reads:
- asv_dict[int(asv_intersection[0]) - 1] += 1
+ asv_dict[f"seq_{int(asv_intersection[0]) - 1}"] += 1

  fwd_fr.close()
  if paired_end:
@@ -261,7 +280,7 @@

  ref_db = ""

- if len(taxa_df.columns) == 8:
+ if len(taxa_df.columns) == 9:
  tax_assignment_dict = make_tax_assignment_dict_silva(taxa_df, asv_dict)
  ref_db = "silva"
  elif len(taxa_df.columns) == 10:
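Note on the `make_asv_count_table.py` changes above: the ASV counter is now keyed by the `seq_<n>` labels found in the first column of the taxa table (rather than by row position), and a SILVA taxa table is now expected to have 9 columns because `Superkingdom` was added to `_SILVA_TAX_RANKS`. A minimal toy illustration (made-up data, not package code) of the name-based lookup:

```python
from collections import defaultdict

import pandas as pd

# Toy illustration: counts keyed by the same "seq_<n>" labels that sit in the
# first column of the taxa table, so lookups still work after the table is re-sorted.
asv_dict = defaultdict(int)
asv_dict["seq_0"] += 3   # e.g. three amplicon reads assigned to the first ASV
asv_dict["seq_2"] += 1

taxa_df = pd.DataFrame({
    "ASV": ["seq_0", "seq_1", "seq_2"],                 # hypothetical first column
    "Superkingdom": ["Bacteria", "Bacteria", "Archaea"],
})

for i in range(len(taxa_df)):
    asv_num = taxa_df.iloc[i, 0]
    print(asv_num, asv_dict[asv_num])                   # seq_0 3 / seq_1 0 / seq_2 1
```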
@@ -0,0 +1,126 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+
+ # Copyright 2024 EMBL - European Bioinformatics Institute
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import argparse
+ from collections import defaultdict
+ import logging
+
+ import pandas as pd
+
+ logging.basicConfig(level=logging.DEBUG)
+
+ def parse_args():
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument("-i", "--input", required=True, type=str, help="Input from MAPseq output")
+ parser.add_argument("-l", "--label", choices=['DADA2-SILVA', 'DADA2-PR2'], required=True, type=str, help="Database label - either DADA2-SILVA or DADA2-PR2")
+ parser.add_argument("-s", "--sample", required=True, type=str, help="Sample ID")
+
+ args = parser.parse_args()
+
+ _INPUT = args.input
+ _LABEL = args.label
+ _SAMPLE = args.sample
+
+ return _INPUT, _LABEL, _SAMPLE
+
+ def parse_label(label):
+
+ silva_short_ranks = ["sk__", "k__", "p__", "c__", "o__", "f__", "g__", "s__"]
+ pr2_short_ranks = ["d__", "sg__", "dv__", "sdv__", "c__", "o__", "f__", "g__", "s__"]
+
+ silva_long_ranks = ["Superkingdom", "Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species"]
+ pr2_long_ranks = ["Domain", "Supergroup", "Division", "Subdivision", "Class", "Order", "Family", "Genus", "Species"]
+
+ chosen_short_ranks = ''
+ chosen_long_ranks = ''
+
+ if label == 'DADA2-SILVA':
+ chosen_short_ranks = silva_short_ranks
+ chosen_long_ranks = silva_long_ranks
+ elif label == 'DADA2-PR2':
+ chosen_short_ranks = pr2_short_ranks
+ chosen_long_ranks = pr2_long_ranks
+ else:
+ logging.error("Incorrect database label - exiting.")
+ exit(1)
+
+ return chosen_short_ranks, chosen_long_ranks
+
+ def parse_mapseq(mseq_df, short_ranks, long_ranks):
+
+ res_dict = defaultdict(list)
+
+ for i in range(len(mseq_df)):
+ asv_id = mseq_df.iloc[i, 0]
+ tax_ass = mseq_df.iloc[i, 1].split(';')
+
+ res_dict['ASV'].append(asv_id)
+
+ for j in range(len(short_ranks)):
+
+ curr_rank = long_ranks[j]
+
+ if j >= len(tax_ass):
+ # This would only be true if the assigned taxonomy is shorter than the total reference database taxononmy
+ # so fill each remaining rank with its respective short rank blank
+ curr_tax = short_ranks[j]
+ else:
+ curr_tax = tax_ass[j]
+
+ res_dict[curr_rank].append(curr_tax)
+ res_df = pd.DataFrame.from_dict(res_dict)
+
+ return(res_df)
+
+ def process_blank_tax_ends(res_df, ranks):
+ # Necessary function as we want to replace consecutive blank assignments that start at the last rank as NAs
+ # while avoiding making blanks in the middle as NAs
+
+ for i in range(len(res_df)):
+ last_empty_rank = ''
+ currently_empty = False
+ for j in reversed(range(len(ranks))): # Parse an assignment backwards, from Species all the way to Superkingdom/Domain
+ curr_rank = res_df.iloc[i, j+1]
+ if curr_rank in ranks:
+ if last_empty_rank == '': # Last rank is empty, start window of consecutive blanks
+ last_empty_rank = j+1
+ currently_empty = True
+ elif currently_empty: # If we're in a window of consecutive blank assignments that started at the beginning
+ last_empty_rank = j+1
+ else:
+ break
+ else:
+ break
+ if last_empty_rank != '':
+ res_df.iloc[i, last_empty_rank:] = 'NA'
+
+ return res_df
+
+ def main():
+
+ _INPUT, _LABEL, _SAMPLE = parse_args()
+
+ mseq_df = pd.read_csv(_INPUT, header=1, delim_whitespace=True, usecols=[0, 12])
+
+ short_ranks, long_ranks = parse_label(_LABEL)
+ res_df = parse_mapseq(mseq_df, short_ranks, long_ranks)
+ final_res_df = process_blank_tax_ends(res_df, short_ranks)
+
+ final_res_df.to_csv(f"./{_SAMPLE}_{_LABEL}_asv_taxa.tsv", sep="\t", index=False)
+
+ if __name__ == "__main__":
+ main()
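Note on the new `mapseq_to_asv_table.py` above: `parse_mapseq()` pads a taxonomy string that is shorter than the reference rank list with the blank short-rank labels, and `process_blank_tax_ends()` then turns a run of blanks that reaches the last rank into `NA` while leaving mid-lineage blanks alone. A toy illustration of that padding step (the lineage string is made up, not real MAPseq output):

```python
# Toy illustration (not package code) of the padding behaviour described in the
# comments above, using a made-up SILVA-style MAPseq assignment.
silva_short_ranks = ["sk__", "k__", "p__", "c__", "o__", "f__", "g__", "s__"]

tax_ass = "sk__Bacteria;k__;p__Bacillota;c__Bacilli".split(";")

# Pad to the full rank list, as parse_mapseq() does for short assignments
padded = [tax_ass[j] if j < len(tax_ass) else silva_short_ranks[j]
          for j in range(len(silva_short_ranks))]
print(padded)
# ['sk__Bacteria', 'k__', 'p__Bacillota', 'c__Bacilli', 'o__', 'f__', 'g__', 's__']

# process_blank_tax_ends() would then report the trailing o__ .. s__ run as 'NA',
# while the mid-lineage blank k__ is kept as-is.
```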
@@ -14,5 +14,5 @@
  # See the License for the specific language governing permissions and
  # limitations under the License.

- _SILVA_TAX_RANKS = ["Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species"]
+ _SILVA_TAX_RANKS = ["Superkingdom", "Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species"]
  _PR2_TAX_RANKS = ["Domain", "Supergroup", "Division", "Subdivision", "Class", "Order", "Family", "Genus", "Species"]
@@ -0,0 +1,24 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+
+ # Copyright 2024 EMBL - European Bioinformatics Institute
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ # used by fetch_mcp in analysis.amplicon
+ MCP_MAX_LINE_COUNT = 300_000
+
+ # used by classify_var_regions in analysis.amplicon
+ MIN_OVERLAP = 0.95
+ MIN_SEQ_COUNT = 5000
+ MAX_ERROR_PROPORTION = 0.01
+ MAX_INTERNAL_PRIMER_PROPORTION = 0.2
@@ -0,0 +1,50 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+
+ # Copyright 2024 EMBL - European Bioinformatics Institute
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ REGIONS_16S_BACTERIA = {
+ 'V1': [69, 92],
+ 'V2': [131, 239],
+ 'V3': [430, 487],
+ 'V4': [566, 672],
+ 'V5': [812, 869],
+ 'V6': [976, 1033],
+ 'V7': [1107, 1164],
+ 'V8': [1234, 1285],
+ 'V9': [1426, 1456]
+ }
+
+ REGIONS_16S_ARCHAEA = {
+ 'V1': [61, 79],
+ 'V2': [114, 223],
+ 'V3': [397, 436],
+ 'V4': [516, 623],
+ 'V5': [763, 824],
+ 'V6': [932, 982],
+ 'V7': [1056, 1119],
+ 'V8': [1189, 1240],
+ 'V9': [1372, 1410]
+ }
+
+ REGIONS_18S = {
+ 'V1': [69, 109],
+ 'V2': [136, 298],
+ 'V3': [474, 545],
+ 'V4': [627, 873],
+ 'V5': [1059, 1102],
+ 'V7': [1366, 1454],
+ 'V8': [1526, 1608],
+ 'V9': [1728, 1795]
+ }
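Note: the coordinate dictionaries above are consumed by `classify_var_regions.py` together with `MIN_OVERLAP` from `constants/thresholds.py`. As a hedged sketch only (simplified, not the package's exact `calc_overlap()` logic, which is not fully shown in this diff), the overlap test can be pictured like this, with hypothetical read coordinates:

```python
# Hedged sketch of combining the region coordinates with MIN_OVERLAP to decide
# which 16S variable regions a read span covers. Values mirror the constants above.

REGIONS_16S_BACTERIA = {
    'V1': [69, 92], 'V2': [131, 239], 'V3': [430, 487], 'V4': [566, 672],
    'V5': [812, 869], 'V6': [976, 1033], 'V7': [1107, 1164],
    'V8': [1234, 1285], 'V9': [1426, 1456],
}
MIN_OVERLAP = 0.95  # fraction of the region that must be covered


def covered_regions(read_start, read_end, regions, min_overlap=MIN_OVERLAP):
    covered = []
    for name, (reg_start, reg_end) in regions.items():
        overlap = min(read_end, reg_end) - max(read_start, reg_start) + 1
        if overlap / (reg_end - reg_start + 1) >= min_overlap:
            covered.append(name)
    return covered


print(covered_regions(100, 700, REGIONS_16S_BACTERIA))  # ['V2', 'V3', 'V4']
```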
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: mgnify_pipelines_toolkit
- Version: 0.1.0
+ Version: 0.1.2
  Summary: Collection of scripts and tools for MGnify pipelines
  Author-email: MGnify team <metagenomics-help@ebi.ac.uk>
  License: Apache Software License 2.0
@@ -32,46 +32,37 @@ This Python package contains a collection of scripts and tools for including in
  - Scripts that don't have existing containers built to run them
  - Scripts for which building an entire container would be too bulky of a solution to deploy in pipelines

- This package can be built and uploaded to PyPi, to be installed using pip. The package bundles scripts and makes them executable from the command-line when this package is installed.
-
- > **Soon: this repository will be made available on bioconda for even easier integration in nextflow/nf-core pipelines**.
+ This package is built and uploaded to PyPi and bioconda. The package bundles scripts and makes them executable from the command-line when this package is installed.

  ## How to install

- Currently this package is only available on TestPyPi and is installed like this:
-
- `pip install -i https://test.pypi.org/simple/ --no-deps mgnify-pipelines-toolkit`
-
- You should then be able to run the packages from the command-line. For example to run the `get_subunits.py` script:
-
- `get_subunits -i ${easel_coords} -n ${meta.id}`
-
+ This package is available both on [PyPi](https://pypi.org/project/mgnify-pipelines-toolkit/) and bioconda.

- ## Building and uploading to PyPi
- This command this build the package:
+ To install from PyPi with pip:

- `python3 -m build`
+ `pip install mgnify-pipelines-toolkit`

- Then this command will upload it to Test-PyPi (you will need to generate an API token)
+ To install from bioconda with conda/mamba:

- `python3 -m twine upload --repository testpypi dist/mgnify_pipelines_toolkit-0.0.x*`
+ `conda install -c bioconda mgnify-pipelines-toolkit`

- To upload it to actual PyPi:
+ You should then be able to run the packages from the command-line. For example to run the `get_subunits.py` script:

- `python3 -m twine upload dist/mgnify_pipelines_toolkit-0.0.x*`
+ `get_subunits -i ${easel_coords} -n ${meta.id}`

  ## Adding a new script to the package

  ### New script requirements

  There are a few requirements for your script:
+
  - It needs to have a named main function of some kind. See `mgnify_pipelines_toolkit/analysis/shared/get_subunits.py` and the `main()` function for an example
  - Because this package is meant to be run from the command-line, make sure your script can easily pass arguments using tools like `argparse` or `click`
  - A small amount of dependencies. This requirement is subjective, but for example if your script only requires a handful of basic packages like `Biopython`, `numpy`, `pandas`, etc., then it's fine. However if the script has a more extensive list of dependencies, a container is probably a better fit.

  ### How to add a new script

- To add a new Python script, first copy it over to the `mgnify_pipelines_toolkit` directory in this repository, specifically to the subdirectory that makes the most sense. If none of the subdirectories make sense for your script, create a new one. If your script doesn't have a `main()` type function yet, write one.
+ To add a new Python script, first copy it over to the `mgnify_pipelines_toolkit` directory in this repository, specifically to the subdirectory that makes the most sense. If none of the subdirectories make sense for your script, create a new one. If your script doesn't have a `main()` type function yet, write one.

  Then, open `pyproject.toml` as you will need to add some bits. First, add any missing dependencies (include the version) to the `dependencies` field.

@@ -83,12 +74,18 @@ Then, scroll down to the `[project.scripts]` line. Here, you will create an alia

  - `get_subunits` is the alias
  - `mgnify_pipelines_toolkit.analysis.shared.get_subunits` will link the alias to the script with the path `mgnify_pipelines_toolkit/analysis/shared/get_subunits.py`
- - `:main` will specifically call the function named `main()` when the alias is run.
+ - `:main` will specifically call the function named `main()` when the alias is run.

  When you have setup this command, executing `get_subunits` on the command-line will be the equivalent of doing:

  `from mgnify_pipelines_toolkit.analysis.shared.get_subunits import main; main()`

- Finally, you will need to bump up the version in the `version` line. How/when we bump versions is to be determined.
+ You should then write at least one unit test for your addition. This package uses `pytest` at the moment for this purpose. A GitHub Action workflow will run all of the unit tests whenever a commit is pushed to any branch.
+
+ Finally, you will need to bump up the version in the `version` line.

  At the moment, these should be the only steps required to setup your script in this package (which is subject to change).
+
+ ### Building and uploading to PyPi
+
+ The building and pushing of the package is automated by GitHub Actions, which will activate only on a new release. Bioconda should then automatically pick up the new PyPi release and push it to their recipes, though it's worth keeping an eye on their automated pull requests just in case [here](https://github.com/bioconda/bioconda-recipes/pulls).
@@ -16,6 +16,7 @@ mgnify_pipelines_toolkit/analysis/amplicon/assess_mcp_proportions.py
  mgnify_pipelines_toolkit/analysis/amplicon/classify_var_regions.py
  mgnify_pipelines_toolkit/analysis/amplicon/find_mcp_inflection_points.py
  mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py
+ mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py
  mgnify_pipelines_toolkit/analysis/amplicon/remove_ambiguous_reads.py
  mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py
  mgnify_pipelines_toolkit/analysis/amplicon/standard_primer_matching.py
@@ -24,4 +25,6 @@ mgnify_pipelines_toolkit/analysis/shared/get_subunits.py
  mgnify_pipelines_toolkit/analysis/shared/get_subunits_coords.py
  mgnify_pipelines_toolkit/analysis/shared/mapseq2biom.py
  mgnify_pipelines_toolkit/constants/regex_ambiguous_bases.py
- mgnify_pipelines_toolkit/constants/tax_ranks.py
+ mgnify_pipelines_toolkit/constants/tax_ranks.py
+ mgnify_pipelines_toolkit/constants/thresholds.py
+ mgnify_pipelines_toolkit/constants/var_region_coordinates.py
@@ -8,6 +8,7 @@ get_subunits = mgnify_pipelines_toolkit.analysis.shared.get_subunits:main
  get_subunits_coords = mgnify_pipelines_toolkit.analysis.shared.get_subunits_coords:main
  make_asv_count_table = mgnify_pipelines_toolkit.analysis.amplicon.make_asv_count_table:main
  mapseq2biom = mgnify_pipelines_toolkit.analysis.shared.mapseq2biom:main
+ mapseq_to_asv_table = mgnify_pipelines_toolkit.analysis.amplicon.mapseq_to_asv_table:main
  remove_ambiguous_reads = mgnify_pipelines_toolkit.analysis.amplicon.remove_ambiguous_reads:main
  rev_comp_se_primers = mgnify_pipelines_toolkit.analysis.amplicon.rev_comp_se_primers:main
  standard_primer_matching = mgnify_pipelines_toolkit.analysis.amplicon.standard_primer_matching:main
@@ -1,6 +1,6 @@
  [project]
  name = "mgnify_pipelines_toolkit"
- version = "0.1.0"
+ version = "0.1.2"
  readme = "README.md"
  license = {text = "Apache Software License 2.0"}
  authors = [
@@ -49,6 +49,7 @@ make_asv_count_table = "mgnify_pipelines_toolkit.analysis.amplicon.make_asv_coun
  remove_ambiguous_reads = "mgnify_pipelines_toolkit.analysis.amplicon.remove_ambiguous_reads:main"
  rev_comp_se_primers = "mgnify_pipelines_toolkit.analysis.amplicon.rev_comp_se_primers:main"
  standard_primer_matching = "mgnify_pipelines_toolkit.analysis.amplicon.standard_primer_matching:main"
+ mapseq_to_asv_table = "mgnify_pipelines_toolkit.analysis.amplicon.mapseq_to_asv_table:main"

  [project.optional-dependencies]
  tests = [