mgnify-pipelines-toolkit 0.1.0__tar.gz → 0.1.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of mgnify-pipelines-toolkit might be problematic.
- {mgnify_pipelines_toolkit-0.1.0 → mgnify_pipelines_toolkit-0.1.2}/PKG-INFO +19 -22
- {mgnify_pipelines_toolkit-0.1.0 → mgnify_pipelines_toolkit-0.1.2}/README.md +19 -22
- {mgnify_pipelines_toolkit-0.1.0 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit/analysis/amplicon/amplicon_utils.py +37 -9
- {mgnify_pipelines_toolkit-0.1.0 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit/analysis/amplicon/assess_inflection_point_mcp.py +9 -4
- {mgnify_pipelines_toolkit-0.1.0 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit/analysis/amplicon/assess_mcp_proportions.py +8 -2
- {mgnify_pipelines_toolkit-0.1.0 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit/analysis/amplicon/classify_var_regions.py +6 -47
- {mgnify_pipelines_toolkit-0.1.0 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py +48 -29
- mgnify_pipelines_toolkit-0.1.2/mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py +126 -0
- {mgnify_pipelines_toolkit-0.1.0 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit/constants/tax_ranks.py +1 -1
- mgnify_pipelines_toolkit-0.1.2/mgnify_pipelines_toolkit/constants/thresholds.py +24 -0
- mgnify_pipelines_toolkit-0.1.2/mgnify_pipelines_toolkit/constants/var_region_coordinates.py +50 -0
- {mgnify_pipelines_toolkit-0.1.0 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit.egg-info/PKG-INFO +19 -22
- {mgnify_pipelines_toolkit-0.1.0 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit.egg-info/SOURCES.txt +4 -1
- {mgnify_pipelines_toolkit-0.1.0 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit.egg-info/entry_points.txt +1 -0
- {mgnify_pipelines_toolkit-0.1.0 → mgnify_pipelines_toolkit-0.1.2}/pyproject.toml +2 -1
- {mgnify_pipelines_toolkit-0.1.0 → mgnify_pipelines_toolkit-0.1.2}/LICENSE +0 -0
- {mgnify_pipelines_toolkit-0.1.0 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit/__init__.py +0 -0
- {mgnify_pipelines_toolkit-0.1.0 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit/analysis/__init__.py +0 -0
- {mgnify_pipelines_toolkit-0.1.0 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit/analysis/amplicon/are_there_primers.py +0 -0
- {mgnify_pipelines_toolkit-0.1.0 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit/analysis/amplicon/find_mcp_inflection_points.py +0 -0
- {mgnify_pipelines_toolkit-0.1.0 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit/analysis/amplicon/remove_ambiguous_reads.py +0 -0
- {mgnify_pipelines_toolkit-0.1.0 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py +0 -0
- {mgnify_pipelines_toolkit-0.1.0 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit/analysis/amplicon/standard_primer_matching.py +0 -0
- {mgnify_pipelines_toolkit-0.1.0 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit/analysis/shared/__init__.py +0 -0
- {mgnify_pipelines_toolkit-0.1.0 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit/analysis/shared/get_subunits.py +0 -0
- {mgnify_pipelines_toolkit-0.1.0 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit/analysis/shared/get_subunits_coords.py +0 -0
- {mgnify_pipelines_toolkit-0.1.0 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit/analysis/shared/mapseq2biom.py +0 -0
- {mgnify_pipelines_toolkit-0.1.0 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit/constants/regex_ambiguous_bases.py +0 -0
- {mgnify_pipelines_toolkit-0.1.0 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit.egg-info/dependency_links.txt +0 -0
- {mgnify_pipelines_toolkit-0.1.0 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit.egg-info/requires.txt +0 -0
- {mgnify_pipelines_toolkit-0.1.0 → mgnify_pipelines_toolkit-0.1.2}/mgnify_pipelines_toolkit.egg-info/top_level.txt +0 -0
- {mgnify_pipelines_toolkit-0.1.0 → mgnify_pipelines_toolkit-0.1.2}/setup.cfg +0 -0

--- mgnify_pipelines_toolkit-0.1.0/PKG-INFO
+++ mgnify_pipelines_toolkit-0.1.2/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: mgnify_pipelines_toolkit
-Version: 0.1.0
+Version: 0.1.2
 Summary: Collection of scripts and tools for MGnify pipelines
 Author-email: MGnify team <metagenomics-help@ebi.ac.uk>
 License: Apache Software License 2.0
@@ -32,46 +32,37 @@ This Python package contains a collection of scripts and tools for including in
 - Scripts that don't have existing containers built to run them
 - Scripts for which building an entire container would be too bulky of a solution to deploy in pipelines
 
-This package
-
-> **Soon: this repository will be made available on bioconda for even easier integration in nextflow/nf-core pipelines**.
+This package is built and uploaded to PyPi and bioconda. The package bundles scripts and makes them executable from the command-line when this package is installed.
 
 ## How to install
 
-
-
-`pip install -i https://test.pypi.org/simple/ --no-deps mgnify-pipelines-toolkit`
-
-You should then be able to run the packages from the command-line. For example to run the `get_subunits.py` script:
-
-`get_subunits -i ${easel_coords} -n ${meta.id}`
-
+This package is available both on [PyPi](https://pypi.org/project/mgnify-pipelines-toolkit/) and bioconda.
 
-
-This command this build the package:
+To install from PyPi with pip:
 
-`
+`pip install mgnify-pipelines-toolkit`
 
-
+To install from bioconda with conda/mamba:
 
-`
+`conda install -c bioconda mgnify-pipelines-toolkit`
 
-
+You should then be able to run the packages from the command-line. For example to run the `get_subunits.py` script:
 
-`
+`get_subunits -i ${easel_coords} -n ${meta.id}`
 
 ## Adding a new script to the package
 
 ### New script requirements
 
 There are a few requirements for your script:
+
 - It needs to have a named main function of some kind. See `mgnify_pipelines_toolkit/analysis/shared/get_subunits.py` and the `main()` function for an example
 - Because this package is meant to be run from the command-line, make sure your script can easily pass arguments using tools like `argparse` or `click`
 - A small amount of dependencies. This requirement is subjective, but for example if your script only requires a handful of basic packages like `Biopython`, `numpy`, `pandas`, etc., then it's fine. However if the script has a more extensive list of dependencies, a container is probably a better fit.
 
 ### How to add a new script
 
-To add a new Python script, first copy it over to the `mgnify_pipelines_toolkit` directory in this repository, specifically to the subdirectory that makes the most sense. If none of the subdirectories make sense for your script, create a new one. If your script doesn't have a `main()` type function yet, write one.
+To add a new Python script, first copy it over to the `mgnify_pipelines_toolkit` directory in this repository, specifically to the subdirectory that makes the most sense. If none of the subdirectories make sense for your script, create a new one. If your script doesn't have a `main()` type function yet, write one.
 
 Then, open `pyproject.toml` as you will need to add some bits. First, add any missing dependencies (include the version) to the `dependencies` field.
 
@@ -83,12 +74,18 @@ Then, scroll down to the `[project.scripts]` line. Here, you will create an alia
 
 - `get_subunits` is the alias
 - `mgnify_pipelines_toolkit.analysis.shared.get_subunits` will link the alias to the script with the path `mgnify_pipelines_toolkit/analysis/shared/get_subunits.py`
-- `:main` will specifically call the function named `main()` when the alias is run.
+- `:main` will specifically call the function named `main()` when the alias is run.
 
 When you have setup this command, executing `get_subunits` on the command-line will be the equivalent of doing:
 
 `from mgnify_pipelines_toolkit.analysis.shared.get_subunits import main; main()`
 
-
+You should then write at least one unit test for your addition. This package uses `pytest` at the moment for this purpose. A GitHub Action workflow will run all of the unit tests whenever a commit is pushed to any branch.
+
+Finally, you will need to bump up the version in the `version` line.
 
 At the moment, these should be the only steps required to setup your script in this package (which is subject to change).
+
+### Building and uploading to PyPi
+
+The building and pushing of the package is automated by GitHub Actions, which will activate only on a new release. Bioconda should then automatically pick up the new PyPi release and push it to their recipes, though it's worth keeping an eye on their automated pull requests just in case [here](https://github.com/bioconda/bioconda-recipes/pulls).
--- mgnify_pipelines_toolkit-0.1.0/README.md
+++ mgnify_pipelines_toolkit-0.1.2/README.md
@@ -7,46 +7,37 @@ This Python package contains a collection of scripts and tools for including in
 - Scripts that don't have existing containers built to run them
 - Scripts for which building an entire container would be too bulky of a solution to deploy in pipelines
 
-This package
-
-> **Soon: this repository will be made available on bioconda for even easier integration in nextflow/nf-core pipelines**.
+This package is built and uploaded to PyPi and bioconda. The package bundles scripts and makes them executable from the command-line when this package is installed.
 
 ## How to install
 
-
-
-`pip install -i https://test.pypi.org/simple/ --no-deps mgnify-pipelines-toolkit`
-
-You should then be able to run the packages from the command-line. For example to run the `get_subunits.py` script:
-
-`get_subunits -i ${easel_coords} -n ${meta.id}`
-
+This package is available both on [PyPi](https://pypi.org/project/mgnify-pipelines-toolkit/) and bioconda.
 
-
-This command this build the package:
+To install from PyPi with pip:
 
-`
+`pip install mgnify-pipelines-toolkit`
 
-
+To install from bioconda with conda/mamba:
 
-`
+`conda install -c bioconda mgnify-pipelines-toolkit`
 
-
+You should then be able to run the packages from the command-line. For example to run the `get_subunits.py` script:
 
-`
+`get_subunits -i ${easel_coords} -n ${meta.id}`
 
 ## Adding a new script to the package
 
 ### New script requirements
 
 There are a few requirements for your script:
+
 - It needs to have a named main function of some kind. See `mgnify_pipelines_toolkit/analysis/shared/get_subunits.py` and the `main()` function for an example
 - Because this package is meant to be run from the command-line, make sure your script can easily pass arguments using tools like `argparse` or `click`
 - A small amount of dependencies. This requirement is subjective, but for example if your script only requires a handful of basic packages like `Biopython`, `numpy`, `pandas`, etc., then it's fine. However if the script has a more extensive list of dependencies, a container is probably a better fit.
 
 ### How to add a new script
 
-To add a new Python script, first copy it over to the `mgnify_pipelines_toolkit` directory in this repository, specifically to the subdirectory that makes the most sense. If none of the subdirectories make sense for your script, create a new one. If your script doesn't have a `main()` type function yet, write one.
+To add a new Python script, first copy it over to the `mgnify_pipelines_toolkit` directory in this repository, specifically to the subdirectory that makes the most sense. If none of the subdirectories make sense for your script, create a new one. If your script doesn't have a `main()` type function yet, write one.
 
 Then, open `pyproject.toml` as you will need to add some bits. First, add any missing dependencies (include the version) to the `dependencies` field.
 
@@ -58,12 +49,18 @@ Then, scroll down to the `[project.scripts]` line. Here, you will create an alia
 
 - `get_subunits` is the alias
 - `mgnify_pipelines_toolkit.analysis.shared.get_subunits` will link the alias to the script with the path `mgnify_pipelines_toolkit/analysis/shared/get_subunits.py`
-- `:main` will specifically call the function named `main()` when the alias is run.
+- `:main` will specifically call the function named `main()` when the alias is run.
 
 When you have setup this command, executing `get_subunits` on the command-line will be the equivalent of doing:
 
 `from mgnify_pipelines_toolkit.analysis.shared.get_subunits import main; main()`
 
-
+You should then write at least one unit test for your addition. This package uses `pytest` at the moment for this purpose. A GitHub Action workflow will run all of the unit tests whenever a commit is pushed to any branch.
+
+Finally, you will need to bump up the version in the `version` line.
+
+At the moment, these should be the only steps required to setup your script in this package (which is subject to change).
+
+### Building and uploading to PyPi
 
-
+The building and pushing of the package is automated by GitHub Actions, which will activate only on a new release. Bioconda should then automatically pick up the new PyPi release and push it to their recipes, though it's worth keeping an eye on their automated pull requests just in case [here](https://github.com/bioconda/bioconda-recipes/pulls).
--- mgnify_pipelines_toolkit-0.1.0/mgnify_pipelines_toolkit/analysis/amplicon/amplicon_utils.py
+++ mgnify_pipelines_toolkit-0.1.2/mgnify_pipelines_toolkit/analysis/amplicon/amplicon_utils.py
@@ -15,12 +15,15 @@
 # limitations under the License.
 
 from collections import defaultdict, Counter
+import logging
 import gzip
 import os
 import subprocess
 
 from mgnify_pipelines_toolkit.constants.regex_ambiguous_bases import _AMBIGUOUS_BASES_DICT, _AMBIGUOUS_BASES_DICT_REV
 
+logging.basicConfig(level=logging.DEBUG)
+
 def split_dir_into_sample_paths(_DIR):
 
     file_list = os.listdir(_DIR)
@@ -34,14 +37,28 @@ def split_dir_into_sample_paths(_DIR):
 def get_read_count(read_path, type='fastq'):
 
     cmd = []
+    stdout = ''
 
     if type == 'fastq':
         cmd = [
-            '
-            '-c',
-            '^@',
+            'zcat',
             read_path
         ]
+        zcat_proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+        cmd = [
+            'sed',
+            '-n',
+            '1~4p',
+        ]
+        sed_proc = subprocess.Popen(cmd, stdin=zcat_proc.stdout, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+        cmd = [
+            'wc',
+            '-l'
+        ]
+        wc_proc = subprocess.Popen(cmd, stdin=sed_proc.stdout, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        stdout, stderr = wc_proc.communicate()
 
     elif type == 'fasta':
         cmd = [
@@ -50,15 +67,20 @@ def get_read_count(read_path, type='fastq'):
             '^>',
             read_path
         ]
+        grep_proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        stdout, stderr = grep_proc.communicate()
+
+    read_count = stdout.strip() if stdout is not None else ""
 
-
-
+    if not read_count.isdigit():
+        logging.error(f"Read count is not a digit, something is wrong. stdout: '{stdout}', stderr: '{stderr}'")
+        exit(1)
 
-    read_count = int(
+    read_count = int(read_count)
 
     return read_count
 
-def build_cons_seq(cons_list, read_count, cons_threshold=0.80, do_not_include=None, counter=1):
+def build_cons_seq(cons_list, read_count, cons_threshold=0.80, do_not_include=None, counter=1, max_line_count=None):
     """
     Generate consensus sequence using a list of base conservation dictionaries most likely
     generated by the `build_mcp_cons_dict_list()` function.
@@ -85,7 +107,10 @@ def build_cons_seq(cons_list, read_count, cons_threshold=0.80, do_not_include=No
             if base not in ('A', 'T', 'C', 'G'):
                 continue
 
-            cons_dict[base] = count/read_count
+            if max_line_count is None:
+                cons_dict[base] = count/read_count
+            else:
+                cons_dict[base] = count/max_line_count
 
             if count > max_count:
                 max_count = count
@@ -160,7 +185,7 @@ def build_mcp_cons_dict_list(mcp_count_dict, mcp_len):
 
     return mcp_cons_list
 
-def fetch_mcp(fastq, prefix_len, start=1, rev=False):
+def fetch_mcp(fastq, prefix_len, start=1, rev=False, max_line_count=None):
     """
     Generates the most common prefix sequences along with their counts in a fastq file.
     Outputs dictionary containing counts for each generated MCP in the fastq.
@@ -177,6 +202,9 @@ def fetch_mcp(fastq, prefix_len, start=1, rev=False):
         else:
             rev_line = line[::-1]
             selected_lines.append(rev_line[start-1:start+prefix_len-1])
+        if max_line_count != None:
+            if len(selected_lines) > max_line_count:
+                break
 
     sequence_counts = Counter(selected_lines)
     mcp_count_dict = dict(sorted(sequence_counts.items(), key=lambda x: x[1], reverse=True))
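
The rewritten `get_read_count` counts FASTQ records with a `zcat | sed -n '1~4p' | wc -l` pipeline: decompress the file, keep every fourth line starting from line 1 (the record headers), and count them — presumably because a plain `grep -c '^@'` can over-count when quality strings also begin with `@`. A minimal pure-Python sketch of the same count, for illustration only (the helper name is mine, and it assumes a gzipped FASTQ):

```python
import gzip

def count_fastq_records(path):
    # FASTQ stores four lines per record; lines 1, 5, 9, ... are the headers,
    # which is exactly what `sed -n '1~4p'` selects before `wc -l` counts them.
    with gzip.open(path, "rt") as handle:
        return sum(1 for i, _ in enumerate(handle) if i % 4 == 0)
```
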
--- mgnify_pipelines_toolkit-0.1.0/mgnify_pipelines_toolkit/analysis/amplicon/assess_inflection_point_mcp.py
+++ mgnify_pipelines_toolkit-0.1.2/mgnify_pipelines_toolkit/analysis/amplicon/assess_inflection_point_mcp.py
@@ -21,6 +21,7 @@ import numpy as np
 import pandas as pd
 
 from mgnify_pipelines_toolkit.analysis.amplicon.amplicon_utils import get_read_count, build_cons_seq, build_mcp_cons_dict_list, fetch_mcp
+from mgnify_pipelines_toolkit.constants.thresholds import MCP_MAX_LINE_COUNT
 
 def parse_args():
 
@@ -65,14 +66,18 @@ def assess_inflection_point_mcp_for_sample(_PATH, inf_point_list, rev=False):
 
     read_count = get_read_count(_PATH) # get readcount from fastq
 
+    max_line_count = None
+    if read_count > MCP_MAX_LINE_COUNT:
+        max_line_count = MCP_MAX_LINE_COUNT
+
     n_prop = 0.8
 
     for start in inf_point_list: # Looping through the pre-inflection point mcps
         mcp_len = start + 4 # length of pre-inf mcps is inflection point + 4
 
-        mcp_count_dict = fetch_mcp(_PATH, mcp_len, rev=rev) # get MCP count dict
+        mcp_count_dict = fetch_mcp(_PATH, mcp_len, rev=rev, max_line_count=max_line_count) # get MCP count dict
         mcp_cons_list = build_mcp_cons_dict_list(mcp_count_dict, mcp_len) # list of base conservation dicts for mcps
-        cons_seq, cons_confs = build_cons_seq(mcp_cons_list, read_count, n_prop, do_not_include_list) # get list of max base conservations for each index
+        cons_seq, cons_confs = build_cons_seq(mcp_cons_list, read_count, n_prop, do_not_include_list, max_line_count=max_line_count) # get list of max base conservations for each index
         # also get consensus sequence
         cons_seq_list.append(cons_seq)
         start_confs.append(np.mean(cons_confs))
@@ -83,9 +88,9 @@
         subs_len = start_cons_lens[i] # length of respective pre-inf point sequence
         l = mcp_len + subs_len - 1 # final index of MCP
 
-        mcp_count_dict = fetch_mcp(_PATH, l, mcp_len, rev=rev)
+        mcp_count_dict = fetch_mcp(_PATH, l, mcp_len, rev=rev, max_line_count=max_line_count)
         mcp_cons_list = build_mcp_cons_dict_list(mcp_count_dict, subs_len)
-        cons_seq, cons_confs = build_cons_seq(mcp_cons_list, read_count, n_prop, do_not_include_list, subs_len)
+        cons_seq, cons_confs = build_cons_seq(mcp_cons_list, read_count, n_prop, do_not_include_list, subs_len, max_line_count=max_line_count)
 
         end_confs.append(np.mean(cons_confs))
 
--- mgnify_pipelines_toolkit-0.1.0/mgnify_pipelines_toolkit/analysis/amplicon/assess_mcp_proportions.py
+++ mgnify_pipelines_toolkit-0.1.2/mgnify_pipelines_toolkit/analysis/amplicon/assess_mcp_proportions.py
@@ -22,6 +22,7 @@ import pandas as pd
 import numpy as np
 
 from mgnify_pipelines_toolkit.analysis.amplicon.amplicon_utils import get_read_count, build_cons_seq, build_mcp_cons_dict_list, fetch_mcp
+from mgnify_pipelines_toolkit.constants.thresholds import MCP_MAX_LINE_COUNT
 
 def parse_args():
 
@@ -67,9 +68,14 @@ def find_mcp_props_for_sample(_PATH, rev=False):
         end = start+mcp_len-1 # compute the final index for the mcp (inclusive). Indices are of base 1 not 0.
 
         read_count = get_read_count(_PATH, type='fastq') # get read count for fastq file
-        mcp_count_dict = fetch_mcp(_PATH, end, start, rev) # get MCP count dict
+
+        max_line_count = None
+        if read_count > MCP_MAX_LINE_COUNT:
+            max_line_count = MCP_MAX_LINE_COUNT
+
+        mcp_count_dict = fetch_mcp(_PATH, end, start, rev, max_line_count) # get MCP count dict
         mcp_cons_list = build_mcp_cons_dict_list(mcp_count_dict, mcp_len) # list of base conservation dicts for mcps
-        cons_seq, cons_conf = build_cons_seq(mcp_cons_list, read_count) # get list of max base conservations for each index
+        cons_seq, cons_conf = build_cons_seq(mcp_cons_list, read_count, max_line_count=max_line_count) # get list of max base conservations for each index
 
         res_dict[start] = np.mean(cons_conf) # compute the mean
 
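
Both MCP scripts now apply the same cap: when `get_read_count` reports more reads than `MCP_MAX_LINE_COUNT` (300,000, from the new `constants.thresholds` module), the cap is passed to `fetch_mcp`, which stops collecting prefixes once that many lines have been gathered, and to `build_cons_seq`, which then divides base counts by the cap rather than the full read count. A condensed sketch of that shared pattern, assuming the package is installed (the wrapper function is hypothetical, not part of the diff):

```python
from mgnify_pipelines_toolkit.analysis.amplicon.amplicon_utils import (
    get_read_count, fetch_mcp, build_mcp_cons_dict_list, build_cons_seq)
from mgnify_pipelines_toolkit.constants.thresholds import MCP_MAX_LINE_COUNT

def capped_consensus(fastq_path, mcp_len):
    """Hypothetical helper: consensus built from at most MCP_MAX_LINE_COUNT reads."""
    read_count = get_read_count(fastq_path)
    max_line_count = MCP_MAX_LINE_COUNT if read_count > MCP_MAX_LINE_COUNT else None
    mcp_count_dict = fetch_mcp(fastq_path, mcp_len, max_line_count=max_line_count)
    mcp_cons_list = build_mcp_cons_dict_list(mcp_count_dict, mcp_len)
    # When a cap is active, conservation proportions use the cap as the denominator.
    return build_cons_seq(mcp_cons_list, read_count, max_line_count=max_line_count)
```
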
--- mgnify_pipelines_toolkit-0.1.0/mgnify_pipelines_toolkit/analysis/amplicon/classify_var_regions.py
+++ mgnify_pipelines_toolkit-0.1.2/mgnify_pipelines_toolkit/analysis/amplicon/classify_var_regions.py
@@ -24,55 +24,14 @@ import sys
 import json
 import time
 
+from mgnify_pipelines_toolkit.constants.thresholds import MIN_OVERLAP, MIN_SEQ_COUNT, MAX_ERROR_PROPORTION,MAX_INTERNAL_PRIMER_PROPORTION
+from mgnify_pipelines_toolkit.constants.var_region_coordinates import REGIONS_16S_BACTERIA, REGIONS_16S_ARCHAEA, REGIONS_18S
+
 raw_f_regex = re.compile(
     "([A-z0-9\.\-\:]+)\s+-\s+(\w+)\s+(\w+)\s+(\w+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+([-+])\s+([-+])\s+(\d+)\s+(\d+[\.\d]*)\s+(\d+[\.\d]*)\s+(\d+[\.\d]*)\s+(.+)\s!\s+.*")
 
-MIN_OVERLAP = 0.95
-
-MIN_SEQ_COUNT = 5000
-
-MAX_ERROR_PROPORTION = 0.01
-
-MAX_INTERNAL_PRIMER_PROPORTION = 0.2
-
-regions_16S_bacteria = {
-    'V1': [69, 92],
-    'V2': [131, 239],
-    'V3': [430, 487],
-    'V4': [566, 672],
-    'V5': [812, 869],
-    'V6': [976, 1033],
-    'V7': [1107, 1164],
-    'V8': [1234, 1285],
-    'V9': [1426, 1456]
-}
-
-regions_16S_archaea = {
-    'V1': [61, 79],
-    'V2': [114, 223],
-    'V3': [397, 436],
-    'V4': [516, 623],
-    'V5': [763, 824],
-    'V6': [932, 982],
-    'V7': [1056, 1119],
-    'V8': [1189, 1240],
-    'V9': [1372, 1410]
-}
-
-regions_18S = {
-    'V1': [69, 109],
-    'V2': [136, 298],
-    'V3': [474, 545],
-    'V4': [627, 873],
-    'V5': [1059, 1102],
-    'V7': [1366, 1454],
-    'V8': [1526, 1608],
-    'V9': [1728, 1795]
-}
-
 logging.basicConfig(level=logging.DEBUG)
 
-
 def calc_overlap(read, reg):
     read_s, read_f = read
     reg_s, reg_f = reg
@@ -207,11 +166,11 @@ def determine_cm(cm_detected):
         model: A dictionary containing the coordinates of the variable regions for the matched model.
     """
     if cm_detected == 'RF00177':
-        model = regions_16S_bacteria
+        model = REGIONS_16S_BACTERIA
     elif cm_detected == 'RF01959':
-        model = regions_16S_archaea
+        model = REGIONS_16S_ARCHAEA
     elif cm_detected == 'RF01960':
-        model = regions_18S
+        model = REGIONS_18S
     else:
         model = 'unsupported'
     return model
--- mgnify_pipelines_toolkit-0.1.0/mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py
+++ mgnify_pipelines_toolkit-0.1.2/mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py
@@ -28,7 +28,7 @@ def parse_args():
 
     parser = argparse.ArgumentParser()
 
-    parser.add_argument("-t", "--taxa", required=True, type=str, help="Path to
+    parser.add_argument("-t", "--taxa", required=True, type=str, help="Path to taxa file")
     parser.add_argument("-f", "--fwd", required=True, type=str, help="Path to DADA2 forward map file")
     parser.add_argument("-r", "--rev", required=False, type=str, help="Path to DADA2 reverse map file")
     parser.add_argument("-a", "--amp", required=True, type=str, help="Path to extracted amp_region reads from inference subworkflow")
@@ -49,7 +49,7 @@ def parse_args():
 
 def order_df(taxa_df):
 
-    if len(taxa_df.columns) ==
+    if len(taxa_df.columns) == 9:
         taxa_df = taxa_df.sort_values(_SILVA_TAX_RANKS, ascending=True)
     elif len(taxa_df.columns) == 10:
         taxa_df = taxa_df.sort_values(_PR2_TAX_RANKS, ascending=True)
@@ -66,11 +66,13 @@ def make_tax_assignment_dict_silva(taxa_df, asv_dict):
     for i in range(len(taxa_df)):
 
         sorted_index = taxa_df.index[i]
-
+        asv_num = taxa_df.iloc[i, 0]
+        asv_count = asv_dict[asv_num]
 
         if asv_count == 0:
             continue
 
+        sk = taxa_df.loc[sorted_index, "Superkingdom"]
         k = taxa_df.loc[sorted_index, "Kingdom"]
         p = taxa_df.loc[sorted_index, "Phylum"]
         c = taxa_df.loc[sorted_index, "Class"]
@@ -83,47 +85,53 @@ def make_tax_assignment_dict_silva(taxa_df, asv_dict):
 
         while True:
 
+            if sk != "0":
+                sk = "_".join(sk.split(" "))
+                tax_assignment += sk
+            else:
+                break
+
             if k != "0":
                 k = "_".join(k.split(" "))
-
-
-
-                tax_assignment += f"sk__Eukaryota"
-            else:
-                tax_assignment += f"sk__Eukaryota\tk__{k}"
+                tax_assignment += f"\t{k}"
+            elif sk != "0":
+                tax_assignment += f"\tk__"
             else:
                 break
 
             if p != "0":
-                if k == "Archaea" or k == "Bacteria":
-                    tax_assignment += f"\tk__"
                 p = "_".join(p.split(" "))
-                tax_assignment += f"\
+                tax_assignment += f"\t{p}"
             else:
                 break
+
             if c != "0":
                 c = "_".join(c.split(" "))
-                tax_assignment += f"\
+                tax_assignment += f"\t{c}"
            else:
                 break
+
             if o != "0":
                 o = "_".join(o.split(" "))
-                tax_assignment += f"\
+                tax_assignment += f"\t{o}"
             else:
                 break
+
             if f != "0":
                 f = "_".join(f.split(" "))
-                tax_assignment += f"\
+                tax_assignment += f"\t{f}"
             else:
                 break
+
             if g != "0":
                 g = "_".join(g.split(" "))
-                tax_assignment += f"\
+                tax_assignment += f"\t{g}"
             else:
                 break
+
             if s != "0":
                 s = "_".join(s.split(" "))
-                tax_assignment += f"\
+                tax_assignment += f"\t{s}"
             break
 
         if tax_assignment == "":
@@ -140,7 +148,8 @@ def make_tax_assignment_dict_pr2(taxa_df, asv_dict):
     for i in range(len(taxa_df)):
 
         sorted_index = taxa_df.index[i]
-
+        asv_num = taxa_df.iloc[i, 0]
+        asv_count = asv_dict[asv_num]
 
         if asv_count == 0:
             continue
@@ -161,45 +170,55 @@ def make_tax_assignment_dict_pr2(taxa_df, asv_dict):
 
            if d != "0":
                 d = "_".join(d.split(" "))
-                tax_assignment +=
+                tax_assignment += d
             else:
                 break
 
             if sg != "0":
                 sg = "_".join(sg.split(" "))
-                tax_assignment += f"\
+                tax_assignment += f"\t{sg}"
             else:
                 break
+
             if dv != "0":
                 dv = "_".join(dv.split(" "))
-                tax_assignment += f"\
+                tax_assignment += f"\t{dv}"
+            else:
+                break
 
             if sdv != "0":
                 sdv = "_".join(sdv.split(" "))
-                tax_assignment += f"\
+                tax_assignment += f"\t{sdv}"
+            else:
+                break
+
             if c != "0":
                 c = "_".join(c.split(" "))
-                tax_assignment += f"\
+                tax_assignment += f"\t{c}"
             else:
                 break
+
             if o != "0":
                 o = "_".join(o.split(" "))
-                tax_assignment += f"\
+                tax_assignment += f"\t{o}"
             else:
                 break
+
             if f != "0":
                 f = "_".join(f.split(" "))
-                tax_assignment += f"\
+                tax_assignment += f"\t{f}"
             else:
                 break
+
             if g != "0":
                 g = "_".join(g.split(" "))
-                tax_assignment += f"\
+                tax_assignment += f"\t{g}"
             else:
                 break
+
             if s != "0":
                 s = "_".join(s.split(" "))
-                tax_assignment += f"\
+                tax_assignment += f"\t{s}"
             break
 
         if tax_assignment == "":
@@ -253,7 +272,7 @@ def main():
             asv_intersection = fwd_asvs
 
             if headers[counter] in amp_reads:
-                asv_dict[int(asv_intersection[0]) - 1] += 1
+                asv_dict[f"seq_{int(asv_intersection[0]) - 1}"] += 1
 
     fwd_fr.close()
     if paired_end:
@@ -261,7 +280,7 @@ def main():
 
     ref_db = ""
 
-    if len(taxa_df.columns) ==
+    if len(taxa_df.columns) == 9:
         tax_assignment_dict = make_tax_assignment_dict_silva(taxa_df, asv_dict)
         ref_db = "silva"
     elif len(taxa_df.columns) == 10:
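
Two behavioural points in this file are easy to miss: ASV counts are now keyed as `seq_<n>` strings rather than bare integers, and the SILVA taxonomy string is assembled by walking Superkingdom through Species, appending each rank with a tab and stopping at the first rank encoded as `"0"` (with a special case that pads `k__` when Kingdom is empty but Superkingdom is not). A simplified restatement of the main walk, for illustration only — it ignores that special case, and the example values are made up:

```python
def build_tax_string(rank_values):
    # Join rank values with tabs, replacing spaces with underscores and
    # stopping at the first empty rank (encoded as "0").
    parts = []
    for value in rank_values:
        if value == "0":
            break
        parts.append("_".join(value.split(" ")))
    return "\t".join(parts)

# build_tax_string(["sk__Bacteria", "k__", "p__Bacillota", "0", "0", "0", "0", "0"])
# -> "sk__Bacteria\tk__\tp__Bacillota"
```
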
--- /dev/null
+++ mgnify_pipelines_toolkit-0.1.2/mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py
@@ -0,0 +1,126 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# Copyright 2024 EMBL - European Bioinformatics Institute
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+from collections import defaultdict
+import logging
+
+import pandas as pd
+
+logging.basicConfig(level=logging.DEBUG)
+
+def parse_args():
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-i", "--input", required=True, type=str, help="Input from MAPseq output")
+    parser.add_argument("-l", "--label", choices=['DADA2-SILVA', 'DADA2-PR2'], required=True, type=str, help="Database label - either DADA2-SILVA or DADA2-PR2")
+    parser.add_argument("-s", "--sample", required=True, type=str, help="Sample ID")
+
+    args = parser.parse_args()
+
+    _INPUT = args.input
+    _LABEL = args.label
+    _SAMPLE = args.sample
+
+    return _INPUT, _LABEL, _SAMPLE
+
+def parse_label(label):
+
+    silva_short_ranks = ["sk__", "k__", "p__", "c__", "o__", "f__", "g__", "s__"]
+    pr2_short_ranks = ["d__", "sg__", "dv__", "sdv__", "c__", "o__", "f__", "g__", "s__"]
+
+    silva_long_ranks = ["Superkingdom", "Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species"]
+    pr2_long_ranks = ["Domain", "Supergroup", "Division", "Subdivision", "Class", "Order", "Family", "Genus", "Species"]
+
+    chosen_short_ranks = ''
+    chosen_long_ranks = ''
+
+    if label == 'DADA2-SILVA':
+        chosen_short_ranks = silva_short_ranks
+        chosen_long_ranks = silva_long_ranks
+    elif label == 'DADA2-PR2':
+        chosen_short_ranks = pr2_short_ranks
+        chosen_long_ranks = pr2_long_ranks
+    else:
+        logging.error("Incorrect database label - exiting.")
+        exit(1)
+
+    return chosen_short_ranks, chosen_long_ranks
+
+def parse_mapseq(mseq_df, short_ranks, long_ranks):
+
+    res_dict = defaultdict(list)
+
+    for i in range(len(mseq_df)):
+        asv_id = mseq_df.iloc[i, 0]
+        tax_ass = mseq_df.iloc[i, 1].split(';')
+
+        res_dict['ASV'].append(asv_id)
+
+        for j in range(len(short_ranks)):
+
+            curr_rank = long_ranks[j]
+
+            if j >= len(tax_ass):
+                # This would only be true if the assigned taxonomy is shorter than the total reference database taxononmy
+                # so fill each remaining rank with its respective short rank blank
+                curr_tax = short_ranks[j]
+            else:
+                curr_tax = tax_ass[j]
+
+            res_dict[curr_rank].append(curr_tax)
+    res_df = pd.DataFrame.from_dict(res_dict)
+
+    return(res_df)
+
+def process_blank_tax_ends(res_df, ranks):
+    # Necessary function as we want to replace consecutive blank assignments that start at the last rank as NAs
+    # while avoiding making blanks in the middle as NAs
+
+    for i in range(len(res_df)):
+        last_empty_rank = ''
+        currently_empty = False
+        for j in reversed(range(len(ranks))): # Parse an assignment backwards, from Species all the way to Superkingdom/Domain
+            curr_rank = res_df.iloc[i, j+1]
+            if curr_rank in ranks:
+                if last_empty_rank == '': # Last rank is empty, start window of consecutive blanks
+                    last_empty_rank = j+1
+                    currently_empty = True
+                elif currently_empty: # If we're in a window of consecutive blank assignments that started at the beginning
+                    last_empty_rank = j+1
+                else:
+                    break
+            else:
+                break
+        if last_empty_rank != '':
+            res_df.iloc[i, last_empty_rank:] = 'NA'
+
+    return res_df
+
+def main():
+
+    _INPUT, _LABEL, _SAMPLE = parse_args()
+
+    mseq_df = pd.read_csv(_INPUT, header=1, delim_whitespace=True, usecols=[0, 12])
+
+    short_ranks, long_ranks = parse_label(_LABEL)
+    res_df = parse_mapseq(mseq_df, short_ranks, long_ranks)
+    final_res_df = process_blank_tax_ends(res_df, short_ranks)
+
+    final_res_df.to_csv(f"./{_SAMPLE}_{_LABEL}_asv_taxa.tsv", sep="\t", index=False)
+
+if __name__ == "__main__":
+    main()
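
`process_blank_tax_ends` only rewrites trailing blanks: each assignment is scanned from Species back towards Superkingdom/Domain, and a run of bare rank placeholders that reaches the end of the row becomes `NA`, while placeholders in the middle of an assignment are kept. A small illustrative run (the sample values are invented; it assumes the package is installed):

```python
import pandas as pd

from mgnify_pipelines_toolkit.analysis.amplicon.mapseq_to_asv_table import process_blank_tax_ends

silva_short_ranks = ["sk__", "k__", "p__", "c__", "o__", "f__", "g__", "s__"]
row = {
    "ASV": ["seq_1"],
    "Superkingdom": ["sk__Bacteria"], "Kingdom": ["k__"], "Phylum": ["p__Bacillota"],
    "Class": ["c__Bacilli"], "Order": ["o__Lactobacillales"], "Family": ["f__"],
    "Genus": ["g__"], "Species": ["s__"],
}
out = process_blank_tax_ends(pd.DataFrame(row), silva_short_ranks)
# Family, Genus and Species become "NA"; the mid-row blank Kingdom ("k__") is kept.
print(out.iloc[0].tolist())
```
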
--- mgnify_pipelines_toolkit-0.1.0/mgnify_pipelines_toolkit/constants/tax_ranks.py
+++ mgnify_pipelines_toolkit-0.1.2/mgnify_pipelines_toolkit/constants/tax_ranks.py
@@ -14,5 +14,5 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-_SILVA_TAX_RANKS = ["Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species"]
+_SILVA_TAX_RANKS = ["Superkingdom", "Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species"]
 _PR2_TAX_RANKS = ["Domain", "Supergroup", "Division", "Subdivision", "Class", "Order", "Family", "Genus", "Species"]
--- /dev/null
+++ mgnify_pipelines_toolkit-0.1.2/mgnify_pipelines_toolkit/constants/thresholds.py
@@ -0,0 +1,24 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# Copyright 2024 EMBL - European Bioinformatics Institute
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# used by fetch_mcp in analysis.amplicon
+MCP_MAX_LINE_COUNT = 300_000
+
+# used by classify_var_regions in analysis.amplicon
+MIN_OVERLAP = 0.95
+MIN_SEQ_COUNT = 5000
+MAX_ERROR_PROPORTION = 0.01
+MAX_INTERNAL_PRIMER_PROPORTION = 0.2
--- /dev/null
+++ mgnify_pipelines_toolkit-0.1.2/mgnify_pipelines_toolkit/constants/var_region_coordinates.py
@@ -0,0 +1,50 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# Copyright 2024 EMBL - European Bioinformatics Institute
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+REGIONS_16S_BACTERIA = {
+    'V1': [69, 92],
+    'V2': [131, 239],
+    'V3': [430, 487],
+    'V4': [566, 672],
+    'V5': [812, 869],
+    'V6': [976, 1033],
+    'V7': [1107, 1164],
+    'V8': [1234, 1285],
+    'V9': [1426, 1456]
+}
+
+REGIONS_16S_ARCHAEA = {
+    'V1': [61, 79],
+    'V2': [114, 223],
+    'V3': [397, 436],
+    'V4': [516, 623],
+    'V5': [763, 824],
+    'V6': [932, 982],
+    'V7': [1056, 1119],
+    'V8': [1189, 1240],
+    'V9': [1372, 1410]
+}
+
+REGIONS_18S = {
+    'V1': [69, 109],
+    'V2': [136, 298],
+    'V3': [474, 545],
+    'V4': [627, 873],
+    'V5': [1059, 1102],
+    'V7': [1366, 1454],
+    'V8': [1526, 1608],
+    'V9': [1728, 1795]
+}
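
Each dictionary maps a variable region to its `[start, end]` coordinates on the corresponding rRNA model, so a window such as V3-V4 can be read straight off the constants. For illustration only (the variable names are mine):

```python
from mgnify_pipelines_toolkit.constants.var_region_coordinates import REGIONS_16S_BACTERIA

# The bacterial 16S V3-V4 window runs from the start of V3 to the end of V4.
v3_v4_start = REGIONS_16S_BACTERIA["V3"][0]  # 430
v3_v4_end = REGIONS_16S_BACTERIA["V4"][1]    # 672
print(f"16S (bacteria) V3-V4: {v3_v4_start}-{v3_v4_end}")
```
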
--- mgnify_pipelines_toolkit-0.1.0/mgnify_pipelines_toolkit.egg-info/PKG-INFO
+++ mgnify_pipelines_toolkit-0.1.2/mgnify_pipelines_toolkit.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: mgnify_pipelines_toolkit
-Version: 0.1.0
+Version: 0.1.2
 Summary: Collection of scripts and tools for MGnify pipelines
 Author-email: MGnify team <metagenomics-help@ebi.ac.uk>
 License: Apache Software License 2.0
@@ -32,46 +32,37 @@ This Python package contains a collection of scripts and tools for including in
 - Scripts that don't have existing containers built to run them
 - Scripts for which building an entire container would be too bulky of a solution to deploy in pipelines
 
-This package
-
-> **Soon: this repository will be made available on bioconda for even easier integration in nextflow/nf-core pipelines**.
+This package is built and uploaded to PyPi and bioconda. The package bundles scripts and makes them executable from the command-line when this package is installed.
 
 ## How to install
 
-
-
-`pip install -i https://test.pypi.org/simple/ --no-deps mgnify-pipelines-toolkit`
-
-You should then be able to run the packages from the command-line. For example to run the `get_subunits.py` script:
-
-`get_subunits -i ${easel_coords} -n ${meta.id}`
-
+This package is available both on [PyPi](https://pypi.org/project/mgnify-pipelines-toolkit/) and bioconda.
 
-
-This command this build the package:
+To install from PyPi with pip:
 
-`
+`pip install mgnify-pipelines-toolkit`
 
-
+To install from bioconda with conda/mamba:
 
-`
+`conda install -c bioconda mgnify-pipelines-toolkit`
 
-
+You should then be able to run the packages from the command-line. For example to run the `get_subunits.py` script:
 
-`
+`get_subunits -i ${easel_coords} -n ${meta.id}`
 
 ## Adding a new script to the package
 
 ### New script requirements
 
 There are a few requirements for your script:
+
 - It needs to have a named main function of some kind. See `mgnify_pipelines_toolkit/analysis/shared/get_subunits.py` and the `main()` function for an example
 - Because this package is meant to be run from the command-line, make sure your script can easily pass arguments using tools like `argparse` or `click`
 - A small amount of dependencies. This requirement is subjective, but for example if your script only requires a handful of basic packages like `Biopython`, `numpy`, `pandas`, etc., then it's fine. However if the script has a more extensive list of dependencies, a container is probably a better fit.
 
 ### How to add a new script
 
-To add a new Python script, first copy it over to the `mgnify_pipelines_toolkit` directory in this repository, specifically to the subdirectory that makes the most sense. If none of the subdirectories make sense for your script, create a new one. If your script doesn't have a `main()` type function yet, write one.
+To add a new Python script, first copy it over to the `mgnify_pipelines_toolkit` directory in this repository, specifically to the subdirectory that makes the most sense. If none of the subdirectories make sense for your script, create a new one. If your script doesn't have a `main()` type function yet, write one.
 
 Then, open `pyproject.toml` as you will need to add some bits. First, add any missing dependencies (include the version) to the `dependencies` field.
 
@@ -83,12 +74,18 @@ Then, scroll down to the `[project.scripts]` line. Here, you will create an alia
 
 - `get_subunits` is the alias
 - `mgnify_pipelines_toolkit.analysis.shared.get_subunits` will link the alias to the script with the path `mgnify_pipelines_toolkit/analysis/shared/get_subunits.py`
-- `:main` will specifically call the function named `main()` when the alias is run.
+- `:main` will specifically call the function named `main()` when the alias is run.
 
 When you have setup this command, executing `get_subunits` on the command-line will be the equivalent of doing:
 
 `from mgnify_pipelines_toolkit.analysis.shared.get_subunits import main; main()`
 
-
+You should then write at least one unit test for your addition. This package uses `pytest` at the moment for this purpose. A GitHub Action workflow will run all of the unit tests whenever a commit is pushed to any branch.
+
+Finally, you will need to bump up the version in the `version` line.
 
 At the moment, these should be the only steps required to setup your script in this package (which is subject to change).
+
+### Building and uploading to PyPi
+
+The building and pushing of the package is automated by GitHub Actions, which will activate only on a new release. Bioconda should then automatically pick up the new PyPi release and push it to their recipes, though it's worth keeping an eye on their automated pull requests just in case [here](https://github.com/bioconda/bioconda-recipes/pulls).
--- mgnify_pipelines_toolkit-0.1.0/mgnify_pipelines_toolkit.egg-info/SOURCES.txt
+++ mgnify_pipelines_toolkit-0.1.2/mgnify_pipelines_toolkit.egg-info/SOURCES.txt
@@ -16,6 +16,7 @@ mgnify_pipelines_toolkit/analysis/amplicon/assess_mcp_proportions.py
 mgnify_pipelines_toolkit/analysis/amplicon/classify_var_regions.py
 mgnify_pipelines_toolkit/analysis/amplicon/find_mcp_inflection_points.py
 mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py
+mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py
 mgnify_pipelines_toolkit/analysis/amplicon/remove_ambiguous_reads.py
 mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py
 mgnify_pipelines_toolkit/analysis/amplicon/standard_primer_matching.py
@@ -24,4 +25,6 @@ mgnify_pipelines_toolkit/analysis/shared/get_subunits.py
 mgnify_pipelines_toolkit/analysis/shared/get_subunits_coords.py
 mgnify_pipelines_toolkit/analysis/shared/mapseq2biom.py
 mgnify_pipelines_toolkit/constants/regex_ambiguous_bases.py
-mgnify_pipelines_toolkit/constants/tax_ranks.py
+mgnify_pipelines_toolkit/constants/tax_ranks.py
+mgnify_pipelines_toolkit/constants/thresholds.py
+mgnify_pipelines_toolkit/constants/var_region_coordinates.py
--- mgnify_pipelines_toolkit-0.1.0/mgnify_pipelines_toolkit.egg-info/entry_points.txt
+++ mgnify_pipelines_toolkit-0.1.2/mgnify_pipelines_toolkit.egg-info/entry_points.txt
@@ -8,6 +8,7 @@ get_subunits = mgnify_pipelines_toolkit.analysis.shared.get_subunits:main
 get_subunits_coords = mgnify_pipelines_toolkit.analysis.shared.get_subunits_coords:main
 make_asv_count_table = mgnify_pipelines_toolkit.analysis.amplicon.make_asv_count_table:main
 mapseq2biom = mgnify_pipelines_toolkit.analysis.shared.mapseq2biom:main
+mapseq_to_asv_table = mgnify_pipelines_toolkit.analysis.amplicon.mapseq_to_asv_table:main
 remove_ambiguous_reads = mgnify_pipelines_toolkit.analysis.amplicon.remove_ambiguous_reads:main
 rev_comp_se_primers = mgnify_pipelines_toolkit.analysis.amplicon.rev_comp_se_primers:main
 standard_primer_matching = mgnify_pipelines_toolkit.analysis.amplicon.standard_primer_matching:main
--- mgnify_pipelines_toolkit-0.1.0/pyproject.toml
+++ mgnify_pipelines_toolkit-0.1.2/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "mgnify_pipelines_toolkit"
-version = "0.1.0"
+version = "0.1.2"
 readme = "README.md"
 license = {text = "Apache Software License 2.0"}
 authors = [
@@ -49,6 +49,7 @@ make_asv_count_table = "mgnify_pipelines_toolkit.analysis.amplicon.make_asv_coun
 remove_ambiguous_reads = "mgnify_pipelines_toolkit.analysis.amplicon.remove_ambiguous_reads:main"
 rev_comp_se_primers = "mgnify_pipelines_toolkit.analysis.amplicon.rev_comp_se_primers:main"
 standard_primer_matching = "mgnify_pipelines_toolkit.analysis.amplicon.standard_primer_matching:main"
+mapseq_to_asv_table = "mgnify_pipelines_toolkit.analysis.amplicon.mapseq_to_asv_table:main"
 
 [project.optional-dependencies]
 tests = [