mgnify-pipelines-toolkit 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mgnify-pipelines-toolkit might be problematic. Click here for more details.
- mgnify_pipelines_toolkit/analysis/amplicon/amplicon_utils.py +74 -54
- mgnify_pipelines_toolkit/analysis/amplicon/are_there_primers.py +69 -42
- mgnify_pipelines_toolkit/analysis/amplicon/assess_inflection_point_mcp.py +120 -66
- mgnify_pipelines_toolkit/analysis/amplicon/assess_mcp_proportions.py +74 -45
- mgnify_pipelines_toolkit/analysis/amplicon/classify_var_regions.py +277 -148
- mgnify_pipelines_toolkit/analysis/amplicon/find_mcp_inflection_points.py +45 -28
- mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py +71 -40
- mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py +54 -16
- mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py +29 -12
- mgnify_pipelines_toolkit/analysis/amplicon/remove_ambiguous_reads.py +29 -19
- mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py +23 -13
- mgnify_pipelines_toolkit/analysis/amplicon/standard_primer_matching.py +127 -89
- mgnify_pipelines_toolkit/analysis/shared/fastq_suffix_header_check.py +140 -0
- mgnify_pipelines_toolkit/analysis/shared/get_subunits.py +55 -26
- mgnify_pipelines_toolkit/analysis/shared/get_subunits_coords.py +19 -13
- mgnify_pipelines_toolkit/analysis/shared/library_strategy_check.py +66 -0
- mgnify_pipelines_toolkit/analysis/shared/mapseq2biom.py +2 -2
- mgnify_pipelines_toolkit/constants/regex_ambiguous_bases.py +3 -5
- mgnify_pipelines_toolkit/constants/regex_fasta_header.py +20 -0
- mgnify_pipelines_toolkit/constants/tax_ranks.py +21 -2
- mgnify_pipelines_toolkit/constants/thresholds.py +4 -1
- mgnify_pipelines_toolkit/constants/var_region_coordinates.py +4 -4
- mgnify_pipelines_toolkit/utils/__init__.py +0 -0
- mgnify_pipelines_toolkit/utils/fasta_to_delimited.py +144 -0
- {mgnify_pipelines_toolkit-0.1.3.dist-info → mgnify_pipelines_toolkit-0.1.5.dist-info}/METADATA +18 -1
- mgnify_pipelines_toolkit-0.1.5.dist-info/RECORD +33 -0
- {mgnify_pipelines_toolkit-0.1.3.dist-info → mgnify_pipelines_toolkit-0.1.5.dist-info}/WHEEL +1 -1
- {mgnify_pipelines_toolkit-0.1.3.dist-info → mgnify_pipelines_toolkit-0.1.5.dist-info}/entry_points.txt +3 -0
- mgnify_pipelines_toolkit-0.1.3.dist-info/RECORD +0 -28
- {mgnify_pipelines_toolkit-0.1.3.dist-info → mgnify_pipelines_toolkit-0.1.5.dist-info}/LICENSE +0 -0
- {mgnify_pipelines_toolkit-0.1.3.dist-info → mgnify_pipelines_toolkit-0.1.5.dist-info}/top_level.txt +0 -0
|
@@ -16,41 +16,47 @@
|
|
|
16
16
|
|
|
17
17
|
import argparse
|
|
18
18
|
import sys
|
|
19
|
-
import os
|
|
20
|
-
import gzip
|
|
21
|
-
from Bio import SeqIO
|
|
22
19
|
|
|
23
20
|
|
|
24
21
|
def main():
    """Split coordinate lines into LSU and SSU files and record their counts.

    Command-line options:
        -i/--input: path of the input file (read line by line).
        -l/--lsu: substring identifying LSU lines.
        -s/--ssu: substring identifying SSU lines.

    Lines containing the LSU pattern are written to ``LSU_coords``; otherwise,
    lines containing the SSU pattern are written to ``SSU_coords``. The LSU
    pattern is tested first, so a line matching both is counted as LSU only.
    A two-line summary of both counts is written to ``RNA-counts``. All three
    files are created in the current working directory.
    """
    parser = argparse.ArgumentParser(description="Extract lsu, ssu and 5s")
    parser.add_argument(
        "-i", "--input", dest="input", help="Input fasta file", required=True
    )
    parser.add_argument("-l", "--lsu", dest="lsu", help="LSU pattern", required=True)
    parser.add_argument("-s", "--ssu", dest="ssu", help="SSU pattern", required=True)

    # With no arguments at all, show the usage text instead of an argparse error.
    if len(sys.argv) == 1:
        parser.print_help()
        return

    args = parser.parse_args()

    ssu_coords = "SSU_coords"
    lsu_coords = "LSU_coords"
    ssu_count = 0
    lsu_count = 0

    # The context managers close every handle, so no explicit close() is needed.
    with open(ssu_coords, "w") as out_ssu, open(lsu_coords, "w") as out_lsu:
        with open(args.input, "r") as input_fh:
            for line in input_fh:
                if args.lsu in line:
                    out_lsu.write(line)
                    lsu_count += 1
                elif args.ssu in line:
                    out_ssu.write(line)
                    ssu_count += 1

    with open("RNA-counts", "w") as count:
        count.write(f"LSU count\t{lsu_count}\nSSU count\t{ssu_count}")
|
|
54
59
|
|
|
60
|
+
|
|
55
61
|
if __name__ == "__main__":
|
|
56
62
|
main()
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
|
|
4
|
+
# Copyright 2024 EMBL - European Bioinformatics Institute
|
|
5
|
+
#
|
|
6
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
7
|
+
# you may not use this file except in compliance with the License.
|
|
8
|
+
# You may obtain a copy of the License at
|
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
10
|
+
#
|
|
11
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
12
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
13
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
14
|
+
# See the License for the specific language governing permissions and
|
|
15
|
+
# limitations under the License.
|
|
16
|
+
|
|
17
|
+
import argparse
|
|
18
|
+
|
|
19
|
+
import pandas as pd
|
|
20
|
+
import numpy as np
|
|
21
|
+
|
|
22
|
+
from mgnify_pipelines_toolkit.constants.thresholds import MIN_AMPLICON_STRATEGY_CHECK
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def parse_args(argv=None):
    """Parse the command-line options of the library strategy check.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default),
            argparse reads ``sys.argv[1:]``, which matches the previous
            behaviour of this function.

    Returns:
        A ``(input, sample, output)`` tuple of strings taken from the
        ``-i/--input``, ``-s/--sample`` and ``-o/--output`` options.
    """
    parser = argparse.ArgumentParser(
        description="Script that checks the output of assess_mcp_proportions.py to guess whether a FASTQ file is AMPLICON or NOT AMPLICON."
    )
    parser.add_argument("-i", "--input", required=True, type=str, help="Input")
    parser.add_argument("-s", "--sample", required=True, type=str, help="Sample ID")
    parser.add_argument("-o", "--output", required=True, type=str, help="Output")

    args = parser.parse_args(argv)

    # Return the attributes directly rather than via locals that shadow the
    # builtin ``input``.
    return args.input, args.sample, args.output
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def main():
    """Guess whether a sample is AMPLICON and write the verdict to a file.

    Reads the tab-separated input table, averages the values of its first
    data row (skipping the first column) and compares the mean against
    ``MIN_AMPLICON_STRATEGY_CHECK``. The result file
    ``{output}/{sample}_library_check_out.txt`` is always created: it contains
    ``AMPLICON`` when the mean reaches the threshold and is left empty
    otherwise. A one-line verdict is also printed to stdout.
    """
    input_path, sample, output = parse_args()

    cons_df = pd.read_csv(input_path, sep="\t")

    # First data row without its first column.
    # NOTE(review): presumably the first column is a label/ID — confirm.
    cons_values = cons_df.values[0][1:]
    mean_cons = np.mean(cons_values)

    # ``with`` guarantees the handle is closed even if a later step raises.
    with open(f"{output}/{sample}_library_check_out.txt", "w") as fw:
        if mean_cons >= MIN_AMPLICON_STRATEGY_CHECK:
            print("This data is likely to be AMPLICON.")
            fw.write("AMPLICON")  # File with "AMPLICON" written as a result.
        else:
            print("This data is unlikely to be AMPLICON.")
            # If unlikely to be AMPLICON, the output file will be empty.
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
if __name__ == "__main__":
|
|
66
|
+
main()
|
|
@@ -61,7 +61,7 @@ def main():
|
|
|
61
61
|
# Pull out the fields that we need
|
|
62
62
|
line = line.strip()
|
|
63
63
|
fields = line.split("\t")
|
|
64
|
-
|
|
64
|
+
|
|
65
65
|
if len(fields) < 14:
|
|
66
66
|
tax = "Unclassified"
|
|
67
67
|
else:
|
|
@@ -168,4 +168,4 @@ def main():
|
|
|
168
168
|
|
|
169
169
|
|
|
170
170
|
if __name__ == "__main__":
|
|
171
|
-
main()
|
|
171
|
+
main()
|
|
@@ -15,7 +15,6 @@
|
|
|
15
15
|
# limitations under the License.
|
|
16
16
|
|
|
17
17
|
_AMBIGUOUS_BASES_DICT = {
|
|
18
|
-
|
|
19
18
|
"R": "[AG]",
|
|
20
19
|
"Y": "[CT]",
|
|
21
20
|
"S": "[GC]",
|
|
@@ -26,11 +25,10 @@ _AMBIGUOUS_BASES_DICT = {
|
|
|
26
25
|
"D": "[AGT]",
|
|
27
26
|
"H": "[ACT]",
|
|
28
27
|
"V": "[ACG]",
|
|
29
|
-
"N": "[ACTG]"
|
|
28
|
+
"N": "[ACTG]",
|
|
30
29
|
}
|
|
31
30
|
|
|
32
31
|
_AMBIGUOUS_BASES_DICT_REV = {
|
|
33
|
-
|
|
34
32
|
"A,G": "R",
|
|
35
33
|
"C,T": "Y",
|
|
36
34
|
"C,G": "S",
|
|
@@ -41,5 +39,5 @@ _AMBIGUOUS_BASES_DICT_REV = {
|
|
|
41
39
|
"A,G,T": "D",
|
|
42
40
|
"A,C,T": "H",
|
|
43
41
|
"A,C,G": "V",
|
|
44
|
-
"A,C,G,T": "N"
|
|
45
|
-
}
|
|
42
|
+
"A,C,G,T": "N",
|
|
43
|
+
}
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
|
|
4
|
+
# Copyright 2024 EMBL - European Bioinformatics Institute
|
|
5
|
+
#
|
|
6
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
7
|
+
# you may not use this file except in compliance with the License.
|
|
8
|
+
# You may obtain a copy of the License at
|
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
10
|
+
#
|
|
11
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
12
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
13
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
14
|
+
# See the License for the specific language governing permissions and
|
|
15
|
+
# limitations under the License.
|
|
16
|
+
|
|
17
|
+
# Regular expressions for parsing FASTA header lines, keyed by header format
# name. Every field of interest is exposed as a named group.
FORMAT_REGEX_MAP = {
    # Pipe-separated "db|unique_identifier|entry_name", then the protein name
    # and the OS=, OX=, optional GN=, PE= and SV= fields, in that order.
    "uniprotkb": r"^(?P<db>\w+)\|(?P<unique_identifier>\w+)\|(?P<entry_name>\w+)\s(?P<protein_name>.+)\sOS=(?P<organism_name>.+)\sOX=(?P<organism_identifier>\d+)(\sGN=(?P<gene_name>.+))?\sPE=(?P<protein_existence>\d+)\sSV=(?P<sequence_version>\d+)",  # noqa: E501
    # "unique_identifier entry_name" followed by fields separated by the
    # literal "^|^"; some of those fields are skipped (matched by ".*").
    "rpxx": r"^(?P<unique_identifier>\S+)\s(?P<entry_name>\S+)\^\|\^.*\^\|\^(?P<protein_name>.+)\^\|\^.*\^\|\^.*\^\|\^(?P<organism_name>.+)\^\|\^(?P<organism_identifier>\d+)\^\|\^(?P<common_tax_name>.+)\^\|\^(?P<common_tax_identifier>\d+)",  # noqa: E501
}
|
|
@@ -14,5 +14,24 @@
|
|
|
14
14
|
# See the License for the specific language governing permissions and
|
|
15
15
|
# limitations under the License.
|
|
16
16
|
|
|
17
|
-
_SILVA_TAX_RANKS = [
|
|
18
|
-
|
|
17
|
+
_SILVA_TAX_RANKS = [
|
|
18
|
+
"Superkingdom",
|
|
19
|
+
"Kingdom",
|
|
20
|
+
"Phylum",
|
|
21
|
+
"Class",
|
|
22
|
+
"Order",
|
|
23
|
+
"Family",
|
|
24
|
+
"Genus",
|
|
25
|
+
"Species",
|
|
26
|
+
]
|
|
27
|
+
_PR2_TAX_RANKS = [
|
|
28
|
+
"Domain",
|
|
29
|
+
"Supergroup",
|
|
30
|
+
"Division",
|
|
31
|
+
"Subdivision",
|
|
32
|
+
"Class",
|
|
33
|
+
"Order",
|
|
34
|
+
"Family",
|
|
35
|
+
"Genus",
|
|
36
|
+
"Species",
|
|
37
|
+
]
|
|
@@ -21,4 +21,7 @@ MCP_MAX_LINE_COUNT = 300_000
|
|
|
21
21
|
MIN_OVERLAP = 0.95
|
|
22
22
|
MIN_SEQ_COUNT = 5000
|
|
23
23
|
MAX_ERROR_PROPORTION = 0.01
|
|
24
|
-
MAX_INTERNAL_PRIMER_PROPORTION = 0.2
|
|
24
|
+
MAX_INTERNAL_PRIMER_PROPORTION = 0.2
|
|
25
|
+
|
|
26
|
+
# used by library_strategy_checker in analysis.shared
|
|
27
|
+
MIN_AMPLICON_STRATEGY_CHECK = 0.30
|
|
@@ -23,7 +23,7 @@ REGIONS_16S_BACTERIA = {
|
|
|
23
23
|
"V6": [976, 1033],
|
|
24
24
|
"V7": [1107, 1164],
|
|
25
25
|
"V8": [1234, 1285],
|
|
26
|
-
"V9": [1426, 1456]
|
|
26
|
+
"V9": [1426, 1456],
|
|
27
27
|
}
|
|
28
28
|
|
|
29
29
|
REGIONS_16S_ARCHAEA = {
|
|
@@ -35,7 +35,7 @@ REGIONS_16S_ARCHAEA = {
|
|
|
35
35
|
"V6": [932, 982],
|
|
36
36
|
"V7": [1056, 1119],
|
|
37
37
|
"V8": [1189, 1240],
|
|
38
|
-
"V9": [1372, 1410]
|
|
38
|
+
"V9": [1372, 1410],
|
|
39
39
|
}
|
|
40
40
|
|
|
41
41
|
REGIONS_18S = {
|
|
@@ -46,5 +46,5 @@ REGIONS_18S = {
|
|
|
46
46
|
"V5": [1059, 1102],
|
|
47
47
|
"V7": [1366, 1454],
|
|
48
48
|
"V8": [1526, 1608],
|
|
49
|
-
"V9": [1728, 1795]
|
|
50
|
-
}
|
|
49
|
+
"V9": [1728, 1795],
|
|
50
|
+
}
|
|
File without changes
|
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
|
|
4
|
+
# Copyright 2024 EMBL - European Bioinformatics Institute
|
|
5
|
+
#
|
|
6
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
7
|
+
# you may not use this file except in compliance with the License.
|
|
8
|
+
# You may obtain a copy of the License at
|
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
10
|
+
#
|
|
11
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
12
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
13
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
14
|
+
# See the License for the specific language governing permissions and
|
|
15
|
+
# limitations under the License.
|
|
16
|
+
|
|
17
|
+
import argparse
|
|
18
|
+
import sys
|
|
19
|
+
import re
|
|
20
|
+
import csv
|
|
21
|
+
import hashlib
|
|
22
|
+
import gzip
|
|
23
|
+
from Bio import SeqIO
|
|
24
|
+
from mgnify_pipelines_toolkit.constants.regex_fasta_header import FORMAT_REGEX_MAP
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def is_gzipped(filepath):
    """Return True when the file at *filepath* starts with the gzip magic bytes."""
    gzip_magic = b"\x1f\x8b"
    with open(filepath, "rb") as handle:
        leading_bytes = handle.read(len(gzip_magic))
    return leading_bytes == gzip_magic
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def guess_header_format(header, format_regex_map=None):
    """Guess which known header format a FASTA header line follows.

    Every pattern in the format map is searched against *header*. Among the
    patterns that match, the one capturing the most groups wins; on a tie the
    format listed first in the map is kept.

    Args:
        header: FASTA header text to classify.
        format_regex_map: Optional mapping of format name to regex pattern.
            Defaults to the module-level ``FORMAT_REGEX_MAP``.

    Returns:
        The name (map key) of the best-matching format.

    Raises:
        ValueError: If no pattern matches the header.
    """
    if format_regex_map is None:
        format_regex_map = FORMAT_REGEX_MAP

    best_format = None
    best_score = -1

    for format_name, pattern in format_regex_map.items():
        match = re.search(pattern, header)
        if match is None:
            continue

        # Rank by how many groups actually captured something. Comparing the
        # captured tuples themselves would order formats by the captured text
        # and fail with TypeError when an unmatched optional group (None)
        # meets a string.
        score = sum(group is not None for group in match.groups())
        if score > best_score:
            best_format, best_score = format_name, score

    if best_format is None:
        raise ValueError("Header format could not be determined")

    return best_format
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def md5_hash(s):
    """Return the hexadecimal MD5 digest of the UTF-8 encoding of *s*."""
    encoded = s.encode("utf-8")
    return hashlib.md5(encoded).hexdigest()
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def parse_args(argv=None):
    """Parse the command-line options of the FASTA-to-delimited converter.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default),
            argparse reads ``sys.argv[1:]``, which matches the previous
            behaviour of this function.

    Returns:
        A tuple ``(path, output, format, delimiter, with_hash, no_header)``:
        the input path, the output path (``None`` when not given), the header
        format name (``"auto"`` by default), the column delimiter (tab by
        default) and the two boolean flags.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument("input", type=str, help="Path to (gzipped) Fasta file")
    parser.add_argument("-o", "--output", type=str, help="Output path")
    parser.add_argument(
        "-f",
        "--format",
        type=str,
        choices=["auto", "uniprotkb", "rpxx"],
        default="auto",
        help="Format of the input Fasta header",
    )
    parser.add_argument(
        "-d", "--delimiter", type=str, default="\t", help="Output column delimiter"
    )
    parser.add_argument(
        "--with-hash",
        action="store_true",
        help="Add a MD5 hash of the sequence to the output",
    )
    parser.add_argument(
        "--no-header", action="store_true", help="Do not add header to output file"
    )

    args = parser.parse_args(argv)

    # Return the attributes directly rather than via a local that shadows the
    # builtin ``format``.
    return (
        args.input,
        args.output,
        args.format,
        args.delimiter,
        args.with_hash,
        args.no_header,
    )
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def main():
    """Convert a (gzipped) FASTA file into a delimited table.

    Each record becomes one row. The columns are the named groups of the
    header regex for the selected format, followed by ``sequence_hash`` (only
    with ``--with-hash``) and ``sequence``. With format ``auto`` the format is
    guessed from the first record's header. Records whose header does not
    match the regex are still written, with only the sequence (and hash)
    columns filled in. Rows go to the ``--output`` path, or to stdout when no
    path is given.

    Raises:
        ValueError: If the format is ``auto`` and the input holds no FASTA
            records, or the first header matches no known format.
    """
    # Function-scope import so the block does not depend on a file-level change.
    from contextlib import ExitStack

    path, output, header_format, delimiter, with_hash, no_header = parse_args()

    # ExitStack closes only the handles this function opened, including when
    # a later open() fails.
    with ExitStack() as stack:
        opener = gzip.open if is_gzipped(path) else open
        input_fh = stack.enter_context(opener(path, mode="rt"))

        if output is None:
            # stdout is borrowed, not owned: it must stay open after we return.
            output_fh = sys.stdout
        else:
            output_fh = stack.enter_context(open(output, mode="w", newline=""))

        if header_format == "auto":
            try:
                first_header, _ = next(SeqIO.FastaIO.SimpleFastaParser(input_fh))
            except StopIteration:
                raise ValueError("Input file contains no FASTA records") from None
            header_format = guess_header_format(first_header)
            # Rewind so the record used for guessing is also converted.
            input_fh.seek(0)

        header_regex = re.compile(FORMAT_REGEX_MAP[header_format])

        fieldnames = list(header_regex.groupindex.keys())
        if with_hash:
            fieldnames.append("sequence_hash")
        fieldnames.append("sequence")

        csv_writer = csv.DictWriter(
            output_fh, fieldnames=fieldnames, delimiter=delimiter, extrasaction="ignore"
        )

        if not no_header:
            csv_writer.writeheader()

        for header, sequence in SeqIO.FastaIO.SimpleFastaParser(input_fh):
            row = {"sequence": sequence}

            header_match = header_regex.match(header)
            if header_match:
                row.update(header_match.groupdict())

            if with_hash:
                row["sequence_hash"] = md5_hash(sequence)

            csv_writer.writerow(row)
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
if __name__ == "__main__":
|
|
144
|
+
main()
|
{mgnify_pipelines_toolkit-0.1.3.dist-info → mgnify_pipelines_toolkit-0.1.5.dist-info}/METADATA
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: mgnify_pipelines_toolkit
|
|
3
|
-
Version: 0.1.
|
|
3
|
+
Version: 0.1.5
|
|
4
4
|
Summary: Collection of scripts and tools for MGnify pipelines
|
|
5
5
|
Author-email: MGnify team <metagenomics-help@ebi.ac.uk>
|
|
6
6
|
License: Apache Software License 2.0
|
|
@@ -15,6 +15,12 @@ Requires-Dist: biopython ==1.82
|
|
|
15
15
|
Requires-Dist: numpy ==1.26.0
|
|
16
16
|
Requires-Dist: pandas ==2.0.2
|
|
17
17
|
Requires-Dist: regex ==2023.12.25
|
|
18
|
+
Provides-Extra: dev
|
|
19
|
+
Requires-Dist: mgnify-pipelines-toolkit[tests] ; extra == 'dev'
|
|
20
|
+
Requires-Dist: pre-commit ==3.8.0 ; extra == 'dev'
|
|
21
|
+
Requires-Dist: black ==24.8.0 ; extra == 'dev'
|
|
22
|
+
Requires-Dist: flake8 ==7.1.1 ; extra == 'dev'
|
|
23
|
+
Requires-Dist: pep8-naming ==0.14.1 ; extra == 'dev'
|
|
18
24
|
Provides-Extra: tests
|
|
19
25
|
Requires-Dist: pytest ==7.4.0 ; extra == 'tests'
|
|
20
26
|
Requires-Dist: pytest-md ==0.2.0 ; extra == 'tests'
|
|
@@ -22,6 +28,7 @@ Requires-Dist: pytest-workflow ==2.0.1 ; extra == 'tests'
|
|
|
22
28
|
Requires-Dist: biopython ==1.82 ; extra == 'tests'
|
|
23
29
|
Requires-Dist: pandas ==2.0.2 ; extra == 'tests'
|
|
24
30
|
Requires-Dist: numpy ==1.26.0 ; extra == 'tests'
|
|
31
|
+
Requires-Dist: regex ==2023.12.25 ; extra == 'tests'
|
|
25
32
|
|
|
26
33
|
# mgnify-pipelines-toolkit
|
|
27
34
|
|
|
@@ -52,6 +59,16 @@ You should then be able to run the packages from the command-line. For example t
|
|
|
52
59
|
|
|
53
60
|
## Adding a new script to the package
|
|
54
61
|
|
|
62
|
+
### Local development requirements
|
|
63
|
+
Before starting any development, you should do these few steps:
|
|
64
|
+
- Clone the repo if you haven't already and create a feature branch from the `dev` branch (NOT `main`).
|
|
65
|
+
- Create a virtual environment with the tool of your choice (e.g. `conda create --name my_new_env`)
|
|
66
|
+
- Activate your new environment (e.g. `conda activate my_new_env`)
|
|
67
|
+
- Install dev dependencies `pip install -e '.[dev]'`
|
|
68
|
+
- Install pre-commit hooks `pre-commit install`
|
|
69
|
+
|
|
70
|
+
Following the steps above ensures that the code you add will be linted and formatted properly.
|
|
71
|
+
|
|
55
72
|
### New script requirements
|
|
56
73
|
|
|
57
74
|
There are a few requirements for your script:
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
mgnify_pipelines_toolkit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
+
mgnify_pipelines_toolkit/analysis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
|
+
mgnify_pipelines_toolkit/analysis/amplicon/amplicon_utils.py,sha256=ySMZkgRSg-dnh6HMAE_1Vx8_EvJj7AiHJ2FcCaXKI-s,6448
|
|
4
|
+
mgnify_pipelines_toolkit/analysis/amplicon/are_there_primers.py,sha256=P_BM3GTB1KKKDb5chDK7-6cP6KORJef7i8ub-XLDtM0,5289
|
|
5
|
+
mgnify_pipelines_toolkit/analysis/amplicon/assess_inflection_point_mcp.py,sha256=hVkg8-tdLLf1Ewy9hor-H9zsyi-n8dnuj_shTQ5_rrM,7548
|
|
6
|
+
mgnify_pipelines_toolkit/analysis/amplicon/assess_mcp_proportions.py,sha256=aNucaUnYejl2Not4YLMBSzyYWGYJvYwLPZcFE94TIDc,5355
|
|
7
|
+
mgnify_pipelines_toolkit/analysis/amplicon/classify_var_regions.py,sha256=kIuE2wo3FaFZw2-HRGxstKz29FyGuhqVDRhf_vPZgsA,19921
|
|
8
|
+
mgnify_pipelines_toolkit/analysis/amplicon/find_mcp_inflection_points.py,sha256=EnsIrPGigsy8jVnjYgSECihhuquSJTgCi-k6fhusKYM,3547
|
|
9
|
+
mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py,sha256=pvMDfq-KA9LrD359fvM1uXbXa2Mow5Fja-1iUoVdSEg,8676
|
|
10
|
+
mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py,sha256=9QI6o85T4JPFq4EdKmnYzI6sxPLJG6t9W0xKiu24aqw,5035
|
|
11
|
+
mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py,sha256=Zyo2Fb8ZFi8a5vNhLb-XpboNGWd2qyfMRaaGrXuFIgw,3634
|
|
12
|
+
mgnify_pipelines_toolkit/analysis/amplicon/remove_ambiguous_reads.py,sha256=8vwH6PY-XwMZhaUo08tOwdFsoREfNumvvDawTb9Y98U,3168
|
|
13
|
+
mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py,sha256=19NgCYE12bEvRBVibhZtZywwRiMdiBUBJjzL4by3_qo,1717
|
|
14
|
+
mgnify_pipelines_toolkit/analysis/amplicon/standard_primer_matching.py,sha256=RDPsaWKf0wIDwvCHXyRCh2zSJf3y9E7uOhHjaAeX8bY,11099
|
|
15
|
+
mgnify_pipelines_toolkit/analysis/shared/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
16
|
+
mgnify_pipelines_toolkit/analysis/shared/fastq_suffix_header_check.py,sha256=t0nleHm45KXsoPTuNOJESfL2DxBWH6bGZJnRxXuCKB4,4061
|
|
17
|
+
mgnify_pipelines_toolkit/analysis/shared/get_subunits.py,sha256=xT-66JaeVA-jVh6l7PkvKSmMjnU6RsHz7sTiCehG_wk,4771
|
|
18
|
+
mgnify_pipelines_toolkit/analysis/shared/get_subunits_coords.py,sha256=DTX7S1P_BkGPEeDkbmUn1YoB247hpdNIe5rdFdRYDdA,1929
|
|
19
|
+
mgnify_pipelines_toolkit/analysis/shared/library_strategy_check.py,sha256=XV1vjkjIHhzouM1k5hu_51XK_mgC_EOOGDN3mx4LOvc,1991
|
|
20
|
+
mgnify_pipelines_toolkit/analysis/shared/mapseq2biom.py,sha256=exzWyuK0YxDiVSu4WX2H7g-uT5Y00w_EmrFqSHjRObU,5554
|
|
21
|
+
mgnify_pipelines_toolkit/constants/regex_ambiguous_bases.py,sha256=dCP3u_Qo-JMk3aqVapkqEbVUGE06jBQmUH6bB3bT8k0,1088
|
|
22
|
+
mgnify_pipelines_toolkit/constants/regex_fasta_header.py,sha256=_2UTWfHKJyyFkIRQIPM2wDf-QkRTdLJ4xsA6gAkY9f4,1188
|
|
23
|
+
mgnify_pipelines_toolkit/constants/tax_ranks.py,sha256=VaHL4mbof_9Gl7Ca3b2UkqjRqjAAvBYqprfbchae480,942
|
|
24
|
+
mgnify_pipelines_toolkit/constants/thresholds.py,sha256=zz8paGQfZAU8tT-RbSGpzZ1Aopf77yEs97BAblHH5fk,964
|
|
25
|
+
mgnify_pipelines_toolkit/constants/var_region_coordinates.py,sha256=jbOB_bTnW2TRjmdF7IS1A7nNOLt-lGnGyVXUHu0TmvQ,1307
|
|
26
|
+
mgnify_pipelines_toolkit/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
27
|
+
mgnify_pipelines_toolkit/utils/fasta_to_delimited.py,sha256=GbNT7clHso21w_1PbPpWKVRd5bNs_MDbGXt8XVIGl2o,3991
|
|
28
|
+
mgnify_pipelines_toolkit-0.1.5.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
29
|
+
mgnify_pipelines_toolkit-0.1.5.dist-info/METADATA,sha256=qTVmZHGu2BCpES3sd5OwcuO63oAWcTk_X8z3ZtSogX8,5859
|
|
30
|
+
mgnify_pipelines_toolkit-0.1.5.dist-info/WHEEL,sha256=HiCZjzuy6Dw0hdX5R3LCFPDmFS4BWl8H-8W39XfmgX4,91
|
|
31
|
+
mgnify_pipelines_toolkit-0.1.5.dist-info/entry_points.txt,sha256=wzYtVvCSM5JfOaOh1KsDnU70o3VEj9cR83TV3qqDzeE,1576
|
|
32
|
+
mgnify_pipelines_toolkit-0.1.5.dist-info/top_level.txt,sha256=xA_wC7C01V3VwuDnqwRM2QYeJJ45WtvF6LVav4tYxuE,25
|
|
33
|
+
mgnify_pipelines_toolkit-0.1.5.dist-info/RECORD,,
|
|
@@ -3,9 +3,12 @@ are_there_primers = mgnify_pipelines_toolkit.analysis.amplicon.are_there_primers
|
|
|
3
3
|
assess_inflection_point_mcp = mgnify_pipelines_toolkit.analysis.amplicon.assess_inflection_point_mcp:main
|
|
4
4
|
assess_mcp_proportions = mgnify_pipelines_toolkit.analysis.amplicon.assess_mcp_proportions:main
|
|
5
5
|
classify_var_regions = mgnify_pipelines_toolkit.analysis.amplicon.classify_var_regions:main
|
|
6
|
+
fasta_to_delimited = mgnify_pipelines_toolkit.utils.fasta_to_delimited:main
|
|
7
|
+
fastq_suffix_header_check = mgnify_pipelines_toolkit.analysis.shared.fastq_suffix_header_check:main
|
|
6
8
|
find_mcp_inflection_points = mgnify_pipelines_toolkit.analysis.amplicon.find_mcp_inflection_points:main
|
|
7
9
|
get_subunits = mgnify_pipelines_toolkit.analysis.shared.get_subunits:main
|
|
8
10
|
get_subunits_coords = mgnify_pipelines_toolkit.analysis.shared.get_subunits_coords:main
|
|
11
|
+
library_strategy_check = mgnify_pipelines_toolkit.analysis.shared.library_strategy_check:main
|
|
9
12
|
make_asv_count_table = mgnify_pipelines_toolkit.analysis.amplicon.make_asv_count_table:main
|
|
10
13
|
mapseq2biom = mgnify_pipelines_toolkit.analysis.shared.mapseq2biom:main
|
|
11
14
|
mapseq_to_asv_table = mgnify_pipelines_toolkit.analysis.amplicon.mapseq_to_asv_table:main
|
|
@@ -1,28 +0,0 @@
|
|
|
1
|
-
mgnify_pipelines_toolkit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
-
mgnify_pipelines_toolkit/analysis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
|
-
mgnify_pipelines_toolkit/analysis/amplicon/amplicon_utils.py,sha256=EvWTFV4gGn0SkrjwC2hzvNGSXFLeyFDmVj2QDa5DmtE,6402
|
|
4
|
-
mgnify_pipelines_toolkit/analysis/amplicon/are_there_primers.py,sha256=fNjzWpXjqpQNRyWG6CoROyHAvA4lZsvPh8sxDpjyMkY,5141
|
|
5
|
-
mgnify_pipelines_toolkit/analysis/amplicon/assess_inflection_point_mcp.py,sha256=pBNpqHFb0zzWgTD1mY3Q5MslQ5nmT99-pSHpyngVEuo,7159
|
|
6
|
-
mgnify_pipelines_toolkit/analysis/amplicon/assess_mcp_proportions.py,sha256=BCzLlfvRqiPC-YwzET901f_d0anYt1zpf5y0iOCQnvs,5191
|
|
7
|
-
mgnify_pipelines_toolkit/analysis/amplicon/classify_var_regions.py,sha256=89LKH4rvqRydAEGvfaWqIClcitJ1Vbu7b5d4FApzGp4,18392
|
|
8
|
-
mgnify_pipelines_toolkit/analysis/amplicon/find_mcp_inflection_points.py,sha256=5emeZjk8891IgyL5ygVcr8wMP-hGEJoEs2rcBbseWj0,3536
|
|
9
|
-
mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py,sha256=dZIygTbMZvVbSVBmFFAZz7x24oQEpvdEOTpTcnYAyoM,8444
|
|
10
|
-
mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py,sha256=lWVIhDxfsTuDzWjjUlMGx3RL7iD_Yy8m9Ppc9wjfCFg,4765
|
|
11
|
-
mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py,sha256=F4ALSuH8N-0hHUqPCFwHgoAnteb2Ft3tUN9j6DaD5h8,3539
|
|
12
|
-
mgnify_pipelines_toolkit/analysis/amplicon/remove_ambiguous_reads.py,sha256=yhDJFGD3t3TMfUlBCJGwzlK4IjFwm7Bporwp-aIM8uU,3139
|
|
13
|
-
mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py,sha256=VoSbou3BAZL4bJes4FsYJvmd45_PjKj8F2sQDIyLDoI,1680
|
|
14
|
-
mgnify_pipelines_toolkit/analysis/amplicon/standard_primer_matching.py,sha256=odGVde7Ct9dS2aqsySWgdgVLCOqfr_ZGeHFcXcuukxs,10846
|
|
15
|
-
mgnify_pipelines_toolkit/analysis/shared/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
16
|
-
mgnify_pipelines_toolkit/analysis/shared/get_subunits.py,sha256=vvhn8O9t1zzD8rIvQ5bDLLgdzogBGKqgKXuMybnHEXA,4551
|
|
17
|
-
mgnify_pipelines_toolkit/analysis/shared/get_subunits_coords.py,sha256=hFXUTZb-etmJS7Si3mCCVCXV5ZYN0tP6FSbeiVxG1jo,1879
|
|
18
|
-
mgnify_pipelines_toolkit/analysis/shared/mapseq2biom.py,sha256=glvql2Y-BTyA1hTIZz2nlmST3SE6LJbep9sKdMH-vaI,5565
|
|
19
|
-
mgnify_pipelines_toolkit/constants/regex_ambiguous_bases.py,sha256=oVeeCy33bY1WJ-rffOULZ3ogi48Jz0FfTS73MPTur-A,1095
|
|
20
|
-
mgnify_pipelines_toolkit/constants/tax_ranks.py,sha256=fP97JvlUdxJBakJ694VresIY8-N3pcU99m7kZ9buKys,867
|
|
21
|
-
mgnify_pipelines_toolkit/constants/thresholds.py,sha256=7J3caCikkEcLdKF4zSR0z8qMQw4-h9aSkSbFbS0LNg4,873
|
|
22
|
-
mgnify_pipelines_toolkit/constants/var_region_coordinates.py,sha256=E8Cd3A1Hj9M95zw9Ut-2x8sE6_PlH6RJJEoikyZUMaQ,1303
|
|
23
|
-
mgnify_pipelines_toolkit-0.1.3.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
24
|
-
mgnify_pipelines_toolkit-0.1.3.dist-info/METADATA,sha256=D8bYOR2kQZzJPdqtFkHj_Xd4axEHjzJPJXKAHtFj8L0,4950
|
|
25
|
-
mgnify_pipelines_toolkit-0.1.3.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
|
26
|
-
mgnify_pipelines_toolkit-0.1.3.dist-info/entry_points.txt,sha256=K8nqnyAQG9jqHGgIfMIaCIe20u5a0FFCCqJWi4DoD2U,1306
|
|
27
|
-
mgnify_pipelines_toolkit-0.1.3.dist-info/top_level.txt,sha256=xA_wC7C01V3VwuDnqwRM2QYeJJ45WtvF6LVav4tYxuE,25
|
|
28
|
-
mgnify_pipelines_toolkit-0.1.3.dist-info/RECORD,,
|
{mgnify_pipelines_toolkit-0.1.3.dist-info → mgnify_pipelines_toolkit-0.1.5.dist-info}/LICENSE
RENAMED
|
File without changes
|
{mgnify_pipelines_toolkit-0.1.3.dist-info → mgnify_pipelines_toolkit-0.1.5.dist-info}/top_level.txt
RENAMED
|
File without changes
|