mgnify-pipelines-toolkit 0.1.9__tar.gz → 0.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mgnify_pipelines_toolkit-0.1.9 → mgnify_pipelines_toolkit-0.2.1}/PKG-INFO +2 -1
- {mgnify_pipelines_toolkit-0.1.9 → mgnify_pipelines_toolkit-0.2.1}/mgnify_pipelines_toolkit/analysis/amplicon/amplicon_utils.py +30 -37
- {mgnify_pipelines_toolkit-0.1.9 → mgnify_pipelines_toolkit-0.2.1}/mgnify_pipelines_toolkit/analysis/amplicon/are_there_primers.py +3 -3
- {mgnify_pipelines_toolkit-0.1.9 → mgnify_pipelines_toolkit-0.2.1}/mgnify_pipelines_toolkit/analysis/amplicon/assess_mcp_proportions.py +3 -1
- {mgnify_pipelines_toolkit-0.1.9 → mgnify_pipelines_toolkit-0.2.1}/mgnify_pipelines_toolkit/analysis/amplicon/standard_primer_matching.py +1 -1
- {mgnify_pipelines_toolkit-0.1.9 → mgnify_pipelines_toolkit-0.2.1}/mgnify_pipelines_toolkit/analysis/assembly/add_rhea_chebi_annotation.py +41 -38
- mgnify_pipelines_toolkit-0.2.1/mgnify_pipelines_toolkit/analysis/assembly/cgc_merge.py +424 -0
- mgnify_pipelines_toolkit-0.2.1/mgnify_pipelines_toolkit/analysis/assembly/generate_gaf.py +117 -0
- mgnify_pipelines_toolkit-0.2.1/mgnify_pipelines_toolkit/analysis/assembly/go_utils.py +135 -0
- mgnify_pipelines_toolkit-0.2.1/mgnify_pipelines_toolkit/analysis/assembly/summarise_goslims.py +181 -0
- {mgnify_pipelines_toolkit-0.1.9 → mgnify_pipelines_toolkit-0.2.1}/mgnify_pipelines_toolkit.egg-info/PKG-INFO +2 -1
- {mgnify_pipelines_toolkit-0.1.9 → mgnify_pipelines_toolkit-0.2.1}/mgnify_pipelines_toolkit.egg-info/SOURCES.txt +4 -0
- {mgnify_pipelines_toolkit-0.1.9 → mgnify_pipelines_toolkit-0.2.1}/mgnify_pipelines_toolkit.egg-info/entry_points.txt +3 -0
- {mgnify_pipelines_toolkit-0.1.9 → mgnify_pipelines_toolkit-0.2.1}/mgnify_pipelines_toolkit.egg-info/requires.txt +1 -0
- {mgnify_pipelines_toolkit-0.1.9 → mgnify_pipelines_toolkit-0.2.1}/pyproject.toml +6 -2
- {mgnify_pipelines_toolkit-0.1.9 → mgnify_pipelines_toolkit-0.2.1}/LICENSE +0 -0
- {mgnify_pipelines_toolkit-0.1.9 → mgnify_pipelines_toolkit-0.2.1}/README.md +0 -0
- {mgnify_pipelines_toolkit-0.1.9 → mgnify_pipelines_toolkit-0.2.1}/mgnify_pipelines_toolkit/__init__.py +0 -0
- {mgnify_pipelines_toolkit-0.1.9 → mgnify_pipelines_toolkit-0.2.1}/mgnify_pipelines_toolkit/analysis/__init__.py +0 -0
- {mgnify_pipelines_toolkit-0.1.9 → mgnify_pipelines_toolkit-0.2.1}/mgnify_pipelines_toolkit/analysis/amplicon/assess_inflection_point_mcp.py +0 -0
- {mgnify_pipelines_toolkit-0.1.9 → mgnify_pipelines_toolkit-0.2.1}/mgnify_pipelines_toolkit/analysis/amplicon/classify_var_regions.py +0 -0
- {mgnify_pipelines_toolkit-0.1.9 → mgnify_pipelines_toolkit-0.2.1}/mgnify_pipelines_toolkit/analysis/amplicon/find_mcp_inflection_points.py +0 -0
- {mgnify_pipelines_toolkit-0.1.9 → mgnify_pipelines_toolkit-0.2.1}/mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py +0 -0
- {mgnify_pipelines_toolkit-0.1.9 → mgnify_pipelines_toolkit-0.2.1}/mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py +0 -0
- {mgnify_pipelines_toolkit-0.1.9 → mgnify_pipelines_toolkit-0.2.1}/mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py +0 -0
- {mgnify_pipelines_toolkit-0.1.9 → mgnify_pipelines_toolkit-0.2.1}/mgnify_pipelines_toolkit/analysis/amplicon/remove_ambiguous_reads.py +0 -0
- {mgnify_pipelines_toolkit-0.1.9 → mgnify_pipelines_toolkit-0.2.1}/mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py +0 -0
- {mgnify_pipelines_toolkit-0.1.9 → mgnify_pipelines_toolkit-0.2.1}/mgnify_pipelines_toolkit/analysis/assembly/antismash_gff_builder.py +0 -0
- {mgnify_pipelines_toolkit-0.1.9 → mgnify_pipelines_toolkit-0.2.1}/mgnify_pipelines_toolkit/analysis/shared/__init__.py +0 -0
- {mgnify_pipelines_toolkit-0.1.9 → mgnify_pipelines_toolkit-0.2.1}/mgnify_pipelines_toolkit/analysis/shared/fastq_suffix_header_check.py +0 -0
- {mgnify_pipelines_toolkit-0.1.9 → mgnify_pipelines_toolkit-0.2.1}/mgnify_pipelines_toolkit/analysis/shared/get_subunits.py +0 -0
- {mgnify_pipelines_toolkit-0.1.9 → mgnify_pipelines_toolkit-0.2.1}/mgnify_pipelines_toolkit/analysis/shared/get_subunits_coords.py +0 -0
- {mgnify_pipelines_toolkit-0.1.9 → mgnify_pipelines_toolkit-0.2.1}/mgnify_pipelines_toolkit/analysis/shared/library_strategy_check.py +0 -0
- {mgnify_pipelines_toolkit-0.1.9 → mgnify_pipelines_toolkit-0.2.1}/mgnify_pipelines_toolkit/analysis/shared/mapseq2biom.py +0 -0
- {mgnify_pipelines_toolkit-0.1.9 → mgnify_pipelines_toolkit-0.2.1}/mgnify_pipelines_toolkit/analysis/shared/study_summary_generator.py +0 -0
- {mgnify_pipelines_toolkit-0.1.9 → mgnify_pipelines_toolkit-0.2.1}/mgnify_pipelines_toolkit/constants/db_labels.py +0 -0
- {mgnify_pipelines_toolkit-0.1.9 → mgnify_pipelines_toolkit-0.2.1}/mgnify_pipelines_toolkit/constants/regex_ambiguous_bases.py +0 -0
- {mgnify_pipelines_toolkit-0.1.9 → mgnify_pipelines_toolkit-0.2.1}/mgnify_pipelines_toolkit/constants/regex_fasta_header.py +0 -0
- {mgnify_pipelines_toolkit-0.1.9 → mgnify_pipelines_toolkit-0.2.1}/mgnify_pipelines_toolkit/constants/tax_ranks.py +0 -0
- {mgnify_pipelines_toolkit-0.1.9 → mgnify_pipelines_toolkit-0.2.1}/mgnify_pipelines_toolkit/constants/thresholds.py +0 -0
- {mgnify_pipelines_toolkit-0.1.9 → mgnify_pipelines_toolkit-0.2.1}/mgnify_pipelines_toolkit/constants/var_region_coordinates.py +0 -0
- {mgnify_pipelines_toolkit-0.1.9 → mgnify_pipelines_toolkit-0.2.1}/mgnify_pipelines_toolkit/schemas/schemas.py +0 -0
- {mgnify_pipelines_toolkit-0.1.9 → mgnify_pipelines_toolkit-0.2.1}/mgnify_pipelines_toolkit/utils/__init__.py +0 -0
- {mgnify_pipelines_toolkit-0.1.9 → mgnify_pipelines_toolkit-0.2.1}/mgnify_pipelines_toolkit/utils/fasta_to_delimited.py +0 -0
- {mgnify_pipelines_toolkit-0.1.9 → mgnify_pipelines_toolkit-0.2.1}/mgnify_pipelines_toolkit/utils/get_mpt_version.py +0 -0
- {mgnify_pipelines_toolkit-0.1.9 → mgnify_pipelines_toolkit-0.2.1}/mgnify_pipelines_toolkit.egg-info/dependency_links.txt +0 -0
- {mgnify_pipelines_toolkit-0.1.9 → mgnify_pipelines_toolkit-0.2.1}/mgnify_pipelines_toolkit.egg-info/top_level.txt +0 -0
- {mgnify_pipelines_toolkit-0.1.9 → mgnify_pipelines_toolkit-0.2.1}/setup.cfg +0 -0
PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: mgnify_pipelines_toolkit
-Version: 0.1.9
+Version: 0.2.1
 Summary: Collection of scripts and tools for MGnify pipelines
 Author-email: MGnify team <metagenomics-help@ebi.ac.uk>
 License: Apache Software License 2.0
@@ -18,6 +18,7 @@ Requires-Dist: regex==2023.12.25
 Requires-Dist: requests==2.32.3
 Requires-Dist: click==8.1.7
 Requires-Dist: pandera==0.22.1
+Requires-Dist: pyfastx>=2.2.0
 Provides-Extra: tests
 Requires-Dist: pytest==7.4.0; extra == "tests"
 Requires-Dist: pytest-md==0.2.0; extra == "tests"
mgnify_pipelines_toolkit/analysis/amplicon/amplicon_utils.py

@@ -18,7 +18,7 @@ from collections import defaultdict, Counter
 import logging
 import gzip
 import os
-import
+import pyfastx
 
 from mgnify_pipelines_toolkit.constants.regex_ambiguous_bases import (
     _AMBIGUOUS_BASES_DICT,
@@ -29,7 +29,6 @@ logging.basicConfig(level=logging.DEBUG)
 
 
 def split_dir_into_sample_paths(dir):
-
     file_list = os.listdir(dir)
     file_list = [
         file
@@ -43,42 +42,33 @@ def split_dir_into_sample_paths(dir):
     return sample_list
 
 
-def get_read_count(read_path,
-    elif
-    read_count = stdout.strip() if stdout is not None else ""
-    if not read_count.isdigit():
-        logging.error(
-            f"Read count is not a digit, something is wrong. stdout: '{stdout}', stderr: '{stderr}'"
+def get_read_count(read_path: str, file_type: str = "fastq") -> int:
+    """
+    Get the read count of a FASTQ or FASTA file.
+
+    :param read_path: The path to the FASTQ or FASTA file.
+    :type read_path: str
+    :param fasta_type: The type of the file, either "fastq" or "fasta". Defaults to "fastq".
+    :type fasta_type: str
+    :return: The number of reads in the file.
+    :rtype: int
+    :raises ValueError: If the file type is not supported or the read count is not a positive integer.
+    """
+    read_count = 0
+
+    if file_type == "fasta":
+        fasta = pyfastx.Fasta(read_path, build_index=False)
+        read_count = sum(1 for _ in fasta)
+    elif file_type == "fastq":
+        fastq = pyfastx.Fastq(read_path, build_index=False)
+        read_count = sum(1 for _ in fastq)
+    else:
+        raise ValueError(
+            f"Invalid file_type {file_type}, it needs to be either 'fasta' or 'fastq'"
         )
-        exit(1)
 
-    read_count
-
-    if type == "fastq":
-        read_count /= 4
+    if read_count <= 0:
+        raise ValueError(f"Read count is not a positive integer: {read_count}")
 
     return read_count
 
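As an aside, a minimal sketch of how the reworked counter is meant to be called (the import path matches the one used by the amplicon scripts further down; the file names are made up):

from mgnify_pipelines_toolkit.analysis.amplicon.amplicon_utils import get_read_count

# pyfastx parses the records directly, so plain and gzipped files are handled the same way
fastq_reads = get_read_count("sample_1.fastq.gz", file_type="fastq")
fasta_seqs = get_read_count("contigs.fasta", file_type="fasta")
# Any other file_type raises ValueError, as does a file with zero records.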
@@ -128,7 +118,10 @@ def build_cons_seq(
             counter += 1
 
     try:
+        if max_line_count is None:
+            max_prop = max_count / read_count
+        else:
+            max_prop = max_count / max_line_count
 
         cons_bases = []
         curr_prop = 0.0
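A small illustration of the fallback this hunk introduces: the consensus proportion is computed against the full read count when no cap is in place, and against max_line_count when one is (the values below are invented):

read_count = 250_000      # reads in the FASTQ file
max_line_count = 100_000  # cap applied upstream when the file is large, otherwise None
max_count = 60_000        # occurrences of the most common base at this position

if max_line_count is None:
    max_prop = max_count / read_count
else:
    max_prop = max_count / max_line_count  # 0.6 here, since only the capped number of reads was sampled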
mgnify_pipelines_toolkit/analysis/amplicon/are_there_primers.py

@@ -27,7 +27,6 @@ from mgnify_pipelines_toolkit.analysis.amplicon.amplicon_utils import (
 
 
 def parse_args(argv=None):
-
     parser = argparse.ArgumentParser()
 
     parser.add_argument(
@@ -63,7 +62,9 @@ def are_there_primers_in_this_sample(path, rev=False):
     False if a primer was not identified
     """
 
-    read_count = get_read_count(
+    read_count = get_read_count(
+        path, file_type="fastq"
+    )  # Get read count for fastq file
     mcp_len = 100  # Script will look at first 100 base mcps (for rev=True, it will look at first 100 from 3' to 5')
 
     mcp_count_dict = fetch_mcp(
@@ -133,7 +134,6 @@ def save_out(results, sample_id, output):
 
 
 def main(argv=None):
-
     path, sample, output = parse_args(argv)
 
     fwd_primer_flag = are_there_primers_in_this_sample(
mgnify_pipelines_toolkit/analysis/amplicon/assess_mcp_proportions.py

@@ -87,7 +87,9 @@ def find_mcp_props_for_sample(path, rev=False):
         start + mcp_len - 1
     )  # compute the final index for the mcp (inclusive). Indices are of base 1 not 0.
 
-    read_count = get_read_count(
+    read_count = get_read_count(
+        path, file_type="fastq"
+    )  # get read count for fastq file
 
     max_line_count = None
     if read_count > MCP_MAX_LINE_COUNT:
mgnify_pipelines_toolkit/analysis/amplicon/standard_primer_matching.py

@@ -143,7 +143,7 @@ def get_primer_props(std_primer_dict_regex, input_path):
 
     threshold = 0.60  # Arbitrary threshold for collecting a matched primer
     read_count = get_read_count(
-        input_path, "fastq"
+        input_path, file_type="fastq"
     )  # Get read count of fastq file to calculate proportion with
     res_dict = defaultdict(defaultdict)
 
mgnify_pipelines_toolkit/analysis/assembly/add_rhea_chebi_annotation.py

@@ -62,45 +62,13 @@ def process_lines(lines, output_handler, rhea2reaction_dict, protein_hashes):
             protein_rheas.add(rhea)
 
 
-def main(
-    logging.info(
-        f"Step 1/3: Parse protein fasta and calculating SHA256 hash from {proteins.resolve()}"
-    )
-    protein_hashes = {}
-    with open(proteins, "r") as fasta_file:
-        for record in SeqIO.parse(fasta_file, "fasta"):
-            protein_hash = hashlib.sha256(str(record.seq).encode("utf-8")).hexdigest()
-            protein_hashes[record.id] = protein_hash
-
-    logging.info(f"Step 2/3: Load reactions from provided file {rhea2chebi.resolve()}")
-    df = pd.read_csv(rhea2chebi, delimiter="\t")
-    rhea2reaction_dict = dict(zip(df["ENTRY"], zip(df["EQUATION"], df["DEFINITION"])))
-
-    logging.info(
-        f"Step 3/3: Read DIAMOND results from {'STDIN' if input == '-' else Path(input).resolve()} and write output"
-    )
-    with open(output, "w") as output_handler:
-        if input == "-":
-            process_lines(sys.stdin, output_handler, rhea2reaction_dict, protein_hashes)
-        else:
-            with open(args.input, "r") as input_file:
-                process_lines(
-                    input_file, output_handler, rhea2reaction_dict, protein_hashes
-                )
-
-    logging.info("Processed successfully. Exiting.")
-
-
-if __name__ == "__main__":
+def main():
     parser = argparse.ArgumentParser(
-        ""
-        Use diamond output file to create a table with Rhea and CHEBI
-        reaction annotation for every protein.
-        """
+        "Use diamond output file to create a table with Rhea and CHEBI reaction annotation for every protein."
     )
     parser.add_argument(
-        "-
-        "--
+        "-d",
+        "--diamond_hits",
         required=True,
         type=str,
         help="DIAMOND results file, use '-' for stdin",
@@ -121,10 +89,45 @@ if __name__ == "__main__":
     )
     parser.add_argument(
         "--rhea2chebi",
-
+        required=True,
         type=Path,
         help="File that maps rhea_ids to CHEBI",
     )
 
     args = parser.parse_args()
-
+
+    diamond_hits = args.diamond_hits
+    output = args.output
+    proteins = args.proteins
+    rhea2chebi = args.rhea2chebi
+
+    logging.info(
+        f"Step 1/3: Parse protein fasta and calculating SHA256 hash from {proteins.resolve()}"
+    )
+    protein_hashes = {}
+    with open(proteins, "r") as fasta_file:
+        for record in SeqIO.parse(fasta_file, "fasta"):
+            protein_hash = hashlib.sha256(str(record.seq).encode("utf-8")).hexdigest()
+            protein_hashes[record.id] = protein_hash
+
+    logging.info(f"Step 2/3: Load reactions from provided file {rhea2chebi.resolve()}")
+    df = pd.read_csv(rhea2chebi, delimiter="\t")
+    rhea2reaction_dict = dict(zip(df["ENTRY"], zip(df["EQUATION"], df["DEFINITION"])))
+
+    logging.info(
+        f"Step 3/3: Read DIAMOND results from {'STDIN' if diamond_hits == '-' else Path(diamond_hits).resolve()} and write output"
+    )
+    with open(output, "w") as output_handler:
+        if diamond_hits == "-":
+            process_lines(sys.stdin, output_handler, rhea2reaction_dict, protein_hashes)
+        else:
+            with open(diamond_hits, "r") as input_file:
+                process_lines(
+                    input_file, output_handler, rhea2reaction_dict, protein_hashes
+                )
+
+    logging.info("Processed successfully. Exiting.")
+
+
+if __name__ == "__main__":
+    main()
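To make the data flow concrete, a small sketch of the lookup structure main() now builds from the rhea2chebi table before streaming the DIAMOND hits (the ENTRY/EQUATION/DEFINITION column names come from the hunk above; the example row is invented):

import pandas as pd

df = pd.DataFrame(
    {
        "ENTRY": ["RHEA:12345"],
        "EQUATION": ["A + B = C"],
        "DEFINITION": ["compound A + compound B = compound C"],
    }
)
rhea2reaction_dict = dict(zip(df["ENTRY"], zip(df["EQUATION"], df["DEFINITION"])))
# {'RHEA:12345': ('A + B = C', 'compound A + compound B = compound C')}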
mgnify_pipelines_toolkit/analysis/assembly/cgc_merge.py (new file)

@@ -0,0 +1,424 @@
#!/usr/bin/env python3

import argparse
import json
import logging
import os
import re

from Bio import SeqIO

__version__ = "1.0.4"


class Region:
    def __init__(self, start, end):
        # if end < start:  # assuming that for +/- start always lower
        #     start, end = end, start
        self.start = int(start)
        self.end = int(end)

    def __str__(self):
        return "[" + str(self.start) + "," + str(self.end) + "]"

    def __ge__(self, other):
        return self.start >= other.end

    def __gt__(self, other):
        return self.start > other.end

    def __le__(self, other):
        return self.end <= other.start

    def __lt__(self, other):
        return self.end < other.start

    def length(self):
        return self.end - self.start + 1

    # If 'other' overlaps and has a greater end position
    def extends_right(self, other):
        if self.overlaps(other) and self.end > other.end:
            return True
        return False

    # For overlapping fragments extend start and end to match other
    def extend(self, other):
        if self.overlaps(other):
            if other.end > self.end:
                self.end = other.end
            if other.start < self.start:
                self.start = other.start

    def within(self, other):
        if self.start >= other.start and self.end <= other.end:
            return True
        return False

    # Return length of overlap between regions
    def overlaps(self, other):
        if self > other or other > self:
            return False
        # overlap = sum of the individual lengths ...
        ltot = self.length() + other.length()
        # ... minus length of the combined region (i.e. min start to max end)
        lmax = max(self.end, other.end) - min(self.start, other.start) + 1
        return ltot - lmax


# FGS has seq_id/start/end in the fasta files - use those to extract the sequences we want to keep;
# for prodigal it uses a seq_id/index_number, so need to add an extra field
class NumberedRegion(Region):
    def __init__(self, start, end, nid):
        super().__init__(start, end)
        self.nid = nid


def flatten_regions(regions):
    """Take a list of regions (possibly overlapping) and return the non-overlapping set"""
    if len(regions) < 2:
        return regions

    flattened = []
    regions = sorted(regions, key=lambda x: x.start)  # sort by start
    flattened = [regions[0]]
    regions = regions[1:]  # store the first
    for region in regions:
        if not region.overlaps(flattened[-1]):  # doesn't overlap: store new region
            flattened.append(region)
        elif region.extends_right(flattened[-1]):  # overlaps to the right: extend previous region
            flattened[-1].extend(region)
        # else end < prev end => new region within old: do nothing
    return flattened


def check_against_gaps(regions, candidates):
    """Given a set of non-overlapping gaps and a list of candidate regions, return the candidates that do not overlap"""
    regions = sorted(regions, key=lambda line: line.start)
    candidates = sorted(candidates, key=lambda line: line.start)
    selected = []
    r = 0
    if not len(regions):
        return candidates  # no existing predictions - all candidates accepted

    for c in candidates:
        if c < regions[0] or c > regions[-1]:  # outside any of the regions: just append
            selected.append(c)
        else:
            while r < len(regions) - 1 and c >= regions[r]:
                r += 1
            if c < regions[r]:  # found a gap
                selected.append(c)

    return selected


def output_prodigal(predictions, files, outputs):
    """From the combined predictions output the prodigal data"""

    sequence_set = set()
    for seq in predictions:
        for strand in ["-", "+"]:
            for region in predictions[seq][strand]:
                sequence_set.add("_".join([seq, str(region.nid)]))

    # files contains the .faa and .ffn fasta files
    for index in [1, 2]:
        sequences = []
        for record in SeqIO.parse(files[index], "fasta"):
            # remove anything after the first space
            seq_name = record.id.split(" ")[0]
            # Replace ending * #
            record.seq = record.seq.rstrip("*")
            if seq_name in sequence_set:
                sequences.append(record)

        with open(outputs[index], "a") as output_handle:
            SeqIO.write(sequences, output_handle, "fasta")


def output_fgs(predictions, files, outputs):
    """From the combined predictions output the FGS data"""
    sequence_set = set()
    for seq in predictions:
        for strand in ["-", "+"]:
            for region in predictions[seq][strand]:
                sequence_set.add("_".join([seq, str(region.start), str(region.end), strand]))

    # files contains the .faa and .ffn fasta files
    for index in [1, 2]:
        sequences = []
        for record in SeqIO.parse(files[index], "fasta"):
            # remove anything after the first space
            seq_name = record.id.split(" ")[0]
            # Replace "*" with "X"
            record.seq = record.seq.replace("*", "X")
            if seq_name in sequence_set:
                sequences.append(record)

        with open(outputs[index], "a") as output_handle:
            SeqIO.write(sequences, output_handle, "fasta")


def output_files(predictions, summary, files):
    """Output all files"""
    # To avoid that sequences get appended to the merged output files after restart,
    # make sure the files get deleted if they exist
    logging.info("Removing output files if they exist.")
    for file_ in files["merged"]:
        if os.path.exists(file_):
            logging.info(f"Removing {file_}")
            os.remove(file_)

    for caller in predictions:
        if caller == "fgs":
            output_fgs(predictions["fgs"], files["fgs"], files["merged"])
        if caller == "prodigal":
            output_prodigal(predictions["prodigal"], files["prodigal"], files["merged"])

    with open(files["merged"][0], "w") as sf:
        sf.write(json.dumps(summary, sort_keys=True, indent=4) + "\n")


def get_regions_fgs(fn):
    """Parse FGS output.
    Example:
    # >Bifidobacterium-longum-subsp-infantis-MC2-contig1
    # 256 2133 - 1 1.263995 I: D:
    """
    regions = {}
    with open(fn) as f:
        for line in f:
            if line[0] == ">":
                id_ = line.split()[0][1:]
                regions[id_] = {}
                regions[id_]["+"] = []
                regions[id_]["-"] = []
            else:
                r = line.split()  # start end strand
                s = int(r[0])
                e = int(r[1])
                regions[id_][r[2]].append(Region(s, e))
    return regions


"""
# noqa: E501
This is from cmsearch
ERR855786.1000054-HWI-M02024:111:000000000-A8H14:1:1115:23473:14586-1 - LSU_rRNA_bacteria RF02541 hmm 1224 1446 5 227 + - 6 0.61 0.8 135.2 2.8e-38 ! -
"""


def get_regions_mask(mask_file):
    """Parse masked region file (i.e. ncRNA)"""
    regions = {}
    with open(mask_file) as f:
        for line in f:
            if line[:1] == "#":
                continue
            r = line.rstrip().split()
            id_ = r[0]
            start = int(r[7])
            end = int(r[8])
            if id_ not in regions:
                regions[id_] = []
            if start > end:
                start, end = end, start
            regions[id_].append(Region(start, end))
    return regions


# # Sequence Data: seqnum=1;seqlen=25479;seqhdr="Bifidobacterium-longum-subsp-infantis-MC2-contig1"
# # Model Data: version=Prodigal.v2.6.3;run_type=Single;model="Ab initio";gc_cont=59.94;transl_table=11;uses_sd=1
# >1_1_279_+
def get_regions_prodigal(fn):
    """Parse prodigal output"""
    regions = {}
    with open(fn) as f:
        for line in f:
            if line[:12] == "# Model Data":
                continue
            if line[:15] == "# Sequence Data":
                m = re.search(r'seqhdr="(\S+)"', line)
                if m:
                    id_ = m.group(1)
                    regions[id_] = {}
                    regions[id_]["+"] = []
                    regions[id_]["-"] = []
            else:
                r = line[1:].rstrip().split("_")
                n = int(
                    r[0]
                )  # also store the index of the fragment - prodigal uses these (rather than coords) to identify sequences in the fasta output
                s = int(r[1])
                e = int(r[2])
                regions[id_][r[3]].append(NumberedRegion(s, e, n))
    return regions


def mask_regions(regions, mask):
    """Look for overlaps of more than 5 base pairs of the supplied regions against a set of masks
    This is probably O(N^2) but, in theory, there shouldn't be many mask regions
    """
    new_regions = {}
    for seq in regions:
        new_regions[seq] = {}
        for strand in ["-", "+"]:
            new_regions[seq][strand] = []
            for r in regions[seq][strand]:
                if seq in mask:
                    overlap = 0
                    for r2 in mask[seq]:
                        if r.overlaps(r2) > 5:
                            overlap = 1
                    if not overlap:
                        new_regions[seq][strand].append(r)
                else:
                    new_regions[seq][strand].append(r)

    return new_regions


# FIXME - This won't work if we have only a single set of predictions, but then
# there's no point in trying to merge
def merge_predictions(predictions, callers):
    """Check that we have priorities set of for all callers we have data for"""
    p = set(callers)
    new_predictions = {}
    for type_ in predictions:
        if type_ not in p:
            return None
            # throw here? - if we've used a caller that we don't have a priority for

    # first set of predictions takes priority - just transfer them
    new_predictions[callers[0]] = predictions[callers[0]]

    # for now assume only two callers, but can be extended
    new_predictions[callers[1]] = {}  # empty set for second priority caller
    for seq in predictions[callers[1]]:
        new_predictions[callers[1]][seq] = {}
        for strand in ["-", "+"]:
            new_predictions[callers[1]][seq][strand] = []
            if seq in predictions[callers[0]]:  # if this sequence already has predictions
                prev_predictions = flatten_regions(
                    predictions[callers[0]][seq][strand]
                )  # non-overlapping set of existing predictions/regions
                new_predictions[callers[1]][seq][strand] = check_against_gaps(
                    prev_predictions, predictions[callers[1]][seq][strand]
                )  # plug new predictions/regions into gaps
            else:  # no existing predictions: just add them
                new_predictions[callers[1]][seq][strand] = predictions[callers[1]][seq][strand]

    return new_predictions


def get_counts(predictions):
    total = {}
    for caller in predictions:
        total[caller] = 0
        for sample in predictions[caller]:
            for strand in ["-", "+"]:
                total[caller] += len(predictions[caller][sample][strand])
    return total


def combine_main():
    parser = argparse.ArgumentParser(
        "MGnify gene caller combiner. This script will merge the gene called by prodigal and fraggenescan (in any order)"
    )
    parser.add_argument("-n", "--name", action="store", dest="name", required=True, help="basename")
    parser.add_argument("-k", "--mask", action="store", dest="mask", required=False, help="Sequence mask file")

    parser.add_argument("-a", "--prodigal-out", action="store", dest="prodigal_out", required=False, help="Stats out prodigal")
    parser.add_argument("-b", "--prodigal-ffn", action="store", dest="prodigal_ffn", required=False, help="Stats ffn prodigal")
    parser.add_argument("-c", "--prodigal-faa", action="store", dest="prodigal_faa", required=False, help="Stats faa prodigal")

    parser.add_argument("-d", "--fgs-out", action="store", dest="fgs_out", required=False, help="Stats out FGS")
    parser.add_argument("-e", "--fgs-ffn", action="store", dest="fgs_ffn", required=False, help="Stats ffn FGS")
    parser.add_argument("-f", "--fgs-faa", action="store", dest="fgs_faa", required=False, help="Stats faa FGS")

    parser.add_argument(
        "-p",
        "--caller-priority",
        action="store",
        dest="caller_priority",
        required=False,
        choices=["prodigal_fgs", "fgs_prodigal"],
        default="prodigal_fgs",
        help="Caller priority.",
    )

    parser.add_argument("-v", "--verbose", help="verbose output", dest="verbose", action="count", required=False)

    parser.add_argument("--version", action="version", version=f"{__version__}")

    args = parser.parse_args()

    # Set up logging system
    verbose_mode = args.verbose or 0

    log_level = logging.WARNING
    if verbose_mode:
        log_level = logging.DEBUG if verbose_mode > 1 else logging.INFO

    logging.basicConfig(level=log_level, format="%(levelname)s %(asctime)s - %(message)s", datefmt="%Y/%m/%d %I:%M:%S %p")

    summary = {}
    all_predictions = {}
    files = {}
    caller_priority = []
    if args.caller_priority:
        caller_priority = args.caller_priority.split("_")
    else:
        caller_priority = ["prodigal", "fgs"]

    logging.info(f"Caller priority: 1. {caller_priority[0]}, 2. {caller_priority[1]}")

    if args.prodigal_out:
        logging.info("Prodigal presented")
        logging.info("Getting Prodigal regions...")
        all_predictions["prodigal"] = get_regions_prodigal(args.prodigal_out)

        files["prodigal"] = [args.prodigal_out, args.prodigal_ffn, args.prodigal_faa]

    if args.fgs_out:
        logging.info("FGS presented")
        logging.info("Getting FragGeneScan regions ...")
        all_predictions["fgs"] = get_regions_fgs(args.fgs_out)

        files["fgs"] = [args.fgs_out, args.fgs_ffn, args.fgs_faa]

    summary["all"] = get_counts(all_predictions)

    # Apply mask of ncRNA search
    logging.info("Masking non coding RNA regions...")
    if args.mask:
        logging.info("Reading regions for masking...")
        mask = get_regions_mask(args.mask)
        if "prodigal" in all_predictions:
            logging.info("Masking Prodigal outputs...")
            all_predictions["prodigal"] = mask_regions(all_predictions["prodigal"], mask)
        if "fgs" in all_predictions:
            logging.info("Masking FragGeneScan outputs...")
            all_predictions["fgs"] = mask_regions(all_predictions["fgs"], mask)
    summary["masked"] = get_counts(all_predictions)

    # Run the merging step
    if len(all_predictions) > 1:
        logging.info("Merging combined gene caller results...")
        merged_predictions = merge_predictions(all_predictions, caller_priority)
    else:
        logging.info("Skipping merging step...")
        merged_predictions = all_predictions
    summary["merged"] = get_counts(merged_predictions)

    # Output fasta files and summary (json)
    logging.info("Writing output files...")

    files["merged"] = [args.name + ext for ext in [".out", ".ffn", ".faa"]]

    output_files(merged_predictions, summary, files)


if __name__ == "__main__":
    combine_main()