mgnify-pipelines-toolkit 0.2.1__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mgnify-pipelines-toolkit might be problematic. Click here for more details.
- mgnify_pipelines_toolkit/analysis/amplicon/amplicon_utils.py +1 -1
- mgnify_pipelines_toolkit/analysis/amplicon/are_there_primers.py +1 -1
- mgnify_pipelines_toolkit/analysis/amplicon/assess_inflection_point_mcp.py +1 -1
- mgnify_pipelines_toolkit/analysis/amplicon/assess_mcp_proportions.py +1 -1
- mgnify_pipelines_toolkit/analysis/amplicon/classify_var_regions.py +1 -1
- mgnify_pipelines_toolkit/analysis/amplicon/find_mcp_inflection_points.py +1 -1
- mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py +1 -1
- mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py +1 -1
- mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py +1 -1
- mgnify_pipelines_toolkit/analysis/amplicon/remove_ambiguous_reads.py +1 -1
- mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py +1 -1
- mgnify_pipelines_toolkit/analysis/amplicon/standard_primer_matching.py +1 -1
- mgnify_pipelines_toolkit/analysis/assembly/add_rhea_chebi_annotation.py +1 -1
- mgnify_pipelines_toolkit/analysis/assembly/antismash_gff_builder.py +1 -1
- mgnify_pipelines_toolkit/analysis/assembly/combined_gene_caller_merge.py +511 -0
- mgnify_pipelines_toolkit/analysis/assembly/generate_gaf.py +1 -1
- mgnify_pipelines_toolkit/analysis/assembly/gff_annotation_utils.py +829 -0
- mgnify_pipelines_toolkit/analysis/assembly/gff_file_utils.py +82 -0
- mgnify_pipelines_toolkit/analysis/assembly/gff_toolkit.py +170 -0
- mgnify_pipelines_toolkit/analysis/assembly/go_utils.py +1 -1
- mgnify_pipelines_toolkit/analysis/assembly/summarise_goslims.py +1 -1
- mgnify_pipelines_toolkit/analysis/shared/dwc_summary_generator.py +240 -0
- mgnify_pipelines_toolkit/analysis/shared/fastq_suffix_header_check.py +1 -1
- mgnify_pipelines_toolkit/analysis/shared/get_subunits.py +1 -1
- mgnify_pipelines_toolkit/analysis/shared/get_subunits_coords.py +1 -1
- mgnify_pipelines_toolkit/analysis/shared/library_strategy_check.py +1 -1
- mgnify_pipelines_toolkit/analysis/shared/mapseq2biom.py +1 -1
- mgnify_pipelines_toolkit/analysis/shared/markergene_study_summary.py +243 -0
- mgnify_pipelines_toolkit/analysis/shared/study_summary_generator.py +1 -1
- mgnify_pipelines_toolkit/constants/db_labels.py +1 -1
- mgnify_pipelines_toolkit/constants/regex_ambiguous_bases.py +1 -1
- mgnify_pipelines_toolkit/constants/regex_fasta_header.py +1 -1
- mgnify_pipelines_toolkit/constants/tax_ranks.py +1 -1
- mgnify_pipelines_toolkit/constants/thresholds.py +8 -1
- mgnify_pipelines_toolkit/constants/var_region_coordinates.py +1 -1
- mgnify_pipelines_toolkit/schemas/schemas.py +1 -1
- mgnify_pipelines_toolkit/utils/fasta_to_delimited.py +1 -1
- mgnify_pipelines_toolkit/utils/get_mpt_version.py +1 -1
- {mgnify_pipelines_toolkit-0.2.1.dist-info → mgnify_pipelines_toolkit-1.0.0.dist-info}/METADATA +3 -1
- mgnify_pipelines_toolkit-1.0.0.dist-info/RECORD +48 -0
- {mgnify_pipelines_toolkit-0.2.1.dist-info → mgnify_pipelines_toolkit-1.0.0.dist-info}/WHEEL +1 -1
- {mgnify_pipelines_toolkit-0.2.1.dist-info → mgnify_pipelines_toolkit-1.0.0.dist-info}/entry_points.txt +4 -2
- mgnify_pipelines_toolkit/analysis/assembly/cgc_merge.py +0 -424
- mgnify_pipelines_toolkit-0.2.1.dist-info/RECORD +0 -43
- {mgnify_pipelines_toolkit-0.2.1.dist-info → mgnify_pipelines_toolkit-1.0.0.dist-info}/LICENSE +0 -0
- {mgnify_pipelines_toolkit-0.2.1.dist-info → mgnify_pipelines_toolkit-1.0.0.dist-info}/top_level.txt +0 -0
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
#!/usr/bin/env python
|
|
2
2
|
# -*- coding: utf-8 -*-
|
|
3
3
|
|
|
4
|
-
# Copyright 2024 EMBL - European Bioinformatics Institute
|
|
4
|
+
# Copyright 2024-2025 EMBL - European Bioinformatics Institute
|
|
5
5
|
#
|
|
6
6
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
7
7
|
# you may not use this file except in compliance with the License.
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
#!/usr/bin/env python
|
|
2
2
|
# -*- coding: utf-8 -*-
|
|
3
3
|
|
|
4
|
-
# Copyright 2024 EMBL - European Bioinformatics Institute
|
|
4
|
+
# Copyright 2024-2025 EMBL - European Bioinformatics Institute
|
|
5
5
|
#
|
|
6
6
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
7
7
|
# you may not use this file except in compliance with the License.
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
#!/usr/bin/env python
|
|
2
2
|
# -*- coding: utf-8 -*-
|
|
3
3
|
|
|
4
|
-
# Copyright 2024 EMBL - European Bioinformatics Institute
|
|
4
|
+
# Copyright 2024-2025 EMBL - European Bioinformatics Institute
|
|
5
5
|
#
|
|
6
6
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
7
7
|
# you may not use this file except in compliance with the License.
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
#!/usr/bin/env python
|
|
2
2
|
# -*- coding: utf-8 -*-
|
|
3
3
|
|
|
4
|
-
# Copyright 2024 EMBL - European Bioinformatics Institute
|
|
4
|
+
# Copyright 2024-2025 EMBL - European Bioinformatics Institute
|
|
5
5
|
#
|
|
6
6
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
7
7
|
# you may not use this file except in compliance with the License.
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
#!/usr/bin/env python
|
|
2
2
|
# -*- coding: utf-8 -*-
|
|
3
3
|
|
|
4
|
-
# Copyright 2024 EMBL - European Bioinformatics Institute
|
|
4
|
+
# Copyright 2024-2025 EMBL - European Bioinformatics Institute
|
|
5
5
|
#
|
|
6
6
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
7
7
|
# you may not use this file except in compliance with the License.
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
#!/usr/bin/env python
|
|
2
2
|
# -*- coding: utf-8 -*-
|
|
3
3
|
|
|
4
|
-
# Copyright 2024 EMBL - European Bioinformatics Institute
|
|
4
|
+
# Copyright 2024-2025 EMBL - European Bioinformatics Institute
|
|
5
5
|
#
|
|
6
6
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
7
7
|
# you may not use this file except in compliance with the License.
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
#!/usr/bin/env python
|
|
2
2
|
# -*- coding: utf-8 -*-
|
|
3
3
|
|
|
4
|
-
# Copyright 2024 EMBL - European Bioinformatics Institute
|
|
4
|
+
# Copyright 2024-2025 EMBL - European Bioinformatics Institute
|
|
5
5
|
#
|
|
6
6
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
7
7
|
# you may not use this file except in compliance with the License.
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
#!/usr/bin/env python
|
|
2
2
|
# -*- coding: utf-8 -*-
|
|
3
3
|
|
|
4
|
-
# Copyright 2024 EMBL - European Bioinformatics Institute
|
|
4
|
+
# Copyright 2024-2025 EMBL - European Bioinformatics Institute
|
|
5
5
|
#
|
|
6
6
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
7
7
|
# you may not use this file except in compliance with the License.
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
#!/usr/bin/env python
|
|
2
2
|
# -*- coding: utf-8 -*-
|
|
3
3
|
|
|
4
|
-
# Copyright 2024 EMBL - European Bioinformatics Institute
|
|
4
|
+
# Copyright 2024-2025 EMBL - European Bioinformatics Institute
|
|
5
5
|
#
|
|
6
6
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
7
7
|
# you may not use this file except in compliance with the License.
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
#!/usr/bin/env python
|
|
2
2
|
# -*- coding: utf-8 -*-
|
|
3
3
|
|
|
4
|
-
# Copyright 2024 EMBL - European Bioinformatics Institute
|
|
4
|
+
# Copyright 2024-2025 EMBL - European Bioinformatics Institute
|
|
5
5
|
#
|
|
6
6
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
7
7
|
# you may not use this file except in compliance with the License.
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
#!/usr/bin/env python
|
|
2
2
|
# -*- coding: utf-8 -*-
|
|
3
3
|
|
|
4
|
-
# Copyright 2024 EMBL - European Bioinformatics Institute
|
|
4
|
+
# Copyright 2024-2025 EMBL - European Bioinformatics Institute
|
|
5
5
|
#
|
|
6
6
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
7
7
|
# you may not use this file except in compliance with the License.
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
#!/usr/bin/env python
|
|
2
2
|
# -*- coding: utf-8 -*-
|
|
3
3
|
|
|
4
|
-
# Copyright 2024 EMBL - European Bioinformatics Institute
|
|
4
|
+
# Copyright 2024-2025 EMBL - European Bioinformatics Institute
|
|
5
5
|
#
|
|
6
6
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
7
7
|
# you may not use this file except in compliance with the License.
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
#!/usr/bin/env python
|
|
2
2
|
# -*- coding: utf-8 -*-
|
|
3
3
|
|
|
4
|
-
# Copyright 2024 EMBL - European Bioinformatics Institute
|
|
4
|
+
# Copyright 2024-2025 EMBL - European Bioinformatics Institute
|
|
5
5
|
#
|
|
6
6
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
7
7
|
# you may not use this file except in compliance with the License.
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
#!/usr/bin/env python
|
|
2
2
|
# -*- coding: utf-8 -*-
|
|
3
3
|
|
|
4
|
-
# Copyright 2024 EMBL - European Bioinformatics Institute
|
|
4
|
+
# Copyright 2024-2025 EMBL - European Bioinformatics Institute
|
|
5
5
|
#
|
|
6
6
|
# Licensed under the Apache License, Version 2.0 (the 'License');
|
|
7
7
|
# you may not use this file except in compliance with the License.
|
|
@@ -0,0 +1,511 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
|
|
4
|
+
# Copyright 2024-2025 EMBL - European Bioinformatics Institute
|
|
5
|
+
#
|
|
6
|
+
# Licensed under the Apache License, Version 2.0 (the 'License');
|
|
7
|
+
# you may not use this file except in compliance with the License.
|
|
8
|
+
# You may obtain a copy of the License at
|
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
10
|
+
#
|
|
11
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
12
|
+
# distributed under the License is distributed on an 'AS IS' BASIS,
|
|
13
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
14
|
+
# See the License for the specific language governing permissions and
|
|
15
|
+
# limitations under the License.
|
|
16
|
+
|
|
17
|
+
import argparse
|
|
18
|
+
import json
|
|
19
|
+
import logging
|
|
20
|
+
from collections import defaultdict
|
|
21
|
+
import csv
|
|
22
|
+
import re
|
|
23
|
+
|
|
24
|
+
from intervaltree import Interval, IntervalTree
|
|
25
|
+
from Bio import SeqIO
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
MASK_OVERLAP_THRESHOLD = 5
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def parse_gff(gff_file):
    """
    Parse a GFF file and extract CDS features as Interval objects.

    Args:
        gff_file (str): Path to the GFF file.

    Returns:
        dict: A nested dictionary with sequence IDs as keys, and within each,
        strand (+/-) as keys, containing a list of Intervals for CDS regions.
        Each Interval object apart from the start and end positions of the CDS region
        also stores the protein ID.

    Raises:
        ValueError: If no CDS predictions could be read from the file.
        KeyError: If a CDS line has no ID attribute.
    """
    predictions = defaultdict(lambda: defaultdict(list))
    with open(gff_file, "r") as gff_in:
        for line in gff_in:
            if line.startswith("#"):
                continue
            fields = line.strip().split("\t")
            # Skip blank lines and any non-GFF content (for example the
            # sequence lines of a trailing ##FASTA section) instead of
            # crashing on the 9-way unpack below.
            if len(fields) != 9:
                continue
            seq_id, _, feature_type, start, end, _, strand, _, attributes = fields
            if feature_type == "CDS":
                # Parse attributes to get the ID value. Split on the first
                # "=" only so attribute values that themselves contain "="
                # do not break the dict construction.
                attr_dict = dict(
                    attr.split("=", 1) for attr in attributes.split(";") if "=" in attr
                )
                protein_id = attr_dict["ID"]
                predictions[seq_id][strand].append(
                    Interval(int(start), int(end), data={"protein_id": protein_id})
                )
    if not predictions:
        raise ValueError("Zero gene predictions was read from the GFF file")
    return predictions
+
|
|
64
|
+
|
|
65
|
+
def parse_pyrodigal_output(file):
    """
    Parse Pyrodigal *.out file to extract gene predictions as Interval objects.

    Example of *.out file:
    # Sequence Data: seqnum=1;seqlen=25479;seqhdr="Bifidobacterium-longum-subsp-infantis-MC2-contig1"
    # Model Data: version=Pyrodigal.v2.6.3;run_type=Single;model="Ab initio";gc_cont=59.94;transl_table=11;uses_sd=1
    >1_1_279_+

    Args:
        file (str): Path to the Pyrodigal *.out file.

    Returns:
        dict: A nested dictionary with sequence IDs as keys, and within each,
        strand (+/-) as keys, containing a list of Intervals for CDS regions.
        Each Interval object apart from the start and end positions of the CDS region
        also stores the protein ID.

    Raises:
        ValueError: If the seqhdr cannot be parsed, if a gene record appears
        before any sequence header, or if no predictions could be read.
    """
    predictions = defaultdict(lambda: defaultdict(list))
    seq_id = None
    with open(file) as file_in:
        for line in file_in:
            if line.startswith("# Model Data"):
                continue
            if line.startswith("# Sequence Data"):
                matches = re.search(r'seqhdr="(\S+)"', line)
                if not matches:
                    # Fail loudly here: silently keeping a stale seq_id would
                    # attribute all following genes to the wrong sequence.
                    raise ValueError(f"Could not parse seqhdr from line: {line.strip()}")
                seq_id = matches.group(1)
            else:
                # Skip blank lines instead of crashing on the unpack below
                if not line.strip():
                    continue
                if seq_id is None:
                    raise ValueError(
                        "Gene record encountered before any '# Sequence Data' header"
                    )
                fields = line[1:].strip().split("_")
                # Fragment_id is an index of the fragment
                # Pyrodigal uses these (rather than coordinates) to identify sequences in the fasta output
                fragment_id, start, end, strand = fields
                protein_id = f"{seq_id}_{fragment_id}"
                predictions[seq_id][strand].append(
                    Interval(int(start), int(end), data={"protein_id": protein_id})
                )
    if not predictions:
        raise ValueError("Zero gene predictions was read from the *.out file")
    return predictions
+
|
|
104
|
+
|
|
105
|
+
def parse_fgsrs_output(file):
    """
    Parse FragGeneScanRS *.out file to extract gene predictions as Interval objects.

    Example of *.out file:
    >Bifidobacterium-longum-subsp-infantis-MC2-contig1
    256 2133 - 1 1.263995 I: D:

    Args:
        file (str): Path to the FragGeneScanRS *.out file.

    Returns:
        dict: A nested dictionary with sequence IDs as keys, and within each,
        strand (+/-) as keys, containing a list of Intervals for CDS regions.
        Each Interval object apart from the start and end positions of the CDS region
        also stores the protein ID.

    Raises:
        ValueError: If no gene predictions could be read from the file.
    """
    predictions = defaultdict(lambda: defaultdict(list))
    with open(file) as file_in:
        for line in file_in:
            if line.startswith(">"):
                seq_id = line.split()[0][1:]
            else:
                fields = line.strip().split("\t")
                # Skip blank or truncated lines instead of crashing on the
                # unpack below (at least start, end and strand are required).
                if len(fields) < 3:
                    continue
                start, end, strand, *_ = fields
                protein_id = f"{seq_id}_{start}_{end}_{strand}"
                predictions[seq_id][strand].append(
                    Interval(int(start), int(end), data={"protein_id": protein_id})
                )
    if not predictions:
        raise ValueError("Zero gene predictions was read from the *.out file")
    return predictions
+
|
|
137
|
+
|
|
138
|
+
def parse_cmsearch_output(mask_file):
    """
    Read masking regions from a cmsearch output file and store them as Intervals.

    Args:
        mask_file (str): Path to the masking file (possibly BED or GFF-like format).

    Returns:
        dict: A dictionary with sequence IDs as keys, and a list of Intervals
        representing masked regions as values.

    Raises:
        ValueError: If no intervals could be read from the file.
    """
    masked_regions = defaultdict(list)
    with open(mask_file) as handle:
        for record in handle:
            if record.startswith("#"):
                continue
            columns = record.rstrip().split()
            # Columns 7 and 8 hold the hit coordinates; hits on the minus
            # strand are reported with start > end, so normalise the pair
            # into ascending order.
            first, second = int(columns[7]), int(columns[8])
            masked_regions[columns[0]].append(
                Interval(min(first, second), max(first, second))
            )
    if not masked_regions:
        raise ValueError("Zero intervals was read from the input masking file")
    return masked_regions
+
|
|
164
|
+
|
|
165
|
+
def mask_regions(predictions, mask):
    """
    Drop predicted genes that overlap masked (non-coding RNA) regions by more
    than MASK_OVERLAP_THRESHOLD base pairs.

    Args:
        predictions (dict): A nested dictionary with sequence IDs as keys, and within each,
            strand (+/-) as keys, containing a list of Intervals as values.
        mask (dict): A dictionary with sequence IDs as keys, and a list of Intervals as values.

    Returns:
        dict: Updated predictions with masked regions removed.
    """
    masked = defaultdict(lambda: defaultdict(list))

    for seq_id, strand_dict in predictions.items():
        if seq_id not in mask:
            # Nothing to mask on this sequence: keep the predictions as-is.
            masked[seq_id] = strand_dict
            continue
        mask_tree = create_interval_tree(mask[seq_id])
        for strand, regions in strand_dict.items():
            kept = []
            # Iterating the tree (rather than the raw list) mirrors the
            # original behaviour, including de-duplication of identical
            # intervals; the result is sorted afterwards.
            for cds in create_interval_tree(regions):
                candidates = mask_tree.overlap(cds.begin, cds.end)
                # Overlap length is inclusive on both ends, hence the +1.
                # Keep the CDS only when every overlapping masked region
                # overlaps it by at most the threshold.
                if all(
                    1 + abs(min(cds.end, hit.end) - max(cds.begin, hit.begin))
                    <= MASK_OVERLAP_THRESHOLD
                    for hit in candidates
                ):
                    kept.append(cds)
            masked[seq_id][strand] = sorted(kept)
    return masked
+
|
|
209
|
+
|
|
210
|
+
def merge_predictions(predictions, priority):
    """
    Merge gene predictions from two sources, applying a priority order.

    Args:
        predictions (dict): Nested dictionary containing gene predictions from both sources,
            keyed caller -> sequence ID -> strand -> list of Intervals.
        priority (list): List specifying the order of priority for merging the predictions.

    Returns:
        dict: Nested dictionary with all predictions of the first priority source merged with non-overlapping predictions
        the secondary source.
    """
    merged = defaultdict(lambda: defaultdict((lambda: defaultdict(list))))
    primary, secondary = priority

    # Primary merge.
    # NOTE: this assigns a reference, not a copy - merged[primary] is the
    # same nested defaultdict object as predictions[primary], so lookups
    # below may auto-vivify empty entries in the caller's dict as well.
    merged[primary] = predictions[primary]

    # Secondary merge: add non-overlapping regions from the secondary gene caller
    for seq_id in predictions[secondary]:
        for strand in ["+", "-"]:
            # defaultdict access: a missing strand yields (and inserts) an
            # empty list rather than raising KeyError.
            secondary_regions = predictions[secondary][seq_id][strand]
            if seq_id in predictions[primary]:
                primary_regions = merged[primary][seq_id][strand]
                # Keep only secondary predictions that fall entirely in the
                # gaps between primary predictions on the same strand.
                merged[secondary][seq_id][strand].extend(
                    check_against_gaps(primary_regions, secondary_regions)
                )
            else:
                # Sequence unseen by the primary caller: take everything.
                merged[secondary][seq_id][strand] = secondary_regions
    return merged
+
|
|
241
|
+
|
|
242
|
+
def check_against_gaps(regions, candidates):
    """
    Select candidate regions that do not overlap any of the existing regions.

    Args:
        regions (list): Interval objects for existing regions.
        candidates (list): Interval objects for candidate regions.

    Returns:
        list: Selected candidate Intervals that do not overlap with existing ones.
    """
    occupied = create_interval_tree(regions)
    # A candidate survives only when the occupied tree reports no overlap
    # with its span.
    return [
        candidate
        for candidate in candidates
        if not occupied.overlap(candidate.begin, candidate.end)
    ]
+
|
|
262
|
+
|
|
263
|
+
def output_fasta_files(predictions, files_dict, output_faa, output_ffn):
    """
    Write FASTA output files containing protein and transcript sequences for
    the predicted genes after merging.

    Args:
        predictions (dict): Nested dictionary with merged gene predictions as Interval objects.
            Each Interval object stores a protein ID in the data attribute.
        files_dict (dict): Dictionary containing input FASTA files for both Pyrodigal and FragGeneScanRS.
        output_faa (str): Path to output protein FASTA file.
        output_ffn (str): Path to output transcript FASTA file.
    """
    with open(output_faa, "w") as faa_handle, open(output_ffn, "w") as ffn_handle:
        for caller, seq_data in predictions.items():
            # Collect every surviving protein ID for this caller.
            kept_ids = {
                region.data["protein_id"]
                for strand_dict in seq_data.values()
                for regions in strand_dict.values()
                for region in regions
            }

            fasta_jobs = (
                (files_dict[caller]["proteins"], faa_handle),
                (files_dict[caller]["transcripts"], ffn_handle),
            )
            for source_path, destination in fasta_jobs:
                selected = []
                for record in SeqIO.parse(source_path, "fasta"):
                    if record.id in kept_ids:
                        # Prodigal appends * to the end of a truncated sequence
                        # FGS uses * to mark an ambiguous amino acid
                        # Replace ending * and replace any other "*" with "X"
                        record.seq = record.seq.rstrip("*").replace("*", "X")
                        selected.append(record)
                SeqIO.write(selected, destination, "fasta")
+
|
|
301
|
+
|
|
302
|
+
def output_gff(predictions, output_gff):
    """
    Write merged gene predictions to a GFF output file.

    Args:
        predictions (dict): Nested dictionary with merged gene predictions as Interval objects.
            Each Interval object stores a protein ID in the data attribute.
        output_gff (str): Path to the output GFF file.
    """
    with open(output_gff, "w") as gff_out:
        writer = csv.writer(gff_out, delimiter="\t")
        gff_out.write("##gff-version 3\n")
        for caller, seq_data in predictions.items():
            for seq_id, strand_dict in seq_data.items():
                for strand, regions in strand_dict.items():
                    for region in regions:
                        # GFF columns: seqid, source, type, start, end,
                        # score, strand, phase, attributes. Score and phase
                        # are unused, hence ".".
                        attributes = f"ID={region.data['protein_id']}"
                        row = (
                            seq_id,
                            caller,
                            "CDS",
                            region.begin,
                            region.end,
                            ".",
                            strand,
                            ".",
                            attributes,
                        )
                        writer.writerow(row)
+
|
|
332
|
+
|
|
333
|
+
def output_summary(summary, output_file):
    """
    Write a summary of gene counts to a text file in JSON format.

    Args:
        summary (dict): Summary of gene counts.
        output_file (str): Path to the summary output file.
    """
    # Sorted keys and 4-space indentation keep the report stable and
    # human-readable; a trailing newline terminates the file cleanly.
    serialised = json.dumps(summary, sort_keys=True, indent=4)
    with open(output_file, "w") as handle:
        handle.write(serialised + "\n")
+
|
|
344
|
+
|
|
345
|
+
def get_counts(predictions):
    """
    Count the number of gene predictions for each caller.

    Args:
        predictions (dict): Nested dictionary with gene predictions for each caller.

    Returns:
        dict: Total count of genes for each caller.
    """
    # Sum the per-strand prediction counts over every sequence of a caller.
    return {
        caller: sum(
            len(strands["+"]) + len(strands["-"]) for strands in seq_data.values()
        )
        for caller, seq_data in predictions.items()
    }
+
|
|
363
|
+
|
|
364
|
+
def create_interval_tree(regions):
    """
    Create an IntervalTree from a list of regions.

    Args:
        regions (list): List of Interval objects.

    Returns:
        IntervalTree: An interval tree for efficient overlap checking.
    """
    # The IntervalTree constructor accepts an iterable of Intervals, which
    # is equivalent to adding each region individually.
    return IntervalTree(regions)
+
|
|
379
|
+
|
|
380
|
+
def main():
    """
    CLI entry point for the MGnify combined gene caller merger.

    Parses command-line arguments, reads Pyrodigal and FragGeneScanRS
    predictions (GFF or *.out), optionally masks non-coding RNA regions,
    merges the two callers by priority, and writes GFF, FASTA and summary
    output files named after --name.
    """
    # BUG FIX: the help text was previously passed positionally, which sets
    # argparse's `prog` (the program name) instead of the description - the
    # whole paragraph then leaked into every usage/error message. It must be
    # passed as `description=`.
    parser = argparse.ArgumentParser(
        description="""
        MGnify gene caller combiner.
        This script merges gene predictions made by Pyrodigal and FragGeneScanRS (FGS)
        and outputs FASTA and GFF files.
        For each gene caller, the script expects a set of files:
        - GFF file with gene predictions OR *.out file
        - FASTA file with protein sequences
        - FASTA file with transcript sequences
        """
    )
    parser.add_argument(
        "--name", "-n", required=True, help="Base name for output files"
    )
    parser.add_argument(
        "--priority",
        "-P",
        choices=["Pyrodigal_FragGeneScanRS", "FragGeneScanRS_Pyrodigal"],
        default="Pyrodigal_FragGeneScanRS",
        help="Merge priority",
    )
    parser.add_argument(
        "--mask",
        "-m",
        help="Regions for masking (Infernal cmsearch output file)",
    )
    parser.add_argument("--pyrodigal-gff", "-pg", help="Pyrodigal *.gff file")
    parser.add_argument("--pyrodigal-out", "-po", help="Pyrodigal *.out file")
    parser.add_argument(
        "--pyrodigal-ffn",
        "-pt",
        required=True,
        help="Pyrodigal *.ffn file with transcripts",
    )
    parser.add_argument(
        "--pyrodigal-faa",
        "-pp",
        required=True,
        help="Pyrodigal *.faa file with proteins",
    )
    parser.add_argument("--fgsrs-gff", "-fg", help="FragGeneScanRS *.gff file")
    parser.add_argument("--fgsrs-out", "-fo", help="FragGeneScanRS *.out file")
    parser.add_argument(
        "--fgsrs-ffn",
        "-ft",
        required=True,
        help="FragGeneScanRS *.ffn file with transcripts",
    )
    parser.add_argument(
        "--fgsrs-faa",
        "-fp",
        required=True,
        help="FragGeneScanRS *.faa file with proteins",
    )
    parser.add_argument(
        "--verbose", "-v", action="store_true", help="Increase verbosity level to debug"
    )
    args = parser.parse_args()

    log_level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(
        level=log_level,
        format="%(levelname)s %(asctime)s - %(message)s",
        datefmt="%Y/%m/%d %H:%M:%S",
    )

    # Each caller needs at least one source of predictions (*.out or GFF).
    if not args.pyrodigal_out and not args.pyrodigal_gff:
        parser.error(
            "For Pyrodigal, you must provide either --pyrodigal-out or --pyrodigal-gff"
        )

    if not args.fgsrs_out and not args.fgsrs_gff:
        parser.error(
            "For FragGeneScanRS, you must provide either --fgsrs-out or --fgsrs-gff"
        )

    summary = {}
    all_predictions = {}

    caller_priority = args.priority.split("_")
    logging.info(f"Caller priority: 1. {caller_priority[0]}, 2. {caller_priority[1]}")

    # The *.out file takes precedence over the GFF when both are supplied.
    logging.info("Parsing Pyrodigal annotations...")
    if args.pyrodigal_out:
        all_predictions["Pyrodigal"] = parse_pyrodigal_output(args.pyrodigal_out)
    elif args.pyrodigal_gff:
        all_predictions["Pyrodigal"] = parse_gff(args.pyrodigal_gff)

    logging.info("Parsing FragGeneScanRS annotations...")
    if args.fgsrs_out:
        all_predictions["FragGeneScanRS"] = parse_fgsrs_output(args.fgsrs_out)
    elif args.fgsrs_gff:
        all_predictions["FragGeneScanRS"] = parse_gff(args.fgsrs_gff)

    summary["all"] = get_counts(all_predictions)

    if args.mask:
        logging.info("Masking of non-coding RNA regions was enabled")
        logging.info(f"Parsing masking intervals from file {args.mask}")
        mask_regions_file = parse_cmsearch_output(args.mask)
        for caller in all_predictions:
            logging.info(f"Masking {caller} outputs...")
            all_predictions[caller] = mask_regions(
                all_predictions[caller], mask_regions_file
            )
        summary["after_masking"] = get_counts(all_predictions)

    logging.info("Merging combined gene caller results")
    merged_predictions = merge_predictions(all_predictions, caller_priority)
    summary["merged"] = get_counts(merged_predictions)

    logging.info("Writing output files...")
    output_summary(summary, f"{args.name}.summary.txt")
    output_gff(merged_predictions, f"{args.name}.gff")
    files = {
        "Pyrodigal": {
            "proteins": args.pyrodigal_faa,
            "transcripts": args.pyrodigal_ffn,
        },
        "FragGeneScanRS": {"proteins": args.fgsrs_faa, "transcripts": args.fgsrs_ffn},
    }
    output_fasta_files(
        merged_predictions,
        files,
        f"{args.name}.faa",
        f"{args.name}.ffn",
    )


if __name__ == "__main__":
    main()
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
#!/usr/bin/env python
|
|
2
2
|
# -*- coding: utf-8 -*-
|
|
3
3
|
|
|
4
|
-
# Copyright 2024 EMBL - European Bioinformatics Institute
|
|
4
|
+
# Copyright 2024-2025 EMBL - European Bioinformatics Institute
|
|
5
5
|
#
|
|
6
6
|
# Licensed under the Apache License, Version 2.0 (the 'License');
|
|
7
7
|
# you may not use this file except in compliance with the License.
|