mgnify-pipelines-toolkit 0.2.2__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mgnify-pipelines-toolkit might be problematic. Click here for more details.

Files changed (42) hide show
  1. mgnify_pipelines_toolkit/analysis/amplicon/amplicon_utils.py +1 -1
  2. mgnify_pipelines_toolkit/analysis/amplicon/are_there_primers.py +1 -1
  3. mgnify_pipelines_toolkit/analysis/amplicon/assess_inflection_point_mcp.py +1 -1
  4. mgnify_pipelines_toolkit/analysis/amplicon/assess_mcp_proportions.py +1 -1
  5. mgnify_pipelines_toolkit/analysis/amplicon/classify_var_regions.py +1 -1
  6. mgnify_pipelines_toolkit/analysis/amplicon/find_mcp_inflection_points.py +1 -1
  7. mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py +1 -1
  8. mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py +1 -1
  9. mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py +1 -1
  10. mgnify_pipelines_toolkit/analysis/amplicon/remove_ambiguous_reads.py +1 -1
  11. mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py +1 -1
  12. mgnify_pipelines_toolkit/analysis/amplicon/standard_primer_matching.py +1 -1
  13. mgnify_pipelines_toolkit/analysis/assembly/add_rhea_chebi_annotation.py +1 -1
  14. mgnify_pipelines_toolkit/analysis/assembly/antismash_gff_builder.py +1 -1
  15. mgnify_pipelines_toolkit/analysis/assembly/combined_gene_caller_merge.py +511 -0
  16. mgnify_pipelines_toolkit/analysis/assembly/generate_gaf.py +1 -1
  17. mgnify_pipelines_toolkit/analysis/assembly/go_utils.py +1 -1
  18. mgnify_pipelines_toolkit/analysis/assembly/summarise_goslims.py +1 -1
  19. mgnify_pipelines_toolkit/analysis/shared/dwc_summary_generator.py +240 -0
  20. mgnify_pipelines_toolkit/analysis/shared/fastq_suffix_header_check.py +1 -1
  21. mgnify_pipelines_toolkit/analysis/shared/get_subunits.py +1 -1
  22. mgnify_pipelines_toolkit/analysis/shared/get_subunits_coords.py +1 -1
  23. mgnify_pipelines_toolkit/analysis/shared/library_strategy_check.py +1 -1
  24. mgnify_pipelines_toolkit/analysis/shared/mapseq2biom.py +1 -1
  25. mgnify_pipelines_toolkit/analysis/shared/study_summary_generator.py +18 -11
  26. mgnify_pipelines_toolkit/constants/db_labels.py +1 -1
  27. mgnify_pipelines_toolkit/constants/regex_ambiguous_bases.py +1 -1
  28. mgnify_pipelines_toolkit/constants/regex_fasta_header.py +1 -1
  29. mgnify_pipelines_toolkit/constants/tax_ranks.py +1 -1
  30. mgnify_pipelines_toolkit/constants/thresholds.py +1 -1
  31. mgnify_pipelines_toolkit/constants/var_region_coordinates.py +1 -1
  32. mgnify_pipelines_toolkit/schemas/schemas.py +21 -3
  33. mgnify_pipelines_toolkit/utils/fasta_to_delimited.py +1 -1
  34. mgnify_pipelines_toolkit/utils/get_mpt_version.py +1 -1
  35. {mgnify_pipelines_toolkit-0.2.2.dist-info → mgnify_pipelines_toolkit-1.0.1.dist-info}/METADATA +2 -1
  36. mgnify_pipelines_toolkit-1.0.1.dist-info/RECORD +48 -0
  37. {mgnify_pipelines_toolkit-0.2.2.dist-info → mgnify_pipelines_toolkit-1.0.1.dist-info}/WHEEL +1 -1
  38. {mgnify_pipelines_toolkit-0.2.2.dist-info → mgnify_pipelines_toolkit-1.0.1.dist-info}/entry_points.txt +2 -1
  39. mgnify_pipelines_toolkit/analysis/assembly/cgc_merge.py +0 -424
  40. mgnify_pipelines_toolkit-0.2.2.dist-info/RECORD +0 -47
  41. {mgnify_pipelines_toolkit-0.2.2.dist-info → mgnify_pipelines_toolkit-1.0.1.dist-info}/LICENSE +0 -0
  42. {mgnify_pipelines_toolkit-0.2.2.dist-info → mgnify_pipelines_toolkit-1.0.1.dist-info}/top_level.txt +0 -0
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env python
2
2
  # -*- coding: utf-8 -*-
3
3
 
4
- # Copyright 2024 EMBL - European Bioinformatics Institute
4
+ # Copyright 2024-2025 EMBL - European Bioinformatics Institute
5
5
  #
6
6
  # Licensed under the Apache License, Version 2.0 (the "License");
7
7
  # you may not use this file except in compliance with the License.
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env python
2
2
  # -*- coding: utf-8 -*-
3
3
 
4
- # Copyright 2024 EMBL - European Bioinformatics Institute
4
+ # Copyright 2024-2025 EMBL - European Bioinformatics Institute
5
5
  #
6
6
  # Licensed under the Apache License, Version 2.0 (the "License");
7
7
  # you may not use this file except in compliance with the License.
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env python
2
2
  # -*- coding: utf-8 -*-
3
3
 
4
- # Copyright 2024 EMBL - European Bioinformatics Institute
4
+ # Copyright 2024-2025 EMBL - European Bioinformatics Institute
5
5
  #
6
6
  # Licensed under the Apache License, Version 2.0 (the "License");
7
7
  # you may not use this file except in compliance with the License.
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env python
2
2
  # -*- coding: utf-8 -*-
3
3
 
4
- # Copyright 2024 EMBL - European Bioinformatics Institute
4
+ # Copyright 2024-2025 EMBL - European Bioinformatics Institute
5
5
  #
6
6
  # Licensed under the Apache License, Version 2.0 (the "License");
7
7
  # you may not use this file except in compliance with the License.
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env python
2
2
  # -*- coding: utf-8 -*-
3
3
 
4
- # Copyright 2024 EMBL - European Bioinformatics Institute
4
+ # Copyright 2024-2025 EMBL - European Bioinformatics Institute
5
5
  #
6
6
  # Licensed under the Apache License, Version 2.0 (the "License");
7
7
  # you may not use this file except in compliance with the License.
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env python
2
2
  # -*- coding: utf-8 -*-
3
3
 
4
- # Copyright 2024 EMBL - European Bioinformatics Institute
4
+ # Copyright 2024-2025 EMBL - European Bioinformatics Institute
5
5
  #
6
6
  # Licensed under the Apache License, Version 2.0 (the "License");
7
7
  # you may not use this file except in compliance with the License.
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env python
2
2
  # -*- coding: utf-8 -*-
3
3
 
4
- # Copyright 2024 EMBL - European Bioinformatics Institute
4
+ # Copyright 2024-2025 EMBL - European Bioinformatics Institute
5
5
  #
6
6
  # Licensed under the Apache License, Version 2.0 (the "License");
7
7
  # you may not use this file except in compliance with the License.
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env python
2
2
  # -*- coding: utf-8 -*-
3
3
 
4
- # Copyright 2024 EMBL - European Bioinformatics Institute
4
+ # Copyright 2024-2025 EMBL - European Bioinformatics Institute
5
5
  #
6
6
  # Licensed under the Apache License, Version 2.0 (the "License");
7
7
  # you may not use this file except in compliance with the License.
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env python
2
2
  # -*- coding: utf-8 -*-
3
3
 
4
- # Copyright 2024 EMBL - European Bioinformatics Institute
4
+ # Copyright 2024-2025 EMBL - European Bioinformatics Institute
5
5
  #
6
6
  # Licensed under the Apache License, Version 2.0 (the "License");
7
7
  # you may not use this file except in compliance with the License.
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env python
2
2
  # -*- coding: utf-8 -*-
3
3
 
4
- # Copyright 2024 EMBL - European Bioinformatics Institute
4
+ # Copyright 2024-2025 EMBL - European Bioinformatics Institute
5
5
  #
6
6
  # Licensed under the Apache License, Version 2.0 (the "License");
7
7
  # you may not use this file except in compliance with the License.
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env python
2
2
  # -*- coding: utf-8 -*-
3
3
 
4
- # Copyright 2024 EMBL - European Bioinformatics Institute
4
+ # Copyright 2024-2025 EMBL - European Bioinformatics Institute
5
5
  #
6
6
  # Licensed under the Apache License, Version 2.0 (the "License");
7
7
  # you may not use this file except in compliance with the License.
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env python
2
2
  # -*- coding: utf-8 -*-
3
3
 
4
- # Copyright 2024 EMBL - European Bioinformatics Institute
4
+ # Copyright 2024-2025 EMBL - European Bioinformatics Institute
5
5
  #
6
6
  # Licensed under the Apache License, Version 2.0 (the "License");
7
7
  # you may not use this file except in compliance with the License.
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env python
2
2
  # -*- coding: utf-8 -*-
3
3
 
4
- # Copyright 2024 EMBL - European Bioinformatics Institute
4
+ # Copyright 2024-2025 EMBL - European Bioinformatics Institute
5
5
  #
6
6
  # Licensed under the Apache License, Version 2.0 (the "License");
7
7
  # you may not use this file except in compliance with the License.
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env python
2
2
  # -*- coding: utf-8 -*-
3
3
 
4
- # Copyright 2024 EMBL - European Bioinformatics Institute
4
+ # Copyright 2024-2025 EMBL - European Bioinformatics Institute
5
5
  #
6
6
  # Licensed under the Apache License, Version 2.0 (the 'License');
7
7
  # you may not use this file except in compliance with the License.
@@ -0,0 +1,511 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ # Copyright 2024-2025 EMBL - European Bioinformatics Institute
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the 'License');
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an 'AS IS' BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ import argparse
18
+ import json
19
+ import logging
20
+ from collections import defaultdict
21
+ import csv
22
+ import re
23
+
24
+ from intervaltree import Interval, IntervalTree
25
+ from Bio import SeqIO
26
+
27
+
28
+ MASK_OVERLAP_THRESHOLD = 5
29
+
30
+
31
def parse_gff(gff_file):
    """
    Parse a GFF file and extract CDS features as Interval objects.

    Args:
        gff_file (str): Path to the GFF file.

    Returns:
        dict: A nested dictionary with sequence IDs as keys, and within each,
        strand (+/-) as keys, containing a list of Intervals for CDS regions.
        Each Interval object apart from the start and end positions of the CDS region
        also stores the protein ID.

    Raises:
        ValueError: If a CDS feature has no ID attribute, or if no gene
        predictions could be read from the file at all.
    """
    predictions = defaultdict(lambda: defaultdict(list))
    with open(gff_file, "r") as gff_in:
        for line in gff_in:
            # Skip pragma/comment lines and blank lines (blank trailing lines
            # are common in GFF files and previously crashed the 9-field unpack)
            if line.startswith("#") or not line.strip():
                continue
            fields = line.strip().split("\t")
            if len(fields) != 9:
                # Tolerate malformed lines rather than crashing on unpack
                logging.warning(f"Skipping malformed GFF line: {line.strip()}")
                continue
            seq_id, _, feature_type, start, end, _, strand, _, attributes = fields
            if feature_type == "CDS":
                # Parse attributes to get the ID value; split on the first "="
                # only so attribute values containing "=" do not break dict()
                attr_dict = dict(
                    attr.split("=", 1) for attr in attributes.split(";") if "=" in attr
                )
                if "ID" not in attr_dict:
                    # Fail with a descriptive message instead of a bare KeyError
                    raise ValueError(
                        f"CDS feature without an ID attribute: {line.strip()}"
                    )
                protein_id = attr_dict["ID"]
                predictions[seq_id][strand].append(
                    Interval(int(start), int(end), data={"protein_id": protein_id})
                )
    if not predictions:
        raise ValueError("Zero gene predictions was read from the GFF file")
    return predictions
63
+
64
+
65
def parse_pyrodigal_output(file):
    """
    Read gene predictions from a Pyrodigal *.out file into Interval objects.

    Example of *.out file:
    # Sequence Data: seqnum=1;seqlen=25479;seqhdr="Bifidobacterium-longum-subsp-infantis-MC2-contig1"
    # Model Data: version=Pyrodigal.v2.6.3;run_type=Single;model="Ab initio";gc_cont=59.94;transl_table=11;uses_sd=1
    >1_1_279_+

    Args:
        file (str): Path to the Pyrodigal *.out file.

    Returns:
        dict: seq_id -> strand (+/-) -> list of Intervals; each Interval
        carries the protein ID in its data attribute.

    Raises:
        ValueError: If the file yields no predictions at all.
    """
    predictions = defaultdict(lambda: defaultdict(list))
    with open(file) as handle:
        for raw_line in handle:
            if raw_line.startswith("# Model Data"):
                continue
            if raw_line.startswith("# Sequence Data"):
                # The sequence header carries the contig/sequence identifier
                header_match = re.search(r'seqhdr="(\S+)"', raw_line)
                if header_match:
                    seq_id = header_match.group(1)
                continue
            # Prediction lines look like ">1_1_279_+": fragment index,
            # start, end and strand joined with underscores. Pyrodigal uses
            # the fragment index (not coordinates) to identify sequences in
            # its FASTA output.
            fragment_id, begin, stop, strand = raw_line[1:].strip().split("_")
            predictions[seq_id][strand].append(
                Interval(
                    int(begin),
                    int(stop),
                    data={"protein_id": f"{seq_id}_{fragment_id}"},
                )
            )
    if not predictions:
        raise ValueError("Zero gene predictions was read from the *.out file")
    return predictions
103
+
104
+
105
def parse_fgsrs_output(file):
    """
    Read gene predictions from a FragGeneScanRS *.out file into Interval objects.

    Example of *.out file:
    >Bifidobacterium-longum-subsp-infantis-MC2-contig1
    256 2133 - 1 1.263995 I: D:

    Args:
        file (str): Path to the FragGeneScanRS *.out file.

    Returns:
        dict: seq_id -> strand (+/-) -> list of Intervals; each Interval
        carries a synthesised protein ID ("seqid_start_end_strand") in its
        data attribute.

    Raises:
        ValueError: If the file yields no predictions at all.
    """
    predictions = defaultdict(lambda: defaultdict(list))
    with open(file) as handle:
        for raw_line in handle:
            if raw_line.startswith(">"):
                # Header line: first whitespace-delimited token, minus ">"
                seq_id = raw_line[1:].split()[0]
                continue
            begin, stop, strand, *_ = raw_line.strip().split("\t")
            predictions[seq_id][strand].append(
                Interval(
                    int(begin),
                    int(stop),
                    data={"protein_id": f"{seq_id}_{begin}_{stop}_{strand}"},
                )
            )
    if not predictions:
        raise ValueError("Zero gene predictions was read from the *.out file")
    return predictions
136
+
137
+
138
def parse_cmsearch_output(mask_file):
    """
    Load masking regions from a cmsearch output file as Interval objects.

    Args:
        mask_file (str): Path to the masking file (possibly BED or GFF-like format).

    Returns:
        dict: seq_id -> list of Intervals representing masked regions.

    Raises:
        ValueError: If no intervals could be read from the file.
    """
    regions = defaultdict(list)
    with open(mask_file) as handle:
        for raw_line in handle:
            if raw_line.startswith("#"):
                continue
            columns = raw_line.split()
            seq_id = columns[0]
            # cmsearch reports hits on the minus strand with start > end;
            # normalise so the Interval is always (low, high)
            low, high = int(columns[7]), int(columns[8])
            if low > high:
                low, high = high, low
            regions[seq_id].append(Interval(low, high))
    if not regions:
        raise ValueError("Zero intervals was read from the input masking file")
    return regions
163
+
164
+
165
def mask_regions(predictions, mask):
    """
    Drop predicted genes that overlap masked regions by more than
    MASK_OVERLAP_THRESHOLD base pairs.

    Args:
        predictions (dict): seq_id -> strand (+/-) -> list of Intervals.
        mask (dict): seq_id -> list of masking Intervals.

    Returns:
        dict: Predictions with significantly-overlapping regions removed;
        sequences without mask information are passed through unchanged.
    """
    masked = defaultdict(lambda: defaultdict(list))

    for seq_id, strand_dict in predictions.items():
        if seq_id not in mask:
            # No mask intervals for this sequence: keep predictions as-is
            masked[seq_id] = strand_dict
            continue
        mask_tree = create_interval_tree(mask[seq_id])
        for strand, regions in strand_dict.items():
            kept = []
            # Iterate via a tree (a set) to match the original semantics,
            # including de-duplication of identical intervals
            for region in create_interval_tree(regions):
                candidates = mask_tree.overlap(region.begin, region.end)
                # Overlap length adds 1 to make both boundaries inclusive;
                # anything above the threshold means the gene is masked out
                significant = any(
                    1
                    + abs(
                        min(region.end, hit.end) - max(region.begin, hit.begin)
                    )
                    > MASK_OVERLAP_THRESHOLD
                    for hit in candidates
                )
                if not significant:
                    kept.append(region)
            masked[seq_id][strand] = sorted(kept)
    return masked
208
+
209
+
210
def merge_predictions(predictions, priority):
    """
    Merge gene predictions from two sources, applying a priority order.

    Args:
        predictions (dict): Nested dictionary containing gene predictions from both sources.
        priority (list): List specifying the order of priority for merging the predictions.

    Returns:
        dict: Nested dictionary with all predictions of the first priority source merged
        with non-overlapping predictions of the secondary source, keyed by caller name.
    """
    merged = defaultdict(lambda: defaultdict((lambda: defaultdict(list))))
    primary, secondary = priority

    # Primary merge
    # NOTE: this aliases (does not copy) predictions[primary]; defaultdict
    # lookups below may insert empty entries into the input structure as a
    # side effect.
    merged[primary] = predictions[primary]

    # Secondary merge: add non-overlapping regions from the secondary gene caller
    for seq_id in predictions[secondary]:
        for strand in ["+", "-"]:
            secondary_regions = predictions[secondary][seq_id][strand]
            if seq_id in predictions[primary]:
                # Keep only secondary predictions that fall entirely into
                # gaps between the primary caller's predictions
                primary_regions = merged[primary][seq_id][strand]
                merged[secondary][seq_id][strand].extend(
                    check_against_gaps(primary_regions, secondary_regions)
                )
            else:
                # Sequence unseen by the primary caller: take everything
                merged[secondary][seq_id][strand] = secondary_regions
    return merged
240
+
241
+
242
def check_against_gaps(regions, candidates):
    """
    Select candidate regions that do not overlap any existing region.

    Args:
        regions (list): Interval objects for existing regions.
        candidates (list): Interval objects for candidate regions.

    Returns:
        list: Candidate Intervals with no overlap against the existing set.
    """
    existing = create_interval_tree(regions)
    # A candidate survives only if the tree reports zero overlapping intervals
    return [
        candidate
        for candidate in candidates
        if not existing.overlap(candidate.begin, candidate.end)
    ]
261
+
262
+
263
def output_fasta_files(predictions, files_dict, output_faa, output_ffn):
    """
    Write protein and transcript FASTA files for the merged gene predictions.

    Args:
        predictions (dict): caller -> seq_id -> strand -> list of Intervals,
            each carrying a protein ID in its data attribute.
        files_dict (dict): Input FASTA paths per caller, with "proteins" and
            "transcripts" keys.
        output_faa (str): Path to output protein FASTA file.
        output_ffn (str): Path to output transcript FASTA file.
    """
    with (
        open(output_faa, "w") as faa_handle,
        open(output_ffn, "w") as ffn_handle,
    ):
        for caller, seq_data in predictions.items():
            # Collect the IDs of every protein this caller contributed
            kept_ids = {
                region.data["protein_id"]
                for strand_dict in seq_data.values()
                for regions in strand_dict.values()
                for region in regions
            }

            caller_files = files_dict[caller]
            for source_path, destination in (
                (caller_files["proteins"], faa_handle),
                (caller_files["transcripts"], ffn_handle),
            ):
                selected = []
                for record in SeqIO.parse(source_path, "fasta"):
                    if record.id not in kept_ids:
                        continue
                    # Prodigal appends * to the end of a truncated sequence,
                    # while FGS uses * for an ambiguous amino acid: strip the
                    # trailing * and substitute "X" for any remaining ones
                    record.seq = record.seq.rstrip("*").replace("*", "X")
                    selected.append(record)
                SeqIO.write(selected, destination, "fasta")
300
+
301
+
302
def output_gff(predictions, output_gff):
    """
    Write merged gene predictions to a GFF3 output file.

    Args:
        predictions (dict): caller -> seq_id -> strand -> list of Intervals,
            each carrying a protein ID in its data attribute.
        output_gff (str): Path to the output GFF file.
    """
    with open(output_gff, "w") as gff_out:
        # csv.writer defaults to "\r\n" line endings; force plain "\n" so the
        # GFF rows are not CRLF-terminated
        writer = csv.writer(gff_out, delimiter="\t", lineterminator="\n")
        gff_out.write("##gff-version 3\n")
        for caller, seq_data in predictions.items():
            for seq_id, strand_dict in seq_data.items():
                for strand, regions in strand_dict.items():
                    for region in regions:
                        writer.writerow(
                            [
                                seq_id,  # Sequence ID
                                caller,  # Source
                                "CDS",  # Feature type
                                region.begin,  # Start position
                                region.end,  # End position
                                ".",  # Score (not used, hence '.')
                                strand,  # Strand (+/-)
                                ".",  # Phase (not used, hence '.')
                                f"ID={region.data['protein_id']}",  # Attributes
                            ]
                        )
331
+
332
+
333
def output_summary(summary, output_file):
    """
    Dump a gene-count summary as pretty-printed JSON to a text file.

    Args:
        summary (dict): Summary of gene counts.
        output_file (str): Path to the summary output file.
    """
    serialized = json.dumps(summary, sort_keys=True, indent=4)
    with open(output_file, "w") as handle:
        handle.write(serialized + "\n")
343
+
344
+
345
def get_counts(predictions):
    """
    Tally the number of gene predictions per caller.

    Args:
        predictions (dict): caller -> seq_id -> strand -> list of Intervals.

    Returns:
        dict: caller name -> total number of predictions on both strands.
    """
    totals = {}
    for caller, seq_data in predictions.items():
        totals[caller] = sum(
            len(strands["+"]) + len(strands["-"]) for strands in seq_data.values()
        )
    return totals
362
+
363
+
364
def create_interval_tree(regions):
    """
    Build an IntervalTree from a list of regions for fast overlap queries.

    Args:
        regions (list): List of Interval objects.

    Returns:
        IntervalTree: An interval tree for efficient overlap checking.
    """
    # The IntervalTree constructor accepts any iterable of Intervals,
    # which is equivalent to adding them one by one
    return IntervalTree(regions)
378
+
379
+
380
def main():
    """
    Entry point for the MGnify combined gene caller merger.

    Parses CLI arguments, reads Pyrodigal and FragGeneScanRS predictions,
    optionally masks non-coding RNA regions, merges the two prediction sets
    by the requested priority, and writes GFF, FASTA and summary outputs.
    """
    parser = argparse.ArgumentParser(
        # NOTE: passing this text positionally set ArgumentParser's `prog`
        # (the program name) instead of the description, which garbled the
        # --help/usage output; `description=` is the intended parameter.
        description="""
        MGnify gene caller combiner.
        This script merges gene predictions made by Pyrodigal and FragGeneScanRS (FGS)
        and outputs FASTA and GFF files.
        For each gene caller, the script expects a set of files:
        - GFF file with gene predictions OR *.out file
        - FASTA file with protein sequences
        - FASTA file with transcript sequences
        """
    )
    parser.add_argument(
        "--name", "-n", required=True, help="Base name for output files"
    )
    parser.add_argument(
        "--priority",
        "-P",
        choices=["Pyrodigal_FragGeneScanRS", "FragGeneScanRS_Pyrodigal"],
        default="Pyrodigal_FragGeneScanRS",
        help="Merge priority",
    )
    parser.add_argument(
        "--mask",
        "-m",
        help="Regions for masking (Infernal cmsearch output file)",
    )
    parser.add_argument("--pyrodigal-gff", "-pg", help="Pyrodigal *.gff file")
    parser.add_argument("--pyrodigal-out", "-po", help="Pyrodigal *.out file")
    parser.add_argument(
        "--pyrodigal-ffn",
        "-pt",
        required=True,
        help="Pyrodigal *.ffn file with transcripts",
    )
    parser.add_argument(
        "--pyrodigal-faa",
        "-pp",
        required=True,
        help="Pyrodigal *.faa file with proteins",
    )
    parser.add_argument("--fgsrs-gff", "-fg", help="FragGeneScanRS *.gff file")
    parser.add_argument("--fgsrs-out", "-fo", help="FragGeneScanRS *.out file")
    parser.add_argument(
        "--fgsrs-ffn",
        "-ft",
        required=True,
        help="FragGeneScanRS *.ffn file with transcripts",
    )
    parser.add_argument(
        "--fgsrs-faa",
        "-fp",
        required=True,
        help="FragGeneScanRS *.faa file with proteins",
    )
    parser.add_argument(
        "--verbose", "-v", action="store_true", help="Increase verbosity level to debug"
    )
    args = parser.parse_args()

    log_level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(
        level=log_level,
        format="%(levelname)s %(asctime)s - %(message)s",
        datefmt="%Y/%m/%d %H:%M:%S",
    )

    # Each caller needs either a GFF file or a raw *.out file with coordinates
    if not args.pyrodigal_out and not args.pyrodigal_gff:
        parser.error(
            "For Pyrodigal, you must provide either --pyrodigal-out or --pyrodigal-gff"
        )

    if not args.fgsrs_out and not args.fgsrs_gff:
        parser.error(
            "For FragGeneScanRS, you must provide either --fgsrs-out or --fgsrs-gff"
        )

    summary = {}
    all_predictions = {}

    caller_priority = args.priority.split("_")
    logging.info(f"Caller priority: 1. {caller_priority[0]}, 2. {caller_priority[1]}")

    logging.info("Parsing Pyrodigal annotations...")
    if args.pyrodigal_out:
        all_predictions["Pyrodigal"] = parse_pyrodigal_output(args.pyrodigal_out)
    elif args.pyrodigal_gff:
        all_predictions["Pyrodigal"] = parse_gff(args.pyrodigal_gff)

    logging.info("Parsing FragGeneScanRS annotations...")
    if args.fgsrs_out:
        all_predictions["FragGeneScanRS"] = parse_fgsrs_output(args.fgsrs_out)
    elif args.fgsrs_gff:
        all_predictions["FragGeneScanRS"] = parse_gff(args.fgsrs_gff)

    summary["all"] = get_counts(all_predictions)

    if args.mask:
        logging.info("Masking of non-coding RNA regions was enabled")
        logging.info(f"Parsing masking intervals from file {args.mask}")
        mask_regions_file = parse_cmsearch_output(args.mask)
        for caller in all_predictions:
            logging.info(f"Masking {caller} outputs...")
            all_predictions[caller] = mask_regions(
                all_predictions[caller], mask_regions_file
            )
        summary["after_masking"] = get_counts(all_predictions)

    logging.info("Merging combined gene caller results")
    merged_predictions = merge_predictions(all_predictions, caller_priority)
    summary["merged"] = get_counts(merged_predictions)

    logging.info("Writing output files...")
    output_summary(summary, f"{args.name}.summary.txt")
    output_gff(merged_predictions, f"{args.name}.gff")
    files = {
        "Pyrodigal": {
            "proteins": args.pyrodigal_faa,
            "transcripts": args.pyrodigal_ffn,
        },
        "FragGeneScanRS": {"proteins": args.fgsrs_faa, "transcripts": args.fgsrs_ffn},
    }
    output_fasta_files(
        merged_predictions,
        files,
        f"{args.name}.faa",
        f"{args.name}.ffn",
    )


if __name__ == "__main__":
    main()
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env python
2
2
  # -*- coding: utf-8 -*-
3
3
 
4
- # Copyright 2024 EMBL - European Bioinformatics Institute
4
+ # Copyright 2024-2025 EMBL - European Bioinformatics Institute
5
5
  #
6
6
  # Licensed under the Apache License, Version 2.0 (the 'License');
7
7
  # you may not use this file except in compliance with the License.
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env python
2
2
  # -*- coding: utf-8 -*-
3
3
 
4
- # Copyright 2024 EMBL - European Bioinformatics Institute
4
+ # Copyright 2024-2025 EMBL - European Bioinformatics Institute
5
5
  #
6
6
  # Licensed under the Apache License, Version 2.0 (the 'License');
7
7
  # you may not use this file except in compliance with the License.
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env python
2
2
  # -*- coding: utf-8 -*-
3
3
 
4
- # Copyright 2024 EMBL - European Bioinformatics Institute
4
+ # Copyright 2024-2025 EMBL - European Bioinformatics Institute
5
5
  #
6
6
  # Licensed under the Apache License, Version 2.0 (the 'License');
7
7
  # you may not use this file except in compliance with the License.