ORForise 1.5.1__py3-none-any.whl → 1.6.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31) hide show
  1. ORForise/Aggregate_Compare.py +2 -4
  2. ORForise/Annotation_Compare.py +16 -53
  3. ORForise/Annotation_Intersector.py +726 -0
  4. ORForise/Aux/TabToGFF/TabToGFF.py +140 -0
  5. ORForise/Convert_To_GFF.py +139 -0
  6. ORForise/GFF_Adder.py +454 -179
  7. ORForise/List_Tools.py +63 -0
  8. ORForise/StORForise.py +8 -4
  9. ORForise/Tools/EasyGene/EasyGene.py +13 -1
  10. ORForise/Tools/{GLIMMER_3/GLIMMER_3.py → GLIMMER3/GLIMMER3.py} +2 -2
  11. ORForise/Tools/GLIMMER3/__init__.py +0 -0
  12. ORForise/Tools/{GeneMark_HA/GeneMark_HA.py → GeneMarkHA/GeneMarkHA.py} +1 -1
  13. ORForise/Tools/GeneMarkHA/__init__.py +0 -0
  14. ORForise/Tools/Prodigal/Prodigal.py +13 -1
  15. ORForise/utils.py +4 -1
  16. orforise-1.6.1.dist-info/METADATA +1038 -0
  17. {orforise-1.5.1.dist-info → orforise-1.6.1.dist-info}/RECORD +29 -24
  18. {orforise-1.5.1.dist-info → orforise-1.6.1.dist-info}/entry_points.txt +6 -2
  19. ORForise/GFF_Intersector.py +0 -192
  20. orforise-1.5.1.dist-info/METADATA +0 -427
  21. /ORForise/{Tools → Aux}/StORF_Undetected/Completely_Undetected/Completey_Undetected.py +0 -0
  22. /ORForise/{Tools/GLIMMER_3 → Aux/StORF_Undetected/Completely_Undetected}/__init__.py +0 -0
  23. /ORForise/{Tools → Aux}/StORF_Undetected/StORF_Undetected.py +0 -0
  24. /ORForise/{Tools/GeneMark_HA → Aux/StORF_Undetected}/__init__.py +0 -0
  25. /ORForise/{Tools/StORF_Undetected/Completely_Undetected → Aux/StORF_Undetected/unvitiated_Genes}/__init__.py +0 -0
  26. /ORForise/{Tools → Aux}/StORF_Undetected/unvitiated_Genes/unvitiated_Missed_Genes.py +0 -0
  27. /ORForise/{Tools/StORF_Undetected → Aux/TabToGFF}/__init__.py +0 -0
  28. /ORForise/{Tools/StORF_Undetected/unvitiated_Genes → Aux}/__init__.py +0 -0
  29. {orforise-1.5.1.dist-info → orforise-1.6.1.dist-info}/WHEEL +0 -0
  30. {orforise-1.5.1.dist-info → orforise-1.6.1.dist-info}/licenses/LICENSE +0 -0
  31. {orforise-1.5.1.dist-info → orforise-1.6.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,726 @@
1
+ from importlib import import_module
2
+ import argparse
3
+ from collections import OrderedDict
4
+ from datetime import date
5
+ import sys, gzip
6
+ import os
7
+ import logging
8
+
9
+ # Ensure logging prints to stdout by default so info/debug messages are visible when running the script
10
+ if not logging.getLogger().handlers:
11
+ logging.basicConfig(stream=sys.stdout, level=logging.INFO, format='%(levelname)s: %(message)s')
12
+
13
+ try:
14
+ from utils import *
15
+ except ImportError:
16
+ from .utils import *
17
+
18
+ ################################
19
+
20
+
21
def gff_writer(genome_ID, genome_DNA, reference_annotation, reference_tool, ref_gene_set, additional_annotation, additional_tool, genes_To_Keep_by_contig, output_file, gene_ident=None):
    """Write the kept (intersected) genes to ``output_file`` in GFF format.

    The header is always written first. When ``genes_To_Keep_by_contig`` is
    empty (or every contig maps to an empty collection) the function falls
    back to copying the matching features of ``reference_annotation`` with
    ``Coverage=0`` so the output is never header-only by accident.

    Args:
        genome_ID, reference_tool, ref_gene_set, additional_tool: accepted for
            interface compatibility; this function does not read them.
        genome_DNA: optional genome file path, echoed into the header.
        reference_annotation: path of the reference annotation; used in the
            header, for the GFF source column and as fallback input.
        additional_annotation: path of the additional annotation (header only).
        genes_To_Keep_by_contig: ``{contig: {"start,stop": data}}`` where
            ``data`` is indexed as ``[strand, coverage, type, <unused>,
            additional_info, original_info]``.
        output_file: destination path; ``~`` is expanded and missing parent
            directories are created.
        gene_ident: comma-separated feature types used by the fallback
            (defaults to ``CDS``).

    Exits the process with status 1 if the output file cannot be written.
    """
    # Expand user (~) and ensure output directory exists
    output_file = os.path.expanduser(output_file)
    out_dir = os.path.dirname(output_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    ref_stem = os.path.splitext(os.path.basename(reference_annotation))[0]

    try:
        with open(output_file, 'w', encoding='utf-8') as write_out:
            write_out.write("##gff-version\t3\n#\tAnnotation-Intersector\n#\tRun Date:" + str(date.today()) + '\n')
            if genome_DNA:
                write_out.write("##Genome DNA File:" + genome_DNA + '\n')
            write_out.write("##Original File: " + reference_annotation + "\n##Intersecting File: " + additional_annotation + '\n')

            entries_written = 0

            nothing_kept = (not genes_To_Keep_by_contig
                            or all(len(v) == 0 for v in genes_To_Keep_by_contig.values()))
            if nothing_kept:
                write_out.write("# No kept genes to write (0 entries).\n")
                write_out.write("# Falling back to writing reference features with coverage=0.\n")

                # Resolve the requested feature types once, not once per line.
                gene_types = []
                if isinstance(gene_ident, str):
                    gene_types = [t.strip() for t in gene_ident.split(',') if t.strip()]
                if not gene_types:
                    gene_types = ['CDS']

                # Only *reading* the reference is best-effort; the lines are
                # collected here and written outside the handler so that a
                # failing write is reported as a write error, not a parse error.
                fallback_lines = []
                try:
                    if reference_annotation.endswith('.gz'):
                        rf = gzip.open(reference_annotation, 'rt')
                    else:
                        rf = open(reference_annotation, 'r', encoding='unicode_escape')
                    with rf:
                        for line in rf:
                            line = line.rstrip('\n')
                            if not line or line.startswith('#'):
                                continue
                            parts = line.split('\t')
                            if len(parts) < 9:
                                continue
                            seqid, ftype = parts[0], parts[2]
                            if ftype not in gene_types:
                                continue
                            start, stop, strand, info = parts[3], parts[4], parts[6], parts[8]
                            # entry with coverage 0 and empty additional annotation
                            fallback_lines.append(
                                f"{seqid}\t{ref_stem}\t{ftype}\t{start}\t{stop}\t.\t{strand}\t.\tID=Original_Annotation={info};Additional_Annotation=;Coverage=0\n")
                except Exception as e:  # deliberate best-effort boundary
                    logging.warning('Fallback parse of reference annotation failed: %s', e)

                write_out.writelines(fallback_lines)
                entries_written = len(fallback_lines)
                write_out.flush()
                logging.info('Wrote %d fallback reference entries to %s', entries_written, output_file)
                return

            # Source column: reference basename without extension, up to the first '_'
            ref = ref_stem.split('_')[0]
            for contig, genes in genes_To_Keep_by_contig.items():
                for pos, data in genes.items():
                    try:
                        coords = pos.split(',')
                        add_ann = str(data[4]) if len(data) > 4 else ''
                        orig_ann = str(data[5]) if len(data) > 5 else ''
                        entry = (
                            f"{contig}\t{ref}\t{data[2]}\t{coords[0]}\t{coords[-1]}\t.\t{data[0]}\t.\t"
                            f"ID=Original_Annotation={orig_ann};Additional_Annotation={add_ann};Coverage={data[1]}\n")
                    except (AttributeError, IndexError, KeyError, TypeError) as e:
                        # Log the malformed record and carry on with the rest
                        logging.warning('Skipping bad GFF entry for contig %s pos %s: %s', contig, pos, e)
                        continue
                    write_out.write(entry)
                    entries_written += 1

            write_out.flush()
            logging.info('Wrote %d GFF entries to %s', entries_written, output_file)
    except OSError as e:
        logging.error("Cannot write to output file %s: %s", output_file, e)
        sys.exit(1)
111
+
112
+
113
def _get_opt(options, *names):
    """Return the first usable option value among several attribute names.

    ``names`` are tried in order on ``options``. An attribute that is missing
    or set to ``None`` is skipped, so an alias that actually carries a value
    is found even when an earlier name exists but is unset. Falsy values other
    than ``None`` (``''``, ``0``, ``False``) are returned as-is.

    Returns ``None`` when no name yields a value.
    """
    for name in names:
        value = getattr(options, name, None)
        if value is not None:
            return value
    return None
119
+
120
+
121
def _parse_pos(pos_str):
    """Parse a ``"start,stop"`` position key into a pair of integers.

    Returns ``(start, stop)`` on success. Returns ``(None, None)`` when
    ``pos_str`` is not a string, does not contain exactly two comma-separated
    fields, or a field is not an integer.
    """
    try:
        start_text, stop_text = pos_str.split(',')
        return int(start_text), int(stop_text)
    except (AttributeError, TypeError, ValueError):
        # AttributeError: no .split(); ValueError: wrong field count or non-integer text
        return None, None
127
+
128
+
129
def _write_discordance_report(report_path, entries):
    """Write a tab-separated summary of discordance entries to ``report_path``.

    The report is a summary only (the discordance GFFs keep the full detail):
    total entry count, number of distinct contigs, mean coverage, mean of the
    non-zero coverages, a count per status and per-contig counts for the 50
    most frequent contigs.

    Args:
        report_path: destination file; ``~`` is expanded and missing parent
            directories are created.
        entries: iterable of dicts read through the keys ``status``,
            ``contig`` and ``coverage``; ``None`` is treated as empty.

    Exits the process with status 1 if the file cannot be written.
    """
    from collections import Counter

    report_path = os.path.expanduser(report_path)
    out_dir = os.path.dirname(report_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    records = list(entries) if entries is not None else []
    status_counts = Counter()
    contig_counts = Counter()
    cov_values = []
    for record in records:
        status_counts[str(record.get('status', 'unknown'))] += 1
        contig = str(record.get('contig', ''))
        if contig:
            contig_counts[contig] += 1
        # A missing or empty coverage counts as 0; anything that cannot be
        # read as a number is left out of the averages.
        raw_cov = record.get('coverage', 0)
        if raw_cov == '':
            raw_cov = 0
        try:
            cov_values.append(float(raw_cov))
        except (TypeError, ValueError):
            continue

    avg_cov = (sum(cov_values) / len(cov_values)) if cov_values else 0.0
    nonzero_covs = [v for v in cov_values if v != 0]
    nonzero_avg_cov = (sum(nonzero_covs) / len(nonzero_covs)) if nonzero_covs else 0.0

    try:
        with open(report_path, 'w', encoding='utf-8') as fh:
            fh.write('metric\tvalue\n')
            fh.write(f'total_entries\t{len(records)}\n')
            fh.write(f'unique_contigs_with_discordance\t{len(contig_counts)}\n')
            fh.write(f'average_coverage_reported\t{avg_cov:.2f}\n')
            fh.write(f'non-zero_average_coverage_reported\t{nonzero_avg_cov:.2f}\n')
            # statuses
            for status, cnt in status_counts.most_common():
                fh.write(f'status::{status}\t{cnt}\n')
            # per-contig counts (only top 50 to avoid huge files)
            fh.write('# per-contig discordance counts (top 50)\n')
            for contig, cnt in contig_counts.most_common(50):
                fh.write(f'contig::{contig}\t{cnt}\n')
        logging.info('Wrote discordance summary report: %s', report_path)
    except OSError as e:
        logging.error('Cannot write discordance report %s: %s', report_path, e)
        sys.exit(1)
179
+
180
+
181
def _write_discordance_gff(report_path, entries, reference_annotation_basename):
    """Write a list of discordance entries (dicts) to a GFF file.

    Each entry becomes one line. Reference coordinates (``ref_pos``) are
    preferred; when absent the additional coordinates (``add_pos``) are used,
    falling back to ``0,0``. Score, strand and phase are written as ``.``.
    The attribute column carries ``Status``, ``Coverage`` and, when present,
    ``Ref_info`` / ``Add_info`` with ``;`` escaped as ``%3B``.

    Args:
        report_path: destination file; ``~`` is expanded and missing parent
            directories are created.
        entries: iterable of dicts with the keys ``contig``, ``ref_pos``,
            ``add_pos``, ``ref_type``, ``add_type``, ``status``, ``coverage``,
            ``ref_info`` and ``add_info``; ``None`` is treated as empty.
        reference_annotation_basename: reference name; the part before the
            first ``_`` is used as the GFF source for reference-based lines.

    Malformed entries are logged and skipped. Exits the process with status 1
    if the file cannot be written.
    """
    report_path = os.path.expanduser(report_path)
    out_dir = os.path.dirname(report_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    try:
        with open(report_path, 'w', encoding='utf-8') as fh:
            fh.write('##gff-version\t3\n')
            fh.write('#\tAnnotation-Intersector discordance report\n')
            fh.write('#\tRun Date:' + str(date.today()) + '\n')
            fh.write('##Original File: ' + reference_annotation_basename + '\n')
            ref_source = reference_annotation_basename.split('_')[0] or 'reference'
            entries_written = 0
            for record in (entries or []):
                # Only the formatting is guarded: the write itself happens
                # outside this handler so I/O failures reach the OSError
                # handler below instead of being dropped per entry.
                try:
                    contig = str(record.get('contig', '.'))
                    ref_pos = record.get('ref_pos', '')
                    add_pos = record.get('add_pos', '')
                    if ref_pos:
                        # prefer reference coords if present
                        start, stop = ref_pos.split(',')
                        ftype = record.get('ref_type', '') or 'CDS'
                        source = ref_source
                    else:
                        # No ref pos, use add_pos coords
                        start, stop = add_pos.split(',') if add_pos else ('0', '0')
                        ftype = record.get('add_type', '') or 'CDS'
                        source = record.get('add_type', '') or 'additional'
                    attrs = [
                        'Status=' + str(record.get('status', '')),
                        'Coverage=' + str(record.get('coverage', '')),
                    ]
                    ref_info = record.get('ref_info', '')
                    if ref_info:
                        attrs.append('Ref_info=' + str(ref_info).replace(';', '%3B'))
                    add_info = record.get('add_info', '')
                    if add_info:
                        attrs.append('Add_info=' + str(add_info).replace(';', '%3B'))
                    line = f"{contig}\t{source}\t{ftype}\t{start}\t{stop}\t.\t.\t.\t{';'.join(attrs)}\n"
                except (AttributeError, TypeError, ValueError) as err:
                    logging.warning('Skipping bad discordance entry %r: %s', record, err)
                    continue
                fh.write(line)
                entries_written += 1
        logging.info('Wrote %d discordance GFF entries to %s', entries_written, report_path)
    except OSError as e:
        logging.error('Cannot write discordance GFF %s: %s', report_path, e)
        sys.exit(1)
231
+
232
+
233
def compute_discordance(ref_map_by_contig, add_map_by_contig, options):
    """Compare reference and additional maps per contig and return three lists:
    - only_in_ref: reference entries with no overlapping additional ORF
    - only_in_additional: additional ORFs that don't overlap any reference entry
    - mismatches: reference entries with overlapping additional ORFs that don't meet match criteria

    A reference entry is matched when some overlapping additional ORF has the
    same type and strand, covers at least ``options.coverage`` percent of the
    reference (default 100.0) and has a stop coordinate at a distance divisible
    by 3 from the reference stop. Otherwise the best-covering overlapping ORF
    is used to classify the mismatch (coverage, type, strand, frame).

    Expected layouts:
    - ref entry parsed from GFF: [strand, 'ref', type, info]
    - any other ref entry: type at index 3, info as last element
      (presumably the tool-parser layout; verify against the tool modules)
    - add entry: [strand, ..., type (index 3), info (last element)]

    Each returned item is a dict with the keys contig, ref_pos, add_pos,
    ref_type, add_type, status, coverage, ref_info and add_info. Position keys
    that ``_parse_pos`` cannot read are ignored.
    """
    only_in_ref = []
    only_in_additional = []
    mismatches = []

    all_contigs = list(OrderedDict.fromkeys(list(ref_map_by_contig.keys()) + list(add_map_by_contig.keys())))
    matched_adds = set()
    cov_thresh = getattr(options, 'coverage', 100.0)

    for contig in all_contigs:
        ref_map = ref_map_by_contig.get(contig, {}) or {}
        add_map = add_map_by_contig.get(contig, {}) or {}

        # Parse every additional position once per contig instead of once per
        # reference feature.
        additions = []
        for apos, adata in add_map.items():
            astart, astop = _parse_pos(apos)
            if astart is not None:
                additions.append((apos, astart, astop, adata))

        ref_spans = []  # parsed reference intervals, reused for the additional-only pass

        # For each reference feature, find best overlapping additional ORF and classify
        for rpos, rdata in ref_map.items():
            rstart, rstop = _parse_pos(rpos)
            if rstart is None:
                continue
            ref_spans.append((rstart, rstop))
            # Plain interval length; a start coordinate of 0 is a valid position.
            rlen = rstop - rstart + 1

            # reference fields
            r_strand = rdata[0] if len(rdata) > 0 else ''
            if len(rdata) > 2 and rdata[1] == 'ref':
                # GFF-derived layout: the type sits at index 2, index 3 is the info
                r_type = rdata[2]
            elif len(rdata) > 3:
                r_type = rdata[3]
            else:
                r_type = ''
            r_info = rdata[-1] if len(rdata) > 3 else ''

            best_cov = 0.0
            best = None  # (apos, astop, adata) of the highest-coverage overlap
            matched = False

            for apos, astart, astop, adata in additions:
                ov = min(rstop, astop) - max(rstart, astart) + 1
                if ov <= 0:
                    continue
                cov = 100.0 * float(ov) / float(rlen) if rlen > 0 else 0.0
                if cov > best_cov:
                    best_cov = cov
                    best = (apos, astop, adata)

                # additional fields
                a_strand = adata[0] if len(adata) > 0 else ''
                a_type = adata[3] if len(adata) > 3 else ''
                # frame check (distance of stops mod 3)
                frame_ok = (abs(astop - rstop) % 3) == 0

                # check for a fully satisfactory match: type, coverage, strand and frame
                if a_type == r_type and cov >= cov_thresh and a_strand == r_strand and frame_ok:
                    matched = True
                    matched_adds.add((contig, apos))
                    break

            if matched:
                # good match -> not discordant
                continue

            if best is None:
                # no overlapping additional ORF found
                only_in_ref.append({
                    'contig': contig,
                    'ref_pos': rpos,
                    'add_pos': '',
                    'ref_type': r_type,
                    'add_type': '',
                    'status': 'only_in_ref',
                    'coverage': '0.00',
                    'ref_info': r_info,
                    'add_info': ''
                })
                continue

            # overlapping additional ORF(s) exist but none satisfied the match criteria
            best_add, best_astop, best_add_data = best
            a_type = best_add_data[3] if len(best_add_data) > 3 else ''
            a_info = best_add_data[-1] if len(best_add_data) > 0 else ''
            a_strand = best_add_data[0] if len(best_add_data) > 0 else ''

            # compute reason flags against the best-covering ORF
            type_match = (a_type == r_type)
            strand_match = (a_strand == r_strand)
            cov_ok = (best_cov >= cov_thresh)
            frame_ok = (abs(best_astop - rstop) % 3) == 0

            # classify mismatch with strand-awareness
            if not cov_ok:
                status = 'found_in_additional_but_below_coverage'
            elif not type_match and not strand_match:
                status = 'found_in_additional_different_type_and_strand'
            elif not type_match:
                status = 'found_in_additional_different_type'
            elif not strand_match:
                status = 'found_in_additional_different_strand'
            elif not frame_ok:
                status = 'found_in_additional_different_frame'
            else:
                status = 'partial_overlap'

            mismatches.append({
                'contig': contig,
                'ref_pos': rpos,
                'add_pos': best_add or '',
                'ref_type': r_type,
                'add_type': a_type,
                'status': status,
                'coverage': f"{best_cov:.2f}",
                'ref_info': r_info,
                'add_info': a_info,
            })

            if best_add:
                matched_adds.add((contig, best_add))

        # Additional-only ORFs: those not matched and not overlapping any reference
        for apos, astart, astop, adata in additions:
            if (contig, apos) in matched_adds:
                continue
            if any(max(rstart, astart) <= min(rstop, astop) for rstart, rstop in ref_spans):
                continue
            only_in_additional.append({
                'contig': contig,
                'ref_pos': '',
                'add_pos': apos,
                'ref_type': '',
                'add_type': adata[3] if len(adata) > 3 else '',
                'status': 'only_in_additional',
                'coverage': '0.00',
                'ref_info': '',
                'add_info': adata[-1] if len(adata) > 0 else '',
            })

    return only_in_ref, only_in_additional, mismatches
394
+
395
+
396
def comparator(options):
    """Intersect a reference annotation with an additional annotation (multi-contig aware).

    Reads these attributes of ``options``: ``genome_DNA``/``genome_dna``
    (optional), ``reference_annotation``, ``reference_tool`` (optional),
    ``gene_ident`` (optional), ``additional_tool``, ``additional_annotation``,
    ``coverage``, ``output_file``, ``report_discordance`` (optional) and
    ``report_discordance_file`` (optional).

    Workflow:
      1. Load the genome FASTA when given (plain or gzip).
      2. Load reference genes, either with the named tool parser or by
         parsing the reference as GFF into ``[strand, 'ref', type, info]``.
      3. Load the additional ORFs with the additional tool parser.
      4. Keep reference genes that an additional ORF matches: exact
         coordinates when ``coverage`` is 100, otherwise same strand, stop
         distance divisible by 3 and coverage at or above the threshold; the
         feature types must agree in both modes.
      5. Optionally write discordance GFFs plus a summary TSV, then write the
         kept genes through ``gff_writer``.

    Exits the process when the genome file is missing or a tool parser cannot
    be imported or fails on the reference annotation.
    """
    dna_regions = {}

    # Support both 'genome_DNA' and 'genome_dna' option names (compat with Annotation_Compare)
    genome_path = _get_opt(options, 'genome_DNA', 'genome_dna')

    # Load genome fasta if provided
    if genome_path:
        if not os.path.exists(genome_path):
            logging.error('Genome DNA file does not exist: %s', genome_path)
            sys.exit(1)
        with _open_annotation_text(genome_path) as fasta_in:
            dna_regions = fasta_load(fasta_in)
        if isinstance(dna_regions, dict) and len(dna_regions) > 0:
            genome_ID = next(iter(dna_regions))
        else:
            genome_ID = os.path.splitext(os.path.basename(genome_path))[0]
    else:
        # derive genome_ID from reference annotation filename
        genome_ID = os.path.splitext(os.path.basename(options.reference_annotation))[0]

    def default_contig():
        # Key used to wrap single-contig parser output: genome_ID when it is a
        # known region, else the first known region, else genome_ID itself.
        if genome_ID in dna_regions:
            return genome_ID
        return next(iter(dna_regions)) if dna_regions else genome_ID

    def is_multi_contig(parsed):
        return isinstance(parsed, dict) and any(isinstance(v, dict) for v in parsed.values())

    def count_entries(mapping):
        # Diagnostic only: never let an odd parser result break the run.
        try:
            return sum(len(v) for v in mapping.values()) if mapping else 0
        except (AttributeError, TypeError):
            return 0

    # ref_genes_by_contig: contig -> {pos -> entry}
    ref_genes_by_contig = OrderedDict()
    reference_tool = getattr(options, 'reference_tool', None)

    if reference_tool:
        reference_tool_fn = _load_tool_parser(reference_tool)
        if reference_tool_fn is None:
            logging.error('Reference tool module not available: %s', reference_tool)
            sys.exit(1)
        try:
            # First pass: many tools return a contig->dict mapping. Make sure
            # dna_regions contains those contig keys before the real parse.
            ref_out = reference_tool_fn(options.reference_annotation, dna_regions)
            if isinstance(ref_out, dict):
                for contig_key in ref_out.keys():
                    if contig_key not in dna_regions:
                        dna_regions[contig_key] = ['']
        except Exception as e:
            logging.error('Failed to load reference annotation with tool %s: %s', reference_tool, e)
            sys.exit(1)
        # Second pass with the completed dna_regions (kept from the original
        # flow; presumably parsers pre-populate per-contig containers from it).
        ref_out = reference_tool_fn(options.reference_annotation, dna_regions)
        if is_multi_contig(ref_out):
            ref_genes_by_contig = ref_out
        else:
            # single-contig output: place under genome_ID or first contig in dna_regions
            ref_genes_by_contig[default_contig()] = ref_out
    else:
        with _open_annotation_text(options.reference_annotation) as gff_in:
            dna_regions = gff_load(options, gff_in, dna_regions)

        # Requested feature types, resolved once (default: CDS)
        gene_ident = getattr(options, 'gene_ident', None)
        gene_types = []
        if isinstance(gene_ident, str):
            gene_types = [t.strip() for t in gene_ident.split(',') if t.strip()]
        if not gene_types:
            gene_types = ['CDS']

        # Parse GFF and group by seqid (first column). The same opener as for
        # gff_load is used so a gzipped reference is read correctly here too.
        with _open_annotation_text(options.reference_annotation) as genome_gff:
            for line in genome_gff:
                line = line.rstrip('\n')
                if not line or line.startswith('#'):
                    continue
                parts = line.split('\t')
                if len(parts) < 9:
                    continue
                seqid, ftype = parts[0], parts[2]
                if ftype not in gene_types:
                    continue
                try:
                    start = int(parts[3])
                    stop = int(parts[4])
                except ValueError:
                    continue
                strand, info = parts[6], parts[8]
                ref_genes_by_contig.setdefault(seqid, OrderedDict())[f"{start},{stop}"] = [strand, 'ref', ftype, info]

    # Get additional ORFs using tool parser; expect mapping contig->dict
    additional_tool_fn = _load_tool_parser(options.additional_tool)
    if additional_tool_fn is None:
        sys.exit("Tool not available")
    additional_orfs = additional_tool_fn(options.additional_annotation, dna_regions)

    # Normalise additional_orfs: if single-contig dict, wrap under appropriate contig key
    if is_multi_contig(additional_orfs):
        additional_by_contig = additional_orfs
    else:
        additional_by_contig = {default_contig(): additional_orfs}

    genes_To_Keep_by_contig = OrderedDict()
    cov_thresh = options.coverage
    exact_only = (cov_thresh == 100.00)

    # Iterate per contig and perform intersection logic
    for contig, orfs in additional_by_contig.items():
        ref_genes = ref_genes_by_contig.get(contig, OrderedDict())
        kept = OrderedDict()

        sorted_refs = []
        if not exact_only:
            # Parse reference coordinates once and sort them by start so the
            # early `break` in the scan below is valid whatever the order of
            # the input file.
            for gene, r_data in ref_genes.items():
                try:
                    fields = gene.split(',')
                    g_Start, g_Stop = int(fields[0]), int(fields[1])
                except (AttributeError, IndexError, TypeError, ValueError):
                    continue
                if g_Stop - g_Start + 1 <= 0:
                    continue
                sorted_refs.append((g_Start, g_Stop, r_data))
            sorted_refs.sort(key=lambda ref: (ref[0], ref[1]))

        for orf, data in orfs.items():
            try:
                fields = orf.split(',')
                o_Start, o_Stop = int(fields[0]), int(fields[1])
            except (AttributeError, IndexError, TypeError, ValueError):
                continue
            o_Strand = data[0]
            additional_type = data[3]
            additional_info = data[-1]

            if exact_only:
                # Lookup exact-match reference entry safely
                key = f"{o_Start},{o_Stop}"
                ref_entry = ref_genes.get(key)
                if not ref_entry:
                    continue
                ref_type = _reference_entry_type(ref_entry)
                ref_info = ref_entry[-1] if len(ref_entry) > 3 else ''
                if additional_type == ref_type and o_Strand == ref_entry[0]:
                    kept[key] = [o_Strand, options.coverage, additional_type, ref_type, additional_info, ref_info]
                continue

            for g_Start, g_Stop, r_data in sorted_refs:
                # references are sorted by start: nothing further can overlap
                if g_Start > o_Stop:
                    break
                # skip genes that end before this ORF
                if g_Stop < o_Start:
                    continue

                # compute overlap length without creating large sets
                overlap = max(0, min(o_Stop, g_Stop) - max(o_Start, g_Start) + 1)
                cov = 100.0 * overlap / (g_Stop - g_Start + 1)

                g_Strand = r_data[0]
                ref_type = _reference_entry_type(r_data)
                ref_info = r_data[-1] if len(r_data) > 3 else ''

                in_frame = abs(o_Stop - g_Stop) % 3 == 0
                if in_frame and o_Strand == g_Strand and cov >= cov_thresh and additional_type == ref_type:
                    kept[f"{g_Start},{g_Stop}"] = [g_Strand, int(cov), additional_type, ref_type,
                                                   additional_info, ref_info]

        genes_To_Keep_by_contig[contig] = sortORFs(kept)

    # Log counts for debugging why GFF might be empty
    total_kept = count_entries(genes_To_Keep_by_contig)
    logging.info('Reference genes loaded: %d', count_entries(ref_genes_by_contig))
    logging.info('Additional ORFs loaded: %d', count_entries(additional_by_contig))
    logging.info('Kept genes after intersection: %d', total_kept)

    # If requested, compute discordance lists and write three GFF outputs
    if getattr(options, 'report_discordance', False):
        only_in_ref, only_in_additional, mismatches = compute_discordance(ref_genes_by_contig, additional_by_contig, options)

        # Honour the optional base path for the reports; otherwise derive the
        # location and name from the main output file as before.
        report_target = getattr(options, 'report_discordance_file', None)
        output_file = getattr(options, 'output_file', None)
        if report_target:
            outdir, base = os.path.split(report_target)
            base = base or 'discordance'
        elif output_file:
            outdir = os.path.dirname(output_file)
            base = os.path.splitext(os.path.basename(output_file))[0]
        else:
            outdir, base = '.', 'discordance'
        ref_base = os.path.splitext(os.path.basename(options.reference_annotation))[0]

        # Keep the three detailed GFF outputs (backward compatible)
        detailed_reports = (
            ('only_in_reference', only_in_ref),
            ('only_in_additional', only_in_additional),
            ('mismatches', mismatches),
        )
        for suffix, report_entries in detailed_reports:
            gff_path = os.path.join(outdir, f"{base}.{suffix}.gff")
            try:
                _write_discordance_gff(gff_path, report_entries, ref_base)
                logging.info('Wrote discordance GFF: %s', gff_path)
            except Exception:
                logging.exception('Failed to write discordance GFF: %s', gff_path)

        # Write a single concise summary TSV aggregating all discordance entries
        combined = []
        combined.extend(only_in_ref or [])
        combined.extend(mismatches or [])
        combined.extend(only_in_additional or [])
        combined_tsv = os.path.join(outdir, f"{base}.discordance_summary.tsv")
        try:
            _write_discordance_report(combined_tsv, combined)
            logging.info('Wrote discordance summary TSV: %s', combined_tsv)
        except Exception:
            logging.exception('Failed to write discordance summary TSV: %s', combined_tsv)

    # Always write the kept-genes GFF (header + entries, or the fallback)
    genome_DNA_path = genome_path if genome_path else None
    try:
        logging.info('About to call gff_writer: total_kept=%d', total_kept)
        try:
            contig_summary = {c: len(v) for c, v in genes_To_Keep_by_contig.items()}
        except TypeError:
            contig_summary = {}
        logging.info('Kept genes by contig (sample): %s', dict(list(contig_summary.items())[:10]))
        logging.info('Writing combined GFF to %s', options.output_file)
        gff_writer(genome_ID, genome_DNA_path, options.reference_annotation, reference_tool, None, options.additional_annotation, options.additional_tool, genes_To_Keep_by_contig, options.output_file, getattr(options, 'gene_ident', None))
        logging.info('gff_writer finished (check output file)')
    except Exception as e:
        logging.exception('Failed to write combined GFF: %s', e)


def _open_annotation_text(path):
    """Open ``path`` for text reading, detecting gzip by its two magic bytes.

    Gzip input is opened with ``gzip.open(path, 'rt')``; anything else is
    opened as plain text with the ``unicode_escape`` encoding.
    """
    with open(path, 'rb') as probe:
        is_gzip = probe.read(2) == b'\x1f\x8b'
    if is_gzip:
        return gzip.open(path, 'rt')
    return open(path, 'r', encoding='unicode_escape')


def _load_tool_parser(tool_name):
    """Import a tool parser and return its function, or ``None`` if no module is found.

    Tries ``Tools.<name>.<name>`` first, then ``ORForise.Tools.<name>.<name>``;
    the parser is the module attribute named like the tool.
    """
    for module_path in ('Tools.' + tool_name + '.' + tool_name,
                        'ORForise.Tools.' + tool_name + '.' + tool_name):
        try:
            module = import_module(module_path)
        except ModuleNotFoundError:
            continue
        return getattr(module, tool_name)
    return None


def _reference_entry_type(entry):
    """Return the feature type stored in a reference entry.

    GFF-derived entries are ``[strand, 'ref', type, info]`` (type at index 2);
    for any other layout the type is read from index 3, the same position used
    for additional ORFs. Returns ``''`` when the entry is too short.
    """
    if len(entry) > 2 and entry[1] == 'ref':
        return entry[2]
    return entry[3] if len(entry) > 3 else ''
688
+
689
+
690
def main():
    """Command-line entry point for the Annotation-Intersector.

    Prints the welcome banner, parses the command-line arguments, rejects a
    coverage threshold outside 0-100 with a usage error, and then runs
    ``comparator`` with the parsed options.
    """
    print(WELCOME)

    parser = argparse.ArgumentParser(description='ORForise ' + ORForise_Version + ': Annotation-Intersector Run Parameters')

    required = parser.add_argument_group('Required Arguments')
    required.add_argument('-ref', dest='reference_annotation', required=True,
                          help='Reference annotation GFF file')
    required.add_argument('-at', dest='additional_tool', required=True,
                          help='Tool name/format for additional annotation (module under Tools/)')
    required.add_argument('-add', dest='additional_annotation', required=True,
                          help='Additional annotation file to compare')
    required.add_argument('-o', dest='output_file', required=True,
                          help='Output GFF filename for kept genes')

    optional = parser.add_argument_group('Optional Arguments')
    optional.add_argument('-dna', dest='genome_DNA',
                          help='Genome DNA file (.fa) which both annotations are based on')
    optional.add_argument('-rt', dest='reference_tool',
                          help='Reference tool parser name (if not provided, GFF is expected)')
    optional.add_argument('-gi', dest='gene_ident', default='CDS',
                          help='Comma-separated feature types to consider from reference (default: CDS)')
    optional.add_argument('-cov', '--coverage', dest='coverage', default=100.0, type=float,
                          help='Percentage coverage threshold for intersection (default 100)')
    optional.add_argument('--report-discordance', dest='report_discordance', action='store_true',
                          help='If set, produce discordance reports (three GFFs)')
    optional.add_argument('--report-discordance-file', dest='report_discordance_file',
                          help='Optional base path for discordance reports')

    options = parser.parse_args()

    # Coverage is a percentage: a value above 100 can never be met and a
    # negative or NaN value makes the threshold meaningless, so fail early
    # with a usage message instead of producing a silently empty result.
    if not 0.0 <= options.coverage <= 100.0:
        parser.error('-cov/--coverage must be between 0 and 100 (got %s)' % options.coverage)

    comparator(options)
721
+
722
+
723
# Script entry point: run the command-line interface, then print a completion
# marker once main() has returned.
if __name__ == '__main__':
    main()
    print('Complete')
726
+