ORForise 1.6.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73) hide show
  1. ORForise/Aggregate_Compare.py +378 -0
  2. ORForise/Annotation_Compare.py +317 -0
  3. ORForise/Annotation_Intersector.py +726 -0
  4. ORForise/Aux/StORF_Undetected/Completely_Undetected/Completey_Undetected.py +53 -0
  5. ORForise/Aux/StORF_Undetected/Completely_Undetected/__init__.py +0 -0
  6. ORForise/Aux/StORF_Undetected/StORF_Undetected.py +35 -0
  7. ORForise/Aux/StORF_Undetected/__init__.py +0 -0
  8. ORForise/Aux/StORF_Undetected/unvitiated_Genes/__init__.py +0 -0
  9. ORForise/Aux/StORF_Undetected/unvitiated_Genes/unvitiated_Missed_Genes.py +46 -0
  10. ORForise/Aux/TabToGFF/TabToGFF.py +140 -0
  11. ORForise/Aux/TabToGFF/__init__.py +0 -0
  12. ORForise/Aux/__init__.py +0 -0
  13. ORForise/Comparator.py +882 -0
  14. ORForise/Convert_To_GFF.py +141 -0
  15. ORForise/GFF_Adder.py +543 -0
  16. ORForise/List_Tools.py +56 -0
  17. ORForise/ORForise_Analysis/__init__.py +0 -0
  18. ORForise/ORForise_Analysis/cds_checker.py +77 -0
  19. ORForise/ORForise_Analysis/gene_Lenghts.py +28 -0
  20. ORForise/ORForise_Analysis/genome_Metrics.py +258 -0
  21. ORForise/ORForise_Analysis/hypothetical_gene_predictions.py +88 -0
  22. ORForise/ORForise_Analysis/missed_Gene_Metrics.py +277 -0
  23. ORForise/ORForise_Analysis/parital_Match_Analysis.py +230 -0
  24. ORForise/ORForise_Analysis/result_File_Analysis.py +286 -0
  25. ORForise/ORForise_Analysis/start_Codon_Substitution.py +161 -0
  26. ORForise/StORForise.py +115 -0
  27. ORForise/Tools/Augustus/Augustus.py +54 -0
  28. ORForise/Tools/Augustus/__init__.py +0 -0
  29. ORForise/Tools/Balrog/Balrog.py +56 -0
  30. ORForise/Tools/Balrog/__init__.py +0 -0
  31. ORForise/Tools/EasyGene/EasyGene.py +55 -0
  32. ORForise/Tools/EasyGene/__init__.py +0 -0
  33. ORForise/Tools/FGENESB/FGENESB.py +57 -0
  34. ORForise/Tools/FGENESB/__init__.py +0 -0
  35. ORForise/Tools/FragGeneScan/FragGeneScan.py +54 -0
  36. ORForise/Tools/FragGeneScan/__init__.py +0 -0
  37. ORForise/Tools/GFF/GFF.py +77 -0
  38. ORForise/Tools/GFF/__init__.py +0 -0
  39. ORForise/Tools/GLIMMER3/GLIMMER3.py +59 -0
  40. ORForise/Tools/GLIMMER3/__init__.py +0 -0
  41. ORForise/Tools/GeneMark/GeneMark.py +135 -0
  42. ORForise/Tools/GeneMark/__init__.py +0 -0
  43. ORForise/Tools/GeneMarkHA/GeneMarkHA.py +54 -0
  44. ORForise/Tools/GeneMarkHA/__init__.py +0 -0
  45. ORForise/Tools/GeneMarkHMM/GeneMarkHMM.py +55 -0
  46. ORForise/Tools/GeneMarkHMM/__init__.py +0 -0
  47. ORForise/Tools/GeneMarkS/GeneMarkS.py +54 -0
  48. ORForise/Tools/GeneMarkS/__init__.py +0 -0
  49. ORForise/Tools/GeneMarkS2/GeneMarkS2.py +55 -0
  50. ORForise/Tools/GeneMarkS2/__init__.py +0 -0
  51. ORForise/Tools/MetaGene/MetaGene.py +54 -0
  52. ORForise/Tools/MetaGene/__init__.py +0 -0
  53. ORForise/Tools/MetaGeneAnnotator/MetaGeneAnnotator.py +55 -0
  54. ORForise/Tools/MetaGeneAnnotator/__init__.py +0 -0
  55. ORForise/Tools/MetaGeneMark/MetaGeneMark.py +55 -0
  56. ORForise/Tools/MetaGeneMark/__init__.py +0 -0
  57. ORForise/Tools/Prodigal/Prodigal.py +55 -0
  58. ORForise/Tools/Prodigal/__init__.py +0 -0
  59. ORForise/Tools/Prokka/Prokka.py +57 -0
  60. ORForise/Tools/Prokka/__init__.py +0 -0
  61. ORForise/Tools/StORF-Reporter/StORF-Reporter.py +56 -0
  62. ORForise/Tools/StORF-Reporter/__init__.py +0 -0
  63. ORForise/Tools/TransDecoder/TransDecoder.py +54 -0
  64. ORForise/Tools/TransDecoder/__init__.py +0 -0
  65. ORForise/Tools/__init__.py +0 -0
  66. ORForise/__init__.py +0 -0
  67. ORForise/utils.py +236 -0
  68. orforise-1.6.2.dist-info/METADATA +1038 -0
  69. orforise-1.6.2.dist-info/RECORD +73 -0
  70. orforise-1.6.2.dist-info/WHEEL +5 -0
  71. orforise-1.6.2.dist-info/entry_points.txt +15 -0
  72. orforise-1.6.2.dist-info/licenses/LICENSE +624 -0
  73. orforise-1.6.2.dist-info/top_level.txt +1 -0
ORForise/GFF_Adder.py ADDED
@@ -0,0 +1,543 @@
1
+ from importlib import import_module
2
+ import argparse
3
+ from collections import OrderedDict, defaultdict, Counter
4
+ import gzip
5
+ from datetime import date
6
+ import sys
7
+
8
+ try:
9
+ from .utils import *
10
+ except (ImportError, ModuleNotFoundError):
11
+ from utils import *
12
+
13
+
14
+ ########################################
15
def _additional_info_for(data):
    """Return the ';'-joined, de-duplicated additional info for one combined entry.

    ``data[4]`` (when present) holds strings of the form ``'ToolName:info'`` or
    plain ``'info'``; only the info part is kept.  When no usable info is found
    the entry's own info field ``data[3]`` is used as a fallback.
    """
    items = []
    if len(data) > 4 and data[4]:
        for add in data[4]:
            text = str(add).strip()
            if not text:
                continue
            info_part = text.split(':', 1)[1].strip() if ':' in text else text
            if info_part:
                items.append(info_part)
    # dict.fromkeys de-duplicates while preserving first-seen order.
    deduped = list(dict.fromkeys(items))
    if deduped:
        return ';'.join(deduped)
    if len(data) > 3 and data[3]:
        return str(data[3]).strip()
    return ''


def _matched_tools_for(data, tools):
    """Return the sorted, unique tool names that contributed to one combined entry.

    Tool names are taken from the ``'ToolName:info'`` prefixes in ``data[4]``.
    Entries without a prefix are searched for any of the known ``tools`` names;
    when ``data[4]`` is empty every field of ``data`` is searched instead.
    """
    matched = set()
    try:
        if len(data) > 4 and data[4]:
            for add in data[4]:
                if isinstance(add, str) and ':' in add:
                    name = add.split(':', 1)[0].strip()
                    if name:
                        matched.add(name)
                else:
                    text = str(add)
                    matched.update(tool for tool in tools if tool and tool in text)
        else:
            matched.update(
                tool for tool in tools
                if tool and any(tool in str(field) for field in data))
    except TypeError:
        # Best effort: an entry with an unexpected shape simply has no matches.
        return []
    return sorted(matched)


def _write_meta_report(meta_path, options, reference_annotation, additional_annotation,
                       ref_only, ref_combined, non_ref_combined,
                       per_tool_total, per_contig_per_tool,
                       ref_per_tool_total, ref_per_contig_per_tool):
    """Write the plain-text GFF-Adder metadata report to ``meta_path``."""

    def write_distribution(out, title, distribution):
        out.write(title)
        if distribution:
            for k in sorted(distribution):
                out.write(f" {k:>3} {distribution[k]}\n")
        else:
            out.write(" None\n")
        out.write("\n")

    def write_tool_totals(out, title, rule, totals):
        out.write(title)
        out.write(rule)
        if totals:
            for t, c in totals.most_common():
                out.write(f" {t}: {c}\n")
        else:
            out.write(" None\n")
        out.write("\n")

    def write_contig_breakdown(out, title, rule, by_contig):
        out.write(title)
        out.write(rule)
        if by_contig:
            for contig in sorted(by_contig):
                counter = by_contig[contig]
                if sum(counter.values()) == 0:
                    continue
                out.write(f" {contig}:\n")
                for t, c in counter.most_common():
                    out.write(f" {t}: {c}\n")
                out.write("\n")
        else:
            out.write(" None\n\n")

    total_ref_combined = sum(ref_combined.values())
    total_nonref = sum(non_ref_combined.values())
    total_reference_genes = ref_only + total_ref_combined

    with open(meta_path, 'w') as meta_out:
        meta_out.write("GFF-Adder Metadata Report\n")
        meta_out.write("=========================\n")
        meta_out.write("Run Date: {}\n".format(date.today()))
        meta_out.write("Genome DNA: {}\n".format(getattr(options, 'genome_DNA', 'N/A')))
        meta_out.write("Reference annotation: {}\n".format(reference_annotation))
        meta_out.write("Additional annotation(s): {}\n\n".format(additional_annotation))

        meta_out.write("Summary counts\n")
        meta_out.write("--------------\n")
        meta_out.write(f"Reference-only genes (no matching additional annotation): {ref_only}\n")
        meta_out.write(f"Reference genes with additional matches (combined): {total_ref_combined}\n")
        meta_out.write(f"TOTAL reference genes observed: {total_reference_genes}\n")
        meta_out.write(f"Additional-only genes (not present in reference): {total_nonref}\n\n")

        write_distribution(
            meta_out,
            "Distribution of matches for reference-combined entries (num matched tools -> count)\n",
            ref_combined)
        write_distribution(
            meta_out,
            "Distribution of matches for non-reference entries (num matched tools -> count)\n",
            non_ref_combined)

        write_tool_totals(
            meta_out,
            "Per-tool additional-only annotation totals\n",
            "-----------------------------------------\n",
            per_tool_total)
        write_contig_breakdown(
            meta_out,
            "Per-contig breakdown for additional-only annotations (only contigs with additions shown)\n",
            "---------------------------------------------------------------------------------------\n",
            per_contig_per_tool)

        write_tool_totals(
            meta_out,
            "Per-tool totals for reference genes matched by additional tools\n",
            "---------------------------------------------------------------\n",
            ref_per_tool_total)
        write_contig_breakdown(
            meta_out,
            "Per-contig breakdown for reference genes matched by additional tools\n",
            "--------------------------------------------------------------------\n",
            ref_per_contig_per_tool)

        meta_out.write("Notes\n")
        meta_out.write("-----\n")
        meta_out.write(" - 'Reference-only' are reference entries that had no recorded additional annotation information.\n")
        meta_out.write(" - 'Per-tool' counts are based on the tool names extracted from the additional-annotation provenance\n")
        meta_out.write(" (expected format in combined entries: 'ToolName:info'). Entries with no tool detected are shown as 'unassigned'.\n")
        meta_out.write("\nEnd of report\n")


def gff_writer(options, combined_ORFs_by_contig, output_file, reference_annotation, additional_annotation):
    """Write the combined annotations as a GFF file and, optionally, a metadata report.

    Args:
        options: Object whose optional attributes ``genome_DNA``, ``additional_tool``
            (comma-separated tool names), ``clean`` and ``output_meta`` are read.
        combined_ORFs_by_contig: Mapping of contig -> mapping of ``'start,stop'`` ->
            ``[strand, 'ref'|'add', type, info, additional_list, source]``.
        output_file: Path of the GFF file to write.
        reference_annotation: Reference annotation path (echoed in the header and used
            as a fallback source/type name for reference entries).
        additional_annotation: Additional annotation path(s) (echoed in the header).

    When ``options.output_meta`` is set, a report is written next to the GFF.  Its
    name is ``output_file`` with ``'.gff'`` replaced by ``'_Meta.txt'``; if the
    output name does not contain ``'.gff'`` the suffix is appended instead so the
    report can never overwrite the GFF itself.
    """
    clean = getattr(options, 'clean', False)
    # The tool list does not change between entries, so it is computed once.
    tools = options.additional_tool.split(',') if getattr(options, 'additional_tool', None) else []

    if reference_annotation:
        ref_basename = reference_annotation.split('/')[-1].split('.')[0]
        default_type, default_source = ref_basename, ref_basename
    else:
        default_type, default_source = '.', ''

    # Meta counts
    ref_only = 0
    ref_combined = defaultdict(int)       # num matched tools -> reference entries
    non_ref_combined = defaultdict(int)   # num matched tools -> additional-only entries
    per_tool_total = Counter()
    per_contig_per_tool = defaultdict(Counter)
    ref_per_tool_total = Counter()
    ref_per_contig_per_tool = defaultdict(Counter)

    # The context manager guarantees the GFF is flushed and closed, also on error.
    with open(output_file, 'w') as write_out:
        write_out.write("##gff-version\t3\n#\tGFF-Adder\n#\tRun Date:" + str(date.today()) + '\n')
        write_out.write("##Genome DNA File:" + getattr(options, 'genome_DNA', '') + '\n')
        write_out.write("##Original File: " + reference_annotation + "\n##Additional File: " + additional_annotation + '\n')

        for contig, combined_ORFs in combined_ORFs_by_contig.items():
            # A set keeps the per-entry membership test O(1).
            ref_positions = {k for k, v in combined_ORFs.items() if len(v) > 1 and v[1] == 'ref'}

            for pos, data in combined_ORFs.items():
                coords = pos.split(',')
                if len(coords) < 2:
                    continue  # malformed key: skip
                start, stop = coords[0], coords[-1]
                strand = data[0]

                additional_info = _additional_info_for(data)
                matched_tools = _matched_tools_for(data, tools)
                matched_count = len(matched_tools)

                if pos not in ref_positions:
                    # Additional-only entry (not in reference)
                    non_ref_combined[matched_count] += 1
                    for name in matched_tools or ['unassigned']:
                        per_tool_total[name] += 1
                        per_contig_per_tool[contig][name] += 1

                    source = ','.join(matched_tools)
                    ftype = data[2] if len(data) > 3 else '.'
                    if clean:
                        attributes = additional_info
                    else:
                        attributes = 'ID=Additional_Annotations;' + additional_info
                else:
                    # Reference entry
                    if len(data) > 3 and data[3]:
                        info_field = data[3].replace('\n', '').strip()
                    else:
                        info_field = '.'
                    ftype = data[2] if len(data) > 2 and data[2] else default_type
                    source = data[5] if len(data) > 5 and data[5] else default_source

                    # Drop additional fragments that merely repeat (or are a
                    # substring / superstring of) something in the reference info.
                    unique_additional = ''
                    if additional_info:
                        info_parts = [p.strip() for p in info_field.split(';')
                                      if p.strip() and p.strip() != '.']
                        add_parts = [p.strip() for p in additional_info.split(';') if p.strip()]
                        unique_additional = ';'.join(
                            ap for ap in add_parts
                            if not any(ip in ap or ap in ip for ip in info_parts))

                    if not unique_additional:
                        # Reference-only (no meaningful unique additional annotations)
                        ref_only += 1
                        if clean:
                            attributes = info_field
                        else:
                            attributes = 'ID=Original_Annotation;' + info_field
                    else:
                        # Reference entry that had additional annotations (combined)
                        ref_combined[matched_count] += 1
                        for name in matched_tools or ['unassigned']:
                            ref_per_tool_total[name] += 1
                            ref_per_contig_per_tool[contig][name] += 1
                        if clean:
                            attributes = info_field
                        else:
                            attributes = ('ID=Original_Annotation;' + info_field
                                          + ';Matched_Annotations=' + unique_additional)

                write_out.write('\t'.join(
                    [contig, source, ftype, start, stop, '.', strand, '.', attributes]) + '\n')

    if getattr(options, 'output_meta', False):
        meta_path = output_file.replace('.gff', '_Meta.txt')
        if meta_path == output_file:
            # No '.gff' in the name: never let the report clobber the GFF output.
            meta_path = output_file + '_Meta.txt'
        _write_meta_report(meta_path, options, reference_annotation, additional_annotation,
                           ref_only, ref_combined, non_ref_combined,
                           per_tool_total, per_contig_per_tool,
                           ref_per_tool_total, ref_per_contig_per_tool)
285
+
286
+
287
+
288
def _import_tool_function(tool):
    """Import the parser module for ``tool`` and return its entry function.

    The module is looked up as ``Tools.<tool>.<tool>`` first and then as
    ``ORForise.Tools.<tool>.<tool>``; the function has the same name as the tool.
    Exits with "Tool not available" when neither module can be imported.
    """
    for prefix in ('Tools.', 'ORForise.Tools.'):
        try:
            tool_mod = import_module(prefix + tool + '.' + tool)
        except ModuleNotFoundError:
            continue
        return getattr(tool_mod, tool)
    sys.exit("Tool not available")


def _load_dna_regions(genome_path):
    """Load the genome FASTA (gzip or plain) into a contig -> region mapping.

    ``fasta_load`` is tried on the gzip-decoded file, then on the plain file.
    If both fail, a legacy single-contig reader is used that concatenates all
    sequence lines under the last header found.
    """
    try:
        try:
            with gzip.open(genome_path, 'rt') as fasta_in:
                return fasta_load(fasta_in)
        except Exception:
            # Not gzip (or unreadable as such): retry as plain text.
            with open(genome_path, 'r', encoding='unicode_escape') as fasta_in:
                return fasta_load(fasta_in)
    except Exception:
        # Deliberate best-effort fallback to the legacy single-contig behaviour.
        pass

    genome_id = None
    seq_parts = []
    with open(genome_path, 'r') as genome_fasta:
        for line in genome_fasta:
            line = line.replace("\n", "")
            if line.startswith('>'):
                genome_id = line.split()[0].replace('>', '')
            else:
                seq_parts.append(line)
    if genome_id is None:
        # Previously this surfaced as an unbound-variable error.
        sys.exit("No FASTA header found in genome DNA file: " + str(genome_path))
    genome_seq = ''.join(seq_parts)
    dna_regions = OrderedDict()
    dna_regions[genome_id] = (genome_seq, len(genome_seq), list(), None)
    return dna_regions


def _parse_reference_gff(annotation_path, gene_ident, dna_regions):
    """Parse a (possibly gzipped) reference GFF into contig -> {'start,stop': record}.

    Each record is ``[strand, source, type, info]``.  Only features whose type
    column contains one of the ``gene_ident`` identifiers, on contigs present in
    ``dna_regions``, are kept.  Lines with non-integer coordinates are skipped.
    """
    ref_genes = defaultdict(OrderedDict)
    # gene_ident arrives as a comma-separated string from the CLI; accept a
    # ready-made sequence of identifiers as well.
    gene_types = gene_ident.split(',') if isinstance(gene_ident, str) else list(gene_ident)

    # Detect gzip by its two magic bytes.
    with open(annotation_path, 'rb') as probe:
        is_gz = probe.read(2) == b'\x1f\x8b'
    if is_gz:
        handle = gzip.open(annotation_path, 'rt', errors='replace')
    else:
        # Replace undecodable bytes rather than fail.
        handle = open(annotation_path, 'r', encoding='utf-8', errors='replace')

    with handle as gff_in:
        for line in gff_in:
            if line.startswith('#') or line.strip() == '':
                continue
            parts = line.strip().split('\t')
            if len(parts) < 9:
                continue
            contig = parts[0]
            if contig not in dna_regions:
                continue  # record for a contig not in the provided FASTA
            if not any(gene_type in parts[2] for gene_type in gene_types):
                continue
            try:
                start = int(parts[3])
                stop = int(parts[4])
            except ValueError:
                continue  # malformed coordinates
            ref_genes[contig][f"{start},{stop}"] = [parts[6], parts[1], parts[2], parts[8]]
    return ref_genes


def _file_tool_orfs(tool_orfs, tool, dna_regions, additional_by_contig):
    """Distribute one tool's predictions into ``additional_by_contig``.

    ``tool_orfs`` may be ``{contig: {pos: data}}`` or a flat ``{pos: data}``.
    Every prediction is appended as ``(data, tool)`` to the list stored under
    its contig and position, so several tools predicting the same coordinates
    are all retained.
    """
    if not isinstance(tool_orfs, dict):
        return
    top_keys = list(tool_orfs.keys())
    if top_keys and top_keys[0] in dna_regions:
        # Top-level keys are contig names: multi-contig result.
        for contig, mapping in tool_orfs.items():
            for pos_k, pos_v in mapping.items():
                additional_by_contig[contig].setdefault(pos_k, []).append((pos_v, tool))
    elif len(dna_regions) == 1:
        # Flat mapping and a single contig: everything belongs to that contig.
        only_contig = next(iter(dna_regions))
        for pos_k, pos_v in tool_orfs.items():
            additional_by_contig[only_contig].setdefault(pos_k, []).append((pos_v, tool))
    else:
        # Flat mapping with several contigs: accept 'contig,start,stop' keys only.
        for key, value in tool_orfs.items():
            parts = key.split(',')
            if len(parts) == 3 and parts[0] in dna_regions:
                pos = parts[1] + ',' + parts[2]
                additional_by_contig[parts[0]].setdefault(pos, []).append((value, tool))


def gff_adder(options):
    """Merge additional tool annotations into a reference annotation and write a GFF.

    Reads from ``options``: ``genome_DNA``, ``reference_annotation``,
    ``reference_tool``, ``gene_ident``, ``additional_tool`` and
    ``additional_annotation`` (both comma-separated, paired by position) and
    ``output_file``.  Reference entries are kept; additional predictions at the
    same coordinates are attached to them, all others become new entries.  The
    result is handed to ``gff_writer``.
    """
    dna_regions = _load_dna_regions(options.genome_DNA)

    ###########################################
    # Build reference gene dict per-contig
    if not options.reference_tool:
        ref_genes_by_contig = _parse_reference_gff(
            options.reference_annotation, options.gene_ident, dna_regions)
    else:
        ref_genes_by_contig = defaultdict(OrderedDict)
        reference_tool_func = _import_tool_function(options.reference_tool)
        try:
            # Multi-contig aware signature first.
            ref_result = reference_tool_func(options.reference_annotation, dna_regions)
        except TypeError:
            # Legacy signature expecting the whole genome as one string.
            genome_seq = ''.join(dna_regions[c][0] for c in dna_regions)
            ref_result = reference_tool_func(reference_annotation=options.reference_annotation,
                                             genome_seq=genome_seq,
                                             gene_ident=options.gene_ident)
        # Expect ref_result to be a dict of contig -> {pos: data}.
        if isinstance(ref_result, dict):
            for contig, mapping in ref_result.items():
                ref_genes_by_contig[contig].update(mapping)

    # Ensure each contig has an OrderedDict even if empty.
    for contig in dna_regions:
        if contig not in ref_genes_by_contig:
            ref_genes_by_contig[contig] = OrderedDict()

    # Collect additional annotations per contig: pos -> [(data, tool), ...]
    additional_annotations_by_contig = defaultdict(OrderedDict)
    tool_names = options.additional_tool.split(',')
    anno_files = options.additional_annotation.split(',')
    if len(anno_files) < len(tool_names):
        sys.exit("Each additional tool (-at) needs a matching annotation file (-add)")
    for tool, anno_file in zip(tool_names, anno_files):
        additional_tool_func = _import_tool_function(tool)
        try:
            tool_orfs = additional_tool_func(anno_file, dna_regions)
        except TypeError:
            # Legacy signature expecting genome_seq.
            genome_seq = ''.join(dna_regions[c][0] for c in dna_regions)
            tool_orfs = additional_tool_func(anno_file, genome_seq, options.gene_ident)
        _file_tool_orfs(tool_orfs, tool, dna_regions, additional_annotations_by_contig)

    # Combine per-contig: keep reference entries and append additional annotations.
    combined_ORFs_by_contig = OrderedDict()
    for contig in dna_regions:
        combined = OrderedDict()
        # Reference entries first: [strand, 'ref', type, ref_info, additional_list, source]
        for pos, val in ref_genes_by_contig.get(contig, {}).items():
            strand = val[0] if len(val) > 0 else '.'
            source_field = val[1] if len(val) > 1 else 'ref'
            ftype = val[2] if len(val) > 2 else '.'
            ref_info = val[3] if len(val) > 3 else '.'
            combined[pos] = [strand, 'ref', ftype, ref_info, [], source_field]

        # Then the additional annotations, never overwriting an existing entry.
        for pos, predictions in additional_annotations_by_contig.get(contig, {}).items():
            for val, toolname in predictions:
                is_seq = isinstance(val, (list, tuple))
                strand_a = val[0] if is_seq and len(val) > 0 else '.'
                # Guards match the index actually read (val[3] / val[4]).
                ftype_a = val[3] if is_seq and len(val) > 3 else '.'
                info_a = ''
                if is_seq and len(val) > 4:
                    info_a = val[4]
                elif isinstance(val, str):
                    info_a = val

                addstr = f"{toolname}:{info_a}" if toolname else info_a
                if pos in combined:
                    combined[pos][4].append(addstr)
                else:
                    # New additional-only entry.
                    combined[pos] = [strand_a, 'add', ftype_a if ftype_a else '.', '.',
                                     [addstr], toolname]

        combined_ORFs_by_contig[contig] = sortORFs(combined)

    gff_writer(options, combined_ORFs_by_contig, options.output_file,
               options.reference_annotation, options.additional_annotation)
496
+
497
+
498
def main():
    """Command-line entry point for GFF-Adder: parse the arguments and run ``gff_adder``."""
    print(WELCOME)

    cli = argparse.ArgumentParser(
        description='ORForise ' + ORForise_Version + ': GFF-Adder Run Parameters.')
    # Drop a default argparse group so the custom groups below are listed instead.
    cli._action_groups.pop()

    mandatory = cli.add_argument_group('Required Arguments')
    mandatory_specs = (
        ('-dna', 'genome_DNA',
         'Genome DNA file (.fa) which both annotations are based on'),
        ('-ref', 'reference_annotation',
         'Which reference annotation file to use as reference?'),
        ('-at', 'additional_tool',
         'Which format to use for additional annotation? - Can provide multiple annotations (Tool1,Tool2)'),
        ('-add', 'additional_annotation',
         'Which annotation file to add to reference annotation? - Can provide multiple annotations (1.GFF,2.GFF)'),
        ('-o', 'output_file',
         'Output filename'),
    )
    for flag, destination, help_text in mandatory_specs:
        mandatory.add_argument(flag, dest=destination, required=True, help=help_text)

    extras = cli.add_argument_group('Optional Arguments')
    extras.add_argument(
        '-rt', dest='reference_tool', required=False,
        help='Which tool format to use as reference? - If not provided, will default to the '
             'standard GFF format and will only look for "CDS" features')
    extras.add_argument(
        '--gene_ident', action='store', dest='gene_ident', default='CDS',
        help='Identifier used for identifying genomic features in reference annotation "CDS,rRNA,tRNA"')
    extras.add_argument(
        '-mc', dest='mark_consensus', action='store_true', required=False,
        help='Default - False: Mark reference annotations which where present in the additional tool annotation')
    extras.add_argument(
        '-c', dest='clean', action='store_true', required=False,
        help='Default - False: Do not mark 9th column with "Original/Matched/Additional tag"')
    extras.add_argument(
        '--meta', dest='output_meta', action='store_true', required=False,
        help='Default - False: Output metadata file')
    extras.add_argument(
        '--olap', dest='overlap', default=50, type=int, required=False,
        help='Maximum overlap between reference and additional genic regions (CDS,rRNA etc) - Default: 50 nt')

    miscellaneous = cli.add_argument_group('Misc')
    # NOTE(review): type=eval evaluates the raw command-line text; consider a
    # dedicated True/False parser instead.
    miscellaneous.add_argument(
        '-v', dest='verbose', default='False', type=eval, choices=[True, False],
        help='Default - False: Print out runtime status')

    gff_adder(cli.parse_args())


if __name__ == "__main__":
    main()
    print("Complete")
ORForise/List_Tools.py ADDED
@@ -0,0 +1,56 @@
1
+ from importlib import import_module
2
+ import os
3
+
4
+ try:
5
+ from .utils import *
6
+ except (ImportError, ModuleNotFoundError):
7
+ from utils import *
8
+
9
+
10
+
11
+
12
+
13
def main():
    """List the tool directories found beside this module and report whether each imports."""
    print(WELCOME)

    print('ORForise ' + ORForise_Version + ': List Tools Run Parameters')

    module_dir = os.path.dirname(__file__)
    search_roots = (
        os.path.join(module_dir, 'Tools'),
        os.path.abspath(os.path.join(module_dir, '..', 'Tools')),
    )

    discovered = set()
    for root in search_roots:
        if not os.path.isdir(root):
            continue
        try:
            names = os.listdir(root)
        except OSError:
            continue
        # Every visible sub-directory (other than the bytecode cache) counts as a tool.
        discovered.update(
            name for name in names
            if os.path.isdir(os.path.join(root, name))
            and not name.startswith('.')
            and name != '__pycache__'
        )

    if not discovered:
        print('No tools found in the searched directories.')
        return

    # Import attempts in order, each paired with the message printed on success.
    import_attempts = (
        ('Tools.', ' Imported from Tools.'),
        ('ORForise.Tools.', ' Imported from ORForise.Tools.'),
    )

    print('Available tools:')
    for tool_name in sorted(discovered):
        print(' -', tool_name)
        for package_prefix, success_message in import_attempts:
            try:
                import_module(package_prefix + tool_name + '.' + tool_name)
            except ModuleNotFoundError:
                continue
            print(success_message + tool_name)
            break
        else:
            print(' Tool not importable')


if __name__ == "__main__":
    main()
    print("Complete")
File without changes
@@ -0,0 +1,77 @@
1
+ from importlib import import_module
2
+ import argparse
3
+ import collections
4
+ from datetime import date
5
+ import sys
6
+ try:
7
+ from ORForise.src.ORForise.utils import revCompIterative
8
+ except ImportError:
9
+
10
+ from ORForise.utils import revCompIterative
11
+
12
+
13
+
14
def _parse_args(argv=None):
    """Parse the command-line arguments (``sys.argv`` when ``argv`` is None)."""
    parser = argparse.ArgumentParser()
    parser.add_argument('-dna', '--genome_dna', required=True,
                        help='Genome DNA file (.fa) which both annotations '
                             'are based on')
    parser.add_argument('-gff', '--genome_gff', required=True,
                        help='Which annotation file to add to reference annotation?')
    return parser.parse_args(argv)


def cds_checker(genome_dna, genome_gff):
    """Check that every ``biological_region`` feature in a GFF has a length divisible by 3.

    Args:
        genome_dna: Path to the genome FASTA; all sequence lines are concatenated.
        genome_gff: Path to the tab-separated GFF file.

    Returns:
        An ordered mapping ``'start,stop' -> [strand, start_codon, stop_codon]``
        for the checked features.

    Prints "In-Fame" for each in-frame feature and exits with "W" on the first
    feature whose length is not a multiple of 3.  Exits with "SAS" on a line
    whose type contains "bio" but is not a 9-column ``biological_region``
    record.  Features whose strand is neither '+' nor '-' are skipped.
    """
    seq_parts = []
    with open(genome_dna, 'r') as genome_fasta:
        for line in genome_fasta:
            line = line.replace("\n", "")
            if not line.startswith('>'):
                seq_parts.append(line)
    genome_seq = ''.join(seq_parts)

    ###########################################
    genome_size = len(genome_seq)
    genome_rev = revCompIterative(genome_seq)
    cds_dict = collections.OrderedDict()  # Order is important
    with open(genome_gff, 'r') as gff_handle:
        for line in gff_handle:
            fields = line.split('\t')
            if len(fields) < 3:
                continue  # no feature-type column on this line
            feature = fields[2]
            if "biological_region" in feature and len(fields) == 9:
                start = int(fields[3])
                stop = int(fields[4])
                strand = fields[6]

                if '-' in strand:  # Reverse complement: starts and stops adjusted
                    r_start = genome_size - stop
                    r_stop = genome_size - start
                    startCodon = genome_rev[r_start:r_start + 3]
                    stopCodon = genome_rev[r_stop - 2:r_stop + 1]
                    length = abs(start - stop - 1)
                elif '+' in strand:
                    startCodon = genome_seq[start - 1:start + 2]
                    stopCodon = genome_seq[stop - 3:stop]
                    length = abs(start - 1 - stop)
                else:
                    # Unstranded record: there are no codons to extract, and
                    # falling through would reuse the previous record's values.
                    continue

                cds_dict[str(start) + ',' + str(stop)] = [strand, startCodon, stopCodon]

                if length % 3 == 0:
                    print("In-Fame")
                else:
                    sys.exit("W")

            elif "bio" in feature:
                sys.exit("SAS")
    return cds_dict


if __name__ == "__main__":
    # Arguments are parsed here rather than at import time so that importing
    # this module has no command-line side effects.
    cds_checker(**vars(_parse_args()))

    print("Complete")