ORForise 1.5.1__py3-none-any.whl → 1.6.1__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (31)
  1. ORForise/Aggregate_Compare.py +2 -4
  2. ORForise/Annotation_Compare.py +16 -53
  3. ORForise/Annotation_Intersector.py +726 -0
  4. ORForise/Aux/TabToGFF/TabToGFF.py +140 -0
  5. ORForise/Convert_To_GFF.py +139 -0
  6. ORForise/GFF_Adder.py +454 -179
  7. ORForise/List_Tools.py +63 -0
  8. ORForise/StORForise.py +8 -4
  9. ORForise/Tools/EasyGene/EasyGene.py +13 -1
  10. ORForise/Tools/{GLIMMER_3/GLIMMER_3.py → GLIMMER3/GLIMMER3.py} +2 -2
  11. ORForise/Tools/GLIMMER3/__init__.py +0 -0
  12. ORForise/Tools/{GeneMark_HA/GeneMark_HA.py → GeneMarkHA/GeneMarkHA.py} +1 -1
  13. ORForise/Tools/GeneMarkHA/__init__.py +0 -0
  14. ORForise/Tools/Prodigal/Prodigal.py +13 -1
  15. ORForise/utils.py +4 -1
  16. orforise-1.6.1.dist-info/METADATA +1038 -0
  17. {orforise-1.5.1.dist-info → orforise-1.6.1.dist-info}/RECORD +29 -24
  18. {orforise-1.5.1.dist-info → orforise-1.6.1.dist-info}/entry_points.txt +6 -2
  19. ORForise/GFF_Intersector.py +0 -192
  20. orforise-1.5.1.dist-info/METADATA +0 -427
  21. /ORForise/{Tools → Aux}/StORF_Undetected/Completely_Undetected/Completey_Undetected.py +0 -0
  22. /ORForise/{Tools/GLIMMER_3 → Aux/StORF_Undetected/Completely_Undetected}/__init__.py +0 -0
  23. /ORForise/{Tools → Aux}/StORF_Undetected/StORF_Undetected.py +0 -0
  24. /ORForise/{Tools/GeneMark_HA → Aux/StORF_Undetected}/__init__.py +0 -0
  25. /ORForise/{Tools/StORF_Undetected/Completely_Undetected → Aux/StORF_Undetected/unvitiated_Genes}/__init__.py +0 -0
  26. /ORForise/{Tools → Aux}/StORF_Undetected/unvitiated_Genes/unvitiated_Missed_Genes.py +0 -0
  27. /ORForise/{Tools/StORF_Undetected → Aux/TabToGFF}/__init__.py +0 -0
  28. /ORForise/{Tools/StORF_Undetected/unvitiated_Genes → Aux}/__init__.py +0 -0
  29. {orforise-1.5.1.dist-info → orforise-1.6.1.dist-info}/WHEEL +0 -0
  30. {orforise-1.5.1.dist-info → orforise-1.6.1.dist-info}/licenses/LICENSE +0 -0
  31. {orforise-1.5.1.dist-info → orforise-1.6.1.dist-info}/top_level.txt +0 -0
ORForise/GFF_Adder.py CHANGED
@@ -1,9 +1,10 @@
1
1
  from importlib import import_module
2
2
  import argparse
3
- import collections
4
-
3
+ from collections import OrderedDict, defaultdict, Counter
4
+ import gzip
5
5
  from datetime import date
6
6
  import sys
7
+
7
8
  try:
8
9
  from utils import *
9
10
  except ImportError:
@@ -11,217 +12,491 @@ except ImportError:
11
12
 
12
13
 
13
14
  ########################################
14
-
15
-
16
- def gff_writer(options,genome_ID, genome_DNA, reference_annotation, reference_tool, ref_gene_set, additional_annotation, additional_tool, combined_ORFs, output_file):
15
+ def gff_writer(options, combined_ORFs_by_contig, output_file, reference_annotation, additional_annotation):
17
16
  write_out = open(output_file, 'w')
18
-
19
- #write_out.write('##sequence-region ' + genome_ID + ' 1 ' + str(len(genome_DNA)) + '\n')
20
17
  write_out.write("##gff-version\t3\n#\tGFF-Adder\n#\tRun Date:" + str(date.today()) + '\n')
21
- write_out.write("##Genome DNA File:" + genome_DNA + '\n')
18
+ write_out.write("##Genome DNA File:" + (options.genome_DNA if hasattr(options, 'genome_DNA') else '') + '\n')
22
19
  write_out.write("##Original File: " + reference_annotation + "\n##Additional File: " + additional_annotation + '\n')
20
+ # meta counts
21
+ Ref_Only = 0
22
+ Ref_Combined = defaultdict(int)
23
+ Non_Ref_Combined = defaultdict(int)
23
24
 
25
+ # New counters: per-tool totals and per-contig per-tool breakdown
26
+ per_tool_total = Counter() # counts for additional-only entries
27
+ per_contig_per_tool = defaultdict(Counter) # contig -> Counter(tool -> count)
28
+ ref_per_tool_total = Counter() # counts for reference entries matched by tools
29
+ ref_per_contig_per_tool = defaultdict(Counter) # contig -> Counter(tool -> count)
24
30
 
25
- #meta counts
26
- Ref_Only = 0
27
- Ref_Combined = collections.defaultdict(int)
28
- Non_Ref_Combined = collections.defaultdict(int)
29
-
30
-
31
- for pos, data in combined_ORFs.items():
32
- pos_ = pos.split(',')
33
- start = pos_[0]
34
- stop = pos_[-1]
35
- strand = data[0]
36
- length = int(stop) - int(start)
37
- additional_annotation_info = ''
38
- tools = additional_tool.split(',')
39
- matched_tools = ''
40
- matching = []
41
- matched = False
42
- for tool in tools:
31
+ # Iterate contigs in deterministic order
32
+ for contig in combined_ORFs_by_contig:
33
+ combined_ORFs = combined_ORFs_by_contig[contig]
34
+ # ref_gene_set for this contig: use keys from ref portion (we can detect by data[1]=='ref')
35
+ ref_gene_set = [k for k, v in combined_ORFs.items() if len(v) > 1 and v[1] == 'ref']
36
+
37
+ for pos, data in combined_ORFs.items():
38
+ pos_ = pos.split(',')
39
+ # pos may be like 'start,stop' or 'contig,start,stop' but here we expect 'start,stop'
40
+ if len(pos_) >= 2:
41
+ start = pos_[0]
42
+ stop = pos_[-1]
43
+ else:
44
+ # fallback: skip malformed
45
+ continue
46
+ strand = data[0]
43
47
 
48
+ # Build additional_annotation_info from the combined entry's additional list if present.
49
+ # Normalise entries and prefer the info portion after any 'ToolName:info' prefix.
50
+ additional_annotation_info = ''
51
+ additional_items = []
52
+ if len(data) > 4 and data[4]:
53
+ for add in data[4]:
54
+ s = str(add).strip()
55
+ if not s:
56
+ continue
57
+ if ':' in s:
58
+ # split tool:info -> take info part
59
+ _, info_part = s.split(':', 1)
60
+ info_part = info_part.strip()
61
+ else:
62
+ info_part = s
63
+ if info_part:
64
+ additional_items.append(info_part)
65
+ # deduplicate while preserving order
66
+ seen = set()
67
+ deduped = []
68
+ for it in additional_items:
69
+ if it not in seen:
70
+ seen.add(it)
71
+ deduped.append(it)
72
+ if deduped:
73
+ additional_annotation_info = ';'.join(deduped)
74
+ elif len(data) > 3 and data[3]:
75
+ additional_annotation_info = str(data[3]).strip()
76
+
77
+ # tools list from options (maybe empty)
78
+ tools = options.additional_tool.split(',') if getattr(options, 'additional_tool', None) else []
79
+
80
+ # Determine matched_tools_list reliably:
81
+ # prefer extracting tool names from data[4] entries, otherwise fallback to scanning values
82
+ matched_tools_list = []
44
83
  try:
45
- if options.mark_consensus == True:
46
- match = [s for s in data if tool in s]
47
- matching.append(match[0].replace('\n', '').replace('ID=',''))
84
+ if len(data) > 4 and data[4]:
85
+ for add in data[4]:
86
+ # expected format: "ToolName:info" or "info"
87
+ if isinstance(add, str) and ':' in add:
88
+ t = add.split(':', 1)[0].strip()
89
+ if t:
90
+ matched_tools_list.append(t)
91
+ else:
92
+ # try to detect one of the known tool names in the string
93
+ for tool in tools:
94
+ if tool and tool in str(add):
95
+ matched_tools_list.append(tool)
48
96
  else:
49
- match = [s for s in data if tool in s]
50
- matching.append(match[0].replace('\n', '').replace('ID=',''))
51
- if matching:
52
- matched = True
53
-
54
- matched_tools += tool + ','
55
- except Exception as e:
56
- if options.verbose == True:
57
- print("Exception - (No matching annotation) : " + str(e))
58
- continue
59
- #temporary verbose fix
60
- additional_annotation_info = 'ID='
61
- if len(match) >1:
62
- for match in matching:
63
- additional_annotation_info += match+'|'
64
- additional_annotation_info = additional_annotation_info[:-1]
65
- elif len(match) == 1:
66
- additional_annotation_info += matching[0].replace('Prokka|','').replace('GeneMark_S_2|','')
67
-
68
- matching = None
69
-
70
- if pos not in ref_gene_set: # Check if ref or additional
71
- type = matched_tools[:-1]
72
- Non_Ref_Combined[len(matched_tools.split(','))] += 1
73
- if options.clean == False:
74
- entry = (genome_ID + '\t' + type + '\t' + data[3] + '\t' + start + '\t' + stop + '\t.\t' + strand + '\t.\tID=Additional_Annotations;' + additional_annotation_info + '\n')
75
- else:
76
- entry = (genome_ID + '\t' + type + '\t' + data[3] + '\t' + start + '\t' + stop + '\t.\t' + strand + '\t.\t' + additional_annotation_info + '\n')
97
+ # fallback: scan the whole data structure for tool names (previous behaviour)
98
+ for tool in tools:
99
+ if tool and any(tool in str(x) for x in data):
100
+ matched_tools_list.append(tool)
101
+ except Exception:
102
+ # keep matched_tools_list empty if anything unexpected happens
103
+ matched_tools_list = []
77
104
 
78
- else:
79
- data[3] = data[3].replace('\n', '')#.replace('ID=', '')
80
- if not additional_annotation_info:
81
- Ref_Only += 1
82
- type = reference_annotation.split('/')[-1].split('.')[0]
83
- if options.clean == False:
84
- entry = (genome_ID + '\t' + type + '\t' + data[2] + '\t' + start + '\t' + stop + '\t.\t' + strand + '\t.\tID=Original_Annotation;' + data[3] + '\n')
105
+ # Normalise matched tools: unique, sorted
106
+ matched_tools_list = sorted(set(matched_tools_list))
107
+ matched_tools_str = ','.join(matched_tools_list)
108
+ matched_count = len(matched_tools_list)
109
+
110
+ # Build GFF entry and update meta counters
111
+ if pos not in ref_gene_set: # Additional-only entry (not in reference)
112
+ type_field = matched_tools_str if matched_tools_str else ''
113
+ Non_Ref_Combined[matched_count] += 1
114
+
115
+ # Update per-tool counters for additional-only entries
116
+ if matched_tools_list:
117
+ for t in matched_tools_list:
118
+ per_tool_total[t] += 1
119
+ per_contig_per_tool[contig][t] += 1
120
+ else:
121
+ # track unassigned additional entries (no tool name found)
122
+ per_tool_total['unassigned'] += 1
123
+ per_contig_per_tool[contig]['unassigned'] += 1
124
+
125
+ if not getattr(options, 'clean', False):
126
+ entry = (contig + '\t' + type_field + '\t' + (data[2] if len(data) > 3 else '.') + '\t' + start + '\t' + stop + '\t.\t' + strand + '\t.\tID=Additional_Annotations;' + (additional_annotation_info if additional_annotation_info else '') + '\n')
85
127
  else:
86
- entry = (genome_ID + '\t' + type + '\t' + data[2] + '\t' + start + '\t' + stop + '\t.\t' + strand + '\t.\t' + data[3] + '\n')
128
+ entry = (contig + '\t' + type_field + '\t' + (data[2] if len(data) > 3 else '.') + '\t' + start + '\t' + stop + '\t.\t' + strand + '\t.\t' + (additional_annotation_info if additional_annotation_info else '') + '\n')
129
+
87
130
  else:
88
- Ref_Combined[len(matched_tools.split(','))] +=1
89
- type = reference_annotation.split('/')[-1].split('.')[0]
90
- if options.clean == False:
91
- entry = (genome_ID + '\t' + type + '\t' + data[2] + '\t' + start + '\t' + stop + '\t.\t' + strand + '\t.\tID=Original_Annotation;' + data[3] +
92
- ';Matched_Annotations=' + additional_annotation_info + '\n')
131
+ # Reference entry
132
+ if len(data) > 3 and data[3]:
133
+ info_field = data[3].replace('\n', '').strip()
134
+ else:
135
+ info_field = '.'
136
+
137
+ # Determine type and source fields
138
+ type_field = data[2] if len(data) > 2 and data[2] else (
139
+ reference_annotation.split('/')[-1].split('.')[0] if reference_annotation else '.')
140
+ source_field = data[5] if len(data) > 5 and data[5] else (
141
+ reference_annotation.split('/')[-1].split('.')[0] if reference_annotation else '')
142
+
143
+ # If additional_annotation_info duplicates info_field content, remove duplicate fragments.
144
+ filtered_additional = ''
145
+ if additional_annotation_info:
146
+ add_parts = [p.strip() for p in str(additional_annotation_info).split(';') if p.strip()]
147
+ info_parts = [p.strip() for p in str(info_field).split(';') if p.strip() and p.strip() != '.']
148
+ filtered = []
149
+ for ap in add_parts:
150
+ dup = False
151
+ for ip in info_parts:
152
+ # treat duplication if exact match or obvious substring relationship
153
+ if ip and (ap == ip or ip in ap or ap in ip):
154
+ dup = True
155
+ break
156
+ if not dup:
157
+ filtered.append(ap)
158
+ filtered_additional = ';'.join(filtered)
159
+
160
+ if not filtered_additional:
161
+ # Reference-only (no meaningful unique additional annotations)
162
+ Ref_Only += 1
163
+ if not getattr(options, 'clean', False):
164
+ entry = (
165
+ contig + '\t' + source_field + '\t' + type_field + '\t' + start + '\t' + stop + '\t.\t' + strand + '\t.\tID=Original_Annotation;' + info_field + '\n')
166
+ else:
167
+ entry = (
168
+ contig + '\t' + source_field + '\t' + type_field + '\t' + start + '\t' + stop + '\t.\t' + strand + '\t.\t' + info_field + '\n')
93
169
  else:
94
- entry = (genome_ID + '\t' + type + '\t' + data[2] + '\t' + start + '\t' + stop + '\t.\t' + strand + '\t.\t' + data[3] + '\n')
95
- write_out.write(entry)
96
-
97
- if options.output_meta == True:
98
- meta_out = open(output_file.replace('.gff','_Meta.txt'),'w')
99
- meta_out.write('Reference Only Genes: ' + str(Ref_Only) + '\n')
100
- Ref_Combined_counter = collections.Counter(Ref_Combined)
101
- meta_out.write('Reference Combined Genes: ' + str(Ref_Combined_counter) + '\n')
102
- Non_Ref_Combined_counter = collections.Counter(Non_Ref_Combined)
103
- meta_out.write('Non_Reference Combined Genes: ' + str(Non_Ref_Combined_counter) + '\n')
104
-
105
- def gff_adder(options):#genome_DNA, reference_tool, reference_annotation, additional_tool, additional_annotation, gene_ident, overlap, output_file): # Only works for single contig genome
106
- genome_seq = ""
107
- with open(options.genome_DNA, 'r') as genome_fasta:
108
- for line in genome_fasta:
109
- line = line.replace("\n", "")
110
- if not line.startswith('>'):
111
- genome_seq += str(line)
170
+ # Reference entry that had additional annotations (combined)
171
+ Ref_Combined[matched_count] += 1
172
+
173
+ # Update per-tool counters for reference-matched entries
174
+ if matched_tools_list:
175
+ for t in matched_tools_list:
176
+ ref_per_tool_total[t] += 1
177
+ ref_per_contig_per_tool[contig][t] += 1
178
+ else:
179
+ ref_per_tool_total['unassigned'] += 1
180
+ ref_per_contig_per_tool[contig]['unassigned'] += 1
181
+
182
+ if not getattr(options, 'clean', False):
183
+ entry = (
184
+ contig + '\t' + source_field + '\t' + type_field + '\t' + start + '\t' + stop + '\t.\t' + strand + '\t.\tID=Original_Annotation;' + info_field + ';Matched_Annotations=' + filtered_additional + '\n')
185
+ else:
186
+ entry = (
187
+ contig + '\t' + source_field + '\t' + type_field + '\t' + start + '\t' + stop + '\t.\t' + strand + '\t.\t' + info_field + '\n')
188
+
189
+ write_out.write(entry)
190
+
191
+ # Produce metadata output if requested
192
+ if getattr(options, 'output_meta', False) == True:
193
+ # Summaries
194
+ total_ref_combined = sum(Ref_Combined.values())
195
+ total_nonref = sum(Non_Ref_Combined.values())
196
+ total_reference_genes = Ref_Only + total_ref_combined
197
+ Ref_Combined_counter = Counter(Ref_Combined)
198
+ Non_Ref_Combined_counter = Counter(Non_Ref_Combined)
199
+ with open(output_file.replace('.gff','_Meta.txt'),'w') as meta_out:
200
+ meta_out.write("GFF-Adder Metadata Report\n")
201
+ meta_out.write("=========================\n")
202
+ meta_out.write("Run Date: {}\n".format(date.today()))
203
+ meta_out.write("Genome DNA: {}\n".format(getattr(options, 'genome_DNA', 'N/A')))
204
+ meta_out.write("Reference annotation: {}\n".format(reference_annotation))
205
+ meta_out.write("Additional annotation(s): {}\n\n".format(additional_annotation))
206
+
207
+ meta_out.write("Summary counts\n")
208
+ meta_out.write("--------------\n")
209
+ meta_out.write(f"Reference-only genes (no matching additional annotation): {Ref_Only}\n")
210
+ meta_out.write(f"Reference genes with additional matches (combined): {total_ref_combined}\n")
211
+ meta_out.write(f"TOTAL reference genes observed: {total_reference_genes}\n")
212
+ meta_out.write(f"Additional-only genes (not present in reference): {total_nonref}\n\n")
213
+
214
+ meta_out.write("Distribution of matches for reference-combined entries (num matched tools -> count)\n")
215
+ if Ref_Combined_counter:
216
+ for k in sorted(Ref_Combined_counter):
217
+ meta_out.write(f" {k:>3} {Ref_Combined_counter[k]}\n")
112
218
  else:
113
- genome_ID = line.split()[0].replace('>','')
219
+ meta_out.write(" None\n")
220
+ meta_out.write("\n")
221
+
222
+ meta_out.write("Distribution of matches for non-reference entries (num matched tools -> count)\n")
223
+ if Non_Ref_Combined_counter:
224
+ for k in sorted(Non_Ref_Combined_counter):
225
+ meta_out.write(f" {k:>3} {Non_Ref_Combined_counter[k]}\n")
226
+ else:
227
+ meta_out.write(" None\n")
228
+ meta_out.write("\n")
229
+
230
+ # Per-tool totals for additional-only entries
231
+ meta_out.write("Per-tool additional-only annotation totals\n")
232
+ meta_out.write("-----------------------------------------\n")
233
+ if per_tool_total:
234
+ for t, c in per_tool_total.most_common():
235
+ meta_out.write(f" {t}: {c}\n")
236
+ else:
237
+ meta_out.write(" None\n")
238
+ meta_out.write("\n")
239
+
240
+ # Per-contig breakdown for additional-only entries: only contigs with additional genes
241
+ meta_out.write("Per-contig breakdown for additional-only annotations (only contigs with additions shown)\n")
242
+ meta_out.write("---------------------------------------------------------------------------------------\n")
243
+ if per_contig_per_tool:
244
+ for contig in sorted(per_contig_per_tool):
245
+ counter = per_contig_per_tool[contig]
246
+ if sum(counter.values()) == 0:
247
+ continue
248
+ meta_out.write(f" {contig}:\n")
249
+ for t, c in counter.most_common():
250
+ meta_out.write(f" {t}: {c}\n")
251
+ meta_out.write("\n")
252
+ else:
253
+ meta_out.write(" None\n\n")
254
+
255
+ # Per-tool totals and per-contig breakdown for reference genes matched by additional tools
256
+ meta_out.write("Per-tool totals for reference genes matched by additional tools\n")
257
+ meta_out.write("---------------------------------------------------------------\n")
258
+ if ref_per_tool_total:
259
+ for t, c in ref_per_tool_total.most_common():
260
+ meta_out.write(f" {t}: {c}\n")
261
+ else:
262
+ meta_out.write(" None\n")
263
+ meta_out.write("\n")
264
+
265
+ meta_out.write("Per-contig breakdown for reference genes matched by additional tools\n")
266
+ meta_out.write("--------------------------------------------------------------------\n")
267
+ if ref_per_contig_per_tool:
268
+ for contig in sorted(ref_per_contig_per_tool):
269
+ counter = ref_per_contig_per_tool[contig]
270
+ if sum(counter.values()) == 0:
271
+ continue
272
+ meta_out.write(f" {contig}:\n")
273
+ for t, c in counter.most_common():
274
+ meta_out.write(f" {t}: {c}\n")
275
+ meta_out.write("\n")
276
+ else:
277
+ meta_out.write(" None\n\n")
278
+
279
+ meta_out.write("Notes\n")
280
+ meta_out.write("-----\n")
281
+ meta_out.write(" - 'Reference-only' are reference entries that had no recorded additional annotation information.\n")
282
+ meta_out.write(" - 'Per-tool' counts are based on the tool names extracted from the additional-annotation provenance\n")
283
+ meta_out.write(" (expected format in combined entries: 'ToolName:info'). Entries with no tool detected are shown as 'unassigned'.\n")
284
+ meta_out.write("\nEnd of report\n")
285
+
286
+
287
+
288
+ def gff_adder(options):
289
+ # Load fasta into dna_regions (supports multi-contig)
290
+ try:
291
+ try:
292
+ fasta_in = gzip.open(options.genome_DNA, 'rt')
293
+ dna_regions = fasta_load(fasta_in)
294
+ except Exception:
295
+ fasta_in = open(options.genome_DNA, 'r', encoding='unicode_escape')
296
+ dna_regions = fasta_load(fasta_in)
297
+ except Exception:
298
+ # Fallback to legacy single-contig behaviour
299
+ genome_seq = ""
300
+ with open(options.genome_DNA, 'r') as genome_fasta:
301
+ for line in genome_fasta:
302
+ line = line.replace("\n", "")
303
+ if not line.startswith('>'):
304
+ genome_seq += str(line)
305
+ else:
306
+ genome_ID = line.split()[0].replace('>','')
307
+ # Create dna_regions with single entry
308
+ dna_regions = OrderedDict()
309
+ dna_regions[genome_ID] = (genome_seq, len(genome_seq), list(), None)
310
+
114
311
  ###########################################
115
- if not options.reference_tool: # IF using Ensembl for comparison
116
- ref_genes = collections.OrderedDict() # Order is important
312
+ # Build reference gene dict per-contig
313
+ ref_genes_by_contig = defaultdict(OrderedDict)
314
+
315
+ if not options.reference_tool: # IF using Ensembl/file for comparison
316
+ # Parse reference gff to populate ref_genes_by_contig (retain original info fields)
317
+ # Detect gzip by magic bytes (first two bytes)
318
+ is_gz = False
319
+ with open(options.reference_annotation, 'rb') as _probe:
320
+ magic = _probe.read(2)
321
+ is_gz = (magic == b'\x1f\x8b')
322
+
323
+ if is_gz:
324
+ gff_in = gzip.open(options.reference_annotation, 'rt', errors='replace')
325
+ else:
326
+ # Open as plain text, replace undecodable bytes rather than fail
327
+ gff_in = open(options.reference_annotation, 'r', encoding='utf-8', errors='replace')
328
+
117
329
  count = 0
118
- with open(options.reference_annotation, 'r') as genome_gff:
119
- for line in genome_gff:
120
- line = line.split('\t')
330
+ try:
331
+ for line in gff_in:
332
+ if line.startswith('#') or line.strip() == '':
333
+ continue
334
+ parts = line.strip().split('\t')
335
+ if len(parts) < 9:
336
+ continue
337
+ contig = parts[0]
338
+ if contig not in dna_regions:
339
+ # skip records for contigs not in provided fasta
340
+ continue
121
341
  try:
122
342
  if 'CDS' in options.gene_ident and len(options.gene_ident) == 1:
123
- if "CDS" in line[2] and len(line) == 9:
124
- start = int(line[3])
125
- stop = int(line[4])
126
- strand = line[6]
127
- pos = str(start)+','+str(stop)
128
- info = line[8]
129
- ref_genes.update({pos:[strand,'ref','CDS',info]})
343
+ if "CDS" in parts[2] and len(parts) == 9:
344
+ start = int(parts[3])
345
+ stop = int(parts[4])
346
+ strand = parts[6]
347
+ pos = f"{start},{stop}"
348
+ # store as [strand, source, type, info] to match downstream expectations
349
+ info = parts[8]
350
+ ref_genes_by_contig[contig][pos] = [strand, parts[1], parts[2], info]
130
351
  count += 1
131
352
  else:
132
353
  gene_types = options.gene_ident.split(',')
133
- if any(gene_type in line[2] for gene_type in gene_types): # line[2] for normalrun
134
- start = int(line[3])
135
- stop = int(line[4])
136
- strand = line[6]
137
- pos = str(start) + ',' + str(stop)
138
- info = line[8]
139
- ref_genes.update({pos: [strand, 'ref',line[2],info]}) #Report what type of gene/rRNA etc we have here
354
+ if any(gene_type in parts[2] for gene_type in gene_types):
355
+ start = int(parts[3])
356
+ stop = int(parts[4])
357
+ strand = parts[6]
358
+ pos = f"{start},{stop}"
359
+ # store as [strand, source, type, info] to match downstream expectations
360
+ info = parts[8]
361
+ ref_genes_by_contig[contig][pos] = [strand, parts[1], parts[2], info]
140
362
  count += 1
141
363
  except IndexError:
142
364
  continue
143
- elif options.reference_tool: # IF using a tool as reference
144
- if 'StORF_Reporter' == options.reference_tool:
145
- reference_tool = 'StORF_Reporter'
365
+ finally:
366
+ try:
367
+ gff_in.close()
368
+ except Exception:
369
+ pass
370
+
371
+ else:
372
+ # Reference tool provided: attempt to call it with dna_regions first (multi-contig aware), fallback to legacy signature
373
+ reference_tool = options.reference_tool if options.reference_tool != 'StORF_Reporter' else 'StORF_Reporter'
146
374
  try:
147
- reference_tool_ = import_module('Tools.' + reference_tool + '.' + reference_tool,
148
- package='my_current_pkg')
375
+ reference_tool_mod = import_module('Tools.' + reference_tool + '.' + reference_tool, package='my_current_pkg')
149
376
  except ModuleNotFoundError:
150
377
  try:
151
- reference_tool_ = import_module('ORForise.Tools.' + reference_tool + '.' + reference_tool,
152
- package='my_current_pkg')
378
+ reference_tool_mod = import_module('ORForise.Tools.' + reference_tool + '.' + reference_tool, package='my_current_pkg')
153
379
  except ModuleNotFoundError:
154
380
  sys.exit("Tool not available")
155
- reference_tool_ = getattr(reference_tool_, reference_tool)
156
- ############ Reformatting tool output for ref_genes
157
- ref_genes = reference_tool_(reference_annotation=options.reference_annotation, genome_seq=genome_seq,gene_ident=options.options.gene_ident)
158
- ref_genes = sortORFs(ref_genes)
159
- ref_gene_set = list(ref_genes.keys())
160
- ################ Get Additional Tool'
161
- # if 'StORF_Reporter' == options.additional_tool:
162
- # additional_tool = 'StORF_Reporter'
163
- additional_annotations = collections.OrderedDict()
381
+ reference_tool_func = getattr(reference_tool_mod, reference_tool)
382
+ # Try multi-contig signature
383
+ try:
384
+ ref_result = reference_tool_func(options.reference_annotation, dna_regions)
385
+ except TypeError:
386
+ # Fallback to legacy signature, try passing genome seq string
387
+ genome_seq = ''.join([dna_regions[c][0] for c in dna_regions])
388
+ ref_result = reference_tool_func(reference_annotation=options.reference_annotation, genome_seq=genome_seq, gene_ident=options.gene_ident)
389
+ # Expect ref_result to be dict of contig -> {pos: data}
390
+ for contig, mapping in ref_result.items() if isinstance(ref_result, dict) else []:
391
+ ref_genes_by_contig[contig].update(mapping)
392
+
393
+ # Ensure each contig has an OrderedDict even if empty
394
+ for contig in dna_regions:
395
+ if contig not in ref_genes_by_contig:
396
+ ref_genes_by_contig[contig] = OrderedDict()
397
+
398
+ # Collect additional annotations per contig
399
+ additional_annotations_by_contig = defaultdict(OrderedDict)
164
400
  tool_count = 0
165
401
  for tool in options.additional_tool.split(','):
166
402
  try:
167
- additional_tool_ = import_module('Tools.' + tool + '.' + tool,
168
- package='my_current_pkg')
403
+ additional_tool_mod = import_module('Tools.' + tool + '.' + tool, package='my_current_pkg')
169
404
  except ModuleNotFoundError:
170
405
  try:
171
- additional_tool_ = import_module('ORForise.Tools.' + tool + '.' + tool,
172
- package='my_current_pkg')
406
+ additional_tool_mod = import_module('ORForise.Tools.' + tool + '.' + tool, package='my_current_pkg')
173
407
  except ModuleNotFoundError:
174
408
  sys.exit("Tool not available")
175
- additional_tool_ = getattr(additional_tool_, tool)
176
- current_additional_orfs = additional_tool_(options.additional_annotation.split(',')[tool_count], genome_seq,options.gene_ident)#,gene_ident=options.gene_ident)
409
+ additional_tool_func = getattr(additional_tool_mod, tool)
410
+
411
+ anno_file = options.additional_annotation.split(',')[tool_count]
177
412
  tool_count += 1
178
- orfs_to_remove = []
179
- for orf in current_additional_orfs.keys():
180
- o_start = int(orf.split(',')[0])
181
- o_stop = int(orf.split(',')[1])
182
- orf_set = set(range(int(o_start), int(o_stop) + 1))
183
- for pos, details in ref_genes.items(): # Loop through each gene to compare against predicted ORFs - Slow
184
- g_start = int(pos.split(',')[0])
185
- g_stop = int(pos.split(',')[1])
186
-
187
- gene_set = set(range(int(g_start), int(g_stop) + 1))
188
- cov = len(orf_set.intersection(gene_set))
189
- if g_start > o_stop:
190
- break
191
- if cov >= options.overlap:
192
- orfs_to_remove.append(str(o_start) + ',' + str(o_stop))
193
- ref_genes[pos].append(current_additional_orfs[orf][4]) # record overlap
194
- break
195
- try:
196
- for pos, details in additional_annotations.items():
197
- a_start = int(pos.split(',')[0])
198
- a_stop = int(pos.split(',')[1])
199
- add_set = set(range(int(a_start), int(a_stop) + 1))
200
- cov = len(orf_set.intersection(add_set))
201
- if a_start > a_stop:
202
- break
203
- if cov >= options.overlap:
204
- #orfs_to_remove.append(str(a_start) + ',' + str(a_stop))
205
- additional_annotations[pos].append(current_additional_orfs[orf][4]) # record overlap
206
- break
207
- except:
208
- break
209
-
210
- for orf_key in orfs_to_remove: # Remove ORFs from out of frame if ORF was correctly matched to another Gene
211
- if orf_key in current_additional_orfs:
212
- del current_additional_orfs[orf_key]
213
- additional_annotations.update(current_additional_orfs)
214
- #########################################################
215
- combined_ORFs = {**ref_genes, **additional_annotations}
216
- combined_ORFs = sortORFs(combined_ORFs)
217
-
218
- if not options.reference_tool:
219
- options.reference_tool = 'Reference_Annotation'
220
- gff_writer(options, genome_ID, options.genome_DNA, options.reference_annotation, options.reference_tool, ref_gene_set, options.additional_annotation, options.additional_tool, combined_ORFs, options.output_file)
413
+ # Try calling tool in multi-contig mode first
414
+ try:
415
+ tool_orfs = additional_tool_func(anno_file, dna_regions)
416
+ except TypeError:
417
+ # Fallback to legacy signature expecting genome_seq
418
+ genome_seq = ''.join([dna_regions[c][0] for c in dna_regions])
419
+ tool_orfs = additional_tool_func(anno_file, genome_seq, options.gene_ident)
420
+
421
+ # tool_orfs may be either {contig: {pos: data}} or a flat {pos: data}
422
+ if isinstance(tool_orfs, dict):
423
+ # If top-level keys look like contig names (present in dna_regions) then treat as multi-contig
424
+ top_keys = list(tool_orfs.keys())
425
+ if top_keys and top_keys[0] in dna_regions:
426
+ for contig, mapping in tool_orfs.items():
427
+ # Merge mapping into additional_annotations_by_contig and record tool provenance
428
+ for pos_k, pos_v in mapping.items():
429
+ # store tuple (value, tool)
430
+ additional_annotations_by_contig[contig][pos_k] = (pos_v, tool)
431
+ else:
432
+ # Treat as flat mapping — assume single contig if only one contig present
433
+ if len(dna_regions) == 1:
434
+ only_contig = next(iter(dna_regions))
435
+ for pos_k, pos_v in tool_orfs.items():
436
+ additional_annotations_by_contig[only_contig][pos_k] = (pos_v, tool)
437
+ else:
438
+ # If multiple contigs but mapping has contig-prefixed keys like 'contig,start,stop', split them
439
+ for k, v in tool_orfs.items():
440
+ parts = k.split(',')
441
+ if len(parts) == 3 and parts[0] in dna_regions:
442
+ contig = parts[0]
443
+ pos = parts[1] + ',' + parts[2]
444
+ additional_annotations_by_contig[contig][pos] = (v, tool)
445
+ else:
446
+ # Unknown format: assign nowhere (skip)
447
+ continue
448
+ tool_orfs = None
449
+
450
+ # Combine per-contig: keep reference entries and append additional annotations as supplemental
451
+ combined_ORFs_by_contig = OrderedDict()
452
+ for contig in dna_regions:
453
+ combined = OrderedDict()
454
+ # Add reference entries first; normalise to [strand, 'ref', type, ref_info, additional_list]
455
+ for pos, val in ref_genes_by_contig.get(contig, {}).items():
456
+ strand = val[0] if len(val) > 0 else '.'
457
+ src = 'ref'
458
+ source_field = val[1] if len(val) > 1 else 'ref'
459
+ ftype = val[2] if len(val) > 2 else '.'
460
+ ref_info = val[3] if len(val) > 3 else '.'
461
+ combined[pos] = [strand, src, ftype, ref_info, [], source_field]
462
+
463
+ # Now incorporate additional annotations without overwriting reference entries
464
+ for pos, wrapped in additional_annotations_by_contig.get(contig, {}).items():
465
+ # wrapped is (value, tool)
466
+ if isinstance(wrapped, tuple) and len(wrapped) == 2:
467
+ val, toolname = wrapped
468
+ else:
469
+ val = wrapped
470
+ toolname = ''
471
+
472
+ # Extract strand/type/info from value heuristically
473
+ strand_a = val[0] if isinstance(val, (list, tuple)) and len(val) > 0 else '.'
474
+ ftype_a = val[3] if isinstance(val, (list, tuple)) and len(val) > 2 else '.'
475
+ info_a = ''
476
+ if isinstance(val, (list, tuple)) and len(val) > 3:
477
+ info_a = val[4]
478
+ elif isinstance(val, str):
479
+ info_a = val
480
+
481
+ # If matching pos exists in reference, append additional info to its additional list
482
+ if pos in combined:
483
+ addstr = (toolname + ':' + info_a) if toolname else info_a
484
+ combined[pos][4].append(addstr)
485
+ else:
486
+ # Create a new entry for additional-only annotation: [strand, 'add', type, '.', [tool:info]]
487
+ addstr = (toolname + ':' + info_a) if toolname else info_a
488
+ #combined[pos] = [strand_a, 'add', ftype_a if ftype_a else '.', '.', [addstr]]
489
+ combined[pos] = [strand_a, 'add', ftype_a if ftype_a else '.', '.', [addstr], toolname]
490
+ # Sort ORFs for this contig
491
+ combined = sortORFs(combined)
492
+ combined_ORFs_by_contig[contig] = combined
493
+
494
+ # Call writer
495
+ gff_writer(options, combined_ORFs_by_contig, options.output_file, options.reference_annotation, options.additional_annotation)
221
496
 
222
497
 
223
498
  def main():
224
- print("Thank you for using ORForise\nPlease report any issues to: https://github.com/NickJD/ORForise/issues\n#####")
499
+ print(WELCOME)
225
500
 
226
501
  parser = argparse.ArgumentParser(description='ORForise ' + ORForise_Version + ': GFF-Adder Run Parameters.')
227
502
  parser._action_groups.pop()
@@ -242,15 +517,15 @@ def main():
242
517
  optional.add_argument('-rt', dest='reference_tool', required=False,
243
518
  help='Which tool format to use as reference? - If not provided, will default to the '
244
519
  'standard GFF format and will only look for "CDS" features')
245
- optional.add_argument('-gene_ident', action='store', dest='gene_ident', default='CDS',
520
+ optional.add_argument('--gene_ident', action='store', dest='gene_ident', default='CDS',
246
521
  help='Identifier used for identifying genomic features in reference annotation "CDS,rRNA,tRNA"')
247
- optional.add_argument('-mc', dest='mark_consensus', default=False, type=bool, required=False,
522
+ optional.add_argument('-mc', dest='mark_consensus', action='store_true', required=False,
248
523
  help='Default - False: Mark reference annotations which where present in the additional tool annotation')
249
- optional.add_argument('-c', dest='clean', default=False, type=bool, required=False,
524
+ optional.add_argument('-c', dest='clean', action='store_true', required=False,
250
525
  help='Default - False: Do not mark 9th column with "Original/Matched/Additional tag"')
251
- optional.add_argument('-meta', dest='output_meta', default=False, type=bool, required=False,
526
+ optional.add_argument('--meta', dest='output_meta', action='store_true', required=False,
252
527
  help='Default - False: Output metadata file')
253
- optional.add_argument('-olap', dest='overlap', default=50, type=int, required=False,
528
+ optional.add_argument('--olap', dest='overlap', default=50, type=int, required=False,
254
529
  help='Maximum overlap between reference and additional genic regions (CDS,rRNA etc) - Default: 50 nt')
255
530
 
256
531
  misc = parser.add_argument_group('Misc')