ORForise 1.5.1__py3-none-any.whl → 1.6.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ORForise/Aggregate_Compare.py +2 -4
- ORForise/Annotation_Compare.py +16 -53
- ORForise/Annotation_Intersector.py +726 -0
- ORForise/Aux/TabToGFF/TabToGFF.py +140 -0
- ORForise/Convert_To_GFF.py +139 -0
- ORForise/GFF_Adder.py +454 -179
- ORForise/List_Tools.py +63 -0
- ORForise/StORForise.py +8 -4
- ORForise/Tools/EasyGene/EasyGene.py +13 -1
- ORForise/Tools/{GLIMMER_3/GLIMMER_3.py → GLIMMER3/GLIMMER3.py} +2 -2
- ORForise/Tools/GLIMMER3/__init__.py +0 -0
- ORForise/Tools/{GeneMark_HA/GeneMark_HA.py → GeneMarkHA/GeneMarkHA.py} +1 -1
- ORForise/Tools/GeneMarkHA/__init__.py +0 -0
- ORForise/Tools/Prodigal/Prodigal.py +13 -1
- ORForise/utils.py +4 -1
- orforise-1.6.1.dist-info/METADATA +1038 -0
- {orforise-1.5.1.dist-info → orforise-1.6.1.dist-info}/RECORD +29 -24
- {orforise-1.5.1.dist-info → orforise-1.6.1.dist-info}/entry_points.txt +6 -2
- ORForise/GFF_Intersector.py +0 -192
- orforise-1.5.1.dist-info/METADATA +0 -427
- /ORForise/{Tools → Aux}/StORF_Undetected/Completely_Undetected/Completey_Undetected.py +0 -0
- /ORForise/{Tools/GLIMMER_3 → Aux/StORF_Undetected/Completely_Undetected}/__init__.py +0 -0
- /ORForise/{Tools → Aux}/StORF_Undetected/StORF_Undetected.py +0 -0
- /ORForise/{Tools/GeneMark_HA → Aux/StORF_Undetected}/__init__.py +0 -0
- /ORForise/{Tools/StORF_Undetected/Completely_Undetected → Aux/StORF_Undetected/unvitiated_Genes}/__init__.py +0 -0
- /ORForise/{Tools → Aux}/StORF_Undetected/unvitiated_Genes/unvitiated_Missed_Genes.py +0 -0
- /ORForise/{Tools/StORF_Undetected → Aux/TabToGFF}/__init__.py +0 -0
- /ORForise/{Tools/StORF_Undetected/unvitiated_Genes → Aux}/__init__.py +0 -0
- {orforise-1.5.1.dist-info → orforise-1.6.1.dist-info}/WHEEL +0 -0
- {orforise-1.5.1.dist-info → orforise-1.6.1.dist-info}/licenses/LICENSE +0 -0
- {orforise-1.5.1.dist-info → orforise-1.6.1.dist-info}/top_level.txt +0 -0
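The largest change below is the rewrite of ORForise/GFF_Adder.py to be multi-contig aware. For orientation, here is a minimal sketch (not part of the package; the contig name, coordinates and annotation strings are invented) of the per-contig data shapes the new code builds, as they can be read off the diff: dna_regions maps each contig to a (sequence, length, list, None) tuple, and gff_writer() receives an OrderedDict of per-contig OrderedDicts keyed by 'start,stop' positions.

from collections import OrderedDict

# dna_regions, as built by gff_adder()'s single-contig fallback:
# contig ID -> (sequence, sequence length, empty list, None)
dna_regions = OrderedDict()
dna_regions['contig_1'] = ('ATGAAATAG', 9, list(), None)

# combined_ORFs_by_contig, as handed to gff_writer():
# contig ID -> { 'start,stop': [strand, 'ref' or 'add', feature type,
#                               reference info column, ['ToolName:info', ...], source] }
combined_ORFs_by_contig = OrderedDict()
combined_ORFs_by_contig['contig_1'] = OrderedDict([
    ('1,9',   ['+', 'ref', 'CDS', 'ID=gene_1', ['Prodigal:ID=1_1'], 'ensembl']),  # reference gene also matched by a tool
    ('20,50', ['-', 'add', 'CDS', '.', ['Prodigal:ID=1_2'], 'Prodigal']),          # additional-only prediction
])

# gff_writer() walks these structures roughly like this:
for contig, entries in combined_ORFs_by_contig.items():
    for pos, data in entries.items():
        start, stop = pos.split(',')[0], pos.split(',')[-1]
        print(contig, start, stop, data[0], data[1], ';'.join(data[4]))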
ORForise/GFF_Adder.py
CHANGED
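Before the hunks, a hedged sketch of how the rewritten module could be driven programmatically. The attribute names mirror the argparse dest values and the options.* attributes visible in the diff below; the exact attribute set gff_adder() requires is inferred from the diff, the file paths are placeholders, and the packaged console scripts remain the supported interface.

# Illustrative only: drive gff_adder() directly with a Namespace whose attribute
# names mirror the argparse dests seen in this diff. Paths are placeholders and
# must point at real files for the call to do anything useful.
from argparse import Namespace
from ORForise.GFF_Adder import gff_adder

options = Namespace(
    genome_DNA='genome.fa.gz',             # FASTA; gzip handled via the gzip.open fallback
    reference_annotation='reference.gff',  # reference GFF (gzip detected by magic bytes)
    reference_tool=None,                   # None -> treat the reference as a plain GFF
    additional_tool='Prodigal',            # comma-separated tool names
    additional_annotation='prodigal.gff',  # comma-separated files, one per tool
    gene_ident='CDS',                      # feature types taken from the reference
    output_file='combined.gff',
    clean=False,                           # keep Original/Matched/Additional ID tags in column 9
    output_meta=True,                      # also write combined_Meta.txt
    overlap=50,                            # --olap, default 50 nt
    mark_consensus=False,
)
gff_adder(options)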
@@ -1,9 +1,10 @@
 from importlib import import_module
 import argparse
-import
-
+from collections import OrderedDict, defaultdict, Counter
+import gzip
 from datetime import date
 import sys
+
 try:
     from utils import *
 except ImportError:
@@ -11,217 +12,491 @@ except ImportError:
 
 
 ########################################
-
-
-def gff_writer(options,genome_ID, genome_DNA, reference_annotation, reference_tool, ref_gene_set, additional_annotation, additional_tool, combined_ORFs, output_file):
+def gff_writer(options, combined_ORFs_by_contig, output_file, reference_annotation, additional_annotation):
     write_out = open(output_file, 'w')
-
-    #write_out.write('##sequence-region ' + genome_ID + ' 1 ' + str(len(genome_DNA)) + '\n')
     write_out.write("##gff-version\t3\n#\tGFF-Adder\n#\tRun Date:" + str(date.today()) + '\n')
-    write_out.write("##Genome DNA File:" + genome_DNA + '\n')
+    write_out.write("##Genome DNA File:" + (options.genome_DNA if hasattr(options, 'genome_DNA') else '') + '\n')
     write_out.write("##Original File: " + reference_annotation + "\n##Additional File: " + additional_annotation + '\n')
+    # meta counts
+    Ref_Only = 0
+    Ref_Combined = defaultdict(int)
+    Non_Ref_Combined = defaultdict(int)
 
+    # New counters: per-tool totals and per-contig per-tool breakdown
+    per_tool_total = Counter() # counts for additional-only entries
+    per_contig_per_tool = defaultdict(Counter) # contig -> Counter(tool -> count)
+    ref_per_tool_total = Counter() # counts for reference entries matched by tools
+    ref_per_contig_per_tool = defaultdict(Counter) # contig -> Counter(tool -> count)
 
-    #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        matched = False
-        for tool in tools:
+    # Iterate contigs in deterministic order
+    for contig in combined_ORFs_by_contig:
+        combined_ORFs = combined_ORFs_by_contig[contig]
+        # ref_gene_set for this contig: use keys from ref portion (we can detect by data[1]=='ref')
+        ref_gene_set = [k for k, v in combined_ORFs.items() if len(v) > 1 and v[1] == 'ref']
+
+        for pos, data in combined_ORFs.items():
+            pos_ = pos.split(',')
+            # pos may be like 'start,stop' or 'contig,start,stop' but here we expect 'start,stop'
+            if len(pos_) >= 2:
+                start = pos_[0]
+                stop = pos_[-1]
+            else:
+                # fallback: skip malformed
+                continue
+            strand = data[0]
 
+            # Build additional_annotation_info from the combined entry's additional list if present.
+            # Normalise entries and prefer the info portion after any 'ToolName:info' prefix.
+            additional_annotation_info = ''
+            additional_items = []
+            if len(data) > 4 and data[4]:
+                for add in data[4]:
+                    s = str(add).strip()
+                    if not s:
+                        continue
+                    if ':' in s:
+                        # split tool:info -> take info part
+                        _, info_part = s.split(':', 1)
+                        info_part = info_part.strip()
+                    else:
+                        info_part = s
+                    if info_part:
+                        additional_items.append(info_part)
+                # deduplicate while preserving order
+                seen = set()
+                deduped = []
+                for it in additional_items:
+                    if it not in seen:
+                        seen.add(it)
+                        deduped.append(it)
+                if deduped:
+                    additional_annotation_info = ';'.join(deduped)
+            elif len(data) > 3 and data[3]:
+                additional_annotation_info = str(data[3]).strip()
+
+            # tools list from options (maybe empty)
+            tools = options.additional_tool.split(',') if getattr(options, 'additional_tool', None) else []
+
+            # Determine matched_tools_list reliably:
+            # prefer extracting tool names from data[4] entries, otherwise fallback to scanning values
+            matched_tools_list = []
             try:
-                if
-
-
+                if len(data) > 4 and data[4]:
+                    for add in data[4]:
+                        # expected format: "ToolName:info" or "info"
+                        if isinstance(add, str) and ':' in add:
+                            t = add.split(':', 1)[0].strip()
+                            if t:
+                                matched_tools_list.append(t)
+                        else:
+                            # try to detect one of the known tool names in the string
+                            for tool in tools:
+                                if tool and tool in str(add):
+                                    matched_tools_list.append(tool)
                 else:
-
-
-
-
-
-
-
-            if options.verbose == True:
-                print("Exception - (No matching annotation) : " + str(e))
-                continue
-            #temporary verbose fix
-            additional_annotation_info = 'ID='
-            if len(match) >1:
-                for match in matching:
-                    additional_annotation_info += match+'|'
-                additional_annotation_info = additional_annotation_info[:-1]
-            elif len(match) == 1:
-                additional_annotation_info += matching[0].replace('Prokka|','').replace('GeneMark_S_2|','')
-
-            matching = None
-
-            if pos not in ref_gene_set: # Check if ref or additional
-                type = matched_tools[:-1]
-                Non_Ref_Combined[len(matched_tools.split(','))] += 1
-                if options.clean == False:
-                    entry = (genome_ID + '\t' + type + '\t' + data[3] + '\t' + start + '\t' + stop + '\t.\t' + strand + '\t.\tID=Additional_Annotations;' + additional_annotation_info + '\n')
-                else:
-                    entry = (genome_ID + '\t' + type + '\t' + data[3] + '\t' + start + '\t' + stop + '\t.\t' + strand + '\t.\t' + additional_annotation_info + '\n')
+                    # fallback: scan the whole data structure for tool names (previous behaviour)
+                    for tool in tools:
+                        if tool and any(tool in str(x) for x in data):
+                            matched_tools_list.append(tool)
+            except Exception:
+                # keep matched_tools_list empty if anything unexpected happens
+                matched_tools_list = []
 
-
-
-
-
-
-
-
+            # Normalise matched tools: unique, sorted
+            matched_tools_list = sorted(set(matched_tools_list))
+            matched_tools_str = ','.join(matched_tools_list)
+            matched_count = len(matched_tools_list)
+
+            # Build GFF entry and update meta counters
+            if pos not in ref_gene_set: # Additional-only entry (not in reference)
+                type_field = matched_tools_str if matched_tools_str else ''
+                Non_Ref_Combined[matched_count] += 1
+
+                # Update per-tool counters for additional-only entries
+                if matched_tools_list:
+                    for t in matched_tools_list:
+                        per_tool_total[t] += 1
+                        per_contig_per_tool[contig][t] += 1
+                else:
+                    # track unassigned additional entries (no tool name found)
+                    per_tool_total['unassigned'] += 1
+                    per_contig_per_tool[contig]['unassigned'] += 1
+
+                if not getattr(options, 'clean', False):
+                    entry = (contig + '\t' + type_field + '\t' + (data[2] if len(data) > 3 else '.') + '\t' + start + '\t' + stop + '\t.\t' + strand + '\t.\tID=Additional_Annotations;' + (additional_annotation_info if additional_annotation_info else '') + '\n')
                 else:
-                    entry = (
+                    entry = (contig + '\t' + type_field + '\t' + (data[2] if len(data) > 3 else '.') + '\t' + start + '\t' + stop + '\t.\t' + strand + '\t.\t' + (additional_annotation_info if additional_annotation_info else '') + '\n')
+
             else:
-
-
-
-
-
+                # Reference entry
+                if len(data) > 3 and data[3]:
+                    info_field = data[3].replace('\n', '').strip()
+                else:
+                    info_field = '.'
+
+                # Determine type and source fields
+                type_field = data[2] if len(data) > 2 and data[2] else (
+                    reference_annotation.split('/')[-1].split('.')[0] if reference_annotation else '.')
+                source_field = data[5] if len(data) > 5 and data[5] else (
+                    reference_annotation.split('/')[-1].split('.')[0] if reference_annotation else '')
+
+                # If additional_annotation_info duplicates info_field content, remove duplicate fragments.
+                filtered_additional = ''
+                if additional_annotation_info:
+                    add_parts = [p.strip() for p in str(additional_annotation_info).split(';') if p.strip()]
+                    info_parts = [p.strip() for p in str(info_field).split(';') if p.strip() and p.strip() != '.']
+                    filtered = []
+                    for ap in add_parts:
+                        dup = False
+                        for ip in info_parts:
+                            # treat duplication if exact match or obvious substring relationship
+                            if ip and (ap == ip or ip in ap or ap in ip):
+                                dup = True
+                                break
+                        if not dup:
+                            filtered.append(ap)
+                    filtered_additional = ';'.join(filtered)
+
+                if not filtered_additional:
+                    # Reference-only (no meaningful unique additional annotations)
+                    Ref_Only += 1
+                    if not getattr(options, 'clean', False):
+                        entry = (
+                            contig + '\t' + source_field + '\t' + type_field + '\t' + start + '\t' + stop + '\t.\t' + strand + '\t.\tID=Original_Annotation;' + info_field + '\n')
+                    else:
+                        entry = (
+                            contig + '\t' + source_field + '\t' + type_field + '\t' + start + '\t' + stop + '\t.\t' + strand + '\t.\t' + info_field + '\n')
                 else:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                    # Reference entry that had additional annotations (combined)
+                    Ref_Combined[matched_count] += 1
+
+                    # Update per-tool counters for reference-matched entries
+                    if matched_tools_list:
+                        for t in matched_tools_list:
+                            ref_per_tool_total[t] += 1
+                            ref_per_contig_per_tool[contig][t] += 1
+                    else:
+                        ref_per_tool_total['unassigned'] += 1
+                        ref_per_contig_per_tool[contig]['unassigned'] += 1
+
+                    if not getattr(options, 'clean', False):
+                        entry = (
+                            contig + '\t' + source_field + '\t' + type_field + '\t' + start + '\t' + stop + '\t.\t' + strand + '\t.\tID=Original_Annotation;' + info_field + ';Matched_Annotations=' + filtered_additional + '\n')
+                    else:
+                        entry = (
+                            contig + '\t' + source_field + '\t' + type_field + '\t' + start + '\t' + stop + '\t.\t' + strand + '\t.\t' + info_field + '\n')
+
+            write_out.write(entry)
+
+    # Produce metadata output if requested
+    if getattr(options, 'output_meta', False) == True:
+        # Summaries
+        total_ref_combined = sum(Ref_Combined.values())
+        total_nonref = sum(Non_Ref_Combined.values())
+        total_reference_genes = Ref_Only + total_ref_combined
+        Ref_Combined_counter = Counter(Ref_Combined)
+        Non_Ref_Combined_counter = Counter(Non_Ref_Combined)
+        with open(output_file.replace('.gff','_Meta.txt'),'w') as meta_out:
+            meta_out.write("GFF-Adder Metadata Report\n")
+            meta_out.write("=========================\n")
+            meta_out.write("Run Date: {}\n".format(date.today()))
+            meta_out.write("Genome DNA: {}\n".format(getattr(options, 'genome_DNA', 'N/A')))
+            meta_out.write("Reference annotation: {}\n".format(reference_annotation))
+            meta_out.write("Additional annotation(s): {}\n\n".format(additional_annotation))
+
+            meta_out.write("Summary counts\n")
+            meta_out.write("--------------\n")
+            meta_out.write(f"Reference-only genes (no matching additional annotation): {Ref_Only}\n")
+            meta_out.write(f"Reference genes with additional matches (combined): {total_ref_combined}\n")
+            meta_out.write(f"TOTAL reference genes observed: {total_reference_genes}\n")
+            meta_out.write(f"Additional-only genes (not present in reference): {total_nonref}\n\n")
+
+            meta_out.write("Distribution of matches for reference-combined entries (num matched tools -> count)\n")
+            if Ref_Combined_counter:
+                for k in sorted(Ref_Combined_counter):
+                    meta_out.write(f" {k:>3} {Ref_Combined_counter[k]}\n")
             else:
-
+                meta_out.write(" None\n")
+            meta_out.write("\n")
+
+            meta_out.write("Distribution of matches for non-reference entries (num matched tools -> count)\n")
+            if Non_Ref_Combined_counter:
+                for k in sorted(Non_Ref_Combined_counter):
+                    meta_out.write(f" {k:>3} {Non_Ref_Combined_counter[k]}\n")
+            else:
+                meta_out.write(" None\n")
+            meta_out.write("\n")
+
+            # Per-tool totals for additional-only entries
+            meta_out.write("Per-tool additional-only annotation totals\n")
+            meta_out.write("-----------------------------------------\n")
+            if per_tool_total:
+                for t, c in per_tool_total.most_common():
+                    meta_out.write(f" {t}: {c}\n")
+            else:
+                meta_out.write(" None\n")
+            meta_out.write("\n")
+
+            # Per-contig breakdown for additional-only entries: only contigs with additional genes
+            meta_out.write("Per-contig breakdown for additional-only annotations (only contigs with additions shown)\n")
+            meta_out.write("---------------------------------------------------------------------------------------\n")
+            if per_contig_per_tool:
+                for contig in sorted(per_contig_per_tool):
+                    counter = per_contig_per_tool[contig]
+                    if sum(counter.values()) == 0:
+                        continue
+                    meta_out.write(f" {contig}:\n")
+                    for t, c in counter.most_common():
+                        meta_out.write(f" {t}: {c}\n")
+                meta_out.write("\n")
+            else:
+                meta_out.write(" None\n\n")
+
+            # Per-tool totals and per-contig breakdown for reference genes matched by additional tools
+            meta_out.write("Per-tool totals for reference genes matched by additional tools\n")
+            meta_out.write("---------------------------------------------------------------\n")
+            if ref_per_tool_total:
+                for t, c in ref_per_tool_total.most_common():
+                    meta_out.write(f" {t}: {c}\n")
+            else:
+                meta_out.write(" None\n")
+            meta_out.write("\n")
+
+            meta_out.write("Per-contig breakdown for reference genes matched by additional tools\n")
+            meta_out.write("--------------------------------------------------------------------\n")
+            if ref_per_contig_per_tool:
+                for contig in sorted(ref_per_contig_per_tool):
+                    counter = ref_per_contig_per_tool[contig]
+                    if sum(counter.values()) == 0:
+                        continue
+                    meta_out.write(f" {contig}:\n")
+                    for t, c in counter.most_common():
+                        meta_out.write(f" {t}: {c}\n")
+                meta_out.write("\n")
+            else:
+                meta_out.write(" None\n\n")
+
+            meta_out.write("Notes\n")
+            meta_out.write("-----\n")
+            meta_out.write(" - 'Reference-only' are reference entries that had no recorded additional annotation information.\n")
+            meta_out.write(" - 'Per-tool' counts are based on the tool names extracted from the additional-annotation provenance\n")
+            meta_out.write("    (expected format in combined entries: 'ToolName:info'). Entries with no tool detected are shown as 'unassigned'.\n")
+            meta_out.write("\nEnd of report\n")
+
+
+
+def gff_adder(options):
+    # Load fasta into dna_regions (supports multi-contig)
+    try:
+        try:
+            fasta_in = gzip.open(options.genome_DNA, 'rt')
+            dna_regions = fasta_load(fasta_in)
+        except Exception:
+            fasta_in = open(options.genome_DNA, 'r', encoding='unicode_escape')
+            dna_regions = fasta_load(fasta_in)
+    except Exception:
+        # Fallback to legacy single-contig behaviour
+        genome_seq = ""
+        with open(options.genome_DNA, 'r') as genome_fasta:
+            for line in genome_fasta:
+                line = line.replace("\n", "")
+                if not line.startswith('>'):
+                    genome_seq += str(line)
+                else:
+                    genome_ID = line.split()[0].replace('>','')
+        # Create dna_regions with single entry
+        dna_regions = OrderedDict()
+        dna_regions[genome_ID] = (genome_seq, len(genome_seq), list(), None)
+
     ###########################################
-
-
+    # Build reference gene dict per-contig
+    ref_genes_by_contig = defaultdict(OrderedDict)
+
+    if not options.reference_tool: # IF using Ensembl/file for comparison
+        # Parse reference gff to populate ref_genes_by_contig (retain original info fields)
+        # Detect gzip by magic bytes (first two bytes)
+        is_gz = False
+        with open(options.reference_annotation, 'rb') as _probe:
+            magic = _probe.read(2)
+            is_gz = (magic == b'\x1f\x8b')
+
+        if is_gz:
+            gff_in = gzip.open(options.reference_annotation, 'rt', errors='replace')
+        else:
+            # Open as plain text, replace undecodable bytes rather than fail
+            gff_in = open(options.reference_annotation, 'r', encoding='utf-8', errors='replace')
+
         count = 0
-
-        for line in
-            line
+        try:
+            for line in gff_in:
+                if line.startswith('#') or line.strip() == '':
+                    continue
+                parts = line.strip().split('\t')
+                if len(parts) < 9:
+                    continue
+                contig = parts[0]
+                if contig not in dna_regions:
+                    # skip records for contigs not in provided fasta
+                    continue
                 try:
                     if 'CDS' in options.gene_ident and len(options.gene_ident) == 1:
-                        if "CDS" in
-                            start = int(
-                            stop = int(
-                            strand =
-                            pos =
-
-
+                        if "CDS" in parts[2] and len(parts) == 9:
+                            start = int(parts[3])
+                            stop = int(parts[4])
+                            strand = parts[6]
+                            pos = f"{start},{stop}"
+                            # store as [strand, source, type, info] to match downstream expectations
+                            info = parts[8]
+                            ref_genes_by_contig[contig][pos] = [strand, parts[1], parts[2], info]
                             count += 1
                     else:
                         gene_types = options.gene_ident.split(',')
-                        if any(gene_type in
-                            start = int(
-                            stop = int(
-                            strand =
-                            pos =
-
-
+                        if any(gene_type in parts[2] for gene_type in gene_types):
+                            start = int(parts[3])
+                            stop = int(parts[4])
+                            strand = parts[6]
+                            pos = f"{start},{stop}"
+                            # store as [strand, source, type, info] to match downstream expectations
+                            info = parts[8]
+                            ref_genes_by_contig[contig][pos] = [strand, parts[1], parts[2], info]
                             count += 1
                 except IndexError:
                     continue
-
-
-
+        finally:
+            try:
+                gff_in.close()
+            except Exception:
+                pass
+
+    else:
+        # Reference tool provided: attempt to call it with dna_regions first (multi-contig aware), fallback to legacy signature
+        reference_tool = options.reference_tool if options.reference_tool != 'StORF_Reporter' else 'StORF_Reporter'
         try:
-
-                package='my_current_pkg')
+            reference_tool_mod = import_module('Tools.' + reference_tool + '.' + reference_tool, package='my_current_pkg')
        except ModuleNotFoundError:
            try:
-
-                    package='my_current_pkg')
+                reference_tool_mod = import_module('ORForise.Tools.' + reference_tool + '.' + reference_tool, package='my_current_pkg')
             except ModuleNotFoundError:
                 sys.exit("Tool not available")
-
-
-
-
-
-
-
-
-
+        reference_tool_func = getattr(reference_tool_mod, reference_tool)
+        # Try multi-contig signature
+        try:
+            ref_result = reference_tool_func(options.reference_annotation, dna_regions)
+        except TypeError:
+            # Fallback to legacy signature, try passing genome seq string
+            genome_seq = ''.join([dna_regions[c][0] for c in dna_regions])
+            ref_result = reference_tool_func(reference_annotation=options.reference_annotation, genome_seq=genome_seq, gene_ident=options.gene_ident)
+        # Expect ref_result to be dict of contig -> {pos: data}
+        for contig, mapping in ref_result.items() if isinstance(ref_result, dict) else []:
+            ref_genes_by_contig[contig].update(mapping)
+
+    # Ensure each contig has an OrderedDict even if empty
+    for contig in dna_regions:
+        if contig not in ref_genes_by_contig:
+            ref_genes_by_contig[contig] = OrderedDict()
+
+    # Collect additional annotations per contig
+    additional_annotations_by_contig = defaultdict(OrderedDict)
     tool_count = 0
     for tool in options.additional_tool.split(','):
         try:
-
-                package='my_current_pkg')
+            additional_tool_mod = import_module('Tools.' + tool + '.' + tool, package='my_current_pkg')
         except ModuleNotFoundError:
             try:
-
-                    package='my_current_pkg')
+                additional_tool_mod = import_module('ORForise.Tools.' + tool + '.' + tool, package='my_current_pkg')
             except ModuleNotFoundError:
                 sys.exit("Tool not available")
-
-
+        additional_tool_func = getattr(additional_tool_mod, tool)
+
+        anno_file = options.additional_annotation.split(',')[tool_count]
         tool_count += 1
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        # Try calling tool in multi-contig mode first
+        try:
+            tool_orfs = additional_tool_func(anno_file, dna_regions)
+        except TypeError:
+            # Fallback to legacy signature expecting genome_seq
+            genome_seq = ''.join([dna_regions[c][0] for c in dna_regions])
+            tool_orfs = additional_tool_func(anno_file, genome_seq, options.gene_ident)
+
+        # tool_orfs may be either {contig: {pos: data}} or a flat {pos: data}
+        if isinstance(tool_orfs, dict):
+            # If top-level keys look like contig names (present in dna_regions) then treat as multi-contig
+            top_keys = list(tool_orfs.keys())
+            if top_keys and top_keys[0] in dna_regions:
+                for contig, mapping in tool_orfs.items():
+                    # Merge mapping into additional_annotations_by_contig and record tool provenance
+                    for pos_k, pos_v in mapping.items():
+                        # store tuple (value, tool)
+                        additional_annotations_by_contig[contig][pos_k] = (pos_v, tool)
+            else:
+                # Treat as flat mapping — assume single contig if only one contig present
+                if len(dna_regions) == 1:
+                    only_contig = next(iter(dna_regions))
+                    for pos_k, pos_v in tool_orfs.items():
+                        additional_annotations_by_contig[only_contig][pos_k] = (pos_v, tool)
+                else:
+                    # If multiple contigs but mapping has contig-prefixed keys like 'contig,start,stop', split them
+                    for k, v in tool_orfs.items():
+                        parts = k.split(',')
+                        if len(parts) == 3 and parts[0] in dna_regions:
+                            contig = parts[0]
+                            pos = parts[1] + ',' + parts[2]
+                            additional_annotations_by_contig[contig][pos] = (v, tool)
+                        else:
+                            # Unknown format: assign nowhere (skip)
+                            continue
+        tool_orfs = None
+
+    # Combine per-contig: keep reference entries and append additional annotations as supplemental
+    combined_ORFs_by_contig = OrderedDict()
+    for contig in dna_regions:
+        combined = OrderedDict()
+        # Add reference entries first; normalise to [strand, 'ref', type, ref_info, additional_list]
+        for pos, val in ref_genes_by_contig.get(contig, {}).items():
+            strand = val[0] if len(val) > 0 else '.'
+            src = 'ref'
+            source_field = val[1] if len(val) > 1 else 'ref'
+            ftype = val[2] if len(val) > 2 else '.'
+            ref_info = val[3] if len(val) > 3 else '.'
+            combined[pos] = [strand, src, ftype, ref_info, [], source_field]
+
+        # Now incorporate additional annotations without overwriting reference entries
+        for pos, wrapped in additional_annotations_by_contig.get(contig, {}).items():
+            # wrapped is (value, tool)
+            if isinstance(wrapped, tuple) and len(wrapped) == 2:
+                val, toolname = wrapped
+            else:
+                val = wrapped
+                toolname = ''
+
+            # Extract strand/type/info from value heuristically
+            strand_a = val[0] if isinstance(val, (list, tuple)) and len(val) > 0 else '.'
+            ftype_a = val[3] if isinstance(val, (list, tuple)) and len(val) > 2 else '.'
+            info_a = ''
+            if isinstance(val, (list, tuple)) and len(val) > 3:
+                info_a = val[4]
+            elif isinstance(val, str):
+                info_a = val
+
+            # If matching pos exists in reference, append additional info to its additional list
+            if pos in combined:
+                addstr = (toolname + ':' + info_a) if toolname else info_a
+                combined[pos][4].append(addstr)
+            else:
+                # Create a new entry for additional-only annotation: [strand, 'add', type, '.', [tool:info]]
+                addstr = (toolname + ':' + info_a) if toolname else info_a
+                #combined[pos] = [strand_a, 'add', ftype_a if ftype_a else '.', '.', [addstr]]
+                combined[pos] = [strand_a, 'add', ftype_a if ftype_a else '.', '.', [addstr], toolname]
+        # Sort ORFs for this contig
+        combined = sortORFs(combined)
+        combined_ORFs_by_contig[contig] = combined
+
+    # Call writer
+    gff_writer(options, combined_ORFs_by_contig, options.output_file, options.reference_annotation, options.additional_annotation)
 
 
 def main():
-    print(
+    print(WELCOME)
 
     parser = argparse.ArgumentParser(description='ORForise ' + ORForise_Version + ': GFF-Adder Run Parameters.')
     parser._action_groups.pop()
@@ -242,15 +517,15 @@ def main():
     optional.add_argument('-rt', dest='reference_tool', required=False,
                           help='Which tool format to use as reference? - If not provided, will default to the '
                                'standard GFF format and will only look for "CDS" features')
-    optional.add_argument('
+    optional.add_argument('--gene_ident', action='store', dest='gene_ident', default='CDS',
                           help='Identifier used for identifying genomic features in reference annotation "CDS,rRNA,tRNA"')
-    optional.add_argument('-mc', dest='mark_consensus',
+    optional.add_argument('-mc', dest='mark_consensus', action='store_true', required=False,
                           help='Default - False: Mark reference annotations which where present in the additional tool annotation')
-    optional.add_argument('-c', dest='clean',
+    optional.add_argument('-c', dest='clean', action='store_true', required=False,
                           help='Default - False: Do not mark 9th column with "Original/Matched/Additional tag"')
-    optional.add_argument('
+    optional.add_argument('--meta', dest='output_meta', action='store_true', required=False,
                           help='Default - False: Output metadata file')
-    optional.add_argument('
+    optional.add_argument('--olap', dest='overlap', default=50, type=int, required=False,
                           help='Maximum overlap between reference and additional genic regions (CDS,rRNA etc) - Default: 50 nt')
 
     misc = parser.add_argument_group('Misc')