ORForise 1.6.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ORForise/Aggregate_Compare.py +378 -0
- ORForise/Annotation_Compare.py +317 -0
- ORForise/Annotation_Intersector.py +726 -0
- ORForise/Aux/StORF_Undetected/Completely_Undetected/Completey_Undetected.py +53 -0
- ORForise/Aux/StORF_Undetected/Completely_Undetected/__init__.py +0 -0
- ORForise/Aux/StORF_Undetected/StORF_Undetected.py +35 -0
- ORForise/Aux/StORF_Undetected/__init__.py +0 -0
- ORForise/Aux/StORF_Undetected/unvitiated_Genes/__init__.py +0 -0
- ORForise/Aux/StORF_Undetected/unvitiated_Genes/unvitiated_Missed_Genes.py +46 -0
- ORForise/Aux/TabToGFF/TabToGFF.py +140 -0
- ORForise/Aux/TabToGFF/__init__.py +0 -0
- ORForise/Aux/__init__.py +0 -0
- ORForise/Comparator.py +882 -0
- ORForise/Convert_To_GFF.py +141 -0
- ORForise/GFF_Adder.py +543 -0
- ORForise/List_Tools.py +56 -0
- ORForise/ORForise_Analysis/__init__.py +0 -0
- ORForise/ORForise_Analysis/cds_checker.py +77 -0
- ORForise/ORForise_Analysis/gene_Lenghts.py +28 -0
- ORForise/ORForise_Analysis/genome_Metrics.py +258 -0
- ORForise/ORForise_Analysis/hypothetical_gene_predictions.py +88 -0
- ORForise/ORForise_Analysis/missed_Gene_Metrics.py +277 -0
- ORForise/ORForise_Analysis/parital_Match_Analysis.py +230 -0
- ORForise/ORForise_Analysis/result_File_Analysis.py +286 -0
- ORForise/ORForise_Analysis/start_Codon_Substitution.py +161 -0
- ORForise/StORForise.py +115 -0
- ORForise/Tools/Augustus/Augustus.py +54 -0
- ORForise/Tools/Augustus/__init__.py +0 -0
- ORForise/Tools/Balrog/Balrog.py +56 -0
- ORForise/Tools/Balrog/__init__.py +0 -0
- ORForise/Tools/EasyGene/EasyGene.py +55 -0
- ORForise/Tools/EasyGene/__init__.py +0 -0
- ORForise/Tools/FGENESB/FGENESB.py +57 -0
- ORForise/Tools/FGENESB/__init__.py +0 -0
- ORForise/Tools/FragGeneScan/FragGeneScan.py +54 -0
- ORForise/Tools/FragGeneScan/__init__.py +0 -0
- ORForise/Tools/GFF/GFF.py +77 -0
- ORForise/Tools/GFF/__init__.py +0 -0
- ORForise/Tools/GLIMMER3/GLIMMER3.py +59 -0
- ORForise/Tools/GLIMMER3/__init__.py +0 -0
- ORForise/Tools/GeneMark/GeneMark.py +135 -0
- ORForise/Tools/GeneMark/__init__.py +0 -0
- ORForise/Tools/GeneMarkHA/GeneMarkHA.py +54 -0
- ORForise/Tools/GeneMarkHA/__init__.py +0 -0
- ORForise/Tools/GeneMarkHMM/GeneMarkHMM.py +55 -0
- ORForise/Tools/GeneMarkHMM/__init__.py +0 -0
- ORForise/Tools/GeneMarkS/GeneMarkS.py +54 -0
- ORForise/Tools/GeneMarkS/__init__.py +0 -0
- ORForise/Tools/GeneMarkS2/GeneMarkS2.py +55 -0
- ORForise/Tools/GeneMarkS2/__init__.py +0 -0
- ORForise/Tools/MetaGene/MetaGene.py +54 -0
- ORForise/Tools/MetaGene/__init__.py +0 -0
- ORForise/Tools/MetaGeneAnnotator/MetaGeneAnnotator.py +55 -0
- ORForise/Tools/MetaGeneAnnotator/__init__.py +0 -0
- ORForise/Tools/MetaGeneMark/MetaGeneMark.py +55 -0
- ORForise/Tools/MetaGeneMark/__init__.py +0 -0
- ORForise/Tools/Prodigal/Prodigal.py +55 -0
- ORForise/Tools/Prodigal/__init__.py +0 -0
- ORForise/Tools/Prokka/Prokka.py +57 -0
- ORForise/Tools/Prokka/__init__.py +0 -0
- ORForise/Tools/StORF-Reporter/StORF-Reporter.py +56 -0
- ORForise/Tools/StORF-Reporter/__init__.py +0 -0
- ORForise/Tools/TransDecoder/TransDecoder.py +54 -0
- ORForise/Tools/TransDecoder/__init__.py +0 -0
- ORForise/Tools/__init__.py +0 -0
- ORForise/__init__.py +0 -0
- ORForise/utils.py +236 -0
- orforise-1.6.2.dist-info/METADATA +1038 -0
- orforise-1.6.2.dist-info/RECORD +73 -0
- orforise-1.6.2.dist-info/WHEEL +5 -0
- orforise-1.6.2.dist-info/entry_points.txt +15 -0
- orforise-1.6.2.dist-info/licenses/LICENSE +624 -0
- orforise-1.6.2.dist-info/top_level.txt +1 -0
ORForise/GFF_Adder.py
ADDED
|
@@ -0,0 +1,543 @@
|
|
|
1
|
+
from importlib import import_module
|
|
2
|
+
import argparse
|
|
3
|
+
from collections import OrderedDict, defaultdict, Counter
|
|
4
|
+
import gzip
|
|
5
|
+
from datetime import date
|
|
6
|
+
import sys
|
|
7
|
+
|
|
8
|
+
try:
|
|
9
|
+
from .utils import *
|
|
10
|
+
except (ImportError, ModuleNotFoundError):
|
|
11
|
+
from utils import *
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
########################################
|
|
15
|
+
def _gff_writer_additional_info(data):
    """Return the ';'-joined, de-duplicated additional annotation text of one entry.

    Items of ``data[4]`` are expected as 'ToolName:info' (or bare 'info'); only
    the info part is kept. When ``data[4]`` is missing or empty, ``data[3]`` is
    used as a fallback.
    """
    if len(data) > 4 and data[4]:
        seen = set()
        unique_items = []
        for add in data[4]:
            text = str(add).strip()
            if not text:
                continue
            info_part = text.split(':', 1)[1].strip() if ':' in text else text
            # De-duplicate while preserving first-seen order.
            if info_part and info_part not in seen:
                seen.add(info_part)
                unique_items.append(info_part)
        return ';'.join(unique_items)
    if len(data) > 3 and data[3]:
        return str(data[3]).strip()
    return ''


def _gff_writer_matched_tools(data, tools):
    """Return the sorted, unique tool names that contributed to one entry.

    Tool names are taken from the 'ToolName:' prefix of the ``data[4]`` items;
    items without a prefix, and entries without ``data[4]``, are scanned for
    the names listed in *tools* instead.
    """
    matched = set()
    try:
        if len(data) > 4 and data[4]:
            for add in data[4]:
                if isinstance(add, str) and ':' in add:
                    tool_name = add.split(':', 1)[0].strip()
                    if tool_name:
                        matched.add(tool_name)
                else:
                    matched.update(t for t in tools if t and t in str(add))
        else:
            matched.update(t for t in tools if t and any(t in str(x) for x in data))
    except Exception:
        # Deliberately best-effort: an unexpected entry layout must not abort
        # the whole write, it just yields "no tool detected".
        return []
    return sorted(matched)


def _gff_writer_unique_additional(additional_info, info_field):
    """Return the parts of *additional_info* that do not duplicate *info_field*.

    Both strings are split on ';'. A part counts as a duplicate when it equals,
    contains, or is contained in any reference info part.
    """
    add_parts = [p.strip() for p in str(additional_info).split(';') if p.strip()]
    info_parts = [p.strip() for p in str(info_field).split(';') if p.strip() and p.strip() != '.']
    kept = [ap for ap in add_parts
            if not any(ip in ap or ap in ip for ip in info_parts)]
    return ';'.join(kept)


def _gff_writer_meta_path(output_file):
    """Return the metadata report path derived from *output_file*.

    'x.gff' becomes 'x_Meta.txt'. Any other name gets '_Meta.txt' appended so
    the report can never be written over the GFF output itself.
    """
    if output_file.endswith('.gff'):
        return output_file[:-len('.gff')] + '_Meta.txt'
    return output_file + '_Meta.txt'


def _gff_writer_meta_report(meta_path, options, reference_annotation, additional_annotation,
                            ref_only, ref_combined, non_ref_combined,
                            per_tool_total, per_contig_per_tool,
                            ref_per_tool_total, ref_per_contig_per_tool):
    """Write the human-readable metadata report for one GFF-Adder run to *meta_path*."""
    total_ref_combined = sum(ref_combined.values())
    total_nonref = sum(non_ref_combined.values())
    total_reference_genes = ref_only + total_ref_combined

    with open(meta_path, 'w') as meta_out:

        def write_distribution(distribution):
            # num matched tools -> number of entries
            if distribution:
                for k in sorted(distribution):
                    meta_out.write(f" {k:>3} {distribution[k]}\n")
            else:
                meta_out.write(" None\n")
            meta_out.write("\n")

        def write_totals(counter):
            if counter:
                for t, c in counter.most_common():
                    meta_out.write(f" {t}: {c}\n")
            else:
                meta_out.write(" None\n")
            meta_out.write("\n")

        def write_per_contig(per_contig):
            if per_contig:
                for contig in sorted(per_contig):
                    counter = per_contig[contig]
                    if sum(counter.values()) == 0:
                        continue
                    meta_out.write(f" {contig}:\n")
                    for t, c in counter.most_common():
                        meta_out.write(f" {t}: {c}\n")
                meta_out.write("\n")
            else:
                meta_out.write(" None\n\n")

        meta_out.write("GFF-Adder Metadata Report\n")
        meta_out.write("=========================\n")
        meta_out.write("Run Date: {}\n".format(date.today()))
        meta_out.write("Genome DNA: {}\n".format(getattr(options, 'genome_DNA', 'N/A')))
        meta_out.write("Reference annotation: {}\n".format(reference_annotation))
        meta_out.write("Additional annotation(s): {}\n\n".format(additional_annotation))

        meta_out.write("Summary counts\n")
        meta_out.write("--------------\n")
        meta_out.write(f"Reference-only genes (no matching additional annotation): {ref_only}\n")
        meta_out.write(f"Reference genes with additional matches (combined): {total_ref_combined}\n")
        meta_out.write(f"TOTAL reference genes observed: {total_reference_genes}\n")
        meta_out.write(f"Additional-only genes (not present in reference): {total_nonref}\n\n")

        meta_out.write("Distribution of matches for reference-combined entries (num matched tools -> count)\n")
        write_distribution(ref_combined)

        meta_out.write("Distribution of matches for non-reference entries (num matched tools -> count)\n")
        write_distribution(non_ref_combined)

        meta_out.write("Per-tool additional-only annotation totals\n")
        meta_out.write("-----------------------------------------\n")
        write_totals(per_tool_total)

        meta_out.write("Per-contig breakdown for additional-only annotations (only contigs with additions shown)\n")
        meta_out.write("---------------------------------------------------------------------------------------\n")
        write_per_contig(per_contig_per_tool)

        meta_out.write("Per-tool totals for reference genes matched by additional tools\n")
        meta_out.write("---------------------------------------------------------------\n")
        write_totals(ref_per_tool_total)

        meta_out.write("Per-contig breakdown for reference genes matched by additional tools\n")
        meta_out.write("--------------------------------------------------------------------\n")
        write_per_contig(ref_per_contig_per_tool)

        meta_out.write("Notes\n")
        meta_out.write("-----\n")
        meta_out.write(" - 'Reference-only' are reference entries that had no recorded additional annotation information.\n")
        meta_out.write(" - 'Per-tool' counts are based on the tool names extracted from the additional-annotation provenance\n")
        meta_out.write(" (expected format in combined entries: 'ToolName:info'). Entries with no tool detected are shown as 'unassigned'.\n")
        meta_out.write("\nEnd of report\n")


def gff_writer(options, combined_ORFs_by_contig, output_file, reference_annotation, additional_annotation):
    """Write the combined reference + additional annotations as a GFF file.

    Args:
        options: Namespace-like object. The attributes ``genome_DNA``,
            ``additional_tool`` (comma-separated tool names), ``clean`` and
            ``output_meta`` are read if present.
        combined_ORFs_by_contig: Mapping contig -> {'start,stop': data}, where
            data is a list laid out as
            [strand, 'ref' | 'add', type, info, additional_list, source].
        output_file: Path of the GFF file to write.
        reference_annotation: Reference annotation path (header and fallback
            source/type names).
        additional_annotation: Additional annotation path(s), header only.

    When ``options.output_meta`` is set, a metadata report is also written
    next to the GFF ('<name>_Meta.txt').
    """
    clean = getattr(options, 'clean', False)
    # Loop-invariant: the tool list only depends on the options.
    tools = options.additional_tool.split(',') if getattr(options, 'additional_tool', None) else []
    ref_fallback_name = reference_annotation.split('/')[-1].split('.')[0] if reference_annotation else None

    # meta counts
    Ref_Only = 0
    Ref_Combined = defaultdict(int)
    Non_Ref_Combined = defaultdict(int)
    per_tool_total = Counter()  # counts for additional-only entries
    per_contig_per_tool = defaultdict(Counter)  # contig -> Counter(tool -> count)
    ref_per_tool_total = Counter()  # counts for reference entries matched by tools
    ref_per_contig_per_tool = defaultdict(Counter)  # contig -> Counter(tool -> count)

    # 'with' guarantees the GFF is flushed and closed, also on errors.
    with open(output_file, 'w') as write_out:
        write_out.write("##gff-version\t3\n#\tGFF-Adder\n#\tRun Date:" + str(date.today()) + '\n')
        write_out.write("##Genome DNA File:" + (options.genome_DNA if hasattr(options, 'genome_DNA') else '') + '\n')
        write_out.write("##Original File: " + reference_annotation + "\n##Additional File: " + additional_annotation + '\n')

        for contig in combined_ORFs_by_contig:
            combined_ORFs = combined_ORFs_by_contig[contig]
            # A set gives O(1) membership tests in the per-entry loop below.
            ref_positions = {k for k, v in combined_ORFs.items() if len(v) > 1 and v[1] == 'ref'}

            for pos, data in combined_ORFs.items():
                pos_ = pos.split(',')
                if len(pos_) < 2:
                    # Malformed position key: skip.
                    continue
                # Keys are 'start,stop' (a 'contig,start,stop' key also works).
                start, stop = pos_[0], pos_[-1]
                strand = data[0]

                additional_annotation_info = _gff_writer_additional_info(data)
                matched_tools_list = _gff_writer_matched_tools(data, tools)
                matched_count = len(matched_tools_list)

                if pos not in ref_positions:
                    # Additional-only entry (not in reference)
                    Non_Ref_Combined[matched_count] += 1
                    for t in matched_tools_list or ['unassigned']:
                        per_tool_total[t] += 1
                        per_contig_per_tool[contig][t] += 1
                    attributes = ('' if clean else 'ID=Additional_Annotations;') + additional_annotation_info
                    entry = (contig + '\t' + ','.join(matched_tools_list) + '\t'
                             + (data[2] if len(data) > 3 else '.') + '\t' + start + '\t' + stop
                             + '\t.\t' + strand + '\t.\t' + attributes + '\n')
                else:
                    # Reference entry
                    if len(data) > 3 and data[3]:
                        info_field = data[3].replace('\n', '').strip()
                    else:
                        info_field = '.'

                    if len(data) > 2 and data[2]:
                        type_field = data[2]
                    else:
                        type_field = ref_fallback_name if reference_annotation else '.'
                    if len(data) > 5 and data[5]:
                        source_field = data[5]
                    else:
                        source_field = ref_fallback_name if reference_annotation else ''

                    # Drop additional fragments that merely repeat the reference info.
                    filtered_additional = ''
                    if additional_annotation_info:
                        filtered_additional = _gff_writer_unique_additional(additional_annotation_info, info_field)

                    attributes = ('' if clean else 'ID=Original_Annotation;') + info_field
                    if not filtered_additional:
                        # Reference-only (no meaningful unique additional annotations)
                        Ref_Only += 1
                    else:
                        # Reference entry that had additional annotations (combined)
                        Ref_Combined[matched_count] += 1
                        for t in matched_tools_list or ['unassigned']:
                            ref_per_tool_total[t] += 1
                            ref_per_contig_per_tool[contig][t] += 1
                        if not clean:
                            attributes += ';Matched_Annotations=' + filtered_additional

                    entry = (contig + '\t' + source_field + '\t' + type_field + '\t' + start + '\t' + stop
                             + '\t.\t' + strand + '\t.\t' + attributes + '\n')

                write_out.write(entry)

    # Produce metadata output if requested
    if getattr(options, 'output_meta', False):
        _gff_writer_meta_report(
            _gff_writer_meta_path(output_file), options, reference_annotation, additional_annotation,
            Ref_Only, Ref_Combined, Non_Ref_Combined,
            per_tool_total, per_contig_per_tool,
            ref_per_tool_total, ref_per_contig_per_tool)
|
|
285
|
+
|
|
286
|
+
|
|
287
|
+
|
|
288
|
+
def _gff_adder_load_dna(genome_path):
    """Load the genome FASTA at *genome_path* into a contig -> record mapping.

    Tries gzip first, then plain text (both through ``fasta_load``), and
    finally falls back to the legacy single-contig reader. Every handle is
    closed before returning.
    NOTE(review): assumes ``fasta_load`` reads the handle eagerly and returns a
    dict-like object (it is indexed as one below) - confirm in utils.
    """
    try:
        try:
            with gzip.open(genome_path, 'rt') as fasta_in:
                return fasta_load(fasta_in)
        except Exception:
            with open(genome_path, 'r', encoding='unicode_escape') as fasta_in:
                return fasta_load(fasta_in)
    except Exception:
        # Fallback to legacy single-contig behaviour: all sequence lines are
        # concatenated and stored under the last header seen.
        seq_parts = []
        genome_ID = None
        with open(genome_path, 'r') as genome_fasta:
            for line in genome_fasta:
                line = line.replace("\n", "")
                if line.startswith('>'):
                    genome_ID = line.split()[0].replace('>', '')
                else:
                    seq_parts.append(line)
        if genome_ID is None:
            sys.exit("No FASTA header found in genome DNA file")
        genome_seq = ''.join(seq_parts)
        dna_regions = OrderedDict()
        dna_regions[genome_ID] = (genome_seq, len(genome_seq), list(), None)
        return dna_regions


def _gff_adder_tool_function(tool_name):
    """Import the parser module for *tool_name* and return its same-named callable.

    'Tools.<tool>.<tool>' is tried first, then 'ORForise.Tools.<tool>.<tool>'.
    Exits with "Tool not available" when neither module can be imported.
    """
    for prefix in ('Tools.', 'ORForise.Tools.'):
        try:
            tool_mod = import_module(prefix + tool_name + '.' + tool_name)
        except ModuleNotFoundError:
            continue
        return getattr(tool_mod, tool_name)
    sys.exit("Tool not available")


def _gff_adder_reference_from_gff(reference_annotation, gene_ident, dna_regions, ref_genes_by_contig):
    """Fill *ref_genes_by_contig* from a (possibly gzipped) reference GFF file.

    Records are stored as contig -> {'start,stop': [strand, source, type, info]}.
    Only features whose type contains one of the *gene_ident* identifiers and
    whose contig is present in *dna_regions* are kept.
    """
    # Detect gzip by magic bytes (first two bytes).
    with open(reference_annotation, 'rb') as probe:
        is_gz = probe.read(2) == b'\x1f\x8b'

    # gene_ident is normally a comma-separated string; a list is accepted too.
    gene_types = gene_ident.split(',') if isinstance(gene_ident, str) else list(gene_ident)

    if is_gz:
        gff_in = gzip.open(reference_annotation, 'rt', errors='replace')
    else:
        # Open as plain text, replace undecodable bytes rather than fail.
        gff_in = open(reference_annotation, 'r', encoding='utf-8', errors='replace')

    with gff_in:
        for line in gff_in:
            if line.startswith('#') or line.strip() == '':
                continue
            parts = line.strip().split('\t')
            if len(parts) < 9:
                continue
            contig = parts[0]
            if contig not in dna_regions:
                # skip records for contigs not in provided fasta
                continue
            if not any(gene_type in parts[2] for gene_type in gene_types):
                continue
            try:
                start = int(parts[3])
                stop = int(parts[4])
            except ValueError:
                # Malformed coordinates: skip the record instead of aborting.
                continue
            # store as [strand, source, type, info] to match downstream expectations
            ref_genes_by_contig[contig][f"{start},{stop}"] = [parts[6], parts[1], parts[2], parts[8]]


def gff_adder(options):
    """Combine a reference annotation with one or more additional annotations.

    Loads the genome, collects reference genes and additional-tool predictions
    per contig, merges them (reference entries are never overwritten; matching
    predictions are attached as 'ToolName:info' strings) and hands the result
    to ``gff_writer``.

    Args:
        options: Parsed command-line namespace. Reads ``genome_DNA``,
            ``reference_annotation``, ``reference_tool``, ``gene_ident``,
            ``additional_tool``, ``additional_annotation`` and ``output_file``.
    """
    # Load fasta into dna_regions (supports multi-contig)
    dna_regions = _gff_adder_load_dna(options.genome_DNA)

    ###########################################
    # Build reference gene dict per-contig
    ref_genes_by_contig = defaultdict(OrderedDict)

    if not options.reference_tool:  # IF using Ensembl/file for comparison
        _gff_adder_reference_from_gff(options.reference_annotation, options.gene_ident,
                                      dna_regions, ref_genes_by_contig)
    else:
        # Reference tool provided: call it with dna_regions first (multi-contig
        # aware), fall back to the legacy signature.
        reference_tool_func = _gff_adder_tool_function(options.reference_tool)
        try:
            ref_result = reference_tool_func(options.reference_annotation, dna_regions)
        except TypeError:
            genome_seq = ''.join([dna_regions[c][0] for c in dna_regions])
            ref_result = reference_tool_func(reference_annotation=options.reference_annotation,
                                             genome_seq=genome_seq, gene_ident=options.gene_ident)
        # Expect ref_result to be dict of contig -> {pos: data}
        if isinstance(ref_result, dict):
            for contig, mapping in ref_result.items():
                ref_genes_by_contig[contig].update(mapping)

    # Ensure each contig has an OrderedDict even if empty
    for contig in dna_regions:
        if contig not in ref_genes_by_contig:
            ref_genes_by_contig[contig] = OrderedDict()

    # Collect additional annotations per contig: contig -> {pos: [(value, tool), ...]}.
    # A list per position keeps the predictions of every tool; a plain
    # assignment would let a later tool overwrite an earlier one.
    additional_annotations_by_contig = defaultdict(OrderedDict)

    def record(contig, pos, value, tool):
        additional_annotations_by_contig[contig].setdefault(pos, []).append((value, tool))

    tool_names = options.additional_tool.split(',')
    anno_files = options.additional_annotation.split(',')
    if len(anno_files) < len(tool_names):
        sys.exit("Each additional tool (-at) needs a matching additional annotation file (-add)")

    for tool, anno_file in zip(tool_names, anno_files):
        additional_tool_func = _gff_adder_tool_function(tool)
        # Try calling tool in multi-contig mode first
        try:
            tool_orfs = additional_tool_func(anno_file, dna_regions)
        except TypeError:
            # Fallback to legacy signature expecting genome_seq
            genome_seq = ''.join([dna_regions[c][0] for c in dna_regions])
            tool_orfs = additional_tool_func(anno_file, genome_seq, options.gene_ident)

        # tool_orfs may be either {contig: {pos: data}} or a flat {pos: data}
        if not isinstance(tool_orfs, dict):
            continue
        top_keys = list(tool_orfs.keys())
        if top_keys and top_keys[0] in dna_regions:
            # Top-level keys look like contig names: multi-contig result.
            for contig, mapping in tool_orfs.items():
                for pos_k, pos_v in mapping.items():
                    record(contig, pos_k, pos_v, tool)
        elif len(dna_regions) == 1:
            # Flat mapping and a single contig: everything belongs to it.
            only_contig = next(iter(dna_regions))
            for pos_k, pos_v in tool_orfs.items():
                record(only_contig, pos_k, pos_v, tool)
        else:
            # Flat mapping, several contigs: accept 'contig,start,stop' keys,
            # silently skip anything else.
            for k, v in tool_orfs.items():
                parts = k.split(',')
                if len(parts) == 3 and parts[0] in dna_regions:
                    record(parts[0], parts[1] + ',' + parts[2], v, tool)

    # Combine per-contig: keep reference entries and append additional annotations as supplemental
    combined_ORFs_by_contig = OrderedDict()
    for contig in dna_regions:
        combined = OrderedDict()
        # Reference entries first, normalised to
        # [strand, 'ref', type, ref_info, additional_list, source]
        for pos, val in ref_genes_by_contig.get(contig, {}).items():
            strand = val[0] if len(val) > 0 else '.'
            source_field = val[1] if len(val) > 1 else 'ref'
            ftype = val[2] if len(val) > 2 else '.'
            ref_info = val[3] if len(val) > 3 else '.'
            combined[pos] = [strand, 'ref', ftype, ref_info, [], source_field]

        # Now incorporate additional annotations without overwriting reference entries
        for pos, hits in additional_annotations_by_contig.get(contig, {}).items():
            for val, toolname in hits:
                is_seq = isinstance(val, (list, tuple))
                # Each index is guarded by its own length check.
                strand_a = val[0] if is_seq and len(val) > 0 else '.'
                ftype_a = val[3] if is_seq and len(val) > 3 else '.'
                if is_seq and len(val) > 4:
                    info_a = val[4]
                elif isinstance(val, str):
                    info_a = val
                else:
                    info_a = ''

                addstr = (toolname + ':' + str(info_a)) if toolname else str(info_a)
                if pos in combined:
                    # Position already known (reference gene or an earlier
                    # tool): attach this prediction to its additional list.
                    combined[pos][4].append(addstr)
                else:
                    # New additional-only entry.
                    combined[pos] = [strand_a, 'add', ftype_a if ftype_a else '.', '.', [addstr], toolname]

        # Sort ORFs for this contig
        combined_ORFs_by_contig[contig] = sortORFs(combined)

    # Call writer
    gff_writer(options, combined_ORFs_by_contig, options.output_file,
               options.reference_annotation, options.additional_annotation)
|
|
496
|
+
|
|
497
|
+
|
|
498
|
+
def main():
    """Command-line entry point for GFF-Adder: parse the arguments and run ``gff_adder``."""
    print(WELCOME)

    cli = argparse.ArgumentParser(
        description='ORForise ' + ORForise_Version + ': GFF-Adder Run Parameters.')
    # Drop one of argparse's default groups so the custom groups below are
    # listed in its place (relies on a private argparse attribute).
    cli._action_groups.pop()

    needed = cli.add_argument_group('Required Arguments')
    needed.add_argument(
        '-dna', dest='genome_DNA', required=True,
        help='Genome DNA file (.fa) which both annotations are based on')
    needed.add_argument(
        '-ref', dest='reference_annotation', required=True,
        help='Which reference annotation file to use as reference?')
    needed.add_argument(
        '-at', dest='additional_tool', required=True,
        help='Which format to use for additional annotation? - Can provide multiple annotations (Tool1,Tool2)')
    needed.add_argument(
        '-add', dest='additional_annotation', required=True,
        help='Which annotation file to add to reference annotation? - Can provide multiple annotations (1.GFF,2.GFF)')
    needed.add_argument(
        '-o', dest='output_file', required=True,
        help='Output filename')

    extras = cli.add_argument_group('Optional Arguments')
    extras.add_argument(
        '-rt', dest='reference_tool',
        help='Which tool format to use as reference? - If not provided, will default to the standard GFF format and will only look for "CDS" features')
    extras.add_argument(
        '--gene_ident', action='store', dest='gene_ident', default='CDS',
        help='Identifier used for identifying genomic features in reference annotation "CDS,rRNA,tRNA"')
    extras.add_argument(
        '-mc', dest='mark_consensus', action='store_true',
        help='Default - False: Mark reference annotations which where present in the additional tool annotation')
    extras.add_argument(
        '-c', dest='clean', action='store_true',
        help='Default - False: Do not mark 9th column with "Original/Matched/Additional tag"')
    extras.add_argument(
        '--meta', dest='output_meta', action='store_true',
        help='Default - False: Output metadata file')
    extras.add_argument(
        '--olap', dest='overlap', default=50, type=int,
        help='Maximum overlap between reference and additional genic regions (CDS,rRNA etc) - Default: 50 nt')

    other = cli.add_argument_group('Misc')
    # NOTE(review): type=eval evaluates the raw command-line text as Python
    # before the choices check runs; consider a dedicated boolean parser.
    other.add_argument(
        '-v', dest='verbose', default='False', type=eval, choices=[True, False],
        help='Default - False: Print out runtime status')

    gff_adder(cli.parse_args())
|
|
538
|
+
|
|
539
|
+
|
|
540
|
+
|
|
541
|
+
if __name__ == "__main__":
|
|
542
|
+
main()
|
|
543
|
+
print("Complete")
|
ORForise/List_Tools.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
from importlib import import_module
|
|
2
|
+
import os
|
|
3
|
+
|
|
4
|
+
try:
|
|
5
|
+
from .utils import *
|
|
6
|
+
except (ImportError, ModuleNotFoundError):
|
|
7
|
+
from utils import *
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def main():
    """List the tools found in the ``Tools`` directories and test-import each.

    Prints the welcome banner and a version header, collects the names of the
    sub-directories found in the candidate ``Tools`` directories (the one next
    to this file and the one a level above it), and then reports for every
    name whether the module ``<tool>/<tool>.py`` can be imported, first as
    ``Tools.<tool>.<tool>`` and then as ``ORForise.Tools.<tool>.<tool>``.

    Returns:
        None. All results are written to standard output.
    """
    print(WELCOME)

    print('ORForise ' + ORForise_Version + ': List Tools Run Parameters')

    package_dir = os.path.dirname(__file__)
    base_dirs = [
        os.path.join(package_dir, 'Tools'),
        os.path.abspath(os.path.join(package_dir, '..', 'Tools')),
    ]

    tools = set()
    for base in base_dirs:
        if not os.path.isdir(base):
            continue
        try:
            entries = os.listdir(base)
        except OSError:
            # Directory vanished or is unreadable: ignore it and keep going.
            continue
        for entry in entries:
            # Hidden directories and byte-code caches are not tools.
            if entry.startswith('.') or entry == '__pycache__':
                continue
            if os.path.isdir(os.path.join(base, entry)):
                tools.add(entry)

    if not tools:
        print('No tools found in the searched directories.')
        return

    print('Available tools:')
    for tool_name in sorted(tools):
        print(' -', tool_name)
        # Try the flat layout first, then the installed-package layout.
        for prefix in ('Tools.', 'ORForise.Tools.'):
            try:
                import_module(prefix + tool_name + '.' + tool_name)
            except ImportError:
                # ImportError also covers ModuleNotFoundError, so one tool
                # with a broken import cannot abort the whole listing.
                continue
            print(' Imported from ' + prefix + tool_name)
            break
        else:
            print(' Tool not importable')
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
# Script entry point: list the tools only when this file is executed
# directly, then report completion on standard output.
if __name__ == "__main__":
    main()
    print("Complete")
|
|
File without changes
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
from importlib import import_module
|
|
2
|
+
import argparse
|
|
3
|
+
import collections
|
|
4
|
+
from datetime import date
|
|
5
|
+
import sys
|
|
6
|
+
try:
|
|
7
|
+
from ORForise.src.ORForise.utils import revCompIterative
|
|
8
|
+
except ImportError:
|
|
9
|
+
|
|
10
|
+
from ORForise.utils import revCompIterative
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
# Command-line interface used when this module is run as a script.
# NOTE(review): both help texts read as if copied from another ORForise
# tool - confirm the intended wording.
parser = argparse.ArgumentParser()
parser.add_argument('-dna', '--genome_dna', required=True, help='Genome DNA file (.fa) which both annotations '
                                                                'are based on')
parser.add_argument('-gff', '--genome_gff', required=True,
                    help='Which annotation file to add to reference annotation?')
# Parse sys.argv only when executed as a script. Parsing unconditionally made
# a plain ``import`` of this module consume the host program's arguments and
# exit because the required options were missing.
args = parser.parse_args() if __name__ == "__main__" else None
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def cds_checker(genome_dna, genome_gff):
    """Check that every ``biological_region`` feature in a GFF file is in frame.

    All non-header lines of the FASTA file are concatenated into one sequence.
    Each 9-column GFF record whose type column contains ``biological_region``
    then has its start and stop codons extracted (from the reverse complement
    for ``-`` strand features) and its length tested for divisibility by 3.

    Args:
        genome_dna: Path to the genome FASTA file.
        genome_gff: Path to the GFF file whose features are checked.

    Returns:
        collections.OrderedDict mapping ``"start,stop"`` to
        ``[strand, start_codon, stop_codon]`` in file order.

    Raises:
        SystemExit: With code ``"W"`` at the first region whose length is not
            a multiple of three, and with code ``"SAS"`` for a record whose
            type contains ``bio`` but is not a 9-column
            ``biological_region`` record.
    """
    sequence_parts = []
    with open(genome_dna, 'r') as genome_fasta:
        for line in genome_fasta:
            line = line.replace("\n", "")
            if not line.startswith('>'):  # header lines carry no sequence
                sequence_parts.append(line)
    genome_seq = "".join(sequence_parts)

    ###########################################
    genome_size = len(genome_seq)
    genome_rev = revCompIterative(genome_seq)
    cds_dict = collections.OrderedDict()  # Order is important
    # A separate handle name keeps the ``genome_gff`` path parameter intact.
    with open(genome_gff, 'r') as gff_handle:
        for record in gff_handle:
            fields = record.split('\t')
            if len(fields) < 3:
                # Comment/blank lines have no feature-type column.
                continue
            feature_type = fields[2]
            if "biological_region" in feature_type and len(fields) == 9:
                start = int(fields[3])
                stop = int(fields[4])
                strand = fields[6]

                if '-' in strand:  # Reverse complement: starts and stops adjusted
                    r_start = genome_size - stop
                    r_stop = genome_size - start
                    start_codon = genome_rev[r_start:r_start + 3]
                    stop_codon = genome_rev[r_stop - 2:r_stop + 1]
                    length = abs(start - stop - 1)
                elif '+' in strand:
                    start_codon = genome_seq[start - 1:start + 2]
                    stop_codon = genome_seq[stop - 3:stop]
                    length = abs(start - 1 - stop)
                else:
                    # Unstranded feature: no codons can be derived. Skip it
                    # rather than reuse values left over from an earlier one.
                    continue

                cds_dict[str(start) + ',' + str(stop)] = [strand, start_codon, stop_codon]

                if length % 3 == 0:
                    print("In-Frame")
                else:
                    sys.exit("W")
            elif "bio" in feature_type:
                sys.exit("SAS")
    return cds_dict
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
# Script entry point: call cds_checker with the parsed command-line options
# passed as keyword arguments, then report completion on standard output.
if __name__ == "__main__":
    cds_checker(**vars(args))

    print("Complete")
|