ORForise 1.5.0__py3-none-any.whl → 1.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ORForise/Annotation_Compare.py +105 -88
- ORForise/Comparator.py +60 -28
- ORForise/Convert_To_GFF.py +138 -0
- ORForise/Tools/TabToGFF/TabToGFF.py +140 -0
- ORForise/Tools/TabToGFF/__init__.py +0 -0
- ORForise/utils.py +1 -1
- orforise-1.6.0.dist-info/METADATA +1051 -0
- {orforise-1.5.0.dist-info → orforise-1.6.0.dist-info}/RECORD +12 -9
- {orforise-1.5.0.dist-info → orforise-1.6.0.dist-info}/entry_points.txt +2 -0
- orforise-1.5.0.dist-info/METADATA +0 -451
- {orforise-1.5.0.dist-info → orforise-1.6.0.dist-info}/WHEEL +0 -0
- {orforise-1.5.0.dist-info → orforise-1.6.0.dist-info}/licenses/LICENSE +0 -0
- {orforise-1.5.0.dist-info → orforise-1.6.0.dist-info}/top_level.txt +0 -0
ORForise/Annotation_Compare.py
CHANGED
|
@@ -1,22 +1,35 @@
|
|
|
1
1
|
from importlib import import_module
|
|
2
2
|
import argparse
|
|
3
|
-
import sys,os
|
|
4
|
-
import gzip,csv
|
|
3
|
+
import sys, os
|
|
4
|
+
import gzip, csv
|
|
5
|
+
import logging
|
|
6
|
+
from datetime import datetime
|
|
5
7
|
|
|
6
|
-
try:
|
|
7
|
-
from Comparator import tool_comparison
|
|
8
|
-
except ImportError:
|
|
9
|
-
from .Comparator import tool_comparison
|
|
10
8
|
|
|
11
9
|
try:
|
|
12
10
|
from utils import *
|
|
11
|
+
from Comparator import tool_comparison
|
|
13
12
|
except ImportError:
|
|
13
|
+
from .Comparator import tool_comparison
|
|
14
14
|
from ORForise.utils import *
|
|
15
15
|
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
##########################
|
|
20
|
+
|
|
21
|
+
# Consolidate printing and logging into a single block
|
|
22
|
+
def _pct(n, total):
|
|
23
|
+
try:
|
|
24
|
+
return format(100 * n / total, '.2f') + '%'
|
|
25
|
+
except Exception:
|
|
26
|
+
return 'N/A'
|
|
27
|
+
|
|
16
28
|
##########################
|
|
17
29
|
|
|
18
30
|
def comparator(options):
|
|
19
31
|
|
|
32
|
+
|
|
20
33
|
try:
|
|
21
34
|
try: # Detect whether fasta/gff files are .gz or text and read accordingly
|
|
22
35
|
fasta_in = gzip.open(options.genome_dna, 'rt')
|
|
@@ -77,36 +90,46 @@ def comparator(options):
|
|
|
77
90
|
'Contig\tGenes\tORFs\tPerfect_Matches\tPartial_Matches\tMissed_Genes\tUnmatched_ORFs\tMulti_Matched_ORFs\n')
|
|
78
91
|
|
|
79
92
|
for dna_region, result in results.items():
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
contig_summaries.append([
|
|
90
|
-
|
|
91
|
-
])
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
93
|
+
if result:
|
|
94
|
+
num_current_genes = len(dna_regions[dna_region][2])
|
|
95
|
+
num_orfs = result['pred_metrics']['Number_of_ORFs']
|
|
96
|
+
num_perfect = result['pred_metrics']['Number_of_Perfect_Matches']
|
|
97
|
+
num_partial = len(result['pred_metrics']['partial_Hits'])
|
|
98
|
+
num_missed = len(result['rep_metrics']['genes_Undetected'])
|
|
99
|
+
num_unmatched = len(result['pred_metrics']['unmatched_ORFs'])
|
|
100
|
+
num_multi = len(result['pred_metrics']['multi_Matched_ORFs'])
|
|
101
|
+
# Collect summary for this contig
|
|
102
|
+
contig_summaries.append([dna_region, num_current_genes, num_orfs, num_perfect, num_partial, num_missed, num_unmatched, num_multi])
|
|
103
|
+
num_current_genes = len(dna_regions[dna_region][2])
|
|
104
|
+
genome_name = options.reference_annotation.split('/')[-1].split('.')[0]
|
|
105
|
+
rep_metric_description, rep_metrics = get_rep_metrics(result)
|
|
106
|
+
all_metric_description, all_metrics = get_all_metrics(result)
|
|
107
|
+
|
|
108
|
+
# Safely extract metric values
|
|
109
|
+
num_orfs = result.get('pred_metrics', {}).get('Number_of_ORFs') if isinstance(result, dict) else 'N/A'
|
|
110
|
+
perfect = result.get('pred_metrics', {}).get('Number_of_Perfect_Matches') if isinstance(result, dict) else 0
|
|
111
|
+
partial = len(result.get('pred_metrics', {}).get('partial_Hits', [])) if isinstance(result, dict) else 'N/A'
|
|
112
|
+
missed = len(result.get('rep_metrics', {}).get('genes_Undetected', [])) if isinstance(result, dict) else 'N/A'
|
|
113
|
+
unmatched = len(result.get('pred_metrics', {}).get('unmatched_ORFs', [])) if isinstance(result, dict) else 'N/A'
|
|
114
|
+
multi = len(result.get('pred_metrics', {}).get('multi_Matched_ORFs', [])) if isinstance(result, dict) else 'N/A'
|
|
115
|
+
|
|
116
|
+
lines = [
|
|
117
|
+
f"These are the results for: {dna_region}",
|
|
118
|
+
f"Current Contig: {dna_region}",
|
|
119
|
+
f"Number of Genes: {num_current_genes}",
|
|
120
|
+
f"Number of ORFs: {num_orfs}",
|
|
121
|
+
f"Perfect Matches: {perfect} [{num_current_genes}] - {_pct(perfect, num_current_genes) if isinstance(num_current_genes, (int, float)) else 'N/A'}",
|
|
122
|
+
f"Partial Matches: {partial} [{num_current_genes}] - {_pct(partial, num_current_genes) if isinstance(num_current_genes, (int, float)) else 'N/A'}",
|
|
123
|
+
f"Missed Genes: {missed} [{num_current_genes}] - {_pct(missed, num_current_genes) if isinstance(num_current_genes, (int, float)) else 'N/A'}",
|
|
124
|
+
f"Unmatched ORFs: {unmatched} [{num_current_genes}] - {_pct(unmatched, num_current_genes) if isinstance(num_current_genes, (int, float)) else 'N/A'}",
|
|
125
|
+
f"Multi-matched ORFs: {multi} [{num_current_genes}] - {_pct(multi, num_current_genes) if isinstance(num_current_genes, (int, float)) else 'N/A'}"
|
|
126
|
+
]
|
|
127
|
+
|
|
128
|
+
full_msg = '\n'.join(lines) + '\n'
|
|
129
|
+
if options.verbose:
|
|
130
|
+
print(full_msg)
|
|
131
|
+
options.output_logger.info(full_msg)
|
|
132
|
+
|
|
110
133
|
# Prepare output directory and file names for each contig
|
|
111
134
|
contig_save = dna_region.replace('/', '_').replace('\\', '_')
|
|
112
135
|
contig_dir = os.path.join(options.outdir, contig_save)
|
|
@@ -156,24 +179,6 @@ def comparator(options):
|
|
|
156
179
|
tool_out.writerow([''.join(map(str, result['pred_metrics']['orf_Coverage_Genome']))])
|
|
157
180
|
tool_out.writerow(['Matched_Predicted_CDS_Coverage_of_Genome'])
|
|
158
181
|
tool_out.writerow([''.join(map(str, result['pred_metrics']['matched_ORF_Coverage_Genome']))])
|
|
159
|
-
# tool_out.writerow(['Start_Position_Difference:'])
|
|
160
|
-
# tool_out.writerow(result.get('start_Difference', []))
|
|
161
|
-
# tool_out.writerow(['Stop_Position_Difference:'])
|
|
162
|
-
# tool_out.writerow(result.get('stop_Difference', []))
|
|
163
|
-
# tool_out.writerow(['Alternative_Starts_Predicted:'])
|
|
164
|
-
# tool_out.writerow(result.get('other_Starts', []))
|
|
165
|
-
# tool_out.writerow(['Alternative_Stops_Predicted:'])
|
|
166
|
-
# tool_out.writerow(result.get('other_Stops', []))
|
|
167
|
-
# tool_out.writerow(['Undetected_Gene_Metrics:'])
|
|
168
|
-
# tool_out.writerow([
|
|
169
|
-
# 'ATG_Start,GTG_Start,TTG_Start,ATT_Start,CTG_Start,Alternative_Start_Codon,TGA_Stop,TAA_Stop,TAG_Stop,Alternative_Stop_Codon,Median_Length,ORFs_on_Positive_Strand,ORFs_on_Negative_Strand'
|
|
170
|
-
# ])
|
|
171
|
-
# tool_out.writerow(result.get('undetected_Gene_Metrics', []))
|
|
172
|
-
# tool_out.writerow(['\nPredicted_CDSs_Without_Corresponding_Gene_In_Reference_Metrics:'])
|
|
173
|
-
# tool_out.writerow([
|
|
174
|
-
# 'ATG_Start,GTG_Start,TTG_Start,ATT_Start,CTG_Start,Alternative_Start_Codon,TGA_Stop,TAA_Stop,TAG_Stop,Alternative_Stop_Codon,Median_Length,ORFs_on_Positive_Strand,ORFs_on_Negative_Strand'
|
|
175
|
-
# ])
|
|
176
|
-
# tool_out.writerow(result.get('unmatched_ORF_Metrics', []))
|
|
177
182
|
|
|
178
183
|
# Write perfect matches to FASTA
|
|
179
184
|
with open(perfect_fasta, 'w', encoding='utf-8') as f:
|
|
@@ -210,6 +215,11 @@ def comparator(options):
|
|
|
210
215
|
key_parts = key.split(',')
|
|
211
216
|
multi = f">Predicted_CDS:{key_parts[0]}-{key_parts[1]}_Genes:{'|'.join(value)}"
|
|
212
217
|
f.write(f"{multi}\n")
|
|
218
|
+
else:
|
|
219
|
+
if options.verbose:
|
|
220
|
+
print(f"No results to process for dna region - " + str(dna_region))
|
|
221
|
+
options.output_logger.info(f"No results to process for dna region - " + str(dna_region))
|
|
222
|
+
|
|
213
223
|
|
|
214
224
|
# After all contigs, append the summary table to the main summary file
|
|
215
225
|
if options.outdir and contig_summaries:
|
|
@@ -227,34 +237,27 @@ def comparator(options):
|
|
|
227
237
|
out_file.write('\nOverall Summary:\n')
|
|
228
238
|
out_file.write(f'Number of Genes: {total_genes}\n')
|
|
229
239
|
out_file.write(f'Number of ORFs: {total_orfs}\n')
|
|
230
|
-
out_file.write(
|
|
231
|
-
|
|
232
|
-
out_file.write(
|
|
233
|
-
|
|
234
|
-
out_file.write(
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
f
|
|
238
|
-
|
|
239
|
-
f
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
print(
|
|
252
|
-
f'Unmatched ORFs: {total_unmatched} [{total_genes}] - {format(100 * total_unmatched / total_genes, ".2f")}%')
|
|
253
|
-
print(
|
|
254
|
-
f'Multi-matched ORFs: {total_multi} [{total_genes}] - {format(100 * total_multi / total_genes, ".2f")}%')
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
240
|
+
out_file.write(f'Perfect Matches: {total_perfect} [{total_genes}] - {100 * total_perfect / total_genes:.2f}%\n')
|
|
241
|
+
out_file.write(f'Partial Matches: {total_partial} [{total_genes}] - {100 * total_partial / total_genes:.2f}%\n')
|
|
242
|
+
out_file.write(f'Missed Genes: {total_missed} [{total_genes}] - {100 * total_missed / total_genes:.2f}%\n')
|
|
243
|
+
out_file.write(f'Unmatched ORFs: {total_unmatched} [{total_genes}] - {100 * total_unmatched / total_genes:.2f}%\n')
|
|
244
|
+
out_file.write(f'Multi-matched ORFs: {total_multi} [{total_genes}] - {100 * total_multi / total_genes:.2f}%\n')
|
|
245
|
+
|
|
246
|
+
lines = [
|
|
247
|
+
f"Combined metrics for all contigs:",
|
|
248
|
+
f"Number of Genes: {total_genes}",
|
|
249
|
+
f"Number of ORFs: {total_orfs}",
|
|
250
|
+
f"Perfect Matches: {total_perfect} [{total_genes}] - {100 * total_perfect / total_genes:.2f}%",
|
|
251
|
+
f"Partial Matches: {total_partial} [{total_genes}] - {100 * total_partial / total_genes:.2f}%",
|
|
252
|
+
f"Missed Genes: {total_missed} [{total_genes}] - {100 * total_missed / total_genes:.2f}%",
|
|
253
|
+
f"Unmatched ORFs: {total_unmatched} [{total_genes}] - {100 * total_unmatched / total_genes:.2f}%",
|
|
254
|
+
f"Multi-matched ORFs: {total_multi} [{total_genes}] - {100 * total_multi / total_genes:.2f}%"
|
|
255
|
+
]
|
|
256
|
+
|
|
257
|
+
full_msg = '\n'.join(lines) + '\n'
|
|
258
|
+
if options.verbose:
|
|
259
|
+
print(full_msg)
|
|
260
|
+
options.output_logger.info(full_msg)
|
|
258
261
|
|
|
259
262
|
|
|
260
263
|
def main():
|
|
@@ -282,21 +285,35 @@ def main():
|
|
|
282
285
|
'- Provide tool name to compare output from two tools')
|
|
283
286
|
|
|
284
287
|
output = parser.add_argument_group('Output')
|
|
285
|
-
output.add_argument('-o', dest='outdir', required=
|
|
286
|
-
help='Define directory where detailed output should be places
|
|
288
|
+
output.add_argument('-o', dest='outdir', required=True,
|
|
289
|
+
help='Define directory where detailed output should be places')
|
|
287
290
|
output.add_argument('-n', dest='outname', required=False,
|
|
288
|
-
help='Define output
|
|
291
|
+
help='Define output filename(s) prefix - If not provided, filename of reference '
|
|
292
|
+
'annotation file will be used- <outname>_<contig_id>_ORF_Comparison.csv')
|
|
289
293
|
|
|
290
294
|
misc = parser.add_argument_group('Misc')
|
|
291
295
|
misc.add_argument('-v', dest='verbose', default='False', type=eval, choices=[True, False],
|
|
292
296
|
help='Default - False: Print out runtime status')
|
|
293
297
|
options = parser.parse_args()
|
|
294
298
|
|
|
295
|
-
if options.
|
|
296
|
-
|
|
299
|
+
options.outname = options.outname if options.outname else options.reference_annotation.split('/')[-1].split('.')[0]
|
|
300
|
+
|
|
301
|
+
# Initialise loggers once and store on options
|
|
302
|
+
if not getattr(options, 'logger_initialized', False):
|
|
303
|
+
os.makedirs(options.outdir, exist_ok=True)
|
|
304
|
+
output_log = os.path.join(options.outdir, f"ORForise_{options.outname}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log")
|
|
305
|
+
logger = logging.getLogger('ORForise.output')
|
|
306
|
+
logger.setLevel(logging.INFO)
|
|
307
|
+
fh_out = logging.FileHandler(output_log, encoding='utf-8')
|
|
308
|
+
fh_out.setFormatter(logging.Formatter('%(message)s'))
|
|
309
|
+
logger.addHandler(fh_out)
|
|
310
|
+
|
|
311
|
+
options.output_logger = logger
|
|
312
|
+
options.logger_initialized = True
|
|
313
|
+
|
|
297
314
|
|
|
298
315
|
comparator(options)
|
|
299
316
|
|
|
300
317
|
if __name__ == "__main__":
|
|
301
318
|
main()
|
|
302
|
-
print("Complete")
|
|
319
|
+
print("Complete")
|
ORForise/Comparator.py
CHANGED
|
@@ -206,33 +206,53 @@ def start_Codon_Count(orfs):
|
|
|
206
206
|
else:
|
|
207
207
|
other += 1
|
|
208
208
|
other_Starts.append(codon)
|
|
209
|
-
atg_P = format(100 * atg / len(orfs), '.2f')
|
|
210
|
-
gtg_P = format(100 * gtg / len(orfs), '.2f')
|
|
211
|
-
ttg_P = format(100 * ttg / len(orfs), '.2f')
|
|
212
|
-
att_P = format(100 * att / len(orfs), '.2f')
|
|
213
|
-
ctg_P = format(100 * ctg / len(orfs), '.2f')
|
|
214
|
-
other_Start_P = format(100 * other / len(orfs), '.2f')
|
|
215
|
-
return atg_P, gtg_P, ttg_P, att_P, ctg_P, other_Start_P, other_Starts
|
|
216
209
|
|
|
210
|
+
total = len(orfs) if orfs is not None else 0
|
|
211
|
+
|
|
212
|
+
if total:
|
|
213
|
+
atg_P = format(100 * atg / len(orfs), '.2f')
|
|
214
|
+
gtg_P = format(100 * gtg / len(orfs), '.2f')
|
|
215
|
+
ttg_P = format(100 * ttg / len(orfs), '.2f')
|
|
216
|
+
att_P = format(100 * att / len(orfs), '.2f')
|
|
217
|
+
ctg_P = format(100 * ctg / len(orfs), '.2f')
|
|
218
|
+
other_Start_P = format(100 * other / len(orfs), '.2f')
|
|
219
|
+
else:
|
|
220
|
+
atg_P = ttg_P = gtg_P = ctg_P = att_P = other_Start_P = format(0, '.2f')
|
|
221
|
+
|
|
222
|
+
return {
|
|
223
|
+
'ATG': (atg, atg_P),
|
|
224
|
+
'TTG': (ttg, ttg_P),
|
|
225
|
+
'GTG': (gtg, gtg_P),
|
|
226
|
+
'CTG': (ctg, ctg_P),
|
|
227
|
+
'ATT': (att, att_P),
|
|
228
|
+
'Other': (other, other_Start_P),
|
|
229
|
+
'total': total
|
|
230
|
+
}
|
|
217
231
|
|
|
218
232
|
def stop_Codon_Count(orfs):
    """Tally stop-codon usage across a collection of ORFs.

    ``orfs`` is a mapping whose values are indexable records with the stop
    codon at index 2. ``None`` or an empty mapping is accepted.

    Returns ``(tag_p, taa_p, tga_p, other_Stop_P, other_Stops)``: the
    percentage of ORFs ending in TAG, TAA, TGA and any other codon, each
    formatted as a two-decimal string, followed by the list of the "other"
    stop codons in the order they were encountered. With no ORFs every
    percentage is ``'0.00'`` and the list is empty.
    """
    canonical = {'TAG': 0, 'TAA': 0, 'TGA': 0}
    other = 0
    other_Stops = []

    total = len(orfs) if orfs else 0
    if total:
        for orf in orfs.values():
            codon = orf[2]
            if codon in canonical:
                canonical[codon] += 1
            else:
                other += 1
                other_Stops.append(codon)

    def _share(count):
        # Guard on ``total`` so an empty input never divides by zero.
        return format(100 * count / total, '.2f') if total else format(0, '.2f')

    tag_p = _share(canonical['TAG'])
    taa_p = _share(canonical['TAA'])
    tga_p = _share(canonical['TGA'])
    other_Stop_P = _share(other)

    return tag_p, taa_p, tga_p, other_Stop_P, other_Stops
|
|
237
257
|
|
|
238
258
|
|
|
@@ -260,8 +280,8 @@ def candidate_ORF_Selection(gene_Set,
|
|
|
260
280
|
if len(current_ORF_Difference) > len(candidate_ORF_Difference):
|
|
261
281
|
pos = c_Pos
|
|
262
282
|
orf_Details = c_ORF_Details
|
|
263
|
-
else:
|
|
264
|
-
|
|
283
|
+
#else:
|
|
284
|
+
#("Match filtered out")
|
|
265
285
|
return pos, orf_Details
|
|
266
286
|
|
|
267
287
|
|
|
@@ -300,6 +320,11 @@ def tool_comparison(all_orfs, dna_regions, verbose):
|
|
|
300
320
|
|
|
301
321
|
ref_genes_list = dna_regions[dna_region][2]
|
|
302
322
|
ref_genes = collections.OrderedDict()
|
|
323
|
+
|
|
324
|
+
if not ref_genes_list:
|
|
325
|
+
results[dna_region] = {}
|
|
326
|
+
continue
|
|
327
|
+
|
|
303
328
|
for d in ref_genes_list:
|
|
304
329
|
ref_genes.update(d)
|
|
305
330
|
comp.genome_Seq = dna_regions[dna_region][0]
|
|
@@ -311,6 +336,10 @@ def tool_comparison(all_orfs, dna_regions, verbose):
|
|
|
311
336
|
|
|
312
337
|
better_pos_orfs_items = [[(int(pos.split(',')[0]), int(pos.split(',')[1])), orf_Details] for pos, orf_Details in current_orfs.items()] #TODO: turn pos into tuple instead of string everywhere
|
|
313
338
|
|
|
339
|
+
if not current_orfs or not better_pos_orfs_items:
|
|
340
|
+
results[dna_region] = {}
|
|
341
|
+
continue
|
|
342
|
+
|
|
314
343
|
for gene_num, gene_details in ref_genes.items(): # Loop through each gene to compare against predicted ORFs
|
|
315
344
|
g_Start = int(gene_details[0])
|
|
316
345
|
g_Stop = int(gene_details[1])
|
|
@@ -477,10 +506,13 @@ def tool_comparison(all_orfs, dna_regions, verbose):
|
|
|
477
506
|
comp.gene_Pos_Olap.append(0)
|
|
478
507
|
elif '-' in g_Strand:
|
|
479
508
|
comp.gene_Neg_Olap.append(0)
|
|
480
|
-
####
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
509
|
+
#### avoid ValueError
|
|
510
|
+
if comp.gene_Lengths:
|
|
511
|
+
min_Gene_Length = min(comp.gene_Lengths)
|
|
512
|
+
max_Gene_Length = max(comp.gene_Lengths)
|
|
513
|
+
median_Gene_Length = np.median(comp.gene_Lengths)
|
|
514
|
+
else:
|
|
515
|
+
min_Gene_Length = max_Gene_Length = min_Length_Difference = 0
|
|
484
516
|
prev_ORF_Stop = 0
|
|
485
517
|
prev_ORF_Overlapped = False
|
|
486
518
|
for o_Positions, orf_Details in current_orfs.items():
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import logging
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
import os
|
|
5
|
+
import sys
|
|
6
|
+
|
|
7
|
+
try:
|
|
8
|
+
from utils import *
|
|
9
|
+
from Tools.TabToGFF.TabToGFF import TabToGFF
|
|
10
|
+
except ImportError:
|
|
11
|
+
from ORForise.utils import *
|
|
12
|
+
from ORForise.Tools.TabToGFF.TabToGFF import TabToGFF
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def setup_logging(outdir, verbose=False):
    """Configure the root logger for a Convert_To_GFF run.

    Handlers already attached to the root logger are detached and closed,
    then a stdout handler is installed. When ``verbose`` is true the level is
    DEBUG and a timestamped ``convert_to_gff_<timestamp>.log`` file handler is
    added under ``outdir``; otherwise the level is INFO and no file is created.

    Returns the logfile path, or ``None`` when no logfile was created.
    """
    ts = datetime.now().strftime('%Y%m%d_%H%M%S')
    logfile = None
    level = logging.DEBUG if verbose else logging.INFO

    logger = logging.getLogger()
    logger.setLevel(level)

    # Detach AND close existing handlers. Merely resetting the handler list
    # avoids duplicate output but leaves earlier logfiles open when this
    # function is called repeatedly.
    for handler in list(logger.handlers):
        logger.removeHandler(handler)
        handler.close()

    fmt = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')

    # Only create a file handler (and thus the logfile) when verbose is enabled
    if verbose:
        logfile = os.path.join(outdir, f'convert_to_gff_{ts}.log')
        fh = logging.FileHandler(logfile, encoding='utf-8')
        fh.setLevel(logging.DEBUG)
        fh.setFormatter(fmt)
        logger.addHandler(fh)

    # Always add a stdout handler
    sh = logging.StreamHandler(sys.stdout)
    sh.setLevel(level)
    sh.setFormatter(fmt)
    logger.addHandler(sh)
    return logfile
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def write_gff(outpath, genome_ID, genome_DNA, input_annotation, fmt, features):
    """Write ``features`` to ``outpath`` as a GFF file.

    Parameters:
        outpath: destination file path (overwritten if present).
        genome_ID: accepted for interface compatibility; not used here.
        genome_DNA: path of the genome FASTA, recorded in the header when
            truthy and omitted otherwise.
        input_annotation: path of the source annotation, recorded in the header.
        fmt: input format name; written as the GFF source column. Only
            ``'abricate'`` attribute rendering is implemented.
        features: mapping of ``'start,stop'`` position strings to dicts with
            at least ``seqid`` and ``strand`` and, for abricate, ``accession``,
            ``database``, ``identity``, ``coverage``, ``product`` and
            ``resistance``.

    Raises:
        ValueError: if there are features to write but ``fmt`` is not a
            supported format. Raised before the output file is opened so no
            partial file is left behind.
    """
    # The attribute column can only be built for abricate records; previously
    # any other format failed on an unassigned local after the header had
    # already been written.
    if features and fmt != 'abricate':
        raise ValueError(
            "Unsupported format for GFF output: %r (only 'abricate' is currently supported)" % (fmt,)
        )

    with open(outpath, 'w', encoding='utf-8') as out:
        out.write('##gff-version\t3\n')
        out.write('#\tConvert_To_GFF\n')
        out.write('#\tRun Date: ' + str(datetime.now()) + '\n')
        # Only include genome DNA line if a path was provided
        if genome_DNA:
            out.write('##Genome DNA File:' + genome_DNA + '\n')
        out.write('##Original File: ' + input_annotation + '\n')

        for pos, data in features.items():
            coords = pos.split(',')
            start = coords[0]
            stop = coords[-1]
            strand = data['strand']
            info = (
                'abricate_anotation;accession=' + data['accession']
                + ';database=' + data['database']
                + ';identity=' + str(data['identity'])
                + ';coverage=' + str(data['coverage'])
                + ';product=' + data['product']
                + ';resistance=' + data['resistance']
            )
            out.write(f"{data['seqid']}\t{fmt}\tCDS\t{start}\t{stop}\t.\t{strand}\t.\tID={info}\n")
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def load_genome(genome_fasta):
    """Read a FASTA file and return ``(genome_ID, genome_seq)``.

    ``genome_ID`` is the first whitespace-delimited token of the header line
    with its leading ``>`` removed, or ``'unknown'`` when the file has no
    header. ``genome_seq`` is every non-header line joined together. If the
    file holds several records, the ID of the last header is returned and all
    sequence lines are concatenated.
    """
    genome_ID = 'unknown'
    seq_parts = []  # joined once at the end instead of repeated string +=
    with open(genome_fasta, 'r') as fh:
        for line in fh:
            # Strip all trailing whitespace so stray spaces/tabs never end up
            # in the sequence and whitespace-only lines are treated as blank.
            line = line.rstrip()
            if not line:
                continue
            if line.startswith('>'):
                genome_ID = line.split()[0].lstrip('>')
            else:
                seq_parts.append(line)
    return genome_ID, ''.join(seq_parts)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def main():
    """Command-line entry point: convert a tabular annotation file to GFF.

    Parses the command-line options, sets up logging, optionally loads the
    genome FASTA, parses the input annotation with ``TabToGFF`` and writes
    ``<input basename>.gff`` into the output directory. Exits with status 1
    when the genome file is missing or the annotation cannot be parsed.
    """
    print("Thank you for using ORForise\nPlease report any issues to: https://github.com/NickJD/ORForise/issues\n#####")

    parser = argparse.ArgumentParser(description='ORForise ' + ORForise_Version + ': Convert-To-GFF Run Parameters')
    parser._action_groups.pop()

    required = parser.add_argument_group('Required Arguments')
    # Make genome DNA optional: if not provided we operate without genome sequence
    required.add_argument('-dna', dest='genome_DNA', required=False, help='Genome DNA file (.fa)')
    required.add_argument('-i', dest='input_annotation', required=True, help='Input annotation file (tabular)')
    required.add_argument('-fmt', dest='format', required=True, help='Input format: blast, abricate, genemark')
    required.add_argument('-o', dest='output_dir', required=True, help='Output directory')

    optional = parser.add_argument_group('Optional Arguments')
    optional.add_argument('-gi', dest='gene_ident', default='CDS', required=False, help='Gene identifier types to extract (unused)')
    optional.add_argument('--verbose', dest='verbose', action='store_true', help='Verbose logging with logfile')

    options = parser.parse_args()

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(options.output_dir, exist_ok=True)
    logfile = setup_logging(options.output_dir, verbose=options.verbose)
    logging.info('Starting Convert_To_GFF')
    # Log genome DNA only if provided
    if options.genome_DNA:
        logging.info('Genome DNA: %s', options.genome_DNA)
    else:
        logging.info('Genome DNA: (not provided)')
    logging.info('Input annotation: %s', options.input_annotation)
    logging.info('Format: %s', options.format)

    # If a genome fasta was provided, load it; otherwise proceed without genome sequence
    if options.genome_DNA:
        if not os.path.exists(options.genome_DNA):
            logging.error('Genome DNA file does not exist: %s', options.genome_DNA)
            sys.exit(1)
        genome_ID, genome_seq = load_genome(options.genome_DNA)
    else:
        # Derive a sensible genome_ID from the annotation filename and leave sequence empty
        genome_ID = os.path.splitext(os.path.basename(options.input_annotation))[0]
        genome_seq = ''

    try:
        # Build genome map expected by TabToGFF: mapping genome_ID -> tuple(sequence, ...)
        genome_map = {genome_ID: (genome_seq,)}
        features = TabToGFF(options.input_annotation, genome_map, options.gene_ident, fmt=options.format)
    except Exception:
        # Top-level boundary: record the full traceback, then exit non-zero.
        logging.exception('Error parsing input annotation')
        sys.exit(1)

    # NOTE: features are deliberately left unsorted to preserve the input order.
    basename = os.path.basename(options.input_annotation)
    dot = basename.rfind('.')
    # Replace the final extension (if any) with .gff
    outname = (basename[:dot] if dot != -1 else basename) + '.gff'
    outgff = os.path.join(options.output_dir, outname)
    # Pass the original genome path if provided, else pass None so headers adapt
    genome_DNA_path = options.genome_DNA if options.genome_DNA else None
    write_gff(outgff, genome_ID, genome_DNA_path, options.input_annotation, options.format, features)
    logging.info('Wrote GFF to %s', outgff)
    # A logfile only exists in verbose mode; avoid logging "Logfile: None".
    if logfile:
        logging.info('Logfile: %s', logfile)
|
|
136
|
+
|
|
137
|
+
if __name__ == '__main__':
|
|
138
|
+
main()
|