ORForise 1.6.2__py3-none-any.whl → 1.6.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ORForise/Aggregate_Compare.py +7 -4
- ORForise/Annotation_Compare.py +7 -4
- ORForise/Annotation_Intersector.py +89 -76
- ORForise/Convert_To_GFF.py +7 -3
- ORForise/GFF_Adder.py +7 -4
- ORForise/List_Tools.py +7 -4
- ORForise/StORForise.py +7 -2
- ORForise/Tools/GFF/GFF.py +2 -2
- ORForise/Tools/StORF-Reporter/StORF-Reporter.py +2 -2
- ORForise/utils.py +2 -2
- {orforise-1.6.2.dist-info → orforise-1.6.4.dist-info}/METADATA +7 -7
- {orforise-1.6.2.dist-info → orforise-1.6.4.dist-info}/RECORD +16 -16
- {orforise-1.6.2.dist-info → orforise-1.6.4.dist-info}/WHEEL +1 -1
- {orforise-1.6.2.dist-info → orforise-1.6.4.dist-info}/entry_points.txt +0 -0
- {orforise-1.6.2.dist-info → orforise-1.6.4.dist-info}/licenses/LICENSE +0 -0
- {orforise-1.6.2.dist-info → orforise-1.6.4.dist-info}/top_level.txt +0 -0
ORForise/Aggregate_Compare.py
CHANGED
|
@@ -337,7 +337,6 @@ def comparator(options):
|
|
|
337
337
|
|
|
338
338
|
|
|
339
339
|
def main():
|
|
340
|
-
print(WELCOME)
|
|
341
340
|
|
|
342
341
|
parser = argparse.ArgumentParser(description='ORForise ' + ORForise_Version + ': Aggregate-Compare Run Parameters.')
|
|
343
342
|
parser._action_groups.pop()
|
|
@@ -373,6 +372,10 @@ def main():
|
|
|
373
372
|
comparator(options)
|
|
374
373
|
|
|
375
374
|
if __name__ == "__main__":
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
375
|
+
try:
|
|
376
|
+
try:
|
|
377
|
+
main()
|
|
378
|
+
except Exception:
|
|
379
|
+
print('Unhandled exception in main')
|
|
380
|
+
finally:
|
|
381
|
+
print(CLOSING)
|
ORForise/Annotation_Compare.py
CHANGED
|
@@ -259,8 +259,6 @@ def comparator(options):
|
|
|
259
259
|
|
|
260
260
|
|
|
261
261
|
def main():
|
|
262
|
-
print(WELCOME)
|
|
263
|
-
|
|
264
262
|
parser = argparse.ArgumentParser(description='ORForise ' + ORForise_Version + ': Annotatione-Compare Run Parameters.')
|
|
265
263
|
parser._action_groups.pop()
|
|
266
264
|
|
|
@@ -313,5 +311,10 @@ def main():
|
|
|
313
311
|
comparator(options)
|
|
314
312
|
|
|
315
313
|
if __name__ == "__main__":
|
|
316
|
-
|
|
317
|
-
|
|
314
|
+
try:
|
|
315
|
+
try:
|
|
316
|
+
main()
|
|
317
|
+
except Exception:
|
|
318
|
+
logging.exception('Unhandled exception in main')
|
|
319
|
+
finally:
|
|
320
|
+
print(CLOSING)
|
|
@@ -26,13 +26,12 @@ def gff_writer(genome_ID, genome_DNA, reference_annotation, reference_tool, ref_
|
|
|
26
26
|
if out_dir:
|
|
27
27
|
os.makedirs(out_dir, exist_ok=True)
|
|
28
28
|
|
|
29
|
-
# Always open the file and write the header first. Use a broad try so we can log any issue.
|
|
30
29
|
try:
|
|
31
30
|
with open(output_file, 'w', encoding='utf-8') as write_out:
|
|
32
31
|
write_out.write("##gff-version\t3\n#\tAnnotation-Intersector\n#\tRun Date:" + str(date.today()) + '\n')
|
|
33
32
|
if genome_DNA:
|
|
34
33
|
write_out.write("##Genome DNA File:" + genome_DNA + '\n')
|
|
35
|
-
write_out.write("##Original File: " + reference_annotation + "\n##Intersecting File: " + additional_annotation + '\n')
|
|
34
|
+
write_out.write("##Original File: " + (reference_annotation or '') + "\n##Intersecting File: " + (additional_annotation or '') + '\n')
|
|
36
35
|
|
|
37
36
|
entries_written = 0
|
|
38
37
|
|
|
@@ -43,7 +42,7 @@ def gff_writer(genome_ID, genome_DNA, reference_annotation, reference_tool, ref_
|
|
|
43
42
|
|
|
44
43
|
# Parse reference annotation and write features matching gene_ident
|
|
45
44
|
try:
|
|
46
|
-
if reference_annotation.endswith('.gz'):
|
|
45
|
+
if reference_annotation and reference_annotation.endswith('.gz'):
|
|
47
46
|
rf = gzip.open(reference_annotation, 'rt')
|
|
48
47
|
else:
|
|
49
48
|
rf = open(reference_annotation, 'r', encoding='unicode_escape')
|
|
@@ -68,10 +67,11 @@ def gff_writer(genome_ID, genome_DNA, reference_annotation, reference_tool, ref_
|
|
|
68
67
|
stop = parts[4]
|
|
69
68
|
strand = parts[6]
|
|
70
69
|
info = parts[8]
|
|
70
|
+
source = parts[1] if len(parts) > 1 else ''
|
|
71
71
|
except Exception:
|
|
72
72
|
continue
|
|
73
|
-
# write entry with coverage 0 and empty additional annotation
|
|
74
|
-
entry = f"{seqid}\t{
|
|
73
|
+
# write entry with coverage 0 and empty additional annotation; use source from input GFF
|
|
74
|
+
entry = f"{seqid}\t{source}\t{ftype}\t{start}\t{stop}\t.\t{strand}\t.\tID=Original_Annotation={info}\n"
|
|
75
75
|
write_out.write(entry)
|
|
76
76
|
entries_written += 1
|
|
77
77
|
except Exception as e:
|
|
@@ -81,25 +81,26 @@ def gff_writer(genome_ID, genome_DNA, reference_annotation, reference_tool, ref_
|
|
|
81
81
|
logging.info('Wrote %d fallback reference entries to %s', entries_written, output_file)
|
|
82
82
|
return
|
|
83
83
|
|
|
84
|
+
# Iterate contigs and write kept entries. Kept entry layout expected:
|
|
85
|
+
# [strand, coverage, additional_type, ref_type, additional_info, ref_info, ref_source]
|
|
84
86
|
for contig, genes in genes_To_Keep_by_contig.items():
|
|
85
|
-
|
|
86
|
-
ref = os.path.splitext(os.path.basename(reference_annotation))[0].split('_')[0]
|
|
87
|
+
fallback_source = os.path.splitext(os.path.basename(reference_annotation))[0].split('_')[0] if reference_annotation else 'reference'
|
|
87
88
|
for pos, data in genes.items():
|
|
88
89
|
try:
|
|
89
90
|
pos_ = pos.split(',')
|
|
90
91
|
start = pos_[0]
|
|
91
92
|
stop = pos_[-1]
|
|
92
|
-
strand = data[0]
|
|
93
|
-
# Ensure indices exist and are strings
|
|
93
|
+
strand = data[0] if len(data) > 0 else '.'
|
|
94
94
|
add_ann = str(data[4]) if len(data) > 4 else ''
|
|
95
95
|
orig_ann = str(data[5]) if len(data) > 5 else ''
|
|
96
|
+
source_field = data[6] if len(data) > 6 and data[6] else fallback_source
|
|
97
|
+
feat_type = data[3] if len(data) > 3 and data[3] else (data[2] if len(data) > 2 else 'CDS')
|
|
96
98
|
entry = (
|
|
97
|
-
contig + '\t' +
|
|
99
|
+
contig + '\t' + source_field + '\t' + feat_type + '\t' + start + '\t' + stop + '\t.\t' + strand + '\t.\tID=Original_Annotation=' + orig_ann + ';Additional_Annotation=' + add_ann + ';Coverage=' + str(
|
|
98
100
|
data[1]) + '\n')
|
|
99
101
|
write_out.write(entry)
|
|
100
102
|
entries_written += 1
|
|
101
103
|
except Exception as e:
|
|
102
|
-
# Log the bad entry and continue
|
|
103
104
|
logging.warning('Skipping bad GFF entry for contig %s pos %s: %s', contig, pos, e)
|
|
104
105
|
continue
|
|
105
106
|
|
|
@@ -179,7 +180,9 @@ def _write_discordance_report(report_path, entries):
|
|
|
179
180
|
|
|
180
181
|
|
|
181
182
|
def _write_discordance_gff(report_path, entries, reference_annotation_basename):
|
|
182
|
-
"""Write a list of discordance entries (dicts) to a GFF file.
|
|
183
|
+
"""Write a list of discordance entries (dicts) to a GFF file.
|
|
184
|
+
The GFF source column is taken from carried 'ref_source' or 'add_source' when available.
|
|
185
|
+
"""
|
|
183
186
|
report_path = os.path.expanduser(report_path)
|
|
184
187
|
out_dir = os.path.dirname(report_path)
|
|
185
188
|
if out_dir:
|
|
@@ -189,40 +192,35 @@ def _write_discordance_gff(report_path, entries, reference_annotation_basename):
|
|
|
189
192
|
fh.write('##gff-version\t3\n')
|
|
190
193
|
fh.write('#\tAnnotation-Intersector discordance report\n')
|
|
191
194
|
fh.write('#\tRun Date:' + str(date.today()) + '\n')
|
|
192
|
-
fh.write('##Original File: ' + reference_annotation_basename + '\n')
|
|
195
|
+
fh.write('##Original File: ' + (reference_annotation_basename or '') + '\n')
|
|
193
196
|
entries_written = 0
|
|
194
|
-
for e in entries:
|
|
197
|
+
for e in (entries or []):
|
|
195
198
|
try:
|
|
196
199
|
contig = str(e.get('contig', '.'))
|
|
197
|
-
# prefer reference coords if present
|
|
198
200
|
ref_pos = e.get('ref_pos', '')
|
|
199
201
|
add_pos = e.get('add_pos', '')
|
|
200
202
|
if ref_pos:
|
|
201
203
|
start, stop = ref_pos.split(',')
|
|
202
204
|
ftype = e.get('ref_type', '') or 'CDS'
|
|
203
|
-
source = reference_annotation_basename.split('_')[0]
|
|
205
|
+
source = e.get('ref_source') or (reference_annotation_basename.split('_')[0] if reference_annotation_basename else 'reference')
|
|
204
206
|
info_attr = e.get('ref_info', '')
|
|
205
207
|
else:
|
|
206
|
-
|
|
207
|
-
start, stop = add_pos.split(',') if add_pos else ('0','0')
|
|
208
|
+
start, stop = add_pos.split(',') if add_pos else ('0', '0')
|
|
208
209
|
ftype = e.get('add_type', '') or 'CDS'
|
|
209
|
-
source = e.get('
|
|
210
|
+
source = e.get('add_source') or 'additional'
|
|
210
211
|
info_attr = e.get('add_info', '')
|
|
211
|
-
# attributes
|
|
212
212
|
attrs = []
|
|
213
213
|
attrs.append('Status=' + str(e.get('status', '')))
|
|
214
214
|
attrs.append('Coverage=' + str(e.get('coverage', '')))
|
|
215
215
|
if e.get('ref_info', ''):
|
|
216
|
-
attrs.append('Ref_info=' + str(e.get('ref_info', '')).replace(';','%3B'))
|
|
216
|
+
attrs.append('Ref_info=' + str(e.get('ref_info', '')).replace(';', '%3B'))
|
|
217
217
|
if e.get('add_info', ''):
|
|
218
|
-
attrs.append('Add_info=' + str(e.get('add_info', '')).replace(';','%3B'))
|
|
218
|
+
attrs.append('Add_info=' + str(e.get('add_info', '')).replace(';', '%3B'))
|
|
219
219
|
attr_str = ';'.join(attrs)
|
|
220
|
-
# construct GFF line
|
|
221
220
|
line = f"{contig}\t{source}\t{ftype}\t{start}\t{stop}\t.\t.\t.\t{attr_str}\n"
|
|
222
221
|
fh.write(line)
|
|
223
222
|
entries_written += 1
|
|
224
223
|
except Exception:
|
|
225
|
-
# skip bad entry
|
|
226
224
|
continue
|
|
227
225
|
logging.info('Wrote %d discordance GFF entries to %s', entries_written, report_path)
|
|
228
226
|
except OSError as e:
|
|
@@ -236,12 +234,9 @@ def compute_discordance(ref_map_by_contig, add_map_by_contig, options):
|
|
|
236
234
|
- only_in_additional: additional ORFs that don't overlap any reference entry
|
|
237
235
|
- mismatches: reference entries with overlapping additional ORFs that don't meet match criteria
|
|
238
236
|
|
|
239
|
-
This version is strand-aware and will classify mismatches that are due to strand
|
|
240
|
-
differences separately from type/coverage differences.
|
|
241
|
-
|
|
242
237
|
Expected layouts:
|
|
243
|
-
- ref entry: [strand, 'ref', type, info]
|
|
244
|
-
- add entry: [strand, ..., type (index 3), info (last element)]
|
|
238
|
+
- ref entry: [strand, 'ref', type, info, source]
|
|
239
|
+
- add entry: [strand, ..., type (index 3), info (last element), (optional) source]
|
|
245
240
|
"""
|
|
246
241
|
only_in_ref = []
|
|
247
242
|
only_in_additional = []
|
|
@@ -255,7 +250,6 @@ def compute_discordance(ref_map_by_contig, add_map_by_contig, options):
|
|
|
255
250
|
ref_map = ref_map_by_contig.get(contig, {}) or {}
|
|
256
251
|
add_map = add_map_by_contig.get(contig, {}) or {}
|
|
257
252
|
|
|
258
|
-
# For each reference feature, find best overlapping additional ORF and classify
|
|
259
253
|
for rpos, rdata in ref_map.items():
|
|
260
254
|
rstart, rstop = _parse_pos(rpos)
|
|
261
255
|
if rstart is None:
|
|
@@ -266,10 +260,10 @@ def compute_discordance(ref_map_by_contig, add_map_by_contig, options):
|
|
|
266
260
|
best_add_data = None
|
|
267
261
|
matched = False
|
|
268
262
|
|
|
269
|
-
# reference fields
|
|
270
263
|
r_strand = rdata[0] if len(rdata) > 0 else ''
|
|
271
|
-
r_type = rdata[
|
|
272
|
-
r_info = rdata[
|
|
264
|
+
r_type = rdata[2] if len(rdata) > 2 else ''
|
|
265
|
+
r_info = rdata[3] if len(rdata) > 3 else ''
|
|
266
|
+
r_source = rdata[4] if len(rdata) > 4 else ''
|
|
273
267
|
|
|
274
268
|
for apos, adata in add_map.items():
|
|
275
269
|
astart, astop = _parse_pos(apos)
|
|
@@ -284,27 +278,25 @@ def compute_discordance(ref_map_by_contig, add_map_by_contig, options):
|
|
|
284
278
|
best_add = apos
|
|
285
279
|
best_add_data = adata
|
|
286
280
|
|
|
287
|
-
# additional fields
|
|
288
281
|
a_strand = adata[0] if len(adata) > 0 else ''
|
|
289
282
|
a_type = adata[3] if len(adata) > 3 else ''
|
|
290
|
-
|
|
283
|
+
a_info = adata[-1] if len(adata) > 0 else ''
|
|
284
|
+
a_source = adata[4] if len(adata) > 4 else ''
|
|
285
|
+
|
|
291
286
|
try:
|
|
292
287
|
frame_ok = ((abs(astop - rstop) % 3) == 0)
|
|
293
288
|
except Exception:
|
|
294
289
|
frame_ok = True
|
|
295
290
|
|
|
296
|
-
# check for a fully satisfactory match: type, coverage, strand and frame
|
|
297
291
|
if a_type == r_type and cov >= cov_thresh and (a_strand == r_strand) and frame_ok:
|
|
298
292
|
matched = True
|
|
299
293
|
matched_adds.add((contig, apos))
|
|
300
294
|
break
|
|
301
295
|
|
|
302
296
|
if matched:
|
|
303
|
-
# good match -> not discordant
|
|
304
297
|
continue
|
|
305
298
|
|
|
306
299
|
if best_add is None:
|
|
307
|
-
# no overlapping additional ORF found
|
|
308
300
|
only_in_ref.append({
|
|
309
301
|
'contig': contig,
|
|
310
302
|
'ref_pos': rpos,
|
|
@@ -314,26 +306,24 @@ def compute_discordance(ref_map_by_contig, add_map_by_contig, options):
|
|
|
314
306
|
'status': 'only_in_ref',
|
|
315
307
|
'coverage': '0.00',
|
|
316
308
|
'ref_info': r_info,
|
|
309
|
+
'ref_source': r_source,
|
|
317
310
|
'add_info': ''
|
|
318
311
|
})
|
|
319
312
|
else:
|
|
320
|
-
# overlapping additional ORF(s) exist but none satisfied the match criteria
|
|
321
313
|
a_type = best_add_data[3] if len(best_add_data) > 3 else ''
|
|
322
314
|
a_info = best_add_data[-1] if len(best_add_data) > 0 else ''
|
|
323
315
|
a_strand = best_add_data[0] if len(best_add_data) > 0 else ''
|
|
316
|
+
a_source = best_add_data[4] if len(best_add_data) > 4 else ''
|
|
324
317
|
|
|
325
|
-
# compute reason flags
|
|
326
318
|
type_match = (a_type == r_type)
|
|
327
319
|
strand_match = (a_strand == r_strand)
|
|
328
320
|
cov_ok = (best_cov >= cov_thresh)
|
|
329
321
|
try:
|
|
330
|
-
# use frame between best add and ref
|
|
331
322
|
astart, astop = _parse_pos(best_add)
|
|
332
323
|
frame_ok = ((abs(astop - rstop) % 3) == 0) if (astop is not None) else True
|
|
333
324
|
except Exception:
|
|
334
325
|
frame_ok = True
|
|
335
326
|
|
|
336
|
-
# classify mismatch with strand-awareness
|
|
337
327
|
if not cov_ok:
|
|
338
328
|
status = 'found_in_additional_but_below_coverage'
|
|
339
329
|
elif not type_match and not strand_match:
|
|
@@ -356,13 +346,14 @@ def compute_discordance(ref_map_by_contig, add_map_by_contig, options):
|
|
|
356
346
|
'status': status,
|
|
357
347
|
'coverage': f"{best_cov:.2f}",
|
|
358
348
|
'ref_info': r_info,
|
|
349
|
+
'ref_source': r_source,
|
|
359
350
|
'add_info': a_info,
|
|
351
|
+
'add_source': a_source,
|
|
360
352
|
})
|
|
361
353
|
|
|
362
354
|
if best_add:
|
|
363
355
|
matched_adds.add((contig, best_add))
|
|
364
356
|
|
|
365
|
-
# Additional-only ORFs: those not matched and not overlapping any reference
|
|
366
357
|
for apos, adata in add_map.items():
|
|
367
358
|
if (contig, apos) in matched_adds:
|
|
368
359
|
continue
|
|
@@ -378,6 +369,7 @@ def compute_discordance(ref_map_by_contig, add_map_by_contig, options):
|
|
|
378
369
|
overlapped = True
|
|
379
370
|
break
|
|
380
371
|
if not overlapped:
|
|
372
|
+
add_source = adata[4] if len(adata) > 4 else ''
|
|
381
373
|
only_in_additional.append({
|
|
382
374
|
'contig': contig,
|
|
383
375
|
'ref_pos': '',
|
|
@@ -388,9 +380,11 @@ def compute_discordance(ref_map_by_contig, add_map_by_contig, options):
|
|
|
388
380
|
'coverage': '0.00',
|
|
389
381
|
'ref_info': '',
|
|
390
382
|
'add_info': adata[-1] if len(adata) > 0 else '',
|
|
383
|
+
'add_source': add_source,
|
|
391
384
|
})
|
|
392
385
|
|
|
393
|
-
|
|
386
|
+
# Return discordance lists and the set of matched additional ORFs (for overlap counts)
|
|
387
|
+
return only_in_ref, only_in_additional, mismatches, matched_adds
|
|
394
388
|
|
|
395
389
|
|
|
396
390
|
def comparator(options):
|
|
@@ -491,11 +485,14 @@ def comparator(options):
|
|
|
491
485
|
strand = parts[6]
|
|
492
486
|
pos = f"{start},{stop}"
|
|
493
487
|
info = parts[8]
|
|
488
|
+
source = parts[1] if len(parts) > 1 else ''
|
|
494
489
|
except (IndexError, ValueError):
|
|
495
490
|
continue
|
|
496
491
|
if seqid not in ref_genes_by_contig:
|
|
497
492
|
ref_genes_by_contig[seqid] = OrderedDict()
|
|
498
|
-
|
|
493
|
+
# Store source from column 1 as well. Layout becomes:
|
|
494
|
+
# [strand, 'ref', type, info, source]
|
|
495
|
+
ref_genes_by_contig[seqid].update({pos: [strand, 'ref', ftype, info, source]})
|
|
499
496
|
else:
|
|
500
497
|
# Use a tool parser to produce ref_genes; expect tool to return mapping contig->dict
|
|
501
498
|
try:
|
|
@@ -558,12 +555,14 @@ def comparator(options):
|
|
|
558
555
|
ref_entry = ref_genes.get(f"{o_Start},{o_Stop}")
|
|
559
556
|
if not ref_entry:
|
|
560
557
|
continue
|
|
561
|
-
# ref_entry layout: [strand, 'ref', type, info]
|
|
562
|
-
ref_type = ref_entry[
|
|
563
|
-
ref_info = ref_entry[
|
|
558
|
+
# ref_entry layout now: [strand, 'ref', type, info, source]
|
|
559
|
+
ref_type = ref_entry[2] if len(ref_entry) > 2 else ''
|
|
560
|
+
ref_info = ref_entry[3] if len(ref_entry) > 3 else ''
|
|
561
|
+
ref_source = ref_entry[4] if len(ref_entry) > 4 else ''
|
|
564
562
|
|
|
565
563
|
if additional_type == ref_type and o_Strand == ref_entry[0]:
|
|
566
|
-
kept
|
|
564
|
+
# kept layout: [strand, coverage, additional_type, ref_type, additional_info, ref_info, ref_source]
|
|
565
|
+
kept.update({f"{o_Start},{o_Stop}": [o_Strand, options.coverage, additional_type, ref_type, additional_info, ref_info, ref_source]})
|
|
567
566
|
else:
|
|
568
567
|
cov_thresh = options.coverage
|
|
569
568
|
for orf, data in orfs.items():
|
|
@@ -598,42 +597,43 @@ def comparator(options):
|
|
|
598
597
|
cov = 100.0 * overlap / gene_len
|
|
599
598
|
|
|
600
599
|
g_Strand = r_data[0]
|
|
601
|
-
# r_data layout: [strand, 'ref', type, info]
|
|
602
|
-
ref_type = r_data[
|
|
603
|
-
ref_info = r_data[
|
|
600
|
+
# r_data layout now: [strand, 'ref', type, info, source]
|
|
601
|
+
ref_type = r_data[2] if len(r_data) > 2 else ''
|
|
602
|
+
ref_info = r_data[3] if len(r_data) > 3 else ''
|
|
603
|
+
ref_source = r_data[4] if len(r_data) > 4 else ''
|
|
604
604
|
|
|
605
605
|
if abs(o_Stop - g_Stop) % 3 == 0 and o_Strand == g_Strand and cov >= cov_thresh:
|
|
606
606
|
if additional_type == ref_type:
|
|
607
|
-
|
|
608
|
-
|
|
607
|
+
# keep ref_source with the kept entry
|
|
608
|
+
kept[f"{g_Start},{g_Stop}"] = [g_Strand, int(cov), additional_type, ref_type, additional_info, ref_info, ref_source]
|
|
609
609
|
genes_To_Keep_by_contig[contig] = sortORFs(kept)
|
|
610
610
|
|
|
611
611
|
# Log counts for debugging why GFF might be empty
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
try:
|
|
621
|
-
total_kept = sum(len(v) for v in genes_To_Keep_by_contig.values()) if genes_To_Keep_by_contig else 0
|
|
622
|
-
except Exception:
|
|
623
|
-
total_kept = 0
|
|
624
|
-
logging.info('Reference genes loaded: %d', total_ref)
|
|
625
|
-
logging.info('Additional ORFs loaded: %d', total_add)
|
|
626
|
-
logging.info('Kept genes after intersection: %d', total_kept)
|
|
612
|
+
# Compute summary metrics (safe/simple)
|
|
613
|
+
total_ref = sum(len(v) for v in ref_genes_by_contig.values()) if ref_genes_by_contig else 0
|
|
614
|
+
total_add = sum(len(v) for v in additional_by_contig.values()) if additional_by_contig else 0
|
|
615
|
+
total_kept = sum(len(v) for v in genes_To_Keep_by_contig.values()) if genes_To_Keep_by_contig else 0
|
|
616
|
+
# Print totals in requested order: reference, additional, then overlap/kept and a percentage
|
|
617
|
+
logging.info('Totals -- reference_genes=%d, additional_genes=%d, overlapping/kept=%d', total_ref, total_add, total_kept)
|
|
618
|
+
if total_ref:
|
|
619
|
+
logging.info('Overlap relative to reference: %.2f%%', (100.0 * total_kept / total_ref))
|
|
627
620
|
|
|
628
621
|
# If requested, compute discordance lists and write three GFF outputs
|
|
629
622
|
if getattr(options, 'report_discordance', False):
|
|
630
|
-
# Compute discordance lists
|
|
631
|
-
only_in_ref, only_in_additional, mismatches = compute_discordance(ref_genes_by_contig, additional_by_contig, options)
|
|
623
|
+
# Compute discordance lists and matched additional ORFs
|
|
624
|
+
only_in_ref, only_in_additional, mismatches, matched_adds = compute_discordance(ref_genes_by_contig, additional_by_contig, options)
|
|
632
625
|
base = os.path.splitext(os.path.basename(options.output_file))[0] if getattr(options, 'output_file', None) else 'discordance'
|
|
633
626
|
outdir = os.path.dirname(options.output_file) if getattr(options, 'output_file', None) else '.'
|
|
634
627
|
ref_base = os.path.splitext(os.path.basename(options.reference_annotation))[0]
|
|
635
628
|
|
|
636
|
-
#
|
|
629
|
+
# Compute and log clear summary metrics
|
|
630
|
+
total_ref = sum(len(v) for v in ref_genes_by_contig.values()) if ref_genes_by_contig else 0
|
|
631
|
+
total_add = sum(len(v) for v in additional_by_contig.values()) if additional_by_contig else 0
|
|
632
|
+
overlapping_additional = len(matched_adds) if matched_adds is not None else 0
|
|
633
|
+
overlapping_reference = max(0, total_ref - (len(only_in_ref) if only_in_ref is not None else 0))
|
|
634
|
+
logging.info('Summary: reference_genes=%d, additional_geness=%d, additional_genes_overlapping_any_reference=%d, reference_genes_overlapped=%d', total_ref, total_add, overlapping_additional, overlapping_reference)
|
|
635
|
+
|
|
636
|
+
# Keep the three detailed GFF outputs (backward compatible)
|
|
637
637
|
gff_ref = os.path.join(outdir, f"{base}.only_in_reference.gff")
|
|
638
638
|
gff_add = os.path.join(outdir, f"{base}.only_in_additional.gff")
|
|
639
639
|
gff_mis = os.path.join(outdir, f"{base}.mismatches.gff")
|
|
@@ -679,6 +679,7 @@ def comparator(options):
|
|
|
679
679
|
contig_summary = {}
|
|
680
680
|
logging.info('Kept genes by contig (sample): %s', dict(list(contig_summary.items())[:10]))
|
|
681
681
|
logging.info('Writing combined GFF to %s', options.output_file)
|
|
682
|
+
# single correct invocation of gff_writer
|
|
682
683
|
gff_writer(genome_ID, genome_DNA_path, options.reference_annotation, getattr(options, 'reference_tool', None), None, options.additional_annotation, options.additional_tool, genes_To_Keep_by_contig, options.output_file, getattr(options, 'gene_ident', None))
|
|
683
684
|
logging.info('gff_writer finished (check output file)')
|
|
684
685
|
except Exception as e:
|
|
@@ -688,8 +689,6 @@ def comparator(options):
|
|
|
688
689
|
|
|
689
690
|
|
|
690
691
|
def main():
|
|
691
|
-
print(WELCOME)
|
|
692
|
-
|
|
693
692
|
parser = argparse.ArgumentParser(description='ORForise ' + ORForise_Version + ': Annotation-Intersector Run Parameters')
|
|
694
693
|
|
|
695
694
|
required = parser.add_argument_group('Required Arguments')
|
|
@@ -721,6 +720,20 @@ def main():
|
|
|
721
720
|
|
|
722
721
|
|
|
723
722
|
if __name__ == '__main__':
|
|
724
|
-
|
|
725
|
-
|
|
723
|
+
try:
|
|
724
|
+
try:
|
|
725
|
+
main()
|
|
726
|
+
except Exception:
|
|
727
|
+
logging.exception('Unhandled exception in main')
|
|
728
|
+
finally:
|
|
729
|
+
print(CLOSING)
|
|
730
|
+
|
|
731
|
+
|
|
732
|
+
|
|
733
|
+
|
|
734
|
+
|
|
735
|
+
|
|
736
|
+
|
|
737
|
+
|
|
738
|
+
|
|
726
739
|
|
ORForise/Convert_To_GFF.py
CHANGED
|
@@ -73,8 +73,6 @@ def load_genome(genome_fasta):
|
|
|
73
73
|
|
|
74
74
|
|
|
75
75
|
def main():
|
|
76
|
-
print(WELCOME)
|
|
77
|
-
|
|
78
76
|
parser = argparse.ArgumentParser(description='ORForise ' + ORForise_Version + ': Convert-To-GFF Run Parameters')
|
|
79
77
|
parser._action_groups.pop()
|
|
80
78
|
|
|
@@ -138,4 +136,10 @@ def main():
|
|
|
138
136
|
logging.info('Logfile: %s', logfile)
|
|
139
137
|
|
|
140
138
|
if __name__ == '__main__':
|
|
141
|
-
|
|
139
|
+
try:
|
|
140
|
+
try:
|
|
141
|
+
main()
|
|
142
|
+
except Exception:
|
|
143
|
+
logging.exception('Unhandled exception in main')
|
|
144
|
+
finally:
|
|
145
|
+
print(CLOSING)
|
ORForise/GFF_Adder.py
CHANGED
|
@@ -496,8 +496,6 @@ def gff_adder(options):
|
|
|
496
496
|
|
|
497
497
|
|
|
498
498
|
def main():
|
|
499
|
-
print(WELCOME)
|
|
500
|
-
|
|
501
499
|
parser = argparse.ArgumentParser(description='ORForise ' + ORForise_Version + ': GFF-Adder Run Parameters.')
|
|
502
500
|
parser._action_groups.pop()
|
|
503
501
|
|
|
@@ -539,5 +537,10 @@ def main():
|
|
|
539
537
|
|
|
540
538
|
|
|
541
539
|
if __name__ == "__main__":
|
|
542
|
-
|
|
543
|
-
|
|
540
|
+
try:
|
|
541
|
+
try:
|
|
542
|
+
main()
|
|
543
|
+
except Exception:
|
|
544
|
+
logging.exception('Unhandled exception in main')
|
|
545
|
+
finally:
|
|
546
|
+
print(CLOSING)
|
ORForise/List_Tools.py
CHANGED
|
@@ -11,8 +11,6 @@ except (ImportError, ModuleNotFoundError):
|
|
|
11
11
|
|
|
12
12
|
|
|
13
13
|
def main():
|
|
14
|
-
print(WELCOME)
|
|
15
|
-
|
|
16
14
|
print('ORForise ' + ORForise_Version + ': List Tools Run Parameters')
|
|
17
15
|
|
|
18
16
|
tools = set()
|
|
@@ -52,5 +50,10 @@ def main():
|
|
|
52
50
|
|
|
53
51
|
|
|
54
52
|
if __name__ == "__main__":
|
|
55
|
-
|
|
56
|
-
|
|
53
|
+
try:
|
|
54
|
+
try:
|
|
55
|
+
main()
|
|
56
|
+
except Exception:
|
|
57
|
+
print('Unhandled exception in main')
|
|
58
|
+
finally:
|
|
59
|
+
print(CLOSING)
|
ORForise/StORForise.py
CHANGED
|
@@ -101,7 +101,6 @@ def comparator(tool, input_to_analyse, storfs_to_find_missing, genome_to_compare
|
|
|
101
101
|
|
|
102
102
|
|
|
103
103
|
def main():
|
|
104
|
-
print(WELCOME)
|
|
105
104
|
parser = argparse.ArgumentParser(description='ORForise ' + ORForise_Version + ': StORForise Run Parameters.')
|
|
106
105
|
parser.add_argument('-t', '--tool', default='GFF', help='Which tool/format would you analyse with StORF-R?')
|
|
107
106
|
parser.add_argument('-i', '--input_to_analyse', default='', help='Location of file containing missed genes')
|
|
@@ -112,4 +111,10 @@ def main():
|
|
|
112
111
|
comparator(**vars(args))
|
|
113
112
|
|
|
114
113
|
if __name__ == "__main__":
|
|
115
|
-
|
|
114
|
+
try:
|
|
115
|
+
try:
|
|
116
|
+
main()
|
|
117
|
+
except Exception:
|
|
118
|
+
print('Unhandled exception in main')
|
|
119
|
+
finally:
|
|
120
|
+
print(CLOSING)
|
ORForise/Tools/GFF/GFF.py
CHANGED
|
@@ -31,7 +31,7 @@ def GFF(*args):
|
|
|
31
31
|
genome_rev = revCompIterative(genome)
|
|
32
32
|
with open(tool_pred, 'r') as gff_input:
|
|
33
33
|
for line in gff_input:
|
|
34
|
-
if '#'
|
|
34
|
+
if not line.startswith('#'):
|
|
35
35
|
line = line.split('\t')
|
|
36
36
|
#gene_types = types.split(',') - Temporary fix
|
|
37
37
|
#if any(gene_type == line[2] for gene_type in gene_types) and len(line) == 9: # line[2] for normalrun
|
|
@@ -68,7 +68,7 @@ def GFF(*args):
|
|
|
68
68
|
stopCodon = genome[stop - 3:stop]
|
|
69
69
|
po = str(start) + ',' + str(stop)
|
|
70
70
|
orf = [strand, startCodon, stopCodon, line[2], 'GFF-Standard'] # This needs to detect the type
|
|
71
|
-
GFF_ORFs.update({po: orf})
|
|
71
|
+
GFF_ORFs[dna_region].update({po: orf})
|
|
72
72
|
# elif "CDS" in line[2]:
|
|
73
73
|
# sys.exit("SAS")
|
|
74
74
|
|
|
@@ -13,8 +13,8 @@ def StORF_Reporter(*args):
|
|
|
13
13
|
dna_regions = args[1]
|
|
14
14
|
if not dna_regions: # This triggers if dna_regions is an empty dict (GFF_Intersect passed nothing)
|
|
15
15
|
dna_regions = collections.OrderedDict()
|
|
16
|
-
with open(tool_pred, 'r') as
|
|
17
|
-
for line in
|
|
16
|
+
with open(tool_pred, 'r') as StORF_Reporter_input:
|
|
17
|
+
for line in StORF_Reporter_input:
|
|
18
18
|
line = line.split()
|
|
19
19
|
if 'StORF-Reporter' in line[1] or 'StoRF_Reporter' in line[1] or 'StORF' in line[1] or 'StORF-Reporter' in line[1] and line[0] not in dna_regions:
|
|
20
20
|
dna_regions[line[0]] = [] # Placeholder for genome sequence
|
ORForise/utils.py
CHANGED
|
@@ -4,8 +4,8 @@ import collections
|
|
|
4
4
|
# Constants
|
|
5
5
|
SHORT_ORF_LENGTH = 300
|
|
6
6
|
MIN_COVERAGE = 75
|
|
7
|
-
ORForise_Version = 'v1.6.
|
|
8
|
-
|
|
7
|
+
ORForise_Version = 'v1.6.4'
|
|
8
|
+
CLOSING=("\n####\nThank you for using ORForise\nPlease report any issues to: https://github.com/NickJD/ORForise/issues\n"
|
|
9
9
|
"Please Cite: https://doi.org/10.1093/bioinformatics/btab827\n"
|
|
10
10
|
"#####")
|
|
11
11
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: ORForise
|
|
3
|
-
Version: 1.6.
|
|
3
|
+
Version: 1.6.4
|
|
4
4
|
Summary: ORForise - A platform for analysing and comparing genome annotations.
|
|
5
5
|
Author-email: Nicholas Dimonaco <nicholas@dimonaco.co.uk>
|
|
6
6
|
License: GNU GENERAL PUBLIC LICENSE
|
|
@@ -662,7 +662,7 @@ Example output files from ```Annotation-Compare```, ```Aggregate-Compare```, ```
|
|
|
662
662
|
For Help: ```Annotation-Compare -h ```
|
|
663
663
|
|
|
664
664
|
```python
|
|
665
|
-
ORForise v1.6.
|
|
665
|
+
ORForise v1.6.4: Annotatione-Compare Run Parameters.
|
|
666
666
|
|
|
667
667
|
Required Arguments:
|
|
668
668
|
-dna GENOME_DNA Genome DNA file (.fa) which both annotations are based on
|
|
@@ -730,7 +730,7 @@ ORForise can be used as the example below.
|
|
|
730
730
|
For Help: ```Aggregate-Compare -h ```
|
|
731
731
|
|
|
732
732
|
```python
|
|
733
|
-
ORForise v1.6.
|
|
733
|
+
ORForise v1.6.4: Aggregate-Compare Run Parameters.
|
|
734
734
|
|
|
735
735
|
Required Arguments:
|
|
736
736
|
-dna GENOME_DNA Genome DNA file (.fa) which both annotations are based on
|
|
@@ -811,7 +811,7 @@ GFF-Adder combines two existing annotations (GFF or other tool formats).
|
|
|
811
811
|
For Help: ```GFF-Adder -h ```
|
|
812
812
|
|
|
813
813
|
```python
|
|
814
|
-
ORForise v1.6.
|
|
814
|
+
ORForise v1.6.4: GFF-Adder Run Parameters.
|
|
815
815
|
|
|
816
816
|
Required Arguments:
|
|
817
817
|
-dna GENOME_DNA Genome DNA file (.fa) which both annotations are based on
|
|
@@ -868,7 +868,7 @@ usage: Annotation_Intersector.py [-h] -ref REFERENCE_ANNOTATION -at
|
|
|
868
868
|
[-cov COVERAGE] [--report-discordance]
|
|
869
869
|
[--report-discordance-file REPORT_DISCORDANCE_FILE]
|
|
870
870
|
|
|
871
|
-
ORForise v1.6.
|
|
871
|
+
ORForise v1.6.4: Annotation-Intersector Run Parameters
|
|
872
872
|
|
|
873
873
|
options:
|
|
874
874
|
-h, --help show this help message and exit
|
|
@@ -947,7 +947,7 @@ Please report any issues to: https://github.com/NickJD/ORForise/issues
|
|
|
947
947
|
#####
|
|
948
948
|
usage: Convert_To_GFF.py [-h] [-dna GENOME_DNA] -i INPUT_ANNOTATION -fmt FORMAT -o OUTPUT_DIR [-gi GENE_IDENT] [--verbose]
|
|
949
949
|
|
|
950
|
-
ORForise v1.6.
|
|
950
|
+
ORForise v1.6.4: Convert-To-GFF Run Parameters
|
|
951
951
|
|
|
952
952
|
Required Arguments:
|
|
953
953
|
-dna GENOME_DNA Genome DNA file (.fa)
|
|
@@ -1026,7 +1026,7 @@ Defaults options were used.
|
|
|
1026
1026
|
**MetaGeneMark - Version '2020'** - http://exon.gatech.edu/meta_gmhmmp.cgi
|
|
1027
1027
|
GFF was chosen as output type.
|
|
1028
1028
|
|
|
1029
|
-
**Prodigal - Version 2.6.3** - https://github.com/hyattpd/Prodigal
|
|
1029
|
+
**Prodigal (Includes Pyrodigal) - Version 2.6.3** - https://github.com/hyattpd/Prodigal
|
|
1030
1030
|
GFF was chosen as output type.
|
|
1031
1031
|
|
|
1032
1032
|
**TransDecoder - Version 5.5.0** - https://github.com/TransDecoder/TransDecoder/wiki
|
|
@@ -1,13 +1,13 @@
|
|
|
1
|
-
ORForise/Aggregate_Compare.py,sha256=
|
|
2
|
-
ORForise/Annotation_Compare.py,sha256=
|
|
3
|
-
ORForise/Annotation_Intersector.py,sha256=
|
|
1
|
+
ORForise/Aggregate_Compare.py,sha256=AzGOfuQLt4haw4rdCwIEag5Y7hnXHLLApkTa6_j99-A,22887
|
|
2
|
+
ORForise/Annotation_Compare.py,sha256=7_LwWKDKZHBrhUWODxTJgd-tppaA4k5IvNuX4bU8_2Q,18571
|
|
3
|
+
ORForise/Annotation_Intersector.py,sha256=7VH7iHk4m1c08AeKf9vGEYuAecsywfC4AQHUlIbgQKQ,35856
|
|
4
4
|
ORForise/Comparator.py,sha256=59VfUS8d19Xa83o1AsCuowDhhe-iNr5wO4FutDpoQRs,48078
|
|
5
|
-
ORForise/Convert_To_GFF.py,sha256=
|
|
6
|
-
ORForise/GFF_Adder.py,sha256=
|
|
7
|
-
ORForise/List_Tools.py,sha256=
|
|
8
|
-
ORForise/StORForise.py,sha256=
|
|
5
|
+
ORForise/Convert_To_GFF.py,sha256=zkpO3vpLxA7EpKe1X1i-_IPbcU3lbwLCsh30mmeuZkI,6030
|
|
6
|
+
ORForise/GFF_Adder.py,sha256=PuOZl4TUN9SbMjGhkuF92UDePAnx0NdVAuWFRxR61XA,28670
|
|
7
|
+
ORForise/List_Tools.py,sha256=OZadIWAP0HJ_JYlTDqWw_EA8Mkew-26_cKOkRE4i7ro,1618
|
|
8
|
+
ORForise/StORForise.py,sha256=yRZtKXKcmevxZ_2asesYdkl-qen3MmOn9_r0vb0927I,5772
|
|
9
9
|
ORForise/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
10
|
-
ORForise/utils.py,sha256=
|
|
10
|
+
ORForise/utils.py,sha256=QdXT0XkEIjMbu4ef2HDwAKa_19m8oeu4QV8oLll5gpk,15759
|
|
11
11
|
ORForise/Aux/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
12
12
|
ORForise/Aux/StORF_Undetected/StORF_Undetected.py,sha256=B7f9AxXD6j2ip4QtuOi7pwtfBCxkexE0XiDCJrKSX5U,1318
|
|
13
13
|
ORForise/Aux/StORF_Undetected/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -37,7 +37,7 @@ ORForise/Tools/FGENESB/FGENESB.py,sha256=E6vghsstDeYgmT1lT4DL2M7wreYeXx2s-N-scCu
|
|
|
37
37
|
ORForise/Tools/FGENESB/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
38
38
|
ORForise/Tools/FragGeneScan/FragGeneScan.py,sha256=koPby-VZZ7X6RA1OBc3-yr9axGlm82LVbFZJ-kyx1Kw,2365
|
|
39
39
|
ORForise/Tools/FragGeneScan/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
40
|
-
ORForise/Tools/GFF/GFF.py,sha256=
|
|
40
|
+
ORForise/Tools/GFF/GFF.py,sha256=Q2BFzfJAqL7QYRhTsjPXAK0vyGtNAtIdPATVW6jMwW8,3784
|
|
41
41
|
ORForise/Tools/GFF/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
42
42
|
ORForise/Tools/GLIMMER3/GLIMMER3.py,sha256=EddNu6M1NrWDbWjvByM9gvStuvWoD5lq5jz0M27oro8,2686
|
|
43
43
|
ORForise/Tools/GLIMMER3/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -61,13 +61,13 @@ ORForise/Tools/Prodigal/Prodigal.py,sha256=1KVkTL3gHp8iSNFt_CvPnLZUr66x1AfE0ZKxo
|
|
|
61
61
|
ORForise/Tools/Prodigal/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
62
62
|
ORForise/Tools/Prokka/Prokka.py,sha256=Kcl1ocVj6hPOfEEwf8bBAWhzWX_XAe55kwNUeM8EUKg,2468
|
|
63
63
|
ORForise/Tools/Prokka/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
64
|
-
ORForise/Tools/StORF-Reporter/StORF-Reporter.py,sha256=
|
|
64
|
+
ORForise/Tools/StORF-Reporter/StORF-Reporter.py,sha256=BQpFfpXtcNC4C_P4Bk5IZZ9__Xy2VNcbh7zzSDnrNOE,2647
|
|
65
65
|
ORForise/Tools/StORF-Reporter/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
66
66
|
ORForise/Tools/TransDecoder/TransDecoder.py,sha256=l9y4OFxhSdPRBhUprs0yt2fxtSwyNCOv7oKO-aTvpDk,2381
|
|
67
67
|
ORForise/Tools/TransDecoder/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
68
|
-
orforise-1.6.
|
|
69
|
-
orforise-1.6.
|
|
70
|
-
orforise-1.6.
|
|
71
|
-
orforise-1.6.
|
|
72
|
-
orforise-1.6.
|
|
73
|
-
orforise-1.6.
|
|
68
|
+
orforise-1.6.4.dist-info/licenses/LICENSE,sha256=eAL1bBUjSMCdvudcn9E3sbujCBCa839cqXxauONDbSU,32476
|
|
69
|
+
orforise-1.6.4.dist-info/METADATA,sha256=cJbN2ekkUs5mP8izYLMqxv8r4awotKf6DtVQNDvuPFo,59575
|
|
70
|
+
orforise-1.6.4.dist-info/WHEEL,sha256=qELbo2s1Yzl39ZmrAibXA2jjPLUYfnVhUNTlyF1rq0Y,92
|
|
71
|
+
orforise-1.6.4.dist-info/entry_points.txt,sha256=_HaBzKQFXCkxHIIgBH_XIOng92-GWJ5FC29LmNaSpR0,670
|
|
72
|
+
orforise-1.6.4.dist-info/top_level.txt,sha256=7kmFicUFY65FJmioc0cpZtXVz93V7KSKvZVWpGz5Hyk,9
|
|
73
|
+
orforise-1.6.4.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|