ORForise 1.5.0__py3-none-any.whl → 1.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,22 +1,35 @@
1
1
  from importlib import import_module
2
2
  import argparse
3
- import sys,os
4
- import gzip,csv
3
+ import sys, os
4
+ import gzip, csv
5
+ import logging
6
+ from datetime import datetime
5
7
 
6
- try:
7
- from Comparator import tool_comparison
8
- except ImportError:
9
- from .Comparator import tool_comparison
10
8
 
11
9
  try:
12
10
  from utils import *
11
+ from Comparator import tool_comparison
13
12
  except ImportError:
13
+ from .Comparator import tool_comparison
14
14
  from ORForise.utils import *
15
15
 
16
+
17
+
18
+
19
+ ##########################
20
+
21
+ # Consolidate printing and logging into a single block
22
+ def _pct(n, total):
23
+ try:
24
+ return format(100 * n / total, '.2f') + '%'
25
+ except Exception:
26
+ return 'N/A'
27
+
16
28
  ##########################
17
29
 
18
30
  def comparator(options):
19
31
 
32
+
20
33
  try:
21
34
  try: # Detect whether fasta/gff files are .gz or text and read accordingly
22
35
  fasta_in = gzip.open(options.genome_dna, 'rt')
@@ -77,36 +90,46 @@ def comparator(options):
77
90
  'Contig\tGenes\tORFs\tPerfect_Matches\tPartial_Matches\tMissed_Genes\tUnmatched_ORFs\tMulti_Matched_ORFs\n')
78
91
 
79
92
  for dna_region, result in results.items():
80
- num_current_genes = len(dna_regions[dna_region][2])
81
- num_orfs = result['pred_metrics']['Number_of_ORFs']
82
- num_perfect = result['pred_metrics']['Number_of_Perfect_Matches']
83
- num_partial = len(result['pred_metrics']['partial_Hits'])
84
- num_missed = len(result['rep_metrics']['genes_Undetected'])
85
- num_unmatched = len(result['pred_metrics']['unmatched_ORFs'])
86
- num_multi = len(result['pred_metrics']['multi_Matched_ORFs'])
87
- # Collect summary for this contig
88
- if options.outdir:
89
- contig_summaries.append([
90
- dna_region, num_current_genes, num_orfs, num_perfect, num_partial, num_missed, num_unmatched, num_multi
91
- ])
92
- ###
93
- num_current_genes = len(dna_regions[dna_region][2])
94
- print("These are the results for: " + dna_region + '\n')
95
- ############################################# To get default output filename from input file details
96
- genome_name = options.reference_annotation.split('/')[-1].split('.')[0]
97
- rep_metric_description, rep_metrics = get_rep_metrics(result)
98
- all_metric_description, all_metrics = get_all_metrics(result)
99
-
100
- print('Current Contig: ' + str(dna_region))
101
- print('Number of Genes: ' + str(num_current_genes))
102
- print('Number of ORFs: ' + str(result['pred_metrics']['Number_of_ORFs']))
103
- print('Perfect Matches: ' + str(result['pred_metrics']['Number_of_Perfect_Matches']) + ' [' + str(num_current_genes)+ '] - '+ format(100 * result['pred_metrics']['Number_of_Perfect_Matches']/num_current_genes,'.2f')+'%')
104
- print('Partial Matches: ' + str(len(result['pred_metrics']['partial_Hits'])) + ' [' + str(num_current_genes)+ '] - '+ format(100 * len(result['pred_metrics']['partial_Hits'])/num_current_genes,'.2f')+'%')
105
- print('Missed Genes: ' + str(len(result['rep_metrics']['genes_Undetected'])) + ' [' + str(num_current_genes)+ '] - '+ format(100 * len(result['rep_metrics']['genes_Undetected'])/num_current_genes,'.2f')+'%')
106
- print('Unmatched ORFs: ' + str(len(result['pred_metrics']['unmatched_ORFs'])) + ' [' + str(num_current_genes)+ '] - '+ format(100 * len(result['pred_metrics']['unmatched_ORFs'])/num_current_genes,'.2f')+'%')
107
- print('Multi-matched ORFs: ' + str(len(result['pred_metrics']['multi_Matched_ORFs'])) + ' [' + str(num_current_genes)+ '] - '+ format(100 * len(result['pred_metrics']['multi_Matched_ORFs'])/num_current_genes,'.2f')+'%')
108
-
109
- if options.outdir:
93
+ if result:
94
+ num_current_genes = len(dna_regions[dna_region][2])
95
+ num_orfs = result['pred_metrics']['Number_of_ORFs']
96
+ num_perfect = result['pred_metrics']['Number_of_Perfect_Matches']
97
+ num_partial = len(result['pred_metrics']['partial_Hits'])
98
+ num_missed = len(result['rep_metrics']['genes_Undetected'])
99
+ num_unmatched = len(result['pred_metrics']['unmatched_ORFs'])
100
+ num_multi = len(result['pred_metrics']['multi_Matched_ORFs'])
101
+ # Collect summary for this contig
102
+ contig_summaries.append([dna_region, num_current_genes, num_orfs, num_perfect, num_partial, num_missed, num_unmatched, num_multi])
103
+ num_current_genes = len(dna_regions[dna_region][2])
104
+ genome_name = options.reference_annotation.split('/')[-1].split('.')[0]
105
+ rep_metric_description, rep_metrics = get_rep_metrics(result)
106
+ all_metric_description, all_metrics = get_all_metrics(result)
107
+
108
+ # Safely extract metric values
109
+ num_orfs = result.get('pred_metrics', {}).get('Number_of_ORFs') if isinstance(result, dict) else 'N/A'
110
+ perfect = result.get('pred_metrics', {}).get('Number_of_Perfect_Matches') if isinstance(result, dict) else 0
111
+ partial = len(result.get('pred_metrics', {}).get('partial_Hits', [])) if isinstance(result, dict) else 'N/A'
112
+ missed = len(result.get('rep_metrics', {}).get('genes_Undetected', [])) if isinstance(result, dict) else 'N/A'
113
+ unmatched = len(result.get('pred_metrics', {}).get('unmatched_ORFs', [])) if isinstance(result, dict) else 'N/A'
114
+ multi = len(result.get('pred_metrics', {}).get('multi_Matched_ORFs', [])) if isinstance(result, dict) else 'N/A'
115
+
116
+ lines = [
117
+ f"These are the results for: {dna_region}",
118
+ f"Current Contig: {dna_region}",
119
+ f"Number of Genes: {num_current_genes}",
120
+ f"Number of ORFs: {num_orfs}",
121
+ f"Perfect Matches: {perfect} [{num_current_genes}] - {_pct(perfect, num_current_genes) if isinstance(num_current_genes, (int, float)) else 'N/A'}",
122
+ f"Partial Matches: {partial} [{num_current_genes}] - {_pct(partial, num_current_genes) if isinstance(num_current_genes, (int, float)) else 'N/A'}",
123
+ f"Missed Genes: {missed} [{num_current_genes}] - {_pct(missed, num_current_genes) if isinstance(num_current_genes, (int, float)) else 'N/A'}",
124
+ f"Unmatched ORFs: {unmatched} [{num_current_genes}] - {_pct(unmatched, num_current_genes) if isinstance(num_current_genes, (int, float)) else 'N/A'}",
125
+ f"Multi-matched ORFs: {multi} [{num_current_genes}] - {_pct(multi, num_current_genes) if isinstance(num_current_genes, (int, float)) else 'N/A'}"
126
+ ]
127
+
128
+ full_msg = '\n'.join(lines) + '\n'
129
+ if options.verbose:
130
+ print(full_msg)
131
+ options.output_logger.info(full_msg)
132
+
110
133
  # Prepare output directory and file names for each contig
111
134
  contig_save = dna_region.replace('/', '_').replace('\\', '_')
112
135
  contig_dir = os.path.join(options.outdir, contig_save)
@@ -156,24 +179,6 @@ def comparator(options):
156
179
  tool_out.writerow([''.join(map(str, result['pred_metrics']['orf_Coverage_Genome']))])
157
180
  tool_out.writerow(['Matched_Predicted_CDS_Coverage_of_Genome'])
158
181
  tool_out.writerow([''.join(map(str, result['pred_metrics']['matched_ORF_Coverage_Genome']))])
159
- # tool_out.writerow(['Start_Position_Difference:'])
160
- # tool_out.writerow(result.get('start_Difference', []))
161
- # tool_out.writerow(['Stop_Position_Difference:'])
162
- # tool_out.writerow(result.get('stop_Difference', []))
163
- # tool_out.writerow(['Alternative_Starts_Predicted:'])
164
- # tool_out.writerow(result.get('other_Starts', []))
165
- # tool_out.writerow(['Alternative_Stops_Predicted:'])
166
- # tool_out.writerow(result.get('other_Stops', []))
167
- # tool_out.writerow(['Undetected_Gene_Metrics:'])
168
- # tool_out.writerow([
169
- # 'ATG_Start,GTG_Start,TTG_Start,ATT_Start,CTG_Start,Alternative_Start_Codon,TGA_Stop,TAA_Stop,TAG_Stop,Alternative_Stop_Codon,Median_Length,ORFs_on_Positive_Strand,ORFs_on_Negative_Strand'
170
- # ])
171
- # tool_out.writerow(result.get('undetected_Gene_Metrics', []))
172
- # tool_out.writerow(['\nPredicted_CDSs_Without_Corresponding_Gene_In_Reference_Metrics:'])
173
- # tool_out.writerow([
174
- # 'ATG_Start,GTG_Start,TTG_Start,ATT_Start,CTG_Start,Alternative_Start_Codon,TGA_Stop,TAA_Stop,TAG_Stop,Alternative_Stop_Codon,Median_Length,ORFs_on_Positive_Strand,ORFs_on_Negative_Strand'
175
- # ])
176
- # tool_out.writerow(result.get('unmatched_ORF_Metrics', []))
177
182
 
178
183
  # Write perfect matches to FASTA
179
184
  with open(perfect_fasta, 'w', encoding='utf-8') as f:
@@ -210,6 +215,11 @@ def comparator(options):
210
215
  key_parts = key.split(',')
211
216
  multi = f">Predicted_CDS:{key_parts[0]}-{key_parts[1]}_Genes:{'|'.join(value)}"
212
217
  f.write(f"{multi}\n")
218
+ else:
219
+ if options.verbose:
220
+ print(f"No results to process for dna region - " + str(dna_region))
221
+ options.output_logger.info(f"No results to process for dna region - " + str(dna_region))
222
+
213
223
 
214
224
  # After all contigs, append the summary table to the main summary file
215
225
  if options.outdir and contig_summaries:
@@ -227,34 +237,27 @@ def comparator(options):
227
237
  out_file.write('\nOverall Summary:\n')
228
238
  out_file.write(f'Number of Genes: {total_genes}\n')
229
239
  out_file.write(f'Number of ORFs: {total_orfs}\n')
230
- out_file.write(
231
- f'Perfect Matches: {total_perfect} [{total_genes}] - {format(100 * total_perfect / total_genes, ".2f")}%\n')
232
- out_file.write(
233
- f'Partial Matches: {total_partial} [{total_genes}] - {format(100 * total_partial / total_genes, ".2f")}%\n')
234
- out_file.write(
235
- f'Missed Genes: {total_missed} [{total_genes}] - {format(100 * total_missed / total_genes, ".2f")}%\n')
236
- out_file.write(
237
- f'Unmatched ORFs: {total_unmatched} [{total_genes}] - {format(100 * total_unmatched / total_genes, ".2f")}%\n')
238
- out_file.write(
239
- f'Multi-matched ORFs: {total_multi} [{total_genes}] - {format(100 * total_multi / total_genes, ".2f")}%\n')
240
-
241
- # Print combined metrics to stdout
242
- print("\nCombined metrics for all contigs:")
243
-
244
- print(f'Number of Genes: {total_genes}')
245
- print(f'Number of ORFs: {total_orfs}')
246
- print(
247
- f'Perfect Matches: {total_perfect} [{total_genes}] - {format(100 * total_perfect / total_genes, ".2f")}%')
248
- print(
249
- f'Partial Matches: {total_partial} [{total_genes}] - {format(100 * total_partial / total_genes, ".2f")}%')
250
- print(f'Missed Genes: {total_missed} [{total_genes}] - {format(100 * total_missed / total_genes, ".2f")}%')
251
- print(
252
- f'Unmatched ORFs: {total_unmatched} [{total_genes}] - {format(100 * total_unmatched / total_genes, ".2f")}%')
253
- print(
254
- f'Multi-matched ORFs: {total_multi} [{total_genes}] - {format(100 * total_multi / total_genes, ".2f")}%')
255
-
256
-
257
-
240
+ out_file.write(f'Perfect Matches: {total_perfect} [{total_genes}] - {100 * total_perfect / total_genes:.2f}%\n')
241
+ out_file.write(f'Partial Matches: {total_partial} [{total_genes}] - {100 * total_partial / total_genes:.2f}%\n')
242
+ out_file.write(f'Missed Genes: {total_missed} [{total_genes}] - {100 * total_missed / total_genes:.2f}%\n')
243
+ out_file.write(f'Unmatched ORFs: {total_unmatched} [{total_genes}] - {100 * total_unmatched / total_genes:.2f}%\n')
244
+ out_file.write(f'Multi-matched ORFs: {total_multi} [{total_genes}] - {100 * total_multi / total_genes:.2f}%\n')
245
+
246
+ lines = [
247
+ f"Combined metrics for all contigs:",
248
+ f"Number of Genes: {total_genes}",
249
+ f"Number of ORFs: {total_orfs}",
250
+ f"Perfect Matches: {total_perfect} [{total_genes}] - {100 * total_perfect / total_genes:.2f}%",
251
+ f"Partial Matches: {total_partial} [{total_genes}] - {100 * total_partial / total_genes:.2f}%",
252
+ f"Missed Genes: {total_missed} [{total_genes}] - {100 * total_missed / total_genes:.2f}%",
253
+ f"Unmatched ORFs: {total_unmatched} [{total_genes}] - {100 * total_unmatched / total_genes:.2f}%",
254
+ f"Multi-matched ORFs: {total_multi} [{total_genes}] - {100 * total_multi / total_genes:.2f}%"
255
+ ]
256
+
257
+ full_msg = '\n'.join(lines) + '\n'
258
+ if options.verbose:
259
+ print(full_msg)
260
+ options.output_logger.info(full_msg)
258
261
 
259
262
 
260
263
  def main():
@@ -282,21 +285,35 @@ def main():
282
285
  '- Provide tool name to compare output from two tools')
283
286
 
284
287
  output = parser.add_argument_group('Output')
285
- output.add_argument('-o', dest='outdir', required=False,
286
- help='Define directory where detailed output should be places - If not provided, summary will be printed to std-out')
288
+ output.add_argument('-o', dest='outdir', required=True,
289
+ help='Define directory where detailed output should be places')
287
290
  output.add_argument('-n', dest='outname', required=False,
288
- help='Define output file name - Mandatory is -o is provided: <outname>_<contig_id>_ORF_Comparison.csv')
291
+ help='Define output filename(s) prefix - If not provided, filename of reference '
292
+ 'annotation file will be used- <outname>_<contig_id>_ORF_Comparison.csv')
289
293
 
290
294
  misc = parser.add_argument_group('Misc')
291
295
  misc.add_argument('-v', dest='verbose', default='False', type=eval, choices=[True, False],
292
296
  help='Default - False: Print out runtime status')
293
297
  options = parser.parse_args()
294
298
 
295
- if options.outdir and not options.outname:
296
- sys.exit("Error: If -o (outdir) is provided, you must also provide -n (outname).")
299
+ options.outname = options.outname if options.outname else options.reference_annotation.split('/')[-1].split('.')[0]
300
+
301
+ # Initialise loggers once and store on options
302
+ if not getattr(options, 'logger_initialized', False):
303
+ os.makedirs(options.outdir, exist_ok=True)
304
+ output_log = os.path.join(options.outdir, f"ORForise_{options.outname}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log")
305
+ logger = logging.getLogger('ORForise.output')
306
+ logger.setLevel(logging.INFO)
307
+ fh_out = logging.FileHandler(output_log, encoding='utf-8')
308
+ fh_out.setFormatter(logging.Formatter('%(message)s'))
309
+ logger.addHandler(fh_out)
310
+
311
+ options.output_logger = logger
312
+ options.logger_initialized = True
313
+
297
314
 
298
315
  comparator(options)
299
316
 
300
317
  if __name__ == "__main__":
301
318
  main()
302
- print("Complete")
319
+ print("Complete")
ORForise/Comparator.py CHANGED
@@ -206,33 +206,53 @@ def start_Codon_Count(orfs):
206
206
  else:
207
207
  other += 1
208
208
  other_Starts.append(codon)
209
- atg_P = format(100 * atg / len(orfs), '.2f')
210
- gtg_P = format(100 * gtg / len(orfs), '.2f')
211
- ttg_P = format(100 * ttg / len(orfs), '.2f')
212
- att_P = format(100 * att / len(orfs), '.2f')
213
- ctg_P = format(100 * ctg / len(orfs), '.2f')
214
- other_Start_P = format(100 * other / len(orfs), '.2f')
215
- return atg_P, gtg_P, ttg_P, att_P, ctg_P, other_Start_P, other_Starts
216
209
 
210
+ total = len(orfs) if orfs is not None else 0
211
+
212
+ if total:
213
+ atg_P = format(100 * atg / len(orfs), '.2f')
214
+ gtg_P = format(100 * gtg / len(orfs), '.2f')
215
+ ttg_P = format(100 * ttg / len(orfs), '.2f')
216
+ att_P = format(100 * att / len(orfs), '.2f')
217
+ ctg_P = format(100 * ctg / len(orfs), '.2f')
218
+ other_Start_P = format(100 * other / len(orfs), '.2f')
219
+ else:
220
+ atg_P = ttg_P = gtg_P = ctg_P = att_P = other_Start_P = format(0, '.2f')
221
+
222
+ return {
223
+ 'ATG': (atg, atg_P),
224
+ 'TTG': (ttg, ttg_P),
225
+ 'GTG': (gtg, gtg_P),
226
+ 'CTG': (ctg, ctg_P),
227
+ 'ATT': (att, att_P),
228
+ 'Other': (other, other_Start_P),
229
+ 'total': total
230
+ }
217
231
 
218
232
  def stop_Codon_Count(orfs):
219
233
  tag, taa, tga, other = 0, 0, 0, 0
220
234
  other_Stops = []
221
- for orf in orfs.values():
222
- codon = orf[2]
223
- if codon == 'TAG':
224
- tag += 1
225
- elif codon == 'TAA':
226
- taa += 1
227
- elif codon == 'TGA':
228
- tga += 1
229
- else:
230
- other += 1
231
- other_Stops.append(codon)
232
- tag_p = format(100 * tag / len(orfs), '.2f')
233
- taa_p = format(100 * taa / len(orfs), '.2f')
234
- tga_p = format(100 * tga / len(orfs), '.2f')
235
- other_Stop_P = format(100 * other / len(orfs), '.2f')
235
+
236
+ total = len(orfs) if orfs else 0
237
+ if total:
238
+ for orf in orfs.values():
239
+ codon = orf[2]
240
+ if codon == 'TAG':
241
+ tag += 1
242
+ elif codon == 'TAA':
243
+ taa += 1
244
+ elif codon == 'TGA':
245
+ tga += 1
246
+ else:
247
+ other += 1
248
+ other_Stops.append(codon)
249
+ tag_p = format(100 * tag / len(orfs), '.2f')
250
+ taa_p = format(100 * taa / len(orfs), '.2f')
251
+ tga_p = format(100 * tga / len(orfs), '.2f')
252
+ other_Stop_P = format(100 * other / len(orfs), '.2f')
253
+ else:
254
+ tag_p = taa_p = tga_p = other_Stop_P = format(0, '.2f')
255
+
236
256
  return tag_p, taa_p, tga_p, other_Stop_P, other_Stops
237
257
 
238
258
 
@@ -260,8 +280,8 @@ def candidate_ORF_Selection(gene_Set,
260
280
  if len(current_ORF_Difference) > len(candidate_ORF_Difference):
261
281
  pos = c_Pos
262
282
  orf_Details = c_ORF_Details
263
- else:
264
- print("Match filtered out")
283
+ #else:
284
+ #("Match filtered out")
265
285
  return pos, orf_Details
266
286
 
267
287
 
@@ -300,6 +320,11 @@ def tool_comparison(all_orfs, dna_regions, verbose):
300
320
 
301
321
  ref_genes_list = dna_regions[dna_region][2]
302
322
  ref_genes = collections.OrderedDict()
323
+
324
+ if not ref_genes_list:
325
+ results[dna_region] = {}
326
+ continue
327
+
303
328
  for d in ref_genes_list:
304
329
  ref_genes.update(d)
305
330
  comp.genome_Seq = dna_regions[dna_region][0]
@@ -311,6 +336,10 @@ def tool_comparison(all_orfs, dna_regions, verbose):
311
336
 
312
337
  better_pos_orfs_items = [[(int(pos.split(',')[0]), int(pos.split(',')[1])), orf_Details] for pos, orf_Details in current_orfs.items()] #TODO: turn pos into tuple instead of string everywhere
313
338
 
339
+ if not current_orfs or not better_pos_orfs_items:
340
+ results[dna_region] = {}
341
+ continue
342
+
314
343
  for gene_num, gene_details in ref_genes.items(): # Loop through each gene to compare against predicted ORFs
315
344
  g_Start = int(gene_details[0])
316
345
  g_Stop = int(gene_details[1])
@@ -477,10 +506,13 @@ def tool_comparison(all_orfs, dna_regions, verbose):
477
506
  comp.gene_Pos_Olap.append(0)
478
507
  elif '-' in g_Strand:
479
508
  comp.gene_Neg_Olap.append(0)
480
- ####
481
- min_Gene_Length = min(comp.gene_Lengths)
482
- max_Gene_Length = max(comp.gene_Lengths)
483
- median_Gene_Length = np.median(comp.gene_Lengths)
509
+ #### avoid ValueError
510
+ if comp.gene_Lengths:
511
+ min_Gene_Length = min(comp.gene_Lengths)
512
+ max_Gene_Length = max(comp.gene_Lengths)
513
+ median_Gene_Length = np.median(comp.gene_Lengths)
514
+ else:
515
+ min_Gene_Length = max_Gene_Length = min_Length_Difference = 0
484
516
  prev_ORF_Stop = 0
485
517
  prev_ORF_Overlapped = False
486
518
  for o_Positions, orf_Details in current_orfs.items():
@@ -0,0 +1,138 @@
1
+ import argparse
2
+ import logging
3
+ from datetime import datetime
4
+ import os
5
+ import sys
6
+
7
+ try:
8
+ from utils import *
9
+ from Tools.TabToGFF.TabToGFF import TabToGFF
10
+ except ImportError:
11
+ from ORForise.utils import *
12
+ from ORForise.Tools.TabToGFF.TabToGFF import TabToGFF
13
+
14
+
15
def setup_logging(outdir, verbose=False):
    """Configure the root logger for a Convert_To_GFF run.

    Handlers already attached to the root logger are removed and closed, so
    calling this function repeatedly neither duplicates output nor leaks the
    previous logfile handle. A stdout handler is always attached; a
    timestamped ``convert_to_gff_<YYYYmmdd_HHMMSS>.log`` file handler inside
    ``outdir`` is attached only when ``verbose`` is true.

    Args:
        outdir: Directory in which the logfile is created (verbose runs only).
        verbose: When true, log at DEBUG level and also write a logfile;
            otherwise log at INFO level to stdout only.

    Returns:
        The logfile path, or ``None`` when no logfile was created.
    """
    logger = logging.getLogger()
    level = logging.DEBUG if verbose else logging.INFO
    logger.setLevel(level)

    # Detach AND close existing handlers; merely rebinding ``logger.handlers``
    # would leave an earlier logfile open.
    for handler in list(logger.handlers):
        logger.removeHandler(handler)
        handler.close()

    fmt = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')

    logfile = None
    # Only create a file handler (and thus the logfile) when verbose is enabled
    if verbose:
        ts = datetime.now().strftime('%Y%m%d_%H%M%S')
        logfile = os.path.join(outdir, f'convert_to_gff_{ts}.log')
        fh = logging.FileHandler(logfile, encoding='utf-8')
        fh.setLevel(logging.DEBUG)
        fh.setFormatter(fmt)
        logger.addHandler(fh)

    # Always add a stdout handler
    sh = logging.StreamHandler(sys.stdout)
    sh.setLevel(level)
    sh.setFormatter(fmt)
    logger.addHandler(sh)
    return logfile
36
+
37
+
38
def write_gff(outpath, genome_ID, genome_DNA, input_annotation, fmt, features):
    """Write parsed annotation features to ``outpath`` as a GFF file.

    Args:
        outpath: Destination file path; overwritten if it already exists.
        genome_ID: Accepted for interface compatibility; not used in the output.
        genome_DNA: Path of the genome FASTA recorded in the header, or a
            falsy value to omit that header line.
        input_annotation: Path of the original annotation file, recorded in
            the header.
        fmt: Name of the input format; written to the GFF source column.
            Only ``'abricate'`` records can currently be written.
        features: Mapping of ``'start,stop'`` position strings to dicts
            holding the ``seqid``, ``strand``, ``accession``, ``database``,
            ``identity``, ``coverage``, ``product`` and ``resistance`` fields.

    Raises:
        ValueError: If there are features to write but ``fmt`` is not a
            supported format. Raised before the output file is touched.
    """
    # Reject unsupported formats up front: no record line can be built for
    # them, and failing here avoids leaving a header-only file behind.
    if features and fmt != 'abricate':
        raise ValueError(
            f"write_gff only supports the 'abricate' format, got {fmt!r}")

    with open(outpath, 'w', encoding='utf-8') as out:
        out.write('##gff-version\t3\n')
        out.write('#\tConvert_To_GFF\n')
        out.write('#\tRun Date: ' + str(datetime.now()) + '\n')
        # Only include genome DNA line if a path was provided
        if genome_DNA:
            out.write('##Genome DNA File:' + genome_DNA + '\n')
        out.write('##Original File: ' + input_annotation + '\n')

        for pos, data in features.items():
            coords = pos.split(',')
            start, stop = coords[0], coords[-1]
            # NOTE: the 'abricate_anotation' spelling is part of the emitted
            # attribute text and is kept unchanged for downstream consumers.
            info = (
                f"abricate_anotation;accession={data['accession']}"
                f";database={data['database']}"
                f";identity={data['identity']}"
                f";coverage={data['coverage']}"
                f";product={data['product']}"
                f";resistance={data['resistance']}"
            )
            out.write(
                f"{data['seqid']}\t{fmt}\tCDS\t{start}\t{stop}\t.\t"
                f"{data['strand']}\t.\tID={info}\n")
56
+
57
+
58
def load_genome(genome_fasta):
    """Read a FASTA file and return its identifier and sequence.

    Header lines (starting with ``>``) set the identifier to the first
    whitespace-delimited token with the leading ``>`` removed; every other
    non-blank line is appended to the sequence. If the file holds several
    records, their sequences are concatenated and the identifier of the last
    header is returned.

    Args:
        genome_fasta: Path to a plain-text FASTA file.

    Returns:
        A ``(genome_ID, genome_seq)`` tuple; ``genome_ID`` is ``'unknown'``
        when the file contains no header line.
    """
    genome_ID = 'unknown'
    seq_parts = []
    with open(genome_fasta, 'r') as fh:
        for raw in fh:
            # Strip all trailing whitespace so stray spaces/tabs and
            # whitespace-only lines never end up inside the sequence.
            line = raw.rstrip()
            if not line:
                continue
            if line.startswith('>'):
                genome_ID = line.split()[0].lstrip('>')
            else:
                seq_parts.append(line)
    # Join once instead of growing a string line by line.
    return genome_ID, ''.join(seq_parts)
71
+
72
+
73
def main():
    """Command-line entry point: convert a tabular annotation file to GFF.

    Parses the run parameters, makes sure the output directory exists,
    configures logging, optionally loads the genome FASTA, parses the
    annotation with ``TabToGFF`` and writes ``<input basename>.gff`` into the
    output directory.

    Exits with status 1 when a genome file is given but does not exist, or
    when the annotation cannot be parsed.
    """
    print("Thank you for using ORForise\nPlease report any issues to: https://github.com/NickJD/ORForise/issues\n#####")

    parser = argparse.ArgumentParser(description='ORForise ' + ORForise_Version + ': Convert-To-GFF Run Parameters')
    # Drop the last of argparse's built-in groups so the custom groups defined
    # below are used instead (relies on a private argparse attribute).
    parser._action_groups.pop()

    required = parser.add_argument_group('Required Arguments')
    # Make genome DNA optional: if not provided we operate without genome sequence
    required.add_argument('-dna', dest='genome_DNA', required=False, help='Genome DNA file (.fa)')
    required.add_argument('-i', dest='input_annotation', required=True, help='Input annotation file (tabular)')
    required.add_argument('-fmt', dest='format', required=True, help='Input format: blast, abricate, genemark')
    required.add_argument('-o', dest='output_dir', required=True, help='Output directory')

    optional = parser.add_argument_group('Optional Arguments')
    optional.add_argument('-gi', dest='gene_ident', default='CDS', required=False, help='Gene identifier types to extract (unused)')
    optional.add_argument('--verbose', dest='verbose', action='store_true', help='Verbose logging with logfile')

    options = parser.parse_args()

    # exist_ok avoids the check-then-create race of testing for the directory first.
    os.makedirs(options.output_dir, exist_ok=True)
    logfile = setup_logging(options.output_dir, verbose=options.verbose)
    logging.info('Starting Convert_To_GFF')
    # Log genome DNA only if provided
    if options.genome_DNA:
        logging.info('Genome DNA: %s', options.genome_DNA)
    else:
        logging.info('Genome DNA: (not provided)')
    logging.info('Input annotation: %s', options.input_annotation)
    logging.info('Format: %s', options.format)

    # If a genome fasta was provided, load it; otherwise proceed without genome sequence
    if options.genome_DNA:
        if not os.path.exists(options.genome_DNA):
            logging.error('Genome DNA file does not exist: %s', options.genome_DNA)
            sys.exit(1)
        genome_ID, genome_seq = load_genome(options.genome_DNA)
    else:
        # Derive a sensible genome_ID from the annotation filename and leave sequence empty
        genome_ID = os.path.splitext(os.path.basename(options.input_annotation))[0]
        genome_seq = ''

    try:
        # Build genome map expected by TabToGFF: mapping genome_ID -> tuple(sequence, ...)
        genome_map = {genome_ID: (genome_seq,)}
        features = TabToGFF(options.input_annotation, genome_map, options.gene_ident, fmt=options.format)
    except Exception:
        # Top-level boundary: record the traceback, then stop with a non-zero status.
        logging.exception('Error parsing input annotation')
        sys.exit(1)

    # features = sortORFs(features) - Not sorting for now to preserve original order
    # Replace the final extension (if any) with .gff, using the same
    # splitext rule as the genome_ID derivation above.
    stem = os.path.splitext(os.path.basename(options.input_annotation))[0]
    outgff = os.path.join(options.output_dir, stem + '.gff')
    # Pass the original genome path if provided, else pass None so headers adapt
    genome_DNA_path = options.genome_DNA or None
    write_gff(outgff, genome_ID, genome_DNA_path, options.input_annotation, options.format, features)
    logging.info('Wrote GFF to %s', outgff)
    # A logfile only exists for verbose runs; do not report "Logfile: None".
    if logfile:
        logging.info('Logfile: %s', logfile)
136
+
137
+ if __name__ == '__main__':
138
+ main()