ORForise 1.4.3__py3-none-any.whl → 1.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. ORForise/Aggregate_Compare.py +318 -133
  2. ORForise/Annotation_Compare.py +294 -125
  3. ORForise/Comparator.py +656 -576
  4. ORForise/ORForise_Analysis/genome_Metrics.py +51 -33
  5. ORForise/Tools/Augustus/Augustus.py +30 -23
  6. ORForise/Tools/Balrog/Balrog.py +31 -23
  7. ORForise/Tools/EasyGene/EasyGene.py +30 -22
  8. ORForise/Tools/FGENESB/FGENESB.py +32 -25
  9. ORForise/Tools/FragGeneScan/FragGeneScan.py +29 -22
  10. ORForise/Tools/GFF/GFF.py +51 -47
  11. ORForise/Tools/GLIMMER_3/GLIMMER_3.py +34 -27
  12. ORForise/Tools/GeneMark/GeneMark.py +46 -40
  13. ORForise/Tools/GeneMark_HA/GeneMark_HA.py +29 -22
  14. ORForise/Tools/GeneMark_HMM/GeneMark_HMM.py +29 -22
  15. ORForise/Tools/GeneMark_S/GeneMark_S.py +29 -22
  16. ORForise/Tools/GeneMark_S_2/GeneMark_S_2.py +29 -25
  17. ORForise/Tools/MetaGene/MetaGene.py +29 -22
  18. ORForise/Tools/MetaGeneAnnotator/MetaGeneAnnotator.py +30 -23
  19. ORForise/Tools/MetaGeneMark/MetaGeneMark.py +30 -23
  20. ORForise/Tools/Prodigal/Prodigal.py +30 -26
  21. ORForise/Tools/Prokka/Prokka.py +30 -25
  22. ORForise/Tools/StORF_Reporter/StORF_Reporter.py +33 -26
  23. ORForise/Tools/TransDecoder/TransDecoder.py +29 -22
  24. ORForise/utils.py +204 -2
  25. {orforise-1.4.3.dist-info → orforise-1.5.1.dist-info}/METADATA +7 -31
  26. {orforise-1.4.3.dist-info → orforise-1.5.1.dist-info}/RECORD +30 -30
  27. {orforise-1.4.3.dist-info → orforise-1.5.1.dist-info}/entry_points.txt +5 -0
  28. {orforise-1.4.3.dist-info → orforise-1.5.1.dist-info}/WHEEL +0 -0
  29. {orforise-1.4.3.dist-info → orforise-1.5.1.dist-info}/licenses/LICENSE +0 -0
  30. {orforise-1.4.3.dist-info → orforise-1.5.1.dist-info}/top_level.txt +0 -0
@@ -1,8 +1,7 @@
1
1
  from importlib import import_module
2
2
  import argparse
3
- import collections
4
- import csv
5
- import sys
3
+ import csv, os, gzip, sys
4
+
6
5
 
7
6
  try:
8
7
  from Comparator import tool_comparison
@@ -14,50 +13,29 @@ except ImportError:
14
13
  ############################################
15
14
 
16
15
  def comparator(options):
17
- genome_seq = ""
18
- with open(options.genome_DNA, 'r') as genome:
19
- for line in genome:
20
- line = line.replace("\n", "")
21
- if not line.startswith('>'):
22
- genome_seq += str(line)
23
- else:
24
- genome_ID = line.split()[0].replace('>','')
25
- ##############################################
26
- if not options.reference_tool: # IF using Ensembl for comparison
27
- ref_genes = collections.OrderedDict() # Order is important
28
- count = 0
29
- with open(options.reference_annotation, 'r') as genome_gff:
30
- for line in genome_gff:
31
- line = line.split('\t')
32
- try:
33
- if "CDS" in line[2] and len(line) == 9:
34
- start = int(line[3])
35
- stop = int(line[4])
36
- strand = line[6]
37
- gene_details = [start, stop, strand]
38
- ref_genes.update({count: gene_details})
39
- count += 1
40
- except IndexError:
41
- continue
42
- else: # IF using a tool as reference
16
+ try:
17
+ try: # Detect whether fasta/gff files are .gz or text and read accordingly
18
+ fasta_in = gzip.open(options.genome_dna, 'rt')
19
+ dna_regions = fasta_load(fasta_in)
20
+ except:
21
+ fasta_in = open(options.genome_dna, 'r', encoding='unicode_escape')
22
+ dna_regions = fasta_load(fasta_in)
43
23
  try:
44
- reference_tool_ = import_module('Tools.' + options.reference_tool + '.' + options.reference_tool,
45
- package='my_current_pkg')
46
- except ModuleNotFoundError:
47
- try:
48
- reference_tool_ = import_module('ORForise.Tools.' + options.reference_tool + '.' + options.reference_tool,
49
- package='my_current_pkg')
50
- except ModuleNotFoundError:
51
- sys.exit("Tool not available")
52
- reference_tool_ = getattr(reference_tool_, options.reference_tool)
53
- ############ Reformatting tool output for ref_genes
54
- ref_genes_tmp = reference_tool_(options.reference_annotation, genome_seq)
55
- ref_genes = collections.OrderedDict()
56
- for i, (pos, details) in enumerate(ref_genes_tmp.items()):
57
- pos = pos.split(',')
58
- ref_genes.update({i: [pos[0], pos[1], details[0]]})
24
+ gff_in = gzip.open(options.reference_annotation, 'rt')
25
+ dna_regions = gff_load(options, gff_in, dna_regions)
26
+ except:
27
+ gff_in = open(options.reference_annotation, 'r', encoding='unicode_escape')
28
+ dna_regions = gff_load(options, gff_in, dna_regions)
29
+ except AttributeError:
30
+ sys.exit("Attribute Error:\nStORF'ed GFF probably already exists - Must be deleted before running (-overwrite)")
31
+ except FileNotFoundError:
32
+ split_path = options.gff.split(os.sep)
33
+ sys.exit("Directory '" + split_path[-2] + "' missing fna/gff files")
34
+ ###############################################
35
+ total_ref_genes = sum(
36
+ len(v[2]) if isinstance(v[2], (list, tuple, set, dict, str)) else 1 for v in dna_regions.values())
59
37
  #############################################
60
- # Currently only one model type can be used. (--parameters)
38
+ # Collect predictions from tools
61
39
  aggregate_Predictions = collections.OrderedDict()
62
40
  aggregate_Tools = options.tools.split(',')
63
41
  for i, (tool) in enumerate(aggregate_Tools):
@@ -71,104 +49,306 @@ def comparator(options):
71
49
  except ModuleNotFoundError:
72
50
  sys.exit("Tool not available")
73
51
  tool_ = getattr(tool_, tool)
74
- orfs = tool_(tool_prediction, genome_seq)
75
- aggregate_Predictions.update(orfs)
52
+ ##
53
+ orfs = tool_(tool_prediction, dna_regions)
54
+ for current_contig in orfs:
55
+ if current_contig not in aggregate_Predictions:
56
+ aggregate_Predictions[current_contig] = {}
57
+ current_orfs = orfs[current_contig]
58
+ for key, value in current_orfs.items():
59
+ if key in aggregate_Predictions[current_contig]:
60
+ aggregate_Predictions[current_contig][key][-1] += '|' + tool
61
+ else:
62
+ aggregate_Predictions[current_contig][key] = value
76
63
 
77
- aggregate_Predictions = sortORFs(aggregate_Predictions)
78
- all_Metrics, all_rep_Metrics, start_precision, stop_precision, other_starts, other_stops, perfect_Matches, missed_genes, unmatched_orfs, undetected_gene_metrics, unmatched_orf_metrics, orf_Coverage_Genome, matched_ORF_Coverage_Genome, gene_coverage_genome, multi_Matched_ORFs, partial_Hits = tool_comparison(
79
- ref_genes, aggregate_Predictions, genome_seq, options.verbose)
64
+ aggregate_ORFs = {k: sortORFs(v) for k, v in aggregate_Predictions.items()}
65
+ results = tool_comparison(aggregate_ORFs, dna_regions, options.verbose)
66
+ ############## Printing to std-out and optional csv file
67
+ # Ensure the output directory exists
68
+ os.makedirs(options.outdir, exist_ok=True)
69
+ # Use outname as a directory, basename for files is output-outname
70
+ base_out = os.path.join(options.outdir, f"{os.path.basename(options.outname)}")
71
+
72
+ # Prepare to collect summary stats for all contigs
73
+ contig_summaries = []
80
74
  ############################################# To get default output filename from input file details
81
- genome_name = options.reference_annotation.split('/')[-1].split('.')[0]
82
- metric_description = list(all_Metrics.keys())
83
- metrics = list(all_Metrics.values())
84
- rep_metric_description = list(all_rep_Metrics.keys())
85
- rep_metrics = list(all_rep_Metrics.values())
86
- #################################
87
- print('Genome Used: ' + str(options.reference_annotation.split('/')[-1]))
88
- if options.reference_tool:
89
- print('Reference Tool Used: ' + str(options.reference_tool))
90
- else:
91
- print('Reference Used: ' + str(options.reference_annotation))
92
- print('Tools Compared: ' + str(options.tools))
93
- print('Perfect Matches:' + str(len(perfect_Matches)) + '[' + str(len(ref_genes))+ '] -'+ format(100 * len(perfect_Matches)/len(ref_genes),'.2f')+'%')
94
- print('Partial Matches:' + str(len(partial_Hits)) + '[' + str(len(ref_genes))+ '] - '+ format(100 * len(partial_Hits)/len(ref_genes),'.2f')+'%')
95
- print('Missed Genes:' + str(len(missed_genes)) + '[' + str(len(ref_genes))+ '] - '+ format(100 * len(missed_genes)/len(ref_genes),'.2f')+'%')
96
- if options.outname:
97
- with open(options.outname, 'w', newline='\n',
98
- encoding='utf-8') as out_file: # Clear write out of report
99
- tool_out = csv.writer(out_file, quoting=csv.QUOTE_NONE, escapechar=" ")
100
- tool_out.writerow(['Representative_Metrics:'])
101
- tool_out.writerow(rep_metric_description)
102
- tool_out.writerow(rep_metrics)
103
- tool_out.writerow(['All_Metrics:'])
104
- tool_out.writerow(metric_description)
105
- tool_out.writerow(metrics)
106
- tool_out.writerow(['Reference_CDS_Gene_Coverage_of_Genome'])
107
- tool_out.writerow([gene_coverage_genome])
108
- tool_out.writerow(['Predicted_CDS_Coverage_of_Genome'])
109
- tool_out.writerow([orf_Coverage_Genome])
110
- tool_out.writerow(['Matched_Predicted_CDS_Coverage_of_Genome'])
111
- tool_out.writerow([matched_ORF_Coverage_Genome])
112
- tool_out.writerow(['Start_Position_Difference:'])
113
- tool_out.writerow(start_precision)
114
- tool_out.writerow(['Stop_Position_Difference:'])
115
- tool_out.writerow(stop_precision)
116
- tool_out.writerow(['Alternative_Starts_Predicted:'])
117
- tool_out.writerow(other_starts)
118
- tool_out.writerow(['Alternative_Stops_Predicted:'])
119
- tool_out.writerow(other_stops)
120
- tool_out.writerow(['Undetected_Gene_Metrics:'])
121
- tool_out.writerow([
122
- 'ATG_Start,GTG_Start,TTG_Start,ATT_Start,CTG_Start,Alternative_Start_Codon,TGA_Stop,TAA_Stop,TAG_Stop,Alternative_Stop_Codon,Median_Length,ORFs_on_Positive_Strand,ORFs_on_Negative_Strand'])
123
- tool_out.writerow(undetected_gene_metrics)
124
- tool_out.writerow(['Perfect_Match_Genes:'])
125
- for key, value in perfect_Matches.items():
126
- key = key.split(',')
127
- id = ('>' + genome_name + '_' + key[0] + '_' + key[1] + '_' + key[2])
128
- tool_out.writerow([id + '\n' + value + '\n'])
129
- ####
130
- tool_out.writerow(['Partial_Match_Genes:'])
131
- for key, seqs in partial_Hits.items():
132
- key = key.split(';')
133
- gene_Seq = seqs[0]
134
- orf_Seq = seqs[1]
135
- partial = (key[0] + '\n' + gene_Seq + '\n' + key[1] + '\n' + orf_Seq + '\n')
136
- tool_out.writerow([partial])
137
- ####
138
- tool_out.writerow(['\nMissed_Genes:'])
139
- for key, value in missed_genes.items():
140
- key = key.split(',')
141
- id = ('>' + genome_name + '_' + key[0] + '_' + key[1] + '_' + key[2])
142
- tool_out.writerow([id + '\n' + value + '\n'])
143
- tool_out.writerow(['\nPredicted_CDSs_Without_Corresponding_Gene_In_Reference_Metrics:'])
144
- tool_out.writerow([
145
- 'ATG_Start,GTG_Start,TTG_Start,ATT_Start,CTG_Start,Alternative_Start_Codon,TGA_Stop,TAA_Stop,TAG_Stop,Alternative_Stop_Codon,Median_Length,ORFs_on_Positive_Strand,ORFs_on_Negative_Strand'])
146
- tool_out.writerow(unmatched_orf_metrics)
147
- tool_out.writerow(['Predicted_CDS_Without_Corresponding_Gene_in_Reference:'])
148
- for key, value in unmatched_orfs.items():
149
- key = key.split(',')
150
- id = ('>' + tool + '_' + key[0] + '_' + key[1] + '_' + key[2])
151
- tool_out.writerow([id + '\n' + value])
152
- tool_out.writerow(['\nPredicted_CDSs_Which_Detected_more_than_one_Gene:'])
75
+ if options.outdir:
76
+ # Ensure the output directory exists
77
+ os.makedirs(options.outdir, exist_ok=True)
78
+ # Use outname as a directory, basename for files is output-outname
79
+ base_out = os.path.join(options.outdir, f"{os.path.basename(options.outname)}")
80
+ with open(f"{base_out}_summary.txt", 'w', encoding='utf-8') as out_file:
81
+ out_file.write('Genome Used: ' + str(options.genome_dna.split('/')[-1]) + '\n')
82
+ if options.reference_tool:
83
+ out_file.write('Reference Tool Used: ' + str(options.reference_tool) + '\n')
84
+ else:
85
+ out_file.write('Reference Used: ' + str(options.reference_annotation.split('/')[-1]) + '\n')
86
+ out_file.write('Tool Compared: ' + str(options.tools) + '\n')
87
+ out_file.write('Total Number of Reference Genes: ' + str(total_ref_genes) + '\n')
88
+ out_file.write('Number of Contigs: ' + str(len(dna_regions)) + '\n')
89
+ out_file.write(
90
+ 'Contig\tGenes\tORFs\tPerfect_Matches\tPartial_Matches\tMissed_Genes\tUnmatched_ORFs\tMulti_Matched_ORFs\n')
153
91
 
154
- try:
155
- for key, value in multi_Matched_ORFs.items():
156
- key = key.split(',')
157
- multi = ('Predicted_CDS:' + key[0] + '-' + key[1] + '_Genes:' + '|'.join(value))
158
- tool_out.writerow([multi])
159
- except IndexError:
160
- pass
92
+ for dna_region, result in results.items():
93
+ num_current_genes = len(dna_regions[dna_region][2])
94
+ num_orfs = result['pred_metrics']['Number_of_ORFs']
95
+ num_perfect = result['pred_metrics']['Number_of_Perfect_Matches']
96
+ num_partial = len(result['pred_metrics']['partial_Hits'])
97
+ num_missed = len(result['rep_metrics']['genes_Undetected'])
98
+ num_unmatched = len(result['pred_metrics']['unmatched_ORFs'])
99
+ num_multi = len(result['pred_metrics']['multi_Matched_ORFs'])
100
+
101
+ ####
102
+ # Tool-specific stats
103
+ tool_stats = {}
104
+ for tool in options.tools.split(','):
105
+ tool_stats[tool] = {
106
+ 'perfect': 0,
107
+ 'partial': 0,
108
+ 'unmatched': 0,
109
+ 'multi': 0
110
+ }
111
+ # Count perfect matches per tool
112
+ for key in result['pred_metrics'].get('perfect_Matches', {}):
113
+ for tool in options.tools.split(','):
114
+ if tool in key:
115
+ tool_stats[tool]['perfect'] += 1
116
+ # Count partial matches per tool
117
+ for key in result['pred_metrics'].get('partial_Hits', {}):
118
+ for tool in options.tools.split(','):
119
+ if tool in key:
120
+ tool_stats[tool]['partial'] += 1
121
+ # Count unmatched ORFs per tool
122
+ for key in result['pred_metrics'].get('unmatched_ORFs', {}):
123
+ for tool in options.tools.split(','):
124
+ if tool in key:
125
+ tool_stats[tool]['unmatched'] += 1
126
+ # Count multi-matched ORFs per tool
127
+ for key in result['pred_metrics'].get('multi_Matched_ORFs', {}):
128
+ for tool in options.tools.split(','):
129
+ if tool in key:
130
+ tool_stats[tool]['multi'] += 1
131
+ ####
132
+
133
+ # Collect summary for this contig
134
+ if options.outdir:
135
+ contig_summaries.append([
136
+ dna_region, num_current_genes, num_orfs, num_perfect, num_partial, num_missed, num_unmatched, num_multi
137
+ ])
138
+ ###
139
+ num_current_genes = len(dna_regions[dna_region][2])
140
+ print("These are the results for: " + dna_region + '\n')
141
+ ############################################# To get default output filename from input file details
142
+ genome_name = options.reference_annotation.split('/')[-1].split('.')[0]
143
+ rep_metric_description, rep_metrics = get_rep_metrics(result)
144
+ all_metric_description, all_metrics = get_all_metrics(result)
145
+
146
+ print('Current Contig: ' + str(dna_region))
147
+ print('Number of Genes: ' + str(num_current_genes))
148
+ print('Number of ORFs: ' + str(result['pred_metrics']['Number_of_ORFs']))
149
+ print('Perfect Matches: ' + str(result['pred_metrics']['Number_of_Perfect_Matches']) + ' [' + str(num_current_genes)+ '] - '+ format(100 * result['pred_metrics']['Number_of_Perfect_Matches']/num_current_genes,'.2f')+'%')
150
+ print('Partial Matches: ' + str(len(result['pred_metrics']['partial_Hits'])) + ' [' + str(num_current_genes)+ '] - '+ format(100 * len(result['pred_metrics']['partial_Hits'])/num_current_genes,'.2f')+'%')
151
+ print('Missed Genes: ' + str(len(result['rep_metrics']['genes_Undetected'])) + ' [' + str(num_current_genes)+ '] - '+ format(100 * len(result['rep_metrics']['genes_Undetected'])/num_current_genes,'.2f')+'%')
152
+ print('Unmatched ORFs: ' + str(len(result['pred_metrics']['unmatched_ORFs'])) + ' [' + str(num_current_genes)+ '] - '+ format(100 * len(result['pred_metrics']['unmatched_ORFs'])/num_current_genes,'.2f')+'%')
153
+ print('Multi-matched ORFs: ' + str(len(result['pred_metrics']['multi_Matched_ORFs'])) + ' [' + str(num_current_genes)+ '] - '+ format(100 * len(result['pred_metrics']['multi_Matched_ORFs'])/num_current_genes,'.2f')+'%')
154
+ print('Tool breakdown:')
155
+ for tool, stats in tool_stats.items():
156
+ print(
157
+ f" {tool}: Perfect={stats['perfect']}, Partial={stats['partial']}, Unmatched={stats['unmatched']}, Multi-matched={stats['multi']}")
158
+
159
+ if options.outdir:
160
+ # Prepare output directory and file names for each contig
161
+ contig_save = dna_region.replace('/', '_').replace('\\', '_')
162
+ contig_dir = os.path.join(options.outdir, contig_save)
163
+ os.makedirs(contig_dir, exist_ok=True)
164
+ summary_file = os.path.join(contig_dir, "summary.txt")
165
+ csv_file = os.path.join(contig_dir, "metrics.csv")
166
+ perfect_fasta = os.path.join(contig_dir, "perfect_matches.fasta")
167
+ partial_fasta = os.path.join(contig_dir, "partial_matches.fasta")
168
+ missed_fasta = os.path.join(contig_dir, "missed_genes.fasta")
169
+ unmatched_fasta = os.path.join(contig_dir, "unmatched_orfs.fasta")
170
+ multi_fasta = os.path.join(contig_dir, "multi_matched_orfs.fasta")
171
+
172
+ # Write summary to text file
173
+ with open(summary_file, 'w', encoding='utf-8') as sf:
174
+ sf.write('Current Contig: ' + str(dna_region) + '\n')
175
+ sf.write('Number of Genes: ' + str(num_current_genes) + '\n')
176
+ sf.write('Number of ORFs: ' + str(result['pred_metrics']['Number_of_ORFs']) + '\n')
177
+ sf.write('Perfect Matches: ' + str(result['pred_metrics']['Number_of_Perfect_Matches']) + ' [' + str(
178
+ num_current_genes) + '] - ' + format(
179
+ 100 * result['pred_metrics']['Number_of_Perfect_Matches'] / num_current_genes, '.2f') + '%\n')
180
+ sf.write('Partial Matches: ' + str(len(result['pred_metrics']['partial_Hits'])) + ' [' + str(
181
+ num_current_genes) + '] - ' + format(
182
+ 100 * len(result['pred_metrics']['partial_Hits']) / num_current_genes, '.2f') + '%\n')
183
+ sf.write('Missed Genes: ' + str(len(result['rep_metrics']['genes_Undetected'])) + ' [' + str(
184
+ num_current_genes) + '] - ' + format(
185
+ 100 * len(result['rep_metrics']['genes_Undetected']) / num_current_genes, '.2f') + '%\n')
186
+ sf.write('Unmatched ORFs: ' + str(len(result['pred_metrics']['unmatched_ORFs'])) + ' [' + str(
187
+ num_current_genes) + '] - ' + format(
188
+ 100 * len(result['pred_metrics']['unmatched_ORFs']) / num_current_genes, '.2f') + '%\n')
189
+ sf.write('Multi-matched ORFs: ' + str(len(result['pred_metrics']['multi_Matched_ORFs'])) + ' [' + str(
190
+ num_current_genes) + '] - ' + format(
191
+ 100 * len(result['pred_metrics']['multi_Matched_ORFs']) / num_current_genes, '.2f') + '%\n')
192
+ sf.write('Tool breakdown:\n')
193
+ for tool, stats in tool_stats.items():
194
+ sf.write(
195
+ f" {tool}: Perfect={stats['perfect']}, Partial={stats['partial']}, Unmatched={stats['unmatched']}, Multi-matched={stats['multi']}\n")
196
+
197
+ # Write metrics to CSV
198
+ with open(csv_file, 'w', newline='\n', encoding='utf-8') as out_file:
199
+ tool_out = csv.writer(out_file, quoting=csv.QUOTE_NONE, escapechar=" ")
200
+ tool_out.writerow(['Representative_Metrics:'])
201
+ tool_out.writerow(rep_metric_description.split(','))
202
+ tool_out.writerow([*rep_metrics])
203
+ tool_out.writerow(['Prediction_Metrics:'])
204
+ tool_out.writerow(all_metric_description.split(','))
205
+ tool_out.writerow([*all_metrics])
206
+ tool_out.writerow(['Reference_CDS_Gene_Coverage_of_Genome'])
207
+ tool_out.writerow([''.join(map(str, result['rep_metrics']['gene_Coverage_Genome']))])
208
+ tool_out.writerow(['Predicted_CDS_Coverage_of_Genome'])
209
+ tool_out.writerow([''.join(map(str, result['pred_metrics']['orf_Coverage_Genome']))])
210
+ tool_out.writerow(['Matched_Predicted_CDS_Coverage_of_Genome'])
211
+ tool_out.writerow([''.join(map(str, result['pred_metrics']['matched_ORF_Coverage_Genome']))])
212
+ # tool_out.writerow(['Start_Position_Difference:'])
213
+ # tool_out.writerow(result.get('start_Difference', []))
214
+ # tool_out.writerow(['Stop_Position_Difference:'])
215
+ # tool_out.writerow(result.get('stop_Difference', []))
216
+ # tool_out.writerow(['Alternative_Starts_Predicted:'])
217
+ # tool_out.writerow(result.get('other_Starts', []))
218
+ # tool_out.writerow(['Alternative_Stops_Predicted:'])
219
+ # tool_out.writerow(result.get('other_Stops', []))
220
+ # tool_out.writerow(['Undetected_Gene_Metrics:'])
221
+ # tool_out.writerow([
222
+ # 'ATG_Start,GTG_Start,TTG_Start,ATT_Start,CTG_Start,Alternative_Start_Codon,TGA_Stop,TAA_Stop,TAG_Stop,Alternative_Stop_Codon,Median_Length,ORFs_on_Positive_Strand,ORFs_on_Negative_Strand'
223
+ # ])
224
+ # tool_out.writerow(result.get('undetected_Gene_Metrics', []))
225
+ # tool_out.writerow(['\nPredicted_CDSs_Without_Corresponding_Gene_In_Reference_Metrics:'])
226
+ # tool_out.writerow([
227
+ # 'ATG_Start,GTG_Start,TTG_Start,ATT_Start,CTG_Start,Alternative_Start_Codon,TGA_Stop,TAA_Stop,TAG_Stop,Alternative_Stop_Codon,Median_Length,ORFs_on_Positive_Strand,ORFs_on_Negative_Strand'
228
+ # ])
229
+ # tool_out.writerow(result.get('unmatched_ORF_Metrics', []))
230
+
231
+ # Write perfect matches to FASTA
232
+ with open(perfect_fasta, 'w', encoding='utf-8') as f:
233
+ for key, value in result['pred_metrics'].get('perfect_Matches', {}).items():
234
+ key_parts = key.split(',')
235
+ id = f">{genome_name}_{key_parts[0]}_{key_parts[1]}_{key_parts[2]}_{key_parts[5]}"
236
+ f.write(f"{id}\n{value}\n")
237
+
238
+ # Write partial matches to FASTA
239
+ with open(partial_fasta, 'w', encoding='utf- 8') as f:
240
+ for key, value in result['pred_metrics'].get('partial_Hits', {}).items():
241
+ key_parts = key.split(';')
242
+ gene_Seq = value[0]
243
+ orf_Seq = value[1]
244
+ f.write(f">{key_parts[0]}_gene\n{gene_Seq}\n>{key_parts[1]}_orf\n{orf_Seq}\n")
245
+
246
+ # Write missed genes to FASTA
247
+ with open(missed_fasta, 'w', encoding='utf-8') as f:
248
+ for key, value in result['rep_metrics'].get('genes_Undetected', {}).items():
249
+ key_parts = key.split(',')
250
+ id = f">{genome_name}_{key_parts[0]}_{key_parts[1]}_{key_parts[2]}"
251
+ f.write(f"{id}\n{value}\n")
252
+
253
+ # Write unmatched ORFs to FASTA
254
+ with open(unmatched_fasta, 'w', encoding='utf-8') as f:
255
+ for key, value in result['pred_metrics'].get('unmatched_ORFs', {}).items():
256
+ key_parts = key.split(',')
257
+ id = f">{options.tools}_{key_parts[0]}_{key_parts[1]}_{key_parts[2]}"
258
+ f.write(f"{id}\n{value}\n")
259
+
260
+ # Write multi-matched ORFs to FASTA
261
+ with open(multi_fasta, 'w', encoding='utf-8') as f:
262
+ for key, value in result['pred_metrics'].get('multi_Matched_ORFs', {}).items():
263
+ key_parts = key.split(',')
264
+ multi = f">Predicted_CDS:{key_parts[0]}-{key_parts[1]}_Genes:{'|'.join(value)}"
265
+ f.write(f"{multi}\n")
266
+
267
+ # After all contigs, append the summary table to the main summary file
268
+ if options.outdir and contig_summaries:
269
+ with open(f"{base_out}_summary.txt", 'a', encoding='utf-8') as out_file:
270
+ for row in contig_summaries:
271
+ out_file.write('\t'.join(map(str, row)) + '\n')
272
+ # Optionally, add overall totals
273
+ total_genes = sum(row[1] for row in contig_summaries)
274
+ total_orfs = sum(row[2] for row in contig_summaries)
275
+ total_perfect = sum(row[3] for row in contig_summaries)
276
+ total_partial = sum(row[4] for row in contig_summaries)
277
+ total_missed = sum(row[5] for row in contig_summaries)
278
+ total_unmatched = sum(row[6] for row in contig_summaries)
279
+ total_multi = sum(row[7] for row in contig_summaries)
280
+ out_file.write('\nOverall Summary:\n')
281
+ out_file.write(f'Number of Genes: {total_genes}\n')
282
+ out_file.write(f'Number of ORFs: {total_orfs}\n')
283
+ out_file.write(
284
+ f'Perfect Matches: {total_perfect} [{total_genes}] - {format(100 * total_perfect / total_genes, ".2f")}%\n')
285
+ out_file.write(
286
+ f'Partial Matches: {total_partial} [{total_genes}] - {format(100 * total_partial / total_genes, ".2f")}%\n')
287
+ out_file.write(
288
+ f'Missed Genes: {total_missed} [{total_genes}] - {format(100 * total_missed / total_genes, ".2f")}%\n')
289
+ out_file.write(
290
+ f'Unmatched ORFs: {total_unmatched} [{total_genes}] - {format(100 * total_unmatched / total_genes, ".2f")}%\n')
291
+ out_file.write(
292
+ f'Multi-matched ORFs: {total_multi} [{total_genes}] - {format(100 * total_multi / total_genes, ".2f")}%\n')
293
+
294
+ # Calculate combined tool stats - could be optimised further
295
+ combined_tool_stats = {tool: {'perfect': 0, 'partial': 0, 'unmatched': 0, 'multi': 0} for tool in
296
+ options.tools.split(',')}
297
+ for dna_region, result in results.items():
298
+ for tool in options.tools.split(','):
299
+ # perfect
300
+ for key in result['pred_metrics'].get('perfect_Matches', {}):
301
+ if tool in key:
302
+ combined_tool_stats[tool]['perfect'] += 1
303
+ # partial
304
+ for key in result['pred_metrics'].get('partial_Hits', {}):
305
+ if tool in key:
306
+ combined_tool_stats[tool]['partial'] += 1
307
+ # unmatched
308
+ for key in result['pred_metrics'].get('unmatched_ORFs', {}):
309
+ if tool in key:
310
+ combined_tool_stats[tool]['unmatched'] += 1
311
+ # multi
312
+ for key in result['pred_metrics'].get('multi_Matched_ORFs', {}):
313
+ if tool in key:
314
+ combined_tool_stats[tool]['multi'] += 1
315
+ for tool, stats in combined_tool_stats.items():
316
+ out_file.write('\n'+
317
+ f" {tool}: Perfect={stats['perfect']}, Partial={stats['partial']}, Unmatched={stats['unmatched']}, Multi-matched={stats['multi']}\n"
318
+ )
319
+
320
+ # Print combined metrics to stdout
321
+ print("\nCombined metrics for all contigs:")
322
+ print(f'Number of Genes: {total_genes}')
323
+ print(f'Number of ORFs: {total_orfs}')
324
+ print(
325
+ f'Perfect Matches: {total_perfect} [{total_genes}] - {format(100 * total_perfect / total_genes, ".2f")}%')
326
+ print(
327
+ f'Partial Matches: {total_partial} [{total_genes}] - {format(100 * total_partial / total_genes, ".2f")}%')
328
+ print(f'Missed Genes: {total_missed} [{total_genes}] - {format(100 * total_missed / total_genes, ".2f")}%')
329
+ print(
330
+ f'Unmatched ORFs: {total_unmatched} [{total_genes}] - {format(100 * total_unmatched / total_genes, ".2f")}%')
331
+ print(
332
+ f'Multi-matched ORFs: {total_multi} [{total_genes}] - {format(100 * total_multi / total_genes, ".2f")}%')
333
+
334
+ print('Tool breakdown (combined):')
335
+ for tool, stats in combined_tool_stats.items():
336
+ print('\n'+
337
+ f" {tool}: Perfect={stats['perfect']}, Partial={stats['partial']}, Unmatched={stats['unmatched']}, Multi-matched={stats['multi']}"
338
+ )
161
339
 
162
340
 
163
341
  def main():
164
- print("Thank you for using ORForise\nPlease report any issues to: https://github.com/NickJD/ORForise/issues\n#####")
342
+ print("Thank you for using ORForise\nPlease report any issues to: https://github.com/NickJD/ORForise/issues\n"
343
+ "Please Cite: https://doi.org/10.1093/bioinformatics/btab827\n"
344
+ "#####")
165
345
 
166
346
  parser = argparse.ArgumentParser(description='ORForise ' + ORForise_Version + ': Aggregate-Compare Run Parameters.')
167
347
  parser._action_groups.pop()
168
348
 
169
349
  required = parser.add_argument_group('Required Arguments')
170
350
 
171
- required.add_argument('-dna', dest='genome_DNA', required=True, help='Genome DNA file (.fa) which both annotations '
351
+ required.add_argument('-dna', dest='genome_dna', required=True, help='Genome DNA file (.fa) which both annotations '
172
352
  'are based on')
173
353
  required.add_argument('-t', dest='tools', required=True, help='Which tools to analyse? (Prodigal,GeneMarkS)')
174
354
  required.add_argument('-tp', dest='tool_predictions', required=True, help='Tool genome prediction file (.gff) - Provide'
@@ -177,13 +357,18 @@ def main():
177
357
  help='Which reference annotation file to use as reference?')
178
358
 
179
359
  optional = parser.add_argument_group('Optional Arguments')
360
+ optional.add_argument('-gene_ident', action='store', dest='gene_ident', default='CDS',
361
+ help='What features to consider as genes? - Default: CDS - '
362
+ 'Provide comma separated list of features to consider as genes (e.g. CDS,exon)')
180
363
  optional.add_argument('-rt', dest='reference_tool', required=False,
181
364
  help='What type of Annotation to compare to? -- Leave blank for Ensembl reference'
182
365
  '- Provide tool name to compare output from two tools')
183
366
 
184
367
  output = parser.add_argument_group('Output')
185
- output.add_argument('-o', dest='outname', required=False,
186
- help='Define full output filename (format is CSV) - If not provided, summary will be printed to std-out')
368
+ output.add_argument('-o', dest='outdir', required=False,
369
+ help='Define directory where detailed output should be places - If not provided, summary will be printed to std-out')
370
+ output.add_argument('-n', dest='outname', required=False,
371
+ help='Define output file name - Mandatory is -o is provided: <outname>_<contig_id>_ORF_Comparison.csv')
187
372
 
188
373
  misc = parser.add_argument_group('Misc')
189
374
  misc.add_argument('-v', dest='verbose', default='False', type=eval, choices=[True, False],