ORForise 1.6.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73) hide show
  1. ORForise/Aggregate_Compare.py +378 -0
  2. ORForise/Annotation_Compare.py +317 -0
  3. ORForise/Annotation_Intersector.py +726 -0
  4. ORForise/Aux/StORF_Undetected/Completely_Undetected/Completey_Undetected.py +53 -0
  5. ORForise/Aux/StORF_Undetected/Completely_Undetected/__init__.py +0 -0
  6. ORForise/Aux/StORF_Undetected/StORF_Undetected.py +35 -0
  7. ORForise/Aux/StORF_Undetected/__init__.py +0 -0
  8. ORForise/Aux/StORF_Undetected/unvitiated_Genes/__init__.py +0 -0
  9. ORForise/Aux/StORF_Undetected/unvitiated_Genes/unvitiated_Missed_Genes.py +46 -0
  10. ORForise/Aux/TabToGFF/TabToGFF.py +140 -0
  11. ORForise/Aux/TabToGFF/__init__.py +0 -0
  12. ORForise/Aux/__init__.py +0 -0
  13. ORForise/Comparator.py +882 -0
  14. ORForise/Convert_To_GFF.py +141 -0
  15. ORForise/GFF_Adder.py +543 -0
  16. ORForise/List_Tools.py +56 -0
  17. ORForise/ORForise_Analysis/__init__.py +0 -0
  18. ORForise/ORForise_Analysis/cds_checker.py +77 -0
  19. ORForise/ORForise_Analysis/gene_Lenghts.py +28 -0
  20. ORForise/ORForise_Analysis/genome_Metrics.py +258 -0
  21. ORForise/ORForise_Analysis/hypothetical_gene_predictions.py +88 -0
  22. ORForise/ORForise_Analysis/missed_Gene_Metrics.py +277 -0
  23. ORForise/ORForise_Analysis/parital_Match_Analysis.py +230 -0
  24. ORForise/ORForise_Analysis/result_File_Analysis.py +286 -0
  25. ORForise/ORForise_Analysis/start_Codon_Substitution.py +161 -0
  26. ORForise/StORForise.py +115 -0
  27. ORForise/Tools/Augustus/Augustus.py +54 -0
  28. ORForise/Tools/Augustus/__init__.py +0 -0
  29. ORForise/Tools/Balrog/Balrog.py +56 -0
  30. ORForise/Tools/Balrog/__init__.py +0 -0
  31. ORForise/Tools/EasyGene/EasyGene.py +55 -0
  32. ORForise/Tools/EasyGene/__init__.py +0 -0
  33. ORForise/Tools/FGENESB/FGENESB.py +57 -0
  34. ORForise/Tools/FGENESB/__init__.py +0 -0
  35. ORForise/Tools/FragGeneScan/FragGeneScan.py +54 -0
  36. ORForise/Tools/FragGeneScan/__init__.py +0 -0
  37. ORForise/Tools/GFF/GFF.py +77 -0
  38. ORForise/Tools/GFF/__init__.py +0 -0
  39. ORForise/Tools/GLIMMER3/GLIMMER3.py +59 -0
  40. ORForise/Tools/GLIMMER3/__init__.py +0 -0
  41. ORForise/Tools/GeneMark/GeneMark.py +135 -0
  42. ORForise/Tools/GeneMark/__init__.py +0 -0
  43. ORForise/Tools/GeneMarkHA/GeneMarkHA.py +54 -0
  44. ORForise/Tools/GeneMarkHA/__init__.py +0 -0
  45. ORForise/Tools/GeneMarkHMM/GeneMarkHMM.py +55 -0
  46. ORForise/Tools/GeneMarkHMM/__init__.py +0 -0
  47. ORForise/Tools/GeneMarkS/GeneMarkS.py +54 -0
  48. ORForise/Tools/GeneMarkS/__init__.py +0 -0
  49. ORForise/Tools/GeneMarkS2/GeneMarkS2.py +55 -0
  50. ORForise/Tools/GeneMarkS2/__init__.py +0 -0
  51. ORForise/Tools/MetaGene/MetaGene.py +54 -0
  52. ORForise/Tools/MetaGene/__init__.py +0 -0
  53. ORForise/Tools/MetaGeneAnnotator/MetaGeneAnnotator.py +55 -0
  54. ORForise/Tools/MetaGeneAnnotator/__init__.py +0 -0
  55. ORForise/Tools/MetaGeneMark/MetaGeneMark.py +55 -0
  56. ORForise/Tools/MetaGeneMark/__init__.py +0 -0
  57. ORForise/Tools/Prodigal/Prodigal.py +55 -0
  58. ORForise/Tools/Prodigal/__init__.py +0 -0
  59. ORForise/Tools/Prokka/Prokka.py +57 -0
  60. ORForise/Tools/Prokka/__init__.py +0 -0
  61. ORForise/Tools/StORF-Reporter/StORF-Reporter.py +56 -0
  62. ORForise/Tools/StORF-Reporter/__init__.py +0 -0
  63. ORForise/Tools/TransDecoder/TransDecoder.py +54 -0
  64. ORForise/Tools/TransDecoder/__init__.py +0 -0
  65. ORForise/Tools/__init__.py +0 -0
  66. ORForise/__init__.py +0 -0
  67. ORForise/utils.py +236 -0
  68. orforise-1.6.2.dist-info/METADATA +1038 -0
  69. orforise-1.6.2.dist-info/RECORD +73 -0
  70. orforise-1.6.2.dist-info/WHEEL +5 -0
  71. orforise-1.6.2.dist-info/entry_points.txt +15 -0
  72. orforise-1.6.2.dist-info/licenses/LICENSE +624 -0
  73. orforise-1.6.2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,378 @@
1
+ from importlib import import_module
2
+ import argparse
3
+ import csv, os, gzip, sys
4
+
5
+ try:
6
+ from .Comparator import tool_comparison
7
+ from .utils import *
8
+ except (ImportError, ModuleNotFoundError):
9
+ from Comparator import tool_comparison
10
+ from utils import *
11
+ ############################################
12
+
13
# (stat name, key in result['pred_metrics']) pairs used for the per-tool breakdown.
_TOOL_STAT_SOURCES = (
    ('perfect', 'perfect_Matches'),
    ('partial', 'partial_Hits'),
    ('unmatched', 'unmatched_ORFs'),
    ('multi', 'multi_Matched_ORFs'),
)


def _open_text(path):
    """Open *path* for text reading, using gzip when the file starts with the gzip magic bytes."""
    with open(path, 'rb') as probe:
        is_gzip = probe.read(2) == b'\x1f\x8b'
    if is_gzip:
        return gzip.open(path, 'rt')
    return open(path, 'r', encoding='unicode_escape')


def _percent(count, total):
    """Return count/total as a two-decimal percentage string ('0.00' when total is 0)."""
    if not total:
        return '0.00'
    return format(100 * count / total, '.2f')


def _load_tool(tool):
    """Import the parser module for *tool* and return the callable named after it."""
    # Try the script-style layout first, then the installed-package layout.
    for prefix in ('Tools.', 'ORForise.Tools.'):
        try:
            module = import_module(prefix + tool + '.' + tool)
        except ModuleNotFoundError:
            continue
        return getattr(module, tool)
    sys.exit("Tool not available")


def _count_tool_hits(pred_metrics, tools):
    """Count, per tool, the keys of each match category that mention the tool name."""
    stats = {tool: {name: 0 for name, _ in _TOOL_STAT_SOURCES} for tool in tools}
    for stat_name, metric_key in _TOOL_STAT_SOURCES:
        for key in pred_metrics.get(metric_key, {}):
            for tool in tools:
                # NOTE(review): substring test, so a tool whose name is contained in
                # another's (e.g. GeneMark / GeneMarkS) is counted for both - confirm
                # the key format before tightening this.
                if tool in key:
                    stats[tool][stat_name] += 1
    return stats


def _metric_lines(genes, orfs, perfect, partial, missed, unmatched, multi):
    """Build the 'Number of ...' / '<category>: n [genes] - pct%' report lines."""
    lines = ['Number of Genes: ' + str(genes), 'Number of ORFs: ' + str(orfs)]
    categories = (
        ('Perfect Matches', perfect),
        ('Partial Matches', partial),
        ('Missed Genes', missed),
        ('Unmatched ORFs', unmatched),
        ('Multi-matched ORFs', multi),
    )
    for label, count in categories:
        lines.append(f'{label}: {count} [{genes}] - {_percent(count, genes)}%')
    return lines


def _tool_lines(tool_stats):
    """Build one 'tool: Perfect=.., Partial=.., ...' line per tool."""
    return [
        f" {tool}: Perfect={stats['perfect']}, Partial={stats['partial']}, "
        f"Unmatched={stats['unmatched']}, Multi-matched={stats['multi']}"
        for tool, stats in tool_stats.items()
    ]


def _write_contig_outputs(contig_dir, report_lines, result, genome_name, tools_label):
    """Write summary.txt, metrics.csv and the five FASTA files for one contig."""
    pred_metrics = result['pred_metrics']
    rep_metrics = result['rep_metrics']
    os.makedirs(contig_dir, exist_ok=True)

    with open(os.path.join(contig_dir, "summary.txt"), 'w', encoding='utf-8') as handle:
        handle.write('\n'.join(report_lines) + '\n')

    rep_metric_description, rep_values = get_rep_metrics(result)
    all_metric_description, all_values = get_all_metrics(result)
    with open(os.path.join(contig_dir, "metrics.csv"), 'w', newline='\n', encoding='utf-8') as handle:
        writer = csv.writer(handle, quoting=csv.QUOTE_NONE, escapechar=" ")
        writer.writerow(['Representative_Metrics:'])
        writer.writerow(rep_metric_description.split(','))
        writer.writerow([*rep_values])
        writer.writerow(['Prediction_Metrics:'])
        writer.writerow(all_metric_description.split(','))
        writer.writerow([*all_values])
        writer.writerow(['Reference_CDS_Gene_Coverage_of_Genome'])
        writer.writerow([''.join(map(str, rep_metrics['gene_Coverage_Genome']))])
        writer.writerow(['Predicted_CDS_Coverage_of_Genome'])
        writer.writerow([''.join(map(str, pred_metrics['orf_Coverage_Genome']))])
        writer.writerow(['Matched_Predicted_CDS_Coverage_of_Genome'])
        writer.writerow([''.join(map(str, pred_metrics['matched_ORF_Coverage_Genome']))])

    with open(os.path.join(contig_dir, "perfect_matches.fasta"), 'w', encoding='utf-8') as handle:
        for key, sequence in pred_metrics.get('perfect_Matches', {}).items():
            parts = key.split(',')
            header = f">{genome_name}_{parts[0]}_{parts[1]}_{parts[2]}_{parts[5]}"
            handle.write(f"{header}\n{sequence}\n")

    with open(os.path.join(contig_dir, "partial_matches.fasta"), 'w', encoding='utf-8') as handle:
        for key, sequences in pred_metrics.get('partial_Hits', {}).items():
            parts = key.split(';')
            # Each value holds the reference gene sequence followed by the ORF sequence.
            handle.write(f">{parts[0]}_gene\n{sequences[0]}\n>{parts[1]}_orf\n{sequences[1]}\n")

    with open(os.path.join(contig_dir, "missed_genes.fasta"), 'w', encoding='utf-8') as handle:
        for key, sequence in rep_metrics.get('genes_Undetected', {}).items():
            parts = key.split(',')
            header = f">{genome_name}_{parts[0]}_{parts[1]}_{parts[2]}"
            handle.write(f"{header}\n{sequence}\n")

    with open(os.path.join(contig_dir, "unmatched_orfs.fasta"), 'w', encoding='utf-8') as handle:
        for key, sequence in pred_metrics.get('unmatched_ORFs', {}).items():
            parts = key.split(',')
            header = f">{tools_label}_{parts[0]}_{parts[1]}_{parts[2]}"
            handle.write(f"{header}\n{sequence}\n")

    # Multi-matched ORFs are written as header lines only (no sequence).
    with open(os.path.join(contig_dir, "multi_matched_orfs.fasta"), 'w', encoding='utf-8') as handle:
        for key, genes in pred_metrics.get('multi_Matched_ORFs', {}).items():
            parts = key.split(',')
            handle.write(f">Predicted_CDS:{parts[0]}-{parts[1]}_Genes:{'|'.join(genes)}\n")


def comparator(options):
    """Aggregate the predictions of several tools and compare them to a reference annotation.

    Reads the genome (options.genome_dna) and reference annotation
    (options.reference_annotation), loads the predictions of every tool named in
    options.tools from the matching entry of options.tool_predictions (both are
    comma-separated), merges them per contig, runs tool_comparison and prints a
    report per contig plus combined totals.

    When options.outdir is set, options.outname must also be set; the run summary
    is written to <outdir>/<outname>_summary.txt and each contig gets its own
    sub-directory with summary.txt, metrics.csv and FASTA files.

    Exits via sys.exit when an input file is missing, when a tool module cannot be
    imported, when fewer prediction files than tools are supplied, or when an
    output directory is given without an output name.
    """
    write_files = bool(options.outdir)
    if write_files and not options.outname:
        # Fail before any work is done; the name is needed for the summary file.
        sys.exit("Output name (-n) must be provided when an output directory (-o) is given")

    try:
        with _open_text(options.genome_dna) as fasta_in:
            dna_regions = fasta_load(fasta_in)
        with _open_text(options.reference_annotation) as gff_in:
            dna_regions = gff_load(options, gff_in, dna_regions)
    except AttributeError:
        # NOTE(review): message kept unchanged; it looks inherited from another tool.
        sys.exit("Attribute Error:\nStORF'ed GFF probably already exists - Must be deleted before running (-overwrite)")
    except FileNotFoundError as err:
        sys.exit("Input file not found: '" + str(err.filename) + "' - check the -dna and -ref paths")

    # Element 2 of each dna_regions entry is treated as the reference gene collection.
    total_ref_genes = sum(
        len(v[2]) if isinstance(v[2], (list, tuple, set, dict, str)) else 1 for v in dna_regions.values())

    # Collect predictions from tools
    tools = options.tools.split(',')
    prediction_files = options.tool_predictions.split(',')
    if len(prediction_files) < len(tools):
        sys.exit("Fewer prediction files (-tp) than tools (-t) were provided")

    aggregate_predictions = {}
    # zip() ignores surplus prediction files, as the original index lookup did.
    for tool, tool_prediction in zip(tools, prediction_files):
        print(tool)
        tool_parser = _load_tool(tool)
        orfs = tool_parser(tool_prediction, dna_regions)
        for contig in orfs:
            merged = aggregate_predictions.setdefault(contig, {})
            for key, value in orfs[contig].items():
                if key in merged:
                    # Same ORF already reported: append this tool to its last field.
                    merged[key][-1] += '|' + tool
                else:
                    merged[key] = value

    aggregate_orfs = {contig: sortORFs(orfs) for contig, orfs in aggregate_predictions.items()}
    results = tool_comparison(aggregate_orfs, dna_regions, options.verbose)

    summary_path = None
    if write_files:
        os.makedirs(options.outdir, exist_ok=True)
        summary_path = os.path.join(options.outdir, os.path.basename(options.outname)) + "_summary.txt"
        with open(summary_path, 'w', encoding='utf-8') as out_file:
            out_file.write('Genome Used: ' + str(os.path.basename(options.genome_dna)) + '\n')
            if options.reference_tool:
                out_file.write('Reference Tool Used: ' + str(options.reference_tool) + '\n')
            else:
                out_file.write('Reference Used: ' + str(os.path.basename(options.reference_annotation)) + '\n')
            out_file.write('Tool Compared: ' + str(options.tools) + '\n')
            out_file.write('Total Number of Reference Genes: ' + str(total_ref_genes) + '\n')
            out_file.write('Number of Contigs: ' + str(len(dna_regions)) + '\n')
            out_file.write(
                'Contig\tGenes\tORFs\tPerfect_Matches\tPartial_Matches\tMissed_Genes\tUnmatched_ORFs\tMulti_Matched_ORFs\n')

    # Default FASTA header prefix derived from the reference annotation file name.
    genome_name = os.path.basename(options.reference_annotation).split('.')[0]

    contig_rows = []
    combined_tool_stats = {tool: {name: 0 for name, _ in _TOOL_STAT_SOURCES} for tool in tools}
    for dna_region, result in results.items():
        pred_metrics = result['pred_metrics']
        row = [
            dna_region,
            len(dna_regions[dna_region][2]),
            pred_metrics['Number_of_ORFs'],
            pred_metrics['Number_of_Perfect_Matches'],
            len(pred_metrics['partial_Hits']),
            len(result['rep_metrics']['genes_Undetected']),
            len(pred_metrics['unmatched_ORFs']),
            len(pred_metrics['multi_Matched_ORFs']),
        ]
        contig_rows.append(row)

        tool_stats = _count_tool_hits(pred_metrics, tools)
        for tool, stats in tool_stats.items():
            for stat_name, count in stats.items():
                combined_tool_stats[tool][stat_name] += count

        report_lines = (['Current Contig: ' + str(dna_region)] + _metric_lines(*row[1:])
                        + ['Tool breakdown:'] + _tool_lines(tool_stats))
        print("These are the results for: " + dna_region + '\n')
        print('\n'.join(report_lines))

        if write_files:
            contig_save = dna_region.replace('/', '_').replace('\\', '_')
            _write_contig_outputs(os.path.join(options.outdir, contig_save), report_lines, result,
                                  genome_name, options.tools)

    if not contig_rows:
        return

    # Column-wise totals over every contig (skipping the contig name column).
    totals = [sum(column) for column in zip(*(row[1:] for row in contig_rows))]
    overall_lines = _metric_lines(*totals)
    combined_lines = _tool_lines(combined_tool_stats)

    if write_files:
        with open(summary_path, 'a', encoding='utf-8') as out_file:
            for row in contig_rows:
                out_file.write('\t'.join(map(str, row)) + '\n')
            out_file.write('\nOverall Summary:\n')
            out_file.write('\n'.join(overall_lines) + '\n')
            for line in combined_lines:
                out_file.write('\n' + line + '\n')

    print("\nCombined metrics for all contigs:")
    print('\n'.join(overall_lines))
    print('Tool breakdown (combined):')
    for line in combined_lines:
        print('\n' + line)
337
+
338
+
339
def _parse_bool(text):
    """Convert a command-line 'True'/'False' (or '1'/'0') value to a bool without eval."""
    lowered = str(text).strip().lower()
    if lowered in ('true', '1'):
        return True
    if lowered in ('false', '0'):
        return False
    raise argparse.ArgumentTypeError("expected True or False, got '" + str(text) + "'")


def main():
    """Parse the Aggregate-Compare command-line arguments and run comparator()."""
    print(WELCOME)

    parser = argparse.ArgumentParser(description='ORForise ' + ORForise_Version + ': Aggregate-Compare Run Parameters.')
    # Drop argparse's default optional-arguments group so only the custom groups below are listed.
    parser._action_groups.pop()

    required = parser.add_argument_group('Required Arguments')

    required.add_argument('-dna', dest='genome_dna', required=True, help='Genome DNA file (.fa) which both annotations '
                                                                          'are based on')
    required.add_argument('-t', dest='tools', required=True, help='Which tools to analyse?')
    required.add_argument('-tp', dest='tool_predictions', required=True, help='Tool genome prediction file (.gff) - Provide '
                                                                               'file locations for each tool comma separated')
    required.add_argument('-ref', dest='reference_annotation', required=True,
                          help='Which reference annotation file to use as reference?')

    optional = parser.add_argument_group('Optional Arguments')
    optional.add_argument('-gene_ident', action='store', dest='gene_ident', default='CDS',
                          help='What features to consider as genes? - Default: CDS - '
                               'Provide comma separated list of features to consider as genes (e.g. CDS,exon)')
    optional.add_argument('-rt', dest='reference_tool', required=False,
                          help='What type of Annotation to compare to? -- Leave blank for Ensembl reference '
                               '- Provide tool name to compare output from two tools')

    output = parser.add_argument_group('Output')
    output.add_argument('-o', dest='outdir', required=False,
                        help='Define directory where detailed output should be placed - If not provided, summary will be printed to std-out')
    output.add_argument('-n', dest='outname', required=False,
                        help='Define output file name - Mandatory if -o is provided: <outdir>/<outname>_summary.txt')

    misc = parser.add_argument_group('Misc')
    # A dedicated parser replaces type=eval, which would execute arbitrary command-line text.
    misc.add_argument('-v', dest='verbose', default=False, type=_parse_bool, choices=[True, False],
                      help='Default - False: Print out runtime status')
    options = parser.parse_args()
    if options.outdir and not options.outname:
        # The summary file name is built from -n, so it cannot be omitted with -o.
        parser.error('-n is required when -o is provided')
    comparator(options)


if __name__ == "__main__":
    main()
    print("Complete")
378
+