ORForise 1.4.3__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. ORForise/Aggregate_Compare.py +318 -133
  2. ORForise/Annotation_Compare.py +243 -125
  3. ORForise/Comparator.py +600 -552
  4. ORForise/ORForise_Analysis/genome_Metrics.py +51 -33
  5. ORForise/Tools/Augustus/Augustus.py +30 -23
  6. ORForise/Tools/Balrog/Balrog.py +31 -23
  7. ORForise/Tools/EasyGene/EasyGene.py +30 -22
  8. ORForise/Tools/FGENESB/FGENESB.py +32 -25
  9. ORForise/Tools/FragGeneScan/FragGeneScan.py +29 -22
  10. ORForise/Tools/GFF/GFF.py +51 -47
  11. ORForise/Tools/GLIMMER_3/GLIMMER_3.py +34 -27
  12. ORForise/Tools/GeneMark/GeneMark.py +46 -40
  13. ORForise/Tools/GeneMark_HA/GeneMark_HA.py +29 -22
  14. ORForise/Tools/GeneMark_HMM/GeneMark_HMM.py +29 -22
  15. ORForise/Tools/GeneMark_S/GeneMark_S.py +29 -22
  16. ORForise/Tools/GeneMark_S_2/GeneMark_S_2.py +29 -25
  17. ORForise/Tools/MetaGene/MetaGene.py +29 -22
  18. ORForise/Tools/MetaGeneAnnotator/MetaGeneAnnotator.py +30 -23
  19. ORForise/Tools/MetaGeneMark/MetaGeneMark.py +30 -23
  20. ORForise/Tools/Prodigal/Prodigal.py +30 -26
  21. ORForise/Tools/Prokka/Prokka.py +30 -25
  22. ORForise/Tools/StORF_Reporter/StORF_Reporter.py +33 -26
  23. ORForise/Tools/TransDecoder/TransDecoder.py +29 -22
  24. ORForise/utils.py +204 -2
  25. {orforise-1.4.3.dist-info → orforise-1.5.0.dist-info}/METADATA +5 -5
  26. {orforise-1.4.3.dist-info → orforise-1.5.0.dist-info}/RECORD +30 -30
  27. {orforise-1.4.3.dist-info → orforise-1.5.0.dist-info}/entry_points.txt +5 -0
  28. {orforise-1.4.3.dist-info → orforise-1.5.0.dist-info}/WHEEL +0 -0
  29. {orforise-1.4.3.dist-info → orforise-1.5.0.dist-info}/licenses/LICENSE +0 -0
  30. {orforise-1.4.3.dist-info → orforise-1.5.0.dist-info}/top_level.txt +0 -0
@@ -1,8 +1,8 @@
1
1
  from importlib import import_module
2
2
  import argparse
3
- import collections
4
- import csv,sys
5
- #####
3
+ import sys,os
4
+ import gzip,csv
5
+
6
6
  try:
7
7
  from Comparator import tool_comparison
8
8
  except ImportError:
@@ -16,43 +16,29 @@ except ImportError:
16
16
  ##########################
17
17
 
18
18
  def comparator(options):
19
- with open(options.genome_DNA, mode='r') as genome:
20
- genome_Seq = "".join(line.rstrip() for line in genome if not line.startswith('>'))
21
- ##############################################
22
- if not options.reference_tool: # IF using Ensembl for comparison
23
- ref_genes = collections.OrderedDict() # Order is important
24
- count = 0
25
- with open(options.reference_annotation, 'r') as genome_gff:
26
- for line in genome_gff:
27
- line = line.split('\t')
28
- try:
29
- if "CDS" in line[2] and len(line) == 9:
30
- start = int(line[3])
31
- stop = int(line[4])
32
- strand = line[6]
33
- gene_details = [start,stop,strand]
34
- ref_genes.update({count:gene_details})
35
- count += 1
36
- except IndexError:
37
- continue
38
- ref_genes = sortGenes(ref_genes) # sorted GFF refernce
39
- else: # IF using a tool as reference
19
+
20
+ try:
21
+ try: # Detect whether fasta/gff files are .gz or text and read accordingly
22
+ fasta_in = gzip.open(options.genome_dna, 'rt')
23
+ dna_regions = fasta_load(fasta_in)
24
+ except:
25
+ fasta_in = open(options.genome_dna, 'r', encoding='unicode_escape')
26
+ dna_regions = fasta_load(fasta_in)
40
27
  try:
41
- reference_tool_ = import_module('Tools.' + options.reference_tool + '.' + options.reference_tool,
42
- package='my_current_pkg')
43
- except ModuleNotFoundError:
44
- try:
45
- reference_tool_ = import_module('ORForise.Tools.' + options.reference_tool + '.' + options.reference_tool,
46
- package='my_current_pkg')
47
- except ModuleNotFoundError:
48
- sys.exit("Tool not available")
49
- reference_tool_ = getattr(reference_tool_, options.reference_tool)
50
- ############ Reformatting tool output for ref_genes
51
- ref_genes_tmp = reference_tool_(options.reference_annotation, genome_Seq)
52
- ref_genes = collections.OrderedDict()
53
- for i, (pos, details) in enumerate(ref_genes_tmp.items()):
54
- pos = pos.split(',')
55
- ref_genes.update({i:[pos[0],pos[1],details[0]]})
28
+ gff_in = gzip.open(options.reference_annotation, 'rt')
29
+ dna_regions = gff_load(options, gff_in, dna_regions)
30
+ except:
31
+ gff_in = open(options.reference_annotation, 'r', encoding='unicode_escape')
32
+ dna_regions = gff_load(options, gff_in, dna_regions)
33
+ except AttributeError:
34
+ sys.exit("Attribute Error:\nStORF'ed GFF probably already exists - Must be deleted before running (-overwrite)")
35
+ except FileNotFoundError:
36
+ split_path = options.gff.split(os.sep)
37
+ sys.exit("Directory '" + split_path[-2] + "' missing fna/gff files")
38
+ ###############################################
39
+ total_ref_genes = sum(
40
+ len(v[2]) if isinstance(v[2], (list, tuple, set, dict, str)) else 1 for v in dna_regions.values())
41
+
56
42
  #############################################
57
43
  try:
58
44
  tool_ = import_module('Tools.' + options.tool + '.' + options.tool, package='my_current_pkg')
@@ -62,90 +48,214 @@ def comparator(options):
62
48
  except ModuleNotFoundError:
63
49
  sys.exit("Tool not available - Did you get the name right?")
64
50
  tool_ = getattr(tool_, options.tool)
65
- orfs = tool_(options.tool_prediction, genome_Seq)
66
- all_Metrics, all_rep_Metrics, start_precision, stop_precision, other_starts, other_stops, perfect_Matches, missed_genes, unmatched_orfs, undetected_gene_metrics, unmatched_orf_metrics, orf_Coverage_Genome, matched_ORF_Coverage_Genome, gene_coverage_genome, multi_Matched_ORFs, partial_Hits = tool_comparison(
67
- ref_genes, orfs, genome_Seq, options.verbose)
68
- ############################################# To get default output filename from input file details
69
- genome_name = options.reference_annotation.split('/')[-1].split('.')[0]
70
- metric_description = list(all_Metrics.keys())
71
- metrics = list(all_Metrics.values())
72
- rep_metric_description = list(all_rep_Metrics.keys())
73
- rep_metrics = list(all_rep_Metrics.values())
51
+ all_orfs = tool_(options.tool_prediction, dna_regions)
52
+ results = tool_comparison(all_orfs, dna_regions, options.verbose)
74
53
  ############## Printing to std-out and optional csv file
75
- print('Genome Used: ' + str(options.genome_DNA.split('/')[-1]))
76
- if options.reference_tool:
77
- print('Reference Tool Used: '+str(options.reference_tool))
78
- else:
79
- print('Reference Used: ' + str(options.reference_annotation.split('/')[-1]))
80
- print('Tool Compared: '+str(options.tool))
81
- print('Perfect Matches: ' + str(len(perfect_Matches)) + ' [' + str(len(ref_genes))+ '] - '+ format(100 * len(perfect_Matches)/len(ref_genes),'.2f')+'%')
82
- print('Partial Matches: ' + str(len(partial_Hits)) + ' [' + str(len(ref_genes))+ '] - '+ format(100 * len(partial_Hits)/len(ref_genes),'.2f')+'%')
83
- print('Missed Genes: ' + str(len(missed_genes)) + ' [' + str(len(ref_genes))+ '] - '+ format(100 * len(missed_genes)/len(ref_genes),'.2f')+'%')
84
- if options.outname:
85
- with open(options.outname, 'w', newline='\n', encoding='utf-8') as out_file: # Clear write out of report
86
- tool_out = csv.writer(out_file, quoting=csv.QUOTE_NONE, escapechar=" ")
87
- tool_out.writerow(['Representative_Metrics:'])
88
- tool_out.writerow(rep_metric_description)
89
- tool_out.writerow(rep_metrics)
90
- tool_out.writerow(['All_Metrics:'])
91
- tool_out.writerow(metric_description)
92
- tool_out.writerow(metrics)
93
- tool_out.writerow(['Reference_CDS_Gene_Coverage_of_Genome'])
94
- tool_out.writerow([gene_coverage_genome])
95
- tool_out.writerow(['Predicted_CDS_Coverage_of_Genome'])
96
- tool_out.writerow([orf_Coverage_Genome])
97
- tool_out.writerow(['Matched_Predicted_CDS_Coverage_of_Genome'])
98
- tool_out.writerow([matched_ORF_Coverage_Genome])
99
- tool_out.writerow(['Start_Position_Difference:'])
100
- tool_out.writerow(start_precision)
101
- tool_out.writerow(['Stop_Position_Difference:'])
102
- tool_out.writerow(stop_precision)
103
- tool_out.writerow(['Alternative_Starts_Predicted:'])
104
- tool_out.writerow(other_starts)
105
- tool_out.writerow(['Alternative_Stops_Predicted:'])
106
- tool_out.writerow(other_stops)
107
- tool_out.writerow(['Undetected_Gene_Metrics:'])
108
- tool_out.writerow([
109
- 'ATG_Start,GTG_Start,TTG_Start,ATT_Start,CTG_Start,Alternative_Start_Codon,TGA_Stop,TAA_Stop,TAG_Stop,Alternative_Stop_Codon,Median_Length,ORFs_on_Positive_Strand,ORFs_on_Negative_Strand'])
110
- tool_out.writerow(undetected_gene_metrics)
111
- ####
112
- tool_out.writerow(['Perfect_Match_Genes:'])
113
- for key, value in perfect_Matches.items():
114
- key = key.split(',')
115
- id = ('>' + genome_name + '_' + key[0] + '_' + key[1] + '_' + key[2])
116
- tool_out.writerow([id + '\n' + value + '\n'])
117
- ####
118
- tool_out.writerow(['Partial_Match_Genes:'])
119
- for key, seqs in partial_Hits.items():
120
- key = key.split(';')
121
- gene_Seq = seqs[0]
122
- orf_Seq = seqs[1]
123
- partial = (key[0] + '\n' + gene_Seq + '\n' + key[1] + '\n' + orf_Seq + '\n')
124
- tool_out.writerow([partial])
125
- ####
126
- tool_out.writerow(['\nMissed_Genes:'])
127
- for key, value in missed_genes.items():
128
- key = key.split(',')
129
- id = ('>' + genome_name + '_' + key[0] + '_' + key[1] + '_' + key[2])
130
- tool_out.writerow([id + '\n' + value + '\n'])
131
- tool_out.writerow(['\nPredicted_CDSs_Without_Corresponding_Gene_In_Reference_Metrics:'])
132
- tool_out.writerow([
133
- 'ATG_Start,GTG_Start,TTG_Start,ATT_Start,CTG_Start,Alternative_Start_Codon,TGA_Stop,TAA_Stop,TAG_Stop,Alternative_Stop_Codon,Median_Length,ORFs_on_Positive_Strand,ORFs_on_Negative_Strand'])
134
- tool_out.writerow(unmatched_orf_metrics)
135
- tool_out.writerow(['Predicted_CDS_Without_Corresponding_Gene_in_Reference:'])
136
- for key, value in unmatched_orfs.items():
137
- key = key.split(',')
138
- id = ('>' + options.tool + '_' + key[0] + '_' + key[1] + '_' + key[2])
139
- tool_out.writerow([id + '\n' + value])
140
- tool_out.writerow(['\nPredicted_CDSs_Which_Detected_more_than_one_Gene:'])
141
-
142
- try:
143
- for key, value in multi_Matched_ORFs.items():
144
- key = key.split(',')
145
- multi = ('Predicted_CDS:' + key[0] + '-' + key[1] + '_Genes:' + '|'.join(value))
146
- tool_out.writerow([multi])
147
- except IndexError:
148
- pass
54
+ # Ensure the output directory exists
55
+ os.makedirs(options.outdir, exist_ok=True)
56
+ # Use outname as a directory, basename for files is output-outname
57
+ base_out = os.path.join(options.outdir, f"{os.path.basename(options.outname)}")
58
+
59
+ # Prepare to collect summary stats for all contigs
60
+ contig_summaries = []
61
+
62
+ if options.outdir:
63
+ # Ensure the output directory exists
64
+ os.makedirs(options.outdir, exist_ok=True)
65
+ # Use outname as a directory, basename for files is output-outname
66
+ base_out = os.path.join(options.outdir, f"{os.path.basename(options.outname)}")
67
+ with open(f"{base_out}_summary.txt", 'w', encoding='utf-8') as out_file:
68
+ out_file.write('Genome Used: ' + str(options.genome_dna.split('/')[-1]) + '\n')
69
+ if options.reference_tool:
70
+ out_file.write('Reference Tool Used: ' + str(options.reference_tool) + '\n')
71
+ else:
72
+ out_file.write('Reference Used: ' + str(options.reference_annotation.split('/')[-1]) + '\n')
73
+ out_file.write('Tool Compared: ' + str(options.tool) + '\n')
74
+ out_file.write('Total Number of Reference Genes: ' + str(total_ref_genes) + '\n')
75
+ out_file.write('Number of Contigs: ' + str(len(dna_regions)) + '\n')
76
+ out_file.write(
77
+ 'Contig\tGenes\tORFs\tPerfect_Matches\tPartial_Matches\tMissed_Genes\tUnmatched_ORFs\tMulti_Matched_ORFs\n')
78
+
79
+ for dna_region, result in results.items():
80
+ num_current_genes = len(dna_regions[dna_region][2])
81
+ num_orfs = result['pred_metrics']['Number_of_ORFs']
82
+ num_perfect = result['pred_metrics']['Number_of_Perfect_Matches']
83
+ num_partial = len(result['pred_metrics']['partial_Hits'])
84
+ num_missed = len(result['rep_metrics']['genes_Undetected'])
85
+ num_unmatched = len(result['pred_metrics']['unmatched_ORFs'])
86
+ num_multi = len(result['pred_metrics']['multi_Matched_ORFs'])
87
+ # Collect summary for this contig
88
+ if options.outdir:
89
+ contig_summaries.append([
90
+ dna_region, num_current_genes, num_orfs, num_perfect, num_partial, num_missed, num_unmatched, num_multi
91
+ ])
92
+ ###
93
+ num_current_genes = len(dna_regions[dna_region][2])
94
+ print("These are the results for: " + dna_region + '\n')
95
+ ############################################# To get default output filename from input file details
96
+ genome_name = options.reference_annotation.split('/')[-1].split('.')[0]
97
+ rep_metric_description, rep_metrics = get_rep_metrics(result)
98
+ all_metric_description, all_metrics = get_all_metrics(result)
99
+
100
+ print('Current Contig: ' + str(dna_region))
101
+ print('Number of Genes: ' + str(num_current_genes))
102
+ print('Number of ORFs: ' + str(result['pred_metrics']['Number_of_ORFs']))
103
+ print('Perfect Matches: ' + str(result['pred_metrics']['Number_of_Perfect_Matches']) + ' [' + str(num_current_genes)+ '] - '+ format(100 * result['pred_metrics']['Number_of_Perfect_Matches']/num_current_genes,'.2f')+'%')
104
+ print('Partial Matches: ' + str(len(result['pred_metrics']['partial_Hits'])) + ' [' + str(num_current_genes)+ '] - '+ format(100 * len(result['pred_metrics']['partial_Hits'])/num_current_genes,'.2f')+'%')
105
+ print('Missed Genes: ' + str(len(result['rep_metrics']['genes_Undetected'])) + ' [' + str(num_current_genes)+ '] - '+ format(100 * len(result['rep_metrics']['genes_Undetected'])/num_current_genes,'.2f')+'%')
106
+ print('Unmatched ORFs: ' + str(len(result['pred_metrics']['unmatched_ORFs'])) + ' [' + str(num_current_genes)+ '] - '+ format(100 * len(result['pred_metrics']['unmatched_ORFs'])/num_current_genes,'.2f')+'%')
107
+ print('Multi-matched ORFs: ' + str(len(result['pred_metrics']['multi_Matched_ORFs'])) + ' [' + str(num_current_genes)+ '] - '+ format(100 * len(result['pred_metrics']['multi_Matched_ORFs'])/num_current_genes,'.2f')+'%')
108
+
109
+ if options.outdir:
110
+ # Prepare output directory and file names for each contig
111
+ contig_save = dna_region.replace('/', '_').replace('\\', '_')
112
+ contig_dir = os.path.join(options.outdir, contig_save)
113
+ os.makedirs(contig_dir, exist_ok=True)
114
+ summary_file = os.path.join(contig_dir, "summary.txt")
115
+ csv_file = os.path.join(contig_dir, "metrics.csv")
116
+ perfect_fasta = os.path.join(contig_dir, "perfect_matches.fasta")
117
+ partial_fasta = os.path.join(contig_dir, "partial_matches.fasta")
118
+ missed_fasta = os.path.join(contig_dir, "missed_genes.fasta")
119
+ unmatched_fasta = os.path.join(contig_dir, "unmatched_orfs.fasta")
120
+ multi_fasta = os.path.join(contig_dir, "multi_matched_orfs.fasta")
121
+
122
+ # Write summary to text file
123
+ with open(summary_file, 'w', encoding='utf-8') as sf:
124
+ sf.write('Current Contig: ' + str(dna_region) + '\n')
125
+ sf.write('Number of Genes: ' + str(num_current_genes) + '\n')
126
+ sf.write('Number of ORFs: ' + str(result['pred_metrics']['Number_of_ORFs']) + '\n')
127
+ sf.write('Perfect Matches: ' + str(result['pred_metrics']['Number_of_Perfect_Matches']) + ' [' + str(
128
+ num_current_genes) + '] - ' + format(
129
+ 100 * result['pred_metrics']['Number_of_Perfect_Matches'] / num_current_genes, '.2f') + '%\n')
130
+ sf.write('Partial Matches: ' + str(len(result['pred_metrics']['partial_Hits'])) + ' [' + str(
131
+ num_current_genes) + '] - ' + format(
132
+ 100 * len(result['pred_metrics']['partial_Hits']) / num_current_genes, '.2f') + '%\n')
133
+ sf.write('Missed Genes: ' + str(len(result['rep_metrics']['genes_Undetected'])) + ' [' + str(
134
+ num_current_genes) + '] - ' + format(
135
+ 100 * len(result['rep_metrics']['genes_Undetected']) / num_current_genes, '.2f') + '%\n')
136
+ sf.write('Unmatched ORFs: ' + str(len(result['pred_metrics']['unmatched_ORFs'])) + ' [' + str(
137
+ num_current_genes) + '] - ' + format(
138
+ 100 * len(result['pred_metrics']['unmatched_ORFs']) / num_current_genes, '.2f') + '%\n')
139
+ sf.write('Multi-matched ORFs: ' + str(len(result['pred_metrics']['multi_Matched_ORFs'])) + ' [' + str(
140
+ num_current_genes) + '] - ' + format(
141
+ 100 * len(result['pred_metrics']['multi_Matched_ORFs']) / num_current_genes, '.2f') + '%\n')
142
+
143
+
144
+ # Write metrics to CSV
145
+ with open(csv_file, 'w', newline='\n', encoding='utf-8') as out_file:
146
+ tool_out = csv.writer(out_file, quoting=csv.QUOTE_NONE, escapechar=" ")
147
+ tool_out.writerow(['Representative_Metrics:'])
148
+ tool_out.writerow(rep_metric_description.split(','))
149
+ tool_out.writerow([*rep_metrics])
150
+ tool_out.writerow(['Prediction_Metrics:'])
151
+ tool_out.writerow(all_metric_description.split(','))
152
+ tool_out.writerow([*all_metrics])
153
+ tool_out.writerow(['Reference_CDS_Gene_Coverage_of_Genome'])
154
+ tool_out.writerow([''.join(map(str, result['rep_metrics']['gene_Coverage_Genome']))])
155
+ tool_out.writerow(['Predicted_CDS_Coverage_of_Genome'])
156
+ tool_out.writerow([''.join(map(str, result['pred_metrics']['orf_Coverage_Genome']))])
157
+ tool_out.writerow(['Matched_Predicted_CDS_Coverage_of_Genome'])
158
+ tool_out.writerow([''.join(map(str, result['pred_metrics']['matched_ORF_Coverage_Genome']))])
159
+ # tool_out.writerow(['Start_Position_Difference:'])
160
+ # tool_out.writerow(result.get('start_Difference', []))
161
+ # tool_out.writerow(['Stop_Position_Difference:'])
162
+ # tool_out.writerow(result.get('stop_Difference', []))
163
+ # tool_out.writerow(['Alternative_Starts_Predicted:'])
164
+ # tool_out.writerow(result.get('other_Starts', []))
165
+ # tool_out.writerow(['Alternative_Stops_Predicted:'])
166
+ # tool_out.writerow(result.get('other_Stops', []))
167
+ # tool_out.writerow(['Undetected_Gene_Metrics:'])
168
+ # tool_out.writerow([
169
+ # 'ATG_Start,GTG_Start,TTG_Start,ATT_Start,CTG_Start,Alternative_Start_Codon,TGA_Stop,TAA_Stop,TAG_Stop,Alternative_Stop_Codon,Median_Length,ORFs_on_Positive_Strand,ORFs_on_Negative_Strand'
170
+ # ])
171
+ # tool_out.writerow(result.get('undetected_Gene_Metrics', []))
172
+ # tool_out.writerow(['\nPredicted_CDSs_Without_Corresponding_Gene_In_Reference_Metrics:'])
173
+ # tool_out.writerow([
174
+ # 'ATG_Start,GTG_Start,TTG_Start,ATT_Start,CTG_Start,Alternative_Start_Codon,TGA_Stop,TAA_Stop,TAG_Stop,Alternative_Stop_Codon,Median_Length,ORFs_on_Positive_Strand,ORFs_on_Negative_Strand'
175
+ # ])
176
+ # tool_out.writerow(result.get('unmatched_ORF_Metrics', []))
177
+
178
+ # Write perfect matches to FASTA
179
+ with open(perfect_fasta, 'w', encoding='utf-8') as f:
180
+ for key, value in result['pred_metrics'].get('perfect_Matches', {}).items():
181
+ key_parts = key.split(',')
182
+ id = f">{genome_name}_{key_parts[0]}_{key_parts[1]}_{key_parts[2]}_{key_parts[5]}"
183
+ f.write(f"{id}\n{value}\n")
184
+
185
+ # Write partial matches to FASTA
186
+ with open(partial_fasta, 'w', encoding='utf-8') as f:
187
+ for key, value in result['pred_metrics'].get('partial_Hits', {}).items():
188
+ key_parts = key.split(';')
189
+ gene_Seq = value[0]
190
+ orf_Seq = value[1]
191
+ f.write(f">{key_parts[0]}_gene\n{gene_Seq}\n>{key_parts[1]}_orf\n{orf_Seq}\n")
192
+
193
+ # Write missed genes to FASTA
194
+ with open(missed_fasta, 'w', encoding='utf-8') as f:
195
+ for key, value in result['rep_metrics'].get('genes_Undetected', {}).items():
196
+ key_parts = key.split(',')
197
+ id = f">{genome_name}_{key_parts[0]}_{key_parts[1]}_{key_parts[2]}"
198
+ f.write(f"{id}\n{value}\n")
199
+
200
+ # Write unmatched ORFs to FASTA
201
+ with open(unmatched_fasta, 'w', encoding='utf-8') as f:
202
+ for key, value in result['pred_metrics'].get('unmatched_ORFs', {}).items():
203
+ key_parts = key.split(',')
204
+ id = f">{options.tool}_{key_parts[0]}_{key_parts[1]}_{key_parts[2]}"
205
+ f.write(f"{id}\n{value}\n")
206
+
207
+ # Write multi-matched ORFs to FASTA
208
+ with open(multi_fasta, 'w', encoding='utf-8') as f:
209
+ for key, value in result['pred_metrics'].get('multi_Matched_ORFs', {}).items():
210
+ key_parts = key.split(',')
211
+ multi = f">Predicted_CDS:{key_parts[0]}-{key_parts[1]}_Genes:{'|'.join(value)}"
212
+ f.write(f"{multi}\n")
213
+
214
+ # After all contigs, append the summary table to the main summary file
215
+ if options.outdir and contig_summaries:
216
+ with open(f"{base_out}_summary.txt", 'a', encoding='utf-8') as out_file:
217
+ for row in contig_summaries:
218
+ out_file.write('\t'.join(map(str, row)) + '\n')
219
+ # Optionally, add overall totals
220
+ total_genes = sum(row[1] for row in contig_summaries)
221
+ total_orfs = sum(row[2] for row in contig_summaries)
222
+ total_perfect = sum(row[3] for row in contig_summaries)
223
+ total_partial = sum(row[4] for row in contig_summaries)
224
+ total_missed = sum(row[5] for row in contig_summaries)
225
+ total_unmatched = sum(row[6] for row in contig_summaries)
226
+ total_multi = sum(row[7] for row in contig_summaries)
227
+ out_file.write('\nOverall Summary:\n')
228
+ out_file.write(f'Number of Genes: {total_genes}\n')
229
+ out_file.write(f'Number of ORFs: {total_orfs}\n')
230
+ out_file.write(
231
+ f'Perfect Matches: {total_perfect} [{total_genes}] - {format(100 * total_perfect / total_genes, ".2f")}%\n')
232
+ out_file.write(
233
+ f'Partial Matches: {total_partial} [{total_genes}] - {format(100 * total_partial / total_genes, ".2f")}%\n')
234
+ out_file.write(
235
+ f'Missed Genes: {total_missed} [{total_genes}] - {format(100 * total_missed / total_genes, ".2f")}%\n')
236
+ out_file.write(
237
+ f'Unmatched ORFs: {total_unmatched} [{total_genes}] - {format(100 * total_unmatched / total_genes, ".2f")}%\n')
238
+ out_file.write(
239
+ f'Multi-matched ORFs: {total_multi} [{total_genes}] - {format(100 * total_multi / total_genes, ".2f")}%\n')
240
+
241
+ # Print combined metrics to stdout
242
+ print("\nCombined metrics for all contigs:")
243
+
244
+ print(f'Number of Genes: {total_genes}')
245
+ print(f'Number of ORFs: {total_orfs}')
246
+ print(
247
+ f'Perfect Matches: {total_perfect} [{total_genes}] - {format(100 * total_perfect / total_genes, ".2f")}%')
248
+ print(
249
+ f'Partial Matches: {total_partial} [{total_genes}] - {format(100 * total_partial / total_genes, ".2f")}%')
250
+ print(f'Missed Genes: {total_missed} [{total_genes}] - {format(100 * total_missed / total_genes, ".2f")}%')
251
+ print(
252
+ f'Unmatched ORFs: {total_unmatched} [{total_genes}] - {format(100 * total_unmatched / total_genes, ".2f")}%')
253
+ print(
254
+ f'Multi-matched ORFs: {total_multi} [{total_genes}] - {format(100 * total_multi / total_genes, ".2f")}%')
255
+
256
+
257
+
258
+
149
259
 
150
260
  def main():
151
261
  print("Thank you for using ORForise\nPlease report any issues to: https://github.com/NickJD/ORForise/issues\n#####")
@@ -154,7 +264,7 @@ def main():
154
264
  parser._action_groups.pop()
155
265
 
156
266
  required = parser.add_argument_group('Required Arguments')
157
- required.add_argument('-dna', dest='genome_DNA', required=True, help='Genome DNA file (.fa) which both annotations '
267
+ required.add_argument('-dna', dest='genome_dna', required=True, help='Genome DNA file (.fa) which both annotations '
158
268
  'are based on')
159
269
  required.add_argument('-ref', dest='reference_annotation', required=True,
160
270
  help='Which reference annotation file to use as reference?')
@@ -164,19 +274,27 @@ def main():
164
274
  ' are compared individually via separate files')
165
275
 
166
276
  optional = parser.add_argument_group('Optional Arguments')
277
+ optional.add_argument('-gene_ident', action='store', dest='gene_ident', default='CDS',
278
+ help='What features to consider as genes? - Default: CDS - '
279
+ 'Provide comma separated list of features to consider as genes (e.g. CDS,exon)')
167
280
  optional.add_argument('-rt', dest='reference_tool', required=False,
168
281
  help='What type of Annotation to compare to? -- Leave blank for Ensembl reference'
169
282
  '- Provide tool name to compare output from two tools')
170
283
 
171
284
  output = parser.add_argument_group('Output')
172
- output.add_argument('-o', dest='outname', required=False,
173
- help='Define full output filename (format is CSV) - If not provided, summary will be printed to std-out')
285
+ output.add_argument('-o', dest='outdir', required=False,
286
+ help='Define directory where detailed output should be places - If not provided, summary will be printed to std-out')
287
+ output.add_argument('-n', dest='outname', required=False,
288
+ help='Define output file name - Mandatory is -o is provided: <outname>_<contig_id>_ORF_Comparison.csv')
174
289
 
175
290
  misc = parser.add_argument_group('Misc')
176
291
  misc.add_argument('-v', dest='verbose', default='False', type=eval, choices=[True, False],
177
292
  help='Default - False: Print out runtime status')
178
293
  options = parser.parse_args()
179
294
 
295
+ if options.outdir and not options.outname:
296
+ sys.exit("Error: If -o (outdir) is provided, you must also provide -n (outname).")
297
+
180
298
  comparator(options)
181
299
 
182
300
  if __name__ == "__main__":