ORForise 1.4.3__py3-none-any.whl → 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ORForise/Aggregate_Compare.py +318 -133
- ORForise/Annotation_Compare.py +243 -125
- ORForise/Comparator.py +600 -552
- ORForise/ORForise_Analysis/genome_Metrics.py +51 -33
- ORForise/Tools/Augustus/Augustus.py +30 -23
- ORForise/Tools/Balrog/Balrog.py +31 -23
- ORForise/Tools/EasyGene/EasyGene.py +30 -22
- ORForise/Tools/FGENESB/FGENESB.py +32 -25
- ORForise/Tools/FragGeneScan/FragGeneScan.py +29 -22
- ORForise/Tools/GFF/GFF.py +51 -47
- ORForise/Tools/GLIMMER_3/GLIMMER_3.py +34 -27
- ORForise/Tools/GeneMark/GeneMark.py +46 -40
- ORForise/Tools/GeneMark_HA/GeneMark_HA.py +29 -22
- ORForise/Tools/GeneMark_HMM/GeneMark_HMM.py +29 -22
- ORForise/Tools/GeneMark_S/GeneMark_S.py +29 -22
- ORForise/Tools/GeneMark_S_2/GeneMark_S_2.py +29 -25
- ORForise/Tools/MetaGene/MetaGene.py +29 -22
- ORForise/Tools/MetaGeneAnnotator/MetaGeneAnnotator.py +30 -23
- ORForise/Tools/MetaGeneMark/MetaGeneMark.py +30 -23
- ORForise/Tools/Prodigal/Prodigal.py +30 -26
- ORForise/Tools/Prokka/Prokka.py +30 -25
- ORForise/Tools/StORF_Reporter/StORF_Reporter.py +33 -26
- ORForise/Tools/TransDecoder/TransDecoder.py +29 -22
- ORForise/utils.py +204 -2
- {orforise-1.4.3.dist-info → orforise-1.5.0.dist-info}/METADATA +5 -5
- {orforise-1.4.3.dist-info → orforise-1.5.0.dist-info}/RECORD +30 -30
- {orforise-1.4.3.dist-info → orforise-1.5.0.dist-info}/entry_points.txt +5 -0
- {orforise-1.4.3.dist-info → orforise-1.5.0.dist-info}/WHEEL +0 -0
- {orforise-1.4.3.dist-info → orforise-1.5.0.dist-info}/licenses/LICENSE +0 -0
- {orforise-1.4.3.dist-info → orforise-1.5.0.dist-info}/top_level.txt +0 -0
ORForise/Annotation_Compare.py
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
from importlib import import_module
|
|
2
2
|
import argparse
|
|
3
|
-
import
|
|
4
|
-
import csv
|
|
5
|
-
|
|
3
|
+
import sys,os
|
|
4
|
+
import gzip,csv
|
|
5
|
+
|
|
6
6
|
try:
|
|
7
7
|
from Comparator import tool_comparison
|
|
8
8
|
except ImportError:
|
|
@@ -16,43 +16,29 @@ except ImportError:
|
|
|
16
16
|
##########################
|
|
17
17
|
|
|
18
18
|
def comparator(options):
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
line = line.split('\t')
|
|
28
|
-
try:
|
|
29
|
-
if "CDS" in line[2] and len(line) == 9:
|
|
30
|
-
start = int(line[3])
|
|
31
|
-
stop = int(line[4])
|
|
32
|
-
strand = line[6]
|
|
33
|
-
gene_details = [start,stop,strand]
|
|
34
|
-
ref_genes.update({count:gene_details})
|
|
35
|
-
count += 1
|
|
36
|
-
except IndexError:
|
|
37
|
-
continue
|
|
38
|
-
ref_genes = sortGenes(ref_genes) # sorted GFF refernce
|
|
39
|
-
else: # IF using a tool as reference
|
|
19
|
+
|
|
20
|
+
try:
|
|
21
|
+
try: # Detect whether fasta/gff files are .gz or text and read accordingly
|
|
22
|
+
fasta_in = gzip.open(options.genome_dna, 'rt')
|
|
23
|
+
dna_regions = fasta_load(fasta_in)
|
|
24
|
+
except:
|
|
25
|
+
fasta_in = open(options.genome_dna, 'r', encoding='unicode_escape')
|
|
26
|
+
dna_regions = fasta_load(fasta_in)
|
|
40
27
|
try:
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
except
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
ref_genes.update({i:[pos[0],pos[1],details[0]]})
|
|
28
|
+
gff_in = gzip.open(options.reference_annotation, 'rt')
|
|
29
|
+
dna_regions = gff_load(options, gff_in, dna_regions)
|
|
30
|
+
except:
|
|
31
|
+
gff_in = open(options.reference_annotation, 'r', encoding='unicode_escape')
|
|
32
|
+
dna_regions = gff_load(options, gff_in, dna_regions)
|
|
33
|
+
except AttributeError:
|
|
34
|
+
sys.exit("Attribute Error:\nStORF'ed GFF probably already exists - Must be deleted before running (-overwrite)")
|
|
35
|
+
except FileNotFoundError:
|
|
36
|
+
split_path = options.gff.split(os.sep)
|
|
37
|
+
sys.exit("Directory '" + split_path[-2] + "' missing fna/gff files")
|
|
38
|
+
###############################################
|
|
39
|
+
total_ref_genes = sum(
|
|
40
|
+
len(v[2]) if isinstance(v[2], (list, tuple, set, dict, str)) else 1 for v in dna_regions.values())
|
|
41
|
+
|
|
56
42
|
#############################################
|
|
57
43
|
try:
|
|
58
44
|
tool_ = import_module('Tools.' + options.tool + '.' + options.tool, package='my_current_pkg')
|
|
@@ -62,90 +48,214 @@ def comparator(options):
|
|
|
62
48
|
except ModuleNotFoundError:
|
|
63
49
|
sys.exit("Tool not available - Did you get the name right?")
|
|
64
50
|
tool_ = getattr(tool_, options.tool)
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
ref_genes, orfs, genome_Seq, options.verbose)
|
|
68
|
-
############################################# To get default output filename from input file details
|
|
69
|
-
genome_name = options.reference_annotation.split('/')[-1].split('.')[0]
|
|
70
|
-
metric_description = list(all_Metrics.keys())
|
|
71
|
-
metrics = list(all_Metrics.values())
|
|
72
|
-
rep_metric_description = list(all_rep_Metrics.keys())
|
|
73
|
-
rep_metrics = list(all_rep_Metrics.values())
|
|
51
|
+
all_orfs = tool_(options.tool_prediction, dna_regions)
|
|
52
|
+
results = tool_comparison(all_orfs, dna_regions, options.verbose)
|
|
74
53
|
############## Printing to std-out and optional csv file
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
54
|
+
# Ensure the output directory exists
|
|
55
|
+
os.makedirs(options.outdir, exist_ok=True)
|
|
56
|
+
# Use outname as a directory, basename for files is output-outname
|
|
57
|
+
base_out = os.path.join(options.outdir, f"{os.path.basename(options.outname)}")
|
|
58
|
+
|
|
59
|
+
# Prepare to collect summary stats for all contigs
|
|
60
|
+
contig_summaries = []
|
|
61
|
+
|
|
62
|
+
if options.outdir:
|
|
63
|
+
# Ensure the output directory exists
|
|
64
|
+
os.makedirs(options.outdir, exist_ok=True)
|
|
65
|
+
# Use outname as a directory, basename for files is output-outname
|
|
66
|
+
base_out = os.path.join(options.outdir, f"{os.path.basename(options.outname)}")
|
|
67
|
+
with open(f"{base_out}_summary.txt", 'w', encoding='utf-8') as out_file:
|
|
68
|
+
out_file.write('Genome Used: ' + str(options.genome_dna.split('/')[-1]) + '\n')
|
|
69
|
+
if options.reference_tool:
|
|
70
|
+
out_file.write('Reference Tool Used: ' + str(options.reference_tool) + '\n')
|
|
71
|
+
else:
|
|
72
|
+
out_file.write('Reference Used: ' + str(options.reference_annotation.split('/')[-1]) + '\n')
|
|
73
|
+
out_file.write('Tool Compared: ' + str(options.tool) + '\n')
|
|
74
|
+
out_file.write('Total Number of Reference Genes: ' + str(total_ref_genes) + '\n')
|
|
75
|
+
out_file.write('Number of Contigs: ' + str(len(dna_regions)) + '\n')
|
|
76
|
+
out_file.write(
|
|
77
|
+
'Contig\tGenes\tORFs\tPerfect_Matches\tPartial_Matches\tMissed_Genes\tUnmatched_ORFs\tMulti_Matched_ORFs\n')
|
|
78
|
+
|
|
79
|
+
for dna_region, result in results.items():
|
|
80
|
+
num_current_genes = len(dna_regions[dna_region][2])
|
|
81
|
+
num_orfs = result['pred_metrics']['Number_of_ORFs']
|
|
82
|
+
num_perfect = result['pred_metrics']['Number_of_Perfect_Matches']
|
|
83
|
+
num_partial = len(result['pred_metrics']['partial_Hits'])
|
|
84
|
+
num_missed = len(result['rep_metrics']['genes_Undetected'])
|
|
85
|
+
num_unmatched = len(result['pred_metrics']['unmatched_ORFs'])
|
|
86
|
+
num_multi = len(result['pred_metrics']['multi_Matched_ORFs'])
|
|
87
|
+
# Collect summary for this contig
|
|
88
|
+
if options.outdir:
|
|
89
|
+
contig_summaries.append([
|
|
90
|
+
dna_region, num_current_genes, num_orfs, num_perfect, num_partial, num_missed, num_unmatched, num_multi
|
|
91
|
+
])
|
|
92
|
+
###
|
|
93
|
+
num_current_genes = len(dna_regions[dna_region][2])
|
|
94
|
+
print("These are the results for: " + dna_region + '\n')
|
|
95
|
+
############################################# To get default output filename from input file details
|
|
96
|
+
genome_name = options.reference_annotation.split('/')[-1].split('.')[0]
|
|
97
|
+
rep_metric_description, rep_metrics = get_rep_metrics(result)
|
|
98
|
+
all_metric_description, all_metrics = get_all_metrics(result)
|
|
99
|
+
|
|
100
|
+
print('Current Contig: ' + str(dna_region))
|
|
101
|
+
print('Number of Genes: ' + str(num_current_genes))
|
|
102
|
+
print('Number of ORFs: ' + str(result['pred_metrics']['Number_of_ORFs']))
|
|
103
|
+
print('Perfect Matches: ' + str(result['pred_metrics']['Number_of_Perfect_Matches']) + ' [' + str(num_current_genes)+ '] - '+ format(100 * result['pred_metrics']['Number_of_Perfect_Matches']/num_current_genes,'.2f')+'%')
|
|
104
|
+
print('Partial Matches: ' + str(len(result['pred_metrics']['partial_Hits'])) + ' [' + str(num_current_genes)+ '] - '+ format(100 * len(result['pred_metrics']['partial_Hits'])/num_current_genes,'.2f')+'%')
|
|
105
|
+
print('Missed Genes: ' + str(len(result['rep_metrics']['genes_Undetected'])) + ' [' + str(num_current_genes)+ '] - '+ format(100 * len(result['rep_metrics']['genes_Undetected'])/num_current_genes,'.2f')+'%')
|
|
106
|
+
print('Unmatched ORFs: ' + str(len(result['pred_metrics']['unmatched_ORFs'])) + ' [' + str(num_current_genes)+ '] - '+ format(100 * len(result['pred_metrics']['unmatched_ORFs'])/num_current_genes,'.2f')+'%')
|
|
107
|
+
print('Multi-matched ORFs: ' + str(len(result['pred_metrics']['multi_Matched_ORFs'])) + ' [' + str(num_current_genes)+ '] - '+ format(100 * len(result['pred_metrics']['multi_Matched_ORFs'])/num_current_genes,'.2f')+'%')
|
|
108
|
+
|
|
109
|
+
if options.outdir:
|
|
110
|
+
# Prepare output directory and file names for each contig
|
|
111
|
+
contig_save = dna_region.replace('/', '_').replace('\\', '_')
|
|
112
|
+
contig_dir = os.path.join(options.outdir, contig_save)
|
|
113
|
+
os.makedirs(contig_dir, exist_ok=True)
|
|
114
|
+
summary_file = os.path.join(contig_dir, "summary.txt")
|
|
115
|
+
csv_file = os.path.join(contig_dir, "metrics.csv")
|
|
116
|
+
perfect_fasta = os.path.join(contig_dir, "perfect_matches.fasta")
|
|
117
|
+
partial_fasta = os.path.join(contig_dir, "partial_matches.fasta")
|
|
118
|
+
missed_fasta = os.path.join(contig_dir, "missed_genes.fasta")
|
|
119
|
+
unmatched_fasta = os.path.join(contig_dir, "unmatched_orfs.fasta")
|
|
120
|
+
multi_fasta = os.path.join(contig_dir, "multi_matched_orfs.fasta")
|
|
121
|
+
|
|
122
|
+
# Write summary to text file
|
|
123
|
+
with open(summary_file, 'w', encoding='utf-8') as sf:
|
|
124
|
+
sf.write('Current Contig: ' + str(dna_region) + '\n')
|
|
125
|
+
sf.write('Number of Genes: ' + str(num_current_genes) + '\n')
|
|
126
|
+
sf.write('Number of ORFs: ' + str(result['pred_metrics']['Number_of_ORFs']) + '\n')
|
|
127
|
+
sf.write('Perfect Matches: ' + str(result['pred_metrics']['Number_of_Perfect_Matches']) + ' [' + str(
|
|
128
|
+
num_current_genes) + '] - ' + format(
|
|
129
|
+
100 * result['pred_metrics']['Number_of_Perfect_Matches'] / num_current_genes, '.2f') + '%\n')
|
|
130
|
+
sf.write('Partial Matches: ' + str(len(result['pred_metrics']['partial_Hits'])) + ' [' + str(
|
|
131
|
+
num_current_genes) + '] - ' + format(
|
|
132
|
+
100 * len(result['pred_metrics']['partial_Hits']) / num_current_genes, '.2f') + '%\n')
|
|
133
|
+
sf.write('Missed Genes: ' + str(len(result['rep_metrics']['genes_Undetected'])) + ' [' + str(
|
|
134
|
+
num_current_genes) + '] - ' + format(
|
|
135
|
+
100 * len(result['rep_metrics']['genes_Undetected']) / num_current_genes, '.2f') + '%\n')
|
|
136
|
+
sf.write('Unmatched ORFs: ' + str(len(result['pred_metrics']['unmatched_ORFs'])) + ' [' + str(
|
|
137
|
+
num_current_genes) + '] - ' + format(
|
|
138
|
+
100 * len(result['pred_metrics']['unmatched_ORFs']) / num_current_genes, '.2f') + '%\n')
|
|
139
|
+
sf.write('Multi-matched ORFs: ' + str(len(result['pred_metrics']['multi_Matched_ORFs'])) + ' [' + str(
|
|
140
|
+
num_current_genes) + '] - ' + format(
|
|
141
|
+
100 * len(result['pred_metrics']['multi_Matched_ORFs']) / num_current_genes, '.2f') + '%\n')
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
# Write metrics to CSV
|
|
145
|
+
with open(csv_file, 'w', newline='\n', encoding='utf-8') as out_file:
|
|
146
|
+
tool_out = csv.writer(out_file, quoting=csv.QUOTE_NONE, escapechar=" ")
|
|
147
|
+
tool_out.writerow(['Representative_Metrics:'])
|
|
148
|
+
tool_out.writerow(rep_metric_description.split(','))
|
|
149
|
+
tool_out.writerow([*rep_metrics])
|
|
150
|
+
tool_out.writerow(['Prediction_Metrics:'])
|
|
151
|
+
tool_out.writerow(all_metric_description.split(','))
|
|
152
|
+
tool_out.writerow([*all_metrics])
|
|
153
|
+
tool_out.writerow(['Reference_CDS_Gene_Coverage_of_Genome'])
|
|
154
|
+
tool_out.writerow([''.join(map(str, result['rep_metrics']['gene_Coverage_Genome']))])
|
|
155
|
+
tool_out.writerow(['Predicted_CDS_Coverage_of_Genome'])
|
|
156
|
+
tool_out.writerow([''.join(map(str, result['pred_metrics']['orf_Coverage_Genome']))])
|
|
157
|
+
tool_out.writerow(['Matched_Predicted_CDS_Coverage_of_Genome'])
|
|
158
|
+
tool_out.writerow([''.join(map(str, result['pred_metrics']['matched_ORF_Coverage_Genome']))])
|
|
159
|
+
# tool_out.writerow(['Start_Position_Difference:'])
|
|
160
|
+
# tool_out.writerow(result.get('start_Difference', []))
|
|
161
|
+
# tool_out.writerow(['Stop_Position_Difference:'])
|
|
162
|
+
# tool_out.writerow(result.get('stop_Difference', []))
|
|
163
|
+
# tool_out.writerow(['Alternative_Starts_Predicted:'])
|
|
164
|
+
# tool_out.writerow(result.get('other_Starts', []))
|
|
165
|
+
# tool_out.writerow(['Alternative_Stops_Predicted:'])
|
|
166
|
+
# tool_out.writerow(result.get('other_Stops', []))
|
|
167
|
+
# tool_out.writerow(['Undetected_Gene_Metrics:'])
|
|
168
|
+
# tool_out.writerow([
|
|
169
|
+
# 'ATG_Start,GTG_Start,TTG_Start,ATT_Start,CTG_Start,Alternative_Start_Codon,TGA_Stop,TAA_Stop,TAG_Stop,Alternative_Stop_Codon,Median_Length,ORFs_on_Positive_Strand,ORFs_on_Negative_Strand'
|
|
170
|
+
# ])
|
|
171
|
+
# tool_out.writerow(result.get('undetected_Gene_Metrics', []))
|
|
172
|
+
# tool_out.writerow(['\nPredicted_CDSs_Without_Corresponding_Gene_In_Reference_Metrics:'])
|
|
173
|
+
# tool_out.writerow([
|
|
174
|
+
# 'ATG_Start,GTG_Start,TTG_Start,ATT_Start,CTG_Start,Alternative_Start_Codon,TGA_Stop,TAA_Stop,TAG_Stop,Alternative_Stop_Codon,Median_Length,ORFs_on_Positive_Strand,ORFs_on_Negative_Strand'
|
|
175
|
+
# ])
|
|
176
|
+
# tool_out.writerow(result.get('unmatched_ORF_Metrics', []))
|
|
177
|
+
|
|
178
|
+
# Write perfect matches to FASTA
|
|
179
|
+
with open(perfect_fasta, 'w', encoding='utf-8') as f:
|
|
180
|
+
for key, value in result['pred_metrics'].get('perfect_Matches', {}).items():
|
|
181
|
+
key_parts = key.split(',')
|
|
182
|
+
id = f">{genome_name}_{key_parts[0]}_{key_parts[1]}_{key_parts[2]}_{key_parts[5]}"
|
|
183
|
+
f.write(f"{id}\n{value}\n")
|
|
184
|
+
|
|
185
|
+
# Write partial matches to FASTA
|
|
186
|
+
with open(partial_fasta, 'w', encoding='utf-8') as f:
|
|
187
|
+
for key, value in result['pred_metrics'].get('partial_Hits', {}).items():
|
|
188
|
+
key_parts = key.split(';')
|
|
189
|
+
gene_Seq = value[0]
|
|
190
|
+
orf_Seq = value[1]
|
|
191
|
+
f.write(f">{key_parts[0]}_gene\n{gene_Seq}\n>{key_parts[1]}_orf\n{orf_Seq}\n")
|
|
192
|
+
|
|
193
|
+
# Write missed genes to FASTA
|
|
194
|
+
with open(missed_fasta, 'w', encoding='utf-8') as f:
|
|
195
|
+
for key, value in result['rep_metrics'].get('genes_Undetected', {}).items():
|
|
196
|
+
key_parts = key.split(',')
|
|
197
|
+
id = f">{genome_name}_{key_parts[0]}_{key_parts[1]}_{key_parts[2]}"
|
|
198
|
+
f.write(f"{id}\n{value}\n")
|
|
199
|
+
|
|
200
|
+
# Write unmatched ORFs to FASTA
|
|
201
|
+
with open(unmatched_fasta, 'w', encoding='utf-8') as f:
|
|
202
|
+
for key, value in result['pred_metrics'].get('unmatched_ORFs', {}).items():
|
|
203
|
+
key_parts = key.split(',')
|
|
204
|
+
id = f">{options.tool}_{key_parts[0]}_{key_parts[1]}_{key_parts[2]}"
|
|
205
|
+
f.write(f"{id}\n{value}\n")
|
|
206
|
+
|
|
207
|
+
# Write multi-matched ORFs to FASTA
|
|
208
|
+
with open(multi_fasta, 'w', encoding='utf-8') as f:
|
|
209
|
+
for key, value in result['pred_metrics'].get('multi_Matched_ORFs', {}).items():
|
|
210
|
+
key_parts = key.split(',')
|
|
211
|
+
multi = f">Predicted_CDS:{key_parts[0]}-{key_parts[1]}_Genes:{'|'.join(value)}"
|
|
212
|
+
f.write(f"{multi}\n")
|
|
213
|
+
|
|
214
|
+
# After all contigs, append the summary table to the main summary file
|
|
215
|
+
if options.outdir and contig_summaries:
|
|
216
|
+
with open(f"{base_out}_summary.txt", 'a', encoding='utf-8') as out_file:
|
|
217
|
+
for row in contig_summaries:
|
|
218
|
+
out_file.write('\t'.join(map(str, row)) + '\n')
|
|
219
|
+
# Optionally, add overall totals
|
|
220
|
+
total_genes = sum(row[1] for row in contig_summaries)
|
|
221
|
+
total_orfs = sum(row[2] for row in contig_summaries)
|
|
222
|
+
total_perfect = sum(row[3] for row in contig_summaries)
|
|
223
|
+
total_partial = sum(row[4] for row in contig_summaries)
|
|
224
|
+
total_missed = sum(row[5] for row in contig_summaries)
|
|
225
|
+
total_unmatched = sum(row[6] for row in contig_summaries)
|
|
226
|
+
total_multi = sum(row[7] for row in contig_summaries)
|
|
227
|
+
out_file.write('\nOverall Summary:\n')
|
|
228
|
+
out_file.write(f'Number of Genes: {total_genes}\n')
|
|
229
|
+
out_file.write(f'Number of ORFs: {total_orfs}\n')
|
|
230
|
+
out_file.write(
|
|
231
|
+
f'Perfect Matches: {total_perfect} [{total_genes}] - {format(100 * total_perfect / total_genes, ".2f")}%\n')
|
|
232
|
+
out_file.write(
|
|
233
|
+
f'Partial Matches: {total_partial} [{total_genes}] - {format(100 * total_partial / total_genes, ".2f")}%\n')
|
|
234
|
+
out_file.write(
|
|
235
|
+
f'Missed Genes: {total_missed} [{total_genes}] - {format(100 * total_missed / total_genes, ".2f")}%\n')
|
|
236
|
+
out_file.write(
|
|
237
|
+
f'Unmatched ORFs: {total_unmatched} [{total_genes}] - {format(100 * total_unmatched / total_genes, ".2f")}%\n')
|
|
238
|
+
out_file.write(
|
|
239
|
+
f'Multi-matched ORFs: {total_multi} [{total_genes}] - {format(100 * total_multi / total_genes, ".2f")}%\n')
|
|
240
|
+
|
|
241
|
+
# Print combined metrics to stdout
|
|
242
|
+
print("\nCombined metrics for all contigs:")
|
|
243
|
+
|
|
244
|
+
print(f'Number of Genes: {total_genes}')
|
|
245
|
+
print(f'Number of ORFs: {total_orfs}')
|
|
246
|
+
print(
|
|
247
|
+
f'Perfect Matches: {total_perfect} [{total_genes}] - {format(100 * total_perfect / total_genes, ".2f")}%')
|
|
248
|
+
print(
|
|
249
|
+
f'Partial Matches: {total_partial} [{total_genes}] - {format(100 * total_partial / total_genes, ".2f")}%')
|
|
250
|
+
print(f'Missed Genes: {total_missed} [{total_genes}] - {format(100 * total_missed / total_genes, ".2f")}%')
|
|
251
|
+
print(
|
|
252
|
+
f'Unmatched ORFs: {total_unmatched} [{total_genes}] - {format(100 * total_unmatched / total_genes, ".2f")}%')
|
|
253
|
+
print(
|
|
254
|
+
f'Multi-matched ORFs: {total_multi} [{total_genes}] - {format(100 * total_multi / total_genes, ".2f")}%')
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
|
|
149
259
|
|
|
150
260
|
def main():
|
|
151
261
|
print("Thank you for using ORForise\nPlease report any issues to: https://github.com/NickJD/ORForise/issues\n#####")
|
|
@@ -154,7 +264,7 @@ def main():
|
|
|
154
264
|
parser._action_groups.pop()
|
|
155
265
|
|
|
156
266
|
required = parser.add_argument_group('Required Arguments')
|
|
157
|
-
required.add_argument('-dna', dest='
|
|
267
|
+
required.add_argument('-dna', dest='genome_dna', required=True, help='Genome DNA file (.fa) which both annotations '
|
|
158
268
|
'are based on')
|
|
159
269
|
required.add_argument('-ref', dest='reference_annotation', required=True,
|
|
160
270
|
help='Which reference annotation file to use as reference?')
|
|
@@ -164,19 +274,27 @@ def main():
|
|
|
164
274
|
' are compared individually via separate files')
|
|
165
275
|
|
|
166
276
|
optional = parser.add_argument_group('Optional Arguments')
|
|
277
|
+
optional.add_argument('-gene_ident', action='store', dest='gene_ident', default='CDS',
|
|
278
|
+
help='What features to consider as genes? - Default: CDS - '
|
|
279
|
+
'Provide comma separated list of features to consider as genes (e.g. CDS,exon)')
|
|
167
280
|
optional.add_argument('-rt', dest='reference_tool', required=False,
|
|
168
281
|
help='What type of Annotation to compare to? -- Leave blank for Ensembl reference'
|
|
169
282
|
'- Provide tool name to compare output from two tools')
|
|
170
283
|
|
|
171
284
|
output = parser.add_argument_group('Output')
|
|
172
|
-
output.add_argument('-o', dest='
|
|
173
|
-
help='Define
|
|
285
|
+
output.add_argument('-o', dest='outdir', required=False,
|
|
286
|
+
help='Define directory where detailed output should be places - If not provided, summary will be printed to std-out')
|
|
287
|
+
output.add_argument('-n', dest='outname', required=False,
|
|
288
|
+
help='Define output file name - Mandatory is -o is provided: <outname>_<contig_id>_ORF_Comparison.csv')
|
|
174
289
|
|
|
175
290
|
misc = parser.add_argument_group('Misc')
|
|
176
291
|
misc.add_argument('-v', dest='verbose', default='False', type=eval, choices=[True, False],
|
|
177
292
|
help='Default - False: Print out runtime status')
|
|
178
293
|
options = parser.parse_args()
|
|
179
294
|
|
|
295
|
+
if options.outdir and not options.outname:
|
|
296
|
+
sys.exit("Error: If -o (outdir) is provided, you must also provide -n (outname).")
|
|
297
|
+
|
|
180
298
|
comparator(options)
|
|
181
299
|
|
|
182
300
|
if __name__ == "__main__":
|