ORForise 1.6.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ORForise/Aggregate_Compare.py +378 -0
- ORForise/Annotation_Compare.py +317 -0
- ORForise/Annotation_Intersector.py +726 -0
- ORForise/Aux/StORF_Undetected/Completely_Undetected/Completey_Undetected.py +53 -0
- ORForise/Aux/StORF_Undetected/Completely_Undetected/__init__.py +0 -0
- ORForise/Aux/StORF_Undetected/StORF_Undetected.py +35 -0
- ORForise/Aux/StORF_Undetected/__init__.py +0 -0
- ORForise/Aux/StORF_Undetected/unvitiated_Genes/__init__.py +0 -0
- ORForise/Aux/StORF_Undetected/unvitiated_Genes/unvitiated_Missed_Genes.py +46 -0
- ORForise/Aux/TabToGFF/TabToGFF.py +140 -0
- ORForise/Aux/TabToGFF/__init__.py +0 -0
- ORForise/Aux/__init__.py +0 -0
- ORForise/Comparator.py +882 -0
- ORForise/Convert_To_GFF.py +141 -0
- ORForise/GFF_Adder.py +543 -0
- ORForise/List_Tools.py +56 -0
- ORForise/ORForise_Analysis/__init__.py +0 -0
- ORForise/ORForise_Analysis/cds_checker.py +77 -0
- ORForise/ORForise_Analysis/gene_Lenghts.py +28 -0
- ORForise/ORForise_Analysis/genome_Metrics.py +258 -0
- ORForise/ORForise_Analysis/hypothetical_gene_predictions.py +88 -0
- ORForise/ORForise_Analysis/missed_Gene_Metrics.py +277 -0
- ORForise/ORForise_Analysis/parital_Match_Analysis.py +230 -0
- ORForise/ORForise_Analysis/result_File_Analysis.py +286 -0
- ORForise/ORForise_Analysis/start_Codon_Substitution.py +161 -0
- ORForise/StORForise.py +115 -0
- ORForise/Tools/Augustus/Augustus.py +54 -0
- ORForise/Tools/Augustus/__init__.py +0 -0
- ORForise/Tools/Balrog/Balrog.py +56 -0
- ORForise/Tools/Balrog/__init__.py +0 -0
- ORForise/Tools/EasyGene/EasyGene.py +55 -0
- ORForise/Tools/EasyGene/__init__.py +0 -0
- ORForise/Tools/FGENESB/FGENESB.py +57 -0
- ORForise/Tools/FGENESB/__init__.py +0 -0
- ORForise/Tools/FragGeneScan/FragGeneScan.py +54 -0
- ORForise/Tools/FragGeneScan/__init__.py +0 -0
- ORForise/Tools/GFF/GFF.py +77 -0
- ORForise/Tools/GFF/__init__.py +0 -0
- ORForise/Tools/GLIMMER3/GLIMMER3.py +59 -0
- ORForise/Tools/GLIMMER3/__init__.py +0 -0
- ORForise/Tools/GeneMark/GeneMark.py +135 -0
- ORForise/Tools/GeneMark/__init__.py +0 -0
- ORForise/Tools/GeneMarkHA/GeneMarkHA.py +54 -0
- ORForise/Tools/GeneMarkHA/__init__.py +0 -0
- ORForise/Tools/GeneMarkHMM/GeneMarkHMM.py +55 -0
- ORForise/Tools/GeneMarkHMM/__init__.py +0 -0
- ORForise/Tools/GeneMarkS/GeneMarkS.py +54 -0
- ORForise/Tools/GeneMarkS/__init__.py +0 -0
- ORForise/Tools/GeneMarkS2/GeneMarkS2.py +55 -0
- ORForise/Tools/GeneMarkS2/__init__.py +0 -0
- ORForise/Tools/MetaGene/MetaGene.py +54 -0
- ORForise/Tools/MetaGene/__init__.py +0 -0
- ORForise/Tools/MetaGeneAnnotator/MetaGeneAnnotator.py +55 -0
- ORForise/Tools/MetaGeneAnnotator/__init__.py +0 -0
- ORForise/Tools/MetaGeneMark/MetaGeneMark.py +55 -0
- ORForise/Tools/MetaGeneMark/__init__.py +0 -0
- ORForise/Tools/Prodigal/Prodigal.py +55 -0
- ORForise/Tools/Prodigal/__init__.py +0 -0
- ORForise/Tools/Prokka/Prokka.py +57 -0
- ORForise/Tools/Prokka/__init__.py +0 -0
- ORForise/Tools/StORF-Reporter/StORF-Reporter.py +56 -0
- ORForise/Tools/StORF-Reporter/__init__.py +0 -0
- ORForise/Tools/TransDecoder/TransDecoder.py +54 -0
- ORForise/Tools/TransDecoder/__init__.py +0 -0
- ORForise/Tools/__init__.py +0 -0
- ORForise/__init__.py +0 -0
- ORForise/utils.py +236 -0
- orforise-1.6.2.dist-info/METADATA +1038 -0
- orforise-1.6.2.dist-info/RECORD +73 -0
- orforise-1.6.2.dist-info/WHEEL +5 -0
- orforise-1.6.2.dist-info/entry_points.txt +15 -0
- orforise-1.6.2.dist-info/licenses/LICENSE +624 -0
- orforise-1.6.2.dist-info/top_level.txt +1 -0
ORForise/Comparator.py
ADDED
|
@@ -0,0 +1,882 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
|
|
3
|
+
try:
|
|
4
|
+
from .utils import *
|
|
5
|
+
except (ImportError, ModuleNotFoundError):
|
|
6
|
+
from utils import *
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class comparator: # Class to hold global-type variables
|
|
10
|
+
|
|
11
|
+
def __init__(self):
|
|
12
|
+
self.reset()
|
|
13
|
+
|
|
14
|
+
def reset(self):
|
|
15
|
+
self.perfect_Starts = 0
|
|
16
|
+
self.perfect_Stops = 0
|
|
17
|
+
self.genome_Seq = ''
|
|
18
|
+
self.genome_Seq_Rev = ''
|
|
19
|
+
self.genome_Size = 0
|
|
20
|
+
self.correct_Frame_Number = 0
|
|
21
|
+
self.extended_Start = 0
|
|
22
|
+
self.extended_Stop = 0
|
|
23
|
+
self.extended_CDS = 0
|
|
24
|
+
|
|
25
|
+
self.perfect_Matches = collections.OrderedDict()
|
|
26
|
+
self.matched_ORFs = collections.OrderedDict()
|
|
27
|
+
self.multi_Matched_ORFs = collections.defaultdict(list)
|
|
28
|
+
self.unmatched_ORFs = collections.OrderedDict()
|
|
29
|
+
self.genes_Detected = collections.OrderedDict()
|
|
30
|
+
self.genes_Undetected = collections.OrderedDict()
|
|
31
|
+
self.out_Of_Frame_ORFs = collections.OrderedDict()
|
|
32
|
+
self.partial_Hits = collections.OrderedDict()
|
|
33
|
+
|
|
34
|
+
self.start_Difference = []
|
|
35
|
+
self.stop_Difference = []
|
|
36
|
+
self.orf_Lengths = []
|
|
37
|
+
self.gene_Lengths = []
|
|
38
|
+
|
|
39
|
+
self.gene_Pos_Olap = []
|
|
40
|
+
self.gene_Neg_Olap = []
|
|
41
|
+
self.orf_Pos_Olap = []
|
|
42
|
+
self.orf_Neg_Olap = []
|
|
43
|
+
self.m_ORF_Pos_Olap = []
|
|
44
|
+
self.m_ORF_Neg_Olap = []
|
|
45
|
+
|
|
46
|
+
self.gene_GC = []
|
|
47
|
+
self.orf_GC = []
|
|
48
|
+
self.m_ORF_GC = []
|
|
49
|
+
|
|
50
|
+
self.gene_Short = []
|
|
51
|
+
self.orf_Short = []
|
|
52
|
+
self.m_ORF_Short = []
|
|
53
|
+
|
|
54
|
+
self.pos_Strand = 0
|
|
55
|
+
self.neg_Strand = 0
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
# Module-level shared state: the functions in this module read and update this instance.
comp = comparator()
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def is_double_range(range1, range2):
    """Report whether *range1* is at least twice as long as *range2*.

    Both arguments only need to support ``len()``.
    """
    first_Length, second_Length = len(range1), len(range2)
    return first_Length >= second_Length * 2
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def nuc_Count(verbose, start, stop, strand):
    """Return the GC percentage of the region ``start``..``stop``.

    Coordinates are 1-based and inclusive. A ``stop`` beyond ``comp.genome_Size``
    is treated as a feature wrapping around the end of the genome: the
    overhanging bases are taken from the beginning of the sequence.

    Args:
        verbose: When equal to ``True`` a notice is printed for wrap-around features.
        start: First position of the feature.
        stop: Last position of the feature (may exceed the genome size).
        strand: ``'-'`` reads from ``comp.genome_Seq_Rev``; any other value
            reads from ``comp.genome_Seq``.

    Returns:
        ``100 * (G + C) / (A + C + G + T + N)`` over the upper-case bases of
        the region, or ``0.0`` when the region holds none of those bases.
    """
    genome_Size = comp.genome_Size
    extra_stop = 0
    # Only a stop strictly past the last base wraps around; a feature ending
    # exactly on the last base is an ordinary feature.
    if stop > genome_Size:
        if verbose == True:
            print("There is a wrap around gene and I am dealing with it the best I can - Start: " + str(start) + " Stop: " + str(stop))
        extra_stop = stop - genome_Size
        stop = genome_Size

    if strand == '-':
        # Translate forward coordinates into indices of the reversed sequence.
        r_Start = genome_Size - stop
        r_Stop = genome_Size - start
        seq = comp.genome_Seq_Rev[r_Start:r_Stop + 1]
        if extra_stop:
            # The overhang (forward bases 1..extra_stop) sits at the tail of
            # the reversed sequence and precedes the main part on this strand.
            seq = comp.genome_Seq_Rev[-extra_stop:] + seq
    else:
        # Unknown strand markers also land here instead of leaving ``seq``
        # unbound. Assuming genome_Seq_Rev is the reverse complement of
        # genome_Seq, the GC percentage is the same on either strand.
        seq = comp.genome_Seq[start - 1:stop]
        if extra_stop:
            seq = seq + comp.genome_Seq[:extra_stop]

    gc_Total = seq.count('G') + seq.count('C')
    counted = gc_Total + seq.count('A') + seq.count('T') + seq.count('N')
    if not counted:
        # Empty region or no recognised bases: avoid dividing by zero.
        return 0.0
    return gc_Total * 100 / counted
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def orf_Unmatched(o_Start, o_Stop, o_Strand, tools):
    """Record a predicted ORF in ``comp.unmatched_ORFs``.

    The entry is keyed by ``start,stop,strand,first codon,last codon,tools``
    and stores the ORF sequence. ``'-'`` strand ORFs are read from
    ``comp.genome_Seq_Rev``; any strand other than ``'+'``/``'-'`` is ignored.
    """
    if o_Strand == '-':
        genome = comp.genome_Seq_Rev
        first = comp.genome_Size - o_Stop
        last = comp.genome_Size - o_Start
    elif o_Strand == '+':
        genome = comp.genome_Seq
        first = o_Start - 1
        last = o_Stop - 1
    else:
        return

    # first/last are 0-based inclusive indices into the chosen sequence.
    leading_Codon = genome[first:first + 3]
    trailing_Codon = genome[last - 2:last + 1]
    Unmatched_ORF = ','.join(
        [str(o_Start), str(o_Stop), o_Strand, leading_Codon, trailing_Codon, tools])
    comp.unmatched_ORFs[Unmatched_ORF] = genome[first:last + 1]
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def genes_Unmatched(g_Start, g_Stop, g_Strand, tools):
    """Record a gene in ``comp.genes_Undetected``.

    The entry is keyed by ``start,stop,strand,first codon,last codon,tools``
    and stores the gene sequence. ``'-'`` strand genes are read from
    ``comp.genome_Seq_Rev``; any strand other than ``'+'``/``'-'`` is ignored.
    """
    if g_Strand not in ('-', '+'):
        return

    if g_Strand == '-':
        source = comp.genome_Seq_Rev
        lo = comp.genome_Size - g_Stop
        hi = comp.genome_Size - g_Start
    else:
        source = comp.genome_Seq
        lo = g_Start - 1
        hi = g_Stop - 1

    # lo/hi are 0-based inclusive indices into the chosen sequence.
    key_Parts = [
        str(g_Start),
        str(g_Stop),
        g_Strand,
        source[lo:lo + 3],
        source[hi - 2:hi + 1],
        tools,
    ]
    missed_Gene = ','.join(key_Parts)
    comp.genes_Undetected[missed_Gene] = source[lo:hi + 1]
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def perfect_Matched_Genes(g_Start, g_Stop, g_Strand, tools):
    """Record a gene in ``comp.perfect_Matches``.

    The entry is keyed by ``start,stop,strand,first codon,last codon,tools``
    and stores the gene sequence. ``'-'`` strand genes are read from
    ``comp.genome_Seq_Rev``; any strand other than ``'+'``/``'-'`` is ignored.
    """
    on_Reverse = g_Strand == '-'
    if not on_Reverse and g_Strand != '+':
        return

    # Work with 0-based inclusive indices into whichever sequence applies.
    if on_Reverse:
        strand_Seq = comp.genome_Seq_Rev
        begin, end = comp.genome_Size - g_Stop, comp.genome_Size - g_Start
    else:
        strand_Seq = comp.genome_Seq
        begin, end = g_Start - 1, g_Stop - 1

    codons = strand_Seq[begin:begin + 3] + ',' + strand_Seq[end - 2:end + 1]
    perfect_Matched_Gene = (str(g_Start) + ',' + str(g_Stop) + ',' + g_Strand + ','
                            + codons + ',' + tools)
    comp.perfect_Matches[perfect_Matched_Gene] = strand_Seq[begin:end + 1]
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def match_Statistics(o_Start, o_Stop, g_Start, g_Stop, g_Strand):
    """Update the start/stop precision counters on ``comp`` for one ORF/gene pair.

    ``comp.correct_Frame_Number`` is always incremented. When ``g_Strand``
    contains ``'+'`` the lower coordinate is treated as the start; when it
    contains ``'-'`` the roles of the two ends are swapped. Any other strand
    value leaves the remaining counters untouched.
    """
    comp.correct_Frame_Number += 1

    if '+' in g_Strand:
        swap_Ends = False
    elif '-' in g_Strand:
        swap_Ends = True  # Negative strand genes are reversed
    else:
        return

    # Describe both ends in coordinate order first (low end, high end).
    ends = [
        (o_Start - g_Start, g_Start == o_Start, o_Start < g_Start),
        (o_Stop - g_Stop, g_Stop == o_Stop, o_Stop > g_Stop),
    ]
    both_Extended = ends[0][2] and ends[1][2]
    if swap_Ends:
        ends.reverse()
    (start_Delta, start_Exact, start_Longer), (stop_Delta, stop_Exact, stop_Longer) = ends

    comp.start_Difference.append(start_Delta)
    comp.stop_Difference.append(stop_Delta)
    if start_Exact:
        comp.perfect_Starts += 1
    if stop_Exact:
        comp.perfect_Stops += 1
    if both_Extended:
        comp.extended_CDS += 1
    if start_Longer:
        comp.extended_Start += 1
    if stop_Longer:
        comp.extended_Stop += 1
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
def start_Codon_Count(orfs):
    """Tally start-codon usage over a mapping of ORFs.

    Args:
        orfs: Mapping whose values are sequences with the start codon at
            index 1. ``None`` or an empty mapping yields all-zero results.

    Returns:
        A dict with the keys ``'ATG'``, ``'TTG'``, ``'GTG'``, ``'CTG'``,
        ``'ATT'`` and ``'Other'`` (in that order), each mapped to
        ``(count, percentage)`` where the percentage is a string with two
        decimals, followed by ``'total'`` mapped to the number of ORFs.
    """
    # Insertion order here fixes the key order of the returned dict.
    counts = dict.fromkeys(('ATG', 'TTG', 'GTG', 'CTG', 'ATT'), 0)
    other = 0

    # Guard before touching the mapping, so that None is handled instead of
    # failing on ``orfs.values()``.
    total = len(orfs) if orfs else 0
    if total:
        for orf in orfs.values():
            codon = orf[1]
            if codon in counts:
                counts[codon] += 1
            else:
                other += 1

    def percent(count):
        # Two-decimal share of all ORFs; 0.00 when there are none.
        return format(100 * count / total, '.2f') if total else format(0, '.2f')

    usage = {codon: (count, percent(count)) for codon, count in counts.items()}
    usage['Other'] = (other, percent(other))
    usage['total'] = total
    return usage
|
|
232
|
+
|
|
233
|
+
def stop_Codon_Count(orfs):
    """Summarise stop-codon usage over a mapping of ORFs.

    Each value of *orfs* is indexed at position 2 to obtain its stop codon.

    Returns:
        ``(TAG %, TAA %, TGA %, other %, other_stops)`` where the percentages
        are strings with two decimals and ``other_stops`` lists every codon
        that is not TAG/TAA/TGA, in iteration order. A falsy *orfs* gives
        ``'0.00'`` for every percentage and an empty list.
    """
    total = len(orfs) if orfs else 0
    if not total:
        nothing = format(0, '.2f')
        return nothing, nothing, nothing, nothing, []

    seen = [details[2] for details in orfs.values()]

    def share(matches):
        # Percentage of all ORFs, rendered with two decimals.
        return format(100 * matches / total, '.2f')

    def occurrences(target):
        return sum(1 for codon in seen if codon == target)

    other_Stops = [codon for codon in seen if codon not in ('TAG', 'TAA', 'TGA')]
    return (
        share(occurrences('TAG')),
        share(occurrences('TAA')),
        share(occurrences('TGA')),
        share(len(other_Stops)),
        other_Stops,
    )
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
def candidate_ORF_Selection(gene_Set, candidate_ORFs):
    """Select the candidate ORF most similar to a partially detected gene.

    The candidate with the highest coverage wins. When several candidates
    share that coverage, the one whose positions differ least from the gene
    (smallest symmetric difference with ``gene_Set``) is kept; on a complete
    tie the earlier candidate stays selected.

    Args:
        gene_Set: Iterable (normally a set) of the positions covered by the gene.
        candidate_ORFs: Mapping of ``'start,stop'`` strings to ORF detail
            lists whose last element is the coverage value.

    Returns:
        ``(pos, orf_Details)`` of the chosen candidate, or ``('', [])`` when no
        candidate has a coverage above zero.
    """
    best_Coverage = 0
    best_Difference = None  # size of the symmetric difference of the current pick
    pos = ''
    orf_Details = []
    for c_Pos, c_ORF_Details in candidate_ORFs.items():
        # Below is not a long term fix
        coverage = c_ORF_Details[-1]
        if coverage < best_Coverage:
            continue
        if coverage == best_Coverage and best_Difference is None:
            # Nothing chosen yet and the coverage does not exceed zero.
            continue

        bounds = c_Pos.split(',')
        orf_Set = set(range(int(bounds[0]), int(bounds[1]) + 1))
        # Number of positions that belong to only one of ORF and gene.
        difference = len(orf_Set.symmetric_difference(gene_Set))

        if coverage > best_Coverage or difference < best_Difference:
            best_Coverage = coverage
            best_Difference = difference
            pos = c_Pos
            orf_Details = c_ORF_Details
    return pos, orf_Details
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
def partial_Hit_Calc(g_Start, g_Stop, g_Strand, o_Start, o_Stop, tools):
    """Record a gene and the predicted CDS partially matching it in ``comp.partial_Hits``.

    The key has the form
    ``Gene:<start>_<stop>_<strand>_<first codon>_<last codon>;Predicted_CDS:<same five fields>;<tools>``
    and the value is ``[gene sequence, CDS sequence]``. Both features are read
    on the gene's strand: ``comp.genome_Seq_Rev`` for ``'-'`` and
    ``comp.genome_Seq`` for ``'+'``. Any other strand is ignored.
    """
    if g_Strand == '-':
        genome = comp.genome_Seq_Rev
        size = comp.genome_Size

        def to_Indices(start, stop):
            # Flip forward coordinates into indices of the reversed sequence.
            return size - stop, size - start
    elif g_Strand == '+':
        genome = comp.genome_Seq

        def to_Indices(start, stop):
            return start - 1, stop - 1
    else:
        return

    def describe(start, stop):
        # Build the five-field label and the sequence for one feature.
        first, last = to_Indices(start, stop)
        label = '_'.join(
            [str(start), str(stop), g_Strand, genome[first:first + 3], genome[last - 2:last + 1]])
        return label, genome[first:last + 1]

    gene_Label, genSeq = describe(g_Start, g_Stop)
    orf_Label, orfSeq = describe(o_Start, o_Stop)
    partial = "Gene:" + gene_Label + ';Predicted_CDS:' + orf_Label + ';' + tools
    comp.partial_Hits[partial] = [genSeq, orfSeq]
|
|
314
|
+
|
|
315
|
+
|
|
316
|
+
def tool_comparison(all_orfs, dna_regions, verbose):
|
|
317
|
+
results = collections.OrderedDict() # Store results for each DNA region
|
|
318
|
+
for dna_region in dna_regions: # Loop through each DNA region
|
|
319
|
+
# reset comparator class variables
|
|
320
|
+
comp.reset()
|
|
321
|
+
|
|
322
|
+
ref_genes_list = dna_regions[dna_region][2]
|
|
323
|
+
ref_genes = collections.OrderedDict()
|
|
324
|
+
|
|
325
|
+
if not ref_genes_list:
|
|
326
|
+
results[dna_region] = {}
|
|
327
|
+
continue
|
|
328
|
+
|
|
329
|
+
for d in ref_genes_list:
|
|
330
|
+
ref_genes.update(d)
|
|
331
|
+
comp.genome_Seq = dna_regions[dna_region][0]
|
|
332
|
+
comp.genome_Seq_Rev = revCompIterative(dna_regions[dna_region][0])
|
|
333
|
+
comp.genome_Size = len(dna_regions[dna_region][0])
|
|
334
|
+
|
|
335
|
+
current_orfs = all_orfs[dna_region]
|
|
336
|
+
# sort the ORFs by start position
|
|
337
|
+
|
|
338
|
+
better_pos_orfs_items = [[(int(pos.split(',')[0]), int(pos.split(',')[1])), orf_Details] for pos, orf_Details in current_orfs.items()] #TODO: turn pos into tuple instead of string everywhere
|
|
339
|
+
|
|
340
|
+
if not current_orfs or not better_pos_orfs_items:
|
|
341
|
+
results[dna_region] = {}
|
|
342
|
+
continue
|
|
343
|
+
|
|
344
|
+
for gene_num, gene_details in ref_genes.items(): # Loop through each gene to compare against predicted ORFs
|
|
345
|
+
g_Start = int(gene_details[0])
|
|
346
|
+
g_Stop = int(gene_details[1])
|
|
347
|
+
g_Strand = gene_details[2]
|
|
348
|
+
g_pos = str(g_Start) + ',' + str(g_Stop)
|
|
349
|
+
gene_Set = set(range(g_Start,
|
|
350
|
+
g_Stop + 1)) # Used to check Overlap of ORFs and pick best match - slow but confirms best match
|
|
351
|
+
overlapping_ORFs = collections.OrderedDict()
|
|
352
|
+
perfect_Match = False
|
|
353
|
+
out_Frame = False
|
|
354
|
+
for pos, orf_Details in better_pos_orfs_items: # Check if perfect match, if not check if match covers at least 75% of gene - Loop through ALL ORFs - SLOW
|
|
355
|
+
o_Start,o_Stop = pos
|
|
356
|
+
o_Strand = orf_Details[0]
|
|
357
|
+
#orf_Set = set(range(o_Start, o_Stop + 1)) Removed for optimisation
|
|
358
|
+
if o_Stop <= g_Start or o_Start >= g_Stop: # Not caught up yet
|
|
359
|
+
continue
|
|
360
|
+
elif o_Start == g_Start and o_Stop == g_Stop: # If perfect match, break and skip the rest of the ORFs
|
|
361
|
+
perfect_Match = True
|
|
362
|
+
break
|
|
363
|
+
elif is_double_range(range(o_Start, o_Stop), range(g_Start,g_Stop)): # If ORF is double or more than the length of the gene, we do not count as found.
|
|
364
|
+
continue
|
|
365
|
+
elif g_Start <= o_Start < g_Stop or g_Start < o_Stop < g_Stop: # If ORF Start or Stop is between gene Start or Stop
|
|
366
|
+
#overlap = len(gene_Set.intersection(orf_Set)) # Replaced for optimisation
|
|
367
|
+
overlap = max(min(o_Stop, g_Stop) - max(o_Start, g_Start), -1) + 1
|
|
368
|
+
coverage = 100 * float(overlap) / float(len(gene_Set))
|
|
369
|
+
orf_Details.append(coverage)
|
|
370
|
+
if abs(o_Stop - g_Stop) % 3 == 0 and o_Strand == g_Strand and coverage >= MIN_COVERAGE: # Only continue if ORF covers at least 75% of the gene and is in frame
|
|
371
|
+
overlapping_ORFs.update({f'{o_Start},{o_Stop}': orf_Details})
|
|
372
|
+
elif coverage >= MIN_COVERAGE: # Not in frame / on same strand
|
|
373
|
+
comp.out_Of_Frame_ORFs.update({f'{o_Start},{o_Stop}': orf_Details})
|
|
374
|
+
out_Frame = True
|
|
375
|
+
elif o_Start <= g_Start and o_Stop >= g_Stop: # If ORF extends one or both ends of the gene
|
|
376
|
+
#overlap = len(gene_Set.intersection(orf_Set)) # Replaced for optimisation
|
|
377
|
+
overlap = max(min(o_Stop, g_Stop) - max(o_Start, g_Start), -1) + 1
|
|
378
|
+
coverage = 100 * float(overlap) / float(len(gene_Set))
|
|
379
|
+
orf_Details.append(coverage)
|
|
380
|
+
if abs(o_Stop - g_Stop) % 3 == 0 and o_Strand == g_Strand and coverage >= MIN_COVERAGE: # Only continue if ORF covers at least 75% of the gene and is in frame
|
|
381
|
+
overlapping_ORFs.update({f'{o_Start},{o_Stop}': orf_Details})
|
|
382
|
+
elif coverage >= MIN_COVERAGE:
|
|
383
|
+
comp.out_Of_Frame_ORFs.update({f'{o_Start},{o_Stop}': orf_Details})
|
|
384
|
+
out_Frame = True
|
|
385
|
+
else:
|
|
386
|
+
if verbose == True:
|
|
387
|
+
print("Unexpected Error Finding Predicted CDSs") # Should not happen
|
|
388
|
+
# Now Check that we select the best ORF
|
|
389
|
+
### Multi_Match_ORFs Should contain All genes found by a specific ORF
|
|
390
|
+
if perfect_Match == True: # Check if the ORF is a perfect match to the Gene
|
|
391
|
+
m_ORF_Details = orf_Details[:]
|
|
392
|
+
m_ORF_Details.append(g_pos)
|
|
393
|
+
if g_pos in comp.matched_ORFs.keys():
|
|
394
|
+
previously_Covered_Gene = comp.matched_ORFs[g_pos][-1]
|
|
395
|
+
comp.multi_Matched_ORFs[g_pos] += [g_pos.replace(',', '-'), previously_Covered_Gene.replace(',',
|
|
396
|
+
'-'), orf_Details[4]] # ORF is same as gene so can use g_pos
|
|
397
|
+
comp.matched_ORFs.update({g_pos: m_ORF_Details})
|
|
398
|
+
comp.genes_Detected.update({str(gene_details): g_pos})
|
|
399
|
+
match_Statistics(o_Start, o_Stop, g_Start, g_Stop, g_Strand)
|
|
400
|
+
perfect_Matched_Genes(g_Start, g_Stop, g_Strand,orf_Details[4])
|
|
401
|
+
#if verbose == True:
|
|
402
|
+
# print('Perfect Match')
|
|
403
|
+
elif perfect_Match == False and len(
|
|
404
|
+
overlapping_ORFs) == 1: # If we do not have a perfect match but 1 ORF which has passed the filtering
|
|
405
|
+
orf_Pos = list(overlapping_ORFs.keys())[0]
|
|
406
|
+
o_Start = int(orf_Pos.split(',')[0])
|
|
407
|
+
o_Stop = int(orf_Pos.split(',')[1])
|
|
408
|
+
orf_Details = overlapping_ORFs[orf_Pos]
|
|
409
|
+
m_ORF_Details = orf_Details[:]
|
|
410
|
+
m_ORF_Details.append(g_pos)
|
|
411
|
+
if orf_Pos in comp.matched_ORFs.keys():
|
|
412
|
+
try:
|
|
413
|
+
previously_Covered_Gene = comp.matched_ORFs[g_pos][-1]
|
|
414
|
+
except KeyError:
|
|
415
|
+
last_key = [*comp.matched_ORFs.keys()][-1]
|
|
416
|
+
previously_Covered_Gene = comp.matched_ORFs[last_key][-1]
|
|
417
|
+
comp.multi_Matched_ORFs[orf_Pos] += [g_pos.replace(',', '-'), previously_Covered_Gene.replace(',',
|
|
418
|
+
'-'), orf_Details[4]] # ORF collects multiple gene pos'
|
|
419
|
+
comp.matched_ORFs.update({orf_Pos: m_ORF_Details})
|
|
420
|
+
comp.genes_Detected.update({str(gene_details): orf_Pos})
|
|
421
|
+
match_Statistics(o_Start, o_Stop, g_Start, g_Stop, g_Strand)
|
|
422
|
+
#if verbose == True:
|
|
423
|
+
# print('Partial Match')
|
|
424
|
+
partial_Hit_Calc(g_Start, g_Stop, g_Strand, o_Start, o_Stop, orf_Details[4])
|
|
425
|
+
elif perfect_Match == False and len(
|
|
426
|
+
overlapping_ORFs) >= 1: # If we have more than 1 potential ORF match, we check to see which is the 'best' hit
|
|
427
|
+
orf_Pos, orf_Details = candidate_ORF_Selection(gene_Set, overlapping_ORFs) # Return best match
|
|
428
|
+
o_Start = int(orf_Pos.split(',')[0])
|
|
429
|
+
o_Stop = int(orf_Pos.split(',')[1])
|
|
430
|
+
m_ORF_Details = orf_Details[:]
|
|
431
|
+
m_ORF_Details.append(g_pos)
|
|
432
|
+
if orf_Pos in comp.matched_ORFs.keys():
|
|
433
|
+
try:
|
|
434
|
+
previously_Covered_Gene = comp.matched_ORFs[g_pos][-1]
|
|
435
|
+
except KeyError:
|
|
436
|
+
last_key = [*comp.matched_ORFs.keys()][-1]
|
|
437
|
+
previously_Covered_Gene = comp.matched_ORFs[last_key][-1]
|
|
438
|
+
comp.multi_Matched_ORFs[orf_Pos] += [g_pos.replace(',', '-'), previously_Covered_Gene.replace(',',
|
|
439
|
+
'-'), orf_Details[4]] # ORF collects multiple gene pos'
|
|
440
|
+
comp.matched_ORFs.update({orf_Pos: m_ORF_Details})
|
|
441
|
+
comp.genes_Detected.update({str(gene_details): orf_Pos})
|
|
442
|
+
match_Statistics(o_Start, o_Stop, g_Start, g_Stop, g_Strand)
|
|
443
|
+
if verbose == True:
|
|
444
|
+
print('There was more than 1 potential Match - Best Chosen')
|
|
445
|
+
partial_Hit_Calc(g_Start, g_Stop, g_Strand, o_Start, o_Stop, orf_Details[4])
|
|
446
|
+
elif out_Frame: # Keep record of ORFs which overlap a gene but in the wrong frame
|
|
447
|
+
if verbose == True:
|
|
448
|
+
print("Out of Frame Predicted CDS")
|
|
449
|
+
genes_Unmatched(g_Start, g_Stop, g_Strand, orf_Details[4]) #
|
|
450
|
+
else:
|
|
451
|
+
genes_Unmatched(g_Start, g_Stop, g_Strand, orf_Details[4]) # No hit
|
|
452
|
+
#if verbose == True:
|
|
453
|
+
# print("No Hit")
|
|
454
|
+
for orf_Key in comp.matched_ORFs: # Remove ORFs from out of frame if ORF was correctly matched to another Gene
|
|
455
|
+
if orf_Key in comp.out_Of_Frame_ORFs:
|
|
456
|
+
del comp.out_Of_Frame_ORFs[orf_Key]
|
|
457
|
+
######################################## ORF Lengths and Precision
|
|
458
|
+
start_Difference = [x for x in comp.start_Difference if x != 0] # Remove 0s (Perfect hits)
|
|
459
|
+
stop_Difference = [x for x in comp.stop_Difference if x != 0]
|
|
460
|
+
if len(start_Difference) >= 1:
|
|
461
|
+
median_Start_Difference = np.median(start_Difference)
|
|
462
|
+
else:
|
|
463
|
+
median_Start_Difference = 'N/A'
|
|
464
|
+
if len(stop_Difference) >= 1:
|
|
465
|
+
median_Stop_Difference = np.median(stop_Difference)
|
|
466
|
+
else:
|
|
467
|
+
median_Stop_Difference = 'N/A'
|
|
468
|
+
|
|
469
|
+
# Get Start and Stop Codon Usage
|
|
470
|
+
atg_P, gtg_P, ttg_P, att_P, ctg_P, other_Start_P, other_Starts = start_Codon_Count(current_orfs)
|
|
471
|
+
tag_P, taa_P, tga_P, other_Stop_P, other_Stops = stop_Codon_Count(current_orfs)
|
|
472
|
+
# Count nucleotides found from ALL ORFs
|
|
473
|
+
gene_Nuc_Array = np.zeros((comp.genome_Size), dtype=bool)
|
|
474
|
+
orf_Nuc_Array = np.zeros((comp.genome_Size), dtype=bool)
|
|
475
|
+
matched_ORF_Nuc_Array = np.zeros((comp.genome_Size), dtype=bool)
|
|
476
|
+
|
|
477
|
+
prev_Gene_Stop = 0
|
|
478
|
+
prev_Gene_Overlapped = False
|
|
479
|
+
for gene_Num, gene_details in ref_genes.items(): # Loop through each gene to compare against predicted ORFs
|
|
480
|
+
g_Start = int(gene_details[0])
|
|
481
|
+
g_Stop = int(gene_details[1])
|
|
482
|
+
g_Strand = gene_details[2]
|
|
483
|
+
gene_Length = (g_Stop - g_Start) +1
|
|
484
|
+
if gene_Length == 0: print(g_Start, g_Stop, "!!!!!!!!!!!!!!!!!!!!!!!!")
|
|
485
|
+
comp.gene_Lengths.append(gene_Length)
|
|
486
|
+
gene_Nuc_Array[g_Start - 1:g_Stop] = True # Changing all between the two positions to 1's
|
|
487
|
+
comp.gene_GC.append(nuc_Count(verbose, g_Start, g_Stop, g_Strand))
|
|
488
|
+
if gene_Length <= SHORT_ORF_LENGTH: # .utils
|
|
489
|
+
comp.gene_Short.append(gene_Length)
|
|
490
|
+
### Calculate overlapping Genes -
|
|
491
|
+
if prev_Gene_Stop > g_Start:
|
|
492
|
+
if '+' in g_Strand:
|
|
493
|
+
comp.gene_Pos_Olap.append(prev_Gene_Stop - g_Start)
|
|
494
|
+
elif '-' in g_Strand:
|
|
495
|
+
comp.gene_Neg_Olap.append(prev_Gene_Stop - g_Start)
|
|
496
|
+
prev_Gene_Overlapped = True
|
|
497
|
+
elif prev_Gene_Stop < g_Start:
|
|
498
|
+
if prev_Gene_Overlapped == True:
|
|
499
|
+
if '+' in g_Strand:
|
|
500
|
+
comp.gene_Pos_Olap.append(0)
|
|
501
|
+
elif '-' in g_Strand:
|
|
502
|
+
comp.gene_Neg_Olap.append(0)
|
|
503
|
+
prev_Gene_Overlapped = False
|
|
504
|
+
prev_Gene_Stop = g_Stop
|
|
505
|
+
if prev_Gene_Overlapped == True: # If last has a prev overlap, count it
|
|
506
|
+
if '+' in g_Strand:
|
|
507
|
+
comp.gene_Pos_Olap.append(0)
|
|
508
|
+
elif '-' in g_Strand:
|
|
509
|
+
comp.gene_Neg_Olap.append(0)
|
|
510
|
+
#### avoid ValueError
|
|
511
|
+
if comp.gene_Lengths:
|
|
512
|
+
min_Gene_Length = min(comp.gene_Lengths)
|
|
513
|
+
max_Gene_Length = max(comp.gene_Lengths)
|
|
514
|
+
median_Gene_Length = np.median(comp.gene_Lengths)
|
|
515
|
+
else:
|
|
516
|
+
min_Gene_Length = max_Gene_Length = min_Length_Difference = 0
|
|
517
|
+
prev_ORF_Stop = 0
|
|
518
|
+
prev_ORF_Overlapped = False
|
|
519
|
+
for o_Positions, orf_Details in current_orfs.items():
|
|
520
|
+
o_Start = int(o_Positions.split(',')[0])
|
|
521
|
+
o_Stop = int(o_Positions.split(',')[1])
|
|
522
|
+
o_Strand = orf_Details[0]
|
|
523
|
+
# Stats just for Unmatched ORFs
|
|
524
|
+
if o_Positions not in list(comp.matched_ORFs.keys()):
|
|
525
|
+
orf_Unmatched(o_Start, o_Stop, o_Strand, orf_Details[4])
|
|
526
|
+
# Get ORF Strand metrics:
|
|
527
|
+
if o_Strand == "+": # Get number of Positive and Negative strand ORFs
|
|
528
|
+
comp.pos_Strand += 1
|
|
529
|
+
elif o_Strand == "-":
|
|
530
|
+
comp.neg_Strand += 1
|
|
531
|
+
orf_Length = (o_Stop - o_Start) +1
|
|
532
|
+
comp.orf_Lengths.append(orf_Length)
|
|
533
|
+
orf_Nuc_Array[o_Start - 1:o_Stop] = True # Changing all between the two positions to 1's
|
|
534
|
+
comp.orf_GC.append(nuc_Count(verbose, o_Start, o_Stop, o_Strand))
|
|
535
|
+
if orf_Length <= SHORT_ORF_LENGTH: # .utils
|
|
536
|
+
comp.orf_Short.append(orf_Length)
|
|
537
|
+
### Calculate overlapping ORFs -
|
|
538
|
+
if prev_ORF_Stop > o_Start:
|
|
539
|
+
if '+' in o_Strand:
|
|
540
|
+
comp.orf_Pos_Olap.append(prev_ORF_Stop - o_Start)
|
|
541
|
+
elif '-' in o_Strand:
|
|
542
|
+
comp.orf_Neg_Olap.append(prev_ORF_Stop - o_Start)
|
|
543
|
+
prev_ORF_Overlapped = True
|
|
544
|
+
elif prev_ORF_Stop < o_Start:
|
|
545
|
+
if prev_ORF_Overlapped == True:
|
|
546
|
+
if '+' in o_Strand:
|
|
547
|
+
comp.orf_Pos_Olap.append(0)
|
|
548
|
+
elif '-' in o_Strand:
|
|
549
|
+
comp.orf_Neg_Olap.append(0)
|
|
550
|
+
prev_ORF_Overlapped = False
|
|
551
|
+
prev_ORF_Stop = o_Stop
|
|
552
|
+
if prev_ORF_Overlapped == True: # If last has a prev overlap, count it
|
|
553
|
+
if '+' in o_Strand:
|
|
554
|
+
comp.orf_Pos_Olap.append(0)
|
|
555
|
+
elif '-' in o_Strand:
|
|
556
|
+
comp.orf_Neg_Olap.append(0)
|
|
557
|
+
|
|
558
|
+
# Nucleotide Coverage calculated from ORFs matching a gene only
|
|
559
|
+
matched_Prev_ORF_Stop = 0
|
|
560
|
+
matched_Prev_ORF_Overlapped = False
|
|
561
|
+
for mo_Positions, m_ORF_Details in comp.matched_ORFs.items():
|
|
562
|
+
mo_Start = int(mo_Positions.split(',')[0])
|
|
563
|
+
mo_Stop = int(mo_Positions.split(',')[1])
|
|
564
|
+
mo_Strand = m_ORF_Details[0]
|
|
565
|
+
mo_Length = (mo_Stop - mo_Start)
|
|
566
|
+
matched_ORF_Nuc_Array[mo_Start - 1:mo_Stop] = True # This is the complete matched orf not the matched orf bits
|
|
567
|
+
|
|
568
|
+
comp.m_ORF_GC.append(nuc_Count(verbose, mo_Start, mo_Stop, mo_Strand))
|
|
569
|
+
if mo_Length <= SHORT_ORF_LENGTH: # .utils
|
|
570
|
+
comp.m_ORF_Short.append(mo_Length)
|
|
571
|
+
### Calculate overlapping Matched ORFs -
|
|
572
|
+
if matched_Prev_ORF_Stop > mo_Start:
|
|
573
|
+
if '+' in mo_Strand:
|
|
574
|
+
comp.m_ORF_Pos_Olap.append(matched_Prev_ORF_Stop - mo_Start)
|
|
575
|
+
elif '-' in mo_Strand:
|
|
576
|
+
comp.m_ORF_Neg_Olap.append(matched_Prev_ORF_Stop - mo_Start)
|
|
577
|
+
matched_Prev_ORF_Overlapped = True
|
|
578
|
+
elif matched_Prev_ORF_Stop < mo_Start:
|
|
579
|
+
if matched_Prev_ORF_Overlapped == True:
|
|
580
|
+
if '+' in mo_Strand:
|
|
581
|
+
comp.m_ORF_Pos_Olap.append(0)
|
|
582
|
+
elif '-' in mo_Strand:
|
|
583
|
+
comp.m_ORF_Neg_Olap.append(0)
|
|
584
|
+
matched_Prev_ORF_Overlapped = False
|
|
585
|
+
matched_Prev_ORF_Stop = mo_Stop
|
|
586
|
+
if matched_Prev_ORF_Overlapped == True: # If last has a prev overlap, count it
|
|
587
|
+
if '+' in mo_Strand:
|
|
588
|
+
comp.m_ORF_Pos_Olap.append(0)
|
|
589
|
+
elif '-' in mo_Strand:
|
|
590
|
+
comp.m_ORF_Neg_Olap.append(0)
|
|
591
|
+
####
|
|
592
|
+
gene_Coverage_Genome = format(100 * np.sum(gene_Nuc_Array) / comp.genome_Size, '.2f')
|
|
593
|
+
orf_Coverage_Genome = format(100 * np.sum(orf_Nuc_Array) / comp.genome_Size, '.2f')
|
|
594
|
+
matched_ORF_Coverage_Genome = format(100 * np.sum(matched_ORF_Nuc_Array) / comp.genome_Size,
|
|
595
|
+
'.2f') # This gets the nts which are in matched ORFs - Check below
|
|
596
|
+
# matched_ORF_Nuc_AND_Gene = np.logical_and(matched_ORF_Nuc_Array,gene_Nuc_Array) + [0 for i in range(len(gene_Nuc_Array))] # This gets the nts which are in both matched ORFs and detected genes
|
|
597
|
+
# matched_ORF_Coverage_Genome = format(100 * np.count_nonzero(matched_ORF_Nuc_AND_Gene) / comp.genome_Size,'.2f')
|
|
598
|
+
|
|
599
|
+
# gene and orf nucleotide Intersection
|
|
600
|
+
gene_ORF_Nuc_Intersection = np.sum(gene_Nuc_Array & orf_Nuc_Array)
|
|
601
|
+
# not gene but orf nucleotides
|
|
602
|
+
not_Gene_Nuc_Array = np.logical_not(gene_Nuc_Array)
|
|
603
|
+
not_Gene_Nuc_And_ORF_Count = np.sum(not_Gene_Nuc_Array & orf_Nuc_Array)
|
|
604
|
+
# not orf nucleotides but gene
|
|
605
|
+
not_ORF_Nuc_Array = np.logical_not(orf_Nuc_Array)
|
|
606
|
+
not_ORF_Nuc_And_Gene_Count = np.sum(not_ORF_Nuc_Array & gene_Nuc_Array)
|
|
607
|
+
# not gene or orf nucleotides
|
|
608
|
+
not_Gene_Nuc_Not_ORF_Nuc_Count = np.sum(not_Gene_Nuc_Array & not_ORF_Nuc_Array)
|
|
609
|
+
# Nucleotide 'accuracy' - Normalised by number of nucleotides annotated by a gene
|
|
610
|
+
NT_TP = format(gene_ORF_Nuc_Intersection / np.sum(gene_Nuc_Array), '.2f')
|
|
611
|
+
NT_FP = format(not_Gene_Nuc_And_ORF_Count / np.sum(not_Gene_Nuc_Array), '.2f')
|
|
612
|
+
NT_FN = format(not_ORF_Nuc_And_Gene_Count / np.sum(gene_Nuc_Array), '.2f')
|
|
613
|
+
NT_TN = format(not_Gene_Nuc_Not_ORF_Nuc_Count / np.sum(not_Gene_Nuc_Array), '.2f')
|
|
614
|
+
NT_Precision = format(gene_ORF_Nuc_Intersection / (gene_ORF_Nuc_Intersection + not_Gene_Nuc_And_ORF_Count), '.2f')
|
|
615
|
+
NT_Recall = format(gene_ORF_Nuc_Intersection / (gene_ORF_Nuc_Intersection + not_ORF_Nuc_And_Gene_Count), '.2f')
|
|
616
|
+
NT_False_Discovery_Rate = format(
|
|
617
|
+
not_Gene_Nuc_And_ORF_Count / (not_Gene_Nuc_And_ORF_Count + gene_ORF_Nuc_Intersection), '.2f')
|
|
618
|
+
################################# Precision and Recall of whole ORFs and Genes
|
|
619
|
+
TP = format(len(comp.genes_Detected) / len(ref_genes), '.2f')
|
|
620
|
+
FP = format(len(comp.unmatched_ORFs) / len(ref_genes), '.2f')
|
|
621
|
+
FN = format(len(comp.genes_Undetected) / len(ref_genes), '.2f')
|
|
622
|
+
#################################################### Need a better way to handle 'no hits/ORFs'
|
|
623
|
+
try:
|
|
624
|
+
precision = format(float(TP) / (float(TP) + float(FP)), '.2f')
|
|
625
|
+
except ZeroDivisionError:
|
|
626
|
+
precision = format(0.00, '.2f')
|
|
627
|
+
try:
|
|
628
|
+
recall = format(float(TP) / (float(TP) + float(FN)), '.2f')
|
|
629
|
+
except ZeroDivisionError:
|
|
630
|
+
recall = format(0.00, '.2f')
|
|
631
|
+
try:
|
|
632
|
+
false_Discovery_Rate = format(float(FP) / (float(FP) + float(TP)), '.2f')
|
|
633
|
+
except ZeroDivisionError:
|
|
634
|
+
false_Discovery_Rate = 'N/A'
|
|
635
|
+
min_ORF_Length = min(comp.orf_Lengths)
|
|
636
|
+
max_ORF_Length = max(comp.orf_Lengths)
|
|
637
|
+
median_ORF_Length = np.median(comp.orf_Lengths)
|
|
638
|
+
|
|
639
|
+
##########################################################################
|
|
640
|
+
# Metrics - There are numerous cases where certain metric calculations may raise a ZeroDivisionError.
|
|
641
|
+
ORFs_Difference = format(100 * (len(current_orfs) - len(ref_genes)) / len(ref_genes), '.2f') # Difference off +/-
|
|
642
|
+
genes_Detected_Percentage = format(100 * (len(comp.genes_Detected) / len(ref_genes)), '.2f')
|
|
643
|
+
matched_ORF_Percentage = format(100 * (len(comp.matched_ORFs) / len(current_orfs)), '.2f')
|
|
644
|
+
all_ORF_Olap = (comp.orf_Pos_Olap + comp.orf_Neg_Olap) # Combine pos and neg strand overlaps
|
|
645
|
+
matched_ORF_Olap = (comp.m_ORF_Pos_Olap + comp.m_ORF_Neg_Olap)
|
|
646
|
+
all_Gene_Olap = (comp.gene_Pos_Olap + comp.gene_Neg_Olap)
|
|
647
|
+
|
|
648
|
+
if all_ORF_Olap: # If no overlapping ORFs
|
|
649
|
+
try:
|
|
650
|
+
overlap_Difference = format(100 * (len(all_ORF_Olap) - len(all_Gene_Olap)) / len(all_Gene_Olap), '.2f')
|
|
651
|
+
matched_Overlap_Difference = format(100 * (len(matched_ORF_Olap) - len(all_Gene_Olap)) / len(all_Gene_Olap),
|
|
652
|
+
'.2f')
|
|
653
|
+
except ZeroDivisionError:
|
|
654
|
+
overlap_Difference = 'N/A'
|
|
655
|
+
matched_Overlap_Difference = 'N/A'
|
|
656
|
+
num_All_ORF_Olap = len(all_ORF_Olap)
|
|
657
|
+
if matched_ORF_Olap:
|
|
658
|
+
max_Matched_ORF_Olap = max(matched_ORF_Olap)
|
|
659
|
+
matched_Median_ORF_Overlap = format(np.median(matched_ORF_Olap), '.2f')
|
|
660
|
+
else:
|
|
661
|
+
max_Matched_ORF_Olap = 'N/A'
|
|
662
|
+
matched_Median_ORF_Overlap = 'N/A'
|
|
663
|
+
max_All_ORF_Olap = max(all_ORF_Olap)
|
|
664
|
+
median_ORF_Overlap = format(np.median(all_ORF_Olap), '.2f')
|
|
665
|
+
else:
|
|
666
|
+
overlap_Difference = 'N/A'
|
|
667
|
+
matched_Overlap_Difference = 'N/A'
|
|
668
|
+
num_All_ORF_Olap = 0
|
|
669
|
+
max_Matched_ORF_Olap = 'N/A'
|
|
670
|
+
max_All_ORF_Olap = 'N/A'
|
|
671
|
+
median_ORF_Overlap = 'N/A'
|
|
672
|
+
matched_Median_ORF_Overlap = 'N/A'
|
|
673
|
+
if len(matched_ORF_Olap) == 0: # -100.00 is not informative
|
|
674
|
+
matched_Overlap_Difference = 'N/A'
|
|
675
|
+
|
|
676
|
+
# Need to NA everything
|
|
677
|
+
|
|
678
|
+
if comp.orf_Short and comp.gene_Short: # IF Short-ORFs/Genes
|
|
679
|
+
short_ORF_Difference = format(100 * (len(comp.orf_Short) - len(comp.gene_Short)) / len(comp.gene_Short), '.2f')
|
|
680
|
+
matched_Short_ORF_Difference = format(
|
|
681
|
+
100 * (len(comp.m_ORF_Short) - len(comp.gene_Short)) / len(comp.gene_Short), '.2f')
|
|
682
|
+
num_ORF_Short = len(comp.orf_Short)
|
|
683
|
+
num_Matched_ORF_Short = len(comp.m_ORF_Short)
|
|
684
|
+
elif comp.orf_Short: # If only Short-ORFs
|
|
685
|
+
num_ORF_Short = len(comp.orf_Short)
|
|
686
|
+
num_Matched_ORF_Short = 'N/A'
|
|
687
|
+
short_ORF_Difference = (num_ORF_Short * 100)
|
|
688
|
+
matched_Short_ORF_Difference = 'N/A'
|
|
689
|
+
else: # If only Short-Genes and Undetected StORFs
|
|
690
|
+
comp.gene_Short
|
|
691
|
+
short_ORF_Difference = 'N/A'
|
|
692
|
+
matched_Short_ORF_Difference = 'N/A'
|
|
693
|
+
num_ORF_Short = 0
|
|
694
|
+
num_Matched_ORF_Short = 'N/A'
|
|
695
|
+
if num_Matched_ORF_Short == 0: # -100.00 is not informative
|
|
696
|
+
matched_Short_ORF_Difference = 'N/A'
|
|
697
|
+
|
|
698
|
+
median_Length_Difference = format(100 * (median_ORF_Length - median_Gene_Length) / median_Gene_Length, '.2f')
|
|
699
|
+
min_Length_Difference = format(100 * (min_ORF_Length - min_Gene_Length) / min_Gene_Length, '.2f')
|
|
700
|
+
max_Length_Difference = format(100 * (max_ORF_Length - max_Gene_Length) / max_Gene_Length, '.2f')
|
|
701
|
+
pos_Strand_Percentage = format(comp.pos_Strand / len(current_orfs), '.2f')
|
|
702
|
+
neg_Strand_Percentage = format(comp.neg_Strand / len(current_orfs), '.2f')
|
|
703
|
+
median_ORF_GC = np.median(comp.orf_GC)
|
|
704
|
+
matched_Median_ORF_GC = np.median(comp.m_ORF_GC)
|
|
705
|
+
median_Gene_GC = np.median(comp.gene_GC)
|
|
706
|
+
median_GC_Difference = format(100 * (float(median_ORF_GC) - float(median_Gene_GC)) / float(median_Gene_GC), '.2f')
|
|
707
|
+
matched_Median_GC_Difference = format(
|
|
708
|
+
100 * (float(matched_Median_ORF_GC) - float(median_Gene_GC)) / float(median_Gene_GC), '.2f')
|
|
709
|
+
|
|
710
|
+
if comp.matched_ORFs: # No ORFs detected a gene
|
|
711
|
+
extended_CDS_Percentage = format(100 * comp.extended_CDS / len(comp.matched_ORFs), '.2f')
|
|
712
|
+
extended_Start_Percentage = format(100 * comp.extended_Start / len(comp.matched_ORFs), '.2f')
|
|
713
|
+
extended_Stop_Percentage = format(100 * comp.extended_Stop / len(comp.matched_ORFs), '.2f')
|
|
714
|
+
perfect_Matches_Percentage = format(100 * len(comp.perfect_Matches) / len(comp.matched_ORFs), '.2f')
|
|
715
|
+
perfect_Starts_Percentage = format(100 * comp.perfect_Starts / len(comp.matched_ORFs), '.2f')
|
|
716
|
+
perfect_Stops_Percentage = format(100 * comp.perfect_Stops / len(comp.matched_ORFs), '.2f')
|
|
717
|
+
else:
|
|
718
|
+
# correct_Frame_Percentage = 0
|
|
719
|
+
extended_CDS_Percentage = format(0.00, '.2f')
|
|
720
|
+
extended_Start_Percentage = format(0.00, '.2f')
|
|
721
|
+
extended_Stop_Percentage = format(0.00, '.2f')
|
|
722
|
+
perfect_Matches_Percentage = format(0.00, '.2f')
|
|
723
|
+
perfect_Starts_Percentage = format(0.00, '.2f')
|
|
724
|
+
perfect_Stops_Percentage = format(0.00, '.2f')
|
|
725
|
+
################### Missed Genes Metrics:
|
|
726
|
+
if comp.genes_Undetected:
|
|
727
|
+
mg_Starts = []
|
|
728
|
+
mg_Stops = []
|
|
729
|
+
mg_Lengths = []
|
|
730
|
+
mg_Strands = []
|
|
731
|
+
for mg, seq in comp.genes_Undetected.items():
|
|
732
|
+
mg = mg.split(',')
|
|
733
|
+
mg_Starts.append(mg[3])
|
|
734
|
+
mg_Stops.append(mg[4])
|
|
735
|
+
mg_Strands.append(mg[2])
|
|
736
|
+
mg_Lengths.append(int(mg[1]) - int(mg[0]))
|
|
737
|
+
|
|
738
|
+
mg_ATG = 100 * mg_Starts.count('ATG') / len(comp.genes_Undetected)
|
|
739
|
+
mg_GTG = 100 * mg_Starts.count('GTG') / len(comp.genes_Undetected)
|
|
740
|
+
mg_TTG = 100 * mg_Starts.count('TTG') / len(comp.genes_Undetected)
|
|
741
|
+
mg_ATT = 100 * mg_Starts.count('ATT') / len(comp.genes_Undetected)
|
|
742
|
+
mg_CTG = 100 * mg_Starts.count('CTG') / len(comp.genes_Undetected)
|
|
743
|
+
mg_O_Start = 100 - (mg_ATG + mg_GTG + mg_TTG + mg_ATT + mg_CTG)
|
|
744
|
+
mg_TGA = 100 * mg_Stops.count('TGA') / len(comp.genes_Undetected)
|
|
745
|
+
mg_TAA = 100 * mg_Stops.count('TAA') / len(comp.genes_Undetected)
|
|
746
|
+
mg_TAG = 100 * mg_Stops.count('TAG') / len(comp.genes_Undetected)
|
|
747
|
+
mg_O_Stop = 100 - (mg_TGA + mg_TAA + mg_TAG)
|
|
748
|
+
median_mg_Len = np.median(mg_Lengths)
|
|
749
|
+
mg_Pos = mg_Strands.count('+')
|
|
750
|
+
mg_Neg = mg_Strands.count('-')
|
|
751
|
+
undetected_Gene_Metrics = (
|
|
752
|
+
format(mg_ATG, '.2f'), format(mg_GTG, '.2f'), format(mg_TTG, '.2f'), format(mg_ATT, '.2f'),
|
|
753
|
+
format(mg_CTG, '.2f'), format(mg_O_Start, '.2f'), format(mg_TGA, '.2f'), format(mg_TAA, '.2f'),
|
|
754
|
+
format(mg_TAG, '.2f'), format(mg_O_Stop, '.2f'), format(median_mg_Len, '.2f'), mg_Pos, mg_Neg)
|
|
755
|
+
else:
|
|
756
|
+
undetected_Gene_Metrics = ''
|
|
757
|
+
#################### Unmatched ORF Metrics:
|
|
758
|
+
if comp.unmatched_ORFs:
|
|
759
|
+
uo_Starts = []
|
|
760
|
+
uo_Stops = []
|
|
761
|
+
uo_Lengths = []
|
|
762
|
+
uo_Strands = []
|
|
763
|
+
for uo, seq in comp.unmatched_ORFs.items():
|
|
764
|
+
uo = uo.split(',')
|
|
765
|
+
uo_Starts.append(uo[3])
|
|
766
|
+
uo_Stops.append(uo[4])
|
|
767
|
+
uo_Strands.append(uo[2])
|
|
768
|
+
uo_Lengths.append(int(uo[1]) - int(uo[0]))
|
|
769
|
+
uo_ATG = 100 * uo_Starts.count('ATG') / len(comp.unmatched_ORFs)
|
|
770
|
+
uo_GTG = 100 * uo_Starts.count('GTG') / len(comp.unmatched_ORFs)
|
|
771
|
+
uo_TTG = 100 * uo_Starts.count('TTG') / len(comp.unmatched_ORFs)
|
|
772
|
+
uo_ATT = 100 * uo_Starts.count('ATT') / len(comp.unmatched_ORFs)
|
|
773
|
+
uo_CTG = 100 * uo_Starts.count('CTG') / len(comp.unmatched_ORFs)
|
|
774
|
+
uo_O_Start = 100 - (uo_ATG + uo_GTG + uo_TTG + uo_ATT + uo_CTG)
|
|
775
|
+
uo_TGA = 100 * uo_Stops.count('TGA') / len(comp.unmatched_ORFs)
|
|
776
|
+
uo_TAA = 100 * uo_Stops.count('TAA') / len(comp.unmatched_ORFs)
|
|
777
|
+
uo_TAG = 100 * uo_Stops.count('TAG') / len(comp.unmatched_ORFs)
|
|
778
|
+
uo_O_Stop = 100 - (uo_TGA + uo_TAA + uo_TAG)
|
|
779
|
+
# uo_O_Stop = 100 * uo_O_Stop / len(comp.unmatched_ORFs) ########WHY?
|
|
780
|
+
median_uo_Len = np.median(uo_Lengths)
|
|
781
|
+
uo_Pos = uo_Strands.count('+')
|
|
782
|
+
uo_Neg = uo_Strands.count('-')
|
|
783
|
+
unmatched_ORF_Metrics = (
|
|
784
|
+
format(uo_ATG, '.2f'), format(uo_GTG, '.2f'), format(uo_TTG, '.2f'), format(uo_ATT, '.2f'),
|
|
785
|
+
format(uo_CTG, '.2f'), format(uo_O_Start, '.2f'), format(uo_TGA, '.2f'), format(uo_TAA, '.2f'),
|
|
786
|
+
format(uo_TAG, '.2f'), format(uo_O_Stop, '.2f'), format(median_uo_Len, '.2f'), uo_Pos, uo_Neg)
|
|
787
|
+
else:
|
|
788
|
+
unmatched_ORF_Metrics = ''
|
|
789
|
+
#################################
|
|
790
|
+
# Rep_Metrics - This is the final report of metrics
|
|
791
|
+
rep_Metrics = collections.OrderedDict(
|
|
792
|
+
{'Percentage_of_Genes_Detected': genes_Detected_Percentage,
|
|
793
|
+
'genes_Undetected': comp.genes_Undetected,
|
|
794
|
+
'undetected_Gene_Metrics': undetected_Gene_Metrics,
|
|
795
|
+
'gene_Coverage_Genome': gene_Coverage_Genome,
|
|
796
|
+
'Percentage_of_ORFs_that_Detected_a_Gene': matched_ORF_Percentage,
|
|
797
|
+
'Percent_Difference_of_All_ORFs': ORFs_Difference,
|
|
798
|
+
'Median_Length_Difference': median_Length_Difference,
|
|
799
|
+
'Percentage_of_Perfect_Matches': perfect_Matches_Percentage,
|
|
800
|
+
'Median_Start_Difference_of_Matched_ORFs': median_Start_Difference,
|
|
801
|
+
'Median_Stop_Difference_of_Matched_ORFs': median_Stop_Difference,
|
|
802
|
+
'Percentage_Difference_of_Matched_Overlapping_CDSs': matched_Overlap_Difference,
|
|
803
|
+
'Percent_Difference_of_Short-Matched-ORFs': matched_Short_ORF_Difference,
|
|
804
|
+
'Precision': precision,
|
|
805
|
+
'Recall': recall,
|
|
806
|
+
'False_Discovery_Rate': false_Discovery_Rate})
|
|
807
|
+
# Pred Metrics - This is the final report of metrics
|
|
808
|
+
pred_metrics = collections.OrderedDict(
|
|
809
|
+
{'Number_of_ORFs': len(current_orfs), 'Percent_Difference_of_All_ORFs': ORFs_Difference,
|
|
810
|
+
'perfect_Matches': comp.perfect_Matches,
|
|
811
|
+
'unmatched_ORFs': comp.unmatched_ORFs,
|
|
812
|
+
'unmatched_ORF_Metrics': unmatched_ORF_Metrics,
|
|
813
|
+
'orf_Coverage_Genome': orf_Coverage_Genome,
|
|
814
|
+
'matched_ORF_Coverage_Genome': matched_ORF_Coverage_Genome,
|
|
815
|
+
'multi_Matched_ORFs': comp.multi_Matched_ORFs,
|
|
816
|
+
'partial_Hits': comp.partial_Hits,
|
|
817
|
+
'Number_of_ORFs_that_Detected_a_Gene': len(comp.matched_ORFs),
|
|
818
|
+
'Percentage_of_ORFs_that_Detected_a_Gene': matched_ORF_Percentage,
|
|
819
|
+
'Number_of_Genes_Detected': len(comp.genes_Detected),
|
|
820
|
+
'Percentage_of_Genes_Detected': genes_Detected_Percentage, 'Median_Length_of_All_ORFs': median_ORF_Length,
|
|
821
|
+
'Median_Length_Difference': median_Length_Difference,
|
|
822
|
+
'Minimum_Length_of_All_ORFs': min_ORF_Length, 'Minimum_Length_Difference': min_Length_Difference,
|
|
823
|
+
'Maximum_Length_of_All_ORFs': max_ORF_Length, 'Maximum_Length_Difference': max_Length_Difference,
|
|
824
|
+
'Median_GC_content_of_All_ORFs': format(median_ORF_GC, '.2f'),
|
|
825
|
+
'Percent_Difference_of_All_ORFs_Median_GC': median_GC_Difference,
|
|
826
|
+
'Median_GC_content_of_Matched_ORFs': format(matched_Median_ORF_GC, '.2f'),
|
|
827
|
+
'Percent_Difference_of_Matched_ORF_GC': matched_Median_GC_Difference,
|
|
828
|
+
'Number_of_ORFs_which_Overlap_Another_ORF': num_All_ORF_Olap,
|
|
829
|
+
'Percent_Difference_of_Overlapping_ORFs': overlap_Difference,
|
|
830
|
+
'Maximum_ORF_Overlap': max_All_ORF_Olap, 'Median_ORF_Overlap': median_ORF_Overlap,
|
|
831
|
+
'Number_of_Matched_ORFs_Overlapping_Another_ORF': len(matched_ORF_Olap),
|
|
832
|
+
'Percentage_Difference_of_Matched_Overlapping_CDSs': matched_Overlap_Difference,
|
|
833
|
+
'Maximum_Matched_ORF_Overlap': max_Matched_ORF_Olap, 'Median_Matched_ORF_Overlap': matched_Median_ORF_Overlap,
|
|
834
|
+
'Number_of_Short-ORFs': num_ORF_Short, 'Percent_Difference_of_Short-ORFs': short_ORF_Difference,
|
|
835
|
+
'Number_of_Short-Matched-ORFs': num_Matched_ORF_Short,
|
|
836
|
+
'Percent_Difference_of_Short-Matched-ORFs': matched_Short_ORF_Difference,
|
|
837
|
+
'Number_of_Perfect_Matches': len(comp.perfect_Matches),
|
|
838
|
+
'Percentage_of_Perfect_Matches': perfect_Matches_Percentage,
|
|
839
|
+
'Number_of_Perfect_Starts': comp.perfect_Starts, 'Percentage_of_Perfect_Starts': perfect_Starts_Percentage,
|
|
840
|
+
'Number_of_Perfect_Stops': comp.perfect_Stops, 'Percentage_of_Perfect_Stops': perfect_Stops_Percentage,
|
|
841
|
+
'Number_of_Out_of_Frame_ORFs': len(comp.out_Of_Frame_ORFs),
|
|
842
|
+
'Number_of_Matched_ORFs_Extending_a_Coding_Region': comp.extended_CDS,
|
|
843
|
+
'Percentage_of_Matched_ORFs_Extending_a_Coding_Region': extended_CDS_Percentage,
|
|
844
|
+
'Number_of_Matched_ORFs_Extending_Start_Region': comp.extended_Start,
|
|
845
|
+
'Percentage_of_Matched_ORFs_Extending_Start_Region': extended_Start_Percentage,
|
|
846
|
+
'Number_of_Matched_ORFs_Extending_Stop_Region': comp.extended_Stop,
|
|
847
|
+
'Percentage_of_Matched_ORFs_Extending_Stop_Region': extended_Stop_Percentage,
|
|
848
|
+
'Number_of_All_ORFs_on_Positive_Strand': comp.pos_Strand,
|
|
849
|
+
'Percentage_of_All_ORFs_on_Positive_Strand': pos_Strand_Percentage,
|
|
850
|
+
'Number_of_All_ORFs_on_Negative_Strand': comp.neg_Strand,
|
|
851
|
+
'Percentage_of_All_ORFs_on_Negative_Strand': neg_Strand_Percentage,
|
|
852
|
+
'Median_Start_Difference_of_Matched_ORFs': median_Start_Difference,
|
|
853
|
+
'Median_Stop_Difference_of_Matched_ORFs': median_Stop_Difference, 'ATG_Start_Percentage': atg_P,
|
|
854
|
+
'GTG_Start_Percentage': gtg_P, 'TTG_Start_Percentage': ttg_P,
|
|
855
|
+
'ATT_Start_Percentage': att_P, 'CTG_Start_Percentage': ctg_P, 'Other_Start_Codon_Percentage': other_Start_P,
|
|
856
|
+
'TAG_Stop_Percentage': tag_P, 'TAA_Stop_Percentage': taa_P,
|
|
857
|
+
'TGA_Stop_Percentage': tga_P, 'Other_Stop_Codon_Percentage': other_Stop_P, 'True_Positive': TP,
|
|
858
|
+
'False_Positive': FP, 'False_Negative': FN, 'Precision': precision,
|
|
859
|
+
'Recall': recall, 'False_Discovery_Rate': false_Discovery_Rate, 'Nucleotide_True_Positive': NT_TP,
|
|
860
|
+
'Nucleotide_False_Positive': NT_FP, 'Nucleotide_True_Negative': NT_TN,
|
|
861
|
+
'Nucleotide_False_Negative': NT_FN, 'Nucleotide_Precision': NT_Precision, 'Nucleotide_Recall': NT_Recall,
|
|
862
|
+
'Nucleotide_False_Discovery_Rate': NT_False_Discovery_Rate,
|
|
863
|
+
'ORF_Nucleotide_Coverage_of_Genome': orf_Coverage_Genome,
|
|
864
|
+
'Matched_ORF_Nucleotide_Coverage_of_Genome': matched_ORF_Coverage_Genome})
|
|
865
|
+
result = collections.OrderedDict()
|
|
866
|
+
result.update({
|
|
867
|
+
'rep_metrics': rep_Metrics,
|
|
868
|
+
'pred_metrics': pred_metrics,
|
|
869
|
+
})
|
|
870
|
+
|
|
871
|
+
# To account for unbalanced data
|
|
872
|
+
for m_key, m_value in result.items():
|
|
873
|
+
if m_value == 'nan':
|
|
874
|
+
result[m_key] = 'N/A'
|
|
875
|
+
|
|
876
|
+
results[dna_region] = result
|
|
877
|
+
|
|
878
|
+
|
|
879
|
+
print("Finished calculating metrics for: ", dna_region)
|
|
880
|
+
|
|
881
|
+
|
|
882
|
+
return results # Return the results dictionary containing all metrics and details
|