ORForise 1.4.3__py3-none-any.whl → 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ORForise/Aggregate_Compare.py +318 -133
- ORForise/Annotation_Compare.py +243 -125
- ORForise/Comparator.py +600 -552
- ORForise/ORForise_Analysis/genome_Metrics.py +51 -33
- ORForise/Tools/Augustus/Augustus.py +30 -23
- ORForise/Tools/Balrog/Balrog.py +31 -23
- ORForise/Tools/EasyGene/EasyGene.py +30 -22
- ORForise/Tools/FGENESB/FGENESB.py +32 -25
- ORForise/Tools/FragGeneScan/FragGeneScan.py +29 -22
- ORForise/Tools/GFF/GFF.py +51 -47
- ORForise/Tools/GLIMMER_3/GLIMMER_3.py +34 -27
- ORForise/Tools/GeneMark/GeneMark.py +46 -40
- ORForise/Tools/GeneMark_HA/GeneMark_HA.py +29 -22
- ORForise/Tools/GeneMark_HMM/GeneMark_HMM.py +29 -22
- ORForise/Tools/GeneMark_S/GeneMark_S.py +29 -22
- ORForise/Tools/GeneMark_S_2/GeneMark_S_2.py +29 -25
- ORForise/Tools/MetaGene/MetaGene.py +29 -22
- ORForise/Tools/MetaGeneAnnotator/MetaGeneAnnotator.py +30 -23
- ORForise/Tools/MetaGeneMark/MetaGeneMark.py +30 -23
- ORForise/Tools/Prodigal/Prodigal.py +30 -26
- ORForise/Tools/Prokka/Prokka.py +30 -25
- ORForise/Tools/StORF_Reporter/StORF_Reporter.py +33 -26
- ORForise/Tools/TransDecoder/TransDecoder.py +29 -22
- ORForise/utils.py +204 -2
- {orforise-1.4.3.dist-info → orforise-1.5.0.dist-info}/METADATA +5 -5
- {orforise-1.4.3.dist-info → orforise-1.5.0.dist-info}/RECORD +30 -30
- {orforise-1.4.3.dist-info → orforise-1.5.0.dist-info}/entry_points.txt +5 -0
- {orforise-1.4.3.dist-info → orforise-1.5.0.dist-info}/WHEEL +0 -0
- {orforise-1.4.3.dist-info → orforise-1.5.0.dist-info}/licenses/LICENSE +0 -0
- {orforise-1.4.3.dist-info → orforise-1.5.0.dist-info}/top_level.txt +0 -0
ORForise/Comparator.py
CHANGED
|
@@ -6,47 +6,62 @@ except ImportError:
|
|
|
6
6
|
|
|
7
7
|
|
|
8
8
|
class comparator: # Class to hold global-type variables
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
self.
|
|
22
|
-
self.
|
|
23
|
-
|
|
24
|
-
self.
|
|
25
|
-
self.
|
|
26
|
-
|
|
27
|
-
|
|
9
|
+
|
|
10
|
+
def __init__(self):
|
|
11
|
+
self.reset()
|
|
12
|
+
|
|
13
|
+
def reset(self):
|
|
14
|
+
self.perfect_Starts = 0
|
|
15
|
+
self.perfect_Stops = 0
|
|
16
|
+
self.genome_Seq = ''
|
|
17
|
+
self.genome_Seq_Rev = ''
|
|
18
|
+
self.genome_Size = 0
|
|
19
|
+
self.correct_Frame_Number = 0
|
|
20
|
+
self.extended_Start = 0
|
|
21
|
+
self.extended_Stop = 0
|
|
22
|
+
self.extended_CDS = 0
|
|
23
|
+
|
|
24
|
+
self.perfect_Matches = collections.OrderedDict()
|
|
25
|
+
self.matched_ORFs = collections.OrderedDict()
|
|
26
|
+
self.multi_Matched_ORFs = collections.defaultdict(list)
|
|
27
|
+
self.unmatched_ORFs = collections.OrderedDict()
|
|
28
|
+
self.genes_Detected = collections.OrderedDict()
|
|
29
|
+
self.genes_Undetected = collections.OrderedDict()
|
|
30
|
+
self.out_Of_Frame_ORFs = collections.OrderedDict()
|
|
31
|
+
self.partial_Hits = collections.OrderedDict()
|
|
32
|
+
|
|
33
|
+
self.start_Difference = []
|
|
34
|
+
self.stop_Difference = []
|
|
35
|
+
self.orf_Lengths = []
|
|
36
|
+
self.gene_Lengths = []
|
|
37
|
+
|
|
38
|
+
self.gene_Pos_Olap = []
|
|
39
|
+
self.gene_Neg_Olap = []
|
|
40
|
+
self.orf_Pos_Olap = []
|
|
41
|
+
self.orf_Neg_Olap = []
|
|
42
|
+
self.m_ORF_Pos_Olap = []
|
|
43
|
+
self.m_ORF_Neg_Olap = []
|
|
44
|
+
|
|
45
|
+
self.gene_GC = []
|
|
46
|
+
self.orf_GC = []
|
|
47
|
+
self.m_ORF_GC = []
|
|
48
|
+
|
|
49
|
+
self.gene_Short = []
|
|
50
|
+
self.orf_Short = []
|
|
51
|
+
self.m_ORF_Short = []
|
|
52
|
+
|
|
53
|
+
self.pos_Strand = 0
|
|
54
|
+
self.neg_Strand = 0
|
|
28
55
|
|
|
29
56
|
|
|
30
|
-
comp = comparator()
|
|
31
57
|
|
|
58
|
+
comp = comparator()
|
|
32
59
|
|
|
33
|
-
# Not needed
|
|
34
|
-
# def keyshift(dictionary, key, diff):
|
|
35
|
-
# if key in dictionary:
|
|
36
|
-
# token = object()
|
|
37
|
-
# keys = [token]*(diff*-1) + dictionary + [token]*diff
|
|
38
|
-
# newkey = keys[keys.index(key)+diff]
|
|
39
|
-
# if newkey is token:
|
|
40
|
-
# print (None)
|
|
41
|
-
# else:
|
|
42
|
-
# to_return = dictionary[newkey].split(',')
|
|
43
|
-
# to_return = to_return[0]+'_'+to_return[1]+'_'+to_return[2]
|
|
44
|
-
# return to_return
|
|
45
|
-
# else:
|
|
46
|
-
# print ('Key not found')
|
|
47
60
|
|
|
48
61
|
def is_double_range(range1, range2):
|
|
49
62
|
return len(range1) >= 2 * len(range2)
|
|
63
|
+
|
|
64
|
+
|
|
50
65
|
def nuc_Count(verbose, start, stop, strand): # Gets correct seq then returns GC
|
|
51
66
|
if stop >= comp.genome_Size:
|
|
52
67
|
if verbose == True:
|
|
@@ -91,53 +106,53 @@ def nuc_Count(verbose, start, stop, strand): # Gets correct seq then returns GC
|
|
|
91
106
|
return gc_content
|
|
92
107
|
|
|
93
108
|
|
|
94
|
-
def orf_Unmatched(o_Start, o_Stop, o_Strand):
|
|
109
|
+
def orf_Unmatched(o_Start, o_Stop, o_Strand, tools):
|
|
95
110
|
if o_Strand == '-':
|
|
96
111
|
r_Start = comp.genome_Size - o_Stop
|
|
97
112
|
r_Stop = comp.genome_Size - o_Start
|
|
98
113
|
Unmatched_ORF = str(o_Start) + ',' + str(o_Stop) + ',' + o_Strand + ',' + comp.genome_Seq_Rev[
|
|
99
114
|
r_Start:r_Start + 3] + ',' + comp.genome_Seq_Rev[
|
|
100
|
-
r_Stop - 2:r_Stop + 1]
|
|
115
|
+
r_Stop - 2:r_Stop + 1] + ',' + tools
|
|
101
116
|
seq = (comp.genome_Seq_Rev[r_Start:r_Stop + 1])
|
|
102
117
|
comp.unmatched_ORFs.update({Unmatched_ORF: seq})
|
|
103
118
|
elif o_Strand == '+':
|
|
104
119
|
Unmatched_ORF = str(o_Start) + ',' + str(o_Stop) + ',' + o_Strand + ',' + comp.genome_Seq[
|
|
105
120
|
o_Start - 1:o_Start + 2] + ',' + comp.genome_Seq[
|
|
106
|
-
o_Stop - 3:o_Stop]
|
|
121
|
+
o_Stop - 3:o_Stop] + ',' + tools
|
|
107
122
|
seq = (comp.genome_Seq[o_Start - 1:o_Stop])
|
|
108
123
|
comp.unmatched_ORFs.update({Unmatched_ORF: seq})
|
|
109
124
|
|
|
110
125
|
|
|
111
|
-
def genes_Unmatched(g_Start, g_Stop, g_Strand):
|
|
126
|
+
def genes_Unmatched(g_Start, g_Stop, g_Strand, tools):
|
|
112
127
|
if g_Strand == '-':
|
|
113
128
|
r_Start = comp.genome_Size - g_Stop
|
|
114
129
|
r_Stop = comp.genome_Size - g_Start
|
|
115
130
|
missed_Gene = str(g_Start) + ',' + str(g_Stop) + ',' + g_Strand + ',' + comp.genome_Seq_Rev[
|
|
116
131
|
r_Start:r_Start + 3] + ',' + comp.genome_Seq_Rev[
|
|
117
|
-
r_Stop - 2:r_Stop + 1]
|
|
132
|
+
r_Stop - 2:r_Stop + 1] + ',' + tools
|
|
118
133
|
genSeq = (comp.genome_Seq_Rev[r_Start:r_Stop + 1])
|
|
119
134
|
comp.genes_Undetected.update({missed_Gene: genSeq})
|
|
120
135
|
elif g_Strand == '+':
|
|
121
136
|
missed_Gene = str(g_Start) + ',' + str(g_Stop) + ',' + g_Strand + ',' + comp.genome_Seq[
|
|
122
137
|
g_Start - 1:g_Start + 2] + ',' + comp.genome_Seq[
|
|
123
|
-
g_Stop - 3:g_Stop]
|
|
138
|
+
g_Stop - 3:g_Stop] + ',' + tools
|
|
124
139
|
genSeq = (comp.genome_Seq[g_Start - 1:g_Stop])
|
|
125
140
|
comp.genes_Undetected.update({missed_Gene: genSeq})
|
|
126
141
|
|
|
127
142
|
|
|
128
|
-
def perfect_Matched_Genes(g_Start, g_Stop, g_Strand):
|
|
143
|
+
def perfect_Matched_Genes(g_Start, g_Stop, g_Strand,tools):
|
|
129
144
|
if g_Strand == '-':
|
|
130
145
|
r_Start = comp.genome_Size - g_Stop
|
|
131
146
|
r_Stop = comp.genome_Size - g_Start
|
|
132
147
|
perfect_Matched_Gene = str(g_Start) + ',' + str(g_Stop) + ',' + g_Strand + ',' + comp.genome_Seq_Rev[
|
|
133
148
|
r_Start:r_Start + 3] + ',' + comp.genome_Seq_Rev[
|
|
134
|
-
r_Stop - 2:r_Stop + 1]
|
|
149
|
+
r_Stop - 2:r_Stop + 1] + ',' + tools
|
|
135
150
|
genSeq = (comp.genome_Seq_Rev[r_Start:r_Stop + 1])
|
|
136
151
|
comp.perfect_Matches.update({perfect_Matched_Gene: genSeq})
|
|
137
152
|
elif g_Strand == '+':
|
|
138
153
|
perfect_Matched_Gene = str(g_Start) + ',' + str(g_Stop) + ',' + g_Strand + ',' + comp.genome_Seq[
|
|
139
154
|
g_Start - 1:g_Start + 2] + ',' + comp.genome_Seq[
|
|
140
|
-
g_Stop - 3:g_Stop]
|
|
155
|
+
g_Stop - 3:g_Stop] + ',' + tools
|
|
141
156
|
genSeq = (comp.genome_Seq[g_Start - 1:g_Stop])
|
|
142
157
|
comp.perfect_Matches.update({perfect_Matched_Gene: genSeq})
|
|
143
158
|
|
|
@@ -250,7 +265,7 @@ def candidate_ORF_Selection(gene_Set,
|
|
|
250
265
|
return pos, orf_Details
|
|
251
266
|
|
|
252
267
|
|
|
253
|
-
def partial_Hit_Calc(g_Start, g_Stop, g_Strand, o_Start, o_Stop):
|
|
268
|
+
def partial_Hit_Calc(g_Start, g_Stop, g_Strand, o_Start, o_Stop, tools):
|
|
254
269
|
if g_Strand == '-':
|
|
255
270
|
r_G_Start = comp.genome_Size - g_Stop
|
|
256
271
|
r_G_Stop = comp.genome_Size - g_Start
|
|
@@ -261,7 +276,7 @@ def partial_Hit_Calc(g_Start, g_Stop, g_Strand, o_Start, o_Stop):
|
|
|
261
276
|
r_G_Stop - 2:r_G_Stop + 1] + ';Predicted_CDS:' + str(
|
|
262
277
|
o_Start) + '_' + str(o_Stop) + '_' + g_Strand + '_' + comp.genome_Seq_Rev[
|
|
263
278
|
r_O_Start:r_O_Start + 3] + '_' + comp.genome_Seq_Rev[
|
|
264
|
-
r_O_Stop - 2:r_O_Stop + 1]
|
|
279
|
+
r_O_Stop - 2:r_O_Stop + 1] + ';' + tools
|
|
265
280
|
genSeq = (comp.genome_Seq_Rev[r_G_Start:r_G_Stop + 1])
|
|
266
281
|
orfSeq = (comp.genome_Seq_Rev[r_O_Start:r_O_Stop + 1])
|
|
267
282
|
comp.partial_Hits.update({partial: [genSeq, orfSeq]})
|
|
@@ -271,531 +286,564 @@ def partial_Hit_Calc(g_Start, g_Stop, g_Strand, o_Start, o_Stop):
|
|
|
271
286
|
g_Stop - 3:g_Stop] + ';Predicted_CDS:' + str(
|
|
272
287
|
o_Start) + '_' + str(o_Stop) + '_' + g_Strand + '_' + comp.genome_Seq[
|
|
273
288
|
o_Start - 1:o_Start + 2] + '_' + comp.genome_Seq[
|
|
274
|
-
o_Stop - 3:o_Stop]
|
|
289
|
+
o_Stop - 3:o_Stop] + ';' + tools
|
|
275
290
|
genSeq = (comp.genome_Seq[g_Start - 1:g_Stop])
|
|
276
291
|
orfSeq = (comp.genome_Seq[o_Start - 1:o_Stop])
|
|
277
292
|
comp.partial_Hits.update({partial: [genSeq, orfSeq]})
|
|
278
293
|
|
|
279
294
|
|
|
280
|
-
def tool_comparison(
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
if
|
|
314
|
-
|
|
315
|
-
elif
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
#
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
o_Start = int(orf_Pos.split(',')[0])
|
|
350
|
-
o_Stop = int(orf_Pos.split(',')[1])
|
|
351
|
-
orf_Details = overlapping_ORFs[orf_Pos]
|
|
352
|
-
m_ORF_Details = orf_Details[:]
|
|
353
|
-
m_ORF_Details.append(g_pos)
|
|
354
|
-
if orf_Pos in comp.matched_ORFs.keys():
|
|
355
|
-
try:
|
|
356
|
-
previously_Covered_Gene = comp.matched_ORFs[g_pos][-1]
|
|
357
|
-
except KeyError:
|
|
358
|
-
last_key = [*comp.matched_ORFs.keys()][-1]
|
|
359
|
-
previously_Covered_Gene = comp.matched_ORFs[last_key][-1]
|
|
360
|
-
comp.multi_Matched_ORFs[orf_Pos] += [g_pos.replace(',', '-'), previously_Covered_Gene.replace(',',
|
|
361
|
-
'-')] # ORF collects multiple gene pos'
|
|
362
|
-
comp.matched_ORFs.update({orf_Pos: m_ORF_Details})
|
|
363
|
-
comp.genes_Detected.update({str(gene_details): orf_Pos})
|
|
364
|
-
match_Statistics(o_Start, o_Stop, g_Start, g_Stop, g_Strand)
|
|
365
|
-
#if verbose == True:
|
|
366
|
-
# print('Partial Match')
|
|
367
|
-
partial_Hit_Calc(g_Start, g_Stop, g_Strand, o_Start, o_Stop)
|
|
368
|
-
elif perfect_Match == False and len(
|
|
369
|
-
overlapping_ORFs) >= 1: # If we have more than 1 potential ORF match, we check to see which is the 'best' hit
|
|
370
|
-
orf_Pos, orf_Details = candidate_ORF_Selection(gene_Set, overlapping_ORFs) # Return best match
|
|
371
|
-
o_Start = int(orf_Pos.split(',')[0])
|
|
372
|
-
o_Stop = int(orf_Pos.split(',')[1])
|
|
373
|
-
m_ORF_Details = orf_Details[:]
|
|
374
|
-
m_ORF_Details.append(g_pos)
|
|
375
|
-
if orf_Pos in comp.matched_ORFs.keys():
|
|
376
|
-
try:
|
|
295
|
+
def tool_comparison(all_orfs, dna_regions, verbose):
|
|
296
|
+
results = collections.OrderedDict() # Store results for each DNA region
|
|
297
|
+
for dna_region in dna_regions: # Loop through each DNA region
|
|
298
|
+
# reset comparator class variables
|
|
299
|
+
comp.reset()
|
|
300
|
+
|
|
301
|
+
ref_genes_list = dna_regions[dna_region][2]
|
|
302
|
+
ref_genes = collections.OrderedDict()
|
|
303
|
+
for d in ref_genes_list:
|
|
304
|
+
ref_genes.update(d)
|
|
305
|
+
comp.genome_Seq = dna_regions[dna_region][0]
|
|
306
|
+
comp.genome_Seq_Rev = revCompIterative(dna_regions[dna_region][0])
|
|
307
|
+
comp.genome_Size = len(dna_regions[dna_region][0])
|
|
308
|
+
|
|
309
|
+
current_orfs = all_orfs[dna_region]
|
|
310
|
+
# sort the ORFs by start position
|
|
311
|
+
|
|
312
|
+
better_pos_orfs_items = [[(int(pos.split(',')[0]), int(pos.split(',')[1])), orf_Details] for pos, orf_Details in current_orfs.items()] #TODO: turn pos into tuple instead of string everywhere
|
|
313
|
+
|
|
314
|
+
for gene_num, gene_details in ref_genes.items(): # Loop through each gene to compare against predicted ORFs
|
|
315
|
+
g_Start = int(gene_details[0])
|
|
316
|
+
g_Stop = int(gene_details[1])
|
|
317
|
+
g_Strand = gene_details[2]
|
|
318
|
+
g_pos = str(g_Start) + ',' + str(g_Stop)
|
|
319
|
+
gene_Set = set(range(g_Start,
|
|
320
|
+
g_Stop + 1)) # Used to check Overlap of ORFs and pick best match - slow but confirms best match
|
|
321
|
+
overlapping_ORFs = collections.OrderedDict()
|
|
322
|
+
perfect_Match = False
|
|
323
|
+
out_Frame = False
|
|
324
|
+
for pos, orf_Details in better_pos_orfs_items: # Check if perfect match, if not check if match covers at least 75% of gene - Loop through ALL ORFs - SLOW
|
|
325
|
+
o_Start,o_Stop = pos
|
|
326
|
+
o_Strand = orf_Details[0]
|
|
327
|
+
#orf_Set = set(range(o_Start, o_Stop + 1)) Removed for optimisation
|
|
328
|
+
if o_Stop <= g_Start or o_Start >= g_Stop: # Not caught up yet
|
|
329
|
+
continue
|
|
330
|
+
elif o_Start == g_Start and o_Stop == g_Stop: # If perfect match, break and skip the rest of the ORFs
|
|
331
|
+
perfect_Match = True
|
|
332
|
+
break
|
|
333
|
+
elif is_double_range(range(o_Start, o_Stop), range(g_Start,g_Stop)): # If ORF is double or more than the length of the gene, we do not count as found.
|
|
334
|
+
continue
|
|
335
|
+
elif g_Start <= o_Start < g_Stop or g_Start < o_Stop < g_Stop: # If ORF Start or Stop is between gene Start or Stop
|
|
336
|
+
#overlap = len(gene_Set.intersection(orf_Set)) # Replaced for optimisation
|
|
337
|
+
overlap = max(min(o_Stop, g_Stop) - max(o_Start, g_Start), -1) + 1
|
|
338
|
+
coverage = 100 * float(overlap) / float(len(gene_Set))
|
|
339
|
+
orf_Details.append(coverage)
|
|
340
|
+
if abs(o_Stop - g_Stop) % 3 == 0 and o_Strand == g_Strand and coverage >= MIN_COVERAGE: # Only continue if ORF covers at least 75% of the gene and is in frame
|
|
341
|
+
overlapping_ORFs.update({f'{o_Start},{o_Stop}': orf_Details})
|
|
342
|
+
elif coverage >= MIN_COVERAGE: # Not in frame / on same strand
|
|
343
|
+
comp.out_Of_Frame_ORFs.update({f'{o_Start},{o_Stop}': orf_Details})
|
|
344
|
+
out_Frame = True
|
|
345
|
+
elif o_Start <= g_Start and o_Stop >= g_Stop: # If ORF extends one or both ends of the gene
|
|
346
|
+
#overlap = len(gene_Set.intersection(orf_Set)) # Replaced for optimisation
|
|
347
|
+
overlap = max(min(o_Stop, g_Stop) - max(o_Start, g_Start), -1) + 1
|
|
348
|
+
coverage = 100 * float(overlap) / float(len(gene_Set))
|
|
349
|
+
orf_Details.append(coverage)
|
|
350
|
+
if abs(o_Stop - g_Stop) % 3 == 0 and o_Strand == g_Strand and coverage >= MIN_COVERAGE: # Only continue if ORF covers at least 75% of the gene and is in frame
|
|
351
|
+
overlapping_ORFs.update({f'{o_Start},{o_Stop}': orf_Details})
|
|
352
|
+
elif coverage >= MIN_COVERAGE:
|
|
353
|
+
comp.out_Of_Frame_ORFs.update({f'{o_Start},{o_Stop}': orf_Details})
|
|
354
|
+
out_Frame = True
|
|
355
|
+
else:
|
|
356
|
+
if verbose == True:
|
|
357
|
+
print("Unexpected Error Finding Predicted CDSs") # Should not happen
|
|
358
|
+
# Now Check that we select the best ORF
|
|
359
|
+
### Multi_Match_ORFs Should contain All genes found by a specific ORF
|
|
360
|
+
if perfect_Match == True: # Check if the ORF is a perfect match to the Gene
|
|
361
|
+
m_ORF_Details = orf_Details[:]
|
|
362
|
+
m_ORF_Details.append(g_pos)
|
|
363
|
+
if g_pos in comp.matched_ORFs.keys():
|
|
377
364
|
previously_Covered_Gene = comp.matched_ORFs[g_pos][-1]
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
comp.
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
365
|
+
comp.multi_Matched_ORFs[g_pos] += [g_pos.replace(',', '-'), previously_Covered_Gene.replace(',',
|
|
366
|
+
'-'), orf_Details[4]] # ORF is same as gene so can use g_pos
|
|
367
|
+
comp.matched_ORFs.update({g_pos: m_ORF_Details})
|
|
368
|
+
comp.genes_Detected.update({str(gene_details): g_pos})
|
|
369
|
+
match_Statistics(o_Start, o_Stop, g_Start, g_Stop, g_Strand)
|
|
370
|
+
perfect_Matched_Genes(g_Start, g_Stop, g_Strand,orf_Details[4])
|
|
371
|
+
#if verbose == True:
|
|
372
|
+
# print('Perfect Match')
|
|
373
|
+
elif perfect_Match == False and len(
|
|
374
|
+
overlapping_ORFs) == 1: # If we do not have a perfect match but 1 ORF which has passed the filtering
|
|
375
|
+
orf_Pos = list(overlapping_ORFs.keys())[0]
|
|
376
|
+
o_Start = int(orf_Pos.split(',')[0])
|
|
377
|
+
o_Stop = int(orf_Pos.split(',')[1])
|
|
378
|
+
orf_Details = overlapping_ORFs[orf_Pos]
|
|
379
|
+
m_ORF_Details = orf_Details[:]
|
|
380
|
+
m_ORF_Details.append(g_pos)
|
|
381
|
+
if orf_Pos in comp.matched_ORFs.keys():
|
|
382
|
+
try:
|
|
383
|
+
previously_Covered_Gene = comp.matched_ORFs[g_pos][-1]
|
|
384
|
+
except KeyError:
|
|
385
|
+
last_key = [*comp.matched_ORFs.keys()][-1]
|
|
386
|
+
previously_Covered_Gene = comp.matched_ORFs[last_key][-1]
|
|
387
|
+
comp.multi_Matched_ORFs[orf_Pos] += [g_pos.replace(',', '-'), previously_Covered_Gene.replace(',',
|
|
388
|
+
'-'), orf_Details[4]] # ORF collects multiple gene pos'
|
|
389
|
+
comp.matched_ORFs.update({orf_Pos: m_ORF_Details})
|
|
390
|
+
comp.genes_Detected.update({str(gene_details): orf_Pos})
|
|
391
|
+
match_Statistics(o_Start, o_Stop, g_Start, g_Stop, g_Strand)
|
|
392
|
+
#if verbose == True:
|
|
393
|
+
# print('Partial Match')
|
|
394
|
+
partial_Hit_Calc(g_Start, g_Stop, g_Strand, o_Start, o_Stop, orf_Details[4])
|
|
395
|
+
elif perfect_Match == False and len(
|
|
396
|
+
overlapping_ORFs) >= 1: # If we have more than 1 potential ORF match, we check to see which is the 'best' hit
|
|
397
|
+
orf_Pos, orf_Details = candidate_ORF_Selection(gene_Set, overlapping_ORFs) # Return best match
|
|
398
|
+
o_Start = int(orf_Pos.split(',')[0])
|
|
399
|
+
o_Stop = int(orf_Pos.split(',')[1])
|
|
400
|
+
m_ORF_Details = orf_Details[:]
|
|
401
|
+
m_ORF_Details.append(g_pos)
|
|
402
|
+
if orf_Pos in comp.matched_ORFs.keys():
|
|
403
|
+
try:
|
|
404
|
+
previously_Covered_Gene = comp.matched_ORFs[g_pos][-1]
|
|
405
|
+
except KeyError:
|
|
406
|
+
last_key = [*comp.matched_ORFs.keys()][-1]
|
|
407
|
+
previously_Covered_Gene = comp.matched_ORFs[last_key][-1]
|
|
408
|
+
comp.multi_Matched_ORFs[orf_Pos] += [g_pos.replace(',', '-'), previously_Covered_Gene.replace(',',
|
|
409
|
+
'-'), orf_Details[4]] # ORF collects multiple gene pos'
|
|
410
|
+
comp.matched_ORFs.update({orf_Pos: m_ORF_Details})
|
|
411
|
+
comp.genes_Detected.update({str(gene_details): orf_Pos})
|
|
412
|
+
match_Statistics(o_Start, o_Stop, g_Start, g_Stop, g_Strand)
|
|
413
|
+
if verbose == True:
|
|
414
|
+
print('There was more than 1 potential Match - Best Chosen')
|
|
415
|
+
partial_Hit_Calc(g_Start, g_Stop, g_Strand, o_Start, o_Stop, orf_Details[4])
|
|
416
|
+
elif out_Frame: # Keep record of ORFs which overlap a gene but in the wrong frame
|
|
417
|
+
if verbose == True:
|
|
418
|
+
print("Out of Frame Predicted CDS")
|
|
419
|
+
genes_Unmatched(g_Start, g_Stop, g_Strand, orf_Details[4]) #
|
|
420
|
+
else:
|
|
421
|
+
genes_Unmatched(g_Start, g_Stop, g_Strand, orf_Details[4]) # No hit
|
|
422
|
+
#if verbose == True:
|
|
423
|
+
# print("No Hit")
|
|
424
|
+
for orf_Key in comp.matched_ORFs: # Remove ORFs from out of frame if ORF was correctly matched to another Gene
|
|
425
|
+
if orf_Key in comp.out_Of_Frame_ORFs:
|
|
426
|
+
del comp.out_Of_Frame_ORFs[orf_Key]
|
|
427
|
+
######################################## ORF Lengths and Precision
|
|
428
|
+
start_Difference = [x for x in comp.start_Difference if x != 0] # Remove 0s (Perfect hits)
|
|
429
|
+
stop_Difference = [x for x in comp.stop_Difference if x != 0]
|
|
430
|
+
if len(start_Difference) >= 1:
|
|
431
|
+
median_Start_Difference = np.median(start_Difference)
|
|
393
432
|
else:
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
g_Start = int(gene_details[0])
|
|
424
|
-
g_Stop = int(gene_details[1])
|
|
425
|
-
g_Strand = gene_details[2]
|
|
426
|
-
gene_Length = (g_Stop - g_Start) +1
|
|
427
|
-
if gene_Length == 0: print(g_Start, g_Stop, "!!!!!!!!!!!!!!!!!!!!!!!!")
|
|
428
|
-
comp.gene_Lengths.append(gene_Length)
|
|
429
|
-
gene_Nuc_Array[g_Start - 1:g_Stop] = True # Changing all between the two positions to 1's
|
|
430
|
-
comp.gene_GC.append(nuc_Count(verbose, g_Start, g_Stop, g_Strand))
|
|
431
|
-
if gene_Length <= SHORT_ORF_LENGTH: # .utils
|
|
432
|
-
comp.gene_Short.append(gene_Length)
|
|
433
|
-
### Calculate overlapping Genes -
|
|
434
|
-
if prev_Gene_Stop > g_Start:
|
|
435
|
-
if '+' in g_Strand:
|
|
436
|
-
comp.gene_Pos_Olap.append(prev_Gene_Stop - g_Start)
|
|
437
|
-
elif '-' in g_Strand:
|
|
438
|
-
comp.gene_Neg_Olap.append(prev_Gene_Stop - g_Start)
|
|
439
|
-
prev_Gene_Overlapped = True
|
|
440
|
-
elif prev_Gene_Stop < g_Start:
|
|
441
|
-
if prev_Gene_Overlapped == True:
|
|
433
|
+
median_Start_Difference = 'N/A'
|
|
434
|
+
if len(stop_Difference) >= 1:
|
|
435
|
+
median_Stop_Difference = np.median(stop_Difference)
|
|
436
|
+
else:
|
|
437
|
+
median_Stop_Difference = 'N/A'
|
|
438
|
+
|
|
439
|
+
# Get Start and Stop Codon Usage
|
|
440
|
+
atg_P, gtg_P, ttg_P, att_P, ctg_P, other_Start_P, other_Starts = start_Codon_Count(current_orfs)
|
|
441
|
+
tag_P, taa_P, tga_P, other_Stop_P, other_Stops = stop_Codon_Count(current_orfs)
|
|
442
|
+
# Count nucleotides found from ALL ORFs
|
|
443
|
+
gene_Nuc_Array = np.zeros((comp.genome_Size), dtype=bool)
|
|
444
|
+
orf_Nuc_Array = np.zeros((comp.genome_Size), dtype=bool)
|
|
445
|
+
matched_ORF_Nuc_Array = np.zeros((comp.genome_Size), dtype=bool)
|
|
446
|
+
|
|
447
|
+
prev_Gene_Stop = 0
|
|
448
|
+
prev_Gene_Overlapped = False
|
|
449
|
+
for gene_Num, gene_details in ref_genes.items(): # Loop through each gene to compare against predicted ORFs
|
|
450
|
+
g_Start = int(gene_details[0])
|
|
451
|
+
g_Stop = int(gene_details[1])
|
|
452
|
+
g_Strand = gene_details[2]
|
|
453
|
+
gene_Length = (g_Stop - g_Start) +1
|
|
454
|
+
if gene_Length == 0: print(g_Start, g_Stop, "!!!!!!!!!!!!!!!!!!!!!!!!")
|
|
455
|
+
comp.gene_Lengths.append(gene_Length)
|
|
456
|
+
gene_Nuc_Array[g_Start - 1:g_Stop] = True # Changing all between the two positions to 1's
|
|
457
|
+
comp.gene_GC.append(nuc_Count(verbose, g_Start, g_Stop, g_Strand))
|
|
458
|
+
if gene_Length <= SHORT_ORF_LENGTH: # .utils
|
|
459
|
+
comp.gene_Short.append(gene_Length)
|
|
460
|
+
### Calculate overlapping Genes -
|
|
461
|
+
if prev_Gene_Stop > g_Start:
|
|
442
462
|
if '+' in g_Strand:
|
|
443
|
-
comp.gene_Pos_Olap.append(
|
|
463
|
+
comp.gene_Pos_Olap.append(prev_Gene_Stop - g_Start)
|
|
444
464
|
elif '-' in g_Strand:
|
|
445
|
-
comp.gene_Neg_Olap.append(
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
if
|
|
465
|
+
comp.gene_Neg_Olap.append(prev_Gene_Stop - g_Start)
|
|
466
|
+
prev_Gene_Overlapped = True
|
|
467
|
+
elif prev_Gene_Stop < g_Start:
|
|
468
|
+
if prev_Gene_Overlapped == True:
|
|
469
|
+
if '+' in g_Strand:
|
|
470
|
+
comp.gene_Pos_Olap.append(0)
|
|
471
|
+
elif '-' in g_Strand:
|
|
472
|
+
comp.gene_Neg_Olap.append(0)
|
|
473
|
+
prev_Gene_Overlapped = False
|
|
474
|
+
prev_Gene_Stop = g_Stop
|
|
475
|
+
if prev_Gene_Overlapped == True: # If last has a prev overlap, count it
|
|
476
|
+
if '+' in g_Strand:
|
|
477
|
+
comp.gene_Pos_Olap.append(0)
|
|
478
|
+
elif '-' in g_Strand:
|
|
479
|
+
comp.gene_Neg_Olap.append(0)
|
|
480
|
+
####
|
|
481
|
+
min_Gene_Length = min(comp.gene_Lengths)
|
|
482
|
+
max_Gene_Length = max(comp.gene_Lengths)
|
|
483
|
+
median_Gene_Length = np.median(comp.gene_Lengths)
|
|
484
|
+
prev_ORF_Stop = 0
|
|
485
|
+
prev_ORF_Overlapped = False
|
|
486
|
+
for o_Positions, orf_Details in current_orfs.items():
|
|
487
|
+
o_Start = int(o_Positions.split(',')[0])
|
|
488
|
+
o_Stop = int(o_Positions.split(',')[1])
|
|
489
|
+
o_Strand = orf_Details[0]
|
|
490
|
+
# Stats just for Unmatched ORFs
|
|
491
|
+
if o_Positions not in list(comp.matched_ORFs.keys()):
|
|
492
|
+
orf_Unmatched(o_Start, o_Stop, o_Strand, orf_Details[4])
|
|
493
|
+
# Get ORF Strand metrics:
|
|
494
|
+
if o_Strand == "+": # Get number of Positive and Negative strand ORFs
|
|
495
|
+
comp.pos_Strand += 1
|
|
496
|
+
elif o_Strand == "-":
|
|
497
|
+
comp.neg_Strand += 1
|
|
498
|
+
orf_Length = (o_Stop - o_Start) +1
|
|
499
|
+
comp.orf_Lengths.append(orf_Length)
|
|
500
|
+
orf_Nuc_Array[o_Start - 1:o_Stop] = True # Changing all between the two positions to 1's
|
|
501
|
+
comp.orf_GC.append(nuc_Count(verbose, o_Start, o_Stop, o_Strand))
|
|
502
|
+
if orf_Length <= SHORT_ORF_LENGTH: # .utils
|
|
503
|
+
comp.orf_Short.append(orf_Length)
|
|
504
|
+
### Calculate overlapping ORFs -
|
|
505
|
+
if prev_ORF_Stop > o_Start:
|
|
506
|
+
if '+' in o_Strand:
|
|
507
|
+
comp.orf_Pos_Olap.append(prev_ORF_Stop - o_Start)
|
|
508
|
+
elif '-' in o_Strand:
|
|
509
|
+
comp.orf_Neg_Olap.append(prev_ORF_Stop - o_Start)
|
|
510
|
+
prev_ORF_Overlapped = True
|
|
511
|
+
elif prev_ORF_Stop < o_Start:
|
|
512
|
+
if prev_ORF_Overlapped == True:
|
|
513
|
+
if '+' in o_Strand:
|
|
514
|
+
comp.orf_Pos_Olap.append(0)
|
|
515
|
+
elif '-' in o_Strand:
|
|
516
|
+
comp.orf_Neg_Olap.append(0)
|
|
517
|
+
prev_ORF_Overlapped = False
|
|
518
|
+
prev_ORF_Stop = o_Stop
|
|
519
|
+
if prev_ORF_Overlapped == True: # If last has a prev overlap, count it
|
|
486
520
|
if '+' in o_Strand:
|
|
487
521
|
comp.orf_Pos_Olap.append(0)
|
|
488
522
|
elif '-' in o_Strand:
|
|
489
523
|
comp.orf_Neg_Olap.append(0)
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
matched_ORF_Nuc_Array[mo_Start - 1:mo_Stop] = True # This is the complete matched orf not the matched orf bits
|
|
507
|
-
|
|
508
|
-
comp.m_ORF_GC.append(nuc_Count(verbose, mo_Start, mo_Stop, mo_Strand))
|
|
509
|
-
if mo_Length <= SHORT_ORF_LENGTH: # .utils
|
|
510
|
-
comp.m_ORF_Short.append(mo_Length)
|
|
511
|
-
### Calculate overlapping Matched ORFs -
|
|
512
|
-
if matched_Prev_ORF_Stop > mo_Start:
|
|
513
|
-
if '+' in mo_Strand:
|
|
514
|
-
comp.m_ORF_Pos_Olap.append(matched_Prev_ORF_Stop - mo_Start)
|
|
515
|
-
elif '-' in mo_Strand:
|
|
516
|
-
comp.m_ORF_Neg_Olap.append(matched_Prev_ORF_Stop - mo_Start)
|
|
517
|
-
matched_Prev_ORF_Overlapped = True
|
|
518
|
-
elif matched_Prev_ORF_Stop < mo_Start:
|
|
519
|
-
if matched_Prev_ORF_Overlapped == True:
|
|
524
|
+
|
|
525
|
+
# Nucleotide Coverage calculated from ORFs matching a gene only
|
|
526
|
+
matched_Prev_ORF_Stop = 0
|
|
527
|
+
matched_Prev_ORF_Overlapped = False
|
|
528
|
+
for mo_Positions, m_ORF_Details in comp.matched_ORFs.items():
|
|
529
|
+
mo_Start = int(mo_Positions.split(',')[0])
|
|
530
|
+
mo_Stop = int(mo_Positions.split(',')[1])
|
|
531
|
+
mo_Strand = m_ORF_Details[0]
|
|
532
|
+
mo_Length = (mo_Stop - mo_Start)
|
|
533
|
+
matched_ORF_Nuc_Array[mo_Start - 1:mo_Stop] = True # This is the complete matched orf not the matched orf bits
|
|
534
|
+
|
|
535
|
+
comp.m_ORF_GC.append(nuc_Count(verbose, mo_Start, mo_Stop, mo_Strand))
|
|
536
|
+
if mo_Length <= SHORT_ORF_LENGTH: # .utils
|
|
537
|
+
comp.m_ORF_Short.append(mo_Length)
|
|
538
|
+
### Calculate overlapping Matched ORFs -
|
|
539
|
+
if matched_Prev_ORF_Stop > mo_Start:
|
|
520
540
|
if '+' in mo_Strand:
|
|
521
|
-
comp.m_ORF_Pos_Olap.append(
|
|
541
|
+
comp.m_ORF_Pos_Olap.append(matched_Prev_ORF_Stop - mo_Start)
|
|
522
542
|
elif '-' in mo_Strand:
|
|
523
|
-
comp.m_ORF_Neg_Olap.append(
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
|
|
569
|
-
|
|
570
|
-
recall = format(0.00, '.2f')
|
|
571
|
-
try:
|
|
572
|
-
false_Discovery_Rate = format(float(FP) / (float(FP) + float(TP)), '.2f')
|
|
573
|
-
except ZeroDivisionError:
|
|
574
|
-
false_Discovery_Rate = 'N/A'
|
|
575
|
-
min_ORF_Length = min(comp.orf_Lengths)
|
|
576
|
-
max_ORF_Length = max(comp.orf_Lengths)
|
|
577
|
-
median_ORF_Length = np.median(comp.orf_Lengths)
|
|
578
|
-
|
|
579
|
-
##########################################################################
|
|
580
|
-
# Metrics - There are numerous cases where certain metric calculations may return a ZeroDivError.
|
|
581
|
-
ORFs_Difference = format(100 * (len(orfs) - len(ref_genes)) / len(ref_genes), '.2f') # Difference off +/-
|
|
582
|
-
genes_Detected_Percentage = format(100 * (len(comp.genes_Detected) / len(ref_genes)), '.2f')
|
|
583
|
-
matched_ORF_Percentage = format(100 * (len(comp.matched_ORFs) / len(orfs)), '.2f')
|
|
584
|
-
all_ORF_Olap = (comp.orf_Pos_Olap + comp.orf_Neg_Olap) # Combine pos and neg strand overlaps
|
|
585
|
-
matched_ORF_Olap = (comp.m_ORF_Pos_Olap + comp.m_ORF_Neg_Olap)
|
|
586
|
-
all_Gene_Olap = (comp.gene_Pos_Olap + comp.gene_Neg_Olap)
|
|
587
|
-
|
|
588
|
-
if all_ORF_Olap: # If no overlapping ORFs
|
|
543
|
+
comp.m_ORF_Neg_Olap.append(matched_Prev_ORF_Stop - mo_Start)
|
|
544
|
+
matched_Prev_ORF_Overlapped = True
|
|
545
|
+
elif matched_Prev_ORF_Stop < mo_Start:
|
|
546
|
+
if matched_Prev_ORF_Overlapped == True:
|
|
547
|
+
if '+' in mo_Strand:
|
|
548
|
+
comp.m_ORF_Pos_Olap.append(0)
|
|
549
|
+
elif '-' in mo_Strand:
|
|
550
|
+
comp.m_ORF_Neg_Olap.append(0)
|
|
551
|
+
matched_Prev_ORF_Overlapped = False
|
|
552
|
+
matched_Prev_ORF_Stop = mo_Stop
|
|
553
|
+
if matched_Prev_ORF_Overlapped == True: # If last has a prev overlap, count it
|
|
554
|
+
if '+' in mo_Strand:
|
|
555
|
+
comp.m_ORF_Pos_Olap.append(0)
|
|
556
|
+
elif '-' in mo_Strand:
|
|
557
|
+
comp.m_ORF_Neg_Olap.append(0)
|
|
558
|
+
####
|
|
559
|
+
gene_Coverage_Genome = format(100 * np.sum(gene_Nuc_Array) / comp.genome_Size, '.2f')
|
|
560
|
+
orf_Coverage_Genome = format(100 * np.sum(orf_Nuc_Array) / comp.genome_Size, '.2f')
|
|
561
|
+
matched_ORF_Coverage_Genome = format(100 * np.sum(matched_ORF_Nuc_Array) / comp.genome_Size,
|
|
562
|
+
'.2f') # This gets the nts which are in matched ORFs - Check below
|
|
563
|
+
# matched_ORF_Nuc_AND_Gene = np.logical_and(matched_ORF_Nuc_Array,gene_Nuc_Array) + [0 for i in range(len(gene_Nuc_Array))] # This gets the nts which are in both matched ORFs and detected genes
|
|
564
|
+
# matched_ORF_Coverage_Genome = format(100 * np.count_nonzero(matched_ORF_Nuc_AND_Gene) / comp.genome_Size,'.2f')
|
|
565
|
+
|
|
566
|
+
# gene and orf nucleotide Intersection
|
|
567
|
+
gene_ORF_Nuc_Intersection = np.sum(gene_Nuc_Array & orf_Nuc_Array)
|
|
568
|
+
# not gene but orf nucleotides
|
|
569
|
+
not_Gene_Nuc_Array = np.logical_not(gene_Nuc_Array)
|
|
570
|
+
not_Gene_Nuc_And_ORF_Count = np.sum(not_Gene_Nuc_Array & orf_Nuc_Array)
|
|
571
|
+
# not orf nucleotides but gene
|
|
572
|
+
not_ORF_Nuc_Array = np.logical_not(orf_Nuc_Array)
|
|
573
|
+
not_ORF_Nuc_And_Gene_Count = np.sum(not_ORF_Nuc_Array & gene_Nuc_Array)
|
|
574
|
+
# not gene or orf nucleotides
|
|
575
|
+
not_Gene_Nuc_Not_ORF_Nuc_Count = np.sum(not_Gene_Nuc_Array & not_ORF_Nuc_Array)
|
|
576
|
+
# Nucleotide 'accuracy' - Normalised by number of nucleotides annotated by a gene
|
|
577
|
+
NT_TP = format(gene_ORF_Nuc_Intersection / np.sum(gene_Nuc_Array), '.2f')
|
|
578
|
+
NT_FP = format(not_Gene_Nuc_And_ORF_Count / np.sum(not_Gene_Nuc_Array), '.2f')
|
|
579
|
+
NT_FN = format(not_ORF_Nuc_And_Gene_Count / np.sum(gene_Nuc_Array), '.2f')
|
|
580
|
+
NT_TN = format(not_Gene_Nuc_Not_ORF_Nuc_Count / np.sum(not_Gene_Nuc_Array), '.2f')
|
|
581
|
+
NT_Precision = format(gene_ORF_Nuc_Intersection / (gene_ORF_Nuc_Intersection + not_Gene_Nuc_And_ORF_Count), '.2f')
|
|
582
|
+
NT_Recall = format(gene_ORF_Nuc_Intersection / (gene_ORF_Nuc_Intersection + not_ORF_Nuc_And_Gene_Count), '.2f')
|
|
583
|
+
NT_False_Discovery_Rate = format(
|
|
584
|
+
not_Gene_Nuc_And_ORF_Count / (not_Gene_Nuc_And_ORF_Count + gene_ORF_Nuc_Intersection), '.2f')
|
|
585
|
+
################################# Precision and Recall of whole ORFs and Genes
|
|
586
|
+
TP = format(len(comp.genes_Detected) / len(ref_genes), '.2f')
|
|
587
|
+
FP = format(len(comp.unmatched_ORFs) / len(ref_genes), '.2f')
|
|
588
|
+
FN = format(len(comp.genes_Undetected) / len(ref_genes), '.2f')
|
|
589
|
+
#################################################### Need a better way to handle 'no hits/ORFs'
|
|
589
590
|
try:
|
|
590
|
-
|
|
591
|
-
matched_Overlap_Difference = format(100 * (len(matched_ORF_Olap) - len(all_Gene_Olap)) / len(all_Gene_Olap),
|
|
592
|
-
'.2f')
|
|
591
|
+
precision = format(float(TP) / (float(TP) + float(FP)), '.2f')
|
|
593
592
|
except ZeroDivisionError:
|
|
593
|
+
precision = format(0.00, '.2f')
|
|
594
|
+
try:
|
|
595
|
+
recall = format(float(TP) / (float(TP) + float(FN)), '.2f')
|
|
596
|
+
except ZeroDivisionError:
|
|
597
|
+
recall = format(0.00, '.2f')
|
|
598
|
+
try:
|
|
599
|
+
false_Discovery_Rate = format(float(FP) / (float(FP) + float(TP)), '.2f')
|
|
600
|
+
except ZeroDivisionError:
|
|
601
|
+
false_Discovery_Rate = 'N/A'
|
|
602
|
+
min_ORF_Length = min(comp.orf_Lengths)
|
|
603
|
+
max_ORF_Length = max(comp.orf_Lengths)
|
|
604
|
+
median_ORF_Length = np.median(comp.orf_Lengths)
|
|
605
|
+
|
|
606
|
+
##########################################################################
|
|
607
|
+
# Metrics - There are numerous cases where certain metric calculations may raise a ZeroDivisionError.
|
|
608
|
+
ORFs_Difference = format(100 * (len(current_orfs) - len(ref_genes)) / len(ref_genes), '.2f') # Difference off +/-
|
|
609
|
+
genes_Detected_Percentage = format(100 * (len(comp.genes_Detected) / len(ref_genes)), '.2f')
|
|
610
|
+
matched_ORF_Percentage = format(100 * (len(comp.matched_ORFs) / len(current_orfs)), '.2f')
|
|
611
|
+
all_ORF_Olap = (comp.orf_Pos_Olap + comp.orf_Neg_Olap) # Combine pos and neg strand overlaps
|
|
612
|
+
matched_ORF_Olap = (comp.m_ORF_Pos_Olap + comp.m_ORF_Neg_Olap)
|
|
613
|
+
all_Gene_Olap = (comp.gene_Pos_Olap + comp.gene_Neg_Olap)
|
|
614
|
+
|
|
615
|
+
if all_ORF_Olap: # If no overlapping ORFs
|
|
616
|
+
try:
|
|
617
|
+
overlap_Difference = format(100 * (len(all_ORF_Olap) - len(all_Gene_Olap)) / len(all_Gene_Olap), '.2f')
|
|
618
|
+
matched_Overlap_Difference = format(100 * (len(matched_ORF_Olap) - len(all_Gene_Olap)) / len(all_Gene_Olap),
|
|
619
|
+
'.2f')
|
|
620
|
+
except ZeroDivisionError:
|
|
621
|
+
overlap_Difference = 'N/A'
|
|
622
|
+
matched_Overlap_Difference = 'N/A'
|
|
623
|
+
num_All_ORF_Olap = len(all_ORF_Olap)
|
|
624
|
+
if matched_ORF_Olap:
|
|
625
|
+
max_Matched_ORF_Olap = max(matched_ORF_Olap)
|
|
626
|
+
matched_Median_ORF_Overlap = format(np.median(matched_ORF_Olap), '.2f')
|
|
627
|
+
else:
|
|
628
|
+
max_Matched_ORF_Olap = 'N/A'
|
|
629
|
+
matched_Median_ORF_Overlap = 'N/A'
|
|
630
|
+
max_All_ORF_Olap = max(all_ORF_Olap)
|
|
631
|
+
median_ORF_Overlap = format(np.median(all_ORF_Olap), '.2f')
|
|
632
|
+
else:
|
|
594
633
|
overlap_Difference = 'N/A'
|
|
595
634
|
matched_Overlap_Difference = 'N/A'
|
|
596
|
-
|
|
597
|
-
if matched_ORF_Olap:
|
|
598
|
-
max_Matched_ORF_Olap = max(matched_ORF_Olap)
|
|
599
|
-
matched_Median_ORF_Overlap = format(np.median(matched_ORF_Olap), '.2f')
|
|
600
|
-
else:
|
|
635
|
+
num_All_ORF_Olap = 0
|
|
601
636
|
max_Matched_ORF_Olap = 'N/A'
|
|
637
|
+
max_All_ORF_Olap = 'N/A'
|
|
638
|
+
median_ORF_Overlap = 'N/A'
|
|
602
639
|
matched_Median_ORF_Overlap = 'N/A'
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
|
|
607
|
-
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
|
|
657
|
-
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
|
|
665
|
-
|
|
666
|
-
|
|
667
|
-
|
|
668
|
-
|
|
669
|
-
|
|
670
|
-
|
|
671
|
-
|
|
672
|
-
|
|
673
|
-
|
|
674
|
-
mg_Stops.
|
|
675
|
-
|
|
676
|
-
|
|
677
|
-
|
|
678
|
-
|
|
679
|
-
|
|
680
|
-
|
|
681
|
-
|
|
682
|
-
|
|
683
|
-
|
|
684
|
-
|
|
685
|
-
|
|
686
|
-
|
|
687
|
-
|
|
688
|
-
|
|
689
|
-
|
|
690
|
-
|
|
691
|
-
|
|
692
|
-
|
|
693
|
-
|
|
694
|
-
|
|
695
|
-
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
|
|
699
|
-
|
|
700
|
-
|
|
701
|
-
|
|
702
|
-
|
|
703
|
-
|
|
704
|
-
|
|
705
|
-
|
|
706
|
-
uo_Stops.
|
|
707
|
-
|
|
708
|
-
|
|
709
|
-
|
|
710
|
-
|
|
711
|
-
|
|
712
|
-
|
|
713
|
-
|
|
714
|
-
|
|
715
|
-
|
|
716
|
-
|
|
717
|
-
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
|
|
721
|
-
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
|
|
726
|
-
|
|
727
|
-
|
|
728
|
-
|
|
729
|
-
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
|
|
733
|
-
|
|
734
|
-
|
|
735
|
-
|
|
736
|
-
|
|
737
|
-
|
|
738
|
-
|
|
739
|
-
|
|
740
|
-
|
|
741
|
-
|
|
742
|
-
|
|
743
|
-
|
|
744
|
-
|
|
745
|
-
|
|
746
|
-
|
|
747
|
-
|
|
748
|
-
|
|
749
|
-
|
|
750
|
-
|
|
751
|
-
|
|
752
|
-
|
|
753
|
-
|
|
754
|
-
|
|
755
|
-
|
|
756
|
-
|
|
757
|
-
|
|
758
|
-
|
|
759
|
-
|
|
760
|
-
|
|
761
|
-
|
|
762
|
-
|
|
763
|
-
|
|
764
|
-
|
|
765
|
-
|
|
766
|
-
|
|
767
|
-
|
|
768
|
-
|
|
769
|
-
|
|
770
|
-
|
|
771
|
-
|
|
772
|
-
|
|
773
|
-
|
|
774
|
-
|
|
775
|
-
|
|
776
|
-
|
|
777
|
-
|
|
778
|
-
|
|
779
|
-
|
|
780
|
-
|
|
781
|
-
|
|
782
|
-
|
|
783
|
-
|
|
784
|
-
|
|
785
|
-
|
|
786
|
-
|
|
787
|
-
|
|
788
|
-
|
|
789
|
-
|
|
790
|
-
|
|
791
|
-
|
|
792
|
-
|
|
793
|
-
|
|
794
|
-
|
|
795
|
-
|
|
796
|
-
|
|
797
|
-
|
|
798
|
-
|
|
799
|
-
|
|
800
|
-
|
|
801
|
-
|
|
640
|
+
if len(matched_ORF_Olap) == 0: # -100.00 is not informative
|
|
641
|
+
matched_Overlap_Difference = 'N/A'
|
|
642
|
+
|
|
643
|
+
# Need to NA everything
|
|
644
|
+
|
|
645
|
+
if comp.orf_Short and comp.gene_Short: # IF Short-ORFs/Genes
|
|
646
|
+
short_ORF_Difference = format(100 * (len(comp.orf_Short) - len(comp.gene_Short)) / len(comp.gene_Short), '.2f')
|
|
647
|
+
matched_Short_ORF_Difference = format(
|
|
648
|
+
100 * (len(comp.m_ORF_Short) - len(comp.gene_Short)) / len(comp.gene_Short), '.2f')
|
|
649
|
+
num_ORF_Short = len(comp.orf_Short)
|
|
650
|
+
num_Matched_ORF_Short = len(comp.m_ORF_Short)
|
|
651
|
+
elif comp.orf_Short: # If only Short-ORFs
|
|
652
|
+
num_ORF_Short = len(comp.orf_Short)
|
|
653
|
+
num_Matched_ORF_Short = 'N/A'
|
|
654
|
+
short_ORF_Difference = (num_ORF_Short * 100)
|
|
655
|
+
matched_Short_ORF_Difference = 'N/A'
|
|
656
|
+
else: # If only Short-Genes and Undetected StORFs
|
|
657
|
+
comp.gene_Short
|
|
658
|
+
short_ORF_Difference = 'N/A'
|
|
659
|
+
matched_Short_ORF_Difference = 'N/A'
|
|
660
|
+
num_ORF_Short = 0
|
|
661
|
+
num_Matched_ORF_Short = 'N/A'
|
|
662
|
+
if num_Matched_ORF_Short == 0: # -100.00 is not informative
|
|
663
|
+
matched_Short_ORF_Difference = 'N/A'
|
|
664
|
+
|
|
665
|
+
median_Length_Difference = format(100 * (median_ORF_Length - median_Gene_Length) / median_Gene_Length, '.2f')
|
|
666
|
+
min_Length_Difference = format(100 * (min_ORF_Length - min_Gene_Length) / min_Gene_Length, '.2f')
|
|
667
|
+
max_Length_Difference = format(100 * (max_ORF_Length - max_Gene_Length) / max_Gene_Length, '.2f')
|
|
668
|
+
pos_Strand_Percentage = format(comp.pos_Strand / len(current_orfs), '.2f')
|
|
669
|
+
neg_Strand_Percentage = format(comp.neg_Strand / len(current_orfs), '.2f')
|
|
670
|
+
median_ORF_GC = np.median(comp.orf_GC)
|
|
671
|
+
matched_Median_ORF_GC = np.median(comp.m_ORF_GC)
|
|
672
|
+
median_Gene_GC = np.median(comp.gene_GC)
|
|
673
|
+
median_GC_Difference = format(100 * (float(median_ORF_GC) - float(median_Gene_GC)) / float(median_Gene_GC), '.2f')
|
|
674
|
+
matched_Median_GC_Difference = format(
|
|
675
|
+
100 * (float(matched_Median_ORF_GC) - float(median_Gene_GC)) / float(median_Gene_GC), '.2f')
|
|
676
|
+
|
|
677
|
+
if comp.matched_ORFs: # No ORFs detected a gene
|
|
678
|
+
extended_CDS_Percentage = format(100 * comp.extended_CDS / len(comp.matched_ORFs), '.2f')
|
|
679
|
+
extended_Start_Percentage = format(100 * comp.extended_Start / len(comp.matched_ORFs), '.2f')
|
|
680
|
+
extended_Stop_Percentage = format(100 * comp.extended_Stop / len(comp.matched_ORFs), '.2f')
|
|
681
|
+
perfect_Matches_Percentage = format(100 * len(comp.perfect_Matches) / len(comp.matched_ORFs), '.2f')
|
|
682
|
+
perfect_Starts_Percentage = format(100 * comp.perfect_Starts / len(comp.matched_ORFs), '.2f')
|
|
683
|
+
perfect_Stops_Percentage = format(100 * comp.perfect_Stops / len(comp.matched_ORFs), '.2f')
|
|
684
|
+
else:
|
|
685
|
+
# correct_Frame_Percentage = 0
|
|
686
|
+
extended_CDS_Percentage = format(0.00, '.2f')
|
|
687
|
+
extended_Start_Percentage = format(0.00, '.2f')
|
|
688
|
+
extended_Stop_Percentage = format(0.00, '.2f')
|
|
689
|
+
perfect_Matches_Percentage = format(0.00, '.2f')
|
|
690
|
+
perfect_Starts_Percentage = format(0.00, '.2f')
|
|
691
|
+
perfect_Stops_Percentage = format(0.00, '.2f')
|
|
692
|
+
################### Missed Genes Metrics:
|
|
693
|
+
if comp.genes_Undetected:
|
|
694
|
+
mg_Starts = []
|
|
695
|
+
mg_Stops = []
|
|
696
|
+
mg_Lengths = []
|
|
697
|
+
mg_Strands = []
|
|
698
|
+
for mg, seq in comp.genes_Undetected.items():
|
|
699
|
+
mg = mg.split(',')
|
|
700
|
+
mg_Starts.append(mg[3])
|
|
701
|
+
mg_Stops.append(mg[4])
|
|
702
|
+
mg_Strands.append(mg[2])
|
|
703
|
+
mg_Lengths.append(int(mg[1]) - int(mg[0]))
|
|
704
|
+
|
|
705
|
+
mg_ATG = 100 * mg_Starts.count('ATG') / len(comp.genes_Undetected)
|
|
706
|
+
mg_GTG = 100 * mg_Starts.count('GTG') / len(comp.genes_Undetected)
|
|
707
|
+
mg_TTG = 100 * mg_Starts.count('TTG') / len(comp.genes_Undetected)
|
|
708
|
+
mg_ATT = 100 * mg_Starts.count('ATT') / len(comp.genes_Undetected)
|
|
709
|
+
mg_CTG = 100 * mg_Starts.count('CTG') / len(comp.genes_Undetected)
|
|
710
|
+
mg_O_Start = 100 - (mg_ATG + mg_GTG + mg_TTG + mg_ATT + mg_CTG)
|
|
711
|
+
mg_TGA = 100 * mg_Stops.count('TGA') / len(comp.genes_Undetected)
|
|
712
|
+
mg_TAA = 100 * mg_Stops.count('TAA') / len(comp.genes_Undetected)
|
|
713
|
+
mg_TAG = 100 * mg_Stops.count('TAG') / len(comp.genes_Undetected)
|
|
714
|
+
mg_O_Stop = 100 - (mg_TGA + mg_TAA + mg_TAG)
|
|
715
|
+
median_mg_Len = np.median(mg_Lengths)
|
|
716
|
+
mg_Pos = mg_Strands.count('+')
|
|
717
|
+
mg_Neg = mg_Strands.count('-')
|
|
718
|
+
undetected_Gene_Metrics = (
|
|
719
|
+
format(mg_ATG, '.2f'), format(mg_GTG, '.2f'), format(mg_TTG, '.2f'), format(mg_ATT, '.2f'),
|
|
720
|
+
format(mg_CTG, '.2f'), format(mg_O_Start, '.2f'), format(mg_TGA, '.2f'), format(mg_TAA, '.2f'),
|
|
721
|
+
format(mg_TAG, '.2f'), format(mg_O_Stop, '.2f'), format(median_mg_Len, '.2f'), mg_Pos, mg_Neg)
|
|
722
|
+
else:
|
|
723
|
+
undetected_Gene_Metrics = ''
|
|
724
|
+
#################### Unmatched ORF Metrics:
|
|
725
|
+
if comp.unmatched_ORFs:
|
|
726
|
+
uo_Starts = []
|
|
727
|
+
uo_Stops = []
|
|
728
|
+
uo_Lengths = []
|
|
729
|
+
uo_Strands = []
|
|
730
|
+
for uo, seq in comp.unmatched_ORFs.items():
|
|
731
|
+
uo = uo.split(',')
|
|
732
|
+
uo_Starts.append(uo[3])
|
|
733
|
+
uo_Stops.append(uo[4])
|
|
734
|
+
uo_Strands.append(uo[2])
|
|
735
|
+
uo_Lengths.append(int(uo[1]) - int(uo[0]))
|
|
736
|
+
uo_ATG = 100 * uo_Starts.count('ATG') / len(comp.unmatched_ORFs)
|
|
737
|
+
uo_GTG = 100 * uo_Starts.count('GTG') / len(comp.unmatched_ORFs)
|
|
738
|
+
uo_TTG = 100 * uo_Starts.count('TTG') / len(comp.unmatched_ORFs)
|
|
739
|
+
uo_ATT = 100 * uo_Starts.count('ATT') / len(comp.unmatched_ORFs)
|
|
740
|
+
uo_CTG = 100 * uo_Starts.count('CTG') / len(comp.unmatched_ORFs)
|
|
741
|
+
uo_O_Start = 100 - (uo_ATG + uo_GTG + uo_TTG + uo_ATT + uo_CTG)
|
|
742
|
+
uo_TGA = 100 * uo_Stops.count('TGA') / len(comp.unmatched_ORFs)
|
|
743
|
+
uo_TAA = 100 * uo_Stops.count('TAA') / len(comp.unmatched_ORFs)
|
|
744
|
+
uo_TAG = 100 * uo_Stops.count('TAG') / len(comp.unmatched_ORFs)
|
|
745
|
+
uo_O_Stop = 100 - (uo_TGA + uo_TAA + uo_TAG)
|
|
746
|
+
# uo_O_Stop = 100 * uo_O_Stop / len(comp.unmatched_ORFs) ########WHY?
|
|
747
|
+
median_uo_Len = np.median(uo_Lengths)
|
|
748
|
+
uo_Pos = uo_Strands.count('+')
|
|
749
|
+
uo_Neg = uo_Strands.count('-')
|
|
750
|
+
unmatched_ORF_Metrics = (
|
|
751
|
+
format(uo_ATG, '.2f'), format(uo_GTG, '.2f'), format(uo_TTG, '.2f'), format(uo_ATT, '.2f'),
|
|
752
|
+
format(uo_CTG, '.2f'), format(uo_O_Start, '.2f'), format(uo_TGA, '.2f'), format(uo_TAA, '.2f'),
|
|
753
|
+
format(uo_TAG, '.2f'), format(uo_O_Stop, '.2f'), format(median_uo_Len, '.2f'), uo_Pos, uo_Neg)
|
|
754
|
+
else:
|
|
755
|
+
unmatched_ORF_Metrics = ''
|
|
756
|
+
#################################
|
|
757
|
+
# Rep_Metrics - This is the final report of metrics
|
|
758
|
+
rep_Metrics = collections.OrderedDict(
|
|
759
|
+
{'Percentage_of_Genes_Detected': genes_Detected_Percentage,
|
|
760
|
+
'genes_Undetected': comp.genes_Undetected,
|
|
761
|
+
'undetected_Gene_Metrics': undetected_Gene_Metrics,
|
|
762
|
+
'gene_Coverage_Genome': gene_Coverage_Genome,
|
|
763
|
+
'Percentage_of_ORFs_that_Detected_a_Gene': matched_ORF_Percentage,
|
|
764
|
+
'Percent_Difference_of_All_ORFs': ORFs_Difference,
|
|
765
|
+
'Median_Length_Difference': median_Length_Difference,
|
|
766
|
+
'Percentage_of_Perfect_Matches': perfect_Matches_Percentage,
|
|
767
|
+
'Median_Start_Difference_of_Matched_ORFs': median_Start_Difference,
|
|
768
|
+
'Median_Stop_Difference_of_Matched_ORFs': median_Stop_Difference,
|
|
769
|
+
'Percentage_Difference_of_Matched_Overlapping_CDSs': matched_Overlap_Difference,
|
|
770
|
+
'Percent_Difference_of_Short-Matched-ORFs': matched_Short_ORF_Difference,
|
|
771
|
+
'Precision': precision,
|
|
772
|
+
'Recall': recall,
|
|
773
|
+
'False_Discovery_Rate': false_Discovery_Rate})
|
|
774
|
+
# Pred Metrics - This is the final report of metrics
|
|
775
|
+
pred_metrics = collections.OrderedDict(
|
|
776
|
+
{'Number_of_ORFs': len(current_orfs), 'Percent_Difference_of_All_ORFs': ORFs_Difference,
|
|
777
|
+
'perfect_Matches': comp.perfect_Matches,
|
|
778
|
+
'unmatched_ORFs': comp.unmatched_ORFs,
|
|
779
|
+
'unmatched_ORF_Metrics': unmatched_ORF_Metrics,
|
|
780
|
+
'orf_Coverage_Genome': orf_Coverage_Genome,
|
|
781
|
+
'matched_ORF_Coverage_Genome': matched_ORF_Coverage_Genome,
|
|
782
|
+
'multi_Matched_ORFs': comp.multi_Matched_ORFs,
|
|
783
|
+
'partial_Hits': comp.partial_Hits,
|
|
784
|
+
'Number_of_ORFs_that_Detected_a_Gene': len(comp.matched_ORFs),
|
|
785
|
+
'Percentage_of_ORFs_that_Detected_a_Gene': matched_ORF_Percentage,
|
|
786
|
+
'Number_of_Genes_Detected': len(comp.genes_Detected),
|
|
787
|
+
'Percentage_of_Genes_Detected': genes_Detected_Percentage, 'Median_Length_of_All_ORFs': median_ORF_Length,
|
|
788
|
+
'Median_Length_Difference': median_Length_Difference,
|
|
789
|
+
'Minimum_Length_of_All_ORFs': min_ORF_Length, 'Minimum_Length_Difference': min_Length_Difference,
|
|
790
|
+
'Maximum_Length_of_All_ORFs': max_ORF_Length, 'Maximum_Length_Difference': max_Length_Difference,
|
|
791
|
+
'Median_GC_content_of_All_ORFs': format(median_ORF_GC, '.2f'),
|
|
792
|
+
'Percent_Difference_of_All_ORFs_Median_GC': median_GC_Difference,
|
|
793
|
+
'Median_GC_content_of_Matched_ORFs': format(matched_Median_ORF_GC, '.2f'),
|
|
794
|
+
'Percent_Difference_of_Matched_ORF_GC': matched_Median_GC_Difference,
|
|
795
|
+
'Number_of_ORFs_which_Overlap_Another_ORF': num_All_ORF_Olap,
|
|
796
|
+
'Percent_Difference_of_Overlapping_ORFs': overlap_Difference,
|
|
797
|
+
'Maximum_ORF_Overlap': max_All_ORF_Olap, 'Median_ORF_Overlap': median_ORF_Overlap,
|
|
798
|
+
'Number_of_Matched_ORFs_Overlapping_Another_ORF': len(matched_ORF_Olap),
|
|
799
|
+
'Percentage_Difference_of_Matched_Overlapping_CDSs': matched_Overlap_Difference,
|
|
800
|
+
'Maximum_Matched_ORF_Overlap': max_Matched_ORF_Olap, 'Median_Matched_ORF_Overlap': matched_Median_ORF_Overlap,
|
|
801
|
+
'Number_of_Short-ORFs': num_ORF_Short, 'Percent_Difference_of_Short-ORFs': short_ORF_Difference,
|
|
802
|
+
'Number_of_Short-Matched-ORFs': num_Matched_ORF_Short,
|
|
803
|
+
'Percent_Difference_of_Short-Matched-ORFs': matched_Short_ORF_Difference,
|
|
804
|
+
'Number_of_Perfect_Matches': len(comp.perfect_Matches),
|
|
805
|
+
'Percentage_of_Perfect_Matches': perfect_Matches_Percentage,
|
|
806
|
+
'Number_of_Perfect_Starts': comp.perfect_Starts, 'Percentage_of_Perfect_Starts': perfect_Starts_Percentage,
|
|
807
|
+
'Number_of_Perfect_Stops': comp.perfect_Stops, 'Percentage_of_Perfect_Stops': perfect_Stops_Percentage,
|
|
808
|
+
'Number_of_Out_of_Frame_ORFs': len(comp.out_Of_Frame_ORFs),
|
|
809
|
+
'Number_of_Matched_ORFs_Extending_a_Coding_Region': comp.extended_CDS,
|
|
810
|
+
'Percentage_of_Matched_ORFs_Extending_a_Coding_Region': extended_CDS_Percentage,
|
|
811
|
+
'Number_of_Matched_ORFs_Extending_Start_Region': comp.extended_Start,
|
|
812
|
+
'Percentage_of_Matched_ORFs_Extending_Start_Region': extended_Start_Percentage,
|
|
813
|
+
'Number_of_Matched_ORFs_Extending_Stop_Region': comp.extended_Stop,
|
|
814
|
+
'Percentage_of_Matched_ORFs_Extending_Stop_Region': extended_Stop_Percentage,
|
|
815
|
+
'Number_of_All_ORFs_on_Positive_Strand': comp.pos_Strand,
|
|
816
|
+
'Percentage_of_All_ORFs_on_Positive_Strand': pos_Strand_Percentage,
|
|
817
|
+
'Number_of_All_ORFs_on_Negative_Strand': comp.neg_Strand,
|
|
818
|
+
'Percentage_of_All_ORFs_on_Negative_Strand': neg_Strand_Percentage,
|
|
819
|
+
'Median_Start_Difference_of_Matched_ORFs': median_Start_Difference,
|
|
820
|
+
'Median_Stop_Difference_of_Matched_ORFs': median_Stop_Difference, 'ATG_Start_Percentage': atg_P,
|
|
821
|
+
'GTG_Start_Percentage': gtg_P, 'TTG_Start_Percentage': ttg_P,
|
|
822
|
+
'ATT_Start_Percentage': att_P, 'CTG_Start_Percentage': ctg_P, 'Other_Start_Codon_Percentage': other_Start_P,
|
|
823
|
+
'TAG_Stop_Percentage': tag_P, 'TAA_Stop_Percentage': taa_P,
|
|
824
|
+
'TGA_Stop_Percentage': tga_P, 'Other_Stop_Codon_Percentage': other_Stop_P, 'True_Positive': TP,
|
|
825
|
+
'False_Positive': FP, 'False_Negative': FN, 'Precision': precision,
|
|
826
|
+
'Recall': recall, 'False_Discovery_Rate': false_Discovery_Rate, 'Nucleotide_True_Positive': NT_TP,
|
|
827
|
+
'Nucleotide_False_Positive': NT_FP, 'Nucleotide_True_Negative': NT_TN,
|
|
828
|
+
'Nucleotide_False_Negative': NT_FN, 'Nucleotide_Precision': NT_Precision, 'Nucleotide_Recall': NT_Recall,
|
|
829
|
+
'Nucleotide_False_Discovery_Rate': NT_False_Discovery_Rate,
|
|
830
|
+
'ORF_Nucleotide_Coverage_of_Genome': orf_Coverage_Genome,
|
|
831
|
+
'Matched_ORF_Nucleotide_Coverage_of_Genome': matched_ORF_Coverage_Genome})
|
|
832
|
+
result = collections.OrderedDict()
|
|
833
|
+
result.update({
|
|
834
|
+
'rep_metrics': rep_Metrics,
|
|
835
|
+
'pred_metrics': pred_metrics,
|
|
836
|
+
})
|
|
837
|
+
|
|
838
|
+
# To account for unbalanced data
|
|
839
|
+
for m_key, m_value in result.items():
|
|
840
|
+
if m_value == 'nan':
|
|
841
|
+
result[m_key] = 'N/A'
|
|
842
|
+
|
|
843
|
+
results[dna_region] = result
|
|
844
|
+
|
|
845
|
+
|
|
846
|
+
print("Finished calculating metrics for: ", dna_region)
|
|
847
|
+
|
|
848
|
+
|
|
849
|
+
return results # Return the results dictionary containing all metrics and details
|