ORForise 1.4.3__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30) hide show
  1. ORForise/Aggregate_Compare.py +318 -133
  2. ORForise/Annotation_Compare.py +243 -125
  3. ORForise/Comparator.py +600 -552
  4. ORForise/ORForise_Analysis/genome_Metrics.py +51 -33
  5. ORForise/Tools/Augustus/Augustus.py +30 -23
  6. ORForise/Tools/Balrog/Balrog.py +31 -23
  7. ORForise/Tools/EasyGene/EasyGene.py +30 -22
  8. ORForise/Tools/FGENESB/FGENESB.py +32 -25
  9. ORForise/Tools/FragGeneScan/FragGeneScan.py +29 -22
  10. ORForise/Tools/GFF/GFF.py +51 -47
  11. ORForise/Tools/GLIMMER_3/GLIMMER_3.py +34 -27
  12. ORForise/Tools/GeneMark/GeneMark.py +46 -40
  13. ORForise/Tools/GeneMark_HA/GeneMark_HA.py +29 -22
  14. ORForise/Tools/GeneMark_HMM/GeneMark_HMM.py +29 -22
  15. ORForise/Tools/GeneMark_S/GeneMark_S.py +29 -22
  16. ORForise/Tools/GeneMark_S_2/GeneMark_S_2.py +29 -25
  17. ORForise/Tools/MetaGene/MetaGene.py +29 -22
  18. ORForise/Tools/MetaGeneAnnotator/MetaGeneAnnotator.py +30 -23
  19. ORForise/Tools/MetaGeneMark/MetaGeneMark.py +30 -23
  20. ORForise/Tools/Prodigal/Prodigal.py +30 -26
  21. ORForise/Tools/Prokka/Prokka.py +30 -25
  22. ORForise/Tools/StORF_Reporter/StORF_Reporter.py +33 -26
  23. ORForise/Tools/TransDecoder/TransDecoder.py +29 -22
  24. ORForise/utils.py +204 -2
  25. {orforise-1.4.3.dist-info → orforise-1.5.0.dist-info}/METADATA +5 -5
  26. {orforise-1.4.3.dist-info → orforise-1.5.0.dist-info}/RECORD +30 -30
  27. {orforise-1.4.3.dist-info → orforise-1.5.0.dist-info}/entry_points.txt +5 -0
  28. {orforise-1.4.3.dist-info → orforise-1.5.0.dist-info}/WHEEL +0 -0
  29. {orforise-1.4.3.dist-info → orforise-1.5.0.dist-info}/licenses/LICENSE +0 -0
  30. {orforise-1.4.3.dist-info → orforise-1.5.0.dist-info}/top_level.txt +0 -0
ORForise/Comparator.py CHANGED
@@ -6,47 +6,62 @@ except ImportError:
6
6
 
7
7
 
8
8
  class comparator: # Class to hold global-type variables
9
- def __init__(self, perfect_Starts=0, perfect_Stops=0, genome_Seq='',
10
- genome_Seq_Rev='',
11
- genome_Size=0, correct_Frame_Number=0, extended_Start=0,
12
- extended_Stop=0, extended_CDS=0, perfect_Matches=collections.OrderedDict(),
13
- matched_ORFs=collections.OrderedDict(), multi_Matched_ORFs=collections.defaultdict(list),
14
- unmatched_ORFs=collections.OrderedDict(), genes_Detected=collections.OrderedDict(),
15
- genes_Undetected=collections.OrderedDict(),
16
- out_Of_Frame_ORFs=collections.OrderedDict(), start_Difference=[], stop_Difference=[],
17
- orf_Lengths=[], gene_Lengths=[], gene_Pos_Olap=[], gene_Neg_Olap=[], orf_Pos_Olap=[], orf_Neg_Olap=[],
18
- m_ORF_Pos_Olap=[], m_ORF_Neg_Olap=[], gene_GC=[],
19
- orf_GC=[], m_ORF_GC=[], gene_Short=[], orf_Short=[], m_ORF_Short=[], pos_Strand=0, neg_Strand=0,
20
- partial_Hits=collections.OrderedDict()):
21
- self.perfect_Starts, self.perfect_Stops, self.genome_Seq, self.genome_Seq_Rev, self.genome_Size, self.correct_Frame_Number, self.extended_Start, self.extended_Stop, self.extended_CDS, \
22
- self.perfect_Matches, self.matched_ORFs, self.multi_Matched_ORFs, self.unmatched_ORFs, self.genes_Detected, self.genes_Undetected, self.out_Of_Frame_ORFs, self.start_Difference, \
23
- self.stop_Difference, self.orf_Lengths, self.gene_Lengths, self.gene_Pos_Olap, \
24
- self.gene_Neg_Olap, self.orf_Pos_Olap, self.orf_Neg_Olap, self.m_ORF_Pos_Olap, self.m_ORF_Neg_Olap, self.gene_GC, self.orf_GC, self.m_ORF_GC, self.gene_Short, self.orf_Short, self.m_ORF_Short, self.pos_Strand, \
25
- self.neg_Strand, self.partial_Hits = perfect_Starts, perfect_Stops, genome_Seq, genome_Seq_Rev, \
26
- genome_Size, correct_Frame_Number, extended_Start, extended_Stop, extended_CDS, perfect_Matches, matched_ORFs, multi_Matched_ORFs, unmatched_ORFs, genes_Detected, genes_Undetected, out_Of_Frame_ORFs, start_Difference, stop_Difference, orf_Lengths, \
27
- gene_Lengths, gene_Pos_Olap, gene_Neg_Olap, orf_Pos_Olap, orf_Neg_Olap, m_ORF_Pos_Olap, m_ORF_Neg_Olap, gene_GC, orf_GC, m_ORF_GC, gene_Short, orf_Short, m_ORF_Short, pos_Strand, neg_Strand, partial_Hits
9
+
10
+ def __init__(self):
11
+ self.reset()
12
+
13
+ def reset(self):
14
+ self.perfect_Starts = 0
15
+ self.perfect_Stops = 0
16
+ self.genome_Seq = ''
17
+ self.genome_Seq_Rev = ''
18
+ self.genome_Size = 0
19
+ self.correct_Frame_Number = 0
20
+ self.extended_Start = 0
21
+ self.extended_Stop = 0
22
+ self.extended_CDS = 0
23
+
24
+ self.perfect_Matches = collections.OrderedDict()
25
+ self.matched_ORFs = collections.OrderedDict()
26
+ self.multi_Matched_ORFs = collections.defaultdict(list)
27
+ self.unmatched_ORFs = collections.OrderedDict()
28
+ self.genes_Detected = collections.OrderedDict()
29
+ self.genes_Undetected = collections.OrderedDict()
30
+ self.out_Of_Frame_ORFs = collections.OrderedDict()
31
+ self.partial_Hits = collections.OrderedDict()
32
+
33
+ self.start_Difference = []
34
+ self.stop_Difference = []
35
+ self.orf_Lengths = []
36
+ self.gene_Lengths = []
37
+
38
+ self.gene_Pos_Olap = []
39
+ self.gene_Neg_Olap = []
40
+ self.orf_Pos_Olap = []
41
+ self.orf_Neg_Olap = []
42
+ self.m_ORF_Pos_Olap = []
43
+ self.m_ORF_Neg_Olap = []
44
+
45
+ self.gene_GC = []
46
+ self.orf_GC = []
47
+ self.m_ORF_GC = []
48
+
49
+ self.gene_Short = []
50
+ self.orf_Short = []
51
+ self.m_ORF_Short = []
52
+
53
+ self.pos_Strand = 0
54
+ self.neg_Strand = 0
28
55
 
29
56
 
30
- comp = comparator()
31
57
 
58
+ comp = comparator()
32
59
 
33
- # Not needed
34
- # def keyshift(dictionary, key, diff):
35
- # if key in dictionary:
36
- # token = object()
37
- # keys = [token]*(diff*-1) + dictionary + [token]*diff
38
- # newkey = keys[keys.index(key)+diff]
39
- # if newkey is token:
40
- # print (None)
41
- # else:
42
- # to_return = dictionary[newkey].split(',')
43
- # to_return = to_return[0]+'_'+to_return[1]+'_'+to_return[2]
44
- # return to_return
45
- # else:
46
- # print ('Key not found')
47
60
 
48
61
  def is_double_range(range1, range2):
49
62
  return len(range1) >= 2 * len(range2)
63
+
64
+
50
65
  def nuc_Count(verbose, start, stop, strand): # Gets correct seq then returns GC
51
66
  if stop >= comp.genome_Size:
52
67
  if verbose == True:
@@ -91,53 +106,53 @@ def nuc_Count(verbose, start, stop, strand): # Gets correct seq then returns GC
91
106
  return gc_content
92
107
 
93
108
 
94
- def orf_Unmatched(o_Start, o_Stop, o_Strand):
109
+ def orf_Unmatched(o_Start, o_Stop, o_Strand, tools):
95
110
  if o_Strand == '-':
96
111
  r_Start = comp.genome_Size - o_Stop
97
112
  r_Stop = comp.genome_Size - o_Start
98
113
  Unmatched_ORF = str(o_Start) + ',' + str(o_Stop) + ',' + o_Strand + ',' + comp.genome_Seq_Rev[
99
114
  r_Start:r_Start + 3] + ',' + comp.genome_Seq_Rev[
100
- r_Stop - 2:r_Stop + 1]
115
+ r_Stop - 2:r_Stop + 1] + ',' + tools
101
116
  seq = (comp.genome_Seq_Rev[r_Start:r_Stop + 1])
102
117
  comp.unmatched_ORFs.update({Unmatched_ORF: seq})
103
118
  elif o_Strand == '+':
104
119
  Unmatched_ORF = str(o_Start) + ',' + str(o_Stop) + ',' + o_Strand + ',' + comp.genome_Seq[
105
120
  o_Start - 1:o_Start + 2] + ',' + comp.genome_Seq[
106
- o_Stop - 3:o_Stop]
121
+ o_Stop - 3:o_Stop] + ',' + tools
107
122
  seq = (comp.genome_Seq[o_Start - 1:o_Stop])
108
123
  comp.unmatched_ORFs.update({Unmatched_ORF: seq})
109
124
 
110
125
 
111
- def genes_Unmatched(g_Start, g_Stop, g_Strand):
126
+ def genes_Unmatched(g_Start, g_Stop, g_Strand, tools):
112
127
  if g_Strand == '-':
113
128
  r_Start = comp.genome_Size - g_Stop
114
129
  r_Stop = comp.genome_Size - g_Start
115
130
  missed_Gene = str(g_Start) + ',' + str(g_Stop) + ',' + g_Strand + ',' + comp.genome_Seq_Rev[
116
131
  r_Start:r_Start + 3] + ',' + comp.genome_Seq_Rev[
117
- r_Stop - 2:r_Stop + 1]
132
+ r_Stop - 2:r_Stop + 1] + ',' + tools
118
133
  genSeq = (comp.genome_Seq_Rev[r_Start:r_Stop + 1])
119
134
  comp.genes_Undetected.update({missed_Gene: genSeq})
120
135
  elif g_Strand == '+':
121
136
  missed_Gene = str(g_Start) + ',' + str(g_Stop) + ',' + g_Strand + ',' + comp.genome_Seq[
122
137
  g_Start - 1:g_Start + 2] + ',' + comp.genome_Seq[
123
- g_Stop - 3:g_Stop]
138
+ g_Stop - 3:g_Stop] + ',' + tools
124
139
  genSeq = (comp.genome_Seq[g_Start - 1:g_Stop])
125
140
  comp.genes_Undetected.update({missed_Gene: genSeq})
126
141
 
127
142
 
128
- def perfect_Matched_Genes(g_Start, g_Stop, g_Strand):
143
+ def perfect_Matched_Genes(g_Start, g_Stop, g_Strand,tools):
129
144
  if g_Strand == '-':
130
145
  r_Start = comp.genome_Size - g_Stop
131
146
  r_Stop = comp.genome_Size - g_Start
132
147
  perfect_Matched_Gene = str(g_Start) + ',' + str(g_Stop) + ',' + g_Strand + ',' + comp.genome_Seq_Rev[
133
148
  r_Start:r_Start + 3] + ',' + comp.genome_Seq_Rev[
134
- r_Stop - 2:r_Stop + 1]
149
+ r_Stop - 2:r_Stop + 1] + ',' + tools
135
150
  genSeq = (comp.genome_Seq_Rev[r_Start:r_Stop + 1])
136
151
  comp.perfect_Matches.update({perfect_Matched_Gene: genSeq})
137
152
  elif g_Strand == '+':
138
153
  perfect_Matched_Gene = str(g_Start) + ',' + str(g_Stop) + ',' + g_Strand + ',' + comp.genome_Seq[
139
154
  g_Start - 1:g_Start + 2] + ',' + comp.genome_Seq[
140
- g_Stop - 3:g_Stop]
155
+ g_Stop - 3:g_Stop] + ',' + tools
141
156
  genSeq = (comp.genome_Seq[g_Start - 1:g_Stop])
142
157
  comp.perfect_Matches.update({perfect_Matched_Gene: genSeq})
143
158
 
@@ -250,7 +265,7 @@ def candidate_ORF_Selection(gene_Set,
250
265
  return pos, orf_Details
251
266
 
252
267
 
253
- def partial_Hit_Calc(g_Start, g_Stop, g_Strand, o_Start, o_Stop):
268
+ def partial_Hit_Calc(g_Start, g_Stop, g_Strand, o_Start, o_Stop, tools):
254
269
  if g_Strand == '-':
255
270
  r_G_Start = comp.genome_Size - g_Stop
256
271
  r_G_Stop = comp.genome_Size - g_Start
@@ -261,7 +276,7 @@ def partial_Hit_Calc(g_Start, g_Stop, g_Strand, o_Start, o_Stop):
261
276
  r_G_Stop - 2:r_G_Stop + 1] + ';Predicted_CDS:' + str(
262
277
  o_Start) + '_' + str(o_Stop) + '_' + g_Strand + '_' + comp.genome_Seq_Rev[
263
278
  r_O_Start:r_O_Start + 3] + '_' + comp.genome_Seq_Rev[
264
- r_O_Stop - 2:r_O_Stop + 1]
279
+ r_O_Stop - 2:r_O_Stop + 1] + ';' + tools
265
280
  genSeq = (comp.genome_Seq_Rev[r_G_Start:r_G_Stop + 1])
266
281
  orfSeq = (comp.genome_Seq_Rev[r_O_Start:r_O_Stop + 1])
267
282
  comp.partial_Hits.update({partial: [genSeq, orfSeq]})
@@ -271,531 +286,564 @@ def partial_Hit_Calc(g_Start, g_Stop, g_Strand, o_Start, o_Stop):
271
286
  g_Stop - 3:g_Stop] + ';Predicted_CDS:' + str(
272
287
  o_Start) + '_' + str(o_Stop) + '_' + g_Strand + '_' + comp.genome_Seq[
273
288
  o_Start - 1:o_Start + 2] + '_' + comp.genome_Seq[
274
- o_Stop - 3:o_Stop]
289
+ o_Stop - 3:o_Stop] + ';' + tools
275
290
  genSeq = (comp.genome_Seq[g_Start - 1:g_Stop])
276
291
  orfSeq = (comp.genome_Seq[o_Start - 1:o_Stop])
277
292
  comp.partial_Hits.update({partial: [genSeq, orfSeq]})
278
293
 
279
294
 
280
- def tool_comparison(ref_genes, orfs, genome, verbose):
281
- comp.genome_Seq = genome
282
- comp.genome_Seq_Rev = revCompIterative(genome)
283
- comp.genome_Size = len(genome)
284
-
285
- better_pos_orfs_items = [[(int(pos.split(',')[0]), int(pos.split(',')[1])), orf_Details] for pos, orf_Details in orfs.items()] #TODO: turn pos into tuple instead of string everywhere
286
-
287
- for gene_num, gene_details in ref_genes.items(): # Loop through each gene to compare against predicted ORFs
288
- g_Start = int(gene_details[0])
289
- g_Stop = int(gene_details[1])
290
- g_Strand = gene_details[2]
291
- g_pos = str(g_Start) + ',' + str(g_Stop)
292
- gene_Set = set(range(g_Start,
293
- g_Stop + 1)) # Used to check Overlap of ORFs and pick best match - slow but confirms best match
294
- overlapping_ORFs = collections.OrderedDict()
295
- perfect_Match = False
296
- out_Frame = False
297
- for pos, orf_Details in better_pos_orfs_items: # Check if perfect match, if not check if match covers at least 75% of gene - Loop through ALL ORFs - SLOW
298
- o_Start,o_Stop = pos
299
- o_Strand = orf_Details[0]
300
- #orf_Set = set(range(o_Start, o_Stop + 1)) Removed for optimisation
301
- if o_Stop <= g_Start or o_Start >= g_Stop: # Not caught up yet
302
- continue
303
- elif o_Start == g_Start and o_Stop == g_Stop: # If perfect match, break and skip the rest of the ORFs
304
- perfect_Match = True
305
- break
306
- elif is_double_range(range(o_Start, o_Stop), range(g_Start,g_Stop)): # If ORF is double or more than the length of the gene, we do not count as found.
307
- continue
308
- elif g_Start <= o_Start < g_Stop or g_Start < o_Stop < g_Stop: # If ORF Start or Stop is between gene Start or Stop
309
- #overlap = len(gene_Set.intersection(orf_Set)) # Replaced for optimisation
310
- overlap = max(min(o_Stop, g_Stop) - max(o_Start, g_Start), -1) + 1
311
- coverage = 100 * float(overlap) / float(len(gene_Set))
312
- orf_Details.append(coverage)
313
- if abs(o_Stop - g_Stop) % 3 == 0 and o_Strand == g_Strand and coverage >= MIN_COVERAGE: # Only continue if ORF covers at least 75% of the gene and is in frame
314
- overlapping_ORFs.update({f'{o_Start},{o_Stop}': orf_Details})
315
- elif coverage >= MIN_COVERAGE: # Not in frame / on same strand
316
- comp.out_Of_Frame_ORFs.update({f'{o_Start},{o_Stop}': orf_Details})
317
- out_Frame = True
318
- elif o_Start <= g_Start and o_Stop >= g_Stop: # If ORF extends one or both ends of the gene
319
- #overlap = len(gene_Set.intersection(orf_Set)) # Replaced for optimisation
320
- overlap = max(min(o_Stop, g_Stop) - max(o_Start, g_Start), -1) + 1
321
- coverage = 100 * float(overlap) / float(len(gene_Set))
322
- orf_Details.append(coverage)
323
- if abs(o_Stop - g_Stop) % 3 == 0 and o_Strand == g_Strand and coverage >= MIN_COVERAGE: # Only continue if ORF covers at least 75% of the gene and is in frame
324
- overlapping_ORFs.update({f'{o_Start},{o_Stop}': orf_Details})
325
- elif coverage >= MIN_COVERAGE:
326
- comp.out_Of_Frame_ORFs.update({f'{o_Start},{o_Stop}': orf_Details})
327
- out_Frame = True
328
- else:
329
- if verbose == True:
330
- print("Unexpected Error Finding Predicted CDSs") # Should not happen
331
- # Now Check that we select the best ORF
332
- ### Multi_Match_ORFs Should contain All genes found by a specific ORF
333
- if perfect_Match == True: # Check if the ORF is a perfect match to the Gene
334
- m_ORF_Details = orf_Details[:]
335
- m_ORF_Details.append(g_pos)
336
- if g_pos in comp.matched_ORFs.keys():
337
- previously_Covered_Gene = comp.matched_ORFs[g_pos][-1]
338
- comp.multi_Matched_ORFs[g_pos] += [g_pos.replace(',', '-'), previously_Covered_Gene.replace(',',
339
- '-')] # ORF is same as gene so can use g_pos
340
- comp.matched_ORFs.update({g_pos: m_ORF_Details})
341
- comp.genes_Detected.update({str(gene_details): g_pos})
342
- match_Statistics(o_Start, o_Stop, g_Start, g_Stop, g_Strand)
343
- perfect_Matched_Genes(g_Start, g_Stop, g_Strand)
344
- #if verbose == True:
345
- # print('Perfect Match')
346
- elif perfect_Match == False and len(
347
- overlapping_ORFs) == 1: # If we do not have a perfect match but 1 ORF which has passed the filtering
348
- orf_Pos = list(overlapping_ORFs.keys())[0]
349
- o_Start = int(orf_Pos.split(',')[0])
350
- o_Stop = int(orf_Pos.split(',')[1])
351
- orf_Details = overlapping_ORFs[orf_Pos]
352
- m_ORF_Details = orf_Details[:]
353
- m_ORF_Details.append(g_pos)
354
- if orf_Pos in comp.matched_ORFs.keys():
355
- try:
356
- previously_Covered_Gene = comp.matched_ORFs[g_pos][-1]
357
- except KeyError:
358
- last_key = [*comp.matched_ORFs.keys()][-1]
359
- previously_Covered_Gene = comp.matched_ORFs[last_key][-1]
360
- comp.multi_Matched_ORFs[orf_Pos] += [g_pos.replace(',', '-'), previously_Covered_Gene.replace(',',
361
- '-')] # ORF collects multiple gene pos'
362
- comp.matched_ORFs.update({orf_Pos: m_ORF_Details})
363
- comp.genes_Detected.update({str(gene_details): orf_Pos})
364
- match_Statistics(o_Start, o_Stop, g_Start, g_Stop, g_Strand)
365
- #if verbose == True:
366
- # print('Partial Match')
367
- partial_Hit_Calc(g_Start, g_Stop, g_Strand, o_Start, o_Stop)
368
- elif perfect_Match == False and len(
369
- overlapping_ORFs) >= 1: # If we have more than 1 potential ORF match, we check to see which is the 'best' hit
370
- orf_Pos, orf_Details = candidate_ORF_Selection(gene_Set, overlapping_ORFs) # Return best match
371
- o_Start = int(orf_Pos.split(',')[0])
372
- o_Stop = int(orf_Pos.split(',')[1])
373
- m_ORF_Details = orf_Details[:]
374
- m_ORF_Details.append(g_pos)
375
- if orf_Pos in comp.matched_ORFs.keys():
376
- try:
295
+ def tool_comparison(all_orfs, dna_regions, verbose):
296
+ results = collections.OrderedDict() # Store results for each DNA region
297
+ for dna_region in dna_regions: # Loop through each DNA region
298
+ # reset comparator class variables
299
+ comp.reset()
300
+
301
+ ref_genes_list = dna_regions[dna_region][2]
302
+ ref_genes = collections.OrderedDict()
303
+ for d in ref_genes_list:
304
+ ref_genes.update(d)
305
+ comp.genome_Seq = dna_regions[dna_region][0]
306
+ comp.genome_Seq_Rev = revCompIterative(dna_regions[dna_region][0])
307
+ comp.genome_Size = len(dna_regions[dna_region][0])
308
+
309
+ current_orfs = all_orfs[dna_region]
310
+ # sort the ORFs by start position
311
+
312
+ better_pos_orfs_items = [[(int(pos.split(',')[0]), int(pos.split(',')[1])), orf_Details] for pos, orf_Details in current_orfs.items()] #TODO: turn pos into tuple instead of string everywhere
313
+
314
+ for gene_num, gene_details in ref_genes.items(): # Loop through each gene to compare against predicted ORFs
315
+ g_Start = int(gene_details[0])
316
+ g_Stop = int(gene_details[1])
317
+ g_Strand = gene_details[2]
318
+ g_pos = str(g_Start) + ',' + str(g_Stop)
319
+ gene_Set = set(range(g_Start,
320
+ g_Stop + 1)) # Used to check Overlap of ORFs and pick best match - slow but confirms best match
321
+ overlapping_ORFs = collections.OrderedDict()
322
+ perfect_Match = False
323
+ out_Frame = False
324
+ for pos, orf_Details in better_pos_orfs_items: # Check if perfect match, if not check if match covers at least 75% of gene - Loop through ALL ORFs - SLOW
325
+ o_Start,o_Stop = pos
326
+ o_Strand = orf_Details[0]
327
+ #orf_Set = set(range(o_Start, o_Stop + 1)) Removed for optimisation
328
+ if o_Stop <= g_Start or o_Start >= g_Stop: # Not caught up yet
329
+ continue
330
+ elif o_Start == g_Start and o_Stop == g_Stop: # If perfect match, break and skip the rest of the ORFs
331
+ perfect_Match = True
332
+ break
333
+ elif is_double_range(range(o_Start, o_Stop), range(g_Start,g_Stop)): # If ORF is double or more than the length of the gene, we do not count as found.
334
+ continue
335
+ elif g_Start <= o_Start < g_Stop or g_Start < o_Stop < g_Stop: # If ORF Start or Stop is between gene Start or Stop
336
+ #overlap = len(gene_Set.intersection(orf_Set)) # Replaced for optimisation
337
+ overlap = max(min(o_Stop, g_Stop) - max(o_Start, g_Start), -1) + 1
338
+ coverage = 100 * float(overlap) / float(len(gene_Set))
339
+ orf_Details.append(coverage)
340
+ if abs(o_Stop - g_Stop) % 3 == 0 and o_Strand == g_Strand and coverage >= MIN_COVERAGE: # Only continue if ORF covers at least 75% of the gene and is in frame
341
+ overlapping_ORFs.update({f'{o_Start},{o_Stop}': orf_Details})
342
+ elif coverage >= MIN_COVERAGE: # Not in frame / on same strand
343
+ comp.out_Of_Frame_ORFs.update({f'{o_Start},{o_Stop}': orf_Details})
344
+ out_Frame = True
345
+ elif o_Start <= g_Start and o_Stop >= g_Stop: # If ORF extends one or both ends of the gene
346
+ #overlap = len(gene_Set.intersection(orf_Set)) # Replaced for optimisation
347
+ overlap = max(min(o_Stop, g_Stop) - max(o_Start, g_Start), -1) + 1
348
+ coverage = 100 * float(overlap) / float(len(gene_Set))
349
+ orf_Details.append(coverage)
350
+ if abs(o_Stop - g_Stop) % 3 == 0 and o_Strand == g_Strand and coverage >= MIN_COVERAGE: # Only continue if ORF covers at least 75% of the gene and is in frame
351
+ overlapping_ORFs.update({f'{o_Start},{o_Stop}': orf_Details})
352
+ elif coverage >= MIN_COVERAGE:
353
+ comp.out_Of_Frame_ORFs.update({f'{o_Start},{o_Stop}': orf_Details})
354
+ out_Frame = True
355
+ else:
356
+ if verbose == True:
357
+ print("Unexpected Error Finding Predicted CDSs") # Should not happen
358
+ # Now Check that we select the best ORF
359
+ ### Multi_Match_ORFs Should contain All genes found by a specific ORF
360
+ if perfect_Match == True: # Check if the ORF is a perfect match to the Gene
361
+ m_ORF_Details = orf_Details[:]
362
+ m_ORF_Details.append(g_pos)
363
+ if g_pos in comp.matched_ORFs.keys():
377
364
  previously_Covered_Gene = comp.matched_ORFs[g_pos][-1]
378
- except KeyError:
379
- last_key = [*comp.matched_ORFs.keys()][-1]
380
- previously_Covered_Gene = comp.matched_ORFs[last_key][-1]
381
- comp.multi_Matched_ORFs[orf_Pos] += [g_pos.replace(',', '-'), previously_Covered_Gene.replace(',',
382
- '-')] # ORF collects multiple gene pos'
383
- comp.matched_ORFs.update({orf_Pos: m_ORF_Details})
384
- comp.genes_Detected.update({str(gene_details): orf_Pos})
385
- match_Statistics(o_Start, o_Stop, g_Start, g_Stop, g_Strand)
386
- if verbose == True:
387
- print('There was more than 1 potential Match - Best Chosen')
388
- partial_Hit_Calc(g_Start, g_Stop, g_Strand, o_Start, o_Stop)
389
- elif out_Frame: # Keep record of ORFs which overlap a gene but in the wrong frame
390
- if verbose == True:
391
- print("Out of Frame Predicted CDS")
392
- genes_Unmatched(g_Start, g_Stop, g_Strand) #
365
+ comp.multi_Matched_ORFs[g_pos] += [g_pos.replace(',', '-'), previously_Covered_Gene.replace(',',
366
+ '-'), orf_Details[4]] # ORF is same as gene so can use g_pos
367
+ comp.matched_ORFs.update({g_pos: m_ORF_Details})
368
+ comp.genes_Detected.update({str(gene_details): g_pos})
369
+ match_Statistics(o_Start, o_Stop, g_Start, g_Stop, g_Strand)
370
+ perfect_Matched_Genes(g_Start, g_Stop, g_Strand,orf_Details[4])
371
+ #if verbose == True:
372
+ # print('Perfect Match')
373
+ elif perfect_Match == False and len(
374
+ overlapping_ORFs) == 1: # If we do not have a perfect match but 1 ORF which has passed the filtering
375
+ orf_Pos = list(overlapping_ORFs.keys())[0]
376
+ o_Start = int(orf_Pos.split(',')[0])
377
+ o_Stop = int(orf_Pos.split(',')[1])
378
+ orf_Details = overlapping_ORFs[orf_Pos]
379
+ m_ORF_Details = orf_Details[:]
380
+ m_ORF_Details.append(g_pos)
381
+ if orf_Pos in comp.matched_ORFs.keys():
382
+ try:
383
+ previously_Covered_Gene = comp.matched_ORFs[g_pos][-1]
384
+ except KeyError:
385
+ last_key = [*comp.matched_ORFs.keys()][-1]
386
+ previously_Covered_Gene = comp.matched_ORFs[last_key][-1]
387
+ comp.multi_Matched_ORFs[orf_Pos] += [g_pos.replace(',', '-'), previously_Covered_Gene.replace(',',
388
+ '-'), orf_Details[4]] # ORF collects multiple gene pos'
389
+ comp.matched_ORFs.update({orf_Pos: m_ORF_Details})
390
+ comp.genes_Detected.update({str(gene_details): orf_Pos})
391
+ match_Statistics(o_Start, o_Stop, g_Start, g_Stop, g_Strand)
392
+ #if verbose == True:
393
+ # print('Partial Match')
394
+ partial_Hit_Calc(g_Start, g_Stop, g_Strand, o_Start, o_Stop, orf_Details[4])
395
+ elif perfect_Match == False and len(
396
+ overlapping_ORFs) >= 1: # If we have more than 1 potential ORF match, we check to see which is the 'best' hit
397
+ orf_Pos, orf_Details = candidate_ORF_Selection(gene_Set, overlapping_ORFs) # Return best match
398
+ o_Start = int(orf_Pos.split(',')[0])
399
+ o_Stop = int(orf_Pos.split(',')[1])
400
+ m_ORF_Details = orf_Details[:]
401
+ m_ORF_Details.append(g_pos)
402
+ if orf_Pos in comp.matched_ORFs.keys():
403
+ try:
404
+ previously_Covered_Gene = comp.matched_ORFs[g_pos][-1]
405
+ except KeyError:
406
+ last_key = [*comp.matched_ORFs.keys()][-1]
407
+ previously_Covered_Gene = comp.matched_ORFs[last_key][-1]
408
+ comp.multi_Matched_ORFs[orf_Pos] += [g_pos.replace(',', '-'), previously_Covered_Gene.replace(',',
409
+ '-'), orf_Details[4]] # ORF collects multiple gene pos'
410
+ comp.matched_ORFs.update({orf_Pos: m_ORF_Details})
411
+ comp.genes_Detected.update({str(gene_details): orf_Pos})
412
+ match_Statistics(o_Start, o_Stop, g_Start, g_Stop, g_Strand)
413
+ if verbose == True:
414
+ print('There was more than 1 potential Match - Best Chosen')
415
+ partial_Hit_Calc(g_Start, g_Stop, g_Strand, o_Start, o_Stop, orf_Details[4])
416
+ elif out_Frame: # Keep record of ORFs which overlap a gene but in the wrong frame
417
+ if verbose == True:
418
+ print("Out of Frame Predicted CDS")
419
+ genes_Unmatched(g_Start, g_Stop, g_Strand, orf_Details[4]) #
420
+ else:
421
+ genes_Unmatched(g_Start, g_Stop, g_Strand, orf_Details[4]) # No hit
422
+ #if verbose == True:
423
+ # print("No Hit")
424
+ for orf_Key in comp.matched_ORFs: # Remove ORFs from out of frame if ORF was correctly matched to another Gene
425
+ if orf_Key in comp.out_Of_Frame_ORFs:
426
+ del comp.out_Of_Frame_ORFs[orf_Key]
427
+ ######################################## ORF Lengths and Precision
428
+ start_Difference = [x for x in comp.start_Difference if x != 0] # Remove 0s (Perfect hits)
429
+ stop_Difference = [x for x in comp.stop_Difference if x != 0]
430
+ if len(start_Difference) >= 1:
431
+ median_Start_Difference = np.median(start_Difference)
393
432
  else:
394
- genes_Unmatched(g_Start, g_Stop, g_Strand) # No hit
395
- #if verbose == True:
396
- # print("No Hit")
397
- for orf_Key in comp.matched_ORFs: # Remove ORFs from out of frame if ORF was correctly matched to another Gene
398
- if orf_Key in comp.out_Of_Frame_ORFs:
399
- del comp.out_Of_Frame_ORFs[orf_Key]
400
- ######################################## ORF Lengths and Precision
401
- start_Difference = [x for x in comp.start_Difference if x != 0] # Remove 0s (Perfect hits)
402
- stop_Difference = [x for x in comp.stop_Difference if x != 0]
403
- if len(start_Difference) >= 1:
404
- median_Start_Difference = np.median(start_Difference)
405
- else:
406
- median_Start_Difference = 'N/A'
407
- if len(stop_Difference) >= 1:
408
- median_Stop_Difference = np.median(stop_Difference)
409
- else:
410
- median_Stop_Difference = 'N/A'
411
-
412
- # Get Start and Stop Codon Usage
413
- atg_P, gtg_P, ttg_P, att_P, ctg_P, other_Start_P, other_Starts = start_Codon_Count(orfs)
414
- tag_P, taa_P, tga_P, other_Stop_P, other_Stops = stop_Codon_Count(orfs)
415
- # Count nucleotides found from ALL ORFs
416
- gene_Nuc_Array = np.zeros((comp.genome_Size), dtype=np.bool)
417
- orf_Nuc_Array = np.zeros((comp.genome_Size), dtype=np.bool)
418
- matched_ORF_Nuc_Array = np.zeros((comp.genome_Size), dtype=np.bool)
419
-
420
- prev_Gene_Stop = 0
421
- prev_Gene_Overlapped = False
422
- for gene_Num, gene_details in ref_genes.items(): # Loop through each gene to compare against predicted ORFs
423
- g_Start = int(gene_details[0])
424
- g_Stop = int(gene_details[1])
425
- g_Strand = gene_details[2]
426
- gene_Length = (g_Stop - g_Start) +1
427
- if gene_Length == 0: print(g_Start, g_Stop, "!!!!!!!!!!!!!!!!!!!!!!!!")
428
- comp.gene_Lengths.append(gene_Length)
429
- gene_Nuc_Array[g_Start - 1:g_Stop] = True # Changing all between the two positions to 1's
430
- comp.gene_GC.append(nuc_Count(verbose, g_Start, g_Stop, g_Strand))
431
- if gene_Length <= SHORT_ORF_LENGTH: # .utils
432
- comp.gene_Short.append(gene_Length)
433
- ### Calculate overlapping Genes -
434
- if prev_Gene_Stop > g_Start:
435
- if '+' in g_Strand:
436
- comp.gene_Pos_Olap.append(prev_Gene_Stop - g_Start)
437
- elif '-' in g_Strand:
438
- comp.gene_Neg_Olap.append(prev_Gene_Stop - g_Start)
439
- prev_Gene_Overlapped = True
440
- elif prev_Gene_Stop < g_Start:
441
- if prev_Gene_Overlapped == True:
433
+ median_Start_Difference = 'N/A'
434
+ if len(stop_Difference) >= 1:
435
+ median_Stop_Difference = np.median(stop_Difference)
436
+ else:
437
+ median_Stop_Difference = 'N/A'
438
+
439
+ # Get Start and Stop Codon Usage
440
+ atg_P, gtg_P, ttg_P, att_P, ctg_P, other_Start_P, other_Starts = start_Codon_Count(current_orfs)
441
+ tag_P, taa_P, tga_P, other_Stop_P, other_Stops = stop_Codon_Count(current_orfs)
442
+ # Count nucleotides found from ALL ORFs
443
+ gene_Nuc_Array = np.zeros((comp.genome_Size), dtype=bool)
444
+ orf_Nuc_Array = np.zeros((comp.genome_Size), dtype=bool)
445
+ matched_ORF_Nuc_Array = np.zeros((comp.genome_Size), dtype=bool)
446
+
447
+ prev_Gene_Stop = 0
448
+ prev_Gene_Overlapped = False
449
+ for gene_Num, gene_details in ref_genes.items(): # Loop through each gene to compare against predicted ORFs
450
+ g_Start = int(gene_details[0])
451
+ g_Stop = int(gene_details[1])
452
+ g_Strand = gene_details[2]
453
+ gene_Length = (g_Stop - g_Start) +1
454
+ if gene_Length == 0: print(g_Start, g_Stop, "!!!!!!!!!!!!!!!!!!!!!!!!")
455
+ comp.gene_Lengths.append(gene_Length)
456
+ gene_Nuc_Array[g_Start - 1:g_Stop] = True # Changing all between the two positions to 1's
457
+ comp.gene_GC.append(nuc_Count(verbose, g_Start, g_Stop, g_Strand))
458
+ if gene_Length <= SHORT_ORF_LENGTH: # .utils
459
+ comp.gene_Short.append(gene_Length)
460
+ ### Calculate overlapping Genes -
461
+ if prev_Gene_Stop > g_Start:
442
462
  if '+' in g_Strand:
443
- comp.gene_Pos_Olap.append(0)
463
+ comp.gene_Pos_Olap.append(prev_Gene_Stop - g_Start)
444
464
  elif '-' in g_Strand:
445
- comp.gene_Neg_Olap.append(0)
446
- prev_Gene_Overlapped = False
447
- prev_Gene_Stop = g_Stop
448
- if prev_Gene_Overlapped == True: # If last has a prev overlap, count it
449
- if '+' in g_Strand:
450
- comp.gene_Pos_Olap.append(0)
451
- elif '-' in g_Strand:
452
- comp.gene_Neg_Olap.append(0)
453
- ####
454
- min_Gene_Length = min(comp.gene_Lengths)
455
- max_Gene_Length = max(comp.gene_Lengths)
456
- median_Gene_Length = np.median(comp.gene_Lengths)
457
- prev_ORF_Stop = 0
458
- prev_ORF_Overlapped = False
459
- for o_Positions, orf_Details in orfs.items():
460
- o_Start = int(o_Positions.split(',')[0])
461
- o_Stop = int(o_Positions.split(',')[1])
462
- o_Strand = orf_Details[0]
463
- # Stats just for Unmatched ORFs
464
- if o_Positions not in list(comp.matched_ORFs.keys()):
465
- orf_Unmatched(o_Start, o_Stop, o_Strand)
466
- # Get ORF Strand metrics:
467
- if o_Strand == "+": # Get number of Positive and Negative strand ORFs
468
- comp.pos_Strand += 1
469
- elif o_Strand == "-":
470
- comp.neg_Strand += 1
471
- orf_Length = (o_Stop - o_Start) +1
472
- comp.orf_Lengths.append(orf_Length)
473
- orf_Nuc_Array[o_Start - 1:o_Stop] = True # Changing all between the two positions to 1's
474
- comp.orf_GC.append(nuc_Count(verbose, o_Start, o_Stop, o_Strand))
475
- if orf_Length <= SHORT_ORF_LENGTH: # .utils
476
- comp.orf_Short.append(orf_Length)
477
- ### Calculate overlapping ORFs -
478
- if prev_ORF_Stop > o_Start:
479
- if '+' in o_Strand:
480
- comp.orf_Pos_Olap.append(prev_ORF_Stop - o_Start)
481
- elif '-' in o_Strand:
482
- comp.orf_Neg_Olap.append(prev_ORF_Stop - o_Start)
483
- prev_ORF_Overlapped = True
484
- elif prev_ORF_Stop < o_Start:
485
- if prev_ORF_Overlapped == True:
465
+ comp.gene_Neg_Olap.append(prev_Gene_Stop - g_Start)
466
+ prev_Gene_Overlapped = True
467
+ elif prev_Gene_Stop < g_Start:
468
+ if prev_Gene_Overlapped == True:
469
+ if '+' in g_Strand:
470
+ comp.gene_Pos_Olap.append(0)
471
+ elif '-' in g_Strand:
472
+ comp.gene_Neg_Olap.append(0)
473
+ prev_Gene_Overlapped = False
474
+ prev_Gene_Stop = g_Stop
475
+ if prev_Gene_Overlapped == True: # If last has a prev overlap, count it
476
+ if '+' in g_Strand:
477
+ comp.gene_Pos_Olap.append(0)
478
+ elif '-' in g_Strand:
479
+ comp.gene_Neg_Olap.append(0)
480
+ ####
481
+ min_Gene_Length = min(comp.gene_Lengths)
482
+ max_Gene_Length = max(comp.gene_Lengths)
483
+ median_Gene_Length = np.median(comp.gene_Lengths)
484
+ prev_ORF_Stop = 0
485
+ prev_ORF_Overlapped = False
486
+ for o_Positions, orf_Details in current_orfs.items():
487
+ o_Start = int(o_Positions.split(',')[0])
488
+ o_Stop = int(o_Positions.split(',')[1])
489
+ o_Strand = orf_Details[0]
490
+ # Stats just for Unmatched ORFs
491
+ if o_Positions not in list(comp.matched_ORFs.keys()):
492
+ orf_Unmatched(o_Start, o_Stop, o_Strand, orf_Details[4])
493
+ # Get ORF Strand metrics:
494
+ if o_Strand == "+": # Get number of Positive and Negative strand ORFs
495
+ comp.pos_Strand += 1
496
+ elif o_Strand == "-":
497
+ comp.neg_Strand += 1
498
+ orf_Length = (o_Stop - o_Start) +1
499
+ comp.orf_Lengths.append(orf_Length)
500
+ orf_Nuc_Array[o_Start - 1:o_Stop] = True # Changing all between the two positions to 1's
501
+ comp.orf_GC.append(nuc_Count(verbose, o_Start, o_Stop, o_Strand))
502
+ if orf_Length <= SHORT_ORF_LENGTH: # .utils
503
+ comp.orf_Short.append(orf_Length)
504
+ ### Calculate overlapping ORFs -
505
+ if prev_ORF_Stop > o_Start:
506
+ if '+' in o_Strand:
507
+ comp.orf_Pos_Olap.append(prev_ORF_Stop - o_Start)
508
+ elif '-' in o_Strand:
509
+ comp.orf_Neg_Olap.append(prev_ORF_Stop - o_Start)
510
+ prev_ORF_Overlapped = True
511
+ elif prev_ORF_Stop < o_Start:
512
+ if prev_ORF_Overlapped == True:
513
+ if '+' in o_Strand:
514
+ comp.orf_Pos_Olap.append(0)
515
+ elif '-' in o_Strand:
516
+ comp.orf_Neg_Olap.append(0)
517
+ prev_ORF_Overlapped = False
518
+ prev_ORF_Stop = o_Stop
519
+ if prev_ORF_Overlapped == True: # If last has a prev overlap, count it
486
520
  if '+' in o_Strand:
487
521
  comp.orf_Pos_Olap.append(0)
488
522
  elif '-' in o_Strand:
489
523
  comp.orf_Neg_Olap.append(0)
490
- prev_ORF_Overlapped = False
491
- prev_ORF_Stop = o_Stop
492
- if prev_ORF_Overlapped == True: # If last has a prev overlap, count it
493
- if '+' in o_Strand:
494
- comp.orf_Pos_Olap.append(0)
495
- elif '-' in o_Strand:
496
- comp.orf_Neg_Olap.append(0)
497
-
498
- # Nucleotide Coverage calculated from ORFs matching a gene only
499
- matched_Prev_ORF_Stop = 0
500
- matched_Prev_ORF_Overlapped = False
501
- for mo_Positions, m_ORF_Details in comp.matched_ORFs.items():
502
- mo_Start = int(mo_Positions.split(',')[0])
503
- mo_Stop = int(mo_Positions.split(',')[1])
504
- mo_Strand = m_ORF_Details[0]
505
- mo_Length = (mo_Stop - mo_Start)
506
- matched_ORF_Nuc_Array[mo_Start - 1:mo_Stop] = True # This is the complete matched orf not the matched orf bits
507
-
508
- comp.m_ORF_GC.append(nuc_Count(verbose, mo_Start, mo_Stop, mo_Strand))
509
- if mo_Length <= SHORT_ORF_LENGTH: # .utils
510
- comp.m_ORF_Short.append(mo_Length)
511
- ### Calculate overlapping Matched ORFs -
512
- if matched_Prev_ORF_Stop > mo_Start:
513
- if '+' in mo_Strand:
514
- comp.m_ORF_Pos_Olap.append(matched_Prev_ORF_Stop - mo_Start)
515
- elif '-' in mo_Strand:
516
- comp.m_ORF_Neg_Olap.append(matched_Prev_ORF_Stop - mo_Start)
517
- matched_Prev_ORF_Overlapped = True
518
- elif matched_Prev_ORF_Stop < mo_Start:
519
- if matched_Prev_ORF_Overlapped == True:
524
+
525
+ # Nucleotide Coverage calculated from ORFs matching a gene only
526
+ matched_Prev_ORF_Stop = 0
527
+ matched_Prev_ORF_Overlapped = False
528
+ for mo_Positions, m_ORF_Details in comp.matched_ORFs.items():
529
+ mo_Start = int(mo_Positions.split(',')[0])
530
+ mo_Stop = int(mo_Positions.split(',')[1])
531
+ mo_Strand = m_ORF_Details[0]
532
+ mo_Length = (mo_Stop - mo_Start)
533
+ matched_ORF_Nuc_Array[mo_Start - 1:mo_Stop] = True # This is the complete matched orf not the matched orf bits
534
+
535
+ comp.m_ORF_GC.append(nuc_Count(verbose, mo_Start, mo_Stop, mo_Strand))
536
+ if mo_Length <= SHORT_ORF_LENGTH: # .utils
537
+ comp.m_ORF_Short.append(mo_Length)
538
+ ### Calculate overlapping Matched ORFs -
539
+ if matched_Prev_ORF_Stop > mo_Start:
520
540
  if '+' in mo_Strand:
521
- comp.m_ORF_Pos_Olap.append(0)
541
+ comp.m_ORF_Pos_Olap.append(matched_Prev_ORF_Stop - mo_Start)
522
542
  elif '-' in mo_Strand:
523
- comp.m_ORF_Neg_Olap.append(0)
524
- matched_Prev_ORF_Overlapped = False
525
- matched_Prev_ORF_Stop = mo_Stop
526
- if matched_Prev_ORF_Overlapped == True: # If last has a prev overlap, count it
527
- if '+' in mo_Strand:
528
- comp.m_ORF_Pos_Olap.append(0)
529
- elif '-' in mo_Strand:
530
- comp.m_ORF_Neg_Olap.append(0)
531
- ####
532
- gene_Coverage_Genome = format(100 * np.sum(gene_Nuc_Array) / comp.genome_Size, '.2f')
533
- orf_Coverage_Genome = format(100 * np.sum(orf_Nuc_Array) / comp.genome_Size, '.2f')
534
- matched_ORF_Coverage_Genome = format(100 * np.sum(matched_ORF_Nuc_Array) / comp.genome_Size,
535
- '.2f') # This gets the nts which are in matched ORFs - Check below
536
- # matched_ORF_Nuc_AND_Gene = np.logical_and(matched_ORF_Nuc_Array,gene_Nuc_Array) + [0 for i in range(len(gene_Nuc_Array))] # This gets the nts which are in both matched ORFs and detected genes
537
- # matched_ORF_Coverage_Genome = format(100 * np.count_nonzero(matched_ORF_Nuc_AND_Gene) / comp.genome_Size,'.2f')
538
-
539
- # gene and orf nucleotide Intersection
540
- gene_ORF_Nuc_Intersection = np.sum(gene_Nuc_Array & orf_Nuc_Array)
541
- # not gene but orf nucleotides
542
- not_Gene_Nuc_Array = np.logical_not(gene_Nuc_Array)
543
- not_Gene_Nuc_And_ORF_Count = np.sum(not_Gene_Nuc_Array & orf_Nuc_Array)
544
- # not orf nucleotides but gene
545
- not_ORF_Nuc_Array = np.logical_not(orf_Nuc_Array)
546
- not_ORF_Nuc_And_Gene_Count = np.sum(not_ORF_Nuc_Array & gene_Nuc_Array)
547
- # not gene or orf nucleotides
548
- not_Gene_Nuc_Not_ORF_Nuc_Count = np.sum(not_Gene_Nuc_Array & not_ORF_Nuc_Array)
549
- # Nucleotide 'accuracy' - Normalised by number of nucelotides annotated by a gene
550
- NT_TP = format(gene_ORF_Nuc_Intersection / np.sum(gene_Nuc_Array), '.2f')
551
- NT_FP = format(not_Gene_Nuc_And_ORF_Count / np.sum(not_Gene_Nuc_Array), '.2f')
552
- NT_FN = format(not_ORF_Nuc_And_Gene_Count / np.sum(gene_Nuc_Array), '.2f')
553
- NT_TN = format(not_Gene_Nuc_Not_ORF_Nuc_Count / np.sum(not_Gene_Nuc_Array), '.2f')
554
- NT_Precision = format(gene_ORF_Nuc_Intersection / (gene_ORF_Nuc_Intersection + not_Gene_Nuc_And_ORF_Count), '.2f')
555
- NT_Recall = format(gene_ORF_Nuc_Intersection / (gene_ORF_Nuc_Intersection + not_ORF_Nuc_And_Gene_Count), '.2f')
556
- NT_False_Discovery_Rate = format(
557
- not_Gene_Nuc_And_ORF_Count / (not_Gene_Nuc_And_ORF_Count + gene_ORF_Nuc_Intersection), '.2f')
558
- ################################# Precision and Recall of whole ORFs and Genes
559
- TP = format(len(comp.genes_Detected) / len(ref_genes), '.2f')
560
- FP = format(len(comp.unmatched_ORFs) / len(ref_genes), '.2f')
561
- FN = format(len(comp.genes_Undetected) / len(ref_genes), '.2f')
562
- #################################################### Need a better way to handle 'no hits/ORFs'
563
- try:
564
- precision = format(float(TP) / (float(TP) + float(FP)), '.2f')
565
- except ZeroDivisionError:
566
- precision = format(0.00, '.2f')
567
- try:
568
- recall = format(float(TP) / (float(TP) + float(FN)), '.2f')
569
- except ZeroDivisionError:
570
- recall = format(0.00, '.2f')
571
- try:
572
- false_Discovery_Rate = format(float(FP) / (float(FP) + float(TP)), '.2f')
573
- except ZeroDivisionError:
574
- false_Discovery_Rate = 'N/A'
575
- min_ORF_Length = min(comp.orf_Lengths)
576
- max_ORF_Length = max(comp.orf_Lengths)
577
- median_ORF_Length = np.median(comp.orf_Lengths)
578
-
579
- ##########################################################################
580
- # Metrics - There are numerous cases where certain metric calculations may return a ZeroDivError.
581
- ORFs_Difference = format(100 * (len(orfs) - len(ref_genes)) / len(ref_genes), '.2f') # Difference off +/-
582
- genes_Detected_Percentage = format(100 * (len(comp.genes_Detected) / len(ref_genes)), '.2f')
583
- matched_ORF_Percentage = format(100 * (len(comp.matched_ORFs) / len(orfs)), '.2f')
584
- all_ORF_Olap = (comp.orf_Pos_Olap + comp.orf_Neg_Olap) # Combine pos and neg strand overlaps
585
- matched_ORF_Olap = (comp.m_ORF_Pos_Olap + comp.m_ORF_Neg_Olap)
586
- all_Gene_Olap = (comp.gene_Pos_Olap + comp.gene_Neg_Olap)
587
-
588
- if all_ORF_Olap: # If no overlapping ORFs
543
+ comp.m_ORF_Neg_Olap.append(matched_Prev_ORF_Stop - mo_Start)
544
+ matched_Prev_ORF_Overlapped = True
545
+ elif matched_Prev_ORF_Stop < mo_Start:
546
+ if matched_Prev_ORF_Overlapped == True:
547
+ if '+' in mo_Strand:
548
+ comp.m_ORF_Pos_Olap.append(0)
549
+ elif '-' in mo_Strand:
550
+ comp.m_ORF_Neg_Olap.append(0)
551
+ matched_Prev_ORF_Overlapped = False
552
+ matched_Prev_ORF_Stop = mo_Stop
553
+ if matched_Prev_ORF_Overlapped == True: # If last has a prev overlap, count it
554
+ if '+' in mo_Strand:
555
+ comp.m_ORF_Pos_Olap.append(0)
556
+ elif '-' in mo_Strand:
557
+ comp.m_ORF_Neg_Olap.append(0)
558
+ ####
559
+ gene_Coverage_Genome = format(100 * np.sum(gene_Nuc_Array) / comp.genome_Size, '.2f')
560
+ orf_Coverage_Genome = format(100 * np.sum(orf_Nuc_Array) / comp.genome_Size, '.2f')
561
+ matched_ORF_Coverage_Genome = format(100 * np.sum(matched_ORF_Nuc_Array) / comp.genome_Size,
562
+ '.2f') # This gets the nts which are in matched ORFs - Check below
563
+ # matched_ORF_Nuc_AND_Gene = np.logical_and(matched_ORF_Nuc_Array,gene_Nuc_Array) + [0 for i in range(len(gene_Nuc_Array))] # This gets the nts which are in both matched ORFs and detected genes
564
+ # matched_ORF_Coverage_Genome = format(100 * np.count_nonzero(matched_ORF_Nuc_AND_Gene) / comp.genome_Size,'.2f')
565
+
566
+ # gene and orf nucleotide Intersection
567
+ gene_ORF_Nuc_Intersection = np.sum(gene_Nuc_Array & orf_Nuc_Array)
568
+ # not gene but orf nucleotides
569
+ not_Gene_Nuc_Array = np.logical_not(gene_Nuc_Array)
570
+ not_Gene_Nuc_And_ORF_Count = np.sum(not_Gene_Nuc_Array & orf_Nuc_Array)
571
+ # not orf nucleotides but gene
572
+ not_ORF_Nuc_Array = np.logical_not(orf_Nuc_Array)
573
+ not_ORF_Nuc_And_Gene_Count = np.sum(not_ORF_Nuc_Array & gene_Nuc_Array)
574
+ # not gene or orf nucleotides
575
+ not_Gene_Nuc_Not_ORF_Nuc_Count = np.sum(not_Gene_Nuc_Array & not_ORF_Nuc_Array)
576
+ # Nucleotide 'accuracy' - Normalised by number of nucelotides annotated by a gene
577
+ NT_TP = format(gene_ORF_Nuc_Intersection / np.sum(gene_Nuc_Array), '.2f')
578
+ NT_FP = format(not_Gene_Nuc_And_ORF_Count / np.sum(not_Gene_Nuc_Array), '.2f')
579
+ NT_FN = format(not_ORF_Nuc_And_Gene_Count / np.sum(gene_Nuc_Array), '.2f')
580
+ NT_TN = format(not_Gene_Nuc_Not_ORF_Nuc_Count / np.sum(not_Gene_Nuc_Array), '.2f')
581
+ NT_Precision = format(gene_ORF_Nuc_Intersection / (gene_ORF_Nuc_Intersection + not_Gene_Nuc_And_ORF_Count), '.2f')
582
+ NT_Recall = format(gene_ORF_Nuc_Intersection / (gene_ORF_Nuc_Intersection + not_ORF_Nuc_And_Gene_Count), '.2f')
583
+ NT_False_Discovery_Rate = format(
584
+ not_Gene_Nuc_And_ORF_Count / (not_Gene_Nuc_And_ORF_Count + gene_ORF_Nuc_Intersection), '.2f')
585
+ ################################# Precision and Recall of whole ORFs and Genes
586
+ TP = format(len(comp.genes_Detected) / len(ref_genes), '.2f')
587
+ FP = format(len(comp.unmatched_ORFs) / len(ref_genes), '.2f')
588
+ FN = format(len(comp.genes_Undetected) / len(ref_genes), '.2f')
589
+ #################################################### Need a better way to handle 'no hits/ORFs'
589
590
  try:
590
- overlap_Difference = format(100 * (len(all_ORF_Olap) - len(all_Gene_Olap)) / len(all_Gene_Olap), '.2f')
591
- matched_Overlap_Difference = format(100 * (len(matched_ORF_Olap) - len(all_Gene_Olap)) / len(all_Gene_Olap),
592
- '.2f')
591
+ precision = format(float(TP) / (float(TP) + float(FP)), '.2f')
593
592
  except ZeroDivisionError:
593
+ precision = format(0.00, '.2f')
594
+ try:
595
+ recall = format(float(TP) / (float(TP) + float(FN)), '.2f')
596
+ except ZeroDivisionError:
597
+ recall = format(0.00, '.2f')
598
+ try:
599
+ false_Discovery_Rate = format(float(FP) / (float(FP) + float(TP)), '.2f')
600
+ except ZeroDivisionError:
601
+ false_Discovery_Rate = 'N/A'
602
+ min_ORF_Length = min(comp.orf_Lengths)
603
+ max_ORF_Length = max(comp.orf_Lengths)
604
+ median_ORF_Length = np.median(comp.orf_Lengths)
605
+
606
+ ##########################################################################
607
+ # Metrics - There are numerous cases where certain metric calculations may return a ZeroDivError.
608
+ ORFs_Difference = format(100 * (len(current_orfs) - len(ref_genes)) / len(ref_genes), '.2f') # Difference off +/-
609
+ genes_Detected_Percentage = format(100 * (len(comp.genes_Detected) / len(ref_genes)), '.2f')
610
+ matched_ORF_Percentage = format(100 * (len(comp.matched_ORFs) / len(current_orfs)), '.2f')
611
+ all_ORF_Olap = (comp.orf_Pos_Olap + comp.orf_Neg_Olap) # Combine pos and neg strand overlaps
612
+ matched_ORF_Olap = (comp.m_ORF_Pos_Olap + comp.m_ORF_Neg_Olap)
613
+ all_Gene_Olap = (comp.gene_Pos_Olap + comp.gene_Neg_Olap)
614
+
615
+ if all_ORF_Olap: # If no overlapping ORFs
616
+ try:
617
+ overlap_Difference = format(100 * (len(all_ORF_Olap) - len(all_Gene_Olap)) / len(all_Gene_Olap), '.2f')
618
+ matched_Overlap_Difference = format(100 * (len(matched_ORF_Olap) - len(all_Gene_Olap)) / len(all_Gene_Olap),
619
+ '.2f')
620
+ except ZeroDivisionError:
621
+ overlap_Difference = 'N/A'
622
+ matched_Overlap_Difference = 'N/A'
623
+ num_All_ORF_Olap = len(all_ORF_Olap)
624
+ if matched_ORF_Olap:
625
+ max_Matched_ORF_Olap = max(matched_ORF_Olap)
626
+ matched_Median_ORF_Overlap = format(np.median(matched_ORF_Olap), '.2f')
627
+ else:
628
+ max_Matched_ORF_Olap = 'N/A'
629
+ matched_Median_ORF_Overlap = 'N/A'
630
+ max_All_ORF_Olap = max(all_ORF_Olap)
631
+ median_ORF_Overlap = format(np.median(all_ORF_Olap), '.2f')
632
+ else:
594
633
  overlap_Difference = 'N/A'
595
634
  matched_Overlap_Difference = 'N/A'
596
- num_All_ORF_Olap = len(all_ORF_Olap)
597
- if matched_ORF_Olap:
598
- max_Matched_ORF_Olap = max(matched_ORF_Olap)
599
- matched_Median_ORF_Overlap = format(np.median(matched_ORF_Olap), '.2f')
600
- else:
635
+ num_All_ORF_Olap = 0
601
636
  max_Matched_ORF_Olap = 'N/A'
637
+ max_All_ORF_Olap = 'N/A'
638
+ median_ORF_Overlap = 'N/A'
602
639
  matched_Median_ORF_Overlap = 'N/A'
603
- max_All_ORF_Olap = max(all_ORF_Olap)
604
- median_ORF_Overlap = format(np.median(all_ORF_Olap), '.2f')
605
- else:
606
- overlap_Difference = 'N/A'
607
- matched_Overlap_Difference = 'N/A'
608
- num_All_ORF_Olap = 0
609
- max_Matched_ORF_Olap = 'N/A'
610
- max_All_ORF_Olap = 'N/A'
611
- median_ORF_Overlap = 'N/A'
612
- matched_Median_ORF_Overlap = 'N/A'
613
- if len(matched_ORF_Olap) == 0: # -100.00 is not informative
614
- matched_Overlap_Difference = 'N/A'
615
-
616
- # Need to NA everything
617
-
618
- if comp.orf_Short and comp.gene_Short: # IF Short-ORFs/Genes
619
- short_ORF_Difference = format(100 * (len(comp.orf_Short) - len(comp.gene_Short)) / len(comp.gene_Short), '.2f')
620
- matched_Short_ORF_Difference = format(
621
- 100 * (len(comp.m_ORF_Short) - len(comp.gene_Short)) / len(comp.gene_Short), '.2f')
622
- num_ORF_Short = len(comp.orf_Short)
623
- num_Matched_ORF_Short = len(comp.m_ORF_Short)
624
- elif comp.orf_Short: # If only Short-ORFs
625
- num_ORF_Short = len(comp.orf_Short)
626
- num_Matched_ORF_Short = 'N/A'
627
- short_ORF_Difference = (num_ORF_Short * 100)
628
- matched_Short_ORF_Difference = 'N/A'
629
- else: # If only Short-Genes and Undetected StORFs
630
- comp.gene_Short
631
- short_ORF_Difference = 'N/A'
632
- matched_Short_ORF_Difference = 'N/A'
633
- num_ORF_Short = 0
634
- num_Matched_ORF_Short = 'N/A'
635
- if num_Matched_ORF_Short == 0: # -100.00 is not informative
636
- matched_Short_ORF_Difference = 'N/A'
637
-
638
- median_Length_Difference = format(100 * (median_ORF_Length - median_Gene_Length) / median_Gene_Length, '.2f')
639
- min_Length_Difference = format(100 * (min_ORF_Length - min_Gene_Length) / min_Gene_Length, '.2f')
640
- max_Length_Difference = format(100 * (max_ORF_Length - max_Gene_Length) / max_Gene_Length, '.2f')
641
- pos_Strand_Percentage = format(comp.pos_Strand / len(orfs), '.2f')
642
- neg_Strand_Percentage = format(comp.neg_Strand / len(orfs), '.2f')
643
- median_ORF_GC = np.median(comp.orf_GC)
644
- matched_Median_ORF_GC = np.median(comp.m_ORF_GC)
645
- median_Gene_GC = np.median(comp.gene_GC)
646
- median_GC_Difference = format(100 * (float(median_ORF_GC) - float(median_Gene_GC)) / float(median_Gene_GC), '.2f')
647
- matched_Median_GC_Difference = format(
648
- 100 * (float(matched_Median_ORF_GC) - float(median_Gene_GC)) / float(median_Gene_GC), '.2f')
649
-
650
- if comp.matched_ORFs: # No ORFs detected a gene
651
- extended_CDS_Percentage = format(100 * comp.extended_CDS / len(comp.matched_ORFs), '.2f')
652
- extended_Start_Percentage = format(100 * comp.extended_Start / len(comp.matched_ORFs), '.2f')
653
- extended_Stop_Percentage = format(100 * comp.extended_Stop / len(comp.matched_ORFs), '.2f')
654
- perfect_Matches_Percentage = format(100 * len(comp.perfect_Matches) / len(comp.matched_ORFs), '.2f')
655
- perfect_Starts_Percentage = format(100 * comp.perfect_Starts / len(comp.matched_ORFs), '.2f')
656
- perfect_Stops_Percentage = format(100 * comp.perfect_Stops / len(comp.matched_ORFs), '.2f')
657
- else:
658
- # correct_Frame_Percentage = 0
659
- extended_CDS_Percentage = format(0.00, '.2f')
660
- extended_Start_Percentage = format(0.00, '.2f')
661
- extended_Stop_Percentage = format(0.00, '.2f')
662
- perfect_Matches_Percentage = format(0.00, '.2f')
663
- perfect_Starts_Percentage = format(0.00, '.2f')
664
- perfect_Stops_Percentage = format(0.00, '.2f')
665
- ################### Missed Genes Metrics:
666
- if comp.genes_Undetected:
667
- mg_Starts = []
668
- mg_Stops = []
669
- mg_Lengths = []
670
- mg_Strands = []
671
- for mg, seq in comp.genes_Undetected.items():
672
- mg = mg.split(',')
673
- mg_Starts.append(mg[3])
674
- mg_Stops.append(mg[4])
675
- mg_Strands.append(mg[2])
676
- mg_Lengths.append(int(mg[1]) - int(mg[0]))
677
-
678
- mg_ATG = 100 * mg_Starts.count('ATG') / len(comp.genes_Undetected)
679
- mg_GTG = 100 * mg_Starts.count('GTG') / len(comp.genes_Undetected)
680
- mg_TTG = 100 * mg_Starts.count('TTG') / len(comp.genes_Undetected)
681
- mg_ATT = 100 * mg_Starts.count('ATT') / len(comp.genes_Undetected)
682
- mg_CTG = 100 * mg_Starts.count('CTG') / len(comp.genes_Undetected)
683
- mg_O_Start = 100 - (mg_ATG + mg_GTG + mg_TTG + mg_ATT + mg_CTG)
684
- mg_TGA = 100 * mg_Stops.count('TGA') / len(comp.genes_Undetected)
685
- mg_TAA = 100 * mg_Stops.count('TAA') / len(comp.genes_Undetected)
686
- mg_TAG = 100 * mg_Stops.count('TAG') / len(comp.genes_Undetected)
687
- mg_O_Stop = 100 - (mg_TGA + mg_TAA + mg_TAG)
688
- median_mg_Len = np.median(mg_Lengths)
689
- mg_Pos = mg_Strands.count('+')
690
- mg_Neg = mg_Strands.count('-')
691
- undetected_Gene_Metrics = (
692
- format(mg_ATG, '.2f'), format(mg_GTG, '.2f'), format(mg_TTG, '.2f'), format(mg_ATT, '.2f'),
693
- format(mg_CTG, '.2f'), format(mg_O_Start, '.2f'), format(mg_TGA, '.2f'), format(mg_TAA, '.2f'),
694
- format(mg_TAG, '.2f'), format(mg_O_Stop, '.2f'), format(median_mg_Len, '.2f'), mg_Pos, mg_Neg)
695
- else:
696
- undetected_Gene_Metrics = ''
697
- #################### Unmathced ORF Metrics:
698
- if comp.unmatched_ORFs:
699
- uo_Starts = []
700
- uo_Stops = []
701
- uo_Lengths = []
702
- uo_Strands = []
703
- for uo, seq in comp.unmatched_ORFs.items():
704
- uo = uo.split(',')
705
- uo_Starts.append(uo[3])
706
- uo_Stops.append(uo[4])
707
- uo_Strands.append(uo[2])
708
- uo_Lengths.append(int(uo[1]) - int(uo[0]))
709
- uo_ATG = 100 * uo_Starts.count('ATG') / len(comp.unmatched_ORFs)
710
- uo_GTG = 100 * uo_Starts.count('GTG') / len(comp.unmatched_ORFs)
711
- uo_TTG = 100 * uo_Starts.count('TTG') / len(comp.unmatched_ORFs)
712
- uo_ATT = 100 * uo_Starts.count('ATT') / len(comp.unmatched_ORFs)
713
- uo_CTG = 100 * uo_Starts.count('CTG') / len(comp.unmatched_ORFs)
714
- uo_O_Start = 100 - (uo_ATG + uo_GTG + uo_TTG + uo_ATT + uo_CTG)
715
- uo_TGA = 100 * uo_Stops.count('TGA') / len(comp.unmatched_ORFs)
716
- uo_TAA = 100 * uo_Stops.count('TAA') / len(comp.unmatched_ORFs)
717
- uo_TAG = 100 * uo_Stops.count('TAG') / len(comp.unmatched_ORFs)
718
- uo_O_Stop = 100 - (uo_TGA + uo_TAA + uo_TAG)
719
- # uo_O_Stop = 100 * uo_O_Stop / len(comp.unmatched_ORFs) ########WHY?
720
- median_uo_Len = np.median(uo_Lengths)
721
- uo_Pos = uo_Strands.count('+')
722
- uo_Neg = uo_Strands.count('-')
723
- unmatched_ORF_Metrics = (
724
- format(uo_ATG, '.2f'), format(uo_GTG, '.2f'), format(uo_TTG, '.2f'), format(uo_ATT, '.2f'),
725
- format(uo_CTG, '.2f'), format(uo_O_Start, '.2f'), format(uo_TGA, '.2f'), format(uo_TAA, '.2f'),
726
- format(uo_TAG, '.2f'), format(uo_O_Stop, '.2f'), format(median_uo_Len, '.2f'), uo_Pos, uo_Neg)
727
- else:
728
- unmatched_ORF_Metrics = ''
729
- #################################
730
-
731
- all_Metrics = collections.OrderedDict(
732
- {'Number_of_ORFs': len(orfs), 'Percent_Difference_of_All_ORFs': ORFs_Difference,
733
- 'Number_of_ORFs_that_Detected_a_Gene': len(comp.matched_ORFs),
734
- 'Percentage_of_ORFs_that_Detected_a_Gene': matched_ORF_Percentage,
735
- 'Number_of_Genes_Detected': len(comp.genes_Detected),
736
- 'Percentage_of_Genes_Detected': genes_Detected_Percentage, 'Median_Length_of_All_ORFs': median_ORF_Length,
737
- 'Median_Length_Difference': median_Length_Difference,
738
- 'Minimum_Length_of_All_ORFs': min_ORF_Length, 'Minimum_Length_Difference': min_Length_Difference,
739
- 'Maximum_Length_of_All_ORFs': max_ORF_Length, 'Maximum_Length_Difference': max_Length_Difference,
740
- 'Median_GC_content_of_All_ORFs': format(median_ORF_GC, '.2f'),
741
- 'Percent_Difference_of_All_ORFs_Median_GC': median_GC_Difference,
742
- 'Median_GC_content_of_Matched_ORFs': format(matched_Median_ORF_GC, '.2f'),
743
- 'Percent_Difference_of_Matched_ORF_GC': matched_Median_GC_Difference,
744
- 'Number_of_ORFs_which_Overlap_Another_ORF': num_All_ORF_Olap,
745
- 'Percent_Difference_of_Overlapping_ORFs': overlap_Difference,
746
- 'Maximum_ORF_Overlap': max_All_ORF_Olap, 'Median_ORF_Overlap': median_ORF_Overlap,
747
- 'Number_of_Matched_ORFs_Overlapping_Another_ORF': len(matched_ORF_Olap),
748
- 'Percentage_Difference_of_Matched_Overlapping_CDSs': matched_Overlap_Difference,
749
- 'Maximum_Matched_ORF_Overlap': max_Matched_ORF_Olap, 'Median_Matched_ORF_Overlap': matched_Median_ORF_Overlap,
750
- 'Number_of_Short-ORFs': num_ORF_Short, 'Percent_Difference_of_Short-ORFs': short_ORF_Difference,
751
- 'Number_of_Short-Matched-ORFs': num_Matched_ORF_Short,
752
- 'Percent_Difference_of_Short-Matched-ORFs': matched_Short_ORF_Difference,
753
- 'Number_of_Perfect_Matches': len(comp.perfect_Matches),
754
- 'Percentage_of_Perfect_Matches': perfect_Matches_Percentage,
755
- 'Number_of_Perfect_Starts': comp.perfect_Starts, 'Percentage_of_Perfect_Starts': perfect_Starts_Percentage,
756
- 'Number_of_Perfect_Stops': comp.perfect_Stops, 'Percentage_of_Perfect_Stops': perfect_Stops_Percentage,
757
- 'Number_of_Out_of_Frame_ORFs': len(comp.out_Of_Frame_ORFs),
758
- 'Number_of_Matched_ORFs_Extending_a_Coding_Region': comp.extended_CDS,
759
- 'Percentage_of_Matched_ORFs_Extending_a_Coding_Region': extended_CDS_Percentage,
760
- 'Number_of_Matched_ORFs_Extending_Start_Region': comp.extended_Start,
761
- 'Percentage_of_Matched_ORFs_Extending_Start_Region': extended_Start_Percentage,
762
- 'Number_of_Matched_ORFs_Extending_Stop_Region': comp.extended_Stop,
763
- 'Percentage_of_Matched_ORFs_Extending_Stop_Region': extended_Stop_Percentage,
764
- 'Number_of_All_ORFs_on_Positive_Strand': comp.pos_Strand,
765
- 'Percentage_of_All_ORFs_on_Positive_Strand': pos_Strand_Percentage,
766
- 'Number_of_All_ORFs_on_Negative_Strand': comp.neg_Strand,
767
- 'Percentage_of_All_ORFs_on_Negative_Strand': neg_Strand_Percentage,
768
- 'Median_Start_Difference_of_Matched_ORFs': median_Start_Difference,
769
- 'Median_Stop_Difference_of_Matched_ORFs': median_Stop_Difference, 'ATG_Start_Percentage': atg_P,
770
- 'GTG_Start_Percentage': gtg_P, 'TTG_Start_Percentage': ttg_P,
771
- 'ATT_Start_Percentage': att_P, 'CTG_Start_Percentage': ctg_P, 'Other_Start_Codon_Percentage': other_Start_P,
772
- 'TAG_Stop_Percentage': tag_P, 'TAA_Stop_Percentage': taa_P,
773
- 'TGA_Stop_Percentage': tga_P, 'Other_Stop_Codon_Percentage': other_Stop_P, 'True_Positive': TP,
774
- 'False_Positive': FP, 'False_Negative': FN, 'Precision': precision,
775
- 'Recall': recall, 'False_Discovery_Rate': false_Discovery_Rate, 'Nucleotide_True_Positive': NT_TP,
776
- 'Nucleotide_False_Positive': NT_FP, 'Nucleotide_True_Negative': NT_TN,
777
- 'Nucleotide_False_Negative': NT_FN, 'Nucleotide_Precision': NT_Precision, 'Nucleotide_Recall': NT_Recall,
778
- 'Nucleotide_False_Discovery_Rate': NT_False_Discovery_Rate,
779
- 'ORF_Nucleotide_Coverage_of_Genome': orf_Coverage_Genome,
780
- 'Matched_ORF_Nucleotide_Coverage_of_Genome': matched_ORF_Coverage_Genome})
781
-
782
- rep_Metrics = collections.OrderedDict(
783
- {'Percentage_of_Genes_Detected': genes_Detected_Percentage,
784
- 'Percentage_of_ORFs_that_Detected_a_Gene': matched_ORF_Percentage,
785
- 'Percent_Difference_of_All_ORFs': ORFs_Difference,
786
- 'Median_Length_Difference': median_Length_Difference,
787
- 'Percentage_of_Perfect_Matches': perfect_Matches_Percentage,
788
- 'Median_Start_Difference_of_Matched_ORFs': median_Start_Difference,
789
- 'Median_Stop_Difference_of_Matched_ORFs': median_Stop_Difference,
790
- 'Percentage_Difference_of_Matched_Overlapping_CDSs': matched_Overlap_Difference,
791
- 'Percent_Difference_of_Short-Matched-ORFs': matched_Short_ORF_Difference,
792
- 'Precision': precision,
793
- 'Recall': recall,
794
- 'False_Discovery_Rate': false_Discovery_Rate})
795
-
796
- # To account for unbalanced data
797
- for m_key, m_value in all_Metrics.items():
798
- if 'nan' == m_value:
799
- all_Metrics[m_key] = 'N/A'
800
-
801
- return all_Metrics, rep_Metrics, start_Difference, stop_Difference, other_Starts, other_Stops, comp.perfect_Matches, comp.genes_Undetected, comp.unmatched_ORFs, undetected_Gene_Metrics, unmatched_ORF_Metrics, orf_Coverage_Genome, matched_ORF_Coverage_Genome, gene_Coverage_Genome, comp.multi_Matched_ORFs, comp.partial_Hits
640
+ if len(matched_ORF_Olap) == 0: # -100.00 is not informative
641
+ matched_Overlap_Difference = 'N/A'
642
+
643
+ # Need to NA everything
644
+
645
+ if comp.orf_Short and comp.gene_Short: # IF Short-ORFs/Genes
646
+ short_ORF_Difference = format(100 * (len(comp.orf_Short) - len(comp.gene_Short)) / len(comp.gene_Short), '.2f')
647
+ matched_Short_ORF_Difference = format(
648
+ 100 * (len(comp.m_ORF_Short) - len(comp.gene_Short)) / len(comp.gene_Short), '.2f')
649
+ num_ORF_Short = len(comp.orf_Short)
650
+ num_Matched_ORF_Short = len(comp.m_ORF_Short)
651
+ elif comp.orf_Short: # If only Short-ORFs
652
+ num_ORF_Short = len(comp.orf_Short)
653
+ num_Matched_ORF_Short = 'N/A'
654
+ short_ORF_Difference = (num_ORF_Short * 100)
655
+ matched_Short_ORF_Difference = 'N/A'
656
+ else: # If only Short-Genes and Undetected StORFs
657
+ comp.gene_Short
658
+ short_ORF_Difference = 'N/A'
659
+ matched_Short_ORF_Difference = 'N/A'
660
+ num_ORF_Short = 0
661
+ num_Matched_ORF_Short = 'N/A'
662
+ if num_Matched_ORF_Short == 0: # -100.00 is not informative
663
+ matched_Short_ORF_Difference = 'N/A'
664
+
665
+ median_Length_Difference = format(100 * (median_ORF_Length - median_Gene_Length) / median_Gene_Length, '.2f')
666
+ min_Length_Difference = format(100 * (min_ORF_Length - min_Gene_Length) / min_Gene_Length, '.2f')
667
+ max_Length_Difference = format(100 * (max_ORF_Length - max_Gene_Length) / max_Gene_Length, '.2f')
668
+ pos_Strand_Percentage = format(comp.pos_Strand / len(current_orfs), '.2f')
669
+ neg_Strand_Percentage = format(comp.neg_Strand / len(current_orfs), '.2f')
670
+ median_ORF_GC = np.median(comp.orf_GC)
671
+ matched_Median_ORF_GC = np.median(comp.m_ORF_GC)
672
+ median_Gene_GC = np.median(comp.gene_GC)
673
+ median_GC_Difference = format(100 * (float(median_ORF_GC) - float(median_Gene_GC)) / float(median_Gene_GC), '.2f')
674
+ matched_Median_GC_Difference = format(
675
+ 100 * (float(matched_Median_ORF_GC) - float(median_Gene_GC)) / float(median_Gene_GC), '.2f')
676
+
677
+ if comp.matched_ORFs: # No ORFs detected a gene
678
+ extended_CDS_Percentage = format(100 * comp.extended_CDS / len(comp.matched_ORFs), '.2f')
679
+ extended_Start_Percentage = format(100 * comp.extended_Start / len(comp.matched_ORFs), '.2f')
680
+ extended_Stop_Percentage = format(100 * comp.extended_Stop / len(comp.matched_ORFs), '.2f')
681
+ perfect_Matches_Percentage = format(100 * len(comp.perfect_Matches) / len(comp.matched_ORFs), '.2f')
682
+ perfect_Starts_Percentage = format(100 * comp.perfect_Starts / len(comp.matched_ORFs), '.2f')
683
+ perfect_Stops_Percentage = format(100 * comp.perfect_Stops / len(comp.matched_ORFs), '.2f')
684
+ else:
685
+ # correct_Frame_Percentage = 0
686
+ extended_CDS_Percentage = format(0.00, '.2f')
687
+ extended_Start_Percentage = format(0.00, '.2f')
688
+ extended_Stop_Percentage = format(0.00, '.2f')
689
+ perfect_Matches_Percentage = format(0.00, '.2f')
690
+ perfect_Starts_Percentage = format(0.00, '.2f')
691
+ perfect_Stops_Percentage = format(0.00, '.2f')
692
+ ################### Missed Genes Metrics:
693
+ if comp.genes_Undetected:
694
+ mg_Starts = []
695
+ mg_Stops = []
696
+ mg_Lengths = []
697
+ mg_Strands = []
698
+ for mg, seq in comp.genes_Undetected.items():
699
+ mg = mg.split(',')
700
+ mg_Starts.append(mg[3])
701
+ mg_Stops.append(mg[4])
702
+ mg_Strands.append(mg[2])
703
+ mg_Lengths.append(int(mg[1]) - int(mg[0]))
704
+
705
+ mg_ATG = 100 * mg_Starts.count('ATG') / len(comp.genes_Undetected)
706
+ mg_GTG = 100 * mg_Starts.count('GTG') / len(comp.genes_Undetected)
707
+ mg_TTG = 100 * mg_Starts.count('TTG') / len(comp.genes_Undetected)
708
+ mg_ATT = 100 * mg_Starts.count('ATT') / len(comp.genes_Undetected)
709
+ mg_CTG = 100 * mg_Starts.count('CTG') / len(comp.genes_Undetected)
710
+ mg_O_Start = 100 - (mg_ATG + mg_GTG + mg_TTG + mg_ATT + mg_CTG)
711
+ mg_TGA = 100 * mg_Stops.count('TGA') / len(comp.genes_Undetected)
712
+ mg_TAA = 100 * mg_Stops.count('TAA') / len(comp.genes_Undetected)
713
+ mg_TAG = 100 * mg_Stops.count('TAG') / len(comp.genes_Undetected)
714
+ mg_O_Stop = 100 - (mg_TGA + mg_TAA + mg_TAG)
715
+ median_mg_Len = np.median(mg_Lengths)
716
+ mg_Pos = mg_Strands.count('+')
717
+ mg_Neg = mg_Strands.count('-')
718
+ undetected_Gene_Metrics = (
719
+ format(mg_ATG, '.2f'), format(mg_GTG, '.2f'), format(mg_TTG, '.2f'), format(mg_ATT, '.2f'),
720
+ format(mg_CTG, '.2f'), format(mg_O_Start, '.2f'), format(mg_TGA, '.2f'), format(mg_TAA, '.2f'),
721
+ format(mg_TAG, '.2f'), format(mg_O_Stop, '.2f'), format(median_mg_Len, '.2f'), mg_Pos, mg_Neg)
722
+ else:
723
+ undetected_Gene_Metrics = ''
724
+ #################### Unmathced ORF Metrics:
725
+ if comp.unmatched_ORFs:
726
+ uo_Starts = []
727
+ uo_Stops = []
728
+ uo_Lengths = []
729
+ uo_Strands = []
730
+ for uo, seq in comp.unmatched_ORFs.items():
731
+ uo = uo.split(',')
732
+ uo_Starts.append(uo[3])
733
+ uo_Stops.append(uo[4])
734
+ uo_Strands.append(uo[2])
735
+ uo_Lengths.append(int(uo[1]) - int(uo[0]))
736
+ uo_ATG = 100 * uo_Starts.count('ATG') / len(comp.unmatched_ORFs)
737
+ uo_GTG = 100 * uo_Starts.count('GTG') / len(comp.unmatched_ORFs)
738
+ uo_TTG = 100 * uo_Starts.count('TTG') / len(comp.unmatched_ORFs)
739
+ uo_ATT = 100 * uo_Starts.count('ATT') / len(comp.unmatched_ORFs)
740
+ uo_CTG = 100 * uo_Starts.count('CTG') / len(comp.unmatched_ORFs)
741
+ uo_O_Start = 100 - (uo_ATG + uo_GTG + uo_TTG + uo_ATT + uo_CTG)
742
+ uo_TGA = 100 * uo_Stops.count('TGA') / len(comp.unmatched_ORFs)
743
+ uo_TAA = 100 * uo_Stops.count('TAA') / len(comp.unmatched_ORFs)
744
+ uo_TAG = 100 * uo_Stops.count('TAG') / len(comp.unmatched_ORFs)
745
+ uo_O_Stop = 100 - (uo_TGA + uo_TAA + uo_TAG)
746
+ # uo_O_Stop = 100 * uo_O_Stop / len(comp.unmatched_ORFs) ########WHY?
747
+ median_uo_Len = np.median(uo_Lengths)
748
+ uo_Pos = uo_Strands.count('+')
749
+ uo_Neg = uo_Strands.count('-')
750
+ unmatched_ORF_Metrics = (
751
+ format(uo_ATG, '.2f'), format(uo_GTG, '.2f'), format(uo_TTG, '.2f'), format(uo_ATT, '.2f'),
752
+ format(uo_CTG, '.2f'), format(uo_O_Start, '.2f'), format(uo_TGA, '.2f'), format(uo_TAA, '.2f'),
753
+ format(uo_TAG, '.2f'), format(uo_O_Stop, '.2f'), format(median_uo_Len, '.2f'), uo_Pos, uo_Neg)
754
+ else:
755
+ unmatched_ORF_Metrics = ''
756
+ #################################
757
+ # Rep_Metrics - This is the final report of metrics
758
+ rep_Metrics = collections.OrderedDict(
759
+ {'Percentage_of_Genes_Detected': genes_Detected_Percentage,
760
+ 'genes_Undetected': comp.genes_Undetected,
761
+ 'undetected_Gene_Metrics': undetected_Gene_Metrics,
762
+ 'gene_Coverage_Genome': gene_Coverage_Genome,
763
+ 'Percentage_of_ORFs_that_Detected_a_Gene': matched_ORF_Percentage,
764
+ 'Percent_Difference_of_All_ORFs': ORFs_Difference,
765
+ 'Median_Length_Difference': median_Length_Difference,
766
+ 'Percentage_of_Perfect_Matches': perfect_Matches_Percentage,
767
+ 'Median_Start_Difference_of_Matched_ORFs': median_Start_Difference,
768
+ 'Median_Stop_Difference_of_Matched_ORFs': median_Stop_Difference,
769
+ 'Percentage_Difference_of_Matched_Overlapping_CDSs': matched_Overlap_Difference,
770
+ 'Percent_Difference_of_Short-Matched-ORFs': matched_Short_ORF_Difference,
771
+ 'Precision': precision,
772
+ 'Recall': recall,
773
+ 'False_Discovery_Rate': false_Discovery_Rate})
774
+ # Pred Metrics - This is the final report of metrics
775
+ pred_metrics = collections.OrderedDict(
776
+ {'Number_of_ORFs': len(current_orfs), 'Percent_Difference_of_All_ORFs': ORFs_Difference,
777
+ 'perfect_Matches': comp.perfect_Matches,
778
+ 'unmatched_ORFs': comp.unmatched_ORFs,
779
+ 'unmatched_ORF_Metrics': unmatched_ORF_Metrics,
780
+ 'orf_Coverage_Genome': orf_Coverage_Genome,
781
+ 'matched_ORF_Coverage_Genome': matched_ORF_Coverage_Genome,
782
+ 'multi_Matched_ORFs': comp.multi_Matched_ORFs,
783
+ 'partial_Hits': comp.partial_Hits,
784
+ 'Number_of_ORFs_that_Detected_a_Gene': len(comp.matched_ORFs),
785
+ 'Percentage_of_ORFs_that_Detected_a_Gene': matched_ORF_Percentage,
786
+ 'Number_of_Genes_Detected': len(comp.genes_Detected),
787
+ 'Percentage_of_Genes_Detected': genes_Detected_Percentage, 'Median_Length_of_All_ORFs': median_ORF_Length,
788
+ 'Median_Length_Difference': median_Length_Difference,
789
+ 'Minimum_Length_of_All_ORFs': min_ORF_Length, 'Minimum_Length_Difference': min_Length_Difference,
790
+ 'Maximum_Length_of_All_ORFs': max_ORF_Length, 'Maximum_Length_Difference': max_Length_Difference,
791
+ 'Median_GC_content_of_All_ORFs': format(median_ORF_GC, '.2f'),
792
+ 'Percent_Difference_of_All_ORFs_Median_GC': median_GC_Difference,
793
+ 'Median_GC_content_of_Matched_ORFs': format(matched_Median_ORF_GC, '.2f'),
794
+ 'Percent_Difference_of_Matched_ORF_GC': matched_Median_GC_Difference,
795
+ 'Number_of_ORFs_which_Overlap_Another_ORF': num_All_ORF_Olap,
796
+ 'Percent_Difference_of_Overlapping_ORFs': overlap_Difference,
797
+ 'Maximum_ORF_Overlap': max_All_ORF_Olap, 'Median_ORF_Overlap': median_ORF_Overlap,
798
+ 'Number_of_Matched_ORFs_Overlapping_Another_ORF': len(matched_ORF_Olap),
799
+ 'Percentage_Difference_of_Matched_Overlapping_CDSs': matched_Overlap_Difference,
800
+ 'Maximum_Matched_ORF_Overlap': max_Matched_ORF_Olap, 'Median_Matched_ORF_Overlap': matched_Median_ORF_Overlap,
801
+ 'Number_of_Short-ORFs': num_ORF_Short, 'Percent_Difference_of_Short-ORFs': short_ORF_Difference,
802
+ 'Number_of_Short-Matched-ORFs': num_Matched_ORF_Short,
803
+ 'Percent_Difference_of_Short-Matched-ORFs': matched_Short_ORF_Difference,
804
+ 'Number_of_Perfect_Matches': len(comp.perfect_Matches),
805
+ 'Percentage_of_Perfect_Matches': perfect_Matches_Percentage,
806
+ 'Number_of_Perfect_Starts': comp.perfect_Starts, 'Percentage_of_Perfect_Starts': perfect_Starts_Percentage,
807
+ 'Number_of_Perfect_Stops': comp.perfect_Stops, 'Percentage_of_Perfect_Stops': perfect_Stops_Percentage,
808
+ 'Number_of_Out_of_Frame_ORFs': len(comp.out_Of_Frame_ORFs),
809
+ 'Number_of_Matched_ORFs_Extending_a_Coding_Region': comp.extended_CDS,
810
+ 'Percentage_of_Matched_ORFs_Extending_a_Coding_Region': extended_CDS_Percentage,
811
+ 'Number_of_Matched_ORFs_Extending_Start_Region': comp.extended_Start,
812
+ 'Percentage_of_Matched_ORFs_Extending_Start_Region': extended_Start_Percentage,
813
+ 'Number_of_Matched_ORFs_Extending_Stop_Region': comp.extended_Stop,
814
+ 'Percentage_of_Matched_ORFs_Extending_Stop_Region': extended_Stop_Percentage,
815
+ 'Number_of_All_ORFs_on_Positive_Strand': comp.pos_Strand,
816
+ 'Percentage_of_All_ORFs_on_Positive_Strand': pos_Strand_Percentage,
817
+ 'Number_of_All_ORFs_on_Negative_Strand': comp.neg_Strand,
818
+ 'Percentage_of_All_ORFs_on_Negative_Strand': neg_Strand_Percentage,
819
+ 'Median_Start_Difference_of_Matched_ORFs': median_Start_Difference,
820
+ 'Median_Stop_Difference_of_Matched_ORFs': median_Stop_Difference, 'ATG_Start_Percentage': atg_P,
821
+ 'GTG_Start_Percentage': gtg_P, 'TTG_Start_Percentage': ttg_P,
822
+ 'ATT_Start_Percentage': att_P, 'CTG_Start_Percentage': ctg_P, 'Other_Start_Codon_Percentage': other_Start_P,
823
+ 'TAG_Stop_Percentage': tag_P, 'TAA_Stop_Percentage': taa_P,
824
+ 'TGA_Stop_Percentage': tga_P, 'Other_Stop_Codon_Percentage': other_Stop_P, 'True_Positive': TP,
825
+ 'False_Positive': FP, 'False_Negative': FN, 'Precision': precision,
826
+ 'Recall': recall, 'False_Discovery_Rate': false_Discovery_Rate, 'Nucleotide_True_Positive': NT_TP,
827
+ 'Nucleotide_False_Positive': NT_FP, 'Nucleotide_True_Negative': NT_TN,
828
+ 'Nucleotide_False_Negative': NT_FN, 'Nucleotide_Precision': NT_Precision, 'Nucleotide_Recall': NT_Recall,
829
+ 'Nucleotide_False_Discovery_Rate': NT_False_Discovery_Rate,
830
+ 'ORF_Nucleotide_Coverage_of_Genome': orf_Coverage_Genome,
831
+ 'Matched_ORF_Nucleotide_Coverage_of_Genome': matched_ORF_Coverage_Genome})
832
+ result = collections.OrderedDict()
833
+ result.update({
834
+ 'rep_metrics': rep_Metrics,
835
+ 'pred_metrics': pred_metrics,
836
+ })
837
+
838
+ # To account for unbalanced data
839
+ for m_key, m_value in result.items():
840
+ if m_value == 'nan':
841
+ result[m_key] = 'N/A'
842
+
843
+ results[dna_region] = result
844
+
845
+
846
+ print("Finished calculating metrics for: ", dna_region)
847
+
848
+
849
+ return results # Return the results dictionary containing all metrics and details