ORForise 1.6.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73) hide show
  1. ORForise/Aggregate_Compare.py +378 -0
  2. ORForise/Annotation_Compare.py +317 -0
  3. ORForise/Annotation_Intersector.py +726 -0
  4. ORForise/Aux/StORF_Undetected/Completely_Undetected/Completey_Undetected.py +53 -0
  5. ORForise/Aux/StORF_Undetected/Completely_Undetected/__init__.py +0 -0
  6. ORForise/Aux/StORF_Undetected/StORF_Undetected.py +35 -0
  7. ORForise/Aux/StORF_Undetected/__init__.py +0 -0
  8. ORForise/Aux/StORF_Undetected/unvitiated_Genes/__init__.py +0 -0
  9. ORForise/Aux/StORF_Undetected/unvitiated_Genes/unvitiated_Missed_Genes.py +46 -0
  10. ORForise/Aux/TabToGFF/TabToGFF.py +140 -0
  11. ORForise/Aux/TabToGFF/__init__.py +0 -0
  12. ORForise/Aux/__init__.py +0 -0
  13. ORForise/Comparator.py +882 -0
  14. ORForise/Convert_To_GFF.py +141 -0
  15. ORForise/GFF_Adder.py +543 -0
  16. ORForise/List_Tools.py +56 -0
  17. ORForise/ORForise_Analysis/__init__.py +0 -0
  18. ORForise/ORForise_Analysis/cds_checker.py +77 -0
  19. ORForise/ORForise_Analysis/gene_Lenghts.py +28 -0
  20. ORForise/ORForise_Analysis/genome_Metrics.py +258 -0
  21. ORForise/ORForise_Analysis/hypothetical_gene_predictions.py +88 -0
  22. ORForise/ORForise_Analysis/missed_Gene_Metrics.py +277 -0
  23. ORForise/ORForise_Analysis/parital_Match_Analysis.py +230 -0
  24. ORForise/ORForise_Analysis/result_File_Analysis.py +286 -0
  25. ORForise/ORForise_Analysis/start_Codon_Substitution.py +161 -0
  26. ORForise/StORForise.py +115 -0
  27. ORForise/Tools/Augustus/Augustus.py +54 -0
  28. ORForise/Tools/Augustus/__init__.py +0 -0
  29. ORForise/Tools/Balrog/Balrog.py +56 -0
  30. ORForise/Tools/Balrog/__init__.py +0 -0
  31. ORForise/Tools/EasyGene/EasyGene.py +55 -0
  32. ORForise/Tools/EasyGene/__init__.py +0 -0
  33. ORForise/Tools/FGENESB/FGENESB.py +57 -0
  34. ORForise/Tools/FGENESB/__init__.py +0 -0
  35. ORForise/Tools/FragGeneScan/FragGeneScan.py +54 -0
  36. ORForise/Tools/FragGeneScan/__init__.py +0 -0
  37. ORForise/Tools/GFF/GFF.py +77 -0
  38. ORForise/Tools/GFF/__init__.py +0 -0
  39. ORForise/Tools/GLIMMER3/GLIMMER3.py +59 -0
  40. ORForise/Tools/GLIMMER3/__init__.py +0 -0
  41. ORForise/Tools/GeneMark/GeneMark.py +135 -0
  42. ORForise/Tools/GeneMark/__init__.py +0 -0
  43. ORForise/Tools/GeneMarkHA/GeneMarkHA.py +54 -0
  44. ORForise/Tools/GeneMarkHA/__init__.py +0 -0
  45. ORForise/Tools/GeneMarkHMM/GeneMarkHMM.py +55 -0
  46. ORForise/Tools/GeneMarkHMM/__init__.py +0 -0
  47. ORForise/Tools/GeneMarkS/GeneMarkS.py +54 -0
  48. ORForise/Tools/GeneMarkS/__init__.py +0 -0
  49. ORForise/Tools/GeneMarkS2/GeneMarkS2.py +55 -0
  50. ORForise/Tools/GeneMarkS2/__init__.py +0 -0
  51. ORForise/Tools/MetaGene/MetaGene.py +54 -0
  52. ORForise/Tools/MetaGene/__init__.py +0 -0
  53. ORForise/Tools/MetaGeneAnnotator/MetaGeneAnnotator.py +55 -0
  54. ORForise/Tools/MetaGeneAnnotator/__init__.py +0 -0
  55. ORForise/Tools/MetaGeneMark/MetaGeneMark.py +55 -0
  56. ORForise/Tools/MetaGeneMark/__init__.py +0 -0
  57. ORForise/Tools/Prodigal/Prodigal.py +55 -0
  58. ORForise/Tools/Prodigal/__init__.py +0 -0
  59. ORForise/Tools/Prokka/Prokka.py +57 -0
  60. ORForise/Tools/Prokka/__init__.py +0 -0
  61. ORForise/Tools/StORF-Reporter/StORF-Reporter.py +56 -0
  62. ORForise/Tools/StORF-Reporter/__init__.py +0 -0
  63. ORForise/Tools/TransDecoder/TransDecoder.py +54 -0
  64. ORForise/Tools/TransDecoder/__init__.py +0 -0
  65. ORForise/Tools/__init__.py +0 -0
  66. ORForise/__init__.py +0 -0
  67. ORForise/utils.py +236 -0
  68. orforise-1.6.2.dist-info/METADATA +1038 -0
  69. orforise-1.6.2.dist-info/RECORD +73 -0
  70. orforise-1.6.2.dist-info/WHEEL +5 -0
  71. orforise-1.6.2.dist-info/entry_points.txt +15 -0
  72. orforise-1.6.2.dist-info/licenses/LICENSE +624 -0
  73. orforise-1.6.2.dist-info/top_level.txt +1 -0
ORForise/Comparator.py ADDED
@@ -0,0 +1,882 @@
1
+ import numpy as np
2
+
3
+ try:
4
+ from .utils import *
5
+ except (ImportError, ModuleNotFoundError):
6
+ from utils import *
7
+
8
+
9
class comparator:
    """Class to hold the global-type variables shared by the comparison functions.

    Every attribute is created (or re-created) by :meth:`reset`, which the
    constructor calls, so a fresh instance and a reset instance look the same.
    """

    # (attribute name, zero-argument factory) pairs, listed in assignment order.
    # int() -> 0, str() -> '', list() -> [] ; each call yields a new object, so
    # no two attributes ever share a container.
    _STATE_FIELDS = (
        # Scalar counters and the sequence currently being analysed.
        ('perfect_Starts', int),
        ('perfect_Stops', int),
        ('genome_Seq', str),
        ('genome_Seq_Rev', str),
        ('genome_Size', int),
        ('correct_Frame_Number', int),
        ('extended_Start', int),
        ('extended_Stop', int),
        ('extended_CDS', int),
        # Keyed records filled in while genes and ORFs are compared.
        ('perfect_Matches', collections.OrderedDict),
        ('matched_ORFs', collections.OrderedDict),
        ('multi_Matched_ORFs', lambda: collections.defaultdict(list)),
        ('unmatched_ORFs', collections.OrderedDict),
        ('genes_Detected', collections.OrderedDict),
        ('genes_Undetected', collections.OrderedDict),
        ('out_Of_Frame_ORFs', collections.OrderedDict),
        ('partial_Hits', collections.OrderedDict),
        # Coordinate differences and lengths.
        ('start_Difference', list),
        ('stop_Difference', list),
        ('orf_Lengths', list),
        ('gene_Lengths', list),
        # Overlap sizes, split by strand, for genes, ORFs and matched ORFs.
        ('gene_Pos_Olap', list),
        ('gene_Neg_Olap', list),
        ('orf_Pos_Olap', list),
        ('orf_Neg_Olap', list),
        ('m_ORF_Pos_Olap', list),
        ('m_ORF_Neg_Olap', list),
        # GC percentages.
        ('gene_GC', list),
        ('orf_GC', list),
        ('m_ORF_GC', list),
        # Lengths of short genes / ORFs / matched ORFs.
        ('gene_Short', list),
        ('orf_Short', list),
        ('m_ORF_Short', list),
        # Per-strand ORF counters.
        ('pos_Strand', int),
        ('neg_Strand', int),
    )

    def __init__(self):
        self.reset()

    def reset(self):
        """Set every state attribute back to its empty / zero starting value."""
        for attribute, make_default in self._STATE_FIELDS:
            setattr(self, attribute, make_default())


# Single module-level instance that the functions in this module read and update.
comp = comparator()
60
+
61
+
62
def is_double_range(range1, range2):
    """Return True when ``range1`` has at least twice as many items as ``range2``."""
    return 2 * len(range2) <= len(range1)
64
+
65
+
66
def nuc_Count(verbose, start, stop, strand):
    """Return the GC percentage of the region ``start``..``stop``.

    The region is sliced with 1-based, inclusive coordinates from
    ``comp.genome_Seq`` or, for ``strand == '-'``, from the mirrored position
    in ``comp.genome_Seq_Rev``.  A ``stop`` beyond ``comp.genome_Size`` is
    treated as wrapping round to the beginning of the sequence.

    Args:
        verbose: When truthy, print a notice for wrap-around regions.
        start: First position of the region.
        stop: Last position of the region; may exceed ``comp.genome_Size``.
        strand: ``'-'`` selects ``comp.genome_Seq_Rev``.  Any other value is
            read from ``comp.genome_Seq``; base composition is the same for a
            sequence and its reverse complement, so an unrecognised strand
            still yields a usable figure instead of an unbound-variable error.

    Returns:
        ``100 * (G + C) / (A + C + G + T + N)`` as a float, counting only
        upper-case symbols, or ``0.0`` when the region contains none of them.
    """
    size = comp.genome_Size
    # Number of bases that spill past the end of the sequence.  A region that
    # ends exactly on the last base (stop == size) does not wrap.
    overhang = max(stop - size, 0)
    if overhang:
        if verbose:
            print("There is a wrap around gene and I am dealing with it the best I can - Start: " + str(start) + " Stop: " + str(stop))
        stop = size

    if strand == '-':
        # Position p on the forward sequence sits at index size - p here.
        seq = comp.genome_Seq_Rev[size - stop:size - start + 1]
        if overhang:
            # Forward positions 1..overhang are the last `overhang` symbols.
            seq = comp.genome_Seq_Rev[-overhang:] + seq
    else:
        seq = comp.genome_Seq[start - 1:stop]
        if overhang:
            seq = seq + comp.genome_Seq[:overhang]

    gc = seq.count("G") + seq.count("C")
    counted = gc + seq.count("A") + seq.count("T") + seq.count("N")
    if not counted:
        # Empty slice, or nothing but symbols outside A/C/G/T/N.
        return 0.0
    return gc * 100 / counted
108
+
109
+
110
def orf_Unmatched(o_Start, o_Stop, o_Strand, tools):
    """Store an ORF in ``comp.unmatched_ORFs``.

    The entry's key is ``start,stop,strand,first three bases,last three
    bases,tools`` and its value is the ORF's sequence.  '+' ORFs are sliced
    from ``comp.genome_Seq``; '-' ORFs from the mirrored coordinates of
    ``comp.genome_Seq_Rev``.  Any other strand value stores nothing.
    """
    if o_Strand == '-':
        template = comp.genome_Seq_Rev
        # Mirror the coordinates onto the reverse-complement sequence.
        lo = comp.genome_Size - o_Stop
        hi = comp.genome_Size - o_Start
        head = template[lo:lo + 3]
        tail = template[hi - 2:hi + 1]
        body = template[lo:hi + 1]
    elif o_Strand == '+':
        template = comp.genome_Seq
        head = template[o_Start - 1:o_Start + 2]
        tail = template[o_Stop - 3:o_Stop]
        body = template[o_Start - 1:o_Stop]
    else:
        return

    label = ','.join((str(o_Start), str(o_Stop), o_Strand, head, tail, tools))
    comp.unmatched_ORFs[label] = body
125
+
126
+
127
def genes_Unmatched(g_Start, g_Stop, g_Strand, tools):
    """Store a gene in ``comp.genes_Undetected``.

    Key layout: ``start,stop,strand,first three bases,last three bases,tools``;
    the value is the gene's sequence.  Only '+' and '-' strands are stored;
    '-' genes are read from ``comp.genome_Seq_Rev`` at mirrored coordinates.
    """
    if g_Strand not in ('-', '+'):
        return

    if g_Strand == '-':
        source = comp.genome_Seq_Rev
        # Half-open slice [first, last_excl) on the reverse-complement sequence.
        first = comp.genome_Size - g_Stop
        last_excl = comp.genome_Size - g_Start + 1
    else:
        source = comp.genome_Seq
        first = g_Start - 1
        last_excl = g_Stop

    opening = source[first:first + 3]
    closing = source[last_excl - 3:last_excl]
    fields = [str(g_Start), str(g_Stop), g_Strand, opening, closing, tools]
    comp.genes_Undetected[','.join(fields)] = source[first:last_excl]
142
+
143
+
144
def perfect_Matched_Genes(g_Start, g_Stop, g_Strand,tools):
    """Store a gene in ``comp.perfect_Matches``.

    The key is ``start,stop,strand,first three bases,last three bases,tools``
    and the value is the gene's sequence, taken from ``comp.genome_Seq`` for
    '+' and from ``comp.genome_Seq_Rev`` (mirrored coordinates) for '-'.
    Other strand values are ignored.
    """
    prefix = str(g_Start) + ',' + str(g_Stop) + ',' + g_Strand + ','
    if g_Strand == '+':
        fwd = comp.genome_Seq
        record = prefix + fwd[g_Start - 1:g_Start + 2] + ',' + fwd[g_Stop - 3:g_Stop] + ',' + tools
        comp.perfect_Matches[record] = fwd[g_Start - 1:g_Stop]
    elif g_Strand == '-':
        rev = comp.genome_Seq_Rev
        # Coordinates counted from the other end of the sequence.
        rev_from = comp.genome_Size - g_Stop
        rev_to = comp.genome_Size - g_Start
        record = prefix + rev[rev_from:rev_from + 3] + ',' + rev[rev_to - 2:rev_to + 1] + ',' + tools
        comp.perfect_Matches[record] = rev[rev_from:rev_to + 1]
159
+
160
+
161
def match_Statistics(o_Start, o_Stop, g_Start, g_Stop, g_Strand):
    """Update the start/stop precision counters on ``comp`` for one gene/ORF pair.

    ``comp.correct_Frame_Number`` is always incremented.  The remaining
    counters and the two difference lists are only touched when ``g_Strand``
    contains '+' or '-'.  For '-' the roles of the two coordinates are
    swapped: the right-hand coordinate feeds the start statistics and the
    left-hand coordinate feeds the stop statistics.
    """
    comp.correct_Frame_Number += 1

    forward = '+' in g_Strand
    if not forward and '-' not in g_Strand:
        return

    left_same = g_Start == o_Start
    right_same = g_Stop == o_Stop
    past_left = o_Start < g_Start    # ORF reaches beyond the gene's left edge
    past_right = o_Stop > g_Stop     # ORF reaches beyond the gene's right edge

    if forward:
        start_delta, stop_delta = o_Start - g_Start, o_Stop - g_Stop
        start_exact, stop_exact = left_same, right_same
        start_longer, stop_longer = past_left, past_right
    else:  # Negative strand genes are reversed
        start_delta, stop_delta = o_Stop - g_Stop, o_Start - g_Start
        start_exact, stop_exact = right_same, left_same
        start_longer, stop_longer = past_right, past_left

    comp.start_Difference.append(start_delta)
    comp.stop_Difference.append(stop_delta)
    comp.perfect_Starts += int(start_exact)
    comp.perfect_Stops += int(stop_exact)
    comp.extended_CDS += int(past_left and past_right)
    comp.extended_Start += int(start_longer)
    comp.extended_Stop += int(stop_longer)
190
+
191
+
192
def start_Codon_Count(orfs):
    """Tally the codon stored at index 1 of every entry in ``orfs``.

    Args:
        orfs: Mapping whose values are sequences with the codon of interest at
            index 1.  ``None`` or an empty mapping is accepted and produces
            all-zero results.

    Returns:
        A dict with the keys ``'ATG'``, ``'TTG'``, ``'GTG'``, ``'CTG'``,
        ``'ATT'`` and ``'Other'`` (in that order), each mapping to a
        ``(count, percentage)`` pair where the percentage is a string with two
        decimals, followed by ``'total'`` mapping to the number of entries.

    NOTE(review): ``tool_comparison`` unpacks this return value into seven
    names.  Iterating a dict yields its keys, so that call site receives the
    labels rather than the percentages.  The number and order of keys is left
    untouched here; confirm the intended contract before changing either side.
    """
    report_order = ('ATG', 'TTG', 'GTG', 'CTG', 'ATT')
    counts = dict.fromkeys(report_order, 0)
    other = 0

    # Size the input before touching .values() so that None is handled the same
    # way stop_Codon_Count handles it.
    total = len(orfs) if orfs else 0
    if total:
        for orf in orfs.values():
            codon = orf[1]
            if codon in counts:
                counts[codon] += 1
            else:
                other += 1

    def percentage(count):
        return format(100 * count / total, '.2f') if total else format(0, '.2f')

    summary = {codon: (counts[codon], percentage(counts[codon])) for codon in report_order}
    summary['Other'] = (other, percentage(other))
    summary['total'] = total
    return summary
232
+
233
def stop_Codon_Count(orfs):
    """Report how often 'TAG', 'TAA' and 'TGA' occur at index 2 of each entry.

    Returns a 5-tuple: the percentage of entries with 'TAG', with 'TAA', with
    'TGA' and with anything else (each a string with two decimals), followed
    by the list of those other codons in encounter order.  A falsy ``orfs``
    gives four ``'0.00'`` strings and an empty list.
    """
    unexpected = []
    if not orfs:
        nothing = format(0, '.2f')
        return nothing, nothing, nothing, nothing, unexpected

    labels = ('TAG', 'TAA', 'TGA')
    hits = [0, 0, 0]
    for details in orfs.values():
        ending = details[2]
        for slot, label in enumerate(labels):
            if ending == label:
                hits[slot] += 1
                break
        else:
            unexpected.append(ending)

    entries = len(orfs)
    shares = [format(100 * n / entries, '.2f') for n in hits + [len(unexpected)]]
    return shares[0], shares[1], shares[2], shares[3], unexpected
258
+
259
+
260
def candidate_ORF_Selection(gene_Set, candidate_ORFs):
    """Select the ORF from the candidates which is most similar to the gene.

    The candidate with the highest coverage wins.  Candidates tied on coverage
    are separated by the size of the symmetric difference between their
    position set and ``gene_Set``: the smaller difference (the least different
    ORF) is kept, and on a complete tie the earlier candidate stays selected.

    Args:
        gene_Set: Set of positions covered by the gene.
        candidate_ORFs: Mapping of ``'start,stop'`` strings to ORF detail
            lists whose last element is the coverage value.

    Returns:
        ``(pos, orf_Details)`` for the chosen candidate, or ``('', [])`` when
        ``candidate_ORFs`` is empty.
    """
    pos = ''
    orf_Details = []
    current_Coverage = 0
    # Symmetric-difference size of the current choice; None until one is made.
    best_Difference = None

    for c_Pos, c_ORF_Details in candidate_ORFs.items():
        # Below is not a long term fix
        coverage = c_ORF_Details[-1]
        if coverage < current_Coverage:
            continue  # Match filtered out

        coordinates = c_Pos.split(',')
        o_Start, o_Stop = int(coordinates[0]), int(coordinates[1])
        orf_Set = set(range(o_Start, o_Stop + 1))
        # Number of positions that belong to only one of the two ranges.
        difference = len(orf_Set.symmetric_difference(gene_Set))

        if coverage > current_Coverage or best_Difference is None or difference < best_Difference:
            current_Coverage = coverage
            best_Difference = difference
            pos = c_Pos
            orf_Details = c_ORF_Details

    return pos, orf_Details
287
+
288
+
289
def partial_Hit_Calc(g_Start, g_Stop, g_Strand, o_Start, o_Stop, tools):
    """Store a gene and the ORF that partially matched it in ``comp.partial_Hits``.

    Key layout::

        Gene:<start>_<stop>_<strand>_<first 3 bases>_<last 3 bases>;Predicted_CDS:<start>_<stop>_<strand>_<first 3 bases>_<last 3 bases>;<tools>

    The value is ``[gene sequence, ORF sequence]``.  Both features are
    described with the gene's strand.  '+' reads from ``comp.genome_Seq``,
    '-' from the mirrored coordinates of ``comp.genome_Seq_Rev``; any other
    strand value stores nothing.
    """
    if g_Strand == '-':
        template = comp.genome_Seq_Rev
        size = comp.genome_Size
        # Inclusive (low, high) indices on the reverse-complement sequence.
        spans = [(size - g_Stop, size - g_Start), (size - o_Stop, size - o_Start)]
    elif g_Strand == '+':
        template = comp.genome_Seq
        spans = [(g_Start - 1, g_Stop - 1), (o_Start - 1, o_Stop - 1)]
    else:
        return

    labels = []
    sequences = []
    for (low, high), (left, right) in zip(spans, [(g_Start, g_Stop), (o_Start, o_Stop)]):
        opening = template[low:low + 3]
        closing = template[high - 2:high + 1]
        labels.append(str(left) + '_' + str(right) + '_' + g_Strand + '_' + opening + '_' + closing)
        sequences.append(template[low:high + 1])

    partial = "Gene:" + labels[0] + ';Predicted_CDS:' + labels[1] + ';' + tools
    comp.partial_Hits[partial] = sequences
314
+
315
+
316
+ def tool_comparison(all_orfs, dna_regions, verbose):
317
+ results = collections.OrderedDict() # Store results for each DNA region
318
+ for dna_region in dna_regions: # Loop through each DNA region
319
+ # reset comparator class variables
320
+ comp.reset()
321
+
322
+ ref_genes_list = dna_regions[dna_region][2]
323
+ ref_genes = collections.OrderedDict()
324
+
325
+ if not ref_genes_list:
326
+ results[dna_region] = {}
327
+ continue
328
+
329
+ for d in ref_genes_list:
330
+ ref_genes.update(d)
331
+ comp.genome_Seq = dna_regions[dna_region][0]
332
+ comp.genome_Seq_Rev = revCompIterative(dna_regions[dna_region][0])
333
+ comp.genome_Size = len(dna_regions[dna_region][0])
334
+
335
+ current_orfs = all_orfs[dna_region]
336
+ # sort the ORFs by start position
337
+
338
+ better_pos_orfs_items = [[(int(pos.split(',')[0]), int(pos.split(',')[1])), orf_Details] for pos, orf_Details in current_orfs.items()] #TODO: turn pos into tuple instead of string everywhere
339
+
340
+ if not current_orfs or not better_pos_orfs_items:
341
+ results[dna_region] = {}
342
+ continue
343
+
344
+ for gene_num, gene_details in ref_genes.items(): # Loop through each gene to compare against predicted ORFs
345
+ g_Start = int(gene_details[0])
346
+ g_Stop = int(gene_details[1])
347
+ g_Strand = gene_details[2]
348
+ g_pos = str(g_Start) + ',' + str(g_Stop)
349
+ gene_Set = set(range(g_Start,
350
+ g_Stop + 1)) # Used to check Overlap of ORFs and pick best match - slow but confirms best match
351
+ overlapping_ORFs = collections.OrderedDict()
352
+ perfect_Match = False
353
+ out_Frame = False
354
+ for pos, orf_Details in better_pos_orfs_items: # Check if perfect match, if not check if match covers at least 75% of gene - Loop through ALL ORFs - SLOW
355
+ o_Start,o_Stop = pos
356
+ o_Strand = orf_Details[0]
357
+ #orf_Set = set(range(o_Start, o_Stop + 1)) Removed for optimisation
358
+ if o_Stop <= g_Start or o_Start >= g_Stop: # Not caught up yet
359
+ continue
360
+ elif o_Start == g_Start and o_Stop == g_Stop: # If perfect match, break and skip the rest of the ORFs
361
+ perfect_Match = True
362
+ break
363
+ elif is_double_range(range(o_Start, o_Stop), range(g_Start,g_Stop)): # If ORF is double or more than the length of the gene, we do not count as found.
364
+ continue
365
+ elif g_Start <= o_Start < g_Stop or g_Start < o_Stop < g_Stop: # If ORF Start or Stop is between gene Start or Stop
366
+ #overlap = len(gene_Set.intersection(orf_Set)) # Replaced for optimisation
367
+ overlap = max(min(o_Stop, g_Stop) - max(o_Start, g_Start), -1) + 1
368
+ coverage = 100 * float(overlap) / float(len(gene_Set))
369
+ orf_Details.append(coverage)
370
+ if abs(o_Stop - g_Stop) % 3 == 0 and o_Strand == g_Strand and coverage >= MIN_COVERAGE: # Only continue if ORF covers at least 75% of the gene and is in frame
371
+ overlapping_ORFs.update({f'{o_Start},{o_Stop}': orf_Details})
372
+ elif coverage >= MIN_COVERAGE: # Not in frame / on same strand
373
+ comp.out_Of_Frame_ORFs.update({f'{o_Start},{o_Stop}': orf_Details})
374
+ out_Frame = True
375
+ elif o_Start <= g_Start and o_Stop >= g_Stop: # If ORF extends one or both ends of the gene
376
+ #overlap = len(gene_Set.intersection(orf_Set)) # Replaced for optimisation
377
+ overlap = max(min(o_Stop, g_Stop) - max(o_Start, g_Start), -1) + 1
378
+ coverage = 100 * float(overlap) / float(len(gene_Set))
379
+ orf_Details.append(coverage)
380
+ if abs(o_Stop - g_Stop) % 3 == 0 and o_Strand == g_Strand and coverage >= MIN_COVERAGE: # Only continue if ORF covers at least 75% of the gene and is in frame
381
+ overlapping_ORFs.update({f'{o_Start},{o_Stop}': orf_Details})
382
+ elif coverage >= MIN_COVERAGE:
383
+ comp.out_Of_Frame_ORFs.update({f'{o_Start},{o_Stop}': orf_Details})
384
+ out_Frame = True
385
+ else:
386
+ if verbose == True:
387
+ print("Unexpected Error Finding Predicted CDSs") # Should not happen
388
+ # Now Check that we select the best ORF
389
+ ### Multi_Match_ORFs Should contain All genes found by a specific ORF
390
+ if perfect_Match == True: # Check if the ORF is a perfect match to the Gene
391
+ m_ORF_Details = orf_Details[:]
392
+ m_ORF_Details.append(g_pos)
393
+ if g_pos in comp.matched_ORFs.keys():
394
+ previously_Covered_Gene = comp.matched_ORFs[g_pos][-1]
395
+ comp.multi_Matched_ORFs[g_pos] += [g_pos.replace(',', '-'), previously_Covered_Gene.replace(',',
396
+ '-'), orf_Details[4]] # ORF is same as gene so can use g_pos
397
+ comp.matched_ORFs.update({g_pos: m_ORF_Details})
398
+ comp.genes_Detected.update({str(gene_details): g_pos})
399
+ match_Statistics(o_Start, o_Stop, g_Start, g_Stop, g_Strand)
400
+ perfect_Matched_Genes(g_Start, g_Stop, g_Strand,orf_Details[4])
401
+ #if verbose == True:
402
+ # print('Perfect Match')
403
+ elif perfect_Match == False and len(
404
+ overlapping_ORFs) == 1: # If we do not have a perfect match but 1 ORF which has passed the filtering
405
+ orf_Pos = list(overlapping_ORFs.keys())[0]
406
+ o_Start = int(orf_Pos.split(',')[0])
407
+ o_Stop = int(orf_Pos.split(',')[1])
408
+ orf_Details = overlapping_ORFs[orf_Pos]
409
+ m_ORF_Details = orf_Details[:]
410
+ m_ORF_Details.append(g_pos)
411
+ if orf_Pos in comp.matched_ORFs.keys():
412
+ try:
413
+ previously_Covered_Gene = comp.matched_ORFs[g_pos][-1]
414
+ except KeyError:
415
+ last_key = [*comp.matched_ORFs.keys()][-1]
416
+ previously_Covered_Gene = comp.matched_ORFs[last_key][-1]
417
+ comp.multi_Matched_ORFs[orf_Pos] += [g_pos.replace(',', '-'), previously_Covered_Gene.replace(',',
418
+ '-'), orf_Details[4]] # ORF collects multiple gene pos'
419
+ comp.matched_ORFs.update({orf_Pos: m_ORF_Details})
420
+ comp.genes_Detected.update({str(gene_details): orf_Pos})
421
+ match_Statistics(o_Start, o_Stop, g_Start, g_Stop, g_Strand)
422
+ #if verbose == True:
423
+ # print('Partial Match')
424
+ partial_Hit_Calc(g_Start, g_Stop, g_Strand, o_Start, o_Stop, orf_Details[4])
425
+ elif perfect_Match == False and len(
426
+ overlapping_ORFs) >= 1: # If we have more than 1 potential ORF match, we check to see which is the 'best' hit
427
+ orf_Pos, orf_Details = candidate_ORF_Selection(gene_Set, overlapping_ORFs) # Return best match
428
+ o_Start = int(orf_Pos.split(',')[0])
429
+ o_Stop = int(orf_Pos.split(',')[1])
430
+ m_ORF_Details = orf_Details[:]
431
+ m_ORF_Details.append(g_pos)
432
+ if orf_Pos in comp.matched_ORFs.keys():
433
+ try:
434
+ previously_Covered_Gene = comp.matched_ORFs[g_pos][-1]
435
+ except KeyError:
436
+ last_key = [*comp.matched_ORFs.keys()][-1]
437
+ previously_Covered_Gene = comp.matched_ORFs[last_key][-1]
438
+ comp.multi_Matched_ORFs[orf_Pos] += [g_pos.replace(',', '-'), previously_Covered_Gene.replace(',',
439
+ '-'), orf_Details[4]] # ORF collects multiple gene pos'
440
+ comp.matched_ORFs.update({orf_Pos: m_ORF_Details})
441
+ comp.genes_Detected.update({str(gene_details): orf_Pos})
442
+ match_Statistics(o_Start, o_Stop, g_Start, g_Stop, g_Strand)
443
+ if verbose == True:
444
+ print('There was more than 1 potential Match - Best Chosen')
445
+ partial_Hit_Calc(g_Start, g_Stop, g_Strand, o_Start, o_Stop, orf_Details[4])
446
+ elif out_Frame: # Keep record of ORFs which overlap a gene but in the wrong frame
447
+ if verbose == True:
448
+ print("Out of Frame Predicted CDS")
449
+ genes_Unmatched(g_Start, g_Stop, g_Strand, orf_Details[4]) #
450
+ else:
451
+ genes_Unmatched(g_Start, g_Stop, g_Strand, orf_Details[4]) # No hit
452
+ #if verbose == True:
453
+ # print("No Hit")
454
+ for orf_Key in comp.matched_ORFs: # Remove ORFs from out of frame if ORF was correctly matched to another Gene
455
+ if orf_Key in comp.out_Of_Frame_ORFs:
456
+ del comp.out_Of_Frame_ORFs[orf_Key]
457
+ ######################################## ORF Lengths and Precision
458
+ start_Difference = [x for x in comp.start_Difference if x != 0] # Remove 0s (Perfect hits)
459
+ stop_Difference = [x for x in comp.stop_Difference if x != 0]
460
+ if len(start_Difference) >= 1:
461
+ median_Start_Difference = np.median(start_Difference)
462
+ else:
463
+ median_Start_Difference = 'N/A'
464
+ if len(stop_Difference) >= 1:
465
+ median_Stop_Difference = np.median(stop_Difference)
466
+ else:
467
+ median_Stop_Difference = 'N/A'
468
+
469
+ # Get Start and Stop Codon Usage
470
+ atg_P, gtg_P, ttg_P, att_P, ctg_P, other_Start_P, other_Starts = start_Codon_Count(current_orfs)
471
+ tag_P, taa_P, tga_P, other_Stop_P, other_Stops = stop_Codon_Count(current_orfs)
472
+ # Count nucleotides found from ALL ORFs
473
+ gene_Nuc_Array = np.zeros((comp.genome_Size), dtype=bool)
474
+ orf_Nuc_Array = np.zeros((comp.genome_Size), dtype=bool)
475
+ matched_ORF_Nuc_Array = np.zeros((comp.genome_Size), dtype=bool)
476
+
477
+ prev_Gene_Stop = 0
478
+ prev_Gene_Overlapped = False
479
+ for gene_Num, gene_details in ref_genes.items(): # Loop through each gene to compare against predicted ORFs
480
+ g_Start = int(gene_details[0])
481
+ g_Stop = int(gene_details[1])
482
+ g_Strand = gene_details[2]
483
+ gene_Length = (g_Stop - g_Start) +1
484
+ if gene_Length == 0: print(g_Start, g_Stop, "!!!!!!!!!!!!!!!!!!!!!!!!")
485
+ comp.gene_Lengths.append(gene_Length)
486
+ gene_Nuc_Array[g_Start - 1:g_Stop] = True # Changing all between the two positions to 1's
487
+ comp.gene_GC.append(nuc_Count(verbose, g_Start, g_Stop, g_Strand))
488
+ if gene_Length <= SHORT_ORF_LENGTH: # .utils
489
+ comp.gene_Short.append(gene_Length)
490
+ ### Calculate overlapping Genes -
491
+ if prev_Gene_Stop > g_Start:
492
+ if '+' in g_Strand:
493
+ comp.gene_Pos_Olap.append(prev_Gene_Stop - g_Start)
494
+ elif '-' in g_Strand:
495
+ comp.gene_Neg_Olap.append(prev_Gene_Stop - g_Start)
496
+ prev_Gene_Overlapped = True
497
+ elif prev_Gene_Stop < g_Start:
498
+ if prev_Gene_Overlapped == True:
499
+ if '+' in g_Strand:
500
+ comp.gene_Pos_Olap.append(0)
501
+ elif '-' in g_Strand:
502
+ comp.gene_Neg_Olap.append(0)
503
+ prev_Gene_Overlapped = False
504
+ prev_Gene_Stop = g_Stop
505
+ if prev_Gene_Overlapped == True: # If last has a prev overlap, count it
506
+ if '+' in g_Strand:
507
+ comp.gene_Pos_Olap.append(0)
508
+ elif '-' in g_Strand:
509
+ comp.gene_Neg_Olap.append(0)
510
+ #### avoid ValueError
511
+ if comp.gene_Lengths:
512
+ min_Gene_Length = min(comp.gene_Lengths)
513
+ max_Gene_Length = max(comp.gene_Lengths)
514
+ median_Gene_Length = np.median(comp.gene_Lengths)
515
+ else:
516
+ min_Gene_Length = max_Gene_Length = min_Length_Difference = 0
517
+ prev_ORF_Stop = 0
518
+ prev_ORF_Overlapped = False
519
+ for o_Positions, orf_Details in current_orfs.items():
520
+ o_Start = int(o_Positions.split(',')[0])
521
+ o_Stop = int(o_Positions.split(',')[1])
522
+ o_Strand = orf_Details[0]
523
+ # Stats just for Unmatched ORFs
524
+ if o_Positions not in list(comp.matched_ORFs.keys()):
525
+ orf_Unmatched(o_Start, o_Stop, o_Strand, orf_Details[4])
526
+ # Get ORF Strand metrics:
527
+ if o_Strand == "+": # Get number of Positive and Negative strand ORFs
528
+ comp.pos_Strand += 1
529
+ elif o_Strand == "-":
530
+ comp.neg_Strand += 1
531
+ orf_Length = (o_Stop - o_Start) +1
532
+ comp.orf_Lengths.append(orf_Length)
533
+ orf_Nuc_Array[o_Start - 1:o_Stop] = True # Changing all between the two positions to 1's
534
+ comp.orf_GC.append(nuc_Count(verbose, o_Start, o_Stop, o_Strand))
535
+ if orf_Length <= SHORT_ORF_LENGTH: # .utils
536
+ comp.orf_Short.append(orf_Length)
537
+ ### Calculate overlapping ORFs -
538
+ if prev_ORF_Stop > o_Start:
539
+ if '+' in o_Strand:
540
+ comp.orf_Pos_Olap.append(prev_ORF_Stop - o_Start)
541
+ elif '-' in o_Strand:
542
+ comp.orf_Neg_Olap.append(prev_ORF_Stop - o_Start)
543
+ prev_ORF_Overlapped = True
544
+ elif prev_ORF_Stop < o_Start:
545
+ if prev_ORF_Overlapped == True:
546
+ if '+' in o_Strand:
547
+ comp.orf_Pos_Olap.append(0)
548
+ elif '-' in o_Strand:
549
+ comp.orf_Neg_Olap.append(0)
550
+ prev_ORF_Overlapped = False
551
+ prev_ORF_Stop = o_Stop
552
+ if prev_ORF_Overlapped == True: # If last has a prev overlap, count it
553
+ if '+' in o_Strand:
554
+ comp.orf_Pos_Olap.append(0)
555
+ elif '-' in o_Strand:
556
+ comp.orf_Neg_Olap.append(0)
557
+
558
+ # Nucleotide Coverage calculated from ORFs matching a gene only
559
+ matched_Prev_ORF_Stop = 0
560
+ matched_Prev_ORF_Overlapped = False
561
+ for mo_Positions, m_ORF_Details in comp.matched_ORFs.items():
562
+ mo_Start = int(mo_Positions.split(',')[0])
563
+ mo_Stop = int(mo_Positions.split(',')[1])
564
+ mo_Strand = m_ORF_Details[0]
565
+ mo_Length = (mo_Stop - mo_Start)
566
+ matched_ORF_Nuc_Array[mo_Start - 1:mo_Stop] = True # This is the complete matched orf not the matched orf bits
567
+
568
+ comp.m_ORF_GC.append(nuc_Count(verbose, mo_Start, mo_Stop, mo_Strand))
569
+ if mo_Length <= SHORT_ORF_LENGTH: # .utils
570
+ comp.m_ORF_Short.append(mo_Length)
571
+ ### Calculate overlapping Matched ORFs -
572
+ if matched_Prev_ORF_Stop > mo_Start:
573
+ if '+' in mo_Strand:
574
+ comp.m_ORF_Pos_Olap.append(matched_Prev_ORF_Stop - mo_Start)
575
+ elif '-' in mo_Strand:
576
+ comp.m_ORF_Neg_Olap.append(matched_Prev_ORF_Stop - mo_Start)
577
+ matched_Prev_ORF_Overlapped = True
578
+ elif matched_Prev_ORF_Stop < mo_Start:
579
+ if matched_Prev_ORF_Overlapped == True:
580
+ if '+' in mo_Strand:
581
+ comp.m_ORF_Pos_Olap.append(0)
582
+ elif '-' in mo_Strand:
583
+ comp.m_ORF_Neg_Olap.append(0)
584
+ matched_Prev_ORF_Overlapped = False
585
+ matched_Prev_ORF_Stop = mo_Stop
586
+ if matched_Prev_ORF_Overlapped == True: # If last has a prev overlap, count it
587
+ if '+' in mo_Strand:
588
+ comp.m_ORF_Pos_Olap.append(0)
589
+ elif '-' in mo_Strand:
590
+ comp.m_ORF_Neg_Olap.append(0)
591
+ ####
592
+ gene_Coverage_Genome = format(100 * np.sum(gene_Nuc_Array) / comp.genome_Size, '.2f')
593
+ orf_Coverage_Genome = format(100 * np.sum(orf_Nuc_Array) / comp.genome_Size, '.2f')
594
+ matched_ORF_Coverage_Genome = format(100 * np.sum(matched_ORF_Nuc_Array) / comp.genome_Size,
595
+ '.2f') # This gets the nts which are in matched ORFs - Check below
596
+ # matched_ORF_Nuc_AND_Gene = np.logical_and(matched_ORF_Nuc_Array,gene_Nuc_Array) + [0 for i in range(len(gene_Nuc_Array))] # This gets the nts which are in both matched ORFs and detected genes
597
+ # matched_ORF_Coverage_Genome = format(100 * np.count_nonzero(matched_ORF_Nuc_AND_Gene) / comp.genome_Size,'.2f')
598
+
599
+ # gene and orf nucleotide Intersection
600
+ gene_ORF_Nuc_Intersection = np.sum(gene_Nuc_Array & orf_Nuc_Array)
601
+ # not gene but orf nucleotides
602
+ not_Gene_Nuc_Array = np.logical_not(gene_Nuc_Array)
603
+ not_Gene_Nuc_And_ORF_Count = np.sum(not_Gene_Nuc_Array & orf_Nuc_Array)
604
+ # not orf nucleotides but gene
605
+ not_ORF_Nuc_Array = np.logical_not(orf_Nuc_Array)
606
+ not_ORF_Nuc_And_Gene_Count = np.sum(not_ORF_Nuc_Array & gene_Nuc_Array)
607
+ # not gene or orf nucleotides
608
+ not_Gene_Nuc_Not_ORF_Nuc_Count = np.sum(not_Gene_Nuc_Array & not_ORF_Nuc_Array)
609
+ # Nucleotide 'accuracy' - Normalised by number of nucelotides annotated by a gene
610
+ NT_TP = format(gene_ORF_Nuc_Intersection / np.sum(gene_Nuc_Array), '.2f')
611
+ NT_FP = format(not_Gene_Nuc_And_ORF_Count / np.sum(not_Gene_Nuc_Array), '.2f')
612
+ NT_FN = format(not_ORF_Nuc_And_Gene_Count / np.sum(gene_Nuc_Array), '.2f')
613
+ NT_TN = format(not_Gene_Nuc_Not_ORF_Nuc_Count / np.sum(not_Gene_Nuc_Array), '.2f')
614
+ NT_Precision = format(gene_ORF_Nuc_Intersection / (gene_ORF_Nuc_Intersection + not_Gene_Nuc_And_ORF_Count), '.2f')
615
+ NT_Recall = format(gene_ORF_Nuc_Intersection / (gene_ORF_Nuc_Intersection + not_ORF_Nuc_And_Gene_Count), '.2f')
616
+ NT_False_Discovery_Rate = format(
617
+ not_Gene_Nuc_And_ORF_Count / (not_Gene_Nuc_And_ORF_Count + gene_ORF_Nuc_Intersection), '.2f')
618
+ ################################# Precision and Recall of whole ORFs and Genes
619
+ TP = format(len(comp.genes_Detected) / len(ref_genes), '.2f')
620
+ FP = format(len(comp.unmatched_ORFs) / len(ref_genes), '.2f')
621
+ FN = format(len(comp.genes_Undetected) / len(ref_genes), '.2f')
622
+ #################################################### Need a better way to handle 'no hits/ORFs'
623
+ try:
624
+ precision = format(float(TP) / (float(TP) + float(FP)), '.2f')
625
+ except ZeroDivisionError:
626
+ precision = format(0.00, '.2f')
627
+ try:
628
+ recall = format(float(TP) / (float(TP) + float(FN)), '.2f')
629
+ except ZeroDivisionError:
630
+ recall = format(0.00, '.2f')
631
+ try:
632
+ false_Discovery_Rate = format(float(FP) / (float(FP) + float(TP)), '.2f')
633
+ except ZeroDivisionError:
634
+ false_Discovery_Rate = 'N/A'
635
+ min_ORF_Length = min(comp.orf_Lengths)
636
+ max_ORF_Length = max(comp.orf_Lengths)
637
+ median_ORF_Length = np.median(comp.orf_Lengths)
638
+
639
+ ##########################################################################
640
+ # Metrics - There are numerous cases where certain metric calculations may raise a ZeroDivisionError.
641
+ ORFs_Difference = format(100 * (len(current_orfs) - len(ref_genes)) / len(ref_genes), '.2f') # Difference off +/-
642
+ genes_Detected_Percentage = format(100 * (len(comp.genes_Detected) / len(ref_genes)), '.2f')
643
+ matched_ORF_Percentage = format(100 * (len(comp.matched_ORFs) / len(current_orfs)), '.2f')
644
+ all_ORF_Olap = (comp.orf_Pos_Olap + comp.orf_Neg_Olap) # Combine pos and neg strand overlaps
645
+ matched_ORF_Olap = (comp.m_ORF_Pos_Olap + comp.m_ORF_Neg_Olap)
646
+ all_Gene_Olap = (comp.gene_Pos_Olap + comp.gene_Neg_Olap)
647
+
648
+ if all_ORF_Olap: # Only when at least one overlapping ORF was recorded
649
+ try:
650
+ overlap_Difference = format(100 * (len(all_ORF_Olap) - len(all_Gene_Olap)) / len(all_Gene_Olap), '.2f')
651
+ matched_Overlap_Difference = format(100 * (len(matched_ORF_Olap) - len(all_Gene_Olap)) / len(all_Gene_Olap),
652
+ '.2f')
653
+ except ZeroDivisionError:
654
+ overlap_Difference = 'N/A'
655
+ matched_Overlap_Difference = 'N/A'
656
+ num_All_ORF_Olap = len(all_ORF_Olap)
657
+ if matched_ORF_Olap:
658
+ max_Matched_ORF_Olap = max(matched_ORF_Olap)
659
+ matched_Median_ORF_Overlap = format(np.median(matched_ORF_Olap), '.2f')
660
+ else:
661
+ max_Matched_ORF_Olap = 'N/A'
662
+ matched_Median_ORF_Overlap = 'N/A'
663
+ max_All_ORF_Olap = max(all_ORF_Olap)
664
+ median_ORF_Overlap = format(np.median(all_ORF_Olap), '.2f')
665
+ else:
666
+ overlap_Difference = 'N/A'
667
+ matched_Overlap_Difference = 'N/A'
668
+ num_All_ORF_Olap = 0
669
+ max_Matched_ORF_Olap = 'N/A'
670
+ max_All_ORF_Olap = 'N/A'
671
+ median_ORF_Overlap = 'N/A'
672
+ matched_Median_ORF_Overlap = 'N/A'
673
+ if len(matched_ORF_Olap) == 0: # -100.00 is not informative
674
+ matched_Overlap_Difference = 'N/A'
675
+
676
+ # Need to NA everything
677
+
678
+ if comp.orf_Short and comp.gene_Short: # IF Short-ORFs/Genes
679
+ short_ORF_Difference = format(100 * (len(comp.orf_Short) - len(comp.gene_Short)) / len(comp.gene_Short), '.2f')
680
+ matched_Short_ORF_Difference = format(
681
+ 100 * (len(comp.m_ORF_Short) - len(comp.gene_Short)) / len(comp.gene_Short), '.2f')
682
+ num_ORF_Short = len(comp.orf_Short)
683
+ num_Matched_ORF_Short = len(comp.m_ORF_Short)
684
+ elif comp.orf_Short: # If only Short-ORFs
685
+ num_ORF_Short = len(comp.orf_Short)
686
+ num_Matched_ORF_Short = 'N/A'
687
+ short_ORF_Difference = (num_ORF_Short * 100)
688
+ matched_Short_ORF_Difference = 'N/A'
689
+ else: # If only Short-Genes and Undetected StORFs
690
+ comp.gene_Short
691
+ short_ORF_Difference = 'N/A'
692
+ matched_Short_ORF_Difference = 'N/A'
693
+ num_ORF_Short = 0
694
+ num_Matched_ORF_Short = 'N/A'
695
+ if num_Matched_ORF_Short == 0: # -100.00 is not informative
696
+ matched_Short_ORF_Difference = 'N/A'
697
+
698
+ median_Length_Difference = format(100 * (median_ORF_Length - median_Gene_Length) / median_Gene_Length, '.2f')
699
+ min_Length_Difference = format(100 * (min_ORF_Length - min_Gene_Length) / min_Gene_Length, '.2f')
700
+ max_Length_Difference = format(100 * (max_ORF_Length - max_Gene_Length) / max_Gene_Length, '.2f')
701
+ pos_Strand_Percentage = format(comp.pos_Strand / len(current_orfs), '.2f')
702
+ neg_Strand_Percentage = format(comp.neg_Strand / len(current_orfs), '.2f')
703
+ median_ORF_GC = np.median(comp.orf_GC)
704
+ matched_Median_ORF_GC = np.median(comp.m_ORF_GC)
705
+ median_Gene_GC = np.median(comp.gene_GC)
706
+ median_GC_Difference = format(100 * (float(median_ORF_GC) - float(median_Gene_GC)) / float(median_Gene_GC), '.2f')
707
+ matched_Median_GC_Difference = format(
708
+ 100 * (float(matched_Median_ORF_GC) - float(median_Gene_GC)) / float(median_Gene_GC), '.2f')
709
+
710
+ if comp.matched_ORFs: # At least one ORF detected a gene
711
+ extended_CDS_Percentage = format(100 * comp.extended_CDS / len(comp.matched_ORFs), '.2f')
712
+ extended_Start_Percentage = format(100 * comp.extended_Start / len(comp.matched_ORFs), '.2f')
713
+ extended_Stop_Percentage = format(100 * comp.extended_Stop / len(comp.matched_ORFs), '.2f')
714
+ perfect_Matches_Percentage = format(100 * len(comp.perfect_Matches) / len(comp.matched_ORFs), '.2f')
715
+ perfect_Starts_Percentage = format(100 * comp.perfect_Starts / len(comp.matched_ORFs), '.2f')
716
+ perfect_Stops_Percentage = format(100 * comp.perfect_Stops / len(comp.matched_ORFs), '.2f')
717
+ else:
718
+ # correct_Frame_Percentage = 0
719
+ extended_CDS_Percentage = format(0.00, '.2f')
720
+ extended_Start_Percentage = format(0.00, '.2f')
721
+ extended_Stop_Percentage = format(0.00, '.2f')
722
+ perfect_Matches_Percentage = format(0.00, '.2f')
723
+ perfect_Starts_Percentage = format(0.00, '.2f')
724
+ perfect_Stops_Percentage = format(0.00, '.2f')
725
+ ################### Missed Genes Metrics:
726
+ if comp.genes_Undetected:
727
+ mg_Starts = []
728
+ mg_Stops = []
729
+ mg_Lengths = []
730
+ mg_Strands = []
731
+ for mg, seq in comp.genes_Undetected.items():
732
+ mg = mg.split(',')
733
+ mg_Starts.append(mg[3])
734
+ mg_Stops.append(mg[4])
735
+ mg_Strands.append(mg[2])
736
+ mg_Lengths.append(int(mg[1]) - int(mg[0]))
737
+
738
+ mg_ATG = 100 * mg_Starts.count('ATG') / len(comp.genes_Undetected)
739
+ mg_GTG = 100 * mg_Starts.count('GTG') / len(comp.genes_Undetected)
740
+ mg_TTG = 100 * mg_Starts.count('TTG') / len(comp.genes_Undetected)
741
+ mg_ATT = 100 * mg_Starts.count('ATT') / len(comp.genes_Undetected)
742
+ mg_CTG = 100 * mg_Starts.count('CTG') / len(comp.genes_Undetected)
743
+ mg_O_Start = 100 - (mg_ATG + mg_GTG + mg_TTG + mg_ATT + mg_CTG)
744
+ mg_TGA = 100 * mg_Stops.count('TGA') / len(comp.genes_Undetected)
745
+ mg_TAA = 100 * mg_Stops.count('TAA') / len(comp.genes_Undetected)
746
+ mg_TAG = 100 * mg_Stops.count('TAG') / len(comp.genes_Undetected)
747
+ mg_O_Stop = 100 - (mg_TGA + mg_TAA + mg_TAG)
748
+ median_mg_Len = np.median(mg_Lengths)
749
+ mg_Pos = mg_Strands.count('+')
750
+ mg_Neg = mg_Strands.count('-')
751
+ undetected_Gene_Metrics = (
752
+ format(mg_ATG, '.2f'), format(mg_GTG, '.2f'), format(mg_TTG, '.2f'), format(mg_ATT, '.2f'),
753
+ format(mg_CTG, '.2f'), format(mg_O_Start, '.2f'), format(mg_TGA, '.2f'), format(mg_TAA, '.2f'),
754
+ format(mg_TAG, '.2f'), format(mg_O_Stop, '.2f'), format(median_mg_Len, '.2f'), mg_Pos, mg_Neg)
755
+ else:
756
+ undetected_Gene_Metrics = ''
757
+ #################### Unmatched ORF Metrics:
758
+ if comp.unmatched_ORFs:
759
+ uo_Starts = []
760
+ uo_Stops = []
761
+ uo_Lengths = []
762
+ uo_Strands = []
763
+ for uo, seq in comp.unmatched_ORFs.items():
764
+ uo = uo.split(',')
765
+ uo_Starts.append(uo[3])
766
+ uo_Stops.append(uo[4])
767
+ uo_Strands.append(uo[2])
768
+ uo_Lengths.append(int(uo[1]) - int(uo[0]))
769
+ uo_ATG = 100 * uo_Starts.count('ATG') / len(comp.unmatched_ORFs)
770
+ uo_GTG = 100 * uo_Starts.count('GTG') / len(comp.unmatched_ORFs)
771
+ uo_TTG = 100 * uo_Starts.count('TTG') / len(comp.unmatched_ORFs)
772
+ uo_ATT = 100 * uo_Starts.count('ATT') / len(comp.unmatched_ORFs)
773
+ uo_CTG = 100 * uo_Starts.count('CTG') / len(comp.unmatched_ORFs)
774
+ uo_O_Start = 100 - (uo_ATG + uo_GTG + uo_TTG + uo_ATT + uo_CTG)
775
+ uo_TGA = 100 * uo_Stops.count('TGA') / len(comp.unmatched_ORFs)
776
+ uo_TAA = 100 * uo_Stops.count('TAA') / len(comp.unmatched_ORFs)
777
+ uo_TAG = 100 * uo_Stops.count('TAG') / len(comp.unmatched_ORFs)
778
+ uo_O_Stop = 100 - (uo_TGA + uo_TAA + uo_TAG)
779
+ # uo_O_Stop = 100 * uo_O_Stop / len(comp.unmatched_ORFs) ########WHY?
780
+ median_uo_Len = np.median(uo_Lengths)
781
+ uo_Pos = uo_Strands.count('+')
782
+ uo_Neg = uo_Strands.count('-')
783
+ unmatched_ORF_Metrics = (
784
+ format(uo_ATG, '.2f'), format(uo_GTG, '.2f'), format(uo_TTG, '.2f'), format(uo_ATT, '.2f'),
785
+ format(uo_CTG, '.2f'), format(uo_O_Start, '.2f'), format(uo_TGA, '.2f'), format(uo_TAA, '.2f'),
786
+ format(uo_TAG, '.2f'), format(uo_O_Stop, '.2f'), format(median_uo_Len, '.2f'), uo_Pos, uo_Neg)
787
+ else:
788
+ unmatched_ORF_Metrics = ''
789
+ #################################
790
+ # Rep_Metrics - This is the final report of metrics
791
+ rep_Metrics = collections.OrderedDict(
792
+ {'Percentage_of_Genes_Detected': genes_Detected_Percentage,
793
+ 'genes_Undetected': comp.genes_Undetected,
794
+ 'undetected_Gene_Metrics': undetected_Gene_Metrics,
795
+ 'gene_Coverage_Genome': gene_Coverage_Genome,
796
+ 'Percentage_of_ORFs_that_Detected_a_Gene': matched_ORF_Percentage,
797
+ 'Percent_Difference_of_All_ORFs': ORFs_Difference,
798
+ 'Median_Length_Difference': median_Length_Difference,
799
+ 'Percentage_of_Perfect_Matches': perfect_Matches_Percentage,
800
+ 'Median_Start_Difference_of_Matched_ORFs': median_Start_Difference,
801
+ 'Median_Stop_Difference_of_Matched_ORFs': median_Stop_Difference,
802
+ 'Percentage_Difference_of_Matched_Overlapping_CDSs': matched_Overlap_Difference,
803
+ 'Percent_Difference_of_Short-Matched-ORFs': matched_Short_ORF_Difference,
804
+ 'Precision': precision,
805
+ 'Recall': recall,
806
+ 'False_Discovery_Rate': false_Discovery_Rate})
807
+ # Pred Metrics - This is the final report of metrics
808
+ pred_metrics = collections.OrderedDict(
809
+ {'Number_of_ORFs': len(current_orfs), 'Percent_Difference_of_All_ORFs': ORFs_Difference,
810
+ 'perfect_Matches': comp.perfect_Matches,
811
+ 'unmatched_ORFs': comp.unmatched_ORFs,
812
+ 'unmatched_ORF_Metrics': unmatched_ORF_Metrics,
813
+ 'orf_Coverage_Genome': orf_Coverage_Genome,
814
+ 'matched_ORF_Coverage_Genome': matched_ORF_Coverage_Genome,
815
+ 'multi_Matched_ORFs': comp.multi_Matched_ORFs,
816
+ 'partial_Hits': comp.partial_Hits,
817
+ 'Number_of_ORFs_that_Detected_a_Gene': len(comp.matched_ORFs),
818
+ 'Percentage_of_ORFs_that_Detected_a_Gene': matched_ORF_Percentage,
819
+ 'Number_of_Genes_Detected': len(comp.genes_Detected),
820
+ 'Percentage_of_Genes_Detected': genes_Detected_Percentage, 'Median_Length_of_All_ORFs': median_ORF_Length,
821
+ 'Median_Length_Difference': median_Length_Difference,
822
+ 'Minimum_Length_of_All_ORFs': min_ORF_Length, 'Minimum_Length_Difference': min_Length_Difference,
823
+ 'Maximum_Length_of_All_ORFs': max_ORF_Length, 'Maximum_Length_Difference': max_Length_Difference,
824
+ 'Median_GC_content_of_All_ORFs': format(median_ORF_GC, '.2f'),
825
+ 'Percent_Difference_of_All_ORFs_Median_GC': median_GC_Difference,
826
+ 'Median_GC_content_of_Matched_ORFs': format(matched_Median_ORF_GC, '.2f'),
827
+ 'Percent_Difference_of_Matched_ORF_GC': matched_Median_GC_Difference,
828
+ 'Number_of_ORFs_which_Overlap_Another_ORF': num_All_ORF_Olap,
829
+ 'Percent_Difference_of_Overlapping_ORFs': overlap_Difference,
830
+ 'Maximum_ORF_Overlap': max_All_ORF_Olap, 'Median_ORF_Overlap': median_ORF_Overlap,
831
+ 'Number_of_Matched_ORFs_Overlapping_Another_ORF': len(matched_ORF_Olap),
832
+ 'Percentage_Difference_of_Matched_Overlapping_CDSs': matched_Overlap_Difference,
833
+ 'Maximum_Matched_ORF_Overlap': max_Matched_ORF_Olap, 'Median_Matched_ORF_Overlap': matched_Median_ORF_Overlap,
834
+ 'Number_of_Short-ORFs': num_ORF_Short, 'Percent_Difference_of_Short-ORFs': short_ORF_Difference,
835
+ 'Number_of_Short-Matched-ORFs': num_Matched_ORF_Short,
836
+ 'Percent_Difference_of_Short-Matched-ORFs': matched_Short_ORF_Difference,
837
+ 'Number_of_Perfect_Matches': len(comp.perfect_Matches),
838
+ 'Percentage_of_Perfect_Matches': perfect_Matches_Percentage,
839
+ 'Number_of_Perfect_Starts': comp.perfect_Starts, 'Percentage_of_Perfect_Starts': perfect_Starts_Percentage,
840
+ 'Number_of_Perfect_Stops': comp.perfect_Stops, 'Percentage_of_Perfect_Stops': perfect_Stops_Percentage,
841
+ 'Number_of_Out_of_Frame_ORFs': len(comp.out_Of_Frame_ORFs),
842
+ 'Number_of_Matched_ORFs_Extending_a_Coding_Region': comp.extended_CDS,
843
+ 'Percentage_of_Matched_ORFs_Extending_a_Coding_Region': extended_CDS_Percentage,
844
+ 'Number_of_Matched_ORFs_Extending_Start_Region': comp.extended_Start,
845
+ 'Percentage_of_Matched_ORFs_Extending_Start_Region': extended_Start_Percentage,
846
+ 'Number_of_Matched_ORFs_Extending_Stop_Region': comp.extended_Stop,
847
+ 'Percentage_of_Matched_ORFs_Extending_Stop_Region': extended_Stop_Percentage,
848
+ 'Number_of_All_ORFs_on_Positive_Strand': comp.pos_Strand,
849
+ 'Percentage_of_All_ORFs_on_Positive_Strand': pos_Strand_Percentage,
850
+ 'Number_of_All_ORFs_on_Negative_Strand': comp.neg_Strand,
851
+ 'Percentage_of_All_ORFs_on_Negative_Strand': neg_Strand_Percentage,
852
+ 'Median_Start_Difference_of_Matched_ORFs': median_Start_Difference,
853
+ 'Median_Stop_Difference_of_Matched_ORFs': median_Stop_Difference, 'ATG_Start_Percentage': atg_P,
854
+ 'GTG_Start_Percentage': gtg_P, 'TTG_Start_Percentage': ttg_P,
855
+ 'ATT_Start_Percentage': att_P, 'CTG_Start_Percentage': ctg_P, 'Other_Start_Codon_Percentage': other_Start_P,
856
+ 'TAG_Stop_Percentage': tag_P, 'TAA_Stop_Percentage': taa_P,
857
+ 'TGA_Stop_Percentage': tga_P, 'Other_Stop_Codon_Percentage': other_Stop_P, 'True_Positive': TP,
858
+ 'False_Positive': FP, 'False_Negative': FN, 'Precision': precision,
859
+ 'Recall': recall, 'False_Discovery_Rate': false_Discovery_Rate, 'Nucleotide_True_Positive': NT_TP,
860
+ 'Nucleotide_False_Positive': NT_FP, 'Nucleotide_True_Negative': NT_TN,
861
+ 'Nucleotide_False_Negative': NT_FN, 'Nucleotide_Precision': NT_Precision, 'Nucleotide_Recall': NT_Recall,
862
+ 'Nucleotide_False_Discovery_Rate': NT_False_Discovery_Rate,
863
+ 'ORF_Nucleotide_Coverage_of_Genome': orf_Coverage_Genome,
864
+ 'Matched_ORF_Nucleotide_Coverage_of_Genome': matched_ORF_Coverage_Genome})
865
+ result = collections.OrderedDict()
866
+ result.update({
867
+ 'rep_metrics': rep_Metrics,
868
+ 'pred_metrics': pred_metrics,
869
+ })
870
+
871
+ # To account for unbalanced data
872
+ for m_key, m_value in result.items():
873
+ if m_value == 'nan':
874
+ result[m_key] = 'N/A'
875
+
876
+ results[dna_region] = result
877
+
878
+
879
+ print("Finished calculating metrics for: ", dna_region)
880
+
881
+
882
+ return results # Return the results dictionary containing all metrics and details