ORForise 1.6.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73) hide show
  1. ORForise/Aggregate_Compare.py +378 -0
  2. ORForise/Annotation_Compare.py +317 -0
  3. ORForise/Annotation_Intersector.py +726 -0
  4. ORForise/Aux/StORF_Undetected/Completely_Undetected/Completey_Undetected.py +53 -0
  5. ORForise/Aux/StORF_Undetected/Completely_Undetected/__init__.py +0 -0
  6. ORForise/Aux/StORF_Undetected/StORF_Undetected.py +35 -0
  7. ORForise/Aux/StORF_Undetected/__init__.py +0 -0
  8. ORForise/Aux/StORF_Undetected/unvitiated_Genes/__init__.py +0 -0
  9. ORForise/Aux/StORF_Undetected/unvitiated_Genes/unvitiated_Missed_Genes.py +46 -0
  10. ORForise/Aux/TabToGFF/TabToGFF.py +140 -0
  11. ORForise/Aux/TabToGFF/__init__.py +0 -0
  12. ORForise/Aux/__init__.py +0 -0
  13. ORForise/Comparator.py +882 -0
  14. ORForise/Convert_To_GFF.py +141 -0
  15. ORForise/GFF_Adder.py +543 -0
  16. ORForise/List_Tools.py +56 -0
  17. ORForise/ORForise_Analysis/__init__.py +0 -0
  18. ORForise/ORForise_Analysis/cds_checker.py +77 -0
  19. ORForise/ORForise_Analysis/gene_Lenghts.py +28 -0
  20. ORForise/ORForise_Analysis/genome_Metrics.py +258 -0
  21. ORForise/ORForise_Analysis/hypothetical_gene_predictions.py +88 -0
  22. ORForise/ORForise_Analysis/missed_Gene_Metrics.py +277 -0
  23. ORForise/ORForise_Analysis/parital_Match_Analysis.py +230 -0
  24. ORForise/ORForise_Analysis/result_File_Analysis.py +286 -0
  25. ORForise/ORForise_Analysis/start_Codon_Substitution.py +161 -0
  26. ORForise/StORForise.py +115 -0
  27. ORForise/Tools/Augustus/Augustus.py +54 -0
  28. ORForise/Tools/Augustus/__init__.py +0 -0
  29. ORForise/Tools/Balrog/Balrog.py +56 -0
  30. ORForise/Tools/Balrog/__init__.py +0 -0
  31. ORForise/Tools/EasyGene/EasyGene.py +55 -0
  32. ORForise/Tools/EasyGene/__init__.py +0 -0
  33. ORForise/Tools/FGENESB/FGENESB.py +57 -0
  34. ORForise/Tools/FGENESB/__init__.py +0 -0
  35. ORForise/Tools/FragGeneScan/FragGeneScan.py +54 -0
  36. ORForise/Tools/FragGeneScan/__init__.py +0 -0
  37. ORForise/Tools/GFF/GFF.py +77 -0
  38. ORForise/Tools/GFF/__init__.py +0 -0
  39. ORForise/Tools/GLIMMER3/GLIMMER3.py +59 -0
  40. ORForise/Tools/GLIMMER3/__init__.py +0 -0
  41. ORForise/Tools/GeneMark/GeneMark.py +135 -0
  42. ORForise/Tools/GeneMark/__init__.py +0 -0
  43. ORForise/Tools/GeneMarkHA/GeneMarkHA.py +54 -0
  44. ORForise/Tools/GeneMarkHA/__init__.py +0 -0
  45. ORForise/Tools/GeneMarkHMM/GeneMarkHMM.py +55 -0
  46. ORForise/Tools/GeneMarkHMM/__init__.py +0 -0
  47. ORForise/Tools/GeneMarkS/GeneMarkS.py +54 -0
  48. ORForise/Tools/GeneMarkS/__init__.py +0 -0
  49. ORForise/Tools/GeneMarkS2/GeneMarkS2.py +55 -0
  50. ORForise/Tools/GeneMarkS2/__init__.py +0 -0
  51. ORForise/Tools/MetaGene/MetaGene.py +54 -0
  52. ORForise/Tools/MetaGene/__init__.py +0 -0
  53. ORForise/Tools/MetaGeneAnnotator/MetaGeneAnnotator.py +55 -0
  54. ORForise/Tools/MetaGeneAnnotator/__init__.py +0 -0
  55. ORForise/Tools/MetaGeneMark/MetaGeneMark.py +55 -0
  56. ORForise/Tools/MetaGeneMark/__init__.py +0 -0
  57. ORForise/Tools/Prodigal/Prodigal.py +55 -0
  58. ORForise/Tools/Prodigal/__init__.py +0 -0
  59. ORForise/Tools/Prokka/Prokka.py +57 -0
  60. ORForise/Tools/Prokka/__init__.py +0 -0
  61. ORForise/Tools/StORF-Reporter/StORF-Reporter.py +56 -0
  62. ORForise/Tools/StORF-Reporter/__init__.py +0 -0
  63. ORForise/Tools/TransDecoder/TransDecoder.py +54 -0
  64. ORForise/Tools/TransDecoder/__init__.py +0 -0
  65. ORForise/Tools/__init__.py +0 -0
  66. ORForise/__init__.py +0 -0
  67. ORForise/utils.py +236 -0
  68. orforise-1.6.2.dist-info/METADATA +1038 -0
  69. orforise-1.6.2.dist-info/RECORD +73 -0
  70. orforise-1.6.2.dist-info/WHEEL +5 -0
  71. orforise-1.6.2.dist-info/entry_points.txt +15 -0
  72. orforise-1.6.2.dist-info/licenses/LICENSE +624 -0
  73. orforise-1.6.2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,230 @@
1
+ import argparse
2
+ import collections
3
+ import numpy as np
4
+ import sys
5
+
6
# Command-line interface used by the script entry point at the bottom of this file.
parser = argparse.ArgumentParser()
parser.add_argument('-t', '--tool', required=True, help='Which tool to compare?')
parser.add_argument('-p', '--parameters', required=False, help='Optional parameters for prediction tool.')
parser.add_argument('-g', '--genome', required=True, help='Which genome to analyse?')

# Parse sys.argv only when executed as a script. Parsing unconditionally made a
# plain `import` of this module exit with a usage error, because -t and -g are
# required. The name `args` is kept for backward compatibility.
args = parser.parse_args() if __name__ == "__main__" else None
12
+
13
+
14
def gc_count(dna):
    """Return the GC percentage of *dna* as a string with two decimals.

    Each element of *dna* is classified as C, G, A, T or N (checked in that
    order, upper-case only); anything else is ignored. The percentage is taken
    over the classified elements only.

    Returns '0.00' when nothing could be classified (e.g. an empty sequence),
    where the previous implementation raised ZeroDivisionError.
    """
    # Insertion order mirrors the original if/elif priority: C, G, A, T, N.
    counts = {"C": 0, "G": 0, "A": 0, "T": 0, "N": 0}
    for symbol in dna:
        for base in counts:
            if base in symbol:
                counts[base] += 1
                break
    total = sum(counts.values())
    if total == 0:
        return format(0, '.2f')
    return format((counts["G"] + counts["C"]) * 100 / total, '.2f')
33
+
34
+
35
def revCompIterative(watson):
    """Return the reverse complement of *watson*, upper-cased.

    Raises:
        KeyError: if the upper-cased input contains a character other than
            A, T, C, G or N.
    """
    complements = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C', 'N': 'N'}
    # Single join instead of per-character `+=`, which does not scale reliably
    # to genome-sized strings.
    return "".join(complements[nt] for nt in reversed(watson.upper()))
43
+
44
+
45
def start_Codon_Count(starts, lengths):
    """Return the share of ATG, GTG, TTG, ATT and CTG start codons.

    Args:
        starts: mapping from codon to occurrence count; must contain the five
            codons above.
        lengths: sequence whose length is used as the denominator.

    Returns:
        A 5-tuple of percentage strings ('.2f'), in the order
        ATG, GTG, TTG, ATT, CTG. When *lengths* is empty every entry is
        '0.00'. The previous implementation returned the int 0 in that case,
        which broke callers that concatenate the values into a string.
    """
    total = len(lengths)
    if total == 0:
        return tuple(format(0, '.2f') for _ in range(5))
    return tuple(format(100 * starts[codon] / total, '.2f')
                 for codon in ('ATG', 'GTG', 'TTG', 'ATT', 'CTG'))
55
+
56
+
57
def stop_Codon_Count(stops, lengths):
    """Return the share of TAG, TAA and TGA stop codons.

    Args:
        stops: mapping from codon to occurrence count; must contain the three
            codons above.
        lengths: sequence whose length is used as the denominator.

    Returns:
        A 3-tuple of percentage strings ('.2f'), in the order TAG, TAA, TGA.
        When *lengths* is empty every entry is '0.00' (previously the int 0,
        which could not be concatenated into the report string).
    """
    total = len(lengths)
    if total == 0:
        return tuple(format(0, '.2f') for _ in range(3))
    return tuple(format(100 * stops[codon] / total, '.2f')
                 for codon in ('TAG', 'TAA', 'TGA'))
66
+
67
+
68
def partial_gene_read(results_file, partial_genes):
    """Collect the gene/ORF pairs listed after the 'Partial_Gene_Hits:' line.

    Lines are stripped and, once the section header has been seen, handled as:
      * 'Gene:<a>_<b>_<strand>...' -> gene position '<a>_<b>' and strand
      * 'ORF:<a>_<b>...'           -> ORF position '<a>_<b>'
      * any other non-empty line   -> sequence of whichever header came last
      * empty line                 -> store the current pair in *partial_genes*

    Args:
        results_file: iterable of text lines.
        partial_genes: mapping updated in place, keyed by gene position, with
            values [strand, gene_seq, gene_GC, orf_pos, orf_seq, orf_GC].

    Returns:
        (partial_genes, orf_Lengths, gene_Lengths)
    """
    orf_Lengths = []
    gene_Lengths = []
    # None until the corresponding line of a record has been read. A blank line
    # arriving before a complete record used to fail on unbound locals.
    g_Pos = strand = g_Seq = o_Pos = o_Seq = None
    read = False
    prev = ''
    for line in results_file:
        line = line.strip()
        if read:
            if line.startswith('Gene:'):
                entry = line.replace('Gene:', '').split('_')
                g_Pos = entry[0] + '_' + entry[1]
                strand = entry[2]
                prev = 'Gene'
            elif line.startswith('ORF:'):
                entry = line.replace('ORF:', '').split('_')
                o_Pos = entry[0] + '_' + entry[1]
                prev = 'ORF'
            elif line:
                if prev == 'Gene':
                    g_Seq = line
                    gene_Lengths.append(len(g_Seq))
                elif prev == 'ORF':
                    o_Seq = line
                    orf_Lengths.append(len(o_Seq))
            elif None not in (g_Pos, g_Seq, o_Pos, o_Seq):
                # NOTE(review): a record is only stored at a blank line, so a
                # final pair with no trailing blank line is dropped -- confirm
                # the results format always ends the section with one.
                g_GC = gc_count(g_Seq)
                o_GC = gc_count(o_Seq)
                partial_genes.update({g_Pos: [strand, g_Seq, g_GC, o_Pos, o_Seq, o_GC]})
        if line.startswith('Partial_Gene_Hits:'):
            read = True

    return partial_genes, orf_Lengths, gene_Lengths
104
+
105
+
106
def detail_transfer(genes, partial_genes):
    """Copy three fields from *genes* into the matching *partial_genes* rows.

    For every key of *partial_genes* that also exists in *genes*, elements
    2, 3 and 4 of the *genes* entry are inserted into the partial entry at
    indices 2, 3 and 4. Keys missing from *genes* are left untouched.
    Returns the (mutated) *partial_genes* mapping.
    """
    for position in partial_genes:
        record = partial_genes[position]
        try:
            source = genes[position]
            # Read all three values first so nothing is inserted if one is missing.
            extras = (source[2], source[3], source[4])
            for offset, value in enumerate(extras, start=2):
                record.insert(offset, value)
        except KeyError:
            continue
    return partial_genes
119
+
120
+
121
def result_compare(genome, results_file):
    """Print summary statistics for the partial gene/ORF hits in *results_file*.

    Args:
        genome: genome name; '../Genomes/<genome>.fa' is opened and read.
        results_file: iterable of lines handed to partial_gene_read().

    Prints each partial pair followed by a report of counts, median lengths,
    median GC and start/stop codon percentages. Exits via sys.exit() when a
    gene or ORF starts/ends with a codon missing from the tables below.
    """
    # The sequence is not used further down; the file is still read so that a
    # missing genome FASTA fails here exactly as it did before.
    with open('../Genomes/' + genome + '.fa', 'r') as genome_file:
        genome_Seq = "".join(
            line.replace("\n", "") for line in genome_file if ">" not in line)

    partial_genes = collections.OrderedDict()
    partial_genes, orf_Lengths, gene_Lengths = partial_gene_read(results_file, partial_genes)
    orf_Median = np.median(orf_Lengths)
    gene_Median = np.median(gene_Lengths)
    # Hard coded codons - Not ideal - default dicts?
    gene_Starts = collections.OrderedDict(
        {'ATG': 0, 'ATT': 0, 'CTG': 0, 'GAC': 0, 'GTG': 0, 'TTG': 0, 'ATC': 0, 'ATA': 0})
    gene_Stops = collections.OrderedDict({'TAA': 0, 'TAG': 0, 'TGA': 0})
    gene_GC = []
    orf_Starts = collections.OrderedDict(
        {'ATG': 0, 'ATT': 0, 'CTG': 0, 'GAC': 0, 'GTG': 0, 'TTG': 0, 'ATC': 0, 'ATA': 0})
    orf_Stops = collections.OrderedDict({'TAA': 0, 'TAG': 0, 'TGA': 0})
    orf_GC = []

    for gene, data in partial_genes.items():
        print(
            "\nPartial Matched Gene:\t" + gene + "\t" + data[1] + "\nPartial Matched ORF:\t" + data[3] + "\t" + data[4])
        try:
            gene_Starts[data[1][0:3]] += 1
            gene_Stops[data[1][-3:]] += 1
            gene_GC.append(float(data[2]))
            orf_Starts[data[4][0:3]] += 1
            orf_Stops[data[4][-3:]] += 1
            orf_GC.append(float(data[5]))
        except KeyError:
            sys.exit("Key Error: " + str(data))

    gene_Median_GC = np.median(gene_GC)
    orf_Median_GC = np.median(orf_GC)

    gene_Start_P = start_Codon_Count(gene_Starts, gene_Lengths)
    gene_Stop_P = stop_Codon_Count(gene_Stops, gene_Lengths)
    orf_Start_P = start_Codon_Count(orf_Starts, orf_Lengths)
    orf_Stop_P = stop_Codon_Count(orf_Stops, orf_Lengths)

    report = ["Number of Partial Hits:" + str(len(gene_Lengths)),
              'Median Length of Partial Hit Genes:' + str(gene_Median),
              'Median Length of Partial Hit ORFs:' + str(orf_Median),
              'Median GC Partial Hit Genes:' + str(gene_Median_GC),
              'Median GC Partial Hit ORFs:' + str(orf_Median_GC)]
    # str() keeps the report printable even if a percentage comes back as a
    # number (the helpers' zero-length path); bare concatenation raised TypeError.
    for codon, g_P, o_P in zip(('ATG', 'GTG', 'TTG', 'ATT', 'CTG'), gene_Start_P, orf_Start_P):
        report.append('Percentage of Genes starting with ' + codon + ' - Annotation/partial: '
                      + str(g_P) + ' ' + str(o_P))
    for codon, g_P, o_P in zip(('TAG', 'TAA', 'TGA'), gene_Stop_P, orf_Stop_P):
        report.append('Percentage of Genes ending with ' + codon + ' - Annotation/partial: '
                      + str(g_P) + ' ' + str(o_P))
    output = "\n".join(report)

    print(output)
199
+
200
+ # import matplotlib.pylab as plt
201
+ #
202
+ # list_ORF_Starts = list(orf_Starts.items()) # sorted by key, return a list of tuples
203
+ # list_Gene_Starts = list(gene_Starts.items())
204
+ # o_x, o_y = zip(*list_ORF_Starts) # unpack a list of pairs into two tuples
205
+ # g_x, g_y = zip(*list_Gene_Starts)
206
+ #
207
+ # plt.plot(o_x, o_y)
208
+ # plt.plot(g_x, g_y)
209
+ # plt.show()
210
+ #
211
+ # list_ORF_Stops = list(orf_Stops.items()) # sorted by key, return a list of tuples
212
+ # list_Gene_Stops = list(gene_Stops.items())
213
+ # o_x, o_y = zip(*list_ORF_Stops) # unpack a list of pairs into two tuples
214
+ # g_x, g_y = zip(*list_Gene_Stops)
215
+ #
216
+ # plt.plot(o_x, o_y)
217
+ # plt.plot(g_x, g_y)
218
+ # plt.show()
219
+
220
+
221
if __name__ == "__main__":
    options = parser.parse_args()
    parameters = options.parameters
    tool = options.tool
    genome = options.genome
    # Results live at ../Tools/<tool>/<tool>_<genome>[_<parameters>].csv
    results_path = '../Tools/' + tool + '/' + tool + '_' + genome
    if parameters:
        results_path += '_' + parameters
    results_path += '.csv'
    # The context manager closes the handle, which was previously left open.
    with open(results_path) as results_file:
        result_compare(genome, results_file)
@@ -0,0 +1,286 @@
1
+ import copy
2
+
3
+ import argparse
4
+ import collections
5
+
6
+ from ORForise.src.ORForise.utils import *
7
+
8
# Command-line interface used by the script entry point at the bottom of this file.
parser = argparse.ArgumentParser()
parser.add_argument('-t', '--tool', required=True, help='Which tool to compare?')
parser.add_argument('-p', '--parameters', required=False, help='Optional parameters for prediction tool.')
parser.add_argument('-g', '--genome', required=True, help='Which genome to analyse?')
# Parse sys.argv only when executed as a script. Parsing unconditionally made a
# plain `import` of this module exit with a usage error, because -t and -g are
# required. The name `args` is kept for backward compatibility.
args = parser.parse_args() if __name__ == "__main__" else None
13
+
14
+
15
def gc_count(dna):
    """Return (N percentage, GC percentage) for *dna*.

    Each element of *dna* is classified as C, G, A, T or N (checked in that
    order, upper-case only); anything else is ignored. Both percentages are
    taken over the classified elements only.

    Returns:
        (n_per, gc_content): n_per is a float, gc_content a '.2f' string.
        (0.0, '0.00') is returned when nothing could be classified, where the
        previous implementation raised ZeroDivisionError.
    """
    # Insertion order mirrors the original if/elif priority: C, G, A, T, N.
    counts = {"C": 0, "G": 0, "A": 0, "T": 0, "N": 0}
    for symbol in dna:
        for base in counts:
            if base in symbol:
                counts[base] += 1
                break
    total = sum(counts.values())
    if total == 0:
        return 0.0, format(0, '.2f')
    gc_content = format((counts["G"] + counts["C"]) * 100 / total, '.2f')
    n_per = counts["N"] * 100 / total
    return n_per, gc_content
35
+
36
+
37
def start_Codon_Count(orfs):
    """Summarise the start codons of *orfs*.

    Args:
        orfs: mapping whose values are sequences with the start codon as the
            second-to-last element.

    Returns:
        (atg_P, gtg_P, ttg_P, att_P, ctg_P, other_Start_P, other_Starts):
        six percentage strings ('.2f') followed by the list of codons that
        were not ATG/GTG/TTG/ATT/CTG. All percentages are '0.00' for an empty
        mapping, which previously raised ZeroDivisionError.
    """
    named = ('ATG', 'GTG', 'TTG', 'ATT', 'CTG')
    counts = dict.fromkeys(named, 0)
    other_Starts = []
    for orf in orfs.values():
        codon = orf[-2]
        if codon in counts:
            counts[codon] += 1
        else:
            other_Starts.append(codon)
    total = len(orfs)

    def percent(count):
        return format(100 * count / total if total else 0, '.2f')

    return tuple(percent(counts[codon]) for codon in named) + (
        percent(len(other_Starts)), other_Starts)
62
+
63
+
64
def stop_Codon_Count(orfs):
    """Summarise the stop codons of *orfs*.

    Args:
        orfs: mapping whose values are sequences with the stop codon as the
            last element.

    Returns:
        (tag_p, taa_p, tga_p, other_Stop_P, other_Stops): four percentage
        strings ('.2f') followed by the list of codons that were not
        TAG/TAA/TGA. All percentages are '0.00' for an empty mapping, which
        previously raised ZeroDivisionError.
    """
    named = ('TAG', 'TAA', 'TGA')
    counts = dict.fromkeys(named, 0)
    other_Stops = []
    for orf in orfs.values():
        codon = orf[-1]
        if codon in counts:
            counts[codon] += 1
        else:
            other_Stops.append(codon)
    total = len(orfs)

    def percent(count):
        return format(100 * count / total if total else 0, '.2f')

    return tuple(percent(counts[codon]) for codon in named) + (
        percent(len(other_Stops)), other_Stops)
83
+
84
+
85
def detail_transfer(genes, missed_genes):
    """Copy three fields from *genes* into the matching *missed_genes* rows.

    For every key of *missed_genes* that also exists in *genes*, elements
    2, 3 and 4 of the *genes* entry are inserted into the missed entry at
    indices 2, 3 and 4. Keys missing from *genes* are left untouched.
    Returns the (mutated) *missed_genes* mapping.
    """
    for position in missed_genes:
        record = missed_genes[position]
        try:
            source = genes[position]
            # Read all three values first so nothing is inserted if one is missing.
            extras = (source[2], source[3], source[4])
            for offset, value in enumerate(extras, start=2):
                record.insert(offset, value)
        except KeyError:
            continue
    return missed_genes
98
+
99
+
100
def get_genome(genome):
    """Return the sequence stored in '../Genomes/<genome>.fa' as one string.

    Lines starting with '>' are skipped and all remaining lines are
    concatenated with their newlines removed (multiple records are therefore
    merged into a single sequence).
    """
    # Separate name for the handle: the original rebound the `genome` parameter.
    with open('../Genomes/' + genome + '.fa', 'r') as genome_file:
        # One join instead of repeated `+=` on a genome-sized string.
        return "".join(line.replace("\n", "")
                       for line in genome_file if not line.startswith('>'))
108
+
109
+
110
def missed_genes_in(genes_detected, missed_genes, results_in):
    """Read the 'Undetected_Genes:' section and remove those genes from *genes_detected*.

    After the section header, each '>' line is split on '_': fields 1 and 2
    form the key '<f1>_<f2>' and the last field gives the strand. The next
    non-empty line is stored as that gene's sequence. Reading stops at the
    first empty line after the header.

    Args:
        genes_detected: mapping; every key that turns out to be missed is deleted.
        missed_genes: mapping updated in place with
            key -> [sequence, strand, length, start_codon, stop_codon].
        results_in: iterable of text lines.

    Returns:
        (missed_genes, genes_detected)
    """
    read = False
    entry = strand = None
    for line in results_in:
        line = line.strip()
        if read:
            if line.startswith('>'):
                fields = line.split('_')
                entry = fields[1] + '_' + fields[2]
                # The original took the last character of the rebuilt
                # '<f1>_<f2>' key here, so the strand depended on the last
                # digit of a coordinate. Use the header's last field instead.
                # NOTE(review): assumes the last field is '+'/'-' or a numeric
                # frame where <= 2 means forward -- confirm against the writer.
                strand = fields[-1]
                if strand not in ('+', '-'):
                    strand = '+' if int(strand) <= 2 else '-'
            elif line and entry is not None:
                # `entry is not None` guards a sequence line with no header before it.
                missed_genes.update({entry: [line, strand, len(line), line[0:3], line[-3:]]})

        if line.startswith('Undetected_Genes:'):
            read = True
        if read and not line:
            break
    for key in list(missed_genes):
        if key in genes_detected:
            del genes_detected[key]

    return missed_genes, genes_detected
143
+
144
+
145
def partial_matches_in(partial_matches, results_in):
    """Collect the gene/ORF pairs listed after the 'Partial_Gene_Hits:' line.

    Lines are stripped and, once the section header has been seen, handled as:
      * 'Gene:<a>_<b>_<strand>...' -> gene position '<a>_<b>' and strand
      * 'ORF:<a>_<b>...'           -> ORF position '<a>_<b>'
      * any other non-empty line   -> sequence of whichever header came last
      * empty line                 -> store the current pair

    Args:
        partial_matches: mapping updated in place, keyed by gene position, with
            values [strand, gene_length, gene_seq, orf_pos, orf_length, orf_seq].
        results_in: iterable of text lines.

    Returns:
        The updated *partial_matches* mapping.
    """
    # None until the corresponding line of a record has been read. A blank line
    # arriving before a complete record used to fail on unbound locals.
    g_Pos = strand = g_Seq = o_Pos = o_Seq = None
    g_length = orf_length = None
    read = False
    prev = ''
    for line in results_in:
        line = line.strip()
        if read:
            if line.startswith('Gene:'):
                entry = line.replace('Gene:', '').split('_')
                g_Pos = entry[0] + '_' + entry[1]
                strand = entry[2]
                prev = 'Gene'
            elif line.startswith('ORF:'):
                entry = line.replace('ORF:', '').split('_')
                o_Pos = entry[0] + '_' + entry[1]
                prev = 'ORF'
            elif line:
                if prev == 'Gene':
                    g_Seq = line
                    g_length = len(g_Seq)
                elif prev == 'ORF':
                    o_Seq = line
                    orf_length = len(o_Seq)
            elif None not in (g_Pos, g_Seq, o_Pos, o_Seq):
                partial_matches.update({g_Pos: [strand, g_length, g_Seq, o_Pos, orf_length, o_Seq]})
        if line.startswith('Partial_Gene_Hits:'):
            read = True

    return partial_matches
176
+
177
+
178
def unmatched_ORFs_in(unmatched_ORFs, results_file):
    """Read the 'ORF_Without_Corresponding_Gene_in_Ensembl:' section.

    After the section header, each '>' line is split on '_': fields 1 and 2
    form the key '<f1>_<f2>' and the last field is the strand; a placeholder
    [strand, None, None, None, None] is stored. The following non-empty line
    replaces it with [strand, length, sequence, start_codon, stop_codon].
    Reading stops at the first empty line after the header.

    Args:
        unmatched_ORFs: mapping updated in place.
        results_file: iterable of text lines.

    Returns:
        The updated *unmatched_ORFs* mapping.
    """
    read = False
    o_Pos = strand = None
    for line in results_file:
        line = line.strip()
        if read:
            if line.startswith('>'):
                entry = line.replace('Gene:', '').split('_')
                strand = entry[-1]
                o_Pos = entry[1] + '_' + entry[2]
                unmatched_ORFs.update({o_Pos: [strand, None, None, None, None]})
            elif line and o_Pos is not None:
                unmatched_ORFs.update({o_Pos: [strand, len(line), line, line[0:3], line[-3:]]})
        if line.startswith('ORF_Without_Corresponding_Gene_in_Ensembl:'):
            read = True
        elif read and not line:
            # Every record is already stored above. The extra update that used
            # to run here crashed when no ORF had been read, and overwrote a
            # header-only record with the previous ORF's sequence.
            break

    return unmatched_ORFs
203
+
204
+
205
def genes_in(genome, genome_Seq, genome_Seq_Rev, genome_Size, genes):
    """Load the CDS features of '../Genomes/<genome>.gff' into *genes*.

    Only tab-separated lines with exactly nine fields whose third field
    contains "CDS" are used. Forward-strand sequences are sliced from
    *genome_Seq*; reverse-strand sequences are sliced from *genome_Seq_Rev*
    using coordinates mirrored around *genome_Size*.

    Args:
        genome: genome name used to build the GFF path.
        genome_Seq: forward genome sequence.
        genome_Seq_Rev: sequence used for '-' strand features.
        genome_Size: length used to mirror coordinates.
        genes: mapping updated in place with
            '<start>_<stop>' -> [seq, strand, stop - start, start_codon, stop_codon].

    Returns:
        The updated *genes* mapping.
    """
    with open('../Genomes/' + genome + '.gff', 'r') as genome_gff:
        for line in genome_gff:
            fields = line.split('\t')
            # Short lines (comments, headers) and non-CDS features are skipped.
            if len(fields) != 9 or "CDS" not in fields[2]:
                continue
            start = int(fields[3])
            stop = int(fields[4])
            strand = fields[6]
            length = stop - start
            gene = str(start) + '_' + str(stop)
            if '+' in strand:
                seq = genome_Seq[start - 1:stop]
            elif '-' in strand:
                r_Start = genome_Size - stop
                r_Stop = genome_Size - start
                seq = genome_Seq_Rev[r_Start:r_Stop + 1]
            else:
                # No usable strand: previously `seq` was left over from the
                # preceding record (or unbound on the first one).
                continue
            genes.update({gene: [seq, strand, length, seq[0:3], seq[-3:]]})
    return genes
228
+
229
+
230
def extract_results(genome, results_file):
    """Load the annotation and every section of a results file.

    Reads the genome and its GFF genes, then parses *results_file* three times
    (missed genes, partial matches, unmatched ORFs), rewinding it after each
    pass so the handle is left at the start.

    Returns:
        (genes, genes_detected, missed_genes, partial_matches, unmatched_ORFs)
        where genes_detected is a deep copy of genes with the missed genes removed.
    """
    genome_Seq = get_genome(genome)
    reverse_strand = revCompIterative(genome_Seq)
    genes = genes_in(genome, genome_Seq, reverse_strand, len(genome_Seq),
                     collections.OrderedDict())

    # Work on a copy so `genes` keeps the full annotation.
    missed_genes, genes_detected = missed_genes_in(
        copy.deepcopy(genes), collections.OrderedDict(), results_file)
    results_file.seek(0, 0)  # Reset file position

    partial_matches = partial_matches_in(collections.OrderedDict(), results_file)
    results_file.seek(0, 0)  # Reset file position

    unmatched_ORFs = unmatched_ORFs_in(collections.OrderedDict(), results_file)
    results_file.seek(0, 0)  # Reset file position

    return genes, genes_detected, missed_genes, partial_matches, unmatched_ORFs
249
+
250
+
251
if __name__ == "__main__":
    options = parser.parse_args()
    parameters = options.parameters
    tool = options.tool
    genome = options.genome
    # Results live at ../Tools/<tool>/<tool>_<genome>[_<parameters>].csv
    results_path = '../Tools/' + tool + '/' + tool + '_' + genome
    if parameters:
        results_path += '_' + parameters
    results_path += '.csv'

    # The context manager closes the handle, which was previously left open.
    with open(results_path) as results_file:
        genes, genes_detected, missed_genes, partial_matches, unmatched_ORFs = extract_results(genome, results_file)

    def _span_lengths(records):
        """Return stop - start for every '<start>_<stop>' key of *records*."""
        return [int(pos.split('_')[1]) - int(pos.split('_')[0]) for pos in records]

    gene_Lengths = _span_lengths(genes)
    genes_detected_Lengths = _span_lengths(genes_detected)
    partial_Lengths = _span_lengths(partial_matches)
    missed_Lengths = _span_lengths(missed_genes)
    unmatched_Lengths = _span_lengths(unmatched_ORFs)

    import numpy as np

    print(len(gene_Lengths))
    print(gene_Lengths)
    print(np.mean(gene_Lengths))
    print(len(partial_Lengths))
    print(partial_Lengths)
    print(len(missed_Lengths))
    print(missed_Lengths)
    print(len(unmatched_Lengths))
    print(unmatched_Lengths)
    print(np.mean(unmatched_Lengths))
@@ -0,0 +1,161 @@
1
+ import argparse
2
+ import collections
3
+
4
# Command-line interface used by the script entry point at the bottom of this file.
parser = argparse.ArgumentParser()
parser.add_argument('-t', '--tool', required=True, help='Which tool to compare?')
parser.add_argument('-p', '--parameters', required=False, help='Optional parameters for prediction tool.')
parser.add_argument('-g', '--genome', required=True, help='Which genome to analyse?')
# Parse sys.argv only when executed as a script. Parsing unconditionally made a
# plain `import` of this module exit with a usage error, because -t and -g are
# required. The name `args` is kept for backward compatibility.
args = parser.parse_args() if __name__ == "__main__" else None
9
+
10
+
11
def revCompIterative(watson):
    """Return the reverse complement of *watson*, upper-cased.

    Raises:
        KeyError: if the upper-cased input contains a character other than
            A, T, C, G or N.
    """
    complements = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C', 'N': 'N'}
    # Single join instead of per-character `+=`, which does not scale reliably
    # to genome-sized strings.
    return "".join(complements[nt] for nt in reversed(watson.upper()))
19
+
20
+
21
def partial_gene_read(results_file, partial_genes):
    """Collect the gene/ORF pairs listed after the 'Partial_Gene_Hits:' line.

    Lines are stripped and, once the section header has been seen, handled as:
      * 'Gene:<a>_<b>_<strand>...' -> gene position '<a>_<b>' and strand
      * 'ORF:<a>_<b>...'           -> ORF position '<a>_<b>'
      * any other non-empty line   -> sequence of whichever header came last
      * empty line                 -> store the current pair in *partial_genes*

    Args:
        results_file: iterable of text lines.
        partial_genes: mapping updated in place, keyed by gene position, with
            values [strand, gene_seq, orf_pos, orf_seq].

    Returns:
        (partial_genes, orf_Lengths, gene_Lengths)
    """
    orf_Lengths = []
    gene_Lengths = []
    # None until the corresponding line of a record has been read. A blank line
    # arriving before a complete record used to fail on unbound locals.
    g_Pos = strand = g_Seq = o_Pos = o_Seq = None
    read = False
    prev = ''
    for line in results_file:
        line = line.strip()
        if read:
            if line.startswith('Gene:'):
                entry = line.replace('Gene:', '').split('_')
                g_Pos = entry[0] + '_' + entry[1]
                strand = entry[2]
                prev = 'Gene'
            elif line.startswith('ORF:'):
                entry = line.replace('ORF:', '').split('_')
                o_Pos = entry[0] + '_' + entry[1]
                prev = 'ORF'
            elif line:
                if prev == 'Gene':
                    g_Seq = line
                    gene_Lengths.append(len(g_Seq))
                elif prev == 'ORF':
                    o_Seq = line
                    orf_Lengths.append(len(o_Seq))
            elif None not in (g_Pos, g_Seq, o_Pos, o_Seq):
                partial_genes.update({g_Pos: [strand, g_Seq, o_Pos, o_Seq]})
        if line.startswith('Partial_Gene_Hits:'):
            read = True
    return partial_genes, orf_Lengths, gene_Lengths
53
+
54
+
55
def perfect_Matches(results_file, perfect_Match_Genes):
    """Collect the genes listed after the 'Perfect_Match_Genes:' line.

    After the section header, each '>' line is split on '_': fields 1 and 2
    form the key '<f1>_<f2>' and field 3 is the strand. The most recent other
    non-empty line is kept as the sequence, and the record is stored when an
    empty line is reached. Reading stops at a line starting with
    'Undetected_Genes'.

    Args:
        results_file: iterable of text lines.
        perfect_Match_Genes: mapping updated in place with
            key -> [strand, sequence, start_codon, stop_codon].

    Returns:
        The updated *perfect_Match_Genes* mapping.
    """
    read = False
    g_Pos = strand = g_Seq = None
    for line in results_file:
        line = line.strip()
        if line.startswith('Undetected_Genes'):
            break
        if read:
            if line.startswith('>'):
                entry = line.split('_')
                g_Pos = entry[1] + '_' + entry[2]
                strand = entry[3]
            elif line:
                g_Seq = line
            elif g_Pos is not None and g_Seq is not None:
                # The guard skips blank lines that precede the first complete
                # record, which used to fail on unbound locals.
                # NOTE(review): records are only stored at blank lines; if
                # entries are not blank-line separated only the last one before
                # each blank line is kept -- confirm against the results format.
                perfect_Match_Genes.update({g_Pos: [strand, g_Seq, g_Seq[0:3], g_Seq[-3:]]})
        if line.startswith('Perfect_Match_Genes:'):
            read = True
    return perfect_Match_Genes
75
+
76
+
77
def detail_transfer(genes, partial_genes):
    """Copy three fields from *genes* into the matching *partial_genes* rows.

    For every key of *partial_genes* that also exists in *genes*, elements
    2, 3 and 4 of the *genes* entry are inserted into the partial entry at
    indices 2, 3 and 4. Keys missing from *genes* are left untouched.
    Returns the (mutated) *partial_genes* mapping.
    """
    for position in partial_genes:
        record = partial_genes[position]
        try:
            source = genes[position]
            # Read all three values first so nothing is inserted if one is missing.
            extras = (source[2], source[3], source[4])
            for offset, value in enumerate(extras, start=2):
                record.insert(offset, value)
        except KeyError:
            continue
    return partial_genes
90
+
91
+
92
def result_compare(results_file, genome_file):
    """Print start-codon statistics for perfect and partial matches.

    Args:
        results_file: seekable text file handed to partial_gene_read() and,
            after rewinding, to perfect_Matches().
        genome_file: genome name; '../Genomes/<genome_file>.fa' is opened and read.

    Prints the start-codon counts of the perfect matches, one line for every
    partial-match gene or ORF whose start codon is not ATG/GTG/TTG/CTG, and a
    5x5 substitution matrix preceded by a header row.
    """
    # The sequence is not used further down; the file is still read so that a
    # missing genome FASTA fails here exactly as it did before. A separate
    # handle name avoids rebinding the `genome_file` parameter.
    with open('../Genomes/' + genome_file + '.fa', 'r') as genome_handle:
        genome = "".join(
            line.replace("\n", "") for line in genome_handle if ">" not in line)

    partial_genes = collections.OrderedDict()
    perfect_Match_Genes = collections.OrderedDict()
    partial_genes, orf_Lengths, gene_Lengths = partial_gene_read(results_file, partial_genes)
    results_file.seek(0)
    perfect_Match_Genes = perfect_Matches(results_file, perfect_Match_Genes)

    perfect_Match_Gene_Start_Codons = collections.OrderedDict({'ATG': 0, 'GTG': 0, 'TTG': 0, 'CTG': 0, 'Other': 0})
    for data in perfect_Match_Genes.values():
        try:
            perfect_Match_Gene_Start_Codons[data[2]] += 1
        except KeyError:
            perfect_Match_Gene_Start_Codons['Other'] += 1
    print("Perfect Match Start Codons\nATG:" + str(perfect_Match_Gene_Start_Codons['ATG']) + ",GTG:" + str(
        perfect_Match_Gene_Start_Codons['GTG']) + ",TTG:" +
          str(perfect_Match_Gene_Start_Codons['TTG']) + ",CTG:" + str(
        perfect_Match_Gene_Start_Codons['CTG']) + ",Other_Start:" + str(perfect_Match_Gene_Start_Codons['Other']))

    codon_set = ['ATG', 'GTG', 'TTG', 'CTG']
    labels = codon_set + ['Alt']
    # Keys are '<gene start>-<ORF start>'. Insertion order is row-major with
    # one row per ORF start and one column per gene start, because the matrix
    # is printed in insertion order below. The hand-written table had the
    # 'Alt-*' keys in the wrong rows ('Alt-CTG' in the GTG row, and so on), so
    # the last column was printed against the wrong ORF codon.
    start_Codon_Substitution = collections.OrderedDict(
        (gene_label + '-' + orf_label, 0) for orf_label in labels for gene_label in labels)

    for data in partial_genes.values():
        gene_Start = data[1][0:3]
        orf_Start = data[3][0:3]
        if gene_Start not in codon_set:
            print('Gene_Codon_Alternative:' + str(gene_Start))
            gene_Start = 'Alt'
        if orf_Start not in codon_set:
            print('ORF_Codon_Alternative:' + str(orf_Start))
            orf_Start = 'Alt'
        start_Codon_Substitution[gene_Start + '-' + orf_Start] += 1

    ####### HERE - Need to flip the data - GS along the top
    subs = ['ATG', 'GTG', 'TTG', 'CTG', 'Other'] + list(start_Codon_Substitution.values())
    for row_start in range(0, len(subs), 5):
        print(*subs[row_start:row_start + 5])
150
+
151
+
152
if __name__ == "__main__":
    options = parser.parse_args()
    parameters = options.parameters
    tool = options.tool
    genome = options.genome
    # Results live at ../Tools/<tool>/<tool>_<genome>[_<parameters>].csv
    results_path = '../Tools/' + tool + '/' + tool + '_' + genome
    if parameters:
        results_path += '_' + parameters
    results_path += '.csv'
    # The context manager closes the handle, which was previously left open.
    with open(results_path) as results_file:
        result_compare(results_file, genome)