ORForise 1.6.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ORForise/Aggregate_Compare.py +378 -0
- ORForise/Annotation_Compare.py +317 -0
- ORForise/Annotation_Intersector.py +726 -0
- ORForise/Aux/StORF_Undetected/Completely_Undetected/Completey_Undetected.py +53 -0
- ORForise/Aux/StORF_Undetected/Completely_Undetected/__init__.py +0 -0
- ORForise/Aux/StORF_Undetected/StORF_Undetected.py +35 -0
- ORForise/Aux/StORF_Undetected/__init__.py +0 -0
- ORForise/Aux/StORF_Undetected/unvitiated_Genes/__init__.py +0 -0
- ORForise/Aux/StORF_Undetected/unvitiated_Genes/unvitiated_Missed_Genes.py +46 -0
- ORForise/Aux/TabToGFF/TabToGFF.py +140 -0
- ORForise/Aux/TabToGFF/__init__.py +0 -0
- ORForise/Aux/__init__.py +0 -0
- ORForise/Comparator.py +882 -0
- ORForise/Convert_To_GFF.py +141 -0
- ORForise/GFF_Adder.py +543 -0
- ORForise/List_Tools.py +56 -0
- ORForise/ORForise_Analysis/__init__.py +0 -0
- ORForise/ORForise_Analysis/cds_checker.py +77 -0
- ORForise/ORForise_Analysis/gene_Lenghts.py +28 -0
- ORForise/ORForise_Analysis/genome_Metrics.py +258 -0
- ORForise/ORForise_Analysis/hypothetical_gene_predictions.py +88 -0
- ORForise/ORForise_Analysis/missed_Gene_Metrics.py +277 -0
- ORForise/ORForise_Analysis/parital_Match_Analysis.py +230 -0
- ORForise/ORForise_Analysis/result_File_Analysis.py +286 -0
- ORForise/ORForise_Analysis/start_Codon_Substitution.py +161 -0
- ORForise/StORForise.py +115 -0
- ORForise/Tools/Augustus/Augustus.py +54 -0
- ORForise/Tools/Augustus/__init__.py +0 -0
- ORForise/Tools/Balrog/Balrog.py +56 -0
- ORForise/Tools/Balrog/__init__.py +0 -0
- ORForise/Tools/EasyGene/EasyGene.py +55 -0
- ORForise/Tools/EasyGene/__init__.py +0 -0
- ORForise/Tools/FGENESB/FGENESB.py +57 -0
- ORForise/Tools/FGENESB/__init__.py +0 -0
- ORForise/Tools/FragGeneScan/FragGeneScan.py +54 -0
- ORForise/Tools/FragGeneScan/__init__.py +0 -0
- ORForise/Tools/GFF/GFF.py +77 -0
- ORForise/Tools/GFF/__init__.py +0 -0
- ORForise/Tools/GLIMMER3/GLIMMER3.py +59 -0
- ORForise/Tools/GLIMMER3/__init__.py +0 -0
- ORForise/Tools/GeneMark/GeneMark.py +135 -0
- ORForise/Tools/GeneMark/__init__.py +0 -0
- ORForise/Tools/GeneMarkHA/GeneMarkHA.py +54 -0
- ORForise/Tools/GeneMarkHA/__init__.py +0 -0
- ORForise/Tools/GeneMarkHMM/GeneMarkHMM.py +55 -0
- ORForise/Tools/GeneMarkHMM/__init__.py +0 -0
- ORForise/Tools/GeneMarkS/GeneMarkS.py +54 -0
- ORForise/Tools/GeneMarkS/__init__.py +0 -0
- ORForise/Tools/GeneMarkS2/GeneMarkS2.py +55 -0
- ORForise/Tools/GeneMarkS2/__init__.py +0 -0
- ORForise/Tools/MetaGene/MetaGene.py +54 -0
- ORForise/Tools/MetaGene/__init__.py +0 -0
- ORForise/Tools/MetaGeneAnnotator/MetaGeneAnnotator.py +55 -0
- ORForise/Tools/MetaGeneAnnotator/__init__.py +0 -0
- ORForise/Tools/MetaGeneMark/MetaGeneMark.py +55 -0
- ORForise/Tools/MetaGeneMark/__init__.py +0 -0
- ORForise/Tools/Prodigal/Prodigal.py +55 -0
- ORForise/Tools/Prodigal/__init__.py +0 -0
- ORForise/Tools/Prokka/Prokka.py +57 -0
- ORForise/Tools/Prokka/__init__.py +0 -0
- ORForise/Tools/StORF-Reporter/StORF-Reporter.py +56 -0
- ORForise/Tools/StORF-Reporter/__init__.py +0 -0
- ORForise/Tools/TransDecoder/TransDecoder.py +54 -0
- ORForise/Tools/TransDecoder/__init__.py +0 -0
- ORForise/Tools/__init__.py +0 -0
- ORForise/__init__.py +0 -0
- ORForise/utils.py +236 -0
- orforise-1.6.2.dist-info/METADATA +1038 -0
- orforise-1.6.2.dist-info/RECORD +73 -0
- orforise-1.6.2.dist-info/WHEEL +5 -0
- orforise-1.6.2.dist-info/entry_points.txt +15 -0
- orforise-1.6.2.dist-info/licenses/LICENSE +624 -0
- orforise-1.6.2.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,230 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import collections
|
|
3
|
+
import numpy as np
|
|
4
|
+
import sys
|
|
5
|
+
|
|
6
|
+
# Command-line interface of the partial-match analysis script.
parser = argparse.ArgumentParser()
parser.add_argument('-t', '--tool', required=True, help='Which tool to compare?')
parser.add_argument('-p', '--parameters', required=False, help='Optional parameters for prediction tool.')
parser.add_argument('-g', '--genome', required=True, help='Which genome to analyse?')

# Parse sys.argv only when executed as a script.  Parsing unconditionally made
# a plain ``import`` of this module exit with a usage error because the
# required options are missing from the importing process's arguments.
args = parser.parse_args() if __name__ == "__main__" else None
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def gc_count(dna):
    """Return the GC percentage of *dna* formatted with two decimals.

    Only the upper-case characters ``A``, ``C``, ``G``, ``T`` and ``N`` are
    counted; every other character is ignored in both the numerator and the
    denominator.

    Returns:
        A string such as ``'50.00'``.  When none of the counted characters
        occur (for example an empty sequence) ``'0.00'`` is returned instead
        of raising ``ZeroDivisionError``.
    """
    gc = dna.count('G') + dna.count('C')
    total = gc + dna.count('A') + dna.count('T') + dna.count('N')
    if total == 0:
        return format(0, '.2f')
    return format(gc * 100 / total, '.2f')
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def revCompIterative(watson):
    """Return the reverse complement of the DNA string *watson*.

    The input is upper-cased before complementing.

    Raises:
        KeyError: if the sequence contains a character other than
            ``A``, ``C``, ``G``, ``T`` or ``N``.
    """
    complements = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C', 'N': 'N'}
    # Build the result with a single join rather than repeated ``+=`` so the
    # cost stays linear for genome-sized inputs.
    return ''.join(complements[nt] for nt in reversed(watson.upper()))
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def start_Codon_Count(starts, lengths):
    """Return the share of sequences starting with each common start codon.

    Args:
        starts: mapping from codon to count; must contain the keys
            ``ATG``, ``GTG``, ``TTG``, ``ATT`` and ``CTG``.
        lengths: one entry per sequence; only ``len(lengths)`` is used, as the
            denominator.

    Returns:
        A 5-tuple of percentage strings (two decimals) in the order
        ATG, GTG, TTG, ATT, CTG.  With no sequences every entry is
        ``'0.00'``; previously integer zeros were returned here, which broke
        callers that concatenate the values into a report string.
    """
    codons = ('ATG', 'GTG', 'TTG', 'ATT', 'CTG')
    total = len(lengths)
    if total == 0:
        return tuple(format(0, '.2f') for _ in codons)
    return tuple(format(100 * starts[codon] / total, '.2f') for codon in codons)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def stop_Codon_Count(stops, lengths):
    """Return the share of sequences ending with each stop codon.

    Args:
        stops: mapping from codon to count; must contain the keys
            ``TAG``, ``TAA`` and ``TGA``.
        lengths: one entry per sequence; only ``len(lengths)`` is used, as the
            denominator.

    Returns:
        A 3-tuple of percentage strings (two decimals) in the order
        TAG, TAA, TGA.  With no sequences every entry is ``'0.00'`` so the
        return type stays a string on every path.
    """
    codons = ('TAG', 'TAA', 'TGA')
    total = len(lengths)
    if total == 0:
        return tuple(format(0, '.2f') for _ in codons)
    return tuple(format(100 * stops[codon] / total, '.2f') for codon in codons)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def partial_gene_read(results_file, partial_genes):
    """Read the ``Partial_Gene_Hits:`` section of a results file.

    Everything up to and including the ``Partial_Gene_Hits:`` marker line is
    skipped.  After it, a ``Gene:<start>_<stop>_<strand>`` header and an
    ``ORF:<start>_<stop>...`` header are each followed by a sequence line, and
    a blank line completes the record.

    Args:
        results_file: iterable of text lines (for example an open file).
        partial_genes: mapping that is updated in place, keyed by the gene
            position ``'<start>_<stop>'``; each value is
            ``[strand, gene_seq, gene_gc, orf_pos, orf_seq, orf_gc]``.

    Returns:
        ``(partial_genes, orf_Lengths, gene_Lengths)`` where the two lists
        hold the length of every ORF / gene sequence line that was read.

    A record is stored only once all of its parts have been seen, so a blank
    line ahead of the first record no longer raises ``NameError``.  The last
    record is stored even without a trailing blank line, and lines that
    follow a completed record without a new header are ignored.
    """
    from itertools import chain

    orf_Lengths = []
    gene_Lengths = []
    in_section = False
    prev = ''
    g_Pos = strand = g_Seq = o_Pos = o_Seq = None

    # The extra '' acts as a final blank line so the last record is flushed.
    for raw in chain(results_file, ['']):
        line = raw.strip()
        if not in_section:
            in_section = line.startswith('Partial_Gene_Hits:')
            continue
        if line.startswith('Gene:'):
            entry = line.replace('Gene:', '').split('_')
            g_Pos = entry[0] + '_' + entry[1]
            strand = entry[2]
            prev = 'Gene'
        elif line.startswith('ORF:'):
            entry = line.replace('ORF:', '').split('_')
            o_Pos = entry[0] + '_' + entry[1]
            prev = 'ORF'
        elif line:
            # A sequence line belongs to whichever header came last.
            if prev == 'Gene':
                g_Seq = line
                gene_Lengths.append(len(g_Seq))
            elif prev == 'ORF':
                o_Seq = line
                orf_Lengths.append(len(o_Seq))
        elif None not in (g_Pos, g_Seq, o_Pos, o_Seq):
            g_GC = gc_count(g_Seq)
            o_GC = gc_count(o_Seq)
            partial_genes.update({g_Pos: [strand, g_Seq, g_GC, o_Pos, o_Seq, o_GC]})
            # Start afresh so later stray lines cannot alter this record.
            g_Pos = strand = g_Seq = o_Pos = o_Seq = None
            prev = ''

    return partial_genes, orf_Lengths, gene_Lengths
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def detail_transfer(genes, partial_genes):
    """Copy three fields from ``genes`` into the matching partial records.

    For every key of *partial_genes* that also exists in *genes*, elements
    2, 3 and 4 of the ``genes`` entry are inserted at positions 2, 3 and 4 of
    the partial record (in place).  Keys missing from *genes* are left
    untouched.

    Returns:
        The same *partial_genes* mapping that was passed in.
    """
    for position, record in partial_genes.items():
        try:
            source = genes[position]
            extras = (source[2], source[3], source[4])
        except KeyError:
            # No annotation entry for this position: nothing to transfer.
            continue
        for offset, value in enumerate(extras, start=2):
            record.insert(offset, value)
    return partial_genes
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def result_compare(genome, results_file):
    """Print summary statistics for the partial gene/ORF matches of one run.

    Args:
        genome: genome name; ``../Genomes/<genome>.fa`` is opened and read.
        results_file: open results file handed to ``partial_gene_read``.

    For every partial match the gene and ORF are printed, then a report with
    the number of hits, median lengths, median GC and the percentage of
    genes / ORFs using each start and stop codon.

    Exits via ``sys.exit`` when a gene or ORF starts or ends with a codon
    that is not in the hard-coded tables below.
    """
    # NOTE(review): the genome sequence is not used further down; the read is
    # kept so that a missing FASTA file still fails early, as before.
    with open('../Genomes/' + genome + '.fa', 'r') as genome_file:
        genome_Seq = ''.join(
            line.replace("\n", "") for line in genome_file if ">" not in line)

    partial_genes, orf_Lengths, gene_Lengths = partial_gene_read(
        results_file, collections.OrderedDict())
    orf_Median = np.median(orf_Lengths)
    gene_Median = np.median(gene_Lengths)
    strands = collections.defaultdict(int, {'-': 0, '+': 0})

    # Hard-coded codon tables; any other codon aborts the run below.
    start_codons = ('ATG', 'ATT', 'CTG', 'GAC', 'GTG', 'TTG', 'ATC', 'ATA')
    stop_codons = ('TAA', 'TAG', 'TGA')
    gene_Starts = collections.OrderedDict.fromkeys(start_codons, 0)
    gene_Stops = collections.OrderedDict.fromkeys(stop_codons, 0)
    gene_GC = []
    orf_Starts = collections.OrderedDict.fromkeys(start_codons, 0)
    orf_Stops = collections.OrderedDict.fromkeys(stop_codons, 0)
    orf_GC = []

    # Record layout: [strand, gene_seq, gene_gc, orf_pos, orf_seq, orf_gc].
    for gene, data in partial_genes.items():
        print(
            "\nPartial Matched Gene:\t" + gene + "\t" + data[1] + "\nPartial Matched ORF:\t" + data[3] + "\t" + data[4])
        strands[data[0]] += 1
        try:
            gene_Starts[data[1][0:3]] += 1
            gene_Stops[data[1][-3:]] += 1
            gene_GC.append(float(data[2]))
            orf_Starts[data[4][0:3]] += 1
            orf_Stops[data[4][-3:]] += 1
            orf_GC.append(float(data[5]))
        except KeyError:
            sys.exit("Key Error: " + str(data))

    gene_Median_GC = np.median(gene_GC)
    orf_Median_GC = np.median(orf_GC)

    gene_start_P = start_Codon_Count(gene_Starts, gene_Lengths)
    gene_stop_P = stop_Codon_Count(gene_Stops, gene_Lengths)
    orf_start_P = start_Codon_Count(orf_Starts, orf_Lengths)
    orf_stop_P = stop_Codon_Count(orf_Stops, orf_Lengths)

    report = [
        "Number of Partial Hits:" + str(len(gene_Lengths)),
        "Median Length of Partial Hit Genes:" + str(gene_Median),
        'Median Length of Partial Hit ORFs:' + str(orf_Median),
        'Median GC Partial Hit Genes:' + str(gene_Median_GC),
        'Median GC Partial Hit ORFs:' + str(orf_Median_GC),
    ]
    # str() keeps the report working even when a percentage is not already a
    # string (plain concatenation raised TypeError in that case).
    for codon, g_P, o_P in zip(('ATG', 'GTG', 'TTG', 'ATT', 'CTG'), gene_start_P, orf_start_P):
        report.append('Percentage of Genes starting with ' + codon + ' - Annotation/partial: '
                      + str(g_P) + ' ' + str(o_P))
    for codon, g_P, o_P in zip(('TAG', 'TAA', 'TGA'), gene_stop_P, orf_stop_P):
        report.append('Percentage of Genes ending with ' + codon + ' - Annotation/partial: '
                      + str(g_P) + ' ' + str(o_P))
    output = '\n'.join(report)

    print(output)
|
|
199
|
+
|
|
200
|
+
# import matplotlib.pylab as plt
|
|
201
|
+
#
|
|
202
|
+
# list_ORF_Starts = list(orf_Starts.items()) # sorted by key, return a list of tuples
|
|
203
|
+
# list_Gene_Starts = list(gene_Starts.items())
|
|
204
|
+
# o_x, o_y = zip(*list_ORF_Starts) # unpack a list of pairs into two tuples
|
|
205
|
+
# g_x, g_y = zip(*list_Gene_Starts)
|
|
206
|
+
#
|
|
207
|
+
# plt.plot(o_x, o_y)
|
|
208
|
+
# plt.plot(g_x, g_y)
|
|
209
|
+
# plt.show()
|
|
210
|
+
#
|
|
211
|
+
# list_ORF_Stops = list(orf_Stops.items()) # sorted by key, return a list of tuples
|
|
212
|
+
# list_Gene_Stops = list(gene_Stops.items())
|
|
213
|
+
# o_x, o_y = zip(*list_ORF_Stops) # unpack a list of pairs into two tuples
|
|
214
|
+
# g_x, g_y = zip(*list_Gene_Stops)
|
|
215
|
+
#
|
|
216
|
+
# plt.plot(o_x, o_y)
|
|
217
|
+
# plt.plot(g_x, g_y)
|
|
218
|
+
# plt.show()
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
if __name__ == "__main__":
    options = parser.parse_args()
    parameters = options.parameters
    tool = options.tool
    genome = options.genome
    # Results live at ../Tools/<tool>/<tool>_<genome>[_<parameters>].csv
    results_path = '../Tools/' + tool + '/' + tool + '_' + genome
    if parameters:
        results_path += '_' + parameters
    results_path += '.csv'
    # Context manager so the results file is closed even if the analysis fails.
    with open(results_path) as results_file:
        result_compare(genome, results_file)
|
|
@@ -0,0 +1,286 @@
|
|
|
1
|
+
import copy
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import collections
|
|
5
|
+
|
|
6
|
+
from ORForise.src.ORForise.utils import *
|
|
7
|
+
|
|
8
|
+
# Command-line interface of the result-file analysis script.
parser = argparse.ArgumentParser()
parser.add_argument('-t', '--tool', required=True, help='Which tool to compare?')
parser.add_argument('-p', '--parameters', required=False, help='Optional parameters for prediction tool.')
parser.add_argument('-g', '--genome', required=True, help='Which genome to analyse?')
# Parse sys.argv only when run as a script; doing it unconditionally made any
# import of this module exit with a usage error.
args = parser.parse_args() if __name__ == "__main__" else None
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def gc_count(dna):
    """Return the N percentage and the GC percentage of *dna*.

    Only the upper-case characters ``A``, ``C``, ``G``, ``T`` and ``N`` are
    counted; all other characters are ignored.

    Returns:
        ``(n_per, gc_content)`` where ``n_per`` is a float percentage of
        ``N`` characters and ``gc_content`` is the GC percentage as a string
        with two decimals.  When none of the counted characters occur (for
        example an empty sequence) ``(0.0, '0.00')`` is returned instead of
        raising ``ZeroDivisionError``.
    """
    gc = dna.count('G') + dna.count('C')
    n = dna.count('N')
    total = gc + n + dna.count('A') + dna.count('T')
    if total == 0:
        return 0.0, format(0, '.2f')
    gc_content = format(gc * 100 / total, '.2f')
    n_per = n * 100 / total
    return n_per, gc_content
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def start_Codon_Count(orfs):
    """Summarise the start codons of the records in *orfs*.

    Args:
        orfs: mapping whose values are sequences with the start codon stored
            as the second-to-last element.

    Returns:
        ``(atg, gtg, ttg, att, ctg, other, other_Starts)``: six percentage
        strings (two decimals) followed by the list of every start codon that
        is not one of the five named ones.  An empty mapping yields ``'0.00'``
        for every percentage instead of raising ``ZeroDivisionError``.
    """
    named = ('ATG', 'GTG', 'TTG', 'ATT', 'CTG')
    counts = dict.fromkeys(named, 0)
    other_Starts = []
    for orf in orfs.values():
        codon = orf[-2]
        if codon in counts:
            counts[codon] += 1
        else:
            other_Starts.append(codon)

    total = len(orfs)

    def percent(count):
        # Guard the empty case so the caller always receives strings.
        return format(100 * count / total, '.2f') if total else format(0, '.2f')

    return (percent(counts['ATG']), percent(counts['GTG']), percent(counts['TTG']),
            percent(counts['ATT']), percent(counts['CTG']),
            percent(len(other_Starts)), other_Starts)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def stop_Codon_Count(orfs):
    """Summarise the stop codons of the records in *orfs*.

    Args:
        orfs: mapping whose values are sequences with the stop codon stored
            as the last element.

    Returns:
        ``(tag, taa, tga, other, other_Stops)``: four percentage strings (two
        decimals) followed by the list of every stop codon that is not TAG,
        TAA or TGA.  An empty mapping yields ``'0.00'`` for every percentage
        instead of raising ``ZeroDivisionError``.
    """
    counts = dict.fromkeys(('TAG', 'TAA', 'TGA'), 0)
    other_Stops = []
    for orf in orfs.values():
        codon = orf[-1]
        if codon in counts:
            counts[codon] += 1
        else:
            other_Stops.append(codon)

    total = len(orfs)

    def percent(count):
        # Guard the empty case so the caller always receives strings.
        return format(100 * count / total, '.2f') if total else format(0, '.2f')

    return (percent(counts['TAG']), percent(counts['TAA']), percent(counts['TGA']),
            percent(len(other_Stops)), other_Stops)
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def detail_transfer(genes, missed_genes):
    """Copy three fields from ``genes`` into the matching missed-gene records.

    For each key of *missed_genes* also present in *genes*, elements 2, 3 and
    4 of the ``genes`` entry are inserted at indices 2, 3 and 4 of the missed
    record, in place.  Keys absent from *genes* are skipped.

    Returns:
        The *missed_genes* mapping that was passed in.
    """
    for position in missed_genes:
        try:
            source = genes[position]
            third, fourth, fifth = source[2], source[3], source[4]
        except KeyError:
            # Gene not known to the annotation: leave the record as it is.
            continue
        record = missed_genes[position]
        record.insert(2, third)
        record.insert(3, fourth)
        record.insert(4, fifth)
    return missed_genes
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def get_genome(genome):
    """Return the sequence stored in ``../Genomes/<genome>.fa`` as one string.

    Lines starting with ``>`` are skipped and the newline is removed from
    every remaining line before the pieces are joined.
    """
    # A separate name for the handle avoids shadowing the ``genome`` argument,
    # and a single join replaces the per-line ``+=`` concatenation.
    with open('../Genomes/' + genome + '.fa', 'r') as genome_fasta:
        return ''.join(
            line.replace("\n", "")
            for line in genome_fasta
            if not line.startswith('>')
        )
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def missed_genes_in(genes_detected, missed_genes, results_in):
    """Read the ``Undetected_Genes:`` section and drop those genes from *genes_detected*.

    Lines before the ``Undetected_Genes:`` marker are skipped.  After it,
    each ``>``-prefixed header (split on ``_``; fields 1 and 2 form the key
    ``'<start>_<stop>'``) is followed by a sequence line.  The first blank
    line ends the section.

    Args:
        genes_detected: mapping keyed by ``'<start>_<stop>'``; every key that
            was read as missed is removed from it in place.
        missed_genes: mapping updated in place with
            ``[sequence, strand, length, start_codon, stop_codon]`` per key.
        results_in: iterable of text lines.

    Returns:
        ``(missed_genes, genes_detected)``.

    The strand is taken from the last ``_``-separated field of the header.
    It used to be read after ``entry`` had been rebound to the position
    string, which yielded the last digit of the stop coordinate instead.
    """
    in_section = False
    entry = strand = None
    for raw in results_in:
        line = raw.strip()
        if in_section:
            if not line:
                break
            if line.startswith('>'):
                fields = line.split('_')
                entry = fields[1] + '_' + fields[2]
                last = fields[-1]
                if last in ('+', '-'):
                    strand = last
                else:
                    # presumably a frame index where 0-2 means the forward
                    # strand -- TODO confirm against the results-file writer
                    try:
                        strand = '+' if int(last) <= 2 else '-'
                    except ValueError:
                        strand = last
            elif entry is not None:
                # Sequence line of the most recent header.
                missed_genes.update({entry: [line, strand, len(line), line[0:3], line[-3:]]})
        elif line.startswith('Undetected_Genes:'):
            in_section = True

    # Anything reported as missed cannot also count as detected.
    for key in list(missed_genes.keys()):
        genes_detected.pop(key, None)

    return missed_genes, genes_detected
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def partial_matches_in(partial_matches, results_in):
    """Read the ``Partial_Gene_Hits:`` section of a results file.

    Lines before the ``Partial_Gene_Hits:`` marker are skipped.  After it, a
    ``Gene:<start>_<stop>_<strand>`` header and an ``ORF:<start>_<stop>...``
    header are each followed by a sequence line, and a blank line completes
    the record.

    Args:
        partial_matches: mapping updated in place, keyed by the gene position
            ``'<start>_<stop>'`` with values
            ``[strand, gene_length, gene_seq, orf_pos, orf_length, orf_seq]``.
        results_in: iterable of text lines.

    Returns:
        The *partial_matches* mapping.

    A record is stored only when all its parts have been seen, so a blank
    line ahead of the first record no longer raises ``NameError``.  The last
    record is stored even without a trailing blank line, and lines that
    follow a completed record without a new header are ignored.
    """
    from itertools import chain

    in_section = False
    prev = ''
    g_Pos = strand = g_Seq = o_Pos = o_Seq = None

    # The extra '' acts as a final blank line so the last record is flushed.
    for raw in chain(results_in, ['']):
        line = raw.strip()
        if not in_section:
            in_section = line.startswith('Partial_Gene_Hits:')
            continue
        if line.startswith('Gene:'):
            entry = line.replace('Gene:', '').split('_')
            g_Pos = entry[0] + '_' + entry[1]
            strand = entry[2]
            prev = 'Gene'
        elif line.startswith('ORF:'):
            entry = line.replace('ORF:', '').split('_')
            o_Pos = entry[0] + '_' + entry[1]
            prev = 'ORF'
        elif line:
            # A sequence line belongs to whichever header came last.
            if prev == 'Gene':
                g_Seq = line
            elif prev == 'ORF':
                o_Seq = line
        elif None not in (g_Pos, g_Seq, o_Pos, o_Seq):
            partial_matches.update(
                {g_Pos: [strand, len(g_Seq), g_Seq, o_Pos, len(o_Seq), o_Seq]})
            # Start afresh so later stray lines cannot alter this record.
            g_Pos = strand = g_Seq = o_Pos = o_Seq = None
            prev = ''

    return partial_matches
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def unmatched_ORFs_in(unmatched_ORFs, results_file):
    """Read the ``ORF_Without_Corresponding_Gene_in_Ensembl:`` section.

    Lines before the marker are skipped.  After it, each ``>``-prefixed
    header (split on ``_``; fields 1 and 2 form the key ``'<start>_<stop>'``
    and the last field is stored as the strand) is followed by a sequence
    line.  The first blank line ends the section.

    Args:
        unmatched_ORFs: mapping updated in place.  A header alone stores
            ``[strand, None, None, None, None]``; its sequence line replaces
            that with ``[strand, length, sequence, start_codon, stop_codon]``.
        results_file: iterable of text lines.

    Returns:
        The *unmatched_ORFs* mapping.

    An empty section no longer raises ``NameError``.  The terminating blank
    line no longer rewrites the last key, which used to copy the previous
    record's sequence into a header that had no sequence line of its own.
    """
    in_section = False
    o_Pos = strand = None
    for raw in results_file:
        line = raw.strip()
        if not in_section:
            in_section = line.startswith('ORF_Without_Corresponding_Gene_in_Ensembl:')
            continue
        if not line:
            break
        if line.startswith('>'):
            entry = line.replace('Gene:', '').split('_')
            strand = entry[-1]
            o_Pos = entry[1] + '_' + entry[2]
            # Placeholder until the sequence line arrives.
            unmatched_ORFs.update({o_Pos: [strand, None, None, None, None]})
        elif o_Pos is not None:
            unmatched_ORFs.update({o_Pos: [strand, len(line), line, line[0:3], line[-3:]]})

    return unmatched_ORFs
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
def genes_in(genome, genome_Seq, genome_Seq_Rev, genome_Size, genes):
    """Collect the CDS features of ``../Genomes/<genome>.gff`` into *genes*.

    Args:
        genome: genome name used to build the GFF path.
        genome_Seq: forward-strand sequence.
        genome_Seq_Rev: sequence that '-' strand features are sliced from.
        genome_Size: value that '-' strand coordinates are subtracted from;
            presumably ``len(genome_Seq)`` -- verify against the caller.
        genes: mapping updated in place, keyed by ``'<start>_<stop>'`` with
            values ``[sequence, strand, length, start_codon, stop_codon]``.

    Returns:
        The *genes* mapping.

    Only tab-separated rows with exactly nine columns whose third column
    contains ``CDS`` are used.  Rows whose strand column contains neither
    ``+`` nor ``-`` are skipped; previously they reused the sequence of the
    preceding feature or raised ``NameError``.
    """
    with open('../Genomes/' + genome + '.gff', 'r') as genome_gff:
        for record in genome_gff:
            fields = record.split('\t')
            if len(fields) != 9 or "CDS" not in fields[2]:
                continue
            start = int(fields[3])
            stop = int(fields[4])
            strand = fields[6]
            if '+' in strand:
                seq = genome_Seq[start - 1:stop]
            elif '-' in strand:
                # Mirror the coordinates onto the reverse sequence.
                r_Start = genome_Size - stop
                r_Stop = genome_Size - start
                seq = genome_Seq_Rev[r_Start:r_Stop + 1]
            else:
                continue
            # NOTE(review): this is one less than len(seq); left unchanged in
            # case downstream figures rely on it -- confirm.
            length = stop - start
            gene = str(start) + '_' + str(stop)
            genes.update({gene: [seq, strand, length, seq[0:3], seq[-3:]]})
    return genes
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
def extract_results(genome, results_file):
    """Load the annotation and every section of one results file.

    Args:
        genome: genome name passed to ``get_genome`` and ``genes_in``.
        results_file: seekable open results file; it is rewound to the start
            after each of the three section readers has consumed it.

    Returns:
        ``(genes, genes_detected, missed_genes, partial_matches,
        unmatched_ORFs)``, each an ordered mapping.  ``genes_detected`` is a
        deep copy of ``genes`` from which ``missed_genes_in`` removes the
        missed genes.
    """
    forward = get_genome(genome)
    reverse = revCompIterative(forward)

    genes = genes_in(genome, forward, reverse, len(forward), collections.OrderedDict())

    # Work on a copy so that ``genes`` keeps the full annotation.
    missed_genes, genes_detected = missed_genes_in(
        copy.deepcopy(genes), collections.OrderedDict(), results_file)
    results_file.seek(0, 0)  # rewind for the next reader

    partial_matches = partial_matches_in(collections.OrderedDict(), results_file)
    results_file.seek(0, 0)

    unmatched_ORFs = unmatched_ORFs_in(collections.OrderedDict(), results_file)
    results_file.seek(0, 0)

    return genes, genes_detected, missed_genes, partial_matches, unmatched_ORFs
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
if __name__ == "__main__":
    import numpy as np

    options = parser.parse_args()
    parameters = options.parameters
    tool = options.tool
    genome = options.genome
    # Results live at ../Tools/<tool>/<tool>_<genome>[_<parameters>].csv
    results_path = '../Tools/' + tool + '/' + tool + '_' + genome
    if parameters:
        results_path += '_' + parameters
    results_path += '.csv'

    # Context manager so the results file is closed once it has been parsed.
    with open(results_path) as results_file:
        genes, genes_detected, missed_genes, partial_matches, unmatched_ORFs = extract_results(genome, results_file)

    def _span_lengths(records):
        """Return ``stop - start`` for every ``'<start>_<stop>'`` key."""
        lengths = []
        for pos in records:
            parts = pos.split('_')
            lengths.append(int(parts[1]) - int(parts[0]))
        return lengths

    gene_Lengths = _span_lengths(genes)
    genes_detected_Lengths = _span_lengths(genes_detected)
    partial_Lengths = _span_lengths(partial_matches)
    missed_Lengths = _span_lengths(missed_genes)
    unmatched_Lengths = _span_lengths(unmatched_ORFs)

    print(len(gene_Lengths))
    print(gene_Lengths)
    print(np.mean(gene_Lengths))
    print(len(partial_Lengths))
    print(partial_Lengths)
    print(len(missed_Lengths))
    print(missed_Lengths)
    print(len(unmatched_Lengths))
    print(unmatched_Lengths)
    print(np.mean(unmatched_Lengths))
|
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import collections
|
|
3
|
+
|
|
4
|
+
# Command-line interface of the start-codon substitution script.
parser = argparse.ArgumentParser()
parser.add_argument('-t', '--tool', required=True, help='Which tool to compare?')
parser.add_argument('-p', '--parameters', required=False, help='Optional parameters for prediction tool.')
parser.add_argument('-g', '--genome', required=True, help='Which genome to analyse?')
# Parse sys.argv only when run as a script; an unconditional parse made every
# import of this module exit with a usage error.
args = parser.parse_args() if __name__ == "__main__" else None
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def revCompIterative(watson):
    """Return the reverse complement of the DNA string *watson*.

    The sequence is upper-cased, reversed, and each base is replaced by its
    complement (``N`` maps to ``N``).

    Raises:
        KeyError: for any character other than ``A``, ``C``, ``G``, ``T``
            or ``N``.
    """
    complements = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C', 'N': 'N'}
    # One join instead of per-character ``+=`` keeps this linear in length.
    crick = [complements[nt] for nt in watson.upper()[::-1]]
    return ''.join(crick)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def partial_gene_read(results_file, partial_genes):
    """Read the ``Partial_Gene_Hits:`` section of a results file.

    Lines before the ``Partial_Gene_Hits:`` marker are skipped.  After it, a
    ``Gene:<start>_<stop>_<strand>`` header and an ``ORF:<start>_<stop>...``
    header are each followed by a sequence line, and a blank line completes
    the record.

    Args:
        results_file: iterable of text lines (for example an open file).
        partial_genes: mapping updated in place, keyed by the gene position
            ``'<start>_<stop>'`` with values
            ``[strand, gene_seq, orf_pos, orf_seq]``.

    Returns:
        ``(partial_genes, orf_Lengths, gene_Lengths)`` where the lists hold
        the length of every ORF / gene sequence line that was read.

    A record is stored only when all its parts have been seen, so a blank
    line ahead of the first record no longer raises ``NameError``.  The last
    record is stored even without a trailing blank line, and lines that
    follow a completed record without a new header are ignored.
    """
    from itertools import chain

    orf_Lengths = []
    gene_Lengths = []
    in_section = False
    prev = ''
    g_Pos = strand = g_Seq = o_Pos = o_Seq = None

    # The extra '' acts as a final blank line so the last record is flushed.
    for raw in chain(results_file, ['']):
        line = raw.strip()
        if not in_section:
            in_section = line.startswith('Partial_Gene_Hits:')
            continue
        if line.startswith('Gene:'):
            entry = line.replace('Gene:', '').split('_')
            g_Pos = entry[0] + '_' + entry[1]
            strand = entry[2]
            prev = 'Gene'
        elif line.startswith('ORF:'):
            entry = line.replace('ORF:', '').split('_')
            o_Pos = entry[0] + '_' + entry[1]
            prev = 'ORF'
        elif line:
            # A sequence line belongs to whichever header came last.
            if prev == 'Gene':
                g_Seq = line
                gene_Lengths.append(len(g_Seq))
            elif prev == 'ORF':
                o_Seq = line
                orf_Lengths.append(len(o_Seq))
        elif None not in (g_Pos, g_Seq, o_Pos, o_Seq):
            partial_genes.update({g_Pos: [strand, g_Seq, o_Pos, o_Seq]})
            # Start afresh so later stray lines cannot alter this record.
            g_Pos = strand = g_Seq = o_Pos = o_Seq = None
            prev = ''
    return partial_genes, orf_Lengths, gene_Lengths
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def perfect_Matches(results_file, perfect_Match_Genes):
    """Read the ``Perfect_Match_Genes:`` section of a results file.

    Lines before the ``Perfect_Match_Genes:`` marker are skipped and reading
    stops at the first line starting with ``Undetected_Genes``.  In between,
    each ``>``-prefixed header (split on ``_``; fields 1 and 2 form the key
    ``'<start>_<stop>'`` and field 3 is the strand) is followed by its
    sequence line.

    Args:
        results_file: iterable of text lines.
        perfect_Match_Genes: mapping updated in place with
            ``[strand, sequence, start_codon, stop_codon]`` per key.

    Returns:
        The *perfect_Match_Genes* mapping.

    A record is stored as soon as its sequence line is read.  Storing only on
    blank lines dropped any record not followed by one (typically the last
    before ``Undetected_Genes``) and raised ``NameError`` when a blank line
    preceded the first record.
    """
    in_section = False
    pending = None  # (position, strand) of a header still awaiting its sequence
    for raw in results_file:
        line = raw.strip()
        if line.startswith('Undetected_Genes'):
            break
        if not in_section:
            in_section = line.startswith('Perfect_Match_Genes:')
            continue
        if line.startswith('>'):
            entry = line.split('_')
            pending = (entry[1] + '_' + entry[2], entry[3])
        elif line and pending is not None:
            g_Pos, strand = pending
            perfect_Match_Genes.update({g_Pos: [strand, line, line[0:3], line[-3:]]})
            pending = None
    return perfect_Match_Genes
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def detail_transfer(genes, partial_genes):
    """Splice three ``genes`` fields into the matching partial records.

    When a key of *partial_genes* is also a key of *genes*, elements 2, 3
    and 4 of the ``genes`` value are inserted at the same indices of the
    partial record (the record is modified in place).  Other keys are left
    alone.

    Returns:
        The *partial_genes* mapping that was passed in.
    """
    for position, record in partial_genes.items():
        try:
            details = genes[position]
            transferred = [details[index] for index in (2, 3, 4)]
        except KeyError:
            # Position unknown to ``genes``: nothing to add.
            continue
        # Slice assignment inserts all three values at index 2 in order.
        record[2:2] = transferred
    return partial_genes
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def result_compare(results_file, genome_file):
    """Print start-codon statistics for perfect and partial matches.

    Args:
        results_file: seekable open results file; it is read by
            ``partial_gene_read``, rewound, then read by ``perfect_Matches``.
        genome_file: genome name; ``../Genomes/<genome_file>.fa`` is opened
            and read.

    Output:
        * a count of perfect-match genes per start codon
          (ATG, GTG, TTG, CTG, anything else as ``Other``);
        * one line per partial match whose gene or ORF start codon is not one
          of those four;
        * a 5x5 matrix of gene-start versus ORF-start counts for the partial
          matches.  Columns are the gene start codon (labelled by the header
          row), rows are the ORF start codon in the order
          ATG, GTG, TTG, CTG, alternative.
    """
    # NOTE(review): the genome sequence is not used further down; the read is
    # kept so that a missing FASTA file still fails early, as before.
    with open('../Genomes/' + genome_file + '.fa', 'r') as fasta:
        genome = ''.join(line.replace("\n", "") for line in fasta if ">" not in line)

    partial_genes, orf_Lengths, gene_Lengths = partial_gene_read(
        results_file, collections.OrderedDict())
    results_file.seek(0)
    perfect_Match_Genes = perfect_Matches(results_file, collections.OrderedDict())

    perfect_Match_Gene_Start_Codons = collections.OrderedDict({'ATG': 0, 'GTG': 0, 'TTG': 0, 'CTG': 0, 'Other': 0})
    for data in perfect_Match_Genes.values():
        codon = data[2]
        bucket = codon if codon in perfect_Match_Gene_Start_Codons else 'Other'
        perfect_Match_Gene_Start_Codons[bucket] += 1
    print("Perfect Match Start Codons\nATG:" + str(perfect_Match_Gene_Start_Codons['ATG'])
          + ",GTG:" + str(perfect_Match_Gene_Start_Codons['GTG'])
          + ",TTG:" + str(perfect_Match_Gene_Start_Codons['TTG'])
          + ",CTG:" + str(perfect_Match_Gene_Start_Codons['CTG'])
          + ",Other_Start:" + str(perfect_Match_Gene_Start_Codons['Other']))

    # Keys are '<gene start>-<ORF start>'.  Generating them keeps every
    # 'Alt-<codon>' cell in the row of its own ORF codon; the hand-written
    # table had Alt-CTG, Alt-GTG and Alt-TTG in each other's rows, so those
    # counts were printed in the wrong row.
    order = ['ATG', 'GTG', 'TTG', 'CTG', 'Alt']
    start_Codon_Substitution = collections.OrderedDict(
        (gene_codon + '-' + orf_codon, 0) for orf_codon in order for gene_codon in order)

    codon_set = ['ATG', 'CTG', 'GTG', 'TTG']
    # Record layout: [strand, gene_seq, orf_pos, orf_seq].
    for data in partial_genes.values():
        gene_Start = data[1][0:3]
        orf_Start = data[3][0:3]
        if gene_Start not in codon_set:
            print('Gene_Codon_Alternative:' + str(gene_Start))
            gene_Start = 'Alt'
        if orf_Start not in codon_set:
            print('ORF_Codon_Alternative:' + str(orf_Start))
            orf_Start = 'Alt'
        start_Codon_Substitution[gene_Start + '-' + orf_Start] += 1

    # Header row first, then one row of counts per ORF start codon.
    print('ATG', 'GTG', 'TTG', 'CTG', 'Other')
    for orf_codon in order:
        print(*[start_Codon_Substitution[gene_codon + '-' + orf_codon] for gene_codon in order])
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
if __name__ == "__main__":
    options = parser.parse_args()
    parameters = options.parameters
    tool = options.tool
    genome = options.genome
    # Results live at ../Tools/<tool>/<tool>_<genome>[_<parameters>].csv
    results_path = '../Tools/' + tool + '/' + tool + '_' + genome
    if parameters:
        results_path += '_' + parameters
    results_path += '.csv'
    # Context manager so the results file is closed even if the analysis fails.
    with open(results_path) as results_file:
        result_compare(results_file, genome)
|