ORForise 1.6.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ORForise/Aggregate_Compare.py +378 -0
- ORForise/Annotation_Compare.py +317 -0
- ORForise/Annotation_Intersector.py +726 -0
- ORForise/Aux/StORF_Undetected/Completely_Undetected/Completey_Undetected.py +53 -0
- ORForise/Aux/StORF_Undetected/Completely_Undetected/__init__.py +0 -0
- ORForise/Aux/StORF_Undetected/StORF_Undetected.py +35 -0
- ORForise/Aux/StORF_Undetected/__init__.py +0 -0
- ORForise/Aux/StORF_Undetected/unvitiated_Genes/__init__.py +0 -0
- ORForise/Aux/StORF_Undetected/unvitiated_Genes/unvitiated_Missed_Genes.py +46 -0
- ORForise/Aux/TabToGFF/TabToGFF.py +140 -0
- ORForise/Aux/TabToGFF/__init__.py +0 -0
- ORForise/Aux/__init__.py +0 -0
- ORForise/Comparator.py +882 -0
- ORForise/Convert_To_GFF.py +141 -0
- ORForise/GFF_Adder.py +543 -0
- ORForise/List_Tools.py +56 -0
- ORForise/ORForise_Analysis/__init__.py +0 -0
- ORForise/ORForise_Analysis/cds_checker.py +77 -0
- ORForise/ORForise_Analysis/gene_Lenghts.py +28 -0
- ORForise/ORForise_Analysis/genome_Metrics.py +258 -0
- ORForise/ORForise_Analysis/hypothetical_gene_predictions.py +88 -0
- ORForise/ORForise_Analysis/missed_Gene_Metrics.py +277 -0
- ORForise/ORForise_Analysis/parital_Match_Analysis.py +230 -0
- ORForise/ORForise_Analysis/result_File_Analysis.py +286 -0
- ORForise/ORForise_Analysis/start_Codon_Substitution.py +161 -0
- ORForise/StORForise.py +115 -0
- ORForise/Tools/Augustus/Augustus.py +54 -0
- ORForise/Tools/Augustus/__init__.py +0 -0
- ORForise/Tools/Balrog/Balrog.py +56 -0
- ORForise/Tools/Balrog/__init__.py +0 -0
- ORForise/Tools/EasyGene/EasyGene.py +55 -0
- ORForise/Tools/EasyGene/__init__.py +0 -0
- ORForise/Tools/FGENESB/FGENESB.py +57 -0
- ORForise/Tools/FGENESB/__init__.py +0 -0
- ORForise/Tools/FragGeneScan/FragGeneScan.py +54 -0
- ORForise/Tools/FragGeneScan/__init__.py +0 -0
- ORForise/Tools/GFF/GFF.py +77 -0
- ORForise/Tools/GFF/__init__.py +0 -0
- ORForise/Tools/GLIMMER3/GLIMMER3.py +59 -0
- ORForise/Tools/GLIMMER3/__init__.py +0 -0
- ORForise/Tools/GeneMark/GeneMark.py +135 -0
- ORForise/Tools/GeneMark/__init__.py +0 -0
- ORForise/Tools/GeneMarkHA/GeneMarkHA.py +54 -0
- ORForise/Tools/GeneMarkHA/__init__.py +0 -0
- ORForise/Tools/GeneMarkHMM/GeneMarkHMM.py +55 -0
- ORForise/Tools/GeneMarkHMM/__init__.py +0 -0
- ORForise/Tools/GeneMarkS/GeneMarkS.py +54 -0
- ORForise/Tools/GeneMarkS/__init__.py +0 -0
- ORForise/Tools/GeneMarkS2/GeneMarkS2.py +55 -0
- ORForise/Tools/GeneMarkS2/__init__.py +0 -0
- ORForise/Tools/MetaGene/MetaGene.py +54 -0
- ORForise/Tools/MetaGene/__init__.py +0 -0
- ORForise/Tools/MetaGeneAnnotator/MetaGeneAnnotator.py +55 -0
- ORForise/Tools/MetaGeneAnnotator/__init__.py +0 -0
- ORForise/Tools/MetaGeneMark/MetaGeneMark.py +55 -0
- ORForise/Tools/MetaGeneMark/__init__.py +0 -0
- ORForise/Tools/Prodigal/Prodigal.py +55 -0
- ORForise/Tools/Prodigal/__init__.py +0 -0
- ORForise/Tools/Prokka/Prokka.py +57 -0
- ORForise/Tools/Prokka/__init__.py +0 -0
- ORForise/Tools/StORF-Reporter/StORF-Reporter.py +56 -0
- ORForise/Tools/StORF-Reporter/__init__.py +0 -0
- ORForise/Tools/TransDecoder/TransDecoder.py +54 -0
- ORForise/Tools/TransDecoder/__init__.py +0 -0
- ORForise/Tools/__init__.py +0 -0
- ORForise/__init__.py +0 -0
- ORForise/utils.py +236 -0
- orforise-1.6.2.dist-info/METADATA +1038 -0
- orforise-1.6.2.dist-info/RECORD +73 -0
- orforise-1.6.2.dist-info/WHEEL +5 -0
- orforise-1.6.2.dist-info/entry_points.txt +15 -0
- orforise-1.6.2.dist-info/licenses/LICENSE +624 -0
- orforise-1.6.2.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import numpy as np
|
|
3
|
+
|
|
4
|
+
# Command-line interface.  Building the parser has no side effects; argv is
# only parsed when the file is executed as a script (see the bottom of the
# file), so importing this module never consumes the host program's arguments.
parser = argparse.ArgumentParser()
parser.add_argument('-g', '--genome_to_compare', default='', help='Which genome to analyse?')


def genome_Lengths(genome_to_compare, genomes_dir='../Genomes/'):
    """Print the number, median length and individual lengths of CDS records.

    Reads ``<genomes_dir>/<genome_to_compare>.gff`` and measures every
    tab-separated record that has exactly nine columns and whose third column
    contains ``CDS``.  Every other line (comments, directives, other feature
    types, malformed rows) is ignored.

    Args:
        genome_to_compare: Genome name; ``.gff`` is appended to build the
            file name.
        genomes_dir: Directory holding the GFF file.  Defaults to the
            ``../Genomes/`` location this function always used.

    Raises:
        OSError: If the GFF file cannot be opened.
        ValueError: If a selected record has a non-integer start or stop.
    """
    import os  # local import: this script has no file-level ``import os``

    lengths = []
    with open(os.path.join(genomes_dir, genome_to_compare + '.gff'), 'r') as genome_gff:
        for record in genome_gff:
            fields = record.split('\t')
            if len(fields) != 9 or "CDS" not in fields[2]:
                continue
            # Length is reported as stop - start (columns 5 and 4), i.e. one
            # less than the inclusive span of the two coordinates.
            lengths.append(int(fields[4]) - int(fields[3]))

    print("Number of Genes: " + str(len(lengths)) +
          '\tMedian Length of Genes: ' + str(np.median(lengths)) + '\nGenes Lengths:\n' + str(lengths))


if __name__ == "__main__":
    args = parser.parse_args()
    genome_Lengths(**vars(args))
|
|
@@ -0,0 +1,258 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import numpy as np
|
|
3
|
+
import os
|
|
4
|
+
|
|
5
|
+
try:
|
|
6
|
+
from ORForise.src.ORForise.utils import * # local file
|
|
7
|
+
except ImportError:
|
|
8
|
+
from ORForise.utils import *
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def start_Codon_Count(start_Codons):
    """Report how often each tracked start codon occurs, as percentages.

    Args:
        start_Codons: Sequence of codon strings, one per gene.

    Returns:
        A 7-tuple ``(ATG, GTG, TTG, ATT, CTG, other, other_Starts)``.  The
        first six items are percentages of ``len(start_Codons)`` formatted
        with two decimals; ``other_Starts`` lists, in input order, every codon
        that is not one of the five tracked ones.  An empty input yields
        ``'0.00'`` for every percentage instead of dividing by zero.
    """
    tracked = ('ATG', 'GTG', 'TTG', 'ATT', 'CTG')
    tallies = dict.fromkeys(tracked, 0)
    other_Starts = []
    for codon in start_Codons:
        if codon in tallies:
            tallies[codon] += 1
        else:
            other_Starts.append(codon)

    total = len(start_Codons)

    def as_percentage(count):
        # Guard the empty case: there is nothing to take a share of.
        return format(100 * count / total, '.2f') if total else '0.00'

    return (
        as_percentage(tallies['ATG']),
        as_percentage(tallies['GTG']),
        as_percentage(tallies['TTG']),
        as_percentage(tallies['ATT']),
        as_percentage(tallies['CTG']),
        as_percentage(len(other_Starts)),
        other_Starts,
    )
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def stop_Codon_Count(stop_Codons):
    """Report how often each tracked stop codon occurs, as percentages.

    Args:
        stop_Codons: Sequence of codon strings, one per gene.

    Returns:
        A 5-tuple ``(TAG, TAA, TGA, other, other_Stops)``.  The first four
        items are percentages of ``len(stop_Codons)`` formatted with two
        decimals; ``other_Stops`` lists, in input order, every codon that is
        not TAG, TAA or TGA.  An empty input yields ``'0.00'`` for every
        percentage instead of dividing by zero.
    """
    tracked = ('TAG', 'TAA', 'TGA')
    tallies = dict.fromkeys(tracked, 0)
    other_Stops = []
    for codon in stop_Codons:
        if codon in tallies:
            tallies[codon] += 1
        else:
            other_Stops.append(codon)

    total = len(stop_Codons)

    def as_percentage(count):
        # Guard the empty case: there is nothing to take a share of.
        return format(100 * count / total, '.2f') if total else '0.00'

    return (
        as_percentage(tallies['TAG']),
        as_percentage(tallies['TAA']),
        as_percentage(tallies['TGA']),
        as_percentage(len(other_Stops)),
        other_Stops,
    )
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def gc_count(dna):
    """Return the N percentage and GC percentage of a nucleotide sequence.

    Only A, C, G, T and N are counted; any other character is left out of
    both the numerators and the denominator.  Counting is case-insensitive so
    that lower-case (e.g. soft-masked) sequence is measured the same way as
    the upper-cased sequence produced by ``revCompIterative``.

    Args:
        dna: Nucleotide sequence as a string.

    Returns:
        ``(n_per, gc_content)`` as floats in the range 0-100.  Both are 0.0
        when the sequence contains none of the counted characters (including
        the empty sequence), rather than raising ZeroDivisionError.
    """
    sequence = "".join(dna).upper()
    c = sequence.count("C")
    g = sequence.count("G")
    a = sequence.count("A")
    t = sequence.count("T")
    n = sequence.count("N")

    counted = a + t + g + c + n
    if not counted:
        return 0.0, 0.0

    gc_content = (g + c) * 100 / counted
    n_per = n * 100 / counted
    return n_per, gc_content
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def revCompIterative(watson):
    """Return the upper-case reverse complement of a nucleotide sequence.

    Besides A, C, G, T and N, the IUPAC ambiguity codes are complemented
    (R<->Y, K<->M, B<->V, D<->H; S and W map to themselves), so a genome that
    contains them no longer aborts the run.

    Args:
        watson: Nucleotide sequence; case is ignored.

    Returns:
        The reverse-complemented sequence in upper case.

    Raises:
        KeyError: If the sequence contains a character that is not a
            recognised nucleotide code.
    """
    complements = {
        'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C', 'N': 'N',
        'R': 'Y', 'Y': 'R', 'S': 'S', 'W': 'W', 'K': 'M', 'M': 'K',
        'B': 'V', 'V': 'B', 'D': 'H', 'H': 'D',
    }
    # One join instead of repeated string concatenation.
    return "".join(complements[nt] for nt in reversed(watson.upper()))
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def genome_Metrics(fasta_in, gff_in, output_file):
    """Compute summary metrics for a genome and write them to ``output_file``.

    The FASTA file is read as one sequence (all non-header lines are
    concatenated).  From the GFF file, nine-column records whose type column
    contains ``CDS`` are treated as protein-coding genes; other records whose
    attribute column contains ``ID=gene`` but not ``biotype=protein_coding``
    are treated as non-protein-coding genes.  The report covers gene counts
    and lengths, strand counts, GC content, overlaps between consecutive
    CDS records, genome coverage and start/stop codon usage.

    Args:
        fasta_in: Path to the genome FASTA file.
        gff_in: Path to the GFF annotation for that genome.
        output_file: Path the text report is written to (overwritten).

    Raises:
        OSError: If any of the files cannot be opened.
        ValueError: If a selected GFF record has non-integer coordinates.

    NOTE(review): this body was rebuilt from a listing that had lost its
    indentation; the nesting of the overlap bookkeeping in particular should
    be confirmed against the packaged file.
    """
    import collections  # the file itself only receives this name via a star import

    genome_name = os.path.splitext(os.path.basename(fasta_in))[0]

    # Upper-case once so the forward strand is handled exactly like the
    # reverse strand, which revCompIterative always upper-cases.
    with open(fasta_in, 'r') as genome:
        genome_Seq = "".join(
            line.replace("\n", "") for line in genome if not line.startswith('>')
        ).upper()

    _, genome_GC = gc_count(genome_Seq)

    genome_Rev = revCompIterative(genome_Seq)
    genome_Size = len(genome_Seq)
    # One flag per genome position; set to 1 where a feature covers it.
    coding_Regions = np.zeros(genome_Size, dtype=int)
    non_Coding_Regions = np.zeros(genome_Size, dtype=int)
    all_gene_Regions = np.zeros(genome_Size, dtype=int)
    protein_coding_genes = []  # (start, stop, strand) in file order
    non_protein_coding_count = 0
    short_PCG_Count = 0
    strands = collections.Counter()
    lengths_PCG, gene_Pos_Olap, gene_Neg_Olap, pcg_GC = [], [], [], []
    prev_Gene_Stop = 0
    prev_Gene_Overlapped = False

    with open(gff_in, 'r') as genome_gff:
        for line in genome_gff:
            line = line.split('\t')
            try:
                if "CDS" in line[2] and len(line) == 9:
                    start = int(line[3])
                    stop = int(line[4])
                    strand = line[6]
                    length = stop - start
                    all_gene_Regions[start - 1:stop] = 1
                    coding_Regions[start - 1:stop] = 1
                    strands[strand] += 1
                    lengths_PCG.append(length)
                    protein_coding_genes.append((start, stop, strand))
                    # NOTE(review): a strand that is neither '+' nor '-' leaves
                    # ``seq`` unset (or stale from the previous record).
                    if '+' in strand:
                        seq = genome_Seq[start - 1:stop]
                    elif '-' in strand:
                        # Same span expressed in reverse-complement coordinates.
                        r_Start = genome_Size - stop
                        r_Stop = genome_Size - start
                        seq = genome_Rev[r_Start:r_Stop + 1]
                    if length < SHORT_ORF_LENGTH:
                        short_PCG_Count += 1
                    _, gc = gc_count(seq)
                    pcg_GC.append(gc)
                    # Overlap with the previous CDS, recorded per strand.
                    if prev_Gene_Stop > start:
                        if '+' in strand:
                            gene_Pos_Olap.append(prev_Gene_Stop - start)
                        elif '-' in strand:
                            gene_Neg_Olap.append(prev_Gene_Stop - start)
                        prev_Gene_Overlapped = True
                    elif prev_Gene_Stop < start:
                        # A run of overlapping genes just ended: record a 0.
                        if prev_Gene_Overlapped:
                            if '+' in strand:
                                gene_Pos_Olap.append(0)
                            elif '-' in strand:
                                gene_Neg_Olap.append(0)
                        prev_Gene_Overlapped = False
                    prev_Gene_Stop = stop
                elif "ID=gene" in line[8]:
                    if "biotype=protein_coding" not in line[8]:
                        start = int(line[3])
                        stop = int(line[4])
                        strand = line[6]
                        all_gene_Regions[start - 1:stop] = 1
                        non_Coding_Regions[start - 1:stop] = 1
                        non_protein_coding_count += 1
            except IndexError:
                # Comment lines and rows with too few columns are skipped.
                continue

    if prev_Gene_Overlapped:  # If last has a prev overlap, count it
        if '+' in strand:
            gene_Pos_Olap.append(0)
        elif '-' in strand:
            gene_Neg_Olap.append(0)

    median_PCG = np.median(lengths_PCG)
    gene_Overlaps = gene_Neg_Olap + gene_Pos_Olap
    median_PCG_Olap = np.median(gene_Overlaps)
    # default=0 keeps annotations without overlaps (or without CDS records)
    # from raising ValueError here.
    longest_Olap = max(gene_Overlaps, default=0)
    shortest_PCG = min(lengths_PCG, default=0)
    longest_PCG = max(lengths_PCG, default=0)
    coding_Percentage = 100 * float(np.count_nonzero(coding_Regions)) / float(genome_Size)
    non_coding_Percentage = 100 * float(np.count_nonzero(non_Coding_Regions)) / float(genome_Size)
    all_gene_Percentage = 100 * float(np.count_nonzero(all_gene_Regions)) / float(genome_Size)

    start_Codons, stop_Codons = [], []
    for start, stop, strand in protein_coding_genes:
        if '-' in strand:
            r_start = genome_Size - stop
            r_stop = genome_Size - start
            start_Codons.append(genome_Rev[r_start:r_start + 3])
            stop_Codons.append(genome_Rev[r_stop - 2:r_stop + 1])
        elif '+' in strand:
            start_Codons.append(genome_Seq[start - 1:start + 2])
            stop_Codons.append(genome_Seq[stop - 3:stop])

    atg_P, gtg_P, ttg_P, att_P, ctg_P, other_Start_P, other_Starts = start_Codon_Count(start_Codons)
    tag_P, taa_P, tga_P, other_Stop_P, other_Stops = stop_Codon_Count(stop_Codons)

    output = "".join([
        "Number of Protein Coding Genes in ", genome_name, " : ", str(len(lengths_PCG)),
        " ,Median Length of PCGs: ", str(median_PCG),
        ", Min Length of PCGs: ", str(shortest_PCG),
        ", Max Length of PCGs: ", str(longest_PCG),
        ", Number of PCGs on Pos Strand: ", str(strands['+']),
        ", Number of PCGs on Neg Strand: ", str(strands['-']),
        "\nGenome-Wide GC Content: ", format(genome_GC, '.2f'),
        # The leading ", " was missing, which fused this field to the previous value.
        ", Median GC of PCGs: ", format(np.median(pcg_GC), '.2f'),
        ", Number of Overlapping PCGs: ", str(len(gene_Overlaps)),
        ", Longest PCG Overlap: ", str(longest_Olap),
        ", Median PCG Overlap: ", str(median_PCG_Olap),
        ", Number of PCGs less than 100 amino acids: ", str(short_PCG_Count),
        "\nPercentage of Genome which is Protein Coding: ", format(coding_Percentage, '.2f'),
        ", Number of Non-PCGs: ", str(non_protein_coding_count),
        ", Percentage of Genome Non-PCG: ", format(non_coding_Percentage, '.2f'),
        ", Percentage of All Genes in Genome: ", format(all_gene_Percentage, '.2f'),
        "\nPercentage of Genes starting with ATG: ", atg_P,
        "\nPercentage of Genes starting with GTG: ", gtg_P,
        "\nPercentage of Genes starting with TTG: ", ttg_P,
        "\nPercentage of Genes starting with ATT: ", att_P,
        "\nPercentage of Genes starting with CTG: ", ctg_P,
        "\nPercentage of Genes starting with Alternative Start Codon: ", other_Start_P,
        "\nPercentage of Genes ending with TAG: ", tag_P,
        "\nPercentage of Genes ending with TAA: ", taa_P,
        "\nPercentage of Genes ending with TGA: ", tga_P,
        "\nPercentage of Genes ending with Alternative Stop Codon: ", other_Stop_P,
    ])

    with open(output_file, 'w') as out_file:
        out_file.write('Genome Metrics:\n')
        out_file.write(output + '\n')
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
def main():
    """Read the three required command-line options and run genome_Metrics."""
    cli = argparse.ArgumentParser(description="...")
    # (flag, destination attribute, help text) for each required option.
    required_options = (
        ("-f", 'fasta_in', "Input FASTA file"),
        ("-g", 'gff_in', "Corresponding GFF file to FASTA"),
        ("-o", 'output_file', "Output metrics file"),
    )
    for flag, destination, help_text in required_options:
        cli.add_argument(flag, dest=destination, required=True, help=help_text)

    parsed = cli.parse_args()
    genome_Metrics(parsed.fasta_in, parsed.gff_in, parsed.output_file)


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import collections
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
# Command-line interface.  argv is parsed only when the file runs as a script
# (bottom of the file); parsing at import time made the module unimportable
# because both options are required.
parser = argparse.ArgumentParser()
parser.add_argument('-ref', '--reference_annotation', required=True,
                    help='Which reference annotation file to use as reference?')
parser.add_argument('-tp', '--tool_prediction', required=True, help='Tool genome prediction file')


def main(reference_annotation,tool_prediction):
    """Tally how hypothetical reference genes fared in a tool's result file.

    Nine-column GFF records whose type column contains ``gene`` are indexed by
    ``"<start>_<stop>"`` and labelled hypothetical when their attribute column
    contains the word ``hypothetical``.  The result file is then scanned
    section by section (``Perfect_Match_Genes:``, ``Partial_Match_Genes:``,
    ``Missed_Genes:``), stopping at the
    ``ORF_Without_Corresponding_Gene_in_Ensembl`` line.  The position of every
    entry that maps to a hypothetical reference gene is printed.

    Args:
        reference_annotation: Path to the reference GFF file.
        tool_prediction: Path to the comparison result file.

    Returns:
        ``(perfect, partial, missed)``: the number of hypothetical genes found
        in each section.  These tallies were previously computed and discarded.

    Raises:
        OSError: If either file cannot be opened.
        IndexError: If an entry header has too few ``_``-separated fields.

    NOTE(review): rebuilt from a listing that had lost its indentation; the
    placement of the duplicate check ("WE") inside the hypothetical branch is
    an inference to confirm.
    """
    ref_genes = collections.OrderedDict()  # Order is important
    num_hypo = 0
    with open(reference_annotation, 'r') as genome_gff:
        for record in genome_gff:
            fields = record.split('\t')
            # Have to use gene and not CDS here because the CDS tag does not
            # contain the classification.
            if len(fields) != 9 or "gene" not in fields[2]:
                continue
            gene = fields[3] + '_' + fields[4]
            if "hypothetical" in fields[8]:
                ref_genes[gene] = "hypothetical"
                num_hypo += 1
            else:
                ref_genes[gene] = "N/A"
    print("Number of hypothetic genes: "+str(num_hypo))

    tallies = {'perfect': 0, 'partial': 0, 'missed': 0}
    section = None  # section of the result file currently being read
    seen_perfect = set()
    with open(tool_prediction, 'r') as tool_in:
        for line in tool_in:
            if line.startswith("Perfect_Match_Genes:"):
                section = 'perfect'
            elif line.startswith("Partial_Match_Genes:"):
                section = 'partial'
            elif line.startswith("Missed_Genes:"):
                section = 'missed'
            elif line.startswith("ORF_Without_Corresponding_Gene_in_Ensembl"):
                break

            if section is None:
                continue
            if section == 'partial':
                # Partial matches use a different tag: "Gene:<start>_<stop>_...".
                if not line.startswith('Gene:'):
                    continue
                pieces = line.split('_')
                pos = pieces[0].split(':')[1] + '_' + pieces[1]
            else:
                # Perfect and missed entries look like "><name>_<start>_<stop>_...".
                if not line.startswith('>'):
                    continue
                pieces = line.split('_')
                pos = pieces[1] + '_' + pieces[2]

            # Positions absent from the reference are ignored.
            if "hypothetical" not in ref_genes.get(pos, ""):
                continue
            tallies[section] += 1
            print(pos)
            if section == 'perfect':
                if pos in seen_perfect:
                    print("WE")  # same position reported twice as a perfect match
                seen_perfect.add(pos)

    print("finished")
    return tallies['perfect'], tallies['partial'], tallies['missed']


if __name__ == "__main__":
    args = parser.parse_args()
    main(**vars(args))

    print("Complete")
|
|
@@ -0,0 +1,277 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import collections
|
|
3
|
+
import numpy as np
|
|
4
|
+
|
|
5
|
+
from ORForise.src.ORForise.utils import *
|
|
6
|
+
|
|
7
|
+
# Command-line options for the missed-gene analysis.
parser = argparse.ArgumentParser()
parser.add_argument('-t', '--tool', required=True, help='Which tool to compare?')
parser.add_argument('-p', '--parameters', required=False, help='Optional parameters for prediction tool.')
parser.add_argument('-g', '--genome', required=True, help='Which genome to analyse?')

# argv is deliberately not parsed here: the __main__ section at the bottom of
# the file calls parser.parse_args() itself, and parsing at import time made
# the module unimportable because -t and -g are required.
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def gc_count(dna):
    """Return the N percentage and the formatted GC percentage of a sequence.

    Only A, C, G, T and N are counted; any other character is left out of
    both the numerators and the denominator.  Counting is case-insensitive so
    that lower-case sequence is measured the same way as the upper-cased
    sequence produced by ``revCompIterative``.

    Args:
        dna: Nucleotide sequence as a string.

    Returns:
        ``(n_per, gc_content)`` where ``n_per`` is a float and ``gc_content``
        is a string with two decimals.  A sequence with none of the counted
        characters (including the empty sequence) gives ``(0.0, '0.00')``
        rather than raising ZeroDivisionError.
    """
    sequence = "".join(dna).upper()
    c = sequence.count("C")
    g = sequence.count("G")
    a = sequence.count("A")
    t = sequence.count("T")
    n = sequence.count("N")

    counted = a + t + g + c + n
    if not counted:
        return 0.0, format(0.0, '.2f')

    gc_content = format((g + c) * 100 / counted, '.2f')
    n_per = n * 100 / counted
    return n_per, gc_content
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def revCompIterative(watson):
    """Return the upper-case reverse complement of a nucleotide sequence.

    Besides A, C, G, T and N, the IUPAC ambiguity codes are complemented
    (R<->Y, K<->M, B<->V, D<->H; S and W map to themselves), so a genome that
    contains them no longer aborts the run.

    Args:
        watson: Nucleotide sequence; case is ignored.

    Returns:
        The reverse-complemented sequence in upper case.

    Raises:
        KeyError: If the sequence contains a character that is not a
            recognised nucleotide code.
    """
    complements = {
        'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C', 'N': 'N',
        'R': 'Y', 'Y': 'R', 'S': 'S', 'W': 'W', 'K': 'M', 'M': 'K',
        'B': 'V', 'V': 'B', 'D': 'H', 'H': 'D',
    }
    # One join instead of repeated string concatenation.
    return "".join(complements[nt] for nt in reversed(watson.upper()))
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def start_Codon_Count(orfs):
    """Report start-codon usage across a mapping of ORF records.

    Args:
        orfs: Mapping whose values are sequences with the start codon stored
            as the second-to-last item.

    Returns:
        A 7-tuple ``(ATG, GTG, TTG, ATT, CTG, other, other_Starts)``.  The
        first six items are percentages of ``len(orfs)`` formatted with two
        decimals; ``other_Starts`` lists every codon outside the five tracked
        ones.  An empty mapping yields ``'0.00'`` for every percentage
        instead of dividing by zero.
    """
    tracked = ('ATG', 'GTG', 'TTG', 'ATT', 'CTG')
    tallies = dict.fromkeys(tracked, 0)
    other_Starts = []
    for orf in orfs.values():
        codon = orf[-2]
        if codon in tallies:
            tallies[codon] += 1
        else:
            other_Starts.append(codon)

    total = len(orfs)

    def as_percentage(count):
        # Guard the empty case (e.g. no gene was missed).
        return format(100 * count / total, '.2f') if total else '0.00'

    return (
        as_percentage(tallies['ATG']),
        as_percentage(tallies['GTG']),
        as_percentage(tallies['TTG']),
        as_percentage(tallies['ATT']),
        as_percentage(tallies['CTG']),
        as_percentage(len(other_Starts)),
        other_Starts,
    )
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def stop_Codon_Count(orfs):
    """Report stop-codon usage across a mapping of ORF records.

    Args:
        orfs: Mapping whose values are sequences with the stop codon stored
            as the last item.

    Returns:
        A 5-tuple ``(TAG, TAA, TGA, other, other_Stops)``.  The first four
        items are percentages of ``len(orfs)`` formatted with two decimals;
        ``other_Stops`` lists every codon that is not TAG, TAA or TGA.  An
        empty mapping yields ``'0.00'`` for every percentage instead of
        dividing by zero.
    """
    tracked = ('TAG', 'TAA', 'TGA')
    tallies = dict.fromkeys(tracked, 0)
    other_Stops = []
    for orf in orfs.values():
        codon = orf[-1]
        if codon in tallies:
            tallies[codon] += 1
        else:
            other_Stops.append(codon)

    total = len(orfs)

    def as_percentage(count):
        # Guard the empty case (e.g. no gene was missed).
        return format(100 * count / total, '.2f') if total else '0.00'

    return (
        as_percentage(tallies['TAG']),
        as_percentage(tallies['TAA']),
        as_percentage(tallies['TGA']),
        as_percentage(len(other_Stops)),
        other_Stops,
    )
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def missed_gene_read(results_file, missed_genes):
    """Collect the entries listed under ``Undetected_Genes:`` in a result file.

    Lines are consumed until one starts with
    ``ORFs_Without_Corresponding_Gene_In_Ensembl_Metrics:``.  Once the
    ``Undetected_Genes:`` marker has been seen, each ``>`` header sets the
    current key (its second and third ``_``-separated fields joined by ``_``)
    and each following non-blank line is stored under that key as
    ``[sequence, length, first three characters, last three characters]``.

    Args:
        results_file: Iterable of text lines.
        missed_genes: Mapping that is filled in place.

    Returns:
        The same ``missed_genes`` mapping.
    """
    collecting = False
    for raw in results_file:
        if raw.startswith('ORFs_Without_Corresponding_Gene_In_Ensembl_Metrics:'):
            break
        text = raw.strip()
        if not collecting:
            # Nothing is recorded until the section marker has gone by.
            if text.startswith('Undetected_Genes:'):
                collecting = True
            continue
        if text.startswith('>'):
            fields = text.split('_')
            entry = fields[1] + '_' + fields[2]
        elif text:
            missed_genes[entry] = [text, len(text), text[:3], text[-3:]]

    return missed_genes
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def detail_transfer(genes, missed_genes):
    """Copy three annotation values into each missed-gene record.

    For every key of ``missed_genes`` that also exists in ``genes``, items 2,
    3 and 4 of the ``genes`` record are inserted, in that order, at positions
    2, 3 and 4 of the missed-gene record.  Keys missing from ``genes`` are
    left untouched.

    Args:
        genes: Mapping of position key to annotation record.
        missed_genes: Mapping of position key to missed-gene record (lists,
            modified in place).

    Returns:
        The same ``missed_genes`` mapping.
    """
    for position, record in missed_genes.items():
        try:
            annotated = genes[position]
            extras = [annotated[2], annotated[3], annotated[4]]
        except KeyError:
            continue
        # Splice all three values in at index 2 in a single step.
        record[2:2] = extras
    return missed_genes
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def result_compare(genome, results_file):
    """Print metrics comparing annotated genes with the genes a tool missed.

    Reads ``../Genomes/<genome>.fa`` and ``../Genomes/<genome>.gff``, takes the
    missed genes from ``results_file`` (via ``missed_gene_read``), and for
    every nine-column GFF record whose type column contains ``CDS`` records
    strand, length, GC content, overlap with the previous CDS and start/stop
    codons.  Two summaries are printed: the lengths of the missed genes and a
    report with start/stop codon usage for annotated versus missed genes.

    Args:
        genome: Genome name used to build the FASTA and GFF paths.
        results_file: Open result file (iterable of lines).

    Raises:
        OSError: If the FASTA or GFF file cannot be opened.
        ValueError: If a selected GFF record has non-integer coordinates.

    NOTE(review): rebuilt from a listing that had lost its indentation;
    confirm the nesting against the packaged file.
    NOTE(review): the first report line is labelled "Missed" but, apart from
    the strand counts, prints the values gathered for genes that were NOT
    missed.  Left as found - confirm which set is intended.
    """
    # Upper-case once so the forward strand is handled exactly like the
    # reverse strand, which revCompIterative always upper-cases.
    with open('../Genomes/' + genome + '.fa', 'r') as genome_file:
        genome_Seq = "".join(
            line.replace("\n", "") for line in genome_file if ">" not in line
        ).upper()

    missed_genes = missed_gene_read(results_file, collections.OrderedDict())
    list_MG = list(missed_genes.keys())
    # Analysis
    genome_Rev = revCompIterative(genome_Seq)
    genome_Size = len(genome_Seq)
    genes = collections.OrderedDict()
    prev_Stop = 0
    ### Record Missed and Detected Gene metrics
    genes_strand = collections.defaultdict(int)
    genes_Missed_strand = collections.defaultdict(int)
    short_PCGs, short_Missed_PCGs, pcg_GC, pcg_Missed_GC = [], [], [], []
    lengths_PCG, lengths_Missed_PCG, genes_Overlap, genes_Missed_Overlap = [], [], [], []
    with open('../Genomes/' + genome + '.gff', 'r') as genome_gff:
        for line in genome_gff:
            line = line.split('\t')
            try:
                if "CDS" in line[2] and len(line) == 9:
                    start = int(line[3])
                    stop = int(line[4])
                    strand = line[6]
                    gene = str(start) + ',' + str(stop) + ',' + strand
                    # NOTE(review): a strand other than '+'/'-' leaves ``seq``
                    # unset (or stale from the previous record).
                    if strand == '-':
                        # Same span expressed in reverse-complement coordinates.
                        r_Start = genome_Size - stop
                        r_Stop = genome_Size - start
                        seq = genome_Rev[r_Start:r_Stop + 1]
                    elif strand == '+':
                        seq = genome_Seq[start - 1:stop]
                    startCodon = seq[0:3]
                    stopCodon = seq[-3:]
                    length = stop - start
                    _, gc = gc_count(seq)
                    pos = str(start) + '_' + str(stop)

                    # Dictionary membership instead of scanning list_MG per record.
                    if pos in missed_genes:
                        strand_Counts, gc_Values, length_Values = genes_Missed_strand, pcg_Missed_GC, lengths_Missed_PCG
                        short_Values, overlap_Values = short_Missed_PCGs, genes_Missed_Overlap
                    else:
                        strand_Counts, gc_Values, length_Values = genes_strand, pcg_GC, lengths_PCG
                        short_Values, overlap_Values = short_PCGs, genes_Overlap
                    strand_Counts[strand] += 1
                    gc_Values.append(float(gc))
                    length_Values.append(length)
                    if length < SHORT_ORF_LENGTH:
                        short_Values.append(gene)
                    if prev_Stop > start:
                        overlap = prev_Stop - start
                        overlap_Values.append(overlap)
                    else:
                        overlap = 0

                    prev_Stop = stop
                    if genes:
                        # This overlap is also the previous gene's downstream overlap.
                        genes[prev_pos].insert(4, overlap)
                    genes[pos] = [strand, length, gc, overlap, seq, startCodon, stopCodon]
                    prev_pos = pos
            except IndexError:
                # Comment lines and rows with too few columns are skipped.
                continue

    missed_genes = detail_transfer(genes, missed_genes)

    missed_lengths = []
    for key in list_MG:
        ### printed out to confirm figure lengths
        key_Start, key_Stop = key.split('_')[0], key.split('_')[1]
        missed_lengths.append(int(key_Stop) - int(key_Start))
        # From here on ``genes`` holds only the genes that were not missed.
        genes.pop(key, None)

    print("Number of Genes Missed:" + str(len(missed_lengths)) + '\nLengths of Genes Missed:\n' + str(missed_lengths))

    median_PCG = np.median(lengths_PCG)
    median_PCG_Olap = np.median(genes_Overlap)
    # default=0 keeps inputs without overlaps (or without detected genes)
    # from raising ValueError here.
    longest_Olap = max(genes_Overlap, default=0)
    shortest_PCG = min(lengths_PCG, default=0)
    longest_PCG = max(lengths_PCG, default=0)
    num_overlaps = len(genes_Overlap)
    gc_median = format(np.median(pcg_GC), '.2f')
    num_Short_PCGs = len(short_PCGs)

    atg_P, gtg_P, ttg_P, att_P, ctg_P, other_Starts_P, other_Starts = start_Codon_Count(genes)
    tag_P, taa_P, tga_P, other_Stops_P, other_Stops = stop_Codon_Count(genes)
    m_atg_P, m_gtg_P, m_ttg_P, m_att_P, m_ctg_P, m_other_Starts_P, m_other_Starts = start_Codon_Count(missed_genes)
    m_tag_P, m_taa_P, m_tga_P, m_other_Stops_P, m_other_Stops = stop_Codon_Count(missed_genes)

    output = "".join([
        "Number of Missed Protein Coding Genes in ", str(genome), " : ", str(len(lengths_PCG)),
        ", Median Length of PCGs: ", str(median_PCG),
        ", Min Length of PCGs: ", str(shortest_PCG),
        ", Max Length of PCGs: ", str(longest_PCG),
        ", Number of PCGs on Pos Strand: ", str(genes_Missed_strand['+']),
        ", Number of PCGs on Neg Strand: ", str(genes_Missed_strand['-']),
        ", Median GC of PCGs: ", str(gc_median),
        ", Number of Overlapping PCGs: ", str(num_overlaps),
        ", Longest PCG Overlap: ", str(longest_Olap),
        ", Median PCG Overlap: ", str(median_PCG_Olap),
        ", Number of PCGs less than 100nt: ", str(num_Short_PCGs),
        '\nPercentage of Genes starting with ATG - Annotation/Missed: ', atg_P, ' ', m_atg_P,
        '\nPercentage of Genes starting with GTG - Annotation/Missed: ', gtg_P, ' ', m_gtg_P,
        '\nPercentage of Genes starting with TTG - Annotation/Missed: ', ttg_P, ' ', m_ttg_P,
        '\nPercentage of Genes starting with ATT - Annotation/Missed: ', att_P, ' ', m_att_P,
        '\nPercentage of Genes starting with CTG - Annotation/Missed: ', ctg_P, ' ', m_ctg_P,
        # Previously printed m_other_Stops_P here (copy-paste slip).
        '\nPercentage of Genes starting with Alternative Start Codon - Annotation/Missed: ',
        other_Starts_P, ' ', m_other_Starts_P,
        '\nPercentage of Genes ending with TAG - Annotation/Missed: ', tag_P, ' ', m_tag_P,
        '\nPercentage of Genes ending with TAA - Annotation/Missed: ', taa_P, ' ', m_taa_P,
        '\nPercentage of Genes ending with TGA - Annotation/Missed: ', tga_P, ' ', m_tga_P,
        '\nPercentage of Genes ending with Alternative Stop Codon - Annotation/Missed: ',
        other_Stops_P, ' ', m_other_Stops_P,
    ])

    print(output)
|
|
266
|
+
|
|
267
|
+
|
|
268
|
+
if __name__ == "__main__":
    options = parser.parse_args()
    parameters = options.parameters
    tool = options.tool
    genome = options.genome
    # Result files live at ../Tools/<tool>/<tool>_<genome>[_<parameters>].csv
    results_path = '../Tools/' + tool + '/' + tool + '_' + genome
    if parameters:
        results_path += '_' + parameters
    results_path += '.csv'
    # The context manager closes the file once the comparison is done; the
    # handle was previously left open.
    with open(results_path) as results_file:
        result_compare(genome, results_file)
|