ORForise 1.6.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73) hide show
  1. ORForise/Aggregate_Compare.py +378 -0
  2. ORForise/Annotation_Compare.py +317 -0
  3. ORForise/Annotation_Intersector.py +726 -0
  4. ORForise/Aux/StORF_Undetected/Completely_Undetected/Completey_Undetected.py +53 -0
  5. ORForise/Aux/StORF_Undetected/Completely_Undetected/__init__.py +0 -0
  6. ORForise/Aux/StORF_Undetected/StORF_Undetected.py +35 -0
  7. ORForise/Aux/StORF_Undetected/__init__.py +0 -0
  8. ORForise/Aux/StORF_Undetected/unvitiated_Genes/__init__.py +0 -0
  9. ORForise/Aux/StORF_Undetected/unvitiated_Genes/unvitiated_Missed_Genes.py +46 -0
  10. ORForise/Aux/TabToGFF/TabToGFF.py +140 -0
  11. ORForise/Aux/TabToGFF/__init__.py +0 -0
  12. ORForise/Aux/__init__.py +0 -0
  13. ORForise/Comparator.py +882 -0
  14. ORForise/Convert_To_GFF.py +141 -0
  15. ORForise/GFF_Adder.py +543 -0
  16. ORForise/List_Tools.py +56 -0
  17. ORForise/ORForise_Analysis/__init__.py +0 -0
  18. ORForise/ORForise_Analysis/cds_checker.py +77 -0
  19. ORForise/ORForise_Analysis/gene_Lenghts.py +28 -0
  20. ORForise/ORForise_Analysis/genome_Metrics.py +258 -0
  21. ORForise/ORForise_Analysis/hypothetical_gene_predictions.py +88 -0
  22. ORForise/ORForise_Analysis/missed_Gene_Metrics.py +277 -0
  23. ORForise/ORForise_Analysis/parital_Match_Analysis.py +230 -0
  24. ORForise/ORForise_Analysis/result_File_Analysis.py +286 -0
  25. ORForise/ORForise_Analysis/start_Codon_Substitution.py +161 -0
  26. ORForise/StORForise.py +115 -0
  27. ORForise/Tools/Augustus/Augustus.py +54 -0
  28. ORForise/Tools/Augustus/__init__.py +0 -0
  29. ORForise/Tools/Balrog/Balrog.py +56 -0
  30. ORForise/Tools/Balrog/__init__.py +0 -0
  31. ORForise/Tools/EasyGene/EasyGene.py +55 -0
  32. ORForise/Tools/EasyGene/__init__.py +0 -0
  33. ORForise/Tools/FGENESB/FGENESB.py +57 -0
  34. ORForise/Tools/FGENESB/__init__.py +0 -0
  35. ORForise/Tools/FragGeneScan/FragGeneScan.py +54 -0
  36. ORForise/Tools/FragGeneScan/__init__.py +0 -0
  37. ORForise/Tools/GFF/GFF.py +77 -0
  38. ORForise/Tools/GFF/__init__.py +0 -0
  39. ORForise/Tools/GLIMMER3/GLIMMER3.py +59 -0
  40. ORForise/Tools/GLIMMER3/__init__.py +0 -0
  41. ORForise/Tools/GeneMark/GeneMark.py +135 -0
  42. ORForise/Tools/GeneMark/__init__.py +0 -0
  43. ORForise/Tools/GeneMarkHA/GeneMarkHA.py +54 -0
  44. ORForise/Tools/GeneMarkHA/__init__.py +0 -0
  45. ORForise/Tools/GeneMarkHMM/GeneMarkHMM.py +55 -0
  46. ORForise/Tools/GeneMarkHMM/__init__.py +0 -0
  47. ORForise/Tools/GeneMarkS/GeneMarkS.py +54 -0
  48. ORForise/Tools/GeneMarkS/__init__.py +0 -0
  49. ORForise/Tools/GeneMarkS2/GeneMarkS2.py +55 -0
  50. ORForise/Tools/GeneMarkS2/__init__.py +0 -0
  51. ORForise/Tools/MetaGene/MetaGene.py +54 -0
  52. ORForise/Tools/MetaGene/__init__.py +0 -0
  53. ORForise/Tools/MetaGeneAnnotator/MetaGeneAnnotator.py +55 -0
  54. ORForise/Tools/MetaGeneAnnotator/__init__.py +0 -0
  55. ORForise/Tools/MetaGeneMark/MetaGeneMark.py +55 -0
  56. ORForise/Tools/MetaGeneMark/__init__.py +0 -0
  57. ORForise/Tools/Prodigal/Prodigal.py +55 -0
  58. ORForise/Tools/Prodigal/__init__.py +0 -0
  59. ORForise/Tools/Prokka/Prokka.py +57 -0
  60. ORForise/Tools/Prokka/__init__.py +0 -0
  61. ORForise/Tools/StORF-Reporter/StORF-Reporter.py +56 -0
  62. ORForise/Tools/StORF-Reporter/__init__.py +0 -0
  63. ORForise/Tools/TransDecoder/TransDecoder.py +54 -0
  64. ORForise/Tools/TransDecoder/__init__.py +0 -0
  65. ORForise/Tools/__init__.py +0 -0
  66. ORForise/__init__.py +0 -0
  67. ORForise/utils.py +236 -0
  68. orforise-1.6.2.dist-info/METADATA +1038 -0
  69. orforise-1.6.2.dist-info/RECORD +73 -0
  70. orforise-1.6.2.dist-info/WHEEL +5 -0
  71. orforise-1.6.2.dist-info/entry_points.txt +15 -0
  72. orforise-1.6.2.dist-info/licenses/LICENSE +624 -0
  73. orforise-1.6.2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,28 @@
1
+ import argparse
2
+ import numpy as np
3
+
4
def genome_Lengths(genome_to_compare, genomes_dir='../Genomes/'):
    """Print and return the lengths of every CDS record in a genome's GFF file.

    Args:
        genome_to_compare: Genome name; the file read is
            ``<genomes_dir>/<genome_to_compare>.gff``.
        genomes_dir: Directory holding the GFF files. Defaults to the
            relative ``../Genomes/`` location the script has always used.

    Returns:
        The list of CDS lengths in file order (also printed to stdout together
        with the gene count and the median length).

    Raises:
        OSError: If the GFF file cannot be opened.
        ValueError: If a CDS record has non-integer start/stop columns.
    """
    import os  # local import: this script's header does not import os

    lengths = []
    gff_path = os.path.join(genomes_dir, genome_to_compare + '.gff')
    with open(gff_path, 'r') as genome_gff:
        for line in genome_gff:
            fields = line.split('\t')
            # Only complete 9-column records whose type column mentions CDS count;
            # comments, directives and truncated lines are skipped.
            if len(fields) != 9 or "CDS" not in fields[2]:
                continue
            # NOTE(review): stop - start is one less than the inclusive span of
            # the two coordinates; kept as-is so reported figures do not change.
            lengths.append(int(fields[4]) - int(fields[3]))
    print("Number of Genes: " + str(len(lengths)) +
          '\tMedian Length of Genes: ' + str(np.median(lengths)) + '\nGenes Lengths:\n' + str(lengths))
    return lengths


def _parse_args(argv=None):
    """Parse the command line (``sys.argv`` when *argv* is None)."""
    parser = argparse.ArgumentParser()
    parser.add_argument('-g', '--genome_to_compare', default='', help='Which genome to analyse?')
    return parser.parse_args(argv)


if __name__ == "__main__":
    # Arguments are parsed only when run as a script, so importing this module
    # no longer consumes the host program's command line.
    genome_Lengths(**vars(_parse_args()))
@@ -0,0 +1,258 @@
1
+ import argparse
2
+ import numpy as np
3
+ import os
4
+
5
+ try:
6
+ from ORForise.src.ORForise.utils import * # local file
7
+ except ImportError:
8
+ from ORForise.utils import *
9
+
10
+
11
+
12
+
13
def start_Codon_Count(start_Codons):
    """Report how often each common start codon occurs, as percentages.

    Args:
        start_Codons: Iterable of codon strings, one per gene.

    Returns:
        A 7-tuple: the share of ATG, GTG, TTG, ATT, CTG and "other" codons,
        each formatted as a two-decimal string, followed by the list of the
        "other" codons in input order. All shares are '0.00' for empty input
        (previously this raised ZeroDivisionError).
    """
    tracked = ('ATG', 'GTG', 'TTG', 'ATT', 'CTG')
    codons = list(start_Codons)
    total = len(codons)
    other_Starts = [codon for codon in codons if codon not in tracked]

    def percent(count):
        # Guard the division so an empty gene set reports zeros instead of crashing.
        return format(100 * count / total, '.2f') if total else '0.00'

    shares = [percent(codons.count(codon)) for codon in tracked]
    return (*shares, percent(len(other_Starts)), other_Starts)
37
+
38
+
39
def stop_Codon_Count(stop_Codons):
    """Report how often each standard stop codon occurs, as percentages.

    Args:
        stop_Codons: Iterable of codon strings, one per gene.

    Returns:
        A 5-tuple: the share of TAG, TAA, TGA and "other" codons, each
        formatted as a two-decimal string, followed by the list of the "other"
        codons in input order. All shares are '0.00' for empty input
        (previously this raised ZeroDivisionError).
    """
    tracked = ('TAG', 'TAA', 'TGA')
    codons = list(stop_Codons)
    total = len(codons)
    other_Stops = [codon for codon in codons if codon not in tracked]

    def percent(count):
        # Guard the division so an empty gene set reports zeros instead of crashing.
        return format(100 * count / total, '.2f') if total else '0.00'

    shares = [percent(codons.count(codon)) for codon in tracked]
    return (*shares, percent(len(other_Stops)), other_Stops)
57
+
58
+
59
def gc_count(dna):
    """Compute the N percentage and GC percentage of a nucleotide sequence.

    Only A, C, G, T and N are counted; any other character is ignored in both
    the numerator and the denominator. Counting is case-insensitive, so
    soft-masked (lowercase) sequence is measured the same way as the
    upper-cased reverse strand produced by ``revCompIterative``.

    Args:
        dna: Nucleotide sequence (string or iterable of single characters).

    Returns:
        ``(n_per, gc_content)`` as floats in the range 0-100. Both are 0.0
        when no countable base is present (previously ZeroDivisionError).
    """
    counts = {'A': 0, 'C': 0, 'G': 0, 'T': 0, 'N': 0}
    for base in dna:
        base = base.upper()
        if base in counts:
            counts[base] += 1
    total = sum(counts.values())
    if total == 0:
        return 0.0, 0.0
    gc_content = (counts['G'] + counts['C']) * 100 / total
    n_per = counts['N'] * 100 / total
    return n_per, gc_content
79
+
80
+
81
def revCompIterative(watson):
    """Return the upper-cased reverse complement of a nucleotide sequence.

    Besides A, C, G, T and N, the IUPAC ambiguity codes (R, Y, K, M, B, V, D,
    H, S, W) are complemented, so annotated genomes containing them no longer
    fail. The translation is a single linear pass instead of repeated string
    concatenation.

    Args:
        watson: Forward-strand sequence; case is ignored.

    Returns:
        The reverse complement, in upper case.

    Raises:
        KeyError: If the sequence contains a character that is not a
            nucleotide code (same exception type as before).
    """
    bases = "ACGTNRYKMBVDHSW"
    partners = "TGCANYRMKVBHDSW"
    watson = watson.upper()
    unknown = set(watson) - set(bases)
    if unknown:
        raise KeyError(sorted(unknown)[0])
    return watson[::-1].translate(str.maketrans(bases, partners))
90
+
91
+
92
def genome_Metrics(fasta_in, gff_in, output_file):
    """Summarise a genome and its GFF annotation and write the report to a file.

    The FASTA sequence is read (all non-header lines concatenated and
    upper-cased), every 9-column CDS record of the GFF is measured, and
    records whose attribute column contains ``ID=gene`` without
    ``biotype=protein_coding`` are tallied as non-protein-coding genes. The
    report covers gene counts and lengths, strand usage, GC content, overlaps
    between consecutive CDS records, genome coverage and start/stop codon usage.

    Args:
        fasta_in: Path to the genome FASTA file.
        gff_in: Path to the matching GFF annotation.
        output_file: Path of the text report to (over)write.

    Raises:
        OSError: If an input cannot be read or the output cannot be written.
        ValueError: If the GFF holds no CDS records, or a coordinate column is
            not an integer.
    """
    # Imported locally so the function does not depend on the utils
    # star-import happening to re-export ``collections``.
    import collections

    genome_name = os.path.splitext(os.path.basename(fasta_in))[0]

    with open(fasta_in, 'r') as genome:
        genome_Seq = "".join(
            line.replace("\n", "") for line in genome if not line.startswith('>'))
    # The reverse strand is upper-cased by revCompIterative; upper-casing the
    # forward strand too keeps GC and codon statistics strand-independent for
    # soft-masked (lowercase) genomes.
    genome_Seq = genome_Seq.upper()

    _, genome_GC = gc_count(genome_Seq)
    genome_Rev = revCompIterative(genome_Seq)
    genome_Size = len(genome_Seq)

    # One flag per genome position: 1 where a feature of that class lies.
    coding_Regions = np.zeros(genome_Size, dtype=int)
    non_Coding_Regions = np.zeros(genome_Size, dtype=int)
    all_gene_Regions = np.zeros(genome_Size, dtype=int)

    protein_coding_genes = []  # (start, stop, strand) in file order
    non_protein_coding_count = 0
    strands = collections.defaultdict(int)
    lengths_PCG, gene_Pos_Olap, gene_Neg_Olap, short_PCGs, pcg_GC = [], [], [], [], []
    prev_Gene_Stop = 0
    prev_Gene_Overlapped = False
    strand = ''

    with open(gff_in, 'r') as genome_gff:
        for line in genome_gff:
            fields = line.split('\t')
            try:
                if "CDS" in fields[2] and len(fields) == 9:
                    start = int(fields[3])
                    stop = int(fields[4])
                    strand = fields[6]
                    # NOTE(review): stop - start is one less than the inclusive
                    # span; kept so reported lengths do not change.
                    length = stop - start
                    all_gene_Regions[start - 1:stop] = 1
                    coding_Regions[start - 1:stop] = 1
                    strands[strand] += 1
                    lengths_PCG.append(length)
                    protein_coding_genes.append((start, stop, strand))

                    if '+' in strand:
                        seq = genome_Seq[start - 1:stop]
                    elif '-' in strand:
                        r_Start = genome_Size - stop
                        r_Stop = genome_Size - start
                        seq = genome_Rev[r_Start:r_Stop + 1]
                    else:
                        # Strandless record: previously `seq` was unbound or
                        # stale here. The forward slice is used for GC only.
                        seq = genome_Seq[start - 1:stop]

                    # SHORT_ORF_LENGTH comes from the utils star-import.
                    if length < SHORT_ORF_LENGTH:
                        short_PCGs.append((start, stop, strand))
                    pcg_GC.append(gc_count(seq)[1])

                    # Overlap with the previous CDS, recorded per strand. When
                    # a run of overlapping genes ends, a 0 is recorded for the
                    # gene that closes the run.
                    if prev_Gene_Stop > start:
                        if '+' in strand:
                            gene_Pos_Olap.append(prev_Gene_Stop - start)
                        elif '-' in strand:
                            gene_Neg_Olap.append(prev_Gene_Stop - start)
                        prev_Gene_Overlapped = True
                    elif prev_Gene_Stop < start:
                        if prev_Gene_Overlapped:
                            if '+' in strand:
                                gene_Pos_Olap.append(0)
                            elif '-' in strand:
                                gene_Neg_Olap.append(0)
                        prev_Gene_Overlapped = False
                    prev_Gene_Stop = stop
                elif "ID=gene" in fields[8]:
                    if "biotype=protein_coding" not in fields[8]:
                        start = int(fields[3])
                        stop = int(fields[4])
                        strand = fields[6]
                        all_gene_Regions[start - 1:stop] = 1
                        non_Coding_Regions[start - 1:stop] = 1
                        non_protein_coding_count += 1
            except IndexError:
                # Comment, directive or truncated line: too few columns.
                continue

    if prev_Gene_Overlapped:  # If last has a prev overlap, count it
        if '+' in strand:
            gene_Pos_Olap.append(0)
        elif '-' in strand:
            gene_Neg_Olap.append(0)

    if not lengths_PCG:
        raise ValueError("No CDS records found in " + str(gff_in))

    median_PCG = np.median(lengths_PCG)
    gene_Overlaps = gene_Neg_Olap + gene_Pos_Olap
    # An annotation without any overlapping genes is valid; report zeros
    # instead of failing on max() of an empty list.
    median_PCG_Olap = np.median(gene_Overlaps) if gene_Overlaps else 0.0
    longest_Olap = max(gene_Overlaps, default=0)
    coding_Percentage = 100 * float(np.count_nonzero(coding_Regions)) / float(genome_Size)
    non_coding_Percentage = 100 * float(np.count_nonzero(non_Coding_Regions)) / float(genome_Size)
    all_gene_Percentage = 100 * float(np.count_nonzero(all_gene_Regions)) / float(genome_Size)

    start_Codons, stop_Codons = [], []
    for start, stop, strand in protein_coding_genes:
        if '-' in strand:
            # Map the coordinates onto the reverse-complemented genome.
            r_start = genome_Size - stop
            r_stop = genome_Size - start
            start_Codons.append(genome_Rev[r_start:r_start + 3])
            stop_Codons.append(genome_Rev[r_stop - 2:r_stop + 1])
        elif '+' in strand:
            start_Codons.append(genome_Seq[start - 1:start + 2])
            stop_Codons.append(genome_Seq[stop - 3:stop])

    atg_P, gtg_P, ttg_P, att_P, ctg_P, other_Start_P, other_Starts = start_Codon_Count(start_Codons)
    tag_P, taa_P, tga_P, other_Stop_P, other_Stops = stop_Codon_Count(stop_Codons)

    output = (
        "Number of Protein Coding Genes in " + genome_name + " : " + str(len(lengths_PCG)) +
        " ,Median Length of PCGs: " + str(median_PCG) +
        ", Min Length of PCGs: " + str(min(lengths_PCG)) +
        ", Max Length of PCGs: " + str(max(lengths_PCG)) +
        ", Number of PCGs on Pos Strand: " + str(strands['+']) +
        ", Number of PCGs on Neg Strand: " + str(strands['-']) +
        "\nGenome-Wide GC Content: " + format(genome_GC, '.2f') +
        # The ", " separator was missing here, gluing two fields together.
        ", Median GC of PCGs: " + format(np.median(pcg_GC), '.2f') +
        ", Number of Overlapping PCGs: " + str(len(gene_Overlaps)) +
        ", Longest PCG Overlap: " + str(longest_Olap) +
        ", Median PCG Overlap: " + str(median_PCG_Olap) +
        ", Number of PCGs less than 100 amino acids: " + str(len(short_PCGs)) +

        "\nPercentage of Genome which is Protein Coding: " + format(coding_Percentage, '.2f') +
        ", Number of Non-PCGs: " + str(non_protein_coding_count) +
        ", Percentage of Genome Non-PCG: " + format(non_coding_Percentage, '.2f') +
        ", Percentage of All Genes in Genome: " + format(all_gene_Percentage, '.2f') +

        "\nPercentage of Genes starting with ATG: " + atg_P +
        "\nPercentage of Genes starting with GTG: " + gtg_P +
        "\nPercentage of Genes starting with TTG: " + ttg_P +
        "\nPercentage of Genes starting with ATT: " + att_P +
        "\nPercentage of Genes starting with CTG: " + ctg_P +
        "\nPercentage of Genes starting with Alternative Start Codon: " + other_Start_P +

        "\nPercentage of Genes ending with TAG: " + tag_P +
        "\nPercentage of Genes ending with TAA: " + taa_P +
        "\nPercentage of Genes ending with TGA: " + tga_P +
        "\nPercentage of Genes ending with Alternative Stop Codon: " + other_Stop_P)

    with open(output_file, 'w') as out_file:
        out_file.write('Genome Metrics:\n')
        out_file.write(output + '\n')
241
+
242
+
243
+
244
+
245
def main(argv=None):
    """Command-line entry point: parse the arguments and run genome_Metrics.

    Args:
        argv: Optional list of argument strings. When None (the default, and
            what the guard below uses) argparse reads ``sys.argv``; passing a
            list lets other code drive the tool programmatically.
    """
    parser = argparse.ArgumentParser(description="...")
    parser.add_argument("-f", dest='fasta_in', required=True, help="Input FASTA file")
    parser.add_argument("-g", dest='gff_in', required=True, help="Corresponding GFF file to FASTA")
    parser.add_argument("-o", dest='output_file', required=True, help="Output metrics file")

    options = parser.parse_args(argv)

    genome_Metrics(options.fasta_in, options.gff_in, options.output_file)


if __name__ == "__main__":
    main()
@@ -0,0 +1,88 @@
1
+ import argparse
2
+ import collections
3
+
4
+
5
+
6
def main(reference_annotation, tool_prediction):
    """Count hypothetical reference genes per match category of a results file.

    The reference GFF is scanned for 9-column records whose type column
    contains "gene"; each is keyed by ``start_stop`` and flagged as
    hypothetical when its attribute column mentions "hypothetical". The
    results file is then read section by section (perfect matches, partial
    matches, missed genes) and every entry whose position maps to a
    hypothetical reference gene is printed and counted.

    Args:
        reference_annotation: Path to the reference GFF file.
        tool_prediction: Path to the results file to classify.

    Returns:
        ``(perfect_match_hypo, partial_match_hypo, missed_hypo)``. Previously
        the counts were computed and then discarded.
    """
    ref_genes = collections.OrderedDict()  # Order is important
    num_hypo = 0
    with open(reference_annotation, 'r') as genome_gff:
        for line in genome_gff:
            fields = line.split('\t')
            # Have to use gene and not CDS here because the CDS tag does not
            # contain the classification.
            if len(fields) != 9 or "gene" not in fields[2]:
                continue
            gene = fields[3] + '_' + fields[4]
            if "hypothetical" in fields[8]:
                ref_genes[gene] = "hypothetical"
                num_hypo += 1
            else:
                ref_genes[gene] = "N/A"
    print("Number of hypothetic genes: " + str(num_hypo))

    section_headers = {
        "Perfect_Match_Genes:": 'perfect',
        "Partial_Match_Genes:": 'partial',
        "Missed_Genes:": 'missed',
    }
    hypo_counts = {'perfect': 0, 'partial': 0, 'missed': 0}
    section = None  # the most recent header decides how entries are parsed
    seen_perfect = set()
    with open(tool_prediction, 'r') as tool_in:
        for line in tool_in:
            if line.startswith("ORF_Without_Corresponding_Gene_in_Ensembl"):
                break
            for header, name in section_headers.items():
                if line.startswith(header):
                    section = name

            parts = line.split('_')
            try:
                if section == 'partial':
                    if not line.startswith('Gene:'):  # Different tags
                        continue
                    pos = parts[0].split(':')[1] + '_' + parts[1]
                elif section in ('perfect', 'missed'):
                    if not line.startswith('>'):
                        continue
                    pos = parts[1] + '_' + parts[2]
                else:
                    continue
            except IndexError:
                # Entry line without the expected start/stop fields; it used
                # to abort the whole run.
                continue

            if "hypothetical" not in ref_genes.get(pos, ""):
                continue
            hypo_counts[section] += 1
            print(pos)
            if section == 'perfect':
                # Flags a position reported more than once among perfect matches.
                if pos in seen_perfect:
                    print("WE")
                seen_perfect.add(pos)

    print("finished")
    return hypo_counts['perfect'], hypo_counts['partial'], hypo_counts['missed']


if __name__ == "__main__":
    # Parsing here (not at import time) keeps the module importable: the
    # arguments are required, so import-time parsing exited the interpreter.
    parser = argparse.ArgumentParser()
    parser.add_argument('-ref', '--reference_annotation', required=True,
                        help='Which reference annotation file to use as reference?')
    parser.add_argument('-tp', '--tool_prediction', required=True, help='Tool genome prediction file')
    args = parser.parse_args()

    perfect_hypo, partial_hypo, missed_hypo = main(**vars(args))
    print("Hypothetical genes - Perfect: " + str(perfect_hypo) + ", Partial: " + str(partial_hypo) +
          ", Missed: " + str(missed_hypo))
    print("Complete")
@@ -0,0 +1,277 @@
1
+ import argparse
2
+ import collections
3
+ import numpy as np
4
+
5
+ from ORForise.src.ORForise.utils import *
6
+
7
# Command-line interface shared with the __main__ guard at the bottom of the file.
parser = argparse.ArgumentParser()
parser.add_argument('-t', '--tool', required=True, help='Which tool to compare?')
parser.add_argument('-p', '--parameters', required=False, help='Optional parameters for prediction tool.')
parser.add_argument('-g', '--genome', required=True, help='Which genome to analyse?')

# Parse only when executed as a script. Because -t and -g are required,
# parsing unconditionally made a plain import of this module exit the
# interpreter. ``args`` stays defined (None on import) for compatibility.
args = parser.parse_args() if __name__ == "__main__" else None
13
+
14
+
15
def gc_count(dna):
    """Compute the N percentage and GC percentage of a nucleotide sequence.

    Only A, C, G, T and N are counted; any other character is ignored in both
    the numerator and the denominator. Counting is case-insensitive, so
    forward-strand slices taken from a soft-masked genome are measured the
    same way as the upper-cased reverse strand.

    Args:
        dna: Nucleotide sequence (string or iterable of single characters).

    Returns:
        ``(n_per, gc_content)`` where ``n_per`` is a float and ``gc_content``
        is a two-decimal string (callers convert it with ``float``). An input
        with no countable base yields ``(0.0, '0.00')`` instead of raising
        ZeroDivisionError.
    """
    counts = {'A': 0, 'C': 0, 'G': 0, 'T': 0, 'N': 0}
    for base in dna:
        base = base.upper()
        if base in counts:
            counts[base] += 1
    total = sum(counts.values())
    if total == 0:
        return 0.0, '0.00'
    gc_content = format((counts['G'] + counts['C']) * 100 / total, '.2f')
    n_per = counts['N'] * 100 / total
    return n_per, gc_content
35
+
36
+
37
def revCompIterative(watson):
    """Return the upper-cased reverse complement of a nucleotide sequence.

    Besides A, C, G, T and N, the IUPAC ambiguity codes (R, Y, K, M, B, V, D,
    H, S, W) are complemented, so genomes containing them no longer fail. The
    translation is a single linear pass instead of repeated string
    concatenation.

    Args:
        watson: Forward-strand sequence; case is ignored.

    Returns:
        The reverse complement, in upper case.

    Raises:
        KeyError: If the sequence contains a character that is not a
            nucleotide code (same exception type as before).
    """
    bases = "ACGTNRYKMBVDHSW"
    partners = "TGCANYRMKVBHDSW"
    watson = watson.upper()
    unknown = set(watson) - set(bases)
    if unknown:
        raise KeyError(sorted(unknown)[0])
    return watson[::-1].translate(str.maketrans(bases, partners))
45
+
46
+
47
def start_Codon_Count(orfs):
    """Report start-codon usage across a mapping of gene records.

    Args:
        orfs: Mapping of gene key to a detail list whose second-to-last
            element is the gene's start codon.

    Returns:
        A 7-tuple: the share of ATG, GTG, TTG, ATT, CTG and "other" codons,
        each formatted as a two-decimal string, followed by the list of the
        "other" codons in mapping order. All shares are '0.00' for an empty
        mapping (e.g. when no gene was missed), which previously raised
        ZeroDivisionError.
    """
    tracked = ('ATG', 'GTG', 'TTG', 'ATT', 'CTG')
    codons = [orf[-2] for orf in orfs.values()]
    total = len(codons)
    other_Starts = [codon for codon in codons if codon not in tracked]

    def percent(count):
        # Guard the division so an empty gene set reports zeros instead of crashing.
        return format(100 * count / total, '.2f') if total else '0.00'

    shares = [percent(codons.count(codon)) for codon in tracked]
    return (*shares, percent(len(other_Starts)), other_Starts)
72
+
73
+
74
def stop_Codon_Count(orfs):
    """Report stop-codon usage across a mapping of gene records.

    Args:
        orfs: Mapping of gene key to a detail list whose last element is the
            gene's stop codon.

    Returns:
        A 5-tuple: the share of TAG, TAA, TGA and "other" codons, each
        formatted as a two-decimal string, followed by the list of the "other"
        codons in mapping order. All shares are '0.00' for an empty mapping,
        which previously raised ZeroDivisionError.
    """
    tracked = ('TAG', 'TAA', 'TGA')
    codons = [orf[-1] for orf in orfs.values()]
    total = len(codons)
    other_Stops = [codon for codon in codons if codon not in tracked]

    def percent(count):
        # Guard the division so an empty gene set reports zeros instead of crashing.
        return format(100 * count / total, '.2f') if total else '0.00'

    shares = [percent(codons.count(codon)) for codon in tracked]
    return (*shares, percent(len(other_Stops)), other_Stops)
93
+
94
+
95
def missed_gene_read(results_file, missed_genes):
    """Collect the undetected-gene entries from an iterable of result lines.

    Reading starts after the line beginning with ``Undetected_Genes:`` and
    stops at the line beginning with
    ``ORFs_Without_Corresponding_Gene_In_Ensembl_Metrics:``. Within that
    section a ``>`` header supplies the key (its 2nd and 3rd ``_``-separated
    fields joined by ``_``) and the following non-empty line supplies the
    sequence.

    Args:
        results_file: Iterable of text lines (e.g. an open file).
        missed_genes: Mapping to fill in place; each value is
            ``[sequence, length, start_codon, stop_codon]``.

    Returns:
        The same ``missed_genes`` mapping.
    """
    reading = False
    entry = None  # key of the most recent well-formed header
    for raw_line in results_file:
        if raw_line.startswith('ORFs_Without_Corresponding_Gene_In_Ensembl_Metrics:'):
            break
        line = raw_line.strip()
        if not reading:
            reading = line.startswith('Undetected_Genes:')
            continue
        if line.startswith('>'):
            parts = line.split('_')
            # A header without both coordinate fields used to raise
            # IndexError; it now yields no key and its sequence is skipped.
            entry = parts[1] + '_' + parts[2] if len(parts) >= 3 else None
        elif line and entry is not None:
            # Sequence lines seen before any header previously hit an unbound
            # ``entry``; they are now ignored.
            missed_genes[entry] = [line, len(line), line[0:3], line[-3:]]
    return missed_genes
115
+
116
+
117
def detail_transfer(genes, missed_genes):
    """Copy three annotation details into each missed-gene record.

    For every key of ``missed_genes`` that also exists in ``genes``, elements
    2, 3 and 4 of the ``genes`` record are spliced into the missed-gene record
    starting at index 2, in that order. Keys absent from ``genes`` are left
    untouched. The records are modified in place.

    Returns:
        The same ``missed_genes`` mapping.
    """
    for position, record in missed_genes.items():
        try:
            reference = genes[position]
            extras = [reference[2], reference[3], reference[4]]
        except KeyError:
            continue
        # One slice assignment places the three values at indices 2, 3 and 4.
        record[2:2] = extras
    return missed_genes
130
+
131
+
132
def result_compare(genome, results_file):
    """Compare annotated genes with the genes reported as undetected.

    Reads ``../Genomes/<genome>.fa`` and ``../Genomes/<genome>.gff``, takes
    the undetected-gene entries from ``results_file``, gathers per-gene
    details for every 9-column CDS record, and prints length, strand, GC,
    overlap and start/stop codon statistics.

    Args:
        genome: Genome name used to build the FASTA and GFF paths.
        results_file: Iterable of result lines (an open file) passed to
            ``missed_gene_read``.

    Raises:
        OSError: If a genome file cannot be opened.
        ValueError: If no CDS record remains outside the missed set, or a
            coordinate column is not an integer.
    """
    with open('../Genomes/' + genome + '.fa', 'r') as genome_file:
        genome_Seq = "".join(
            line.replace("\n", "") for line in genome_file if ">" not in line)

    missed_genes = missed_gene_read(results_file, collections.OrderedDict())
    list_MG = list(missed_genes.keys())
    missed_positions = set(list_MG)  # O(1) membership inside the GFF loop

    # Analysis
    genome_Rev = revCompIterative(genome_Seq)
    genome_Size = len(genome_Seq)
    genes = collections.OrderedDict()
    prev_Stop = 0
    prev_pos = None

    ### Record Missed and Detected Gene metrics
    genes_strand = collections.defaultdict(int)
    genes_Missed_strand = collections.defaultdict(int)
    short_PCGs, pcg_GC, lengths_PCG, genes_Overlap = [], [], [], []
    # NOTE(review): the missed-gene lists below are filled but never reported;
    # the summary line is labelled "Missed" yet prints the non-missed figures.
    # Left unchanged pending confirmation of the intended report.
    short_Missed_PCGs, pcg_Missed_GC, lengths_Missed_PCG, genes_Missed_Overlap = [], [], [], []

    with open('../Genomes/' + genome + '.gff', 'r') as genome_gff:
        for line in genome_gff:
            fields = line.split('\t')
            try:
                if "CDS" in fields[2] and len(fields) == 9:
                    start = int(fields[3])
                    stop = int(fields[4])
                    strand = fields[6]
                    gene = str(start) + ',' + str(stop) + ',' + strand
                    if strand == '-':
                        # Map the coordinates onto the reverse-complemented genome.
                        r_Start = genome_Size - stop
                        r_Stop = genome_Size - start
                        seq = genome_Rev[r_Start:r_Stop + 1]
                    elif strand == '+':
                        seq = genome_Seq[start - 1:stop]
                    else:
                        # No usable strand means no defined codons; such a
                        # record used to reuse a stale or unbound ``seq``.
                        continue
                    startCodon = seq[0:3]
                    stopCodon = seq[-3:]
                    length = stop - start
                    _, gc = gc_count(seq)
                    pos = str(start) + '_' + str(stop)
                    # Overlap with the previous CDS (0 when there is none).
                    overlap = prev_Stop - start if prev_Stop > start else 0

                    if pos in missed_positions:
                        genes_Missed_strand[strand] += 1
                        pcg_Missed_GC.append(float(gc))
                        lengths_Missed_PCG.append(length)
                        if length < SHORT_ORF_LENGTH:
                            short_Missed_PCGs.append(gene)
                        if overlap:
                            genes_Missed_Overlap.append(overlap)
                    else:
                        genes_strand[strand] += 1
                        pcg_GC.append(float(gc))
                        lengths_PCG.append(length)
                        if length < SHORT_ORF_LENGTH:
                            short_PCGs.append(gene)
                        if overlap:
                            genes_Overlap.append(overlap)

                    prev_Stop = stop
                    if genes:
                        # This gene's overlap is the previous gene's
                        # downstream overlap; store it at index 4 there.
                        genes[prev_pos].insert(4, overlap)
                    genes[pos] = [strand, length, gc, overlap, seq, startCodon, stopCodon]
                    prev_pos = pos
            except IndexError:
                # Comment, directive or truncated line: too few columns.
                continue

    missed_genes = detail_transfer(genes, missed_genes)

    missed_lengths = []
    for key in list_MG:
        ### printed out to confirm figure lengths
        m_start, m_stop = key.split('_')[0], key.split('_')[1]
        missed_lengths.append(int(m_stop) - int(m_start))
        # What remains in ``genes`` afterwards is the non-missed annotation.
        if key in genes:
            del genes[key]

    print("Number of Genes Missed:" + str(len(missed_lengths)) + '\nLengths of Genes Missed:\n' + str(missed_lengths))

    if not lengths_PCG:
        raise ValueError("No CDS records outside the missed set for genome " + str(genome))

    median_PCG = np.median(lengths_PCG)
    # A gene set without overlaps is valid; report zeros instead of failing
    # on max() of an empty list.
    median_PCG_Olap = np.median(genes_Overlap) if genes_Overlap else 0.0
    longest_Olap = max(genes_Overlap, default=0)
    num_overlaps = len(genes_Overlap)
    gc_median = format(np.median(pcg_GC), '.2f')
    num_Short_PCGs = len(short_PCGs)

    atg_P, gtg_P, ttg_P, att_P, ctg_P, other_Starts_P, other_Starts = start_Codon_Count(genes)
    tag_P, taa_P, tga_P, other_Stops_P, other_Stops = stop_Codon_Count(genes)
    m_atg_P, m_gtg_P, m_ttg_P, m_att_P, m_ctg_P, m_other_Starts_P, m_other_Starts = start_Codon_Count(missed_genes)
    m_tag_P, m_taa_P, m_tga_P, m_other_Stops_P, m_other_Stops = stop_Codon_Count(missed_genes)

    output = (
        "Number of Missed Protein Coding Genes in " + str(genome) + " : " + str(len(lengths_PCG)) +
        ", Median Length of PCGs: " + str(median_PCG) +
        ", Min Length of PCGs: " + str(min(lengths_PCG)) +
        ", Max Length of PCGs: " + str(max(lengths_PCG)) +
        ", Number of PCGs on Pos Strand: " + str(genes_Missed_strand['+']) +
        ", Number of PCGs on Neg Strand: " + str(genes_Missed_strand['-']) +
        ", Median GC of PCGs: " + str(gc_median) +
        ", Number of Overlapping PCGs: " + str(num_overlaps) +
        ", Longest PCG Overlap: " + str(longest_Olap) +
        ", Median PCG Overlap: " + str(median_PCG_Olap) +
        ", Number of PCGs less than 100nt: " + str(num_Short_PCGs) +

        '\nPercentage of Genes starting with ATG - Annotation/Missed: ' + atg_P + ' ' + m_atg_P +
        '\nPercentage of Genes starting with GTG - Annotation/Missed: ' + gtg_P + ' ' + m_gtg_P +
        '\nPercentage of Genes starting with TTG - Annotation/Missed: ' + ttg_P + ' ' + m_ttg_P +
        '\nPercentage of Genes starting with ATT - Annotation/Missed: ' + att_P + ' ' + m_att_P +
        '\nPercentage of Genes starting with CTG - Annotation/Missed: ' + ctg_P + ' ' + m_ctg_P +
        # Fixed: this line printed the missed *stop* percentage (m_other_Stops_P).
        '\nPercentage of Genes starting with Alternative Start Codon - Annotation/Missed: ' + other_Starts_P + ' ' + m_other_Starts_P +
        '\nPercentage of Genes ending with TAG - Annotation/Missed: ' + tag_P + ' ' + m_tag_P +
        '\nPercentage of Genes ending with TAA - Annotation/Missed: ' + taa_P + ' ' + m_taa_P +
        '\nPercentage of Genes ending with TGA - Annotation/Missed: ' + tga_P + ' ' + m_tga_P +
        '\nPercentage of Genes ending with Alternative Stop Codon - Annotation/Missed: ' + other_Stops_P + ' ' + m_other_Stops_P)

    print(output)
266
+
267
+
268
if __name__ == "__main__":
    options = parser.parse_args()
    parameters = options.parameters
    tool = options.tool
    genome = options.genome
    # Results live in ../Tools/<tool>/<tool>_<genome>[_<parameters>].csv
    results_name = tool + '_' + genome
    if parameters:
        results_name += '_' + parameters
    # The context manager closes the results file even if the comparison
    # raises; it was previously opened and never closed.
    with open('../Tools/' + tool + '/' + results_name + '.csv') as results_file:
        result_compare(genome, results_file)