ORForise 1.4.3__py3-none-any.whl → 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ORForise/Aggregate_Compare.py +318 -133
- ORForise/Annotation_Compare.py +243 -125
- ORForise/Comparator.py +600 -552
- ORForise/ORForise_Analysis/genome_Metrics.py +51 -33
- ORForise/Tools/Augustus/Augustus.py +30 -23
- ORForise/Tools/Balrog/Balrog.py +31 -23
- ORForise/Tools/EasyGene/EasyGene.py +30 -22
- ORForise/Tools/FGENESB/FGENESB.py +32 -25
- ORForise/Tools/FragGeneScan/FragGeneScan.py +29 -22
- ORForise/Tools/GFF/GFF.py +51 -47
- ORForise/Tools/GLIMMER_3/GLIMMER_3.py +34 -27
- ORForise/Tools/GeneMark/GeneMark.py +46 -40
- ORForise/Tools/GeneMark_HA/GeneMark_HA.py +29 -22
- ORForise/Tools/GeneMark_HMM/GeneMark_HMM.py +29 -22
- ORForise/Tools/GeneMark_S/GeneMark_S.py +29 -22
- ORForise/Tools/GeneMark_S_2/GeneMark_S_2.py +29 -25
- ORForise/Tools/MetaGene/MetaGene.py +29 -22
- ORForise/Tools/MetaGeneAnnotator/MetaGeneAnnotator.py +30 -23
- ORForise/Tools/MetaGeneMark/MetaGeneMark.py +30 -23
- ORForise/Tools/Prodigal/Prodigal.py +30 -26
- ORForise/Tools/Prokka/Prokka.py +30 -25
- ORForise/Tools/StORF_Reporter/StORF_Reporter.py +33 -26
- ORForise/Tools/TransDecoder/TransDecoder.py +29 -22
- ORForise/utils.py +204 -2
- {orforise-1.4.3.dist-info → orforise-1.5.0.dist-info}/METADATA +5 -5
- {orforise-1.4.3.dist-info → orforise-1.5.0.dist-info}/RECORD +30 -30
- {orforise-1.4.3.dist-info → orforise-1.5.0.dist-info}/entry_points.txt +5 -0
- {orforise-1.4.3.dist-info → orforise-1.5.0.dist-info}/WHEEL +0 -0
- {orforise-1.4.3.dist-info → orforise-1.5.0.dist-info}/licenses/LICENSE +0 -0
- {orforise-1.4.3.dist-info → orforise-1.5.0.dist-info}/top_level.txt +0 -0
|
@@ -1,12 +1,13 @@
|
|
|
1
1
|
import argparse
|
|
2
|
-
import csv
|
|
3
2
|
import numpy as np
|
|
3
|
+
import os
|
|
4
|
+
|
|
5
|
+
try:
|
|
6
|
+
from ORForise.src.ORForise.utils import * # local file
|
|
7
|
+
except ImportError:
|
|
8
|
+
from ORForise.utils import *
|
|
4
9
|
|
|
5
|
-
from ORForise.src.ORForise.utils import * # local file
|
|
6
10
|
|
|
7
|
-
parser = argparse.ArgumentParser()
|
|
8
|
-
parser.add_argument('-g', '--genome_to_compare', default='', help='Which genome to analyse?')
|
|
9
|
-
args = parser.parse_args()
|
|
10
11
|
|
|
11
12
|
|
|
12
13
|
def start_Codon_Count(start_Codons):
|
|
@@ -39,7 +40,6 @@ def stop_Codon_Count(stop_Codons):
|
|
|
39
40
|
tag, taa, tga, other = 0, 0, 0, 0
|
|
40
41
|
other_Stops = []
|
|
41
42
|
for stop in stop_Codons:
|
|
42
|
-
stop
|
|
43
43
|
if stop == 'TAG':
|
|
44
44
|
tag += 1
|
|
45
45
|
elif stop == 'TAA':
|
|
@@ -83,14 +83,19 @@ def revCompIterative(watson):
|
|
|
83
83
|
watson = watson.upper()
|
|
84
84
|
watsonrev = watson[::-1]
|
|
85
85
|
crick = ""
|
|
86
|
+
|
|
86
87
|
for nt in watsonrev:
|
|
87
88
|
crick += complements[nt]
|
|
88
89
|
return crick
|
|
89
90
|
|
|
90
91
|
|
|
91
|
-
def genome_Metrics(
|
|
92
|
+
def genome_Metrics(fasta_in, gff_in, output_file):
|
|
93
|
+
|
|
94
|
+
base_name = os.path.basename(fasta_in) # Gets file name with extension
|
|
95
|
+
genome_name = os.path.splitext(base_name)[0] # Removes extension
|
|
96
|
+
|
|
92
97
|
genome_Seq = ""
|
|
93
|
-
with open(
|
|
98
|
+
with open(fasta_in , 'r') as genome:
|
|
94
99
|
for line in genome:
|
|
95
100
|
line = line.replace("\n", "")
|
|
96
101
|
if not line.startswith('>'):
|
|
@@ -100,16 +105,16 @@ def genome_Metrics(genome_to_compare):
|
|
|
100
105
|
|
|
101
106
|
genome_Rev = revCompIterative(genome_Seq)
|
|
102
107
|
genome_Size = len(genome_Seq)
|
|
103
|
-
coding_Regions = np.zeros((genome_Size), dtype=
|
|
104
|
-
non_Coding_Regions = np.zeros((genome_Size), dtype=
|
|
105
|
-
all_gene_Regions = np.zeros((genome_Size), dtype=
|
|
108
|
+
coding_Regions = np.zeros((genome_Size), dtype=int)
|
|
109
|
+
non_Coding_Regions = np.zeros((genome_Size), dtype=int)
|
|
110
|
+
all_gene_Regions = np.zeros((genome_Size), dtype=int)
|
|
106
111
|
protein_coding_genes = collections.OrderedDict()
|
|
107
112
|
non_protein_coding_genes = collections.OrderedDict()
|
|
108
113
|
strands = collections.defaultdict(int)
|
|
109
114
|
lengths_PCG, gene_Pos_Olap, gene_Neg_Olap, short_PCGs, pcg_GC = [], [], [], [], []
|
|
110
115
|
prev_Gene_Stop, count, nc_Count, pos_Strand, neg_Strand = 0, 0, 0, 0, 0
|
|
111
116
|
prev_Gene_Overlapped = False
|
|
112
|
-
with open(
|
|
117
|
+
with open(gff_in, 'r') as genome_gff:
|
|
113
118
|
for line in genome_gff:
|
|
114
119
|
line = line.split('\t')
|
|
115
120
|
try:
|
|
@@ -200,7 +205,7 @@ def genome_Metrics(genome_to_compare):
|
|
|
200
205
|
atg_P, gtg_P, ttg_P, att_P, ctg_P, other_Start_P, other_Starts = start_Codon_Count(start_Codons)
|
|
201
206
|
tag_P, taa_P, tga_P, other_Stop_P, other_Stops = stop_Codon_Count(stop_Codons)
|
|
202
207
|
|
|
203
|
-
output = ("Number of Protein Coding Genes in " +
|
|
208
|
+
output = ("Number of Protein Coding Genes in " + genome_name + " : " + str(
|
|
204
209
|
len(lengths_PCG)) + " ,Median Length of PCGs: " + str(median_PCG) + ", Min Length of PCGs: " + str(
|
|
205
210
|
min(lengths_PCG)) + ", Max Length of PCGs: " + str(max(lengths_PCG)) +
|
|
206
211
|
", Number of PCGs on Pos Strand: " + str(strands['+']) + ", Number of PCGs on Neg Strand: " + str(
|
|
@@ -210,31 +215,44 @@ def genome_Metrics(genome_to_compare):
|
|
|
210
215
|
", Longest PCG Overlap: " + str(longest_Olap) + ", Median PCG Overlap: " + str(
|
|
211
216
|
median_PCG_Olap) + ", Number of PCGs less than 100 amino acids: " + str(len(short_PCGs)) +
|
|
212
217
|
|
|
213
|
-
|
|
214
|
-
'.2f') +
|
|
215
|
-
len(non_protein_coding_genes)) +
|
|
218
|
+
"\nPercentage of Genome which is Protein Coding: " + format(coding_Percentage,
|
|
219
|
+
'.2f') + ", Number of Non-PCGs: " + str(
|
|
220
|
+
len(non_protein_coding_genes)) + ", Percentage of Genome Non-PCG: " + format(non_coding_Percentage,
|
|
216
221
|
'.2f') +
|
|
217
|
-
|
|
222
|
+
", Percentage of All Genes in Genome: " + format(all_gene_Percentage, '.2f') +
|
|
223
|
+
|
|
224
|
+
"\nPercentage of Genes starting with ATG: " + atg_P +
|
|
225
|
+
"\nPercentage of Genes starting with GTG: " + gtg_P +
|
|
226
|
+
"\nPercentage of Genes starting with TTG: " + ttg_P +
|
|
227
|
+
"\nPercentage of Genes starting with ATT: " + att_P +
|
|
228
|
+
"\nPercentage of Genes starting with CTG: " + ctg_P +
|
|
229
|
+
"\nPercentage of Genes starting with Alternative Start Codon: " + other_Start_P +
|
|
230
|
+
|
|
231
|
+
"\nPercentage of Genes ending with TAG: " + tag_P +
|
|
232
|
+
"\nPercentage of Genes ending with TAA: " + taa_P +
|
|
233
|
+
"\nPercentage of Genes ending with TGA: " + tga_P +
|
|
234
|
+
"\nPercentage of Genes ending with Alternative Stop Codon: " + other_Stop_P)
|
|
235
|
+
|
|
236
|
+
with open(output_file, 'w') as out_file:
|
|
237
|
+
out_file.write('Genome Metrics:\n')
|
|
238
|
+
out_file.write(output + '\n')
|
|
239
|
+
|
|
240
|
+
#print(output)
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
|
|
218
244
|
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
'\nPercentage of Genes starting with Alternative Start Codon: ' + other_Start_P +
|
|
245
|
+
def main(argv=None):
    """Command-line entry point: parse the arguments and run genome_Metrics.

    Args:
        argv: Optional list of argument strings. ``None`` (the default)
            makes argparse read ``sys.argv[1:]``, so console-script entry
            points and existing ``main()`` callers behave as before.
    """
    parser = argparse.ArgumentParser(
        description="Report summary metrics for a genome from its FASTA sequence and GFF annotation.")
    parser.add_argument("-f", dest='fasta_in', required=True, help="Input FASTA file")
    parser.add_argument("-g", dest='gff_in', required=True, help="Corresponding GFF file to FASTA")
    parser.add_argument("-o", dest='output_file', required=True, help="Output metrics file")

    options = parser.parse_args(argv)

    genome_Metrics(options.fasta_in, options.gff_in, options.output_file)
|
|
235
254
|
|
|
236
|
-
print(output)
|
|
237
255
|
|
|
238
256
|
|
|
239
257
|
if __name__ == "__main__":
|
|
240
|
-
|
|
258
|
+
main()
|
|
@@ -8,28 +8,35 @@ except ImportError:
|
|
|
8
8
|
from ORForise.utils import sortORFs
|
|
9
9
|
|
|
10
10
|
|
|
11
|
-
def Augustus(
|
|
11
|
+
def Augustus(*args):
    """Parse Augustus CDS predictions into per-region ORF dictionaries.

    Arguments are positional so that every tool parser shares one calling
    convention:
        args[0]: path to the Augustus prediction file.
        args[1]: mapping of DNA region name -> indexable record whose
            element 0 is that region's nucleotide sequence.

    Returns:
        collections.OrderedDict keyed by DNA region name. Each value is
        ``sortORFs`` applied to that region's mapping of ``"start,stop"`` ->
        ``[strand, startCodon, stopCodon, 'CDS', 'Augustus']``.
    """
    tool_pred = args[0]
    dna_regions = args[1]
    augustus_ORFs = collections.OrderedDict()
    for dna_region in dna_regions:
        augustus_ORFs[dna_region] = collections.OrderedDict()
    for dna_region in dna_regions:
        genome = dna_regions[dna_region][0]
        genome_size = len(genome)
        genome_rev = revCompIterative(genome)
        # The prediction file is re-read for each region and filtered by name.
        with open(tool_pred, 'r') as Augustus_input:
            for line in Augustus_input:
                line = line.split()
                # NOTE(review): region matching is a substring test, so a
                # region name that is a prefix of another would match both.
                if not (len(line) == 12 and dna_region in line[0] and "CDS" in line[2]):
                    continue
                start = int(line[3])
                stop = int(line[4])
                strand = line[6]
                if '-' in strand:  # Reverse complement: starts and stops adjusted
                    r_start = genome_size - stop
                    r_stop = genome_size - start
                    startCodon = genome_rev[r_start:r_start + 3]
                    stopCodon = genome_rev[r_stop - 2:r_stop + 1]
                elif '+' in strand:
                    startCodon = genome[start - 1:start + 2]
                    stopCodon = genome[stop - 3:stop]
                else:
                    # Unknown strand: skip rather than reuse the codons left
                    # over from the previous record (or hit a NameError).
                    continue
                po = str(start) + ',' + str(stop)
                orf = [strand, startCodon, stopCodon, 'CDS', 'Augustus']
                # Store under the region so the sort below receives a mapping
                # of ORFs, not a single ORF list at the top level.
                augustus_ORFs[dna_region][po] = orf

    for group in augustus_ORFs:
        augustus_ORFs[group] = sortORFs(augustus_ORFs[group])
    return augustus_ORFs
|
ORForise/Tools/Balrog/Balrog.py
CHANGED
|
@@ -8,29 +8,37 @@ except ImportError:
|
|
|
8
8
|
from ORForise.utils import sortORFs
|
|
9
9
|
|
|
10
10
|
|
|
11
|
-
def Balrog(
|
|
11
|
+
def Balrog(*args):
    """Parse Balrog (GFF-style, tab-separated) CDS predictions per DNA region.

    Arguments are positional so that every tool parser shares one calling
    convention:
        args[0]: path to the Balrog prediction file.
        args[1]: mapping of DNA region name -> indexable record whose
            element 0 is that region's nucleotide sequence.

    Returns:
        collections.OrderedDict keyed by DNA region name. Each value is
        ``sortORFs`` applied to that region's mapping of ``"start,stop"`` ->
        ``[strand, startCodon, stopCodon, 'CDS', 'Balrog']``.
    """
    tool_pred = args[0]
    dna_regions = args[1]
    Balrog_ORFs = collections.OrderedDict()
    for dna_region in dna_regions:
        Balrog_ORFs[dna_region] = collections.OrderedDict()
    for dna_region in dna_regions:
        genome = dna_regions[dna_region][0]
        genome_size = len(genome)
        genome_rev = revCompIterative(genome)

        with open(tool_pred, 'r') as Balrog_input:
            for line in Balrog_input:
                if '#' in line:  # comment / directive lines carry no prediction
                    continue
                line = line.split('\t')
                # Fields 0..6 are read below; blank or truncated lines would
                # otherwise raise IndexError.
                if len(line) < 7:
                    continue
                if not ("CDS" in line[2] and dna_region in line[0]):
                    continue
                start = int(line[3])
                stop = int(line[4])
                strand = line[6]
                if '-' in strand:  # Reverse complement: starts and stops adjusted
                    r_start = genome_size - stop
                    r_stop = genome_size - start
                    startCodon = genome_rev[r_start:r_start + 3]
                    stopCodon = genome_rev[r_stop - 2:r_stop + 1]
                elif '+' in strand:
                    startCodon = genome[start - 1:start + 2]
                    stopCodon = genome[stop - 3:stop]
                else:
                    # Unknown strand: skip rather than reuse stale codons.
                    continue
                po = str(start) + ',' + str(stop)
                orf = [strand, startCodon, stopCodon, 'CDS', 'Balrog']
                # Store under the region so the sort below receives a mapping
                # of ORFs, not a single ORF list at the top level.
                Balrog_ORFs[dna_region][po] = orf

    for group in Balrog_ORFs:
        Balrog_ORFs[group] = sortORFs(Balrog_ORFs[group])
    return Balrog_ORFs
|
|
@@ -8,28 +8,36 @@ except ImportError:
|
|
|
8
8
|
from ORForise.utils import sortORFs
|
|
9
9
|
|
|
10
10
|
|
|
11
|
-
def EasyGene(
|
|
11
|
+
def EasyGene(*args):
    """Parse EasyGene CDS predictions into per-region ORF dictionaries.

    Arguments are positional so that every tool parser shares one calling
    convention:
        args[0]: path to the EasyGene prediction file.
        args[1]: mapping of DNA region name -> indexable record whose
            element 0 is that region's nucleotide sequence.

    Returns:
        collections.OrderedDict keyed by DNA region name. Each value is
        ``sortORFs`` applied to that region's mapping of ``"start,stop"`` ->
        ``[strand, startCodon, stopCodon, 'CDS', 'EasyGene']``.
    """
    tool_pred = args[0]
    dna_regions = args[1]
    easyGene_ORFs = collections.OrderedDict()
    for dna_region in dna_regions:
        easyGene_ORFs[dna_region] = collections.OrderedDict()
    for dna_region in dna_regions:
        genome = dna_regions[dna_region][0]
        genome_size = len(genome)
        genome_rev = revCompIterative(genome)
        with open(tool_pred, 'r') as EasyGene_input:
            for line in EasyGene_input:
                line = line.split()
                # Only whitespace-split records with exactly 10 fields are used.
                if not (len(line) == 10 and dna_region in line[0] and "CDS" in line[2]):
                    continue
                start = int(line[3])
                stop = int(line[4])
                strand = line[6]
                if '-' in strand:  # Reverse complement: starts and stops adjusted
                    r_start = genome_size - stop
                    r_stop = genome_size - start
                    startCodon = genome_rev[r_start:r_start + 3]
                    stopCodon = genome_rev[r_stop - 2:r_stop + 1]
                elif '+' in strand:
                    startCodon = genome[start - 1:start + 2]
                    stopCodon = genome[stop - 3:stop]
                else:
                    # Unknown strand: skip rather than reuse stale codons.
                    continue
                po = str(start) + ',' + str(stop)
                orf = [strand, startCodon, stopCodon, 'CDS', 'EasyGene']
                easyGene_ORFs[dna_region][po] = orf

    for group in easyGene_ORFs:
        easyGene_ORFs[group] = sortORFs(easyGene_ORFs[group])
    return easyGene_ORFs
|
|
@@ -8,31 +8,38 @@ except ImportError:
|
|
|
8
8
|
from ORForise.utils import sortORFs
|
|
9
9
|
|
|
10
10
|
|
|
11
|
-
def FGENESB(
|
|
11
|
+
def FGENESB(*args):
    """Parse FGENESB ``>GENE`` records into per-region ORF dictionaries.

    Arguments are positional so that every tool parser shares one calling
    convention:
        args[0]: path to the FGENESB prediction file.
        args[1]: mapping of DNA region name -> indexable record whose
            element 0 is that region's nucleotide sequence.

    Returns:
        collections.OrderedDict keyed by DNA region name. Each value is
        ``sortORFs`` applied to that region's mapping of ``"start,stop"`` ->
        ``[strand, startCodon, stopCodon, 'CDS', 'FGENESB']``.
    """
    tool_pred = args[0]
    dna_regions = args[1]
    FGENESB_ORFs = collections.OrderedDict()
    for dna_region in dna_regions:
        FGENESB_ORFs[dna_region] = collections.OrderedDict()
    for dna_region in dna_regions:
        genome = dna_regions[dna_region][0]
        genome_size = len(genome)
        genome_rev = revCompIterative(genome)
        with open(tool_pred, 'r') as FGENESB_input:
            for line in FGENESB_input:
                if '>GENE' not in line:
                    continue
                line = line.split()
                # NOTE(review): the region name is matched against the first
                # token, which must also contain ">GENE" - confirm this is the
                # intended layout of the header line.
                if not (len(line) == 10 and dna_region in line[0] and ">GENE" in line[0]):
                    continue
                start = int(line[2])
                stop = int(line[4])
                strand = line[9]
                if '-' in strand:  # Reverse complement: starts and stops adjusted
                    r_start = genome_size - stop
                    r_stop = genome_size - start
                    startCodon = genome_rev[r_start:r_start + 3]
                    stopCodon = genome_rev[r_stop - 2:r_stop + 1]
                elif '+' in strand:
                    startCodon = genome[start - 1:start + 2]
                    stopCodon = genome[stop - 3:stop]
                else:
                    # Unknown strand: skip rather than reuse stale codons.
                    continue
                po = str(start) + ',' + str(stop)
                orf = [strand, startCodon, stopCodon, 'CDS', 'FGENESB']
                # Store under the region so the sort below receives a mapping
                # of ORFs, not a single ORF list at the top level.
                FGENESB_ORFs[dna_region][po] = orf

    for group in FGENESB_ORFs:
        FGENESB_ORFs[group] = sortORFs(FGENESB_ORFs[group])
    return FGENESB_ORFs
|
|
@@ -8,28 +8,35 @@ except ImportError:
|
|
|
8
8
|
from ORForise.utils import sortORFs
|
|
9
9
|
|
|
10
10
|
|
|
11
|
-
def FragGeneScan(
|
|
11
|
+
def FragGeneScan(*args):
    """Parse FragGeneScan CDS predictions into per-region ORF dictionaries.

    Arguments are positional so that every tool parser shares one calling
    convention:
        args[0]: path to the FragGeneScan prediction file.
        args[1]: mapping of DNA region name -> indexable record whose
            element 0 is that region's nucleotide sequence.

    Returns:
        collections.OrderedDict keyed by DNA region name. Each value is
        ``sortORFs`` applied to that region's mapping of ``"start,stop"`` ->
        ``[strand, startCodon, stopCodon, 'CDS', 'FragGeneScan']``.
    """
    tool_pred = args[0]
    dna_regions = args[1]
    fragGeneScan_ORFs = collections.OrderedDict()
    for dna_region in dna_regions:
        fragGeneScan_ORFs[dna_region] = collections.OrderedDict()
    for dna_region in dna_regions:
        genome = dna_regions[dna_region][0]
        genome_size = len(genome)
        genome_rev = revCompIterative(genome)
        with open(tool_pred, 'r') as fragGeneScan_input:
            for line in fragGeneScan_input:
                line = line.split()
                # Only 10-field records whose source column mentions "FGS".
                if not (len(line) == 10 and "FGS" in line[1] and "CDS" in line[2]
                        and dna_region in line[0]):
                    continue
                start = int(line[3])
                stop = int(line[4])
                strand = line[6]
                if '-' in strand:  # Reverse complement: starts and stops adjusted
                    r_start = genome_size - stop
                    r_stop = genome_size - start
                    startCodon = genome_rev[r_start:r_start + 3]
                    stopCodon = genome_rev[r_stop - 2:r_stop + 1]
                elif '+' in strand:
                    startCodon = genome[start - 1:start + 2]
                    stopCodon = genome[stop - 3:stop]
                else:
                    # Unknown strand: skip rather than reuse stale codons.
                    continue
                po = str(start) + ',' + str(stop)
                orf = [strand, startCodon, stopCodon, 'CDS', 'FragGeneScan']
                # Store under the region so the sort below receives a mapping
                # of ORFs, not a single ORF list at the top level.
                fragGeneScan_ORFs[dna_region][po] = orf

    for group in fragGeneScan_ORFs:
        fragGeneScan_ORFs[group] = sortORFs(fragGeneScan_ORFs[group])
    return fragGeneScan_ORFs
|
ORForise/Tools/GFF/GFF.py
CHANGED
|
@@ -10,53 +10,57 @@ except ImportError:
|
|
|
10
10
|
|
|
11
11
|
def GFF(*args):
    """Parse CDS features from a standard GFF file into per-region ORFs.

    Arguments are positional so that every tool parser shares one calling
    convention:
        args[0]: path to the GFF file.
        args[1]: mapping of DNA region name -> indexable record whose
            element 0 is that region's nucleotide sequence.

    Features whose end coordinate exceeds the region length are treated as
    wrapping round the end of the sequence back to its start.

    Returns:
        collections.OrderedDict keyed by DNA region name. Each value is
        ``sortORFs`` applied to that region's mapping of ``"start,stop"`` ->
        ``[strand, startCodon, stopCodon, feature_type, 'GFF-Standard']``.
    """
    tool_pred = args[0]
    dna_regions = args[1]
    GFF_ORFs = collections.OrderedDict()
    for dna_region in dna_regions:
        GFF_ORFs[dna_region] = collections.OrderedDict()
    for dna_region in dna_regions:
        genome = dna_regions[dna_region][0]
        genome_size = len(genome)
        genome_rev = revCompIterative(genome)
        with open(tool_pred, 'r') as gff_input:
            for line in gff_input:
                if '#' in line:  # comment / directive lines carry no feature
                    continue
                line = line.split('\t')
                # Length is checked first so blank or truncated lines are
                # skipped instead of raising IndexError on line[2].
                # Feature type filtering is currently fixed to CDS.
                if not (len(line) == 9 and 'CDS' in line[2] and dna_region in line[0]):
                    continue
                start = int(line[3])
                stop = int(line[4])
                strand = line[6]
                if stop > genome_size:
                    # Feature crosses the end of the sequence: coordinates are
                    # 1-based inclusive, so exactly (stop - genome_size) bases
                    # wrap round to the beginning.
                    overhang = stop - genome_size
                    if '-' in strand:  # Reverse complement: wrapped part is read first
                        seq = (genome_rev[genome_size - overhang:]
                               + genome_rev[:genome_size - start + 1])
                    elif '+' in strand:
                        seq = genome[start - 1:] + genome[:overhang]
                    else:
                        # Unknown strand: skip rather than reuse stale codons.
                        continue
                    startCodon = seq[:3]
                    stopCodon = seq[-3:]
                else:
                    if '-' in strand:  # Reverse complement: starts and stops adjusted
                        r_start = genome_size - stop
                        r_stop = genome_size - start
                        startCodon = genome_rev[r_start:r_start + 3]
                        stopCodon = genome_rev[r_stop - 2:r_stop + 1]
                    elif '+' in strand:
                        startCodon = genome[start - 1:start + 2]
                        stopCodon = genome[stop - 3:stop]
                    else:
                        # Unknown strand: skip rather than reuse stale codons.
                        continue
                po = str(start) + ',' + str(stop)
                orf = [strand, startCodon, stopCodon, line[2], 'GFF-Standard']  # This needs to detect the type
                # Store under the region so the sort below receives a mapping
                # of ORFs, not a single ORF list at the top level.
                GFF_ORFs[dna_region][po] = orf

    for group in GFF_ORFs:
        GFF_ORFs[group] = sortORFs(GFF_ORFs[group])
    return GFF_ORFs
|
|
@@ -8,33 +8,40 @@ except ImportError:
|
|
|
8
8
|
from ORForise.utils import sortORFs
|
|
9
9
|
|
|
10
10
|
|
|
11
|
-
def GLIMMER_3(
|
|
11
|
+
def GLIMMER_3(*args):
    """Parse GLIMMER 3 ``orf`` prediction lines into per-region ORFs.

    Arguments are positional so that every tool parser shares one calling
    convention:
        args[0]: path to the GLIMMER 3 prediction file.
        args[1]: mapping of DNA region name -> indexable record whose
            element 0 is that region's nucleotide sequence.

    GLIMMER 3 reverses the start and stop positions for ORFs on the negative
    strand; they are swapped back here so ``start`` and ``stop`` are read in
    the same order as for forward-strand ORFs.

    Returns:
        collections.OrderedDict keyed by DNA region name. Each value is
        ``sortORFs`` applied to that region's mapping of ``"start,stop"`` ->
        ``[strand, startCodon, stopCodon, 'CDS', 'GLIMMER_3']``.
    """
    tool_pred = args[0]
    dna_regions = args[1]
    GLIMMER_ORFs = collections.OrderedDict()
    for dna_region in dna_regions:
        GLIMMER_ORFs[dna_region] = collections.OrderedDict()
    for dna_region in dna_regions:
        genome = dna_regions[dna_region][0]
        genome_size = len(genome)
        genome_rev = revCompIterative(genome)
        with open(tool_pred, 'r') as glimmer_input:
            for line in glimmer_input:
                if '>' in line:  # This will not work with multiple contigs
                    continue
                line = line.split()
                # NOTE(review): the region name is matched against the ORF id
                # token itself - confirm the ids really embed the region name.
                if not (len(line) == 5 and "orf" in line[0] and dna_region in line[0]):
                    continue
                if '-' in line[3]:  # Reverse complement - switched to match sense strand
                    start = int(line[2])
                    stop = int(line[1])
                    strand = '-'
                    r_start = genome_size - stop
                    r_stop = genome_size - start
                    startCodon = genome_rev[r_start:r_start + 3]
                    stopCodon = genome_rev[r_stop - 2:r_stop + 1]
                elif '+' in line[3]:
                    start = int(line[1])
                    stop = int(line[2])
                    strand = '+'
                    # Three bases, matching the reverse-strand branch (the
                    # previous ``start + 3`` bound returned four).
                    startCodon = genome[start - 1:start + 2]
                    stopCodon = genome[stop - 3:stop]
                else:
                    # Unknown frame/strand column: skip rather than reuse the
                    # values left over from the previous record.
                    continue
                po = str(start) + ',' + str(stop)
                orf = [strand, startCodon, stopCodon, 'CDS', 'GLIMMER_3']
                # Store under the region so the sort below receives a mapping
                # of ORFs, not a single ORF list at the top level.
                GLIMMER_ORFs[dna_region][po] = orf

    for group in GLIMMER_ORFs:
        GLIMMER_ORFs[group] = sortORFs(GLIMMER_ORFs[group])
    return GLIMMER_ORFs
|