ORForise 1.6.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ORForise/Aggregate_Compare.py +378 -0
- ORForise/Annotation_Compare.py +317 -0
- ORForise/Annotation_Intersector.py +726 -0
- ORForise/Aux/StORF_Undetected/Completely_Undetected/Completey_Undetected.py +53 -0
- ORForise/Aux/StORF_Undetected/Completely_Undetected/__init__.py +0 -0
- ORForise/Aux/StORF_Undetected/StORF_Undetected.py +35 -0
- ORForise/Aux/StORF_Undetected/__init__.py +0 -0
- ORForise/Aux/StORF_Undetected/unvitiated_Genes/__init__.py +0 -0
- ORForise/Aux/StORF_Undetected/unvitiated_Genes/unvitiated_Missed_Genes.py +46 -0
- ORForise/Aux/TabToGFF/TabToGFF.py +140 -0
- ORForise/Aux/TabToGFF/__init__.py +0 -0
- ORForise/Aux/__init__.py +0 -0
- ORForise/Comparator.py +882 -0
- ORForise/Convert_To_GFF.py +141 -0
- ORForise/GFF_Adder.py +543 -0
- ORForise/List_Tools.py +56 -0
- ORForise/ORForise_Analysis/__init__.py +0 -0
- ORForise/ORForise_Analysis/cds_checker.py +77 -0
- ORForise/ORForise_Analysis/gene_Lenghts.py +28 -0
- ORForise/ORForise_Analysis/genome_Metrics.py +258 -0
- ORForise/ORForise_Analysis/hypothetical_gene_predictions.py +88 -0
- ORForise/ORForise_Analysis/missed_Gene_Metrics.py +277 -0
- ORForise/ORForise_Analysis/parital_Match_Analysis.py +230 -0
- ORForise/ORForise_Analysis/result_File_Analysis.py +286 -0
- ORForise/ORForise_Analysis/start_Codon_Substitution.py +161 -0
- ORForise/StORForise.py +115 -0
- ORForise/Tools/Augustus/Augustus.py +54 -0
- ORForise/Tools/Augustus/__init__.py +0 -0
- ORForise/Tools/Balrog/Balrog.py +56 -0
- ORForise/Tools/Balrog/__init__.py +0 -0
- ORForise/Tools/EasyGene/EasyGene.py +55 -0
- ORForise/Tools/EasyGene/__init__.py +0 -0
- ORForise/Tools/FGENESB/FGENESB.py +57 -0
- ORForise/Tools/FGENESB/__init__.py +0 -0
- ORForise/Tools/FragGeneScan/FragGeneScan.py +54 -0
- ORForise/Tools/FragGeneScan/__init__.py +0 -0
- ORForise/Tools/GFF/GFF.py +77 -0
- ORForise/Tools/GFF/__init__.py +0 -0
- ORForise/Tools/GLIMMER3/GLIMMER3.py +59 -0
- ORForise/Tools/GLIMMER3/__init__.py +0 -0
- ORForise/Tools/GeneMark/GeneMark.py +135 -0
- ORForise/Tools/GeneMark/__init__.py +0 -0
- ORForise/Tools/GeneMarkHA/GeneMarkHA.py +54 -0
- ORForise/Tools/GeneMarkHA/__init__.py +0 -0
- ORForise/Tools/GeneMarkHMM/GeneMarkHMM.py +55 -0
- ORForise/Tools/GeneMarkHMM/__init__.py +0 -0
- ORForise/Tools/GeneMarkS/GeneMarkS.py +54 -0
- ORForise/Tools/GeneMarkS/__init__.py +0 -0
- ORForise/Tools/GeneMarkS2/GeneMarkS2.py +55 -0
- ORForise/Tools/GeneMarkS2/__init__.py +0 -0
- ORForise/Tools/MetaGene/MetaGene.py +54 -0
- ORForise/Tools/MetaGene/__init__.py +0 -0
- ORForise/Tools/MetaGeneAnnotator/MetaGeneAnnotator.py +55 -0
- ORForise/Tools/MetaGeneAnnotator/__init__.py +0 -0
- ORForise/Tools/MetaGeneMark/MetaGeneMark.py +55 -0
- ORForise/Tools/MetaGeneMark/__init__.py +0 -0
- ORForise/Tools/Prodigal/Prodigal.py +55 -0
- ORForise/Tools/Prodigal/__init__.py +0 -0
- ORForise/Tools/Prokka/Prokka.py +57 -0
- ORForise/Tools/Prokka/__init__.py +0 -0
- ORForise/Tools/StORF-Reporter/StORF-Reporter.py +56 -0
- ORForise/Tools/StORF-Reporter/__init__.py +0 -0
- ORForise/Tools/TransDecoder/TransDecoder.py +54 -0
- ORForise/Tools/TransDecoder/__init__.py +0 -0
- ORForise/Tools/__init__.py +0 -0
- ORForise/__init__.py +0 -0
- ORForise/utils.py +236 -0
- orforise-1.6.2.dist-info/METADATA +1038 -0
- orforise-1.6.2.dist-info/RECORD +73 -0
- orforise-1.6.2.dist-info/WHEEL +5 -0
- orforise-1.6.2.dist-info/entry_points.txt +15 -0
- orforise-1.6.2.dist-info/licenses/LICENSE +624 -0
- orforise-1.6.2.dist-info/top_level.txt +1 -0
ORForise/StORForise.py
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
from importlib import import_module
|
|
2
|
+
import argparse
|
|
3
|
+
import csv
|
|
4
|
+
|
|
5
|
+
try:
|
|
6
|
+
from .Comparator import tool_comparison
|
|
7
|
+
from .utils import *
|
|
8
|
+
except (ImportError, ModuleNotFoundError):
|
|
9
|
+
from Comparator import tool_comparison
|
|
10
|
+
from utils import *
|
|
11
|
+
|
|
12
|
+
###################
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def comparator(tool, input_to_analyse, storfs_to_find_missing, genome_to_compare):
    """Compare tool predictions against a list of missed genes and write a CSV report.

    Args:
        tool: Name of the prediction parser. A module ``Tools.<tool>.<tool>``
            (or ``ORForise.Tools.<tool>.<tool>``) exposing a callable with the
            same name is loaded dynamically.
        input_to_analyse: Path to a FASTA-style file. Every line containing
            ``>`` is split on ``_`` and fields 1, 2 and 3 are read as start,
            stop and strand of a missed gene.
        storfs_to_find_missing: Path passed unchanged to the tool parser.
        genome_to_compare: Path to the genome FASTA file. All lines that do
            not contain ``>`` are concatenated into one sequence.

    Side effects:
        Writes ``Tools/<tool>/<tool>_<genome basename>.csv`` relative to the
        current working directory.

    NOTE(review): the genome is handed to the parser as a plain string, while
    the parsers shipped in ``Tools`` index their second argument by region
    name - confirm which shape is intended here.
    """
    # Read the genome, dropping FASTA header lines and newlines.
    with open(genome_to_compare, 'r') as genome:
        genome_Seq = "".join(line.replace("\n", "") for line in genome if ">" not in line)
    ##############################################
    genes = collections.OrderedDict()
    count = 0
    with open(input_to_analyse, 'r') as genome_gff:  # Get list of missed genes
        for line in genome_gff:
            if ">" in line:
                fields = line.strip().split('_')
                genes[count] = [int(fields[1]), int(fields[2]), fields[3]]
                count += 1
    ##################################
    # 'Tools.<tool>' only resolves when running from inside the package
    # directory; fall back to the installed package path otherwise.
    try:
        tool_module = import_module('Tools.' + tool + '.' + tool)
    except ModuleNotFoundError:
        tool_module = import_module('ORForise.Tools.' + tool + '.' + tool)
    tool_predictions = getattr(tool_module, tool)
    orfs = tool_predictions(storfs_to_find_missing, genome_Seq)
    all_Metrics, all_rep_Metrics, start_precision, stop_precision, other_starts, other_stops, perfect_Matches, missed_genes, unmatched_orfs, undetected_gene_metrics, unmatched_orf_metrics, orf_Coverage_Genome, matched_ORF_Coverage_Genome, gene_coverage_genome, multi_Matched_ORFs, partial_Hits = tool_comparison(
        genes, orfs, genome_Seq, True)
    outname = tool + '_' + genome_to_compare.split('/')[-1].split('.')[0]
    metric_description = list(all_Metrics.keys())
    metrics = list(all_Metrics.values())
    rep_metric_description = list(all_rep_Metrics.keys())
    rep_metrics = list(all_rep_Metrics.values())
    # Column header shared by the two ORF/gene metric sections of the report.
    codon_metrics_header = 'ATG_Start,GTG_Start,TTG_Start,ATT_Start,CTG_Start,Alternative_Start_Codon,TGA_Stop,TAA_Stop,TAG_Stop,Alternative_Stop_Codon,Median_Length,ORFs_on_Positive_Strand,ORFs_on_Negative_Strand'
    with open("Tools/" + tool + '/' + outname + '.csv', 'w', newline='\n',
              encoding='utf-8') as out_file:  # Clear write out of report
        tool_out = csv.writer(out_file, quoting=csv.QUOTE_NONE, escapechar=" ")
        tool_out.writerow(['Representative_Metrics:'])
        tool_out.writerow(rep_metric_description)
        tool_out.writerow(rep_metrics)
        tool_out.writerow(['All_Metrics:'])
        tool_out.writerow(metric_description)
        tool_out.writerow(metrics)
        tool_out.writerow(['CDS_Gene_Coverage_of_Genome:'])
        tool_out.writerow([gene_coverage_genome])
        tool_out.writerow(['Start_Position_Difference:'])
        tool_out.writerow(start_precision)
        tool_out.writerow(['Stop_Position_Difference:'])
        tool_out.writerow(stop_precision)
        tool_out.writerow(['Alternative_Starts_Predicted:'])
        tool_out.writerow(other_starts)
        tool_out.writerow(['Alternative_Stops_Predicted:'])
        tool_out.writerow(other_stops)
        tool_out.writerow(['Undetected_Gene_Metrics:'])
        tool_out.writerow([codon_metrics_header])
        tool_out.writerow(undetected_gene_metrics)
        tool_out.writerow(['Undetected_Genes:'])
        for key, value in missed_genes.items():
            key = key.split(',')
            # FASTA-style header: >genome_start_stop_strand
            header = ('>' + genome_to_compare + '_' + key[0] + '_' + key[1] + '_' + key[2])
            tool_out.writerow([header + '\n' + value])
        tool_out.writerow(['\nORFs_Without_Corresponding_Gene_In_Ensembl_Metrics:'])
        tool_out.writerow([codon_metrics_header])
        tool_out.writerow(unmatched_orf_metrics)
        tool_out.writerow(['ORF_Without_Corresponding_Gene_in_Ensembl:'])
        for key, value in unmatched_orfs.items():
            key = key.split(',')
            header = ('>' + tool + '_' + key[0] + '_' + key[1] + '_' + key[2])
            tool_out.writerow([header + '\n' + value])
        tool_out.writerow(['\nORFs_Which_Detected_more_than_one_Gene:'])

        # Best effort: the first malformed entry ends this section without
        # aborting the rest of the report.
        try:
            for key, value in multi_Matched_ORFs.items():
                key = key.split(',')
                value = value[1].split(',')
                multi = ('ORF:' + key[0] + '-' + key[1] + '_Gene:' + value[0] + '-' + value[1])
                tool_out.writerow([multi])
        except IndexError:
            pass

        tool_out.writerow(['\n\nPartial_Gene_Hits:'])
        for key, seqs in partial_Hits.items():
            key = key.split(';')
            gene_Seq = seqs[0]
            orf_Seq = seqs[1]
            partial = (key[0] + '\n' + gene_Seq + '\n' + key[1] + '\n' + orf_Seq + '\n')
            tool_out.writerow([partial])
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def main():
    """Command-line entry point: print the banner, parse options, run ``comparator``.

    Each option's destination name matches a ``comparator`` parameter, so the
    parsed namespace is forwarded as keyword arguments.
    """
    print(WELCOME)
    cli = argparse.ArgumentParser(description='ORForise ' + ORForise_Version + ': StORForise Run Parameters.')
    # (flags, default, help text) for every supported option, in CLI order.
    option_table = (
        (('-t', '--tool'), 'GFF', 'Which tool/format would you analyse with StORF-R?'),
        (('-i', '--input_to_analyse'), '', 'Location of file containing missed genes'),
        (('-stf', '--storfs_to_find_missing'), '', 'STORFs to find missing.'),
        (('-g', '--genome_to_compare'), '', 'Which genome to analyse?'),
    )
    for flags, default_value, help_text in option_table:
        cli.add_argument(*flags, default=default_value, help=help_text)
    options = cli.parse_args()

    comparator(**vars(options))


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
import collections
|
|
2
|
+
|
|
3
|
+
try:
|
|
4
|
+
from utils import revCompIterative
|
|
5
|
+
from utils import sortORFs
|
|
6
|
+
except ImportError:
|
|
7
|
+
from ORForise.utils import revCompIterative
|
|
8
|
+
from ORForise.utils import sortORFs
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def Augustus(*args):
    """Parse Augustus CDS predictions into per-region ORF dictionaries.

    Positional arguments:
        args[0]: Path to the Augustus output file.
        args[1]: Mapping of region name to its sequence entry. The sequence
            is read from element ``[0]`` of the entry, or the entry itself is
            used when indexing raises IndexError (assumes the entry is a
            sequence container - TODO confirm against caller).

    Returns:
        If ``args[1]`` is empty: an OrderedDict mapping every region that has
        a CDS line in the file to an empty list placeholder.
        Otherwise: an OrderedDict ``region -> OrderedDict`` whose keys are
        ``"start,stop"`` and whose values are
        ``[strand, startCodon, stopCodon, 'CDS', 'Augustus']``, each inner
        mapping ordered by ``sortORFs``.
    """
    tool_pred = args[0]
    dna_regions = args[1]
    if not dna_regions:  # This triggers if dna_regions is an empty dict (GFF_Intersect passed nothing)
        dna_regions = collections.OrderedDict()
        with open(tool_pred, 'r') as Augustus_input:
            for line in Augustus_input:
                line = line.split()
                # Same 12-token layout the parse pass below accepts, so every
                # region that can yield ORFs is also discovered.
                if len(line) == 12 and "CDS" in line[2] and line[0] not in dna_regions:
                    dna_regions[line[0]] = []  # Placeholder for genome sequence
        return dna_regions

    augustus_ORFs = collections.OrderedDict()
    for dna_region in dna_regions:
        augustus_ORFs[dna_region] = collections.OrderedDict()
    for dna_region in dna_regions:
        try:
            genome = dna_regions[dna_region][0]
        except IndexError:
            genome = dna_regions[dna_region]
        genome_size = len(genome)
        genome_rev = revCompIterative(genome)
        with open(tool_pred, 'r') as Augustus_input:
            for line in Augustus_input:
                line = line.split()
                if len(line) == 12 and dna_region in line[0] and "CDS" in line[2]:
                    start = int(line[3])
                    stop = int(line[4])
                    strand = line[6]
                    if '-' in strand:  # Reverse Compliment starts and stops adjusted
                        r_start = genome_size - stop
                        r_stop = genome_size - start
                        startCodon = genome_rev[r_start:r_start + 3]
                        stopCodon = genome_rev[r_stop - 2:r_stop + 1]
                    elif '+' in strand:
                        startCodon = genome[start - 1:start + 2]
                        stopCodon = genome[stop - 3:stop]
                    po = str(start) + ',' + str(stop)
                    orf = [strand, startCodon, stopCodon, 'CDS', 'Augustus']
                    # Store under the owning region; top-level keys must stay
                    # region names for the sorting loop below.
                    augustus_ORFs[dna_region][po] = orf

    for group in augustus_ORFs:
        augustus_ORFs[group] = sortORFs(augustus_ORFs[group])
    return augustus_ORFs
|
|
File without changes
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
import collections
|
|
2
|
+
|
|
3
|
+
try:
|
|
4
|
+
from utils import revCompIterative
|
|
5
|
+
from utils import sortORFs
|
|
6
|
+
except ImportError:
|
|
7
|
+
from ORForise.utils import revCompIterative
|
|
8
|
+
from ORForise.utils import sortORFs
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def Balrog(*args):
    """Parse Balrog CDS predictions into per-region ORF dictionaries.

    Positional arguments:
        args[0]: Path to the Balrog output file.
        args[1]: Mapping of region name to its sequence entry. The sequence
            is read from element ``[0]`` of the entry, or the entry itself is
            used when indexing raises IndexError (assumes the entry is a
            sequence container - TODO confirm against caller).

    Returns:
        If ``args[1]`` is empty: an OrderedDict mapping every region that has
        a CDS line in the file to an empty list placeholder.
        Otherwise: an OrderedDict ``region -> OrderedDict`` whose keys are
        ``"start,stop"`` and whose values are
        ``[strand, startCodon, stopCodon, 'CDS', 'Balrog']``, each inner
        mapping ordered by ``sortORFs``.
    """
    tool_pred = args[0]
    dna_regions = args[1]
    if not dna_regions:  # This triggers if dna_regions is an empty dict (GFF_Intersect passed nothing)
        dna_regions = collections.OrderedDict()
        with open(tool_pred, 'r') as Balrog_input:
            for line in Balrog_input:
                line = line.split()
                # Length guard: header, blank and short lines have no type column.
                if len(line) > 2 and "CDS" in line[2] and line[0] not in dna_regions:
                    dna_regions[line[0]] = []  # Placeholder for genome sequence
        return dna_regions

    Balrog_ORFs = collections.OrderedDict()
    for dna_region in dna_regions:
        Balrog_ORFs[dna_region] = collections.OrderedDict()
    for dna_region in dna_regions:
        try:
            genome = dna_regions[dna_region][0]
        except IndexError:
            genome = dna_regions[dna_region]
        genome_size = len(genome)
        genome_rev = revCompIterative(genome)

        with open(tool_pred, 'r') as Balrog_input:
            for line in Balrog_input:
                if '#' not in line:
                    line = line.split('\t')
                    # Columns up to index 6 (strand) are read below; skip
                    # anything shorter instead of raising IndexError.
                    if len(line) > 6 and "CDS" in line[2] and dna_region in line[0]:
                        start = int(line[3])
                        stop = int(line[4])
                        strand = line[6]
                        if '-' in strand:  # Reverse Compliment starts and stops adjusted
                            r_start = genome_size - stop
                            r_stop = genome_size - start
                            startCodon = genome_rev[r_start:r_start + 3]
                            stopCodon = genome_rev[r_stop - 2:r_stop + 1]
                        elif '+' in strand:
                            startCodon = genome[start - 1:start + 2]
                            stopCodon = genome[stop - 3:stop]
                        po = str(start) + ',' + str(stop)
                        orf = [strand, startCodon, stopCodon, 'CDS', 'Balrog']
                        # Store under the owning region; top-level keys must
                        # stay region names for the sorting loop below.
                        Balrog_ORFs[dna_region][po] = orf

    for group in Balrog_ORFs:
        Balrog_ORFs[group] = sortORFs(Balrog_ORFs[group])
    return Balrog_ORFs
|
|
File without changes
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
import collections
|
|
2
|
+
|
|
3
|
+
try:
|
|
4
|
+
from utils import revCompIterative
|
|
5
|
+
from utils import sortORFs
|
|
6
|
+
except ImportError:
|
|
7
|
+
from ORForise.utils import revCompIterative
|
|
8
|
+
from ORForise.utils import sortORFs
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def EasyGene(*args):
    """Parse EasyGene CDS predictions into per-region ORF dictionaries.

    Positional arguments:
        args[0]: Path to the EasyGene output file.
        args[1]: Mapping of region name to its sequence entry; when empty,
            the function only discovers the region names present in the file.

    Returns:
        Discovery mode: OrderedDict ``region -> []`` (placeholder lists).
        Parse mode: OrderedDict ``region -> OrderedDict`` keyed by
        ``"start,stop"`` with values
        ``[strand, startCodon, stopCodon, 'CDS', 'EasyGene']``, each inner
        mapping passed through ``sortORFs``.
    """
    predictions_path, regions = args[0], args[1]

    if not regions:  # Nothing supplied by the caller: discover regions from the file
        discovered = collections.OrderedDict()
        with open(predictions_path, 'r') as handle:
            for raw in handle:
                fields = raw.split()
                if len(fields) != 10 or "CDS" not in fields[2]:
                    continue
                if fields[0] not in discovered:
                    discovered[fields[0]] = []  # Placeholder for genome sequence
        return discovered

    orfs_by_region = collections.OrderedDict(
        (region, collections.OrderedDict()) for region in regions
    )
    for region in regions:
        entry = regions[region]
        try:
            sequence = entry[0]
        except IndexError:
            sequence = entry
        sequence_length = len(sequence)
        reverse_sequence = revCompIterative(sequence)
        with open(predictions_path, 'r') as handle:
            for raw in handle:
                fields = raw.split()
                if not (len(fields) == 10 and region in fields[0] and "CDS" in fields[2]):
                    continue
                start, stop, strand = int(fields[3]), int(fields[4]), fields[6]
                if '-' in strand:
                    # Map forward coordinates onto the reverse complement.
                    rev_start = sequence_length - stop
                    rev_stop = sequence_length - start
                    first_codon = reverse_sequence[rev_start:rev_start + 3]
                    last_codon = reverse_sequence[rev_stop - 2:rev_stop + 1]
                elif '+' in strand:
                    first_codon = sequence[start - 1:start + 2]
                    last_codon = sequence[stop - 3:stop]
                position_key = str(start) + ',' + str(stop)
                orfs_by_region[region][position_key] = [strand, first_codon, last_codon, 'CDS', 'EasyGene']

    for region in orfs_by_region:
        orfs_by_region[region] = sortORFs(orfs_by_region[region])
    return orfs_by_region
|
|
File without changes
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
import collections
|
|
2
|
+
|
|
3
|
+
try:
|
|
4
|
+
from utils import revCompIterative
|
|
5
|
+
from utils import sortORFs
|
|
6
|
+
except ImportError:
|
|
7
|
+
from ORForise.utils import revCompIterative
|
|
8
|
+
from ORForise.utils import sortORFs
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def FGENESB(*args):
    """Parse FGENESB ``>GENE`` records into per-region ORF dictionaries.

    Positional arguments:
        args[0]: Path to the FGENESB output file.
        args[1]: Mapping of region name to its sequence entry. The sequence
            is read from element ``[0]`` of the entry, or the entry itself is
            used when indexing raises IndexError (assumes the entry is a
            sequence container - TODO confirm against caller).

    Returns:
        If ``args[1]`` is empty: an OrderedDict keyed by the first token of
        every 10-token ``>GENE`` line, each mapped to an empty list.
        Otherwise: an OrderedDict ``region -> OrderedDict`` whose keys are
        ``"start,stop"`` and whose values are
        ``[strand, startCodon, stopCodon, 'CDS', 'FGENESB']``, each inner
        mapping ordered by ``sortORFs``.
    """
    tool_pred = args[0]
    dna_regions = args[1]
    if not dna_regions:  # This triggers if dna_regions is an empty dict (GFF_Intersect passed nothing)
        dna_regions = collections.OrderedDict()
        with open(tool_pred, 'r') as FGENESB_input:
            for line in FGENESB_input:
                line = line.split()
                if len(line) == 10 and ">GENE" in line[0] and line[0] not in dna_regions:
                    dna_regions[line[0]] = []  # Placeholder for genome sequence
        return dna_regions

    FGENESB_ORFs = collections.OrderedDict()
    for dna_region in dna_regions:
        FGENESB_ORFs[dna_region] = collections.OrderedDict()
    for dna_region in dna_regions:
        try:
            genome = dna_regions[dna_region][0]
        except IndexError:
            genome = dna_regions[dna_region]
        genome_size = len(genome)
        genome_rev = revCompIterative(genome)
        with open(tool_pred, 'r') as FGENESB_input:
            for line in FGENESB_input:
                if '>GENE' in line:
                    line = line.split()
                    if len(line) == 10 and dna_region in line[0] and ">GENE" in line[0]:
                        # Token 2 is the start, token 4 the stop, token 9 the strand.
                        start = int(line[2])
                        stop = int(line[4])
                        strand = line[9]
                        if '-' in strand:  # Reverse Compliment starts and stops adjusted
                            r_start = genome_size - stop
                            r_stop = genome_size - start
                            startCodon = genome_rev[r_start:r_start + 3]
                            stopCodon = genome_rev[r_stop - 2:r_stop + 1]
                        elif '+' in strand:
                            startCodon = genome[start - 1:start + 2]
                            stopCodon = genome[stop - 3:stop]
                        po = str(start) + ',' + str(stop)
                        orf = [strand, startCodon, stopCodon, 'CDS', 'FGENESB']
                        # Store under the owning region; top-level keys must
                        # stay region names for the sorting loop below.
                        FGENESB_ORFs[dna_region][po] = orf

    for group in FGENESB_ORFs:
        FGENESB_ORFs[group] = sortORFs(FGENESB_ORFs[group])
    return FGENESB_ORFs
|
|
File without changes
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
import collections
|
|
2
|
+
|
|
3
|
+
try:
|
|
4
|
+
from utils import revCompIterative
|
|
5
|
+
from utils import sortORFs
|
|
6
|
+
except ImportError:
|
|
7
|
+
from ORForise.utils import revCompIterative
|
|
8
|
+
from ORForise.utils import sortORFs
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def FragGeneScan(*args):
    """Parse FragGeneScan CDS predictions into per-region ORF dictionaries.

    Positional arguments:
        args[0]: Path to the FragGeneScan output file.
        args[1]: Mapping of region name to its sequence entry. The sequence
            is read from element ``[0]`` of the entry, or the entry itself is
            used when indexing raises IndexError (assumes the entry is a
            sequence container - TODO confirm against caller).

    Returns:
        If ``args[1]`` is empty: an OrderedDict mapping every region that has
        a 10-token ``FGS`` CDS line to an empty list placeholder.
        Otherwise: an OrderedDict ``region -> OrderedDict`` whose keys are
        ``"start,stop"`` and whose values are
        ``[strand, startCodon, stopCodon, 'CDS', 'FragGeneScan']``, each
        inner mapping ordered by ``sortORFs``.
    """
    tool_pred = args[0]
    dna_regions = args[1]
    if not dna_regions:  # This triggers if dna_regions is an empty dict (GFF_Intersect passed nothing)
        dna_regions = collections.OrderedDict()
        with open(tool_pred, 'r') as fragGeneScan_input:
            for line in fragGeneScan_input:
                line = line.split()
                if len(line) == 10 and "FGS" in line[1] and "CDS" in line[2] and line[0] not in dna_regions:
                    dna_regions[line[0]] = []  # Placeholder for genome sequence
        return dna_regions

    fragGeneScan_ORFs = collections.OrderedDict()
    for dna_region in dna_regions:
        fragGeneScan_ORFs[dna_region] = collections.OrderedDict()
    for dna_region in dna_regions:
        try:
            genome = dna_regions[dna_region][0]
        except IndexError:
            genome = dna_regions[dna_region]
        genome_size = len(genome)
        genome_rev = revCompIterative(genome)
        with open(tool_pred, 'r') as fragGeneScan_input:
            for line in fragGeneScan_input:
                line = line.split()
                if len(line) == 10 and "FGS" in line[1] and "CDS" in line[2] and dna_region in line[0]:
                    start = int(line[3])
                    stop = int(line[4])
                    strand = line[6]
                    if '-' in strand:  # Reverse Compliment starts and stops adjusted
                        r_start = genome_size - stop
                        r_stop = genome_size - start
                        startCodon = genome_rev[r_start:r_start + 3]
                        stopCodon = genome_rev[r_stop - 2:r_stop + 1]
                    elif '+' in strand:
                        startCodon = genome[start - 1:start + 2]
                        stopCodon = genome[stop - 3:stop]
                    po = str(start) + ',' + str(stop)
                    orf = [strand, startCodon, stopCodon, 'CDS', 'FragGeneScan']
                    # Store under the owning region; top-level keys must stay
                    # region names for the sorting loop below.
                    fragGeneScan_ORFs[dna_region][po] = orf

    for group in fragGeneScan_ORFs:
        fragGeneScan_ORFs[group] = sortORFs(fragGeneScan_ORFs[group])
    return fragGeneScan_ORFs
|
|
File without changes
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
import collections
|
|
2
|
+
try:
|
|
3
|
+
from utils import revCompIterative
|
|
4
|
+
from utils import sortORFs
|
|
5
|
+
except ImportError:
|
|
6
|
+
from ORForise.utils import revCompIterative
|
|
7
|
+
from ORForise.utils import sortORFs
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def GFF(*args):
    """Parse CDS features of a tab-separated GFF file into per-region ORF dictionaries.

    Positional arguments:
        args[0]: Path to the GFF file.
        args[1]: Mapping of region name to its sequence entry. The sequence
            is read from element ``[0]`` of the entry, or the entry itself is
            used when indexing raises IndexError (assumes the entry is a
            sequence container - TODO confirm against caller).

    Returns:
        If ``args[1]`` is empty: an OrderedDict mapping every region that has
        a 9-column CDS line to an empty list placeholder.
        Otherwise: an OrderedDict ``region -> OrderedDict`` whose keys are
        ``"start,stop"`` and whose values are
        ``[strand, startCodon, stopCodon, <feature type>, 'GFF-Standard']``,
        each inner mapping ordered by ``sortORFs``.

    Features whose stop is at or beyond the sequence length are treated as
    wrapping around to the start of the sequence when extracting codons.
    """
    tool_pred = args[0]
    dna_regions = args[1]
    if not dna_regions:  # This triggers if dna_regions is an empty dict (GFF_Intersect passed nothing)
        dna_regions = collections.OrderedDict()
        with open(tool_pred, 'r') as GFF_input:
            for line in GFF_input:
                # Same filter as the parse pass below: skip '#' lines, split
                # on tabs (attributes may contain spaces), and check the
                # column count before indexing.
                if '#' in line:
                    continue
                line = line.split('\t')
                if len(line) == 9 and 'CDS' in line[2] and line[0] not in dna_regions:
                    dna_regions[line[0]] = []  # Placeholder for genome sequence
        return dna_regions

    GFF_ORFs = collections.OrderedDict()
    for dna_region in dna_regions:
        GFF_ORFs[dna_region] = collections.OrderedDict()
    for dna_region in dna_regions:
        try:
            genome = dna_regions[dna_region][0]
        except IndexError:
            genome = dna_regions[dna_region]
        genome_size = len(genome)
        genome_rev = revCompIterative(genome)
        with open(tool_pred, 'r') as gff_input:
            for line in gff_input:
                if '#' not in line:
                    line = line.split('\t')
                    if 'CDS' in line[2:3] and len(line) == 9 and dna_region in line[0]:
                        pass
                    if len(line) == 9 and 'CDS' in line[2] and dna_region in line[0]:
                        start = int(line[3])
                        stop = int(line[4])
                        strand = line[6]
                        if stop >= genome_size:
                            # extra_stop is the number of bases that wrap past
                            # the end of the sequence (0 when the feature ends
                            # exactly on the last base).
                            extra_stop = stop - genome_size
                            corrected_stop = genome_size
                            if '-' in strand:  # Reverse Compliment starts and stops adjusted
                                r_start = genome_size - corrected_stop
                                r_stop = genome_size - start
                                seq = genome_rev[r_start:r_stop + 1]
                                # The wrapped bases are the last extra_stop
                                # bases of the reverse complement.
                                extra_seq = genome_rev[len(genome_rev) - extra_stop:]
                                seq = extra_seq + seq
                                startCodon = seq[:3]
                                stopCodon = seq[-3:]
                            elif '+' in strand:
                                seq = genome[start - 1:corrected_stop]
                                # The wrapped bases are the first extra_stop
                                # bases of the forward sequence.
                                extra_seq = genome[:extra_stop]
                                seq = seq + extra_seq
                                startCodon = seq[:3]
                                stopCodon = seq[-3:]
                        else:
                            if '-' in strand:  # Reverse Compliment starts and stops adjusted
                                r_start = genome_size - stop
                                r_stop = genome_size - start
                                startCodon = genome_rev[r_start:r_start + 3]
                                stopCodon = genome_rev[r_stop - 2:r_stop + 1]
                            elif '+' in strand:
                                startCodon = genome[start - 1:start + 2]
                                stopCodon = genome[stop - 3:stop]
                        po = str(start) + ',' + str(stop)
                        orf = [strand, startCodon, stopCodon, line[2], 'GFF-Standard']  # This needs to detect the type
                        # Store under the owning region; top-level keys must
                        # stay region names for the sorting loop below.
                        GFF_ORFs[dna_region][po] = orf

    for group in GFF_ORFs:
        GFF_ORFs[group] = sortORFs(GFF_ORFs[group])
    return GFF_ORFs
|
|
File without changes
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
import collections
|
|
2
|
+
|
|
3
|
+
try:
|
|
4
|
+
from utils import revCompIterative
|
|
5
|
+
from utils import sortORFs
|
|
6
|
+
except ImportError:
|
|
7
|
+
from ORForise.utils import revCompIterative
|
|
8
|
+
from ORForise.utils import sortORFs
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def GLIMMER_3(*args):
    """Parse GLIMMER3 ``orf`` prediction lines into per-region ORF dictionaries.

    Positional arguments:
        args[0]: Path to the GLIMMER3 predictions file.
        args[1]: Mapping of region name to its sequence entry. The sequence
            is read from element ``[0]`` of the entry, or the entry itself is
            used when indexing raises IndexError (assumes the entry is a
            sequence container - TODO confirm against caller).

    Returns:
        If ``args[1]`` is empty: an OrderedDict keyed by the first token of
        every 5-token ``orf`` line, each mapped to an empty list.
        Otherwise: an OrderedDict ``region -> OrderedDict`` whose keys are
        ``"start,stop"`` and whose values are
        ``[strand, startCodon, stopCodon, 'CDS', 'GLIMMER3']``, each inner
        mapping ordered by ``sortORFs``.
    """
    tool_pred = args[0]
    dna_regions = args[1]
    if not dna_regions:  # This triggers if dna_regions is an empty dict (GFF_Intersect passed nothing)
        dna_regions = collections.OrderedDict()
        with open(tool_pred, 'r') as GLIMMER_input:
            for line in GLIMMER_input:
                line = line.split()
                if len(line) == 5 and "orf" in line[0] and line[0] not in dna_regions:
                    dna_regions[line[0]] = []  # Placeholder for genome sequence
        return dna_regions

    GLIMMER_ORFs = collections.OrderedDict()
    for dna_region in dna_regions:
        GLIMMER_ORFs[dna_region] = collections.OrderedDict()
    for dna_region in dna_regions:
        try:
            genome = dna_regions[dna_region][0]
        except IndexError:
            genome = dna_regions[dna_region]
        genome_size = len(genome)
        genome_rev = revCompIterative(genome)
        # GLIMMER3 reverses the start and stop positions for ORFS on the negative strand
        with open(tool_pred, 'r') as glimmer_input:
            for line in glimmer_input:
                if '>' not in line:  # This will not work with multiple contigs
                    line = line.split()
                    if len(line) == 5 and "orf" in line[0] and dna_region in line[0]:
                        if '-' in line[3]:  # Reverse Compliment starts and stops adjusted - Switched to match Sense Strand
                            start = int(line[2])
                            stop = int(line[1])
                            strand = '-'
                            r_start = genome_size - stop
                            r_stop = genome_size - start
                            startCodon = genome_rev[r_start:r_start + 3]
                            stopCodon = genome_rev[r_stop - 2:r_stop + 1]
                        elif '+' in line[3]:
                            start = int(line[1])
                            stop = int(line[2])
                            strand = '+'
                            # Three bases starting at the 1-based start position.
                            startCodon = genome[start - 1:start + 2]
                            stopCodon = genome[stop - 3:stop]
                        po = str(start) + ',' + str(stop)
                        orf = [strand, startCodon, stopCodon, 'CDS', 'GLIMMER3']
                        # Store under the owning region; top-level keys must
                        # stay region names for the sorting loop below.
                        GLIMMER_ORFs[dna_region][po] = orf

    for group in GLIMMER_ORFs:
        GLIMMER_ORFs[group] = sortORFs(GLIMMER_ORFs[group])
    return GLIMMER_ORFs


# Alias so that loaders which look up an attribute named after the module
# ("GLIMMER3") find the parser as well.
GLIMMER3 = GLIMMER_3
|
|
File without changes
|