ORForise 1.6.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73) hide show
  1. ORForise/Aggregate_Compare.py +378 -0
  2. ORForise/Annotation_Compare.py +317 -0
  3. ORForise/Annotation_Intersector.py +726 -0
  4. ORForise/Aux/StORF_Undetected/Completely_Undetected/Completey_Undetected.py +53 -0
  5. ORForise/Aux/StORF_Undetected/Completely_Undetected/__init__.py +0 -0
  6. ORForise/Aux/StORF_Undetected/StORF_Undetected.py +35 -0
  7. ORForise/Aux/StORF_Undetected/__init__.py +0 -0
  8. ORForise/Aux/StORF_Undetected/unvitiated_Genes/__init__.py +0 -0
  9. ORForise/Aux/StORF_Undetected/unvitiated_Genes/unvitiated_Missed_Genes.py +46 -0
  10. ORForise/Aux/TabToGFF/TabToGFF.py +140 -0
  11. ORForise/Aux/TabToGFF/__init__.py +0 -0
  12. ORForise/Aux/__init__.py +0 -0
  13. ORForise/Comparator.py +882 -0
  14. ORForise/Convert_To_GFF.py +141 -0
  15. ORForise/GFF_Adder.py +543 -0
  16. ORForise/List_Tools.py +56 -0
  17. ORForise/ORForise_Analysis/__init__.py +0 -0
  18. ORForise/ORForise_Analysis/cds_checker.py +77 -0
  19. ORForise/ORForise_Analysis/gene_Lenghts.py +28 -0
  20. ORForise/ORForise_Analysis/genome_Metrics.py +258 -0
  21. ORForise/ORForise_Analysis/hypothetical_gene_predictions.py +88 -0
  22. ORForise/ORForise_Analysis/missed_Gene_Metrics.py +277 -0
  23. ORForise/ORForise_Analysis/parital_Match_Analysis.py +230 -0
  24. ORForise/ORForise_Analysis/result_File_Analysis.py +286 -0
  25. ORForise/ORForise_Analysis/start_Codon_Substitution.py +161 -0
  26. ORForise/StORForise.py +115 -0
  27. ORForise/Tools/Augustus/Augustus.py +54 -0
  28. ORForise/Tools/Augustus/__init__.py +0 -0
  29. ORForise/Tools/Balrog/Balrog.py +56 -0
  30. ORForise/Tools/Balrog/__init__.py +0 -0
  31. ORForise/Tools/EasyGene/EasyGene.py +55 -0
  32. ORForise/Tools/EasyGene/__init__.py +0 -0
  33. ORForise/Tools/FGENESB/FGENESB.py +57 -0
  34. ORForise/Tools/FGENESB/__init__.py +0 -0
  35. ORForise/Tools/FragGeneScan/FragGeneScan.py +54 -0
  36. ORForise/Tools/FragGeneScan/__init__.py +0 -0
  37. ORForise/Tools/GFF/GFF.py +77 -0
  38. ORForise/Tools/GFF/__init__.py +0 -0
  39. ORForise/Tools/GLIMMER3/GLIMMER3.py +59 -0
  40. ORForise/Tools/GLIMMER3/__init__.py +0 -0
  41. ORForise/Tools/GeneMark/GeneMark.py +135 -0
  42. ORForise/Tools/GeneMark/__init__.py +0 -0
  43. ORForise/Tools/GeneMarkHA/GeneMarkHA.py +54 -0
  44. ORForise/Tools/GeneMarkHA/__init__.py +0 -0
  45. ORForise/Tools/GeneMarkHMM/GeneMarkHMM.py +55 -0
  46. ORForise/Tools/GeneMarkHMM/__init__.py +0 -0
  47. ORForise/Tools/GeneMarkS/GeneMarkS.py +54 -0
  48. ORForise/Tools/GeneMarkS/__init__.py +0 -0
  49. ORForise/Tools/GeneMarkS2/GeneMarkS2.py +55 -0
  50. ORForise/Tools/GeneMarkS2/__init__.py +0 -0
  51. ORForise/Tools/MetaGene/MetaGene.py +54 -0
  52. ORForise/Tools/MetaGene/__init__.py +0 -0
  53. ORForise/Tools/MetaGeneAnnotator/MetaGeneAnnotator.py +55 -0
  54. ORForise/Tools/MetaGeneAnnotator/__init__.py +0 -0
  55. ORForise/Tools/MetaGeneMark/MetaGeneMark.py +55 -0
  56. ORForise/Tools/MetaGeneMark/__init__.py +0 -0
  57. ORForise/Tools/Prodigal/Prodigal.py +55 -0
  58. ORForise/Tools/Prodigal/__init__.py +0 -0
  59. ORForise/Tools/Prokka/Prokka.py +57 -0
  60. ORForise/Tools/Prokka/__init__.py +0 -0
  61. ORForise/Tools/StORF-Reporter/StORF-Reporter.py +56 -0
  62. ORForise/Tools/StORF-Reporter/__init__.py +0 -0
  63. ORForise/Tools/TransDecoder/TransDecoder.py +54 -0
  64. ORForise/Tools/TransDecoder/__init__.py +0 -0
  65. ORForise/Tools/__init__.py +0 -0
  66. ORForise/__init__.py +0 -0
  67. ORForise/utils.py +236 -0
  68. orforise-1.6.2.dist-info/METADATA +1038 -0
  69. orforise-1.6.2.dist-info/RECORD +73 -0
  70. orforise-1.6.2.dist-info/WHEEL +5 -0
  71. orforise-1.6.2.dist-info/entry_points.txt +15 -0
  72. orforise-1.6.2.dist-info/licenses/LICENSE +624 -0
  73. orforise-1.6.2.dist-info/top_level.txt +1 -0
ORForise/StORForise.py ADDED
@@ -0,0 +1,115 @@
1
+ from importlib import import_module
2
+ import argparse
3
+ import csv
4
+
5
+ try:
6
+ from .Comparator import tool_comparison
7
+ from .utils import *
8
+ except (ImportError, ModuleNotFoundError):
9
+ from Comparator import tool_comparison
10
+ from utils import *
11
+
12
+ ###################
13
+
14
+
15
def comparator(tool, input_to_analyse, storfs_to_find_missing, genome_to_compare):
    """Compare tool predictions against a set of missed genes and write a CSV report.

    Args:
        tool: Name of the prediction tool/format. It selects the parser module
            ``Tools.<tool>.<tool>`` and the function of the same name inside it.
        input_to_analyse: Path to a file whose ``>`` header lines encode the
            missed genes as ``<name>_<start>_<stop>_<strand>``.
        storfs_to_find_missing: Path to the prediction file handed to the
            selected tool parser.
        genome_to_compare: Path to the genome FASTA file. Its base name (up to
            the first ``.``) is also used to name the report.

    Side effects:
        Writes ``Tools/<tool>/<tool>_<genome base name>.csv`` relative to the
        current working directory.
    """
    # Imported locally so the block does not depend on a star-import exposing it.
    import collections

    # Concatenate every non-header FASTA line into one sequence string.
    with open(genome_to_compare, 'r') as genome:
        genome_Seq = "".join(line.replace("\n", "") for line in genome if ">" not in line)

    ##############################################
    # Collect the missed genes, keyed by their order of appearance.
    # NOTE(review): fields are taken by fixed position, so a name that itself
    # contains '_' would shift them - confirm the header format with the producer.
    genes = collections.OrderedDict()
    with open(input_to_analyse, 'r') as missed_gene_file:
        for line in missed_gene_file:
            if ">" in line:
                fields = line.strip().split('_')
                genes[len(genes)] = [int(fields[1]), int(fields[2]), fields[3]]

    ##################################
    # Resolve the parser both when run from the source directory and when the
    # package is installed (mirrors the import fallback at the top of the file).
    module_name = 'Tools.' + tool + '.' + tool
    try:
        tool_module = import_module(module_name)
    except ModuleNotFoundError:
        tool_module = import_module('ORForise.' + module_name)
    tool_predictions = getattr(tool_module, tool)
    orfs = tool_predictions(storfs_to_find_missing, genome_Seq)

    (all_Metrics, all_rep_Metrics, start_precision, stop_precision, other_starts, other_stops,
     perfect_Matches, missed_genes, unmatched_orfs, undetected_gene_metrics, unmatched_orf_metrics,
     orf_Coverage_Genome, matched_ORF_Coverage_Genome, gene_coverage_genome, multi_Matched_ORFs,
     partial_Hits) = tool_comparison(genes, orfs, genome_Seq, True)

    outname = tool + '_' + genome_to_compare.split('/')[-1].split('.')[0]
    metric_description = list(all_Metrics.keys())
    metrics = list(all_Metrics.values())
    rep_metric_description = list(all_rep_Metrics.keys())
    rep_metrics = list(all_rep_Metrics.values())
    codon_metric_header = [
        'ATG_Start,GTG_Start,TTG_Start,ATT_Start,CTG_Start,Alternative_Start_Codon,TGA_Stop,TAA_Stop,TAG_Stop,Alternative_Stop_Codon,Median_Length,ORFs_on_Positive_Strand,ORFs_on_Negative_Strand']

    with open("Tools/" + tool + '/' + outname + '.csv', 'w', newline='\n',
              encoding='utf-8') as out_file:  # Clear write out of report
        tool_out = csv.writer(out_file, quoting=csv.QUOTE_NONE, escapechar=" ")
        tool_out.writerow(['Representative_Metrics:'])
        tool_out.writerow(rep_metric_description)
        tool_out.writerow(rep_metrics)
        tool_out.writerow(['All_Metrics:'])
        tool_out.writerow(metric_description)
        tool_out.writerow(metrics)
        tool_out.writerow(['CDS_Gene_Coverage_of_Genome:'])
        tool_out.writerow([gene_coverage_genome])
        tool_out.writerow(['Start_Position_Difference:'])
        tool_out.writerow(start_precision)
        tool_out.writerow(['Stop_Position_Difference:'])
        tool_out.writerow(stop_precision)
        tool_out.writerow(['Alternative_Starts_Predicted:'])
        tool_out.writerow(other_starts)
        tool_out.writerow(['Alternative_Stops_Predicted:'])
        tool_out.writerow(other_stops)

        tool_out.writerow(['Undetected_Gene_Metrics:'])
        tool_out.writerow(codon_metric_header)
        tool_out.writerow(undetected_gene_metrics)
        tool_out.writerow(['Undetected_Genes:'])
        for key, value in missed_genes.items():
            key = key.split(',')
            header = ('>' + genome_to_compare + '_' + key[0] + '_' + key[1] + '_' + key[2])
            tool_out.writerow([header + '\n' + value])

        tool_out.writerow(['\nORFs_Without_Corresponding_Gene_In_Ensembl_Metrics:'])
        tool_out.writerow(codon_metric_header)
        tool_out.writerow(unmatched_orf_metrics)
        tool_out.writerow(['ORF_Without_Corresponding_Gene_in_Ensembl:'])
        for key, value in unmatched_orfs.items():
            key = key.split(',')
            header = ('>' + tool + '_' + key[0] + '_' + key[1] + '_' + key[2])
            tool_out.writerow([header + '\n' + value])

        tool_out.writerow(['\nORFs_Which_Detected_more_than_one_Gene:'])
        for key, value in multi_Matched_ORFs.items():
            # Skip only the malformed entry instead of silently abandoning
            # every remaining entry on the first IndexError.
            try:
                orf_pos = key.split(',')
                gene_pos = value[1].split(',')
                multi = ('ORF:' + orf_pos[0] + '-' + orf_pos[1] + '_Gene:' + gene_pos[0] + '-' + gene_pos[1])
            except IndexError:
                continue
            tool_out.writerow([multi])

        tool_out.writerow(['\n\nPartial_Gene_Hits:'])
        for key, seqs in partial_Hits.items():
            key = key.split(';')
            gene_Seq = seqs[0]
            orf_Seq = seqs[1]
            partial = (key[0] + '\n' + gene_Seq + '\n' + key[1] + '\n' + orf_Seq + '\n')
            tool_out.writerow([partial])
101
+
102
+
103
def main():
    """Command-line entry point: parse the StORForise options and run ``comparator``."""
    print(WELCOME)
    parser = argparse.ArgumentParser(description='ORForise ' + ORForise_Version + ': StORForise Run Parameters.')

    # (short flag, long flag, default, help text) for every supported option.
    option_specs = (
        ('-t', '--tool', 'GFF', 'Which tool/format would you analyse with StORF-R?'),
        ('-i', '--input_to_analyse', '', 'Location of file containing missed genes'),
        ('-stf', '--storfs_to_find_missing', '', 'STORFs to find missing.'),
        ('-g', '--genome_to_compare', '', 'Which genome to analyse?'),
    )
    for short_flag, long_flag, default_value, help_text in option_specs:
        parser.add_argument(short_flag, long_flag, default=default_value, help=help_text)

    # The option destinations match comparator's parameter names one-to-one.
    cli_options = vars(parser.parse_args())
    comparator(**cli_options)


if __name__ == "__main__":
    main()
@@ -0,0 +1,54 @@
1
+ import collections
2
+
3
+ try:
4
+ from utils import revCompIterative
5
+ from utils import sortORFs
6
+ except ImportError:
7
+ from ORForise.utils import revCompIterative
8
+ from ORForise.utils import sortORFs
9
+
10
+
11
def Augustus(*args):
    """Parse Augustus CDS predictions into per-region ORF dictionaries.

    Positional arguments (received through ``*args``):
        args[0]: Path to the Augustus prediction file.
        args[1]: Mapping of region name -> genome data. When it is empty the
            function only discovers which regions the file mentions.

    Returns:
        Discovery mode (``args[1]`` empty): an ``OrderedDict`` mapping every
        region name found on a CDS row to an empty-list placeholder.
        Otherwise: an ``OrderedDict`` mapping each region name to the result of
        ``sortORFs`` applied to its ``"start,stop" -> [strand, start codon,
        stop codon, 'CDS', 'Augustus']`` entries.
    """
    tool_pred = args[0]
    dna_regions = args[1]
    if not dna_regions:  # Empty mapping: only report which regions are present
        # The parse pass below accepts 12-token rows; 10-token rows are kept
        # because the discovery pass historically accepted them.
        discoverable_lengths = (10, 12)
        dna_regions = collections.OrderedDict()
        with open(tool_pred, 'r') as augustus_input:
            for line in augustus_input:
                fields = line.split()
                if len(fields) in discoverable_lengths and "CDS" in fields[2] and fields[0] not in dna_regions:
                    dna_regions[fields[0]] = []  # Placeholder for genome sequence
        return dna_regions

    augustus_ORFs = collections.OrderedDict()
    for dna_region in dna_regions:
        augustus_ORFs[dna_region] = collections.OrderedDict()
    for dna_region in dna_regions:
        try:
            genome = dna_regions[dna_region][0]
        except IndexError:
            genome = dna_regions[dna_region]
        genome_size = len(genome)
        genome_rev = revCompIterative(genome)
        with open(tool_pred, 'r') as augustus_input:
            for line in augustus_input:
                fields = line.split()
                # NOTE(review): region matching is a substring test, so 'contig1'
                # also matches rows for 'contig10' - confirm this is intended.
                if not (len(fields) == 12 and dna_region in fields[0] and "CDS" in fields[2]):
                    continue
                start = int(fields[3])
                stop = int(fields[4])
                strand = fields[6]
                if '-' in strand:  # Reverse complement: coordinates mirrored onto genome_rev
                    r_start = genome_size - stop
                    r_stop = genome_size - start
                    startCodon = genome_rev[r_start:r_start + 3]
                    stopCodon = genome_rev[r_stop - 2:r_stop + 1]
                elif '+' in strand:
                    startCodon = genome[start - 1:start + 2]
                    stopCodon = genome[stop - 3:stop]
                else:
                    # No usable strand: skip rather than reuse codons from a previous row.
                    continue
                po = str(start) + ',' + str(stop)
                orf = [strand, startCodon, stopCodon, 'CDS', 'Augustus']
                # Store under the region so the per-region sort below receives ORF dicts.
                augustus_ORFs[dna_region][po] = orf

    for group in augustus_ORFs:
        augustus_ORFs[group] = sortORFs(augustus_ORFs[group])
    return augustus_ORFs
File without changes
@@ -0,0 +1,56 @@
1
+ import collections
2
+
3
+ try:
4
+ from utils import revCompIterative
5
+ from utils import sortORFs
6
+ except ImportError:
7
+ from ORForise.utils import revCompIterative
8
+ from ORForise.utils import sortORFs
9
+
10
+
11
def Balrog(*args):
    """Parse Balrog CDS predictions into per-region ORF dictionaries.

    Positional arguments (received through ``*args``):
        args[0]: Path to the Balrog prediction file.
        args[1]: Mapping of region name -> genome data. When it is empty the
            function only discovers which regions the file mentions.

    Returns:
        Discovery mode (``args[1]`` empty): an ``OrderedDict`` mapping every
        region name found on a CDS row to an empty-list placeholder.
        Otherwise: an ``OrderedDict`` mapping each region name to the result of
        ``sortORFs`` applied to its ``"start,stop" -> [strand, start codon,
        stop codon, 'CDS', 'Balrog']`` entries.
    """
    tool_pred = args[0]
    dna_regions = args[1]
    if not dna_regions:  # Empty mapping: only report which regions are present
        dna_regions = collections.OrderedDict()
        with open(tool_pred, 'r') as balrog_input:
            for line in balrog_input:
                fields = line.split()
                # Length guard: header and blank lines have fewer than 3 tokens.
                if len(fields) > 2 and "CDS" in fields[2] and fields[0] not in dna_regions:
                    dna_regions[fields[0]] = []  # Placeholder for genome sequence
        return dna_regions

    Balrog_ORFs = collections.OrderedDict()
    for dna_region in dna_regions:
        Balrog_ORFs[dna_region] = collections.OrderedDict()
    for dna_region in dna_regions:
        try:
            genome = dna_regions[dna_region][0]
        except IndexError:
            genome = dna_regions[dna_region]
        genome_size = len(genome)
        genome_rev = revCompIterative(genome)

        with open(tool_pred, 'r') as balrog_input:
            for line in balrog_input:
                # Only true comment lines are skipped; a '#' inside a data row is fine.
                if line.startswith('#'):
                    continue
                fields = line.split('\t')
                # Need at least the strand column (index 6) to parse the row.
                if len(fields) < 7:
                    continue
                # NOTE(review): region matching is a substring test, so 'contig1'
                # also matches rows for 'contig10' - confirm this is intended.
                if not ("CDS" in fields[2] and dna_region in fields[0]):
                    continue
                start = int(fields[3])
                stop = int(fields[4])
                strand = fields[6]
                if '-' in strand:  # Reverse complement: coordinates mirrored onto genome_rev
                    r_start = genome_size - stop
                    r_stop = genome_size - start
                    startCodon = genome_rev[r_start:r_start + 3]
                    stopCodon = genome_rev[r_stop - 2:r_stop + 1]
                elif '+' in strand:
                    startCodon = genome[start - 1:start + 2]
                    stopCodon = genome[stop - 3:stop]
                else:
                    # No usable strand: skip rather than reuse codons from a previous row.
                    continue
                po = str(start) + ',' + str(stop)
                orf = [strand, startCodon, stopCodon, 'CDS', 'Balrog']
                # Store under the region so the per-region sort below receives ORF dicts.
                Balrog_ORFs[dna_region][po] = orf

    for group in Balrog_ORFs:
        Balrog_ORFs[group] = sortORFs(Balrog_ORFs[group])
    return Balrog_ORFs
File without changes
@@ -0,0 +1,55 @@
1
+ import collections
2
+
3
+ try:
4
+ from utils import revCompIterative
5
+ from utils import sortORFs
6
+ except ImportError:
7
+ from ORForise.utils import revCompIterative
8
+ from ORForise.utils import sortORFs
9
+
10
+
11
def EasyGene(*args):
    """Parse EasyGene CDS predictions into per-region ORF dictionaries.

    Positional arguments (received through ``*args``):
        args[0]: Path to the EasyGene prediction file.
        args[1]: Mapping of region name -> genome data. When it is empty the
            function only discovers which regions the file mentions.

    Returns:
        Discovery mode (``args[1]`` empty): an ``OrderedDict`` mapping every
        region name found on a 10-token CDS row to an empty-list placeholder.
        Otherwise: an ``OrderedDict`` mapping each region name to the result of
        ``sortORFs`` applied to its ``"start,stop" -> [strand, start codon,
        stop codon, 'CDS', 'EasyGene']`` entries.
    """
    tool_pred = args[0]
    dna_regions = args[1]
    if not dna_regions:  # Empty mapping: only report which regions are present
        dna_regions = collections.OrderedDict()
        with open(tool_pred, 'r') as easygene_input:
            for line in easygene_input:
                fields = line.split()
                # Tokens from str.split() are never empty, so no extra truthiness test is needed.
                if len(fields) == 10 and "CDS" in fields[2] and fields[0] not in dna_regions:
                    dna_regions[fields[0]] = []  # Placeholder for genome sequence
        return dna_regions

    easyGene_ORFs = collections.OrderedDict()
    for dna_region in dna_regions:
        easyGene_ORFs[dna_region] = collections.OrderedDict()
    for dna_region in dna_regions:
        try:
            genome = dna_regions[dna_region][0]
        except IndexError:
            genome = dna_regions[dna_region]
        genome_size = len(genome)
        genome_rev = revCompIterative(genome)
        with open(tool_pred, 'r') as easygene_input:
            for line in easygene_input:
                fields = line.split()
                # NOTE(review): region matching is a substring test, so 'contig1'
                # also matches rows for 'contig10' - confirm this is intended.
                if not (len(fields) == 10 and dna_region in fields[0] and "CDS" in fields[2]):
                    continue
                start = int(fields[3])
                stop = int(fields[4])
                strand = fields[6]
                if '-' in strand:  # Reverse complement: coordinates mirrored onto genome_rev
                    r_start = genome_size - stop
                    r_stop = genome_size - start
                    startCodon = genome_rev[r_start:r_start + 3]
                    stopCodon = genome_rev[r_stop - 2:r_stop + 1]
                elif '+' in strand:
                    startCodon = genome[start - 1:start + 2]
                    stopCodon = genome[stop - 3:stop]
                else:
                    # No usable strand: skip rather than reuse codons from a previous row.
                    continue
                po = str(start) + ',' + str(stop)
                orf = [strand, startCodon, stopCodon, 'CDS', 'EasyGene']
                easyGene_ORFs[dna_region][po] = orf

    for group in easyGene_ORFs:
        easyGene_ORFs[group] = sortORFs(easyGene_ORFs[group])
    return easyGene_ORFs
File without changes
@@ -0,0 +1,57 @@
1
+ import collections
2
+
3
+ try:
4
+ from utils import revCompIterative
5
+ from utils import sortORFs
6
+ except ImportError:
7
+ from ORForise.utils import revCompIterative
8
+ from ORForise.utils import sortORFs
9
+
10
+
11
def FGENESB(*args):
    """Parse FGENESB ``>GENE`` rows into per-region ORF dictionaries.

    Positional arguments (received through ``*args``):
        args[0]: Path to the FGENESB prediction file.
        args[1]: Mapping of region name -> genome data. When it is empty the
            function only discovers region keys from the file.

    Returns:
        Discovery mode (``args[1]`` empty): an ``OrderedDict`` keyed by the
        first token of every 10-token ``>GENE`` row, each mapped to an
        empty-list placeholder.
        Otherwise: an ``OrderedDict`` mapping each region name to the result of
        ``sortORFs`` applied to its ``"start,stop" -> [strand, start codon,
        stop codon, 'CDS', 'FGENESB']`` entries.
    """
    tool_pred = args[0]
    dna_regions = args[1]
    if not dna_regions:  # Empty mapping: only report which region keys are present
        dna_regions = collections.OrderedDict()
        with open(tool_pred, 'r') as fgenesb_input:
            for line in fgenesb_input:
                fields = line.split()
                if len(fields) == 10 and ">GENE" in fields[0] and fields[0] not in dna_regions:
                    dna_regions[fields[0]] = []  # Placeholder for genome sequence
        return dna_regions

    FGENESB_ORFs = collections.OrderedDict()
    for dna_region in dna_regions:
        FGENESB_ORFs[dna_region] = collections.OrderedDict()
    for dna_region in dna_regions:
        try:
            genome = dna_regions[dna_region][0]
        except IndexError:
            genome = dna_regions[dna_region]
        genome_size = len(genome)
        genome_rev = revCompIterative(genome)
        with open(tool_pred, 'r') as fgenesb_input:
            for line in fgenesb_input:
                if '>GENE' not in line:
                    continue
                fields = line.split()
                if not (len(fields) == 10 and dna_region in fields[0] and ">GENE" in fields[0]):
                    continue
                # Token positions used by this parser: 2 = start, 4 = stop, 9 = strand.
                start = int(fields[2])
                stop = int(fields[4])
                strand = fields[9]
                if '-' in strand:  # Reverse complement: coordinates mirrored onto genome_rev
                    r_start = genome_size - stop
                    r_stop = genome_size - start
                    startCodon = genome_rev[r_start:r_start + 3]
                    stopCodon = genome_rev[r_stop - 2:r_stop + 1]
                elif '+' in strand:
                    startCodon = genome[start - 1:start + 2]
                    stopCodon = genome[stop - 3:stop]
                else:
                    # No usable strand: skip rather than reuse codons from a previous row.
                    continue
                po = str(start) + ',' + str(stop)
                orf = [strand, startCodon, stopCodon, 'CDS', 'FGENESB']
                # Store under the region so the per-region sort below receives ORF dicts.
                FGENESB_ORFs[dna_region][po] = orf

    for group in FGENESB_ORFs:
        FGENESB_ORFs[group] = sortORFs(FGENESB_ORFs[group])
    return FGENESB_ORFs
File without changes
@@ -0,0 +1,54 @@
1
+ import collections
2
+
3
+ try:
4
+ from utils import revCompIterative
5
+ from utils import sortORFs
6
+ except ImportError:
7
+ from ORForise.utils import revCompIterative
8
+ from ORForise.utils import sortORFs
9
+
10
+
11
def FragGeneScan(*args):
    """Parse FragGeneScan CDS predictions into per-region ORF dictionaries.

    Positional arguments (received through ``*args``):
        args[0]: Path to the FragGeneScan prediction file.
        args[1]: Mapping of region name -> genome data. When it is empty the
            function only discovers which regions the file mentions.

    Returns:
        Discovery mode (``args[1]`` empty): an ``OrderedDict`` mapping every
        region name found on a 10-token ``FGS`` CDS row to an empty-list
        placeholder.
        Otherwise: an ``OrderedDict`` mapping each region name to the result of
        ``sortORFs`` applied to its ``"start,stop" -> [strand, start codon,
        stop codon, 'CDS', 'FragGeneScan']`` entries.
    """
    tool_pred = args[0]
    dna_regions = args[1]
    if not dna_regions:  # Empty mapping: only report which regions are present
        dna_regions = collections.OrderedDict()
        with open(tool_pred, 'r') as fgs_input:
            for line in fgs_input:
                fields = line.split()
                if (len(fields) == 10 and "FGS" in fields[1] and "CDS" in fields[2]
                        and fields[0] not in dna_regions):
                    dna_regions[fields[0]] = []  # Placeholder for genome sequence
        return dna_regions

    fragGeneScan_ORFs = collections.OrderedDict()
    for dna_region in dna_regions:
        fragGeneScan_ORFs[dna_region] = collections.OrderedDict()
    for dna_region in dna_regions:
        try:
            genome = dna_regions[dna_region][0]
        except IndexError:
            genome = dna_regions[dna_region]
        genome_size = len(genome)
        genome_rev = revCompIterative(genome)
        with open(tool_pred, 'r') as fgs_input:
            for line in fgs_input:
                fields = line.split()
                # NOTE(review): region matching is a substring test, so 'contig1'
                # also matches rows for 'contig10' - confirm this is intended.
                if not (len(fields) == 10 and "FGS" in fields[1] and "CDS" in fields[2]
                        and dna_region in fields[0]):
                    continue
                start = int(fields[3])
                stop = int(fields[4])
                strand = fields[6]
                if '-' in strand:  # Reverse complement: coordinates mirrored onto genome_rev
                    r_start = genome_size - stop
                    r_stop = genome_size - start
                    startCodon = genome_rev[r_start:r_start + 3]
                    stopCodon = genome_rev[r_stop - 2:r_stop + 1]
                elif '+' in strand:
                    startCodon = genome[start - 1:start + 2]
                    stopCodon = genome[stop - 3:stop]
                else:
                    # No usable strand: skip rather than reuse codons from a previous row.
                    continue
                po = str(start) + ',' + str(stop)
                orf = [strand, startCodon, stopCodon, 'CDS', 'FragGeneScan']
                # Store under the region so the per-region sort below receives ORF dicts.
                fragGeneScan_ORFs[dna_region][po] = orf

    for group in fragGeneScan_ORFs:
        fragGeneScan_ORFs[group] = sortORFs(fragGeneScan_ORFs[group])
    return fragGeneScan_ORFs
File without changes
@@ -0,0 +1,77 @@
1
+ import collections
2
+ try:
3
+ from utils import revCompIterative
4
+ from utils import sortORFs
5
+ except ImportError:
6
+ from ORForise.utils import revCompIterative
7
+ from ORForise.utils import sortORFs
8
+
9
+
10
def GFF(*args):
    """Parse CDS features from a GFF file into per-region ORF dictionaries.

    Positional arguments (received through ``*args``):
        args[0]: Path to the GFF file.
        args[1]: Mapping of region name -> genome data. When it is empty the
            function only discovers which regions the file mentions.

    Returns:
        Discovery mode (``args[1]`` empty): an ``OrderedDict`` mapping every
        region name found on a 9-column CDS row to an empty-list placeholder.
        Otherwise: an ``OrderedDict`` mapping each region name to the result of
        ``sortORFs`` applied to its ``"start,stop" -> [strand, start codon,
        stop codon, feature type, 'GFF-Standard']`` entries.

    A feature whose stop lies beyond the end of the sequence is treated as
    wrapping around to the beginning of the sequence.
    """
    tool_pred = args[0]
    dna_regions = args[1]
    if not dna_regions:  # Empty mapping: only report which regions are present
        dna_regions = collections.OrderedDict()
        with open(tool_pred, 'r') as gff_input:
            for line in gff_input:
                if line.startswith('#'):
                    continue
                # Prefer tab-splitting (what the parse pass uses) so attributes
                # containing spaces still count as one column; fall back to
                # whitespace-splitting, which this pass historically used.
                fields = line.split('\t')
                if len(fields) != 9:
                    fields = line.split()
                if len(fields) == 9 and 'CDS' in fields[2] and fields[0] not in dna_regions:
                    dna_regions[fields[0]] = []  # Placeholder for genome sequence
        return dna_regions

    GFF_ORFs = collections.OrderedDict()
    for dna_region in dna_regions:
        GFF_ORFs[dna_region] = collections.OrderedDict()
    for dna_region in dna_regions:
        try:
            genome = dna_regions[dna_region][0]
        except IndexError:
            genome = dna_regions[dna_region]
        genome_size = len(genome)
        genome_rev = revCompIterative(genome)
        with open(tool_pred, 'r') as gff_input:
            for line in gff_input:
                # Only true comment lines are skipped; a '#' inside a data row is fine.
                if line.startswith('#'):
                    continue
                fields = line.split('\t')
                # Check the column count first so short/blank lines cannot raise IndexError.
                # NOTE(review): region matching is a substring test, so 'contig1'
                # also matches rows for 'contig10' - confirm this is intended.
                if not (len(fields) == 9 and 'CDS' in fields[2] and dna_region in fields[0]):
                    continue
                start = int(fields[3])
                stop = int(fields[4])
                strand = fields[6]
                # Bases that run past the end of the sequence and wrap to its start.
                overhang = stop - genome_size
                if '-' in strand:  # Reverse complement: coordinates mirrored onto genome_rev
                    r_stop = genome_size - start
                    if overhang > 0:
                        # The wrapped bases sit at the tail of genome_rev and are read first.
                        seq = genome_rev[-overhang:] + genome_rev[:r_stop + 1]
                        startCodon = seq[:3]
                        stopCodon = seq[-3:]
                    else:
                        r_start = genome_size - stop
                        startCodon = genome_rev[r_start:r_start + 3]
                        stopCodon = genome_rev[r_stop - 2:r_stop + 1]
                elif '+' in strand:
                    if overhang > 0:
                        seq = genome[start - 1:genome_size] + genome[:overhang]
                        startCodon = seq[:3]
                        stopCodon = seq[-3:]
                    else:
                        startCodon = genome[start - 1:start + 2]
                        stopCodon = genome[stop - 3:stop]
                else:
                    # No usable strand: skip rather than reuse codons from a previous row.
                    continue
                po = str(start) + ',' + str(stop)
                orf = [strand, startCodon, stopCodon, fields[2], 'GFF-Standard']  # This needs to detect the type
                # Store under the region so the per-region sort below receives ORF dicts.
                GFF_ORFs[dna_region][po] = orf

    for group in GFF_ORFs:
        GFF_ORFs[group] = sortORFs(GFF_ORFs[group])
    return GFF_ORFs
File without changes
@@ -0,0 +1,59 @@
1
+ import collections
2
+
3
+ try:
4
+ from utils import revCompIterative
5
+ from utils import sortORFs
6
+ except ImportError:
7
+ from ORForise.utils import revCompIterative
8
+ from ORForise.utils import sortORFs
9
+
10
+
11
def GLIMMER_3(*args):
    """Parse GLIMMER3 ``orf`` rows into per-region ORF dictionaries.

    Positional arguments (received through ``*args``):
        args[0]: Path to the GLIMMER3 prediction file.
        args[1]: Mapping of region name -> genome data. When it is empty the
            function only discovers region keys from the file.

    Returns:
        Discovery mode (``args[1]`` empty): an ``OrderedDict`` keyed by the
        first token of every 5-token ``orf`` row, each mapped to an empty-list
        placeholder.
        Otherwise: an ``OrderedDict`` mapping each region name to the result of
        ``sortORFs`` applied to its ``"start,stop" -> [strand, start codon,
        stop codon, 'CDS', 'GLIMMER3']`` entries.
    """
    tool_pred = args[0]
    dna_regions = args[1]
    if not dna_regions:  # Empty mapping: only report which region keys are present
        dna_regions = collections.OrderedDict()
        with open(tool_pred, 'r') as glimmer_input:
            for line in glimmer_input:
                fields = line.split()
                if len(fields) == 5 and "orf" in fields[0] and fields[0] not in dna_regions:
                    dna_regions[fields[0]] = []  # Placeholder for genome sequence
        return dna_regions

    GLIMMER_ORFs = collections.OrderedDict()
    for dna_region in dna_regions:
        GLIMMER_ORFs[dna_region] = collections.OrderedDict()
    for dna_region in dna_regions:
        try:
            genome = dna_regions[dna_region][0]
        except IndexError:
            genome = dna_regions[dna_region]
        genome_size = len(genome)
        genome_rev = revCompIterative(genome)
        # GLIMMER3 reverses the start and stop positions for ORFs on the negative strand
        with open(tool_pred, 'r') as glimmer_input:
            for line in glimmer_input:
                if '>' in line:  # This will not work with multiple contigs
                    continue
                fields = line.split()
                if not (len(fields) == 5 and "orf" in fields[0] and dna_region in fields[0]):
                    continue
                if '-' in fields[3]:  # Columns swapped back so start < stop on the sense strand
                    start = int(fields[2])
                    stop = int(fields[1])
                    strand = '-'
                    r_start = genome_size - stop
                    r_stop = genome_size - start
                    startCodon = genome_rev[r_start:r_start + 3]
                    stopCodon = genome_rev[r_stop - 2:r_stop + 1]
                elif '+' in fields[3]:
                    start = int(fields[1])
                    stop = int(fields[2])
                    strand = '+'
                    # Three bases, matching the stop-codon slice and the minus-strand branch.
                    startCodon = genome[start - 1:start + 2]
                    stopCodon = genome[stop - 3:stop]
                else:
                    # No usable strand: skip rather than reuse values from a previous row.
                    continue
                po = str(start) + ',' + str(stop)
                orf = [strand, startCodon, stopCodon, 'CDS', 'GLIMMER3']
                # Store under the region so the per-region sort below receives ORF dicts.
                GLIMMER_ORFs[dna_region][po] = orf

    for group in GLIMMER_ORFs:
        GLIMMER_ORFs[group] = sortORFs(GLIMMER_ORFs[group])
    return GLIMMER_ORFs
File without changes