ORForise 1.6.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73) hide show
  1. ORForise/Aggregate_Compare.py +378 -0
  2. ORForise/Annotation_Compare.py +317 -0
  3. ORForise/Annotation_Intersector.py +726 -0
  4. ORForise/Aux/StORF_Undetected/Completely_Undetected/Completey_Undetected.py +53 -0
  5. ORForise/Aux/StORF_Undetected/Completely_Undetected/__init__.py +0 -0
  6. ORForise/Aux/StORF_Undetected/StORF_Undetected.py +35 -0
  7. ORForise/Aux/StORF_Undetected/__init__.py +0 -0
  8. ORForise/Aux/StORF_Undetected/unvitiated_Genes/__init__.py +0 -0
  9. ORForise/Aux/StORF_Undetected/unvitiated_Genes/unvitiated_Missed_Genes.py +46 -0
  10. ORForise/Aux/TabToGFF/TabToGFF.py +140 -0
  11. ORForise/Aux/TabToGFF/__init__.py +0 -0
  12. ORForise/Aux/__init__.py +0 -0
  13. ORForise/Comparator.py +882 -0
  14. ORForise/Convert_To_GFF.py +141 -0
  15. ORForise/GFF_Adder.py +543 -0
  16. ORForise/List_Tools.py +56 -0
  17. ORForise/ORForise_Analysis/__init__.py +0 -0
  18. ORForise/ORForise_Analysis/cds_checker.py +77 -0
  19. ORForise/ORForise_Analysis/gene_Lenghts.py +28 -0
  20. ORForise/ORForise_Analysis/genome_Metrics.py +258 -0
  21. ORForise/ORForise_Analysis/hypothetical_gene_predictions.py +88 -0
  22. ORForise/ORForise_Analysis/missed_Gene_Metrics.py +277 -0
  23. ORForise/ORForise_Analysis/parital_Match_Analysis.py +230 -0
  24. ORForise/ORForise_Analysis/result_File_Analysis.py +286 -0
  25. ORForise/ORForise_Analysis/start_Codon_Substitution.py +161 -0
  26. ORForise/StORForise.py +115 -0
  27. ORForise/Tools/Augustus/Augustus.py +54 -0
  28. ORForise/Tools/Augustus/__init__.py +0 -0
  29. ORForise/Tools/Balrog/Balrog.py +56 -0
  30. ORForise/Tools/Balrog/__init__.py +0 -0
  31. ORForise/Tools/EasyGene/EasyGene.py +55 -0
  32. ORForise/Tools/EasyGene/__init__.py +0 -0
  33. ORForise/Tools/FGENESB/FGENESB.py +57 -0
  34. ORForise/Tools/FGENESB/__init__.py +0 -0
  35. ORForise/Tools/FragGeneScan/FragGeneScan.py +54 -0
  36. ORForise/Tools/FragGeneScan/__init__.py +0 -0
  37. ORForise/Tools/GFF/GFF.py +77 -0
  38. ORForise/Tools/GFF/__init__.py +0 -0
  39. ORForise/Tools/GLIMMER3/GLIMMER3.py +59 -0
  40. ORForise/Tools/GLIMMER3/__init__.py +0 -0
  41. ORForise/Tools/GeneMark/GeneMark.py +135 -0
  42. ORForise/Tools/GeneMark/__init__.py +0 -0
  43. ORForise/Tools/GeneMarkHA/GeneMarkHA.py +54 -0
  44. ORForise/Tools/GeneMarkHA/__init__.py +0 -0
  45. ORForise/Tools/GeneMarkHMM/GeneMarkHMM.py +55 -0
  46. ORForise/Tools/GeneMarkHMM/__init__.py +0 -0
  47. ORForise/Tools/GeneMarkS/GeneMarkS.py +54 -0
  48. ORForise/Tools/GeneMarkS/__init__.py +0 -0
  49. ORForise/Tools/GeneMarkS2/GeneMarkS2.py +55 -0
  50. ORForise/Tools/GeneMarkS2/__init__.py +0 -0
  51. ORForise/Tools/MetaGene/MetaGene.py +54 -0
  52. ORForise/Tools/MetaGene/__init__.py +0 -0
  53. ORForise/Tools/MetaGeneAnnotator/MetaGeneAnnotator.py +55 -0
  54. ORForise/Tools/MetaGeneAnnotator/__init__.py +0 -0
  55. ORForise/Tools/MetaGeneMark/MetaGeneMark.py +55 -0
  56. ORForise/Tools/MetaGeneMark/__init__.py +0 -0
  57. ORForise/Tools/Prodigal/Prodigal.py +55 -0
  58. ORForise/Tools/Prodigal/__init__.py +0 -0
  59. ORForise/Tools/Prokka/Prokka.py +57 -0
  60. ORForise/Tools/Prokka/__init__.py +0 -0
  61. ORForise/Tools/StORF-Reporter/StORF-Reporter.py +56 -0
  62. ORForise/Tools/StORF-Reporter/__init__.py +0 -0
  63. ORForise/Tools/TransDecoder/TransDecoder.py +54 -0
  64. ORForise/Tools/TransDecoder/__init__.py +0 -0
  65. ORForise/Tools/__init__.py +0 -0
  66. ORForise/__init__.py +0 -0
  67. ORForise/utils.py +236 -0
  68. orforise-1.6.2.dist-info/METADATA +1038 -0
  69. orforise-1.6.2.dist-info/RECORD +73 -0
  70. orforise-1.6.2.dist-info/WHEEL +5 -0
  71. orforise-1.6.2.dist-info/entry_points.txt +15 -0
  72. orforise-1.6.2.dist-info/licenses/LICENSE +624 -0
  73. orforise-1.6.2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,135 @@
1
+ import collections
2
+
3
+ try:
4
+ from utils import revCompIterative
5
+ from utils import sortORFs
6
+ except ImportError:
7
+ from ORForise.utils import revCompIterative
8
+ from ORForise.utils import sortORFs
9
+
10
def GeneMark(*args):
    """Parse a GeneMark ORF listing into ORF dictionaries grouped by DNA region.

    Args:
        *args: ``args[0]`` is the path of the GeneMark prediction file.
            ``args[1]`` maps each DNA-region name to an entry whose first item
            is taken as the sequence (an empty entry is used as it is).

    Returns:
        When ``args[1]`` is empty/falsy: an ``OrderedDict`` keyed by the first
        column of every 'direct'/'complement' line, each mapped to an empty
        list placeholder.
        Otherwise: an ``OrderedDict`` mapping every region name to the
        ``sortORFs`` result for that region's
        ``"start,stop" -> [strand, startCodon, stopCodon, 'CDS', 'GeneMark']``
        entries.
    """
    tool_pred = args[0]
    dna_regions = args[1]
    if not dna_regions:  # Nothing was passed in: discover the region names from the file instead
        dna_regions = collections.OrderedDict()
        with open(tool_pred, 'r') as GeneMark_input:
            for line in GeneMark_input:
                fields = line.split()
                if len(fields) < 3:
                    # Blank/short lines have no strand column to inspect.
                    continue
                # Brackets are required: `and` binds tighter than `or`.
                if ('direct' in fields[2] or 'complement' in fields[2]) and fields[0] not in dna_regions:
                    dna_regions[fields[0]] = []  # Placeholder for genome sequence
        return dna_regions

    geneMark_ORFs = collections.OrderedDict()
    for dna_region in dna_regions:
        geneMark_ORFs[dna_region] = collections.OrderedDict()
    for dna_region in dna_regions:
        try:
            genome = dna_regions[dna_region][0]
        except IndexError:
            genome = dna_regions[dna_region]
        genome_size = len(genome)
        genome_rev = revCompIterative(genome)
        region_ORFs = geneMark_ORFs[dna_region]
        # Coordinates of the previously accepted line; used to keep only the
        # first ORF of each set of alternative starts.
        prev_Start = 0
        prev_Stop = 0
        started = False
        with open(tool_pred, 'r') as GeneMark_input:
            for line in GeneMark_input:
                fields = line.split()
                if len(fields) == 7:
                    started = True
                    # Brackets are required: without them the region test was
                    # only applied to 'complement' lines.
                    # NOTE(review): fields[0] is also parsed as the integer
                    # start below, so this test compares the region name with
                    # the coordinate text - confirm how regions should be
                    # matched for this file format.
                    if ('direct' in fields[2] or 'complement' in fields[2]) and dna_region in fields[0]:
                        start = int(fields[0])
                        stop = int(fields[1])
                        strand = fields[2]
                        if 'complement' in strand:  # Reverse complement: starts and stops adjusted
                            if start != prev_Start:
                                r_start = genome_size - stop
                                r_stop = genome_size - start
                                strand = '-'
                                startCodon = genome_rev[r_start:r_start + 3]
                                stopCodon = genome_rev[r_stop - 2:r_stop + 1]
                                po = str(start) + ',' + str(stop)
                                # Stored under the region's own dictionary so
                                # the per-region structure is what gets sorted.
                                region_ORFs[po] = [strand, startCodon, stopCodon, 'CDS', 'GeneMark']
                        elif 'direct' in strand:
                            if stop != prev_Stop:
                                startCodon = genome[start - 1:start + 2]
                                stopCodon = genome[stop - 3:stop]
                                strand = '+'
                                po = str(start) + ',' + str(stop)
                                region_ORFs[po] = [strand, startCodon, stopCodon, 'CDS', 'GeneMark']
                        prev_Start = start
                        prev_Stop = stop
                elif len(fields) == 0 and started:
                    # A blank line after the first 7-column line resets the
                    # remembered coordinates.
                    prev_Stop = 0
                    prev_Start = 0

    for group in geneMark_ORFs:
        geneMark_ORFs[group] = sortORFs(geneMark_ORFs[group])
    return geneMark_ORFs
71
+
72
+ ############# This section can be used to select the ORF with highest probability score.
73
+ # with open('Tools/GeneMark/' + input_to_analyse, 'r') as GeneMark_input:
74
+ # prob_score = 0
75
+ # started = False
76
+ #
77
+ # for line in GeneMark_input:
78
+ # line = line.split()
79
+ #
80
+ # if len(line) == 7:
81
+ # if 'direct' in line[2] or 'complement' in line[2] and '....' not in line[6] : # Strange Output requires strange code
82
+ # started = True
83
+ # start = int(line[0])
84
+ # stop = int(line[1])
85
+ # score = float(line[5])
86
+ # strand = line[2]
87
+ # if 'complement' in strand: # Reverse complement: starts and stops adjusted to conform to our definition
88
+ # if start != prev_Start:
89
+ # prob_score = score
90
+ # # Switched to match Sense Strand
91
+ # r_start = genome_size - stop
92
+ # r_stop = genome_size - start
93
+ # strand = '-'
94
+ # startCodon = genome_rev[r_start:r_start + 3]
95
+ # stopCodon = genome_rev[r_stop - 2:r_stop + 1]
96
+ # po = str(start) + ',' + str(stop)
97
+ # orf = [strand, startCodon, stopCodon]
98
+ # elif start == prev_Start and score > prob_score:
99
+ # # Switched to match Sense Strand
100
+ # prob_score = score
101
+ # r_start = genome_size - stop
102
+ # r_stop = genome_size - start
103
+ # strand = '-'
104
+ # startCodon = genome_rev[r_start:r_start + 3]
105
+ # stopCodon = genome_rev[r_stop - 2:r_stop + 1]
106
+ # po = str(start) + ',' + str(stop)
107
+ # orf = [strand, startCodon, stopCodon]
108
+ # elif 'direct' in strand:
109
+ # if stop != prev_Stop:
110
+ # prob_score = score
111
+ # startCodon = genome[start - 1:start - 1 + 3]
112
+ # stopCodon = genome[stop - 3:stop - 1 + 1]
113
+ # strand = '+'
114
+ # po = str(start) + ',' + str(stop)
115
+ # orf = [strand, startCodon, stopCodon]
116
+ # elif stop == prev_Stop and score > prob_score:
117
+ # prob_score = score
118
+ # startCodon = genome[start - 1:start - 1 + 3]
119
+ # stopCodon = genome[stop - 3:stop - 1 + 1]
120
+ # strand = '+'
121
+ # po = str(start) + ',' + str(stop)
122
+ # orf = [strand, startCodon, stopCodon]
123
+ # prev_Start = start
124
+ # prev_Stop = stop
125
+ # elif len(line) == 0 and started == True:
126
+ # prob_score = 0
127
+ # prev_Start = 0
128
+ # prev_Stop = 0
129
+ # GeneMark_ORFs.update({po: orf})
130
+ # po = ''
131
+ # orf = []
132
+ # #Remove last empty dict
133
+ # del GeneMark_ORFs['']
134
+ # print(GeneMark_ORFs)
135
+ # return GeneMark_ORFs
File without changes
@@ -0,0 +1,54 @@
1
+ import collections
2
+
3
+ try:
4
+ from utils import revCompIterative
5
+ from utils import sortORFs
6
+ except ImportError:
7
+ from ORForise.utils import revCompIterative
8
+ from ORForise.utils import sortORFs
9
+
10
+
11
def GeneMark_HA(*args):
    """Parse GeneMark-HA predictions into ORF dictionaries grouped by DNA region.

    Args:
        *args: ``args[0]`` is the path of the prediction file. ``args[1]`` maps
            each DNA-region name to an entry whose first item is taken as the
            sequence (an empty entry is used as it is).

    Returns:
        When ``args[1]`` is empty/falsy: an ``OrderedDict`` keyed by the first
        whitespace-separated token of every line that has at least nine tokens
        and "CDS" in its sixth token, each mapped to an empty list placeholder.
        Otherwise: an ``OrderedDict`` mapping every region name to the
        ``sortORFs`` result for that region's
        ``"start,stop" -> [strand, startCodon, stopCodon, 'CDS', 'GeneMarkHA']``
        entries.
    """
    tool_pred = args[0]
    dna_regions = args[1]
    if not dna_regions:  # Nothing was passed in: discover the region names from the file instead
        dna_regions = collections.OrderedDict()
        with open(tool_pred, 'r') as GeneMarkHA_input:
            for line in GeneMarkHA_input:
                fields = line.split()
                if len(fields) >= 9 and "CDS" in fields[5] and fields[0] not in dna_regions:
                    dna_regions[fields[0]] = []  # Placeholder for genome sequence
        return dna_regions

    geneMarkHA_ORFs = collections.OrderedDict()
    for dna_region in dna_regions:
        geneMarkHA_ORFs[dna_region] = collections.OrderedDict()
    for dna_region in dna_regions:
        try:
            genome = dna_regions[dna_region][0]
        except IndexError:
            genome = dna_regions[dna_region]
        genome_size = len(genome)
        genome_rev = revCompIterative(genome)
        region_ORFs = geneMarkHA_ORFs[dna_region]
        with open(tool_pred, 'r') as GeneMarkHA_input:
            for line in GeneMarkHA_input:
                fields = line.split()
                # The strand is read from fields[9], so ten tokens are needed.
                if len(fields) < 10 or "CDS" not in fields[5] or dna_region not in fields[0]:
                    continue
                start = int(fields[6])
                stop = int(fields[7])
                strand = fields[9]
                if '-' in strand:  # Reverse complement: starts and stops adjusted
                    r_start = genome_size - stop
                    r_stop = genome_size - start
                    startCodon = genome_rev[r_start:r_start + 3]
                    stopCodon = genome_rev[r_stop - 2:r_stop + 1]
                elif '+' in strand:
                    startCodon = genome[start - 1:start + 2]
                    stopCodon = genome[stop - 3:stop]
                else:
                    # No codons can be derived for any other strand value.
                    continue
                po = str(start) + ',' + str(stop)
                # Stored under the region's own dictionary so the per-region
                # structure is what gets sorted and returned.
                region_ORFs[po] = [strand, startCodon, stopCodon, 'CDS', 'GeneMarkHA']

    for group in geneMarkHA_ORFs:
        geneMarkHA_ORFs[group] = sortORFs(geneMarkHA_ORFs[group])
    return geneMarkHA_ORFs
File without changes
@@ -0,0 +1,55 @@
1
+ import collections
2
+
3
+ try:
4
+ from utils import revCompIterative
5
+ from utils import sortORFs
6
+ except ImportError:
7
+ from ORForise.utils import revCompIterative
8
+ from ORForise.utils import sortORFs
9
+
10
+
11
+
12
def GeneMark_HMM(*args):
    """Parse GeneMark.hmm predictions into ORF dictionaries grouped by DNA region.

    Args:
        *args: ``args[0]`` is the path of the prediction file. ``args[1]`` maps
            each DNA-region name to an entry whose first item is taken as the
            sequence (an empty entry is used as it is).

    Returns:
        When ``args[1]`` is empty/falsy: an ``OrderedDict`` keyed by the first
        whitespace-separated token of every line that has at least nine tokens
        and "CDS" in its third token, each mapped to an empty list placeholder.
        Otherwise: an ``OrderedDict`` mapping every region name to the
        ``sortORFs`` result for that region's
        ``"start,stop" -> [strand, startCodon, stopCodon, 'CDS', 'GeneMarkHMM']``
        entries.
    """
    tool_pred = args[0]
    dna_regions = args[1]
    if not dna_regions:  # Nothing was passed in: discover the region names from the file instead
        dna_regions = collections.OrderedDict()
        with open(tool_pred, 'r') as GeneMarkHMM_input:
            for line in GeneMarkHMM_input:
                fields = line.split()
                if len(fields) >= 9 and "CDS" in fields[2] and fields[0] not in dna_regions:
                    dna_regions[fields[0]] = []  # Placeholder for genome sequence
        return dna_regions

    geneMarkHMM_ORFs = collections.OrderedDict()
    for dna_region in dna_regions:
        geneMarkHMM_ORFs[dna_region] = collections.OrderedDict()
    for dna_region in dna_regions:
        try:
            genome = dna_regions[dna_region][0]
        except IndexError:
            genome = dna_regions[dna_region]
        genome_size = len(genome)
        genome_rev = revCompIterative(genome)
        region_ORFs = geneMarkHMM_ORFs[dna_region]
        with open(tool_pred, 'r') as GeneMarkHMM_input:
            for line in GeneMarkHMM_input:
                fields = line.split('\t')  # Records are read as tab-separated columns here
                if len(fields) < 9 or "CDS" not in fields[2] or dna_region not in fields[0]:
                    continue
                start = int(fields[3])
                stop = int(fields[4])
                strand = fields[6]
                if '-' in strand:  # Reverse complement: starts and stops adjusted
                    r_start = genome_size - stop
                    r_stop = genome_size - start
                    startCodon = genome_rev[r_start:r_start + 3]
                    stopCodon = genome_rev[r_stop - 2:r_stop + 1]
                elif '+' in strand:
                    startCodon = genome[start - 1:start + 2]
                    stopCodon = genome[stop - 3:stop]
                else:
                    # No codons can be derived for any other strand value.
                    continue
                po = str(start) + ',' + str(stop)
                # Stored under the region's own dictionary so the per-region
                # structure is what gets sorted and returned.
                region_ORFs[po] = [strand, startCodon, stopCodon, 'CDS', 'GeneMarkHMM']

    for group in geneMarkHMM_ORFs:
        geneMarkHMM_ORFs[group] = sortORFs(geneMarkHMM_ORFs[group])
    return geneMarkHMM_ORFs
File without changes
@@ -0,0 +1,54 @@
1
+ import collections
2
+
3
+ try:
4
+ from utils import revCompIterative
5
+ from utils import sortORFs
6
+ except ImportError:
7
+ from ORForise.utils import revCompIterative
8
+ from ORForise.utils import sortORFs
9
+
10
+
11
def GeneMark_S(*args):
    """Parse GeneMarkS predictions into ORF dictionaries grouped by DNA region.

    Args:
        *args: ``args[0]`` is the path of the prediction file. ``args[1]`` maps
            each DNA-region name to an entry whose first item is taken as the
            sequence (an empty entry is used as it is).

    Returns:
        When ``args[1]`` is empty/falsy: an ``OrderedDict`` keyed by the first
        whitespace-separated token of every line that has at least nine tokens
        and "CDS" in its sixth token, each mapped to an empty list placeholder.
        Otherwise: an ``OrderedDict`` mapping every region name to the
        ``sortORFs`` result for that region's
        ``"start,stop" -> [strand, startCodon, stopCodon, 'CDS', 'GeneMarkS']``
        entries.
    """
    tool_pred = args[0]
    dna_regions = args[1]
    if not dna_regions:  # Nothing was passed in: discover the region names from the file instead
        dna_regions = collections.OrderedDict()
        with open(tool_pred, 'r') as GeneMarkS_input:
            for line in GeneMarkS_input:
                fields = line.split()
                if len(fields) >= 9 and "CDS" in fields[5] and fields[0] not in dna_regions:
                    dna_regions[fields[0]] = []  # Placeholder for genome sequence
        return dna_regions

    geneMarkS_ORFs = collections.OrderedDict()
    for dna_region in dna_regions:
        geneMarkS_ORFs[dna_region] = collections.OrderedDict()
    for dna_region in dna_regions:
        try:
            genome = dna_regions[dna_region][0]
        except IndexError:
            genome = dna_regions[dna_region]
        genome_size = len(genome)
        genome_rev = revCompIterative(genome)
        region_ORFs = geneMarkS_ORFs[dna_region]
        with open(tool_pred, 'r') as GeneMarkS_input:
            for line in GeneMarkS_input:
                fields = line.split()
                # The strand is read from fields[9], so ten tokens are needed.
                if len(fields) < 10 or "CDS" not in fields[5] or dna_region not in fields[0]:
                    continue
                start = int(fields[6])
                stop = int(fields[7])
                strand = fields[9]
                if '-' in strand:  # Reverse complement: starts and stops adjusted
                    r_start = genome_size - stop
                    r_stop = genome_size - start
                    startCodon = genome_rev[r_start:r_start + 3]
                    stopCodon = genome_rev[r_stop - 2:r_stop + 1]
                elif '+' in strand:
                    startCodon = genome[start - 1:start + 2]
                    stopCodon = genome[stop - 3:stop]
                else:
                    # No codons can be derived for any other strand value.
                    continue
                po = str(start) + ',' + str(stop)
                # Stored under the region's own dictionary so the per-region
                # structure is what gets sorted and returned.
                region_ORFs[po] = [strand, startCodon, stopCodon, 'CDS', 'GeneMarkS']

    for group in geneMarkS_ORFs:
        geneMarkS_ORFs[group] = sortORFs(geneMarkS_ORFs[group])
    return geneMarkS_ORFs
File without changes
@@ -0,0 +1,55 @@
1
+ import collections
2
+
3
+ try:
4
+ from utils import revCompIterative
5
+ from utils import sortORFs
6
+ except ImportError:
7
+ from ORForise.utils import revCompIterative
8
+ from ORForise.utils import sortORFs
9
+
10
+
11
def GeneMarkS2(*args):
    """Parse GeneMarkS-2 predictions into ORF dictionaries grouped by DNA region.

    Args:
        *args: ``args[0]`` is the path of the prediction file. ``args[1]`` maps
            each DNA-region name to an entry whose first item is taken as the
            sequence (an empty entry is used as it is).

    Returns:
        When ``args[1]`` is empty/falsy: an ``OrderedDict`` keyed by the first
        whitespace-separated token of every line that has at least nine tokens
        and "CDS" in its third token, each mapped to an empty list placeholder.
        Otherwise: an ``OrderedDict`` mapping every region name to the
        ``sortORFs`` result for that region's
        ``"start,stop" -> [strand, startCodon, stopCodon, 'CDS', 'GeneMarkS2']``
        entries.
    """
    tool_pred = args[0]
    dna_regions = args[1]
    if not dna_regions:  # Nothing was passed in: discover the region names from the file instead
        dna_regions = collections.OrderedDict()
        with open(tool_pred, 'r') as GeneMarkS2_input:
            for line in GeneMarkS2_input:
                fields = line.split()
                if len(fields) >= 9 and "CDS" in fields[2] and fields[0] not in dna_regions:
                    dna_regions[fields[0]] = []  # Placeholder for genome sequence
        return dna_regions

    # An OrderedDict, like the sibling tool parsers; a defaultdict created
    # without a default factory offers no extra behaviour over a plain mapping.
    geneMarkS2_ORFs = collections.OrderedDict()
    for dna_region in dna_regions:
        geneMarkS2_ORFs[dna_region] = collections.OrderedDict()
    for dna_region in dna_regions:
        try:
            genome = dna_regions[dna_region][0]
        except IndexError:
            genome = dna_regions[dna_region]
        genome_size = len(genome)
        genome_rev = revCompIterative(genome)
        region_ORFs = geneMarkS2_ORFs[dna_region]
        with open(tool_pred, 'r') as GeneMarkS2_input:
            for line in GeneMarkS2_input:
                fields = line.split('\t')  # Records are read as tab-separated columns here
                if len(fields) < 9 or dna_region not in fields[0] or "CDS" not in fields[2]:
                    continue
                start = int(fields[3])
                stop = int(fields[4])
                strand = fields[6]
                if '-' in strand:  # Reverse complement: starts and stops adjusted
                    r_start = genome_size - stop
                    r_stop = genome_size - start
                    startCodon = genome_rev[r_start:r_start + 3]
                    stopCodon = genome_rev[r_stop - 2:r_stop + 1]
                elif '+' in strand:
                    startCodon = genome[start - 1:start + 2]
                    stopCodon = genome[stop - 3:stop]
                else:
                    # No codons can be derived for any other strand value.
                    continue
                po = str(start) + ',' + str(stop)
                region_ORFs[po] = [strand, startCodon, stopCodon, 'CDS', 'GeneMarkS2']

    for group in geneMarkS2_ORFs:
        geneMarkS2_ORFs[group] = sortORFs(geneMarkS2_ORFs[group])
    return geneMarkS2_ORFs
File without changes
@@ -0,0 +1,54 @@
1
+ import collections
2
+
3
+ try:
4
+ from utils import revCompIterative
5
+ from utils import sortORFs
6
+ except ImportError:
7
+ from ORForise.utils import revCompIterative
8
+ from ORForise.utils import sortORFs
9
+
10
+
11
def MetaGene(*args):
    """Parse MetaGene predictions into ORF dictionaries grouped by DNA region.

    Args:
        *args: ``args[0]`` is the path of the prediction file. ``args[1]`` maps
            each DNA-region name to an entry whose first item is taken as the
            sequence (an empty entry is used as it is).

    Returns:
        When ``args[1]`` is empty/falsy: an ``OrderedDict`` keyed by the first
        token of every line that has at least six tokens, one of which is
        exactly "-" or "+", each mapped to an empty list placeholder.
        Otherwise: an ``OrderedDict`` mapping every region name to the
        ``sortORFs`` result for that region's
        ``"start,stop" -> [strand, startCodon, stopCodon, 'CDS', 'MetaGene']``
        entries.
    """
    tool_pred = args[0]
    dna_regions = args[1]
    if not dna_regions:  # Nothing was passed in: discover the region names from the file instead
        dna_regions = collections.OrderedDict()
        with open(tool_pred, 'r') as MetaGene_input:
            for line in MetaGene_input:
                fields = line.split()
                if len(fields) >= 6 and ("-" in fields or '+' in fields) and fields[0] not in dna_regions:
                    dna_regions[fields[0]] = []  # Placeholder for genome sequence
        return dna_regions

    metaGene_ORFs = collections.OrderedDict()
    for dna_region in dna_regions:
        metaGene_ORFs[dna_region] = collections.OrderedDict()
    for dna_region in dna_regions:
        try:
            genome = dna_regions[dna_region][0]
        except IndexError:
            genome = dna_regions[dna_region]
        genome_size = len(genome)
        genome_rev = revCompIterative(genome)
        region_ORFs = metaGene_ORFs[dna_region]
        with open(tool_pred, 'r') as MetaGene_input:
            for line in MetaGene_input:
                fields = line.split()
                if len(fields) < 6 or not ("-" in fields or '+' in fields):
                    continue
                # NOTE(review): fields[0] is also parsed as the integer start
                # below, so this test compares the region name with the
                # coordinate text - confirm how regions should be matched for
                # this file format.
                if dna_region not in fields[0]:
                    continue
                start = int(fields[0])
                stop = int(fields[1])
                strand = fields[2]
                if '-' in strand:  # Reverse complement: starts and stops adjusted
                    r_start = genome_size - stop
                    r_stop = genome_size - start
                    startCodon = genome_rev[r_start:r_start + 3]
                    stopCodon = genome_rev[r_stop - 2:r_stop + 1]
                elif '+' in strand:
                    startCodon = genome[start - 1:start + 2]
                    stopCodon = genome[stop - 3:stop]
                else:
                    # No codons can be derived for any other strand value.
                    continue
                po = str(start) + ',' + str(stop)
                # Stored under the region's own dictionary so the per-region
                # structure is what gets sorted and returned.
                region_ORFs[po] = [strand, startCodon, stopCodon, 'CDS', 'MetaGene']

    for group in metaGene_ORFs:
        metaGene_ORFs[group] = sortORFs(metaGene_ORFs[group])
    return metaGene_ORFs
File without changes
@@ -0,0 +1,55 @@
1
+ import collections
2
+
3
+ try:
4
+ from utils import revCompIterative
5
+ from utils import sortORFs
6
+ except ImportError:
7
+ from ORForise.utils import revCompIterative
8
+ from ORForise.utils import sortORFs
9
+
10
+
11
def MetaGeneAnnotator(*args):
    """Parse MetaGeneAnnotator predictions into ORF dictionaries grouped by DNA region.

    Args:
        *args: ``args[0]`` is the path of the prediction file. ``args[1]`` maps
            each DNA-region name to an entry whose first item is taken as the
            sequence (an empty entry is used as it is).

    Returns:
        When ``args[1]`` is empty/falsy: an ``OrderedDict`` keyed by the first
        token of every line that has exactly eleven tokens, each mapped to an
        empty list placeholder.
        Otherwise: an ``OrderedDict`` mapping every region name to the
        ``sortORFs`` result for that region's
        ``"start,stop" -> [strand, startCodon, stopCodon, 'CDS', 'MetaGeneAnnotator']``
        entries.
    """
    tool_pred = args[0]
    dna_regions = args[1]
    if not dna_regions:  # Nothing was passed in: discover the region names from the file instead
        dna_regions = collections.OrderedDict()
        with open(tool_pred, 'r') as MetaGeneAnnotator_input:
            for line in MetaGeneAnnotator_input:
                fields = line.split()
                if len(fields) == 11 and fields[0] not in dna_regions:
                    dna_regions[fields[0]] = []  # Placeholder for genome sequence
        return dna_regions

    metaGeneAnnotator_ORFs = collections.OrderedDict()
    for dna_region in dna_regions:
        metaGeneAnnotator_ORFs[dna_region] = collections.OrderedDict()
    for dna_region in dna_regions:
        try:
            genome = dna_regions[dna_region][0]
        except IndexError:
            genome = dna_regions[dna_region]
        genome_size = len(genome)
        genome_rev = revCompIterative(genome)
        region_ORFs = metaGeneAnnotator_ORFs[dna_region]
        with open(tool_pred, 'r') as MetaGeneAnnotator_input:
            for line in MetaGeneAnnotator_input:
                fields = line.split()
                # Only eleven-token lines whose first token contains both the
                # region name and "gene_" are treated as predictions.
                if len(fields) != 11 or dna_region not in fields[0] or "gene_" not in fields[0]:
                    continue
                start = int(fields[1])
                stop = int(fields[2])
                strand = fields[3]
                if '-' in strand:  # Reverse complement: starts and stops adjusted
                    r_start = genome_size - stop
                    r_stop = genome_size - start
                    startCodon = genome_rev[r_start:r_start + 3]
                    stopCodon = genome_rev[r_stop - 2:r_stop + 1]
                elif '+' in strand:
                    startCodon = genome[start - 1:start + 2]
                    stopCodon = genome[stop - 3:stop]
                else:
                    # No codons can be derived for any other strand value.
                    continue
                po = str(start) + ',' + str(stop)
                # Stored under the region's own dictionary so the per-region
                # structure is what gets sorted and returned.
                region_ORFs[po] = [strand, startCodon, stopCodon, 'CDS', 'MetaGeneAnnotator']

    for group in metaGeneAnnotator_ORFs:
        metaGeneAnnotator_ORFs[group] = sortORFs(metaGeneAnnotator_ORFs[group])
    return metaGeneAnnotator_ORFs
File without changes
@@ -0,0 +1,55 @@
1
+ import collections
2
+
3
+ try:
4
+ from utils import revCompIterative
5
+ from utils import sortORFs
6
+ except ImportError:
7
+ from ORForise.utils import revCompIterative
8
+ from ORForise.utils import sortORFs
9
+
10
+
11
def MetaGeneMark(*args):
    """Parse MetaGeneMark predictions into ORF dictionaries grouped by DNA region.

    Args:
        *args: ``args[0]`` is the path of the prediction file. ``args[1]`` maps
            each DNA-region name to an entry whose first item is taken as the
            sequence (an empty entry is used as it is).

    Returns:
        When ``args[1]`` is empty/falsy: an ``OrderedDict`` keyed by the first
        whitespace-separated token of every line with 'GeneMark.hmm' in its
        fifth token and "CDS" in its sixth, each mapped to an empty list
        placeholder.
        Otherwise: an ``OrderedDict`` mapping every region name to the
        ``sortORFs`` result for that region's
        ``"start,stop" -> [strand, startCodon, stopCodon, 'CDS', 'MetaGeneMark']``
        entries.
    """
    tool_pred = args[0]
    dna_regions = args[1]
    if not dna_regions:  # Nothing was passed in: discover the region names from the file instead
        dna_regions = collections.OrderedDict()
        with open(tool_pred, 'r') as MetaGeneMark_input:
            for line in MetaGeneMark_input:
                fields = line.split()
                if len(fields) < 6:
                    # Blank/short lines have no source and feature tokens.
                    continue
                if 'GeneMark.hmm' in fields[4] and "CDS" in fields[5] and fields[0] not in dna_regions:
                    dna_regions[fields[0]] = []  # Placeholder for genome sequence
        return dna_regions

    metaGeneMarkORFs = collections.OrderedDict()
    for dna_region in dna_regions:
        metaGeneMarkORFs[dna_region] = collections.OrderedDict()
    for dna_region in dna_regions:
        try:
            genome = dna_regions[dna_region][0]
        except IndexError:
            genome = dna_regions[dna_region]
        genome_size = len(genome)
        genome_rev = revCompIterative(genome)
        region_ORFs = metaGeneMarkORFs[dna_region]
        with open(tool_pred, 'r') as metaGeneMark_input:
            for line in metaGeneMark_input:
                fields = line.split()
                if len(fields) != 19:  # Only nineteen-token lines are treated as records
                    continue
                if not ('GeneMark.hmm' in fields[4] and "CDS" in fields[5] and dna_region in fields[0]):
                    continue
                start = int(fields[6])
                stop = int(fields[7])
                strand = fields[9]
                if '-' in strand:  # Reverse complement: starts and stops adjusted
                    r_start = genome_size - stop
                    r_stop = genome_size - start
                    startCodon = genome_rev[r_start:r_start + 3]
                    stopCodon = genome_rev[r_stop - 2:r_stop + 1]
                elif '+' in strand:
                    startCodon = genome[start - 1:start + 2]
                    stopCodon = genome[stop - 3:stop]
                else:
                    # No codons can be derived for any other strand value.
                    continue
                po = str(start) + ',' + str(stop)
                # Stored under the region's own dictionary so the per-region
                # structure is what gets sorted and returned.
                region_ORFs[po] = [strand, startCodon, stopCodon, 'CDS', 'MetaGeneMark']

    for group in metaGeneMarkORFs:
        metaGeneMarkORFs[group] = sortORFs(metaGeneMarkORFs[group])
    return metaGeneMarkORFs
File without changes
@@ -0,0 +1,55 @@
1
+ import collections
2
+
3
+ try:
4
+ from utils import revCompIterative
5
+ from utils import sortORFs
6
+ except ImportError:
7
+ from ORForise.utils import revCompIterative
8
+ from ORForise.utils import sortORFs
9
+
10
+
11
def Prodigal(*args):
    """Parse Prodigal predictions into ORF dictionaries grouped by DNA region.

    Args:
        *args: ``args[0]`` is the path of the prediction file. ``args[1]`` maps
            each DNA-region name to an entry whose first item is taken as the
            sequence (an empty entry is used as it is).

    Returns:
        When ``args[1]`` is empty/falsy: an ``OrderedDict`` keyed by the first
        whitespace-separated token of every line with "Prodigal" in its second
        token and "CDS" in its third, each mapped to an empty list placeholder.
        Otherwise: an ``OrderedDict`` mapping every region name to the
        ``sortORFs`` result for that region's
        ``"start,stop" -> [strand, startCodon, stopCodon, 'CDS', 'Prodigal']``
        entries.
    """
    tool_pred = args[0]
    dna_regions = args[1]
    if not dna_regions:  # Nothing was passed in: discover the region names from the file instead
        dna_regions = collections.OrderedDict()
        with open(tool_pred, 'r') as Prodigal_input:
            for line in Prodigal_input:
                fields = line.split()
                if len(fields) < 3:
                    # Blank/short lines have no source and feature tokens.
                    continue
                if "Prodigal" in fields[1] and "CDS" in fields[2] and fields[0] not in dna_regions:
                    dna_regions[fields[0]] = []  # Placeholder for genome sequence
        return dna_regions

    prodigal_ORFs = collections.OrderedDict()
    for dna_region in dna_regions:
        prodigal_ORFs[dna_region] = collections.OrderedDict()
    for dna_region in dna_regions:
        try:
            genome = dna_regions[dna_region][0]
        except IndexError:
            genome = dna_regions[dna_region]
        genome_size = len(genome)
        genome_rev = revCompIterative(genome)
        region_ORFs = prodigal_ORFs[dna_region]
        with open(tool_pred, 'r') as prodigal_input:
            for line in prodigal_input:
                fields = line.split()
                # Only lines with at least nine tokens are treated as records;
                # shorter lines (blank lines, short comments) are skipped.
                if len(fields) < 9:
                    continue
                if not ("Prodigal" in fields[1] and dna_region in fields[0] and "CDS" in fields[2]):
                    continue
                start = int(fields[3])
                stop = int(fields[4])
                strand = fields[6]
                if '-' in strand:  # Reverse complement: starts and stops adjusted
                    r_start = genome_size - stop
                    r_stop = genome_size - start
                    startCodon = genome_rev[r_start:r_start + 3]
                    stopCodon = genome_rev[r_stop - 2:r_stop + 1]
                elif '+' in strand:
                    startCodon = genome[start - 1:start + 2]
                    stopCodon = genome[stop - 3:stop]
                else:
                    # No codons can be derived for any other strand value.
                    continue
                po = str(start) + ',' + str(stop)
                region_ORFs[po] = [strand, startCodon, stopCodon, 'CDS', 'Prodigal']

    for group in prodigal_ORFs:
        prodigal_ORFs[group] = sortORFs(prodigal_ORFs[group])
    return prodigal_ORFs
File without changes
@@ -0,0 +1,57 @@
1
+ import collections
2
+
3
+ try:
4
+ from utils import revCompIterative
5
+ from utils import sortORFs
6
+ except ImportError:
7
+ from ORForise.utils import revCompIterative
8
+ from ORForise.utils import sortORFs
9
+
10
+
11
def Prokka(*args):  # UNFINISHED
    """Parse a Prokka annotation file into ORF dictionaries grouped by DNA region.

    Args:
        *args: ``args[0]`` is the path of the annotation file. ``args[1]`` maps
            each DNA-region name to an entry whose first item is taken as the
            sequence (an empty entry is used as it is). A third positional
            argument is accepted but is not used.

    Returns:
        When ``args[1]`` is empty/falsy: an ``OrderedDict`` keyed by the first
        whitespace-separated token of every line with "Prodigal" in its second
        token and "CDS" in its third, each mapped to an empty list placeholder.
        Otherwise: a ``defaultdict(list)`` mapping every region name to the
        ``sortORFs`` result for that region's
        ``"start,stop" -> [strand, startCodon, stopCodon, feature_type, 'Prokka']``
        entries, where ``feature_type`` is the record's third column.
    """
    tool_pred = args[0]
    dna_regions = args[1]
    if not dna_regions:  # Nothing was passed in: discover the region names from the file instead
        dna_regions = collections.OrderedDict()
        with open(tool_pred, 'r') as PROKKA_input:
            for line in PROKKA_input:
                fields = line.split()
                if len(fields) < 3:
                    # Blank lines and one- or two-token lines (for example a
                    # bare directive or a sequence line) carry no feature.
                    continue
                if "Prodigal" in fields[1] and "CDS" in fields[2] and fields[0] not in dna_regions:
                    dna_regions[fields[0]] = []  # Placeholder for genome sequence
        return dna_regions

    prokkaORFs = collections.defaultdict(list)
    for dna_region in dna_regions:
        prokkaORFs[dna_region] = collections.OrderedDict()
    for dna_region in dna_regions:
        try:
            genome = dna_regions[dna_region][0]
        except IndexError:
            genome = dna_regions[dna_region]
        genome_size = len(genome)
        genome_rev = revCompIterative(genome)
        region_ORFs = prokkaORFs[dna_region]
        with open(tool_pred, 'r') as prodigal_input:
            for line in prodigal_input:
                if '#' in line:  # Any line containing '#' is skipped
                    continue
                fields = line.split('\t')
                # The attribute column (index 8) is inspected below, so nine
                # tab-separated columns are required.
                if len(fields) < 9:
                    continue
                if not ("prokka" not in fields[1] and fields[8].startswith('ID=')
                        and dna_region in fields[0] and "CDS" in fields[2]):
                    continue
                start = int(fields[3])
                stop = int(fields[4])
                strand = fields[6]
                if '-' in strand:  # Reverse complement: starts and stops adjusted
                    r_start = genome_size - stop
                    r_stop = genome_size - start
                    startCodon = genome_rev[r_start:r_start + 3]
                    stopCodon = genome_rev[r_stop - 2:r_stop + 1]
                elif '+' in strand:
                    startCodon = genome[start - 1:start + 2]
                    stopCodon = genome[stop - 3:stop]
                else:
                    # No codons can be derived for any other strand value.
                    continue
                po = str(start) + ',' + str(stop)
                # Stored under the region's own dictionary so the per-region
                # structure is what gets sorted and returned.
                region_ORFs[po] = [strand, startCodon, stopCodon, fields[2], 'Prokka']

    for group in prokkaORFs:
        prokkaORFs[group] = sortORFs(prokkaORFs[group])
    return prokkaORFs
File without changes