ORForise 1.4.3__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. ORForise/Aggregate_Compare.py +318 -133
  2. ORForise/Annotation_Compare.py +243 -125
  3. ORForise/Comparator.py +600 -552
  4. ORForise/ORForise_Analysis/genome_Metrics.py +51 -33
  5. ORForise/Tools/Augustus/Augustus.py +30 -23
  6. ORForise/Tools/Balrog/Balrog.py +31 -23
  7. ORForise/Tools/EasyGene/EasyGene.py +30 -22
  8. ORForise/Tools/FGENESB/FGENESB.py +32 -25
  9. ORForise/Tools/FragGeneScan/FragGeneScan.py +29 -22
  10. ORForise/Tools/GFF/GFF.py +51 -47
  11. ORForise/Tools/GLIMMER_3/GLIMMER_3.py +34 -27
  12. ORForise/Tools/GeneMark/GeneMark.py +46 -40
  13. ORForise/Tools/GeneMark_HA/GeneMark_HA.py +29 -22
  14. ORForise/Tools/GeneMark_HMM/GeneMark_HMM.py +29 -22
  15. ORForise/Tools/GeneMark_S/GeneMark_S.py +29 -22
  16. ORForise/Tools/GeneMark_S_2/GeneMark_S_2.py +29 -25
  17. ORForise/Tools/MetaGene/MetaGene.py +29 -22
  18. ORForise/Tools/MetaGeneAnnotator/MetaGeneAnnotator.py +30 -23
  19. ORForise/Tools/MetaGeneMark/MetaGeneMark.py +30 -23
  20. ORForise/Tools/Prodigal/Prodigal.py +30 -26
  21. ORForise/Tools/Prokka/Prokka.py +30 -25
  22. ORForise/Tools/StORF_Reporter/StORF_Reporter.py +33 -26
  23. ORForise/Tools/TransDecoder/TransDecoder.py +29 -22
  24. ORForise/utils.py +204 -2
  25. {orforise-1.4.3.dist-info → orforise-1.5.0.dist-info}/METADATA +5 -5
  26. {orforise-1.4.3.dist-info → orforise-1.5.0.dist-info}/RECORD +30 -30
  27. {orforise-1.4.3.dist-info → orforise-1.5.0.dist-info}/entry_points.txt +5 -0
  28. {orforise-1.4.3.dist-info → orforise-1.5.0.dist-info}/WHEEL +0 -0
  29. {orforise-1.4.3.dist-info → orforise-1.5.0.dist-info}/licenses/LICENSE +0 -0
  30. {orforise-1.4.3.dist-info → orforise-1.5.0.dist-info}/top_level.txt +0 -0
@@ -1,12 +1,13 @@
1
1
  import argparse
2
- import csv
3
2
  import numpy as np
3
+ import os
4
+
5
+ try:
6
+ from ORForise.src.ORForise.utils import * # local file
7
+ except ImportError:
8
+ from ORForise.utils import *
4
9
 
5
- from ORForise.src.ORForise.utils import * # local file
6
10
 
7
- parser = argparse.ArgumentParser()
8
- parser.add_argument('-g', '--genome_to_compare', default='', help='Which genome to analyse?')
9
- args = parser.parse_args()
10
11
 
11
12
 
12
13
  def start_Codon_Count(start_Codons):
@@ -39,7 +40,6 @@ def stop_Codon_Count(stop_Codons):
39
40
  tag, taa, tga, other = 0, 0, 0, 0
40
41
  other_Stops = []
41
42
  for stop in stop_Codons:
42
- stop
43
43
  if stop == 'TAG':
44
44
  tag += 1
45
45
  elif stop == 'TAA':
@@ -83,14 +83,19 @@ def revCompIterative(watson):
83
83
  watson = watson.upper()
84
84
  watsonrev = watson[::-1]
85
85
  crick = ""
86
+
86
87
  for nt in watsonrev:
87
88
  crick += complements[nt]
88
89
  return crick
89
90
 
90
91
 
91
- def genome_Metrics(genome_to_compare):
92
+ def genome_Metrics(fasta_in, gff_in, output_file):
93
+
94
+ base_name = os.path.basename(fasta_in) # Gets file name with extension
95
+ genome_name = os.path.splitext(base_name)[0] # Removes extension
96
+
92
97
  genome_Seq = ""
93
- with open('../Genomes/' + genome_to_compare + '.fa', 'r') as genome:
98
+ with open(fasta_in , 'r') as genome:
94
99
  for line in genome:
95
100
  line = line.replace("\n", "")
96
101
  if not line.startswith('>'):
@@ -100,16 +105,16 @@ def genome_Metrics(genome_to_compare):
100
105
 
101
106
  genome_Rev = revCompIterative(genome_Seq)
102
107
  genome_Size = len(genome_Seq)
103
- coding_Regions = np.zeros((genome_Size), dtype=np.int)
104
- non_Coding_Regions = np.zeros((genome_Size), dtype=np.int)
105
- all_gene_Regions = np.zeros((genome_Size), dtype=np.int)
108
+ coding_Regions = np.zeros((genome_Size), dtype=int)
109
+ non_Coding_Regions = np.zeros((genome_Size), dtype=int)
110
+ all_gene_Regions = np.zeros((genome_Size), dtype=int)
106
111
  protein_coding_genes = collections.OrderedDict()
107
112
  non_protein_coding_genes = collections.OrderedDict()
108
113
  strands = collections.defaultdict(int)
109
114
  lengths_PCG, gene_Pos_Olap, gene_Neg_Olap, short_PCGs, pcg_GC = [], [], [], [], []
110
115
  prev_Gene_Stop, count, nc_Count, pos_Strand, neg_Strand = 0, 0, 0, 0, 0
111
116
  prev_Gene_Overlapped = False
112
- with open('../Genomes/' + genome_to_compare + '.gff', 'r') as genome_gff:
117
+ with open(gff_in, 'r') as genome_gff:
113
118
  for line in genome_gff:
114
119
  line = line.split('\t')
115
120
  try:
@@ -200,7 +205,7 @@ def genome_Metrics(genome_to_compare):
200
205
  atg_P, gtg_P, ttg_P, att_P, ctg_P, other_Start_P, other_Starts = start_Codon_Count(start_Codons)
201
206
  tag_P, taa_P, tga_P, other_Stop_P, other_Stops = stop_Codon_Count(stop_Codons)
202
207
 
203
- output = ("Number of Protein Coding Genes in " + genome_to_compare + " : " + str(
208
+ output = ("Number of Protein Coding Genes in " + genome_name + " : " + str(
204
209
  len(lengths_PCG)) + " ,Median Length of PCGs: " + str(median_PCG) + ", Min Length of PCGs: " + str(
205
210
  min(lengths_PCG)) + ", Max Length of PCGs: " + str(max(lengths_PCG)) +
206
211
  ", Number of PCGs on Pos Strand: " + str(strands['+']) + ", Number of PCGs on Neg Strand: " + str(
@@ -210,31 +215,44 @@ def genome_Metrics(genome_to_compare):
210
215
  ", Longest PCG Overlap: " + str(longest_Olap) + ", Median PCG Overlap: " + str(
211
216
  median_PCG_Olap) + ", Number of PCGs less than 100 amino acids: " + str(len(short_PCGs)) +
212
217
 
213
- '\nPercentage of Genome which is Protein Coding: ' + format(coding_Percentage,
214
- '.2f') + ', Number of Non-PCGs: ' + str(
215
- len(non_protein_coding_genes)) + ', Percentage of Genome Non-PCG: ' + format(non_coding_Percentage,
218
+ "\nPercentage of Genome which is Protein Coding: " + format(coding_Percentage,
219
+ '.2f') + ", Number of Non-PCGs: " + str(
220
+ len(non_protein_coding_genes)) + ", Percentage of Genome Non-PCG: " + format(non_coding_Percentage,
216
221
  '.2f') +
217
- ', Percentage of All Genes in Genome: ' + format(all_gene_Percentage, '.2f') +
222
+ ", Percentage of All Genes in Genome: " + format(all_gene_Percentage, '.2f') +
223
+
224
+ "\nPercentage of Genes starting with ATG: " + atg_P +
225
+ "\nPercentage of Genes starting with GTG: " + gtg_P +
226
+ "\nPercentage of Genes starting with TTG: " + ttg_P +
227
+ "\nPercentage of Genes starting with ATT: " + att_P +
228
+ "\nPercentage of Genes starting with CTG: " + ctg_P +
229
+ "\nPercentage of Genes starting with Alternative Start Codon: " + other_Start_P +
230
+
231
+ "\nPercentage of Genes ending with TAG: " + tag_P +
232
+ "\nPercentage of Genes ending with TAA: " + taa_P +
233
+ "\nPercentage of Genes ending with TGA: " + tga_P +
234
+ "\nPercentage of Genes ending with Alternative Stop Codon: " + other_Stop_P)
235
+
236
+ with open(output_file, 'w') as out_file:
237
+ out_file.write('Genome Metrics:\n')
238
+ out_file.write(output + '\n')
239
+
240
+ #print(output)
241
+
242
+
243
+
218
244
 
219
- '\nPercentage of Genes starting with ATG: ' + atg_P +
220
- '\nPercentage of Genes starting with GTG: ' + gtg_P +
221
- '\nPercentage of Genes starting with TTG: ' + ttg_P +
222
- '\nPercentage of Genes starting with ATT: ' + att_P +
223
- '\nPercentage of Genes starting with CTG: ' + ctg_P +
224
- '\nPercentage of Genes starting with Alternative Start Codon: ' + other_Start_P +
245
+ def main():
246
+ parser = argparse.ArgumentParser(description="...")
247
+ parser.add_argument("-f", dest='fasta_in', required=True, help="Input FASTA file")
248
+ parser.add_argument("-g", dest='gff_in', required=True, help="Corresponding GFF file to FASTA")
249
+ parser.add_argument("-o", dest='output_file', required=True, help="Output metrics file")
225
250
 
226
- '\nPercentage of Genes ending with TAG: ' + tag_P +
227
- '\nPercentage of Genes ending with TAA: ' + taa_P +
228
- '\nPercentage of Genes ending with TGA: ' + tga_P +
229
- '\nPercentage of Genes ending with Alternative Stop Codon: ' + other_Stop_P)
251
+ options = parser.parse_args()
230
252
 
231
- with open('../Genomes/' + genome_to_compare + '_metrics.csv', 'w') as out_file:
232
- out = csv.writer(out_file, delimiter=',')
233
- out.writerow(['Genome Metrics:'])
234
- out.writerow([output])
253
+ genome_Metrics(options.fasta_in, options.gff_in, options.output_file)
235
254
 
236
- print(output)
237
255
 
238
256
 
239
257
  if __name__ == "__main__":
240
- genome_Metrics(**vars(args))
258
+ main()
@@ -8,28 +8,35 @@ except ImportError:
8
8
  from ORForise.utils import sortORFs
9
9
 
10
10
 
11
- def Augustus(tool_pred, genome):
11
+ def Augustus(*args):
12
+ tool_pred = args[0]
13
+ dna_regions = args[1]
12
14
  augustus_ORFs = collections.OrderedDict()
13
- genome_size = len(genome)
14
- genome_rev = revCompIterative(genome)
15
- with open(tool_pred, 'r') as Augustus_input:
16
- for line in Augustus_input:
17
- line = line.split()
18
- if len(line) == 12 and "CDS" in line[2]:
19
- start = int(line[3])
20
- stop = int(line[4])
21
- strand = line[6]
22
- if '-' in strand: # Reverse Compliment starts and stops adjusted
23
- r_start = genome_size - stop
24
- r_stop = genome_size - start
25
- startCodon = genome_rev[r_start:r_start + 3]
26
- stopCodon = genome_rev[r_stop - 2:r_stop + 1]
27
- elif '+' in strand:
28
- startCodon = genome[start - 1:start + 2]
29
- stopCodon = genome[stop - 3:stop]
30
- po = str(start) + ',' + str(stop)
31
- orf = [strand, startCodon, stopCodon, 'CDS']
32
- augustus_ORFs.update({po: orf})
15
+ for dna_region in dna_regions:
16
+ augustus_ORFs[dna_region] = collections.OrderedDict()
17
+ for dna_region in dna_regions:
18
+ genome = dna_regions[dna_region][0]
19
+ genome_size = len(genome)
20
+ genome_rev = revCompIterative(genome)
21
+ with open(tool_pred, 'r') as Augustus_input:
22
+ for line in Augustus_input:
23
+ line = line.split()
24
+ if len(line) == 12 and dna_region in line[0] and "CDS" in line[2]:
25
+ start = int(line[3])
26
+ stop = int(line[4])
27
+ strand = line[6]
28
+ if '-' in strand: # Reverse Compliment starts and stops adjusted
29
+ r_start = genome_size - stop
30
+ r_stop = genome_size - start
31
+ startCodon = genome_rev[r_start:r_start + 3]
32
+ stopCodon = genome_rev[r_stop - 2:r_stop + 1]
33
+ elif '+' in strand:
34
+ startCodon = genome[start - 1:start + 2]
35
+ stopCodon = genome[stop - 3:stop]
36
+ po = str(start) + ',' + str(stop)
37
+ orf = [strand, startCodon, stopCodon, 'CDS', 'Augustus']
38
+ augustus_ORFs.update({po: orf})
33
39
 
34
- augustus_ORFs = sortORFs(augustus_ORFs)
35
- return augustus_ORFs
40
+ for group in augustus_ORFs:
41
+ augustus_ORFs[group] = sortORFs(augustus_ORFs[group])
42
+ return augustus_ORFs
@@ -8,29 +8,37 @@ except ImportError:
8
8
  from ORForise.utils import sortORFs
9
9
 
10
10
 
11
- def Balrog(tool_pred, genome):
11
+ def Balrog(*args):
12
+ tool_pred = args[0]
13
+ dna_regions = args[1]
12
14
  Balrog_ORFs = collections.OrderedDict()
13
- genome_size = len(genome)
14
- genome_rev = revCompIterative(genome)
15
- with open(tool_pred, 'r') as Balrog_input:
16
- for line in Balrog_input:
17
- if '#' not in line:
18
- line = line.split('\t')
19
- if "CDS" in line[2]:
20
- start = int(line[3])
21
- stop = int(line[4])
22
- strand = line[6]
23
- if '-' in strand: # Reverse Compliment starts and stops adjusted
24
- r_start = genome_size - stop
25
- r_stop = genome_size - start
26
- startCodon = genome_rev[r_start:r_start + 3]
27
- stopCodon = genome_rev[r_stop - 2:r_stop + 1]
28
- elif '+' in strand:
29
- startCodon = genome[start - 1:start + 2]
30
- stopCodon = genome[stop - 3:stop]
31
- po = str(start) + ',' + str(stop)
32
- orf = [strand, startCodon, stopCodon, 'CDS']
33
- Balrog_ORFs.update({po: orf})
15
+ for dna_region in dna_regions:
16
+ Balrog_ORFs[dna_region] = collections.OrderedDict()
17
+ for dna_region in dna_regions:
18
+ genome = dna_regions[dna_region][0]
19
+ genome_size = len(genome)
20
+ genome_rev = revCompIterative(genome)
34
21
 
35
- Balrog_ORFs = sortORFs(Balrog_ORFs)
22
+ with open(tool_pred, 'r') as Balrog_input:
23
+ for line in Balrog_input:
24
+ if '#' not in line:
25
+ line = line.split('\t')
26
+ if "CDS" in line[2] and dna_region in line[0]:
27
+ start = int(line[3])
28
+ stop = int(line[4])
29
+ strand = line[6]
30
+ if '-' in strand: # Reverse Compliment starts and stops adjusted
31
+ r_start = genome_size - stop
32
+ r_stop = genome_size - start
33
+ startCodon = genome_rev[r_start:r_start + 3]
34
+ stopCodon = genome_rev[r_stop - 2:r_stop + 1]
35
+ elif '+' in strand:
36
+ startCodon = genome[start - 1:start + 2]
37
+ stopCodon = genome[stop - 3:stop]
38
+ po = str(start) + ',' + str(stop)
39
+ orf = [strand, startCodon, stopCodon, 'CDS', 'Balrog']
40
+ Balrog_ORFs.update({po: orf})
41
+
42
+ for group in Balrog_ORFs:
43
+ Balrog_ORFs[group] = sortORFs(Balrog_ORFs[group])
36
44
  return Balrog_ORFs
@@ -8,28 +8,36 @@ except ImportError:
8
8
  from ORForise.utils import sortORFs
9
9
 
10
10
 
11
- def EasyGene(tool_pred, genome):
11
+ def EasyGene(*args):
12
+ tool_pred = args[0]
13
+ dna_regions = args[1]
12
14
  easyGene_ORFs = collections.OrderedDict()
13
- genome_size = len(genome)
14
- genome_rev = revCompIterative(genome)
15
- with open(tool_pred, 'r') as EasyGene_input:
16
- for line in EasyGene_input:
17
- line = line.split()
18
- if len(line) == 10 and "CDS" in line[2]:
19
- start = int(line[3])
20
- stop = int(line[4])
21
- strand = line[6]
22
- if '-' in strand: # Reverse Compliment starts and stops adjusted
23
- r_start = genome_size - stop
24
- r_stop = genome_size - start
25
- startCodon = genome_rev[r_start:r_start + 3]
26
- stopCodon = genome_rev[r_stop - 2:r_stop + 1]
27
- elif '+' in strand:
28
- startCodon = genome[start - 1:start + 2]
29
- stopCodon = genome[stop - 3:stop]
30
- po = str(start) + ',' + str(stop)
31
- orf = [strand, startCodon, stopCodon, 'CDS']
32
- easyGene_ORFs.update({po: orf})
15
+ for dna_region in dna_regions:
16
+ easyGene_ORFs[dna_region] = collections.OrderedDict()
17
+ for dna_region in dna_regions:
18
+ genome = dna_regions[dna_region][0]
19
+ genome_size = len(genome)
20
+ genome_rev = revCompIterative(genome)
21
+ with open(tool_pred, 'r') as EasyGene_input:
22
+ for line in EasyGene_input:
23
+ line = line.split()
24
+ if len(line) == 10 and dna_region in line[0] and "CDS" in line[2]:
25
+ start = int(line[3])
26
+ stop = int(line[4])
27
+ strand = line[6]
28
+ info = line[8]
29
+ if '-' in strand: # Reverse Compliment starts and stops adjusted
30
+ r_start = genome_size - stop
31
+ r_stop = genome_size - start
32
+ startCodon = genome_rev[r_start:r_start + 3]
33
+ stopCodon = genome_rev[r_stop - 2:r_stop + 1]
34
+ elif '+' in strand:
35
+ startCodon = genome[start - 1:start + 2]
36
+ stopCodon = genome[stop - 3:stop]
37
+ po = str(start) + ',' + str(stop)
38
+ orf = [strand, startCodon, stopCodon, 'CDS', 'EasyGene']
39
+ easyGene_ORFs[dna_region].update({po: orf})
33
40
 
34
- easyGene_ORFs = sortORFs(easyGene_ORFs)
41
+ for group in easyGene_ORFs:
42
+ easyGene_ORFs[group] = sortORFs(easyGene_ORFs[group])
35
43
  return easyGene_ORFs
@@ -8,31 +8,38 @@ except ImportError:
8
8
  from ORForise.utils import sortORFs
9
9
 
10
10
 
11
- def FGENESB(tool_pred, genome):
11
+ def FGENESB(*args):
12
+ tool_pred = args[0]
13
+ dna_regions = args[1]
12
14
  FGENESB_ORFs = collections.OrderedDict()
13
- genome_size = len(genome)
14
- genome_rev = revCompIterative(genome)
15
- with open(tool_pred, 'r') as FGENESB_input:
16
- for line in FGENESB_input:
17
- if '>GENE' in line:
18
- line = line.split()
19
- if '2208' in line:
20
- print("ss")
21
- if len(line) == 10 and ">GENE" in line[0]:
22
- start = int(line[2])
23
- stop = int(line[4])
24
- strand = line[9]
25
- if '-' in strand: # Reverse Compliment starts and stops adjusted
26
- r_start = genome_size - stop
27
- r_stop = genome_size - start
28
- startCodon = genome_rev[r_start:r_start + 3]
29
- stopCodon = genome_rev[r_stop - 2:r_stop + 1]
30
- elif '+' in strand:
31
- startCodon = genome[start - 1:start + 2]
32
- stopCodon = genome[stop - 3:stop]
33
- po = str(start) + ',' + str(stop)
34
- orf = [strand, startCodon, stopCodon, 'CDS']
35
- FGENESB_ORFs.update({po: orf})
15
+ for dna_region in dna_regions:
16
+ FGENESB_ORFs[dna_region] = collections.OrderedDict()
17
+ for dna_region in dna_regions:
18
+ genome = dna_regions[dna_region][0]
19
+ genome_size = len(genome)
20
+ genome_rev = revCompIterative(genome)
21
+ with open(tool_pred, 'r') as FGENESB_input:
22
+ for line in FGENESB_input:
23
+ if '>GENE' in line:
24
+ line = line.split()
25
+ if '2208' in line:
26
+ print("ss")
27
+ if len(line) == 10 and dna_region in line[0] and ">GENE" in line[0]:
28
+ start = int(line[2])
29
+ stop = int(line[4])
30
+ strand = line[9]
31
+ if '-' in strand: # Reverse Compliment starts and stops adjusted
32
+ r_start = genome_size - stop
33
+ r_stop = genome_size - start
34
+ startCodon = genome_rev[r_start:r_start + 3]
35
+ stopCodon = genome_rev[r_stop - 2:r_stop + 1]
36
+ elif '+' in strand:
37
+ startCodon = genome[start - 1:start + 2]
38
+ stopCodon = genome[stop - 3:stop]
39
+ po = str(start) + ',' + str(stop)
40
+ orf = [strand, startCodon, stopCodon, 'CDS', 'FGENESB']
41
+ FGENESB_ORFs.update({po: orf})
36
42
 
37
- FGENESB_ORFs = sortORFs(FGENESB_ORFs)
43
+ for group in FGENESB_ORFs:
44
+ FGENESB_ORFs[group] = sortORFs(FGENESB_ORFs[group])
38
45
  return FGENESB_ORFs
@@ -8,28 +8,35 @@ except ImportError:
8
8
  from ORForise.utils import sortORFs
9
9
 
10
10
 
11
- def FragGeneScan(tool_pred, genome):
11
+ def FragGeneScan(*args):
12
+ tool_pred = args[0]
13
+ dna_regions = args[1]
12
14
  fragGeneScan_ORFs = collections.OrderedDict()
13
- genome_size = len(genome)
14
- genome_rev = revCompIterative(genome)
15
- with open(tool_pred, 'r') as fragGeneScan_input:
16
- for line in fragGeneScan_input:
17
- line = line.split()
18
- if len(line) == 10 and "FGS" in line[1] and "CDS" in line[2]:
19
- start = int(line[3])
20
- stop = int(line[4])
21
- strand = line[6]
22
- if '-' in strand: # Reverse Compliment starts and stops adjusted
23
- r_start = genome_size - stop
24
- r_stop = genome_size - start
25
- startCodon = genome_rev[r_start:r_start + 3]
26
- stopCodon = genome_rev[r_stop - 2:r_stop + 1]
27
- elif '+' in strand:
28
- startCodon = genome[start - 1:start + 2]
29
- stopCodon = genome[stop - 3:stop]
30
- po = str(start) + ',' + str(stop)
31
- orf = [strand, startCodon, stopCodon, 'CDS']
32
- fragGeneScan_ORFs.update({po: orf})
15
+ for dna_region in dna_regions:
16
+ fragGeneScan_ORFs[dna_region] = collections.OrderedDict()
17
+ for dna_region in dna_regions:
18
+ genome = dna_regions[dna_region][0]
19
+ genome_size = len(genome)
20
+ genome_rev = revCompIterative(genome)
21
+ with open(tool_pred, 'r') as fragGeneScan_input:
22
+ for line in fragGeneScan_input:
23
+ line = line.split()
24
+ if len(line) == 10 and "FGS" in line[1] and "CDS" in line[2] and dna_region in line[0]:
25
+ start = int(line[3])
26
+ stop = int(line[4])
27
+ strand = line[6]
28
+ if '-' in strand: # Reverse Compliment starts and stops adjusted
29
+ r_start = genome_size - stop
30
+ r_stop = genome_size - start
31
+ startCodon = genome_rev[r_start:r_start + 3]
32
+ stopCodon = genome_rev[r_stop - 2:r_stop + 1]
33
+ elif '+' in strand:
34
+ startCodon = genome[start - 1:start + 2]
35
+ stopCodon = genome[stop - 3:stop]
36
+ po = str(start) + ',' + str(stop)
37
+ orf = [strand, startCodon, stopCodon, 'CDS', 'FragGeneScan']
38
+ fragGeneScan_ORFs.update({po: orf})
33
39
 
34
- fragGeneScan_ORFs = sortORFs(fragGeneScan_ORFs)
40
+ for group in fragGeneScan_ORFs:
41
+ fragGeneScan_ORFs[group] = sortORFs(fragGeneScan_ORFs[group])
35
42
  return fragGeneScan_ORFs
ORForise/Tools/GFF/GFF.py CHANGED
@@ -10,53 +10,57 @@ except ImportError:
10
10
 
11
11
  def GFF(*args):
12
12
  tool_pred = args[0]
13
- genome = args[1]
14
- #types = args[2]
13
+ dna_regions = args[1]
15
14
  GFF_ORFs = collections.OrderedDict()
16
- genome_size = len(genome)
17
- genome_rev = revCompIterative(genome)
18
- with open(tool_pred, 'r') as gff_input:
19
- for line in gff_input:
20
- if '#' not in line:
21
- line = line.split('\t')
22
- #gene_types = types.split(',') - Temporary fix
23
- #if any(gene_type == line[2] for gene_type in gene_types) and len(line) == 9: # line[2] for normalrun
24
- if 'CDS' in line[2] and len(line) == 9:
25
- start = int(line[3])
26
- stop = int(line[4])
27
- strand = line[6]
28
- info = line[8]
29
- if stop >= genome_size:
30
- extra_stop = stop - genome_size
31
- corrected_stop = genome_size
32
- if '-' in strand: # Reverse Compliment starts and stops adjusted
33
- r_start = genome_size - corrected_stop
34
- r_stop = genome_size - start
35
- seq = genome_rev[r_start:r_stop + 1]
36
- extra_seq = genome_rev[-extra_stop - 1:]
37
- seq = extra_seq+seq
38
- startCodon = seq[:3]
39
- stopCodon = seq[-3:]
40
- elif '+' in strand:
41
- seq = genome[start -1 :corrected_stop]
42
- extra_seq = genome[:extra_stop +1]
43
- seq = seq+extra_seq
44
- startCodon = seq[:3]
45
- stopCodon = seq[-3:]
46
- else:
47
- if '-' in strand: # Reverse Compliment starts and stops adjusted
48
- r_start = genome_size - stop
49
- r_stop = genome_size - start
50
- startCodon = genome_rev[r_start:r_start + 3]
51
- stopCodon = genome_rev[r_stop - 2:r_stop + 1]
52
- elif '+' in strand:
53
- startCodon = genome[start - 1:start + 2]
54
- stopCodon = genome[stop - 3:stop]
55
- po = str(start) + ',' + str(stop)
56
- orf = [strand, startCodon, stopCodon, line[2],info] # This needs to detect the type
57
- GFF_ORFs.update({po: orf})
58
- # elif "CDS" in line[2]:
59
- # sys.exit("SAS")
15
+ for dna_region in dna_regions:
16
+ GFF_ORFs[dna_region] = collections.OrderedDict()
17
+ for dna_region in dna_regions:
18
+ genome = dna_regions[dna_region][0]
19
+ genome_size = len(genome)
20
+ genome_rev = revCompIterative(genome)
21
+ with open(tool_pred, 'r') as gff_input:
22
+ for line in gff_input:
23
+ if '#' not in line:
24
+ line = line.split('\t')
25
+ #gene_types = types.split(',') - Temporary fix
26
+ #if any(gene_type == line[2] for gene_type in gene_types) and len(line) == 9: # line[2] for normalrun
27
+ if 'CDS' in line[2] and len(line) == 9 and dna_region in line[0]:
28
+ start = int(line[3])
29
+ stop = int(line[4])
30
+ strand = line[6]
31
+ info = line[8]
32
+ if stop >= genome_size:
33
+ extra_stop = stop - genome_size
34
+ corrected_stop = genome_size
35
+ if '-' in strand: # Reverse Compliment starts and stops adjusted
36
+ r_start = genome_size - corrected_stop
37
+ r_stop = genome_size - start
38
+ seq = genome_rev[r_start:r_stop + 1]
39
+ extra_seq = genome_rev[-extra_stop - 1:]
40
+ seq = extra_seq+seq
41
+ startCodon = seq[:3]
42
+ stopCodon = seq[-3:]
43
+ elif '+' in strand:
44
+ seq = genome[start -1 :corrected_stop]
45
+ extra_seq = genome[:extra_stop +1]
46
+ seq = seq+extra_seq
47
+ startCodon = seq[:3]
48
+ stopCodon = seq[-3:]
49
+ else:
50
+ if '-' in strand: # Reverse Compliment starts and stops adjusted
51
+ r_start = genome_size - stop
52
+ r_stop = genome_size - start
53
+ startCodon = genome_rev[r_start:r_start + 3]
54
+ stopCodon = genome_rev[r_stop - 2:r_stop + 1]
55
+ elif '+' in strand:
56
+ startCodon = genome[start - 1:start + 2]
57
+ stopCodon = genome[stop - 3:stop]
58
+ po = str(start) + ',' + str(stop)
59
+ orf = [strand, startCodon, stopCodon, line[2], 'GFF-Standard'] # This needs to detect the type
60
+ GFF_ORFs.update({po: orf})
61
+ # elif "CDS" in line[2]:
62
+ # sys.exit("SAS")
60
63
 
61
- GFF_ORFs = sortORFs(GFF_ORFs)
64
+ for group in GFF_ORFs:
65
+ GFF_ORFs[group] = sortORFs(GFF_ORFs[group])
62
66
  return GFF_ORFs
@@ -8,33 +8,40 @@ except ImportError:
8
8
  from ORForise.utils import sortORFs
9
9
 
10
10
 
11
- def GLIMMER_3(tool_pred, genome):
11
+ def GLIMMER_3(*args):
12
+ tool_pred = args[0]
13
+ dna_regions = args[1]
12
14
  GLIMMER_ORFs = collections.OrderedDict()
13
- genome_size = len(genome)
14
- genome_rev = revCompIterative(genome)
15
- with open(tool_pred,
16
- 'r') as glimmer_input: # GLIMMER_3 reverses the start and stop positions for ORFS on the negative strand
17
- for line in glimmer_input:
18
- if '>' not in line: # This will not work with multiple contigs
19
- line = line.split()
20
- if len(line) == 5 and "orf" in line[0]:
21
- if '-' in line[3]: # Reverse Compliment starts and stops adjusted - Switched to match Sense Strand
22
- start = int(line[2])
23
- stop = int(line[1])
24
- strand = '-'
25
- r_start = genome_size - stop
26
- r_stop = genome_size - start
27
- startCodon = genome_rev[r_start:r_start + 3]
28
- stopCodon = genome_rev[r_stop - 2:r_stop + 1]
29
- elif '+' in line[3]:
30
- start = int(line[1])
31
- stop = int(line[2])
32
- strand = '+'
33
- startCodon = genome[start - 1:start + 3]
34
- stopCodon = genome[stop - 3:stop]
35
- po = str(start) + ',' + str(stop)
36
- orf = [strand, startCodon, stopCodon, 'CDS']
37
- GLIMMER_ORFs.update({po: orf})
15
+ for dna_region in dna_regions:
16
+ GLIMMER_ORFs[dna_region] = collections.OrderedDict()
17
+ for dna_region in dna_regions:
18
+ genome = dna_regions[dna_region][0]
19
+ genome_size = len(genome)
20
+ genome_rev = revCompIterative(genome)
21
+ with open(tool_pred,
22
+ 'r') as glimmer_input: # GLIMMER_3 reverses the start and stop positions for ORFS on the negative strand
23
+ for line in glimmer_input:
24
+ if '>' not in line: # This will not work with multiple contigs
25
+ line = line.split()
26
+ if len(line) == 5 and "orf" in line[0] and dna_region in line[0]:
27
+ if '-' in line[3]: # Reverse Compliment starts and stops adjusted - Switched to match Sense Strand
28
+ start = int(line[2])
29
+ stop = int(line[1])
30
+ strand = '-'
31
+ r_start = genome_size - stop
32
+ r_stop = genome_size - start
33
+ startCodon = genome_rev[r_start:r_start + 3]
34
+ stopCodon = genome_rev[r_stop - 2:r_stop + 1]
35
+ elif '+' in line[3]:
36
+ start = int(line[1])
37
+ stop = int(line[2])
38
+ strand = '+'
39
+ startCodon = genome[start - 1:start + 3]
40
+ stopCodon = genome[stop - 3:stop]
41
+ po = str(start) + ',' + str(stop)
42
+ orf = [strand, startCodon, stopCodon, 'CDS', 'GLIMMER_3']
43
+ GLIMMER_ORFs.update({po: orf})
38
44
 
39
- GLIMMER_ORFs = sortORFs(GLIMMER_ORFs)
45
+ for group in GLIMMER_ORFs:
46
+ GLIMMER_ORFs[group] = sortORFs(GLIMMER_ORFs[group])
40
47
  return GLIMMER_ORFs