ORForise 1.6.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73) hide show
  1. ORForise/Aggregate_Compare.py +378 -0
  2. ORForise/Annotation_Compare.py +317 -0
  3. ORForise/Annotation_Intersector.py +726 -0
  4. ORForise/Aux/StORF_Undetected/Completely_Undetected/Completey_Undetected.py +53 -0
  5. ORForise/Aux/StORF_Undetected/Completely_Undetected/__init__.py +0 -0
  6. ORForise/Aux/StORF_Undetected/StORF_Undetected.py +35 -0
  7. ORForise/Aux/StORF_Undetected/__init__.py +0 -0
  8. ORForise/Aux/StORF_Undetected/unvitiated_Genes/__init__.py +0 -0
  9. ORForise/Aux/StORF_Undetected/unvitiated_Genes/unvitiated_Missed_Genes.py +46 -0
  10. ORForise/Aux/TabToGFF/TabToGFF.py +140 -0
  11. ORForise/Aux/TabToGFF/__init__.py +0 -0
  12. ORForise/Aux/__init__.py +0 -0
  13. ORForise/Comparator.py +882 -0
  14. ORForise/Convert_To_GFF.py +141 -0
  15. ORForise/GFF_Adder.py +543 -0
  16. ORForise/List_Tools.py +56 -0
  17. ORForise/ORForise_Analysis/__init__.py +0 -0
  18. ORForise/ORForise_Analysis/cds_checker.py +77 -0
  19. ORForise/ORForise_Analysis/gene_Lenghts.py +28 -0
  20. ORForise/ORForise_Analysis/genome_Metrics.py +258 -0
  21. ORForise/ORForise_Analysis/hypothetical_gene_predictions.py +88 -0
  22. ORForise/ORForise_Analysis/missed_Gene_Metrics.py +277 -0
  23. ORForise/ORForise_Analysis/parital_Match_Analysis.py +230 -0
  24. ORForise/ORForise_Analysis/result_File_Analysis.py +286 -0
  25. ORForise/ORForise_Analysis/start_Codon_Substitution.py +161 -0
  26. ORForise/StORForise.py +115 -0
  27. ORForise/Tools/Augustus/Augustus.py +54 -0
  28. ORForise/Tools/Augustus/__init__.py +0 -0
  29. ORForise/Tools/Balrog/Balrog.py +56 -0
  30. ORForise/Tools/Balrog/__init__.py +0 -0
  31. ORForise/Tools/EasyGene/EasyGene.py +55 -0
  32. ORForise/Tools/EasyGene/__init__.py +0 -0
  33. ORForise/Tools/FGENESB/FGENESB.py +57 -0
  34. ORForise/Tools/FGENESB/__init__.py +0 -0
  35. ORForise/Tools/FragGeneScan/FragGeneScan.py +54 -0
  36. ORForise/Tools/FragGeneScan/__init__.py +0 -0
  37. ORForise/Tools/GFF/GFF.py +77 -0
  38. ORForise/Tools/GFF/__init__.py +0 -0
  39. ORForise/Tools/GLIMMER3/GLIMMER3.py +59 -0
  40. ORForise/Tools/GLIMMER3/__init__.py +0 -0
  41. ORForise/Tools/GeneMark/GeneMark.py +135 -0
  42. ORForise/Tools/GeneMark/__init__.py +0 -0
  43. ORForise/Tools/GeneMarkHA/GeneMarkHA.py +54 -0
  44. ORForise/Tools/GeneMarkHA/__init__.py +0 -0
  45. ORForise/Tools/GeneMarkHMM/GeneMarkHMM.py +55 -0
  46. ORForise/Tools/GeneMarkHMM/__init__.py +0 -0
  47. ORForise/Tools/GeneMarkS/GeneMarkS.py +54 -0
  48. ORForise/Tools/GeneMarkS/__init__.py +0 -0
  49. ORForise/Tools/GeneMarkS2/GeneMarkS2.py +55 -0
  50. ORForise/Tools/GeneMarkS2/__init__.py +0 -0
  51. ORForise/Tools/MetaGene/MetaGene.py +54 -0
  52. ORForise/Tools/MetaGene/__init__.py +0 -0
  53. ORForise/Tools/MetaGeneAnnotator/MetaGeneAnnotator.py +55 -0
  54. ORForise/Tools/MetaGeneAnnotator/__init__.py +0 -0
  55. ORForise/Tools/MetaGeneMark/MetaGeneMark.py +55 -0
  56. ORForise/Tools/MetaGeneMark/__init__.py +0 -0
  57. ORForise/Tools/Prodigal/Prodigal.py +55 -0
  58. ORForise/Tools/Prodigal/__init__.py +0 -0
  59. ORForise/Tools/Prokka/Prokka.py +57 -0
  60. ORForise/Tools/Prokka/__init__.py +0 -0
  61. ORForise/Tools/StORF-Reporter/StORF-Reporter.py +56 -0
  62. ORForise/Tools/StORF-Reporter/__init__.py +0 -0
  63. ORForise/Tools/TransDecoder/TransDecoder.py +54 -0
  64. ORForise/Tools/TransDecoder/__init__.py +0 -0
  65. ORForise/Tools/__init__.py +0 -0
  66. ORForise/__init__.py +0 -0
  67. ORForise/utils.py +236 -0
  68. orforise-1.6.2.dist-info/METADATA +1038 -0
  69. orforise-1.6.2.dist-info/RECORD +73 -0
  70. orforise-1.6.2.dist-info/WHEEL +5 -0
  71. orforise-1.6.2.dist-info/entry_points.txt +15 -0
  72. orforise-1.6.2.dist-info/licenses/LICENSE +624 -0
  73. orforise-1.6.2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,53 @@
1
+ import argparse
2
+
3
+ from orderedset import OrderedSet
4
+
5
# Command-line options for this script. Their destinations ('undetected_Genes'
# and 'tool') match the keyword parameters of tool(), which is invoked with
# **vars(args) when the file is run as a script.
parser = argparse.ArgumentParser()
parser.add_argument('-i', '--undetected_Genes', default='', help='Undetected Genes.')
parser.add_argument('-t', '--tool', default='', help='Tool Used.')
# NOTE: arguments are parsed at module level, i.e. also when this file is imported.
args = parser.parse_args()
9
+
10
+
11
def un_Genes(undetected_Genes, tool_ORFs):
    """Count the undetected genes that are overlapped by at least one tool ORF.

    Every line of ``undetected_Genes`` that contains '>' is treated as a gene
    header whose '_'-separated fields 1 and 2 are the gene start and stop.
    For the first tool ORF sharing at least one position with the gene, the
    overlap size and the split header fields are printed and the gene is
    counted once. A summary line is printed at the end.

    Args:
        undetected_Genes: Path of the file holding the gene headers.
        tool_ORFs: Iterable of "start,stop" strings (inclusive coordinates).
    """
    # Parse the tool coordinates once rather than once per gene.
    tool_spans = []
    for t_ORF in tool_ORFs:
        t_fields = t_ORF.split(',')
        tool_spans.append((int(t_fields[0]), int(t_fields[1])))

    count = 0
    genes = []
    with open(undetected_Genes, 'r') as undetected_file:
        for line in undetected_file:
            if ">" not in line:
                continue
            fields = line.split('_')
            g_start = int(fields[1])
            g_stop = int(fields[2])
            genes.append(str(g_start) + ',' + str(g_stop))
            for t_start, t_stop in tool_spans:
                # Size of the intersection of two inclusive intervals; this is
                # the same number the per-position set intersection produced,
                # without materialising every coordinate. Values <= 0 mean
                # "no shared position" (including reversed intervals).
                overlap = min(g_stop, t_stop) - max(g_start, t_start) + 1
                if overlap >= 1:
                    print(overlap)
                    print(fields)
                    count += 1
                    break  # one hit is enough to count this gene
    print(
        "Number of Undetected Genes: " + str(len(genes)) + " Number of Spoiled Undetected Genes By Tool: " + str(count))
36
+
37
+
38
def tool(tool, undetected_Genes):
    """Collect CDS coordinates from a tool's GFF file and pass them to un_Genes.

    A line is used when its second whitespace-separated field contains
    "Prodigal" and its third contains "CDS" (edit that test for other tools);
    fields four and five are taken as start and stop.

    Args:
        tool: Path of the tool prediction (GFF-style) file.
        undetected_Genes: Path of the undetected-genes file, forwarded to
            un_Genes unchanged.
    """
    tool_ORFs = []
    with open(tool, 'r') as tool_gff:
        for line in tool_gff:
            fields = line.split()
            # Blank lines and short pragma/comment lines cannot be feature
            # records; skip them instead of failing with IndexError.
            if len(fields) < 5:
                continue
            if "Prodigal" in fields[1] and "CDS" in fields[2]:  # Modify For Tool
                t_start = int(fields[3])
                t_stop = int(fields[4])
                tool_ORFs.append(str(t_start) + ',' + str(t_stop))

    un_Genes(undetected_Genes, tool_ORFs)
50
+
51
+
52
+ if __name__ == "__main__":
53
+ tool(**vars(args))
@@ -0,0 +1,35 @@
1
+ import collections
2
+
3
+ try:
4
+ from utils import revCompIterative
5
+ from utils import sortORFs
6
+ except ImportError:
7
+ from ORForise.utils import revCompIterative
8
+ from ORForise.utils import sortORFs
9
+
10
+
11
def StORF_Undetected(tool_pred, genome):
    """Read StORF predictions from a GFF-style file together with their codons.

    A line is used when its second whitespace-separated field contains
    "StORF" and its third contains "ORF". For each such record the first and
    last three bases are sliced from ``genome`` ('+' strand) or from its
    reverse complement ('-' strand).

    Args:
        tool_pred: Path of the prediction file.
        genome: Genome sequence as a single string.

    Returns:
        The result of ``sortORFs`` applied to an OrderedDict mapping
        "start,stop" to ``[strand, startCodon, stopCodon]``.
    """
    storf_orfs = collections.OrderedDict()
    genome_size = len(genome)
    genome_Rev = revCompIterative(genome)
    with open(tool_pred, 'r') as storf_input:
        for line in storf_input:
            fields = line.split()
            # A usable record needs the columns up to and including strand;
            # blank lines and short pragma lines are skipped rather than
            # raising IndexError.
            if len(fields) < 7:
                continue
            if "StORF" not in fields[1] or "ORF" not in fields[2]:
                continue
            start = int(fields[3])
            stop = int(fields[4])
            strand = fields[6]
            if '-' in strand:  # Reverse complement: starts and stops adjusted
                r_start = genome_size - stop
                r_stop = genome_size - start
                startCodon = genome_Rev[r_start:r_start + 3]
                stopCodon = genome_Rev[r_stop - 2:r_stop + 1]
            elif '+' in strand:
                startCodon = genome[start - 1:start - 1 + 3]
                stopCodon = genome[stop - 3:stop - 1 + 1]
            else:
                # Without a strand the codons cannot be located; previously
                # this reused the codons of the preceding record (or raised
                # NameError on the first record).
                continue
            storf_orfs[str(start) + ',' + str(stop)] = [strand, startCodon, stopCodon]

    return sortORFs(storf_orfs)
File without changes
@@ -0,0 +1,46 @@
1
+ import argparse
2
+ import collections
3
+
4
# Command-line options for this script. Their destinations ('missing_genes'
# and 'genome_to_compare') match the keyword parameters of comparator(), which
# is invoked with **vars(args) when the file is run as a script.
parser = argparse.ArgumentParser()
# Required: with the former default of '' comparator() would try to open '../'.
parser.add_argument('-mg', '--missing_genes', required=True, help='Which set of genes to check?')
parser.add_argument('-g', '--genome_to_compare', required=True,
                    help='Which genome to analyse? Genome files have same prefix'
                         ' - .fa and .gff appended')
# NOTE: arguments are parsed at module level, i.e. also when this file is imported.
args = parser.parse_args()
10
+
11
+
12
def comparator(missing_genes, genome_to_compare):
    """Print how many missed genes are not overlapped by a Prodigal CDS.

    Gene coordinates are read from the header lines (starting with '>') of
    ``'../' + missing_genes``; '_'-separated fields 1 and 2 are start and stop.
    Predictions are read from
    ``'../../Prodigal/Prodigal_' + genome_to_compare + '.gff'``. Both paths are
    relative to the current working directory.

    For every prediction/gene pair sharing more than 50 positions, the
    prediction start is printed; the gene is then removed from the surviving
    list and its key printed, unless it was already removed. The number of
    surviving genes is printed last.

    Args:
        missing_genes: File name of the missed-genes file.
        genome_to_compare: Genome prefix used to build the Prodigal GFF path.
    """
    missed_genes = collections.OrderedDict()  # "start_stop" -> (start, stop)
    non_vitiated_genes = []
    with open('../' + missing_genes, 'r') as m_genes:
        for line in m_genes:
            if line.startswith('>'):
                fields = line.split('_')
                start = int(fields[1])
                stop = int(fields[2])
                key = str(start) + '_' + str(stop)
                missed_genes[key] = (start, stop)
                non_vitiated_genes.append(key)
    ##############################################

    with open('../../Prodigal/Prodigal_' + genome_to_compare + '.gff', 'r') as prodigal_input:
        for line in prodigal_input:
            fields = line.split()
            # Blank lines and short pragma lines are not feature records;
            # skip them instead of failing with IndexError.
            if len(fields) < 5:
                continue
            if "Prodigal" in fields[1] and "CDS" in fields[2]:
                start = int(fields[3])
                stop = int(fields[4])
                for missed, (g_start, g_stop) in missed_genes.items():
                    # Intersection size of two inclusive intervals, equal to
                    # the former per-position set intersection.
                    overlap = max(0, min(stop, g_stop) - max(start, g_start) + 1)
                    if overlap > 50:
                        print(start)
                        try:
                            non_vitiated_genes.remove(missed)
                        except ValueError:
                            # Already removed by an earlier prediction.
                            continue
                        print(missed)
    print(len(non_vitiated_genes))
43
+
44
+
45
+ if __name__ == "__main__":
46
+ comparator(**vars(args))
@@ -0,0 +1,140 @@
1
+ import collections
2
+ import logging
3
+
4
+
5
+
6
+ def _make_feature(seqid, source, type_, start, end, score, strand, phase, attributes):
7
+ attrs = []
8
+ for k, v in attributes.items():
9
+ attrs.append(f"{k}={v}")
10
+ return f"{seqid}\t{source}\t{type_}\t{start}\t{end}\t{score}\t{strand}\t{phase}\t{';'.join(attrs)}\n"
11
+
12
+
13
def parse_blast_tab6(path, genome_seq, gene_ident=None):
    """Parse a 12-column tabular BLAST file into an ordered hit dictionary.

    Blank lines and lines starting with '#' are ignored. Lines with fewer than
    12 tab-separated columns, or with non-integer subject coordinates, are
    skipped with a logged warning.

    Args:
        path: Path of the BLAST tabular file.
        genome_seq: Accepted for signature parity with the other parsers; not
            used here.
        gene_ident: Accepted for signature parity; not used here.

    Returns:
        OrderedDict mapping "start,end" (subject coordinates, start <= end) to
        ``[strand, '.', 'similarity', attrs]``. Strand is '+' when
        sstart <= send, otherwise '-'. A later hit with the same coordinates
        replaces the earlier one.
    """
    results = collections.OrderedDict()
    count = 0
    with open(path, 'r') as fh:
        for i, line in enumerate(fh, 1):
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            parts = line.split('\t')
            if len(parts) < 12:
                logging.warning("Line %d: unexpected BLAST line with %d columns", i, len(parts))
                continue
            # Only the columns that end up in the output are named.
            qseqid, pident, length = parts[0], parts[2], parts[3]
            qstart, qend, sstart, send, evalue, bitscore = parts[6:12]
            try:
                sstart = int(sstart)
                send = int(send)
            except ValueError:
                logging.warning("Line %d: non-integer coordinates in BLAST sstart/send", i)
                continue
            start = min(sstart, send)
            end = max(sstart, send)
            # Reversed subject coordinates indicate a hit on the minus strand.
            strand = '+' if sstart <= send else '-'
            attrs = {
                'ID': f'blast_hit{count}',
                'Target': f'{qseqid} {qstart} {qend}',
                'pident': pident,
                'length': length,
                'evalue': evalue,
                'bitscore': bitscore
            }
            results[f"{start},{end}"] = [strand, '.', 'similarity', attrs]
            count += 1
    return results
46
+
47
+
48
def parse_abricate(path, genome_seq, gene_ident=None):
    """Parse an Abricate tab-separated report into an ordered hit dictionary.

    The most recent line starting with '#' is used as the column header;
    data lines before the first header are ignored. A data line whose column
    count differs from the header, or whose START/END is not an integer, is
    skipped with a logged warning.

    Args:
        path: Path of the Abricate report.
        genome_seq: Accepted for signature parity with the other parsers; not
            used here.
        gene_ident: Accepted for signature parity; not used here.

    Returns:
        OrderedDict mapping "start,end" to a dict of hit attributes. A later
        hit with the same coordinates replaces the earlier one.
        NOTE(review): unlike the BLAST and GeneMark parsers in this file, the
        value is the attribute dict itself rather than a
        ``[strand, '.', type, attrs]`` list - confirm callers expect this.
    """
    results = collections.OrderedDict()
    with open(path, 'r') as fh:
        header = None
        for i, line in enumerate(fh, 1):
            line = line.rstrip('\n')
            if not line:
                continue
            if line.startswith('#'):
                header = line.split('\t')
                continue
            if header is None:
                # skip any pre-header content until a header line is encountered
                continue
            parts = line.split('\t')
            if len(parts) != len(header):
                logging.warning("Line %d: unexpected number of columns in Abricate line", i)
                continue
            row = dict(zip(header, parts))

            try:
                # A missing START/END column falls back to 0.
                start = int(row.get('START', '0'))
                end = int(row.get('END', '0'))
            except ValueError:
                logging.warning("Line %d: invalid START/END in Abricate line", i)
                continue

            # Empty or missing descriptive fields are reported as 'unknown'.
            results[f"{start},{end}"] = {
                'seqid': row.get('SEQUENCE'),
                'start': start,
                'end': end,
                'strand': row.get('STRAND'),
                'gene': row.get('GENE'),
                'accession': row.get('ACCESSION') or 'unknown',
                'database': row.get('DATABASE') or 'unknown',
                'identity': row.get('%IDENTITY'),
                'coverage': row.get('%COVERAGE'),
                'product': row.get('PRODUCT') or 'unknown',
                'resistance': row.get('RESISTANCE') or 'unknown'
            }
    return results
102
+
103
+
104
def parse_genemark(path, genome_seq, gene_ident=None):
    """Parse whitespace-separated GeneMark prediction lines into an ordered dict.

    A line is used when it has at least three whitespace-separated tokens and
    the first two are integers (start and stop); every other line is silently
    ignored. The strand is '-' when the third token contains 'complement',
    otherwise '+'.

    Args:
        path: Path of the GeneMark output file.
        genome_seq: Accepted for signature parity with the other parsers; not
            used here.
        gene_ident: Accepted for signature parity; not used here.

    Returns:
        OrderedDict mapping "start,stop" to ``[strand, '.', 'CDS', attrs]``.
        A later record with the same coordinates replaces the earlier one.
    """
    results = collections.OrderedDict()
    count = 0
    with open(path, 'r') as fh:
        for line in fh:
            # split() already discards surrounding whitespace, so blank lines
            # yield no tokens and fall through the length check.
            parts = line.split()
            if len(parts) < 3:
                continue
            try:
                start = int(parts[0])
                stop = int(parts[1])
            except ValueError:
                continue
            strand = '-' if 'complement' in parts[2] else '+'
            attrs = {'ID': f'genemark_hit{count}', 'tool': 'GeneMark'}
            results[f"{start},{stop}"] = [strand, '.', 'CDS', attrs]
            count += 1
    return results
129
+
130
+
131
def TabToGFF(input_file, genome_seq, gene_ident='CDS', fmt='blast'):
    """Dispatch a tabular annotation file to the parser matching ``fmt``.

    Args:
        input_file: Path of the file to parse.
        genome_seq: Forwarded to the selected parser.
        gene_ident: Forwarded to the selected parser.
        fmt: Format name, matched case-insensitively and ignoring surrounding
            whitespace. Accepted aliases: 'blast', 'blast_tab6', 'tab6';
            'abricate', 'abricate_tsv', 'abricate_format';
            'genemark', 'gene_mark'.

    Returns:
        Whatever the selected parser returns.

    Raises:
        ValueError: If ``fmt`` matches none of the aliases.
    """
    # Should be cleaned up to use consistent format names
    fmt = fmt.strip().lower()
    if fmt in ('blast', 'blast_tab6', 'tab6'):
        selected_parser = parse_blast_tab6
    elif fmt in ('abricate', 'abricate_tsv', 'abricate_format'):
        selected_parser = parse_abricate
    elif fmt in ('genemark', 'gene_mark'):
        selected_parser = parse_genemark
    else:
        raise ValueError(f"Unknown format: {fmt}")
    return selected_parser(input_file, genome_seq, gene_ident)
File without changes
File without changes