ORForise 1.6.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ORForise/Aggregate_Compare.py +378 -0
- ORForise/Annotation_Compare.py +317 -0
- ORForise/Annotation_Intersector.py +726 -0
- ORForise/Aux/StORF_Undetected/Completely_Undetected/Completey_Undetected.py +53 -0
- ORForise/Aux/StORF_Undetected/Completely_Undetected/__init__.py +0 -0
- ORForise/Aux/StORF_Undetected/StORF_Undetected.py +35 -0
- ORForise/Aux/StORF_Undetected/__init__.py +0 -0
- ORForise/Aux/StORF_Undetected/unvitiated_Genes/__init__.py +0 -0
- ORForise/Aux/StORF_Undetected/unvitiated_Genes/unvitiated_Missed_Genes.py +46 -0
- ORForise/Aux/TabToGFF/TabToGFF.py +140 -0
- ORForise/Aux/TabToGFF/__init__.py +0 -0
- ORForise/Aux/__init__.py +0 -0
- ORForise/Comparator.py +882 -0
- ORForise/Convert_To_GFF.py +141 -0
- ORForise/GFF_Adder.py +543 -0
- ORForise/List_Tools.py +56 -0
- ORForise/ORForise_Analysis/__init__.py +0 -0
- ORForise/ORForise_Analysis/cds_checker.py +77 -0
- ORForise/ORForise_Analysis/gene_Lenghts.py +28 -0
- ORForise/ORForise_Analysis/genome_Metrics.py +258 -0
- ORForise/ORForise_Analysis/hypothetical_gene_predictions.py +88 -0
- ORForise/ORForise_Analysis/missed_Gene_Metrics.py +277 -0
- ORForise/ORForise_Analysis/parital_Match_Analysis.py +230 -0
- ORForise/ORForise_Analysis/result_File_Analysis.py +286 -0
- ORForise/ORForise_Analysis/start_Codon_Substitution.py +161 -0
- ORForise/StORForise.py +115 -0
- ORForise/Tools/Augustus/Augustus.py +54 -0
- ORForise/Tools/Augustus/__init__.py +0 -0
- ORForise/Tools/Balrog/Balrog.py +56 -0
- ORForise/Tools/Balrog/__init__.py +0 -0
- ORForise/Tools/EasyGene/EasyGene.py +55 -0
- ORForise/Tools/EasyGene/__init__.py +0 -0
- ORForise/Tools/FGENESB/FGENESB.py +57 -0
- ORForise/Tools/FGENESB/__init__.py +0 -0
- ORForise/Tools/FragGeneScan/FragGeneScan.py +54 -0
- ORForise/Tools/FragGeneScan/__init__.py +0 -0
- ORForise/Tools/GFF/GFF.py +77 -0
- ORForise/Tools/GFF/__init__.py +0 -0
- ORForise/Tools/GLIMMER3/GLIMMER3.py +59 -0
- ORForise/Tools/GLIMMER3/__init__.py +0 -0
- ORForise/Tools/GeneMark/GeneMark.py +135 -0
- ORForise/Tools/GeneMark/__init__.py +0 -0
- ORForise/Tools/GeneMarkHA/GeneMarkHA.py +54 -0
- ORForise/Tools/GeneMarkHA/__init__.py +0 -0
- ORForise/Tools/GeneMarkHMM/GeneMarkHMM.py +55 -0
- ORForise/Tools/GeneMarkHMM/__init__.py +0 -0
- ORForise/Tools/GeneMarkS/GeneMarkS.py +54 -0
- ORForise/Tools/GeneMarkS/__init__.py +0 -0
- ORForise/Tools/GeneMarkS2/GeneMarkS2.py +55 -0
- ORForise/Tools/GeneMarkS2/__init__.py +0 -0
- ORForise/Tools/MetaGene/MetaGene.py +54 -0
- ORForise/Tools/MetaGene/__init__.py +0 -0
- ORForise/Tools/MetaGeneAnnotator/MetaGeneAnnotator.py +55 -0
- ORForise/Tools/MetaGeneAnnotator/__init__.py +0 -0
- ORForise/Tools/MetaGeneMark/MetaGeneMark.py +55 -0
- ORForise/Tools/MetaGeneMark/__init__.py +0 -0
- ORForise/Tools/Prodigal/Prodigal.py +55 -0
- ORForise/Tools/Prodigal/__init__.py +0 -0
- ORForise/Tools/Prokka/Prokka.py +57 -0
- ORForise/Tools/Prokka/__init__.py +0 -0
- ORForise/Tools/StORF-Reporter/StORF-Reporter.py +56 -0
- ORForise/Tools/StORF-Reporter/__init__.py +0 -0
- ORForise/Tools/TransDecoder/TransDecoder.py +54 -0
- ORForise/Tools/TransDecoder/__init__.py +0 -0
- ORForise/Tools/__init__.py +0 -0
- ORForise/__init__.py +0 -0
- ORForise/utils.py +236 -0
- orforise-1.6.2.dist-info/METADATA +1038 -0
- orforise-1.6.2.dist-info/RECORD +73 -0
- orforise-1.6.2.dist-info/WHEEL +5 -0
- orforise-1.6.2.dist-info/entry_points.txt +15 -0
- orforise-1.6.2.dist-info/licenses/LICENSE +624 -0
- orforise-1.6.2.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
|
|
3
|
+
from orderedset import OrderedSet
|
|
4
|
+
|
|
5
|
+
# Command-line interface for this script: two optional string options,
# both defaulting to the empty string.
# NOTE(review): parse_args() executes at module import time, not only when
# the file is run as a script, so importing this module consumes sys.argv.
parser = argparse.ArgumentParser()
parser.add_argument('-i', '--undetected_Genes', default='', help='Undected Genes.')
parser.add_argument('-t', '--tool', default='', help='Tool Used.')
args = parser.parse_args()
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def un_Genes(undetected_Genes, tool_ORFs):
    """Count the undetected genes that overlap at least one tool ORF.

    Every line of ``undetected_Genes`` containing ``>`` is treated as a
    header of the form ``<prefix>_<start>_<stop>...``; the second and third
    underscore-separated fields are read as integer coordinates.

    For each gene, the tool ORFs are scanned in order. At the first ORF that
    shares one or more positions with the gene, the overlap size and the
    split header are printed and the gene is counted once. A summary line is
    printed at the end.

    Args:
        undetected_Genes: Path to the file holding the ``>`` header lines.
        tool_ORFs: Iterable of ``"start,stop"`` strings.

    Returns:
        None. All results are written to standard output.
    """
    count = 0
    genes = []

    # Parse the "start,stop" strings once, not once per gene.
    tool_spans = []
    for t_ORF in tool_ORFs:
        coords = t_ORF.split(',')
        tool_spans.append((int(coords[0]), int(coords[1])))

    with open(undetected_Genes, 'r') as undetected_handle:
        for line in undetected_handle:
            if ">" not in line:
                continue
            fields = line.split('_')
            g_start = int(fields[1])
            g_stop = int(fields[2])
            genes.append(str(g_start) + ',' + str(g_stop))
            for t_start, t_stop in tool_spans:
                # Size of the intersection of two inclusive intervals. A
                # value below 1 means they are disjoint (or one is reversed),
                # which matches the size of the intersection of the
                # corresponding position sets without materialising them.
                overlap = min(g_stop, t_stop) - max(g_start, t_start) + 1
                if overlap >= 1:
                    print(overlap)
                    print(fields)
                    count += 1
                    break  # one hit is enough to count this gene
    print(
        "Number of Undetected Genes: " + str(len(genes)) + " Number of Spoiled Undetected Genes By Tool: " + str(count))
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def tool(tool, undetected_Genes):
    """Read CDS coordinates from a tool's output and compare them to undetected genes.

    Each line of the file at ``tool`` is split on whitespace. Lines whose
    second field contains ``"Prodigal"`` and whose third field contains
    ``"CDS"`` contribute a ``"start,stop"`` string built from the fourth and
    fifth fields. The collected list is then passed to ``un_Genes``.

    Args:
        tool: Path to the whitespace-delimited prediction file.
        undetected_Genes: Path forwarded unchanged to ``un_Genes``.

    Returns:
        None.
    """
    tool_ORFs = []
    with open(tool, 'r') as prediction_handle:
        for line in prediction_handle:
            fields = line.split()
            # Blank or truncated lines cannot carry coordinates; skip them
            # instead of failing with IndexError.
            if len(fields) < 5:
                continue
            if "Prodigal" in fields[1] and "CDS" in fields[2]:  # Modify For Tool
                t_start = int(fields[3])
                t_stop = int(fields[4])
                tool_ORFs.append(str(t_start) + ',' + str(t_stop))

    un_Genes(undetected_Genes, tool_ORFs)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
if __name__ == "__main__":
    # The parsed option names ('undetected_Genes', 'tool') match the
    # parameter names of tool(), so the namespace is expanded as keywords.
    tool(**vars(args))
|
|
File without changes
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
import collections
|
|
2
|
+
|
|
3
|
+
try:
|
|
4
|
+
from utils import revCompIterative
|
|
5
|
+
from utils import sortORFs
|
|
6
|
+
except ImportError:
|
|
7
|
+
from ORForise.utils import revCompIterative
|
|
8
|
+
from ORForise.utils import sortORFs
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def StORF_Undetected(tool_pred, genome):
    """Collect StORF ORF records from a prediction file, keyed by ``"start,stop"``.

    Each line is split on whitespace. A record is used when its second field
    contains ``"StORF"`` and its third field contains ``"ORF"``. The fourth
    and fifth fields are the integer start and stop, and the seventh field is
    the strand. The start and stop codons are sliced from ``genome`` for
    ``+`` records and from ``revCompIterative(genome)`` for ``-`` records.

    Lines with fewer than seven fields, and records whose strand contains
    neither ``+`` nor ``-``, are skipped: they previously raised IndexError
    or silently reused the codons of the preceding record.

    Args:
        tool_pred: Path to the whitespace-delimited prediction file.
        genome: Genome sequence; it is measured with ``len`` and sliced.

    Returns:
        The result of ``sortORFs`` applied to an ordered mapping of
        ``"start,stop"`` to ``[strand, startCodon, stopCodon]``.
    """
    storf_orfs = collections.OrderedDict()
    genome_size = len(genome)
    # presumably the reverse complement of the genome -- confirm in utils
    genome_Rev = revCompIterative(genome)
    with open(tool_pred, 'r') as storf_input:
        for line in storf_input:
            fields = line.split()
            if len(fields) < 7:
                continue
            if "StORF" not in fields[1] or "ORF" not in fields[2]:
                continue
            start = int(fields[3])
            stop = int(fields[4])
            strand = fields[6]
            if '-' in strand:  # Reverse complement: starts and stops adjusted
                r_start = genome_size - stop
                r_stop = genome_size - start
                startCodon = genome_Rev[r_start:r_start + 3]
                stopCodon = genome_Rev[r_stop - 2:r_stop + 1]
            elif '+' in strand:
                startCodon = genome[start - 1:start - 1 + 3]
                stopCodon = genome[stop - 3:stop - 1 + 1]
            else:
                # No strand means the codons cannot be located.
                continue
            storf_orfs[str(start) + ',' + str(stop)] = [strand, startCodon, stopCodon]

    return sortORFs(storf_orfs)
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import collections
|
|
3
|
+
|
|
4
|
+
# Command-line interface for this script: an optional gene-set file name and
# a required genome name.
# NOTE(review): parse_args() executes at module import time, not only when
# the file is run as a script, so importing this module consumes sys.argv
# and fails if the required '-g' option is absent.
parser = argparse.ArgumentParser()
parser.add_argument('-mg', '--missing_genes', default='', help='Which set of genes to check?')
parser.add_argument('-g', '--genome_to_compare', required=True,
                    help='Which genome to analyse? Genome files have same prefix'
                         ' - .fa and .gff appended')
args = parser.parse_args()
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def comparator(missing_genes, genome_to_compare):
    """Report how many missed genes are not substantially overlapped by a Prodigal CDS.

    Header lines (starting with ``>``) of ``'../' + missing_genes`` are split
    on ``_``; the second and third fields are the integer gene start and
    stop. Each CDS line of
    ``'../../Prodigal/Prodigal_<genome_to_compare>.gff'`` (second field
    contains ``"Prodigal"``, third contains ``"CDS"``) is compared with every
    missed gene. When more than 50 positions are shared, the CDS start is
    printed, and the gene is removed from the remaining list (and printed)
    if it is still there. The number of genes left is printed last.

    Args:
        missing_genes: File name of the missed-gene file, relative to the
            parent of the current working directory.
        genome_to_compare: Genome name used to build the Prodigal GFF path.

    Returns:
        None. All results are written to standard output.
    """
    missed_genes = collections.OrderedDict()
    non_vitiated_genes = []
    with open('../' + missing_genes, 'r') as m_genes:
        for line in m_genes:
            if not line.startswith('>'):
                continue
            fields = line.split('_')
            start = int(fields[1])
            stop = int(fields[2])
            gene_key = str(start) + '_' + str(stop)
            # Keep only the interval bounds; the overlap is computed
            # arithmetically rather than from per-base position sets.
            missed_genes[gene_key] = (start, stop)
            non_vitiated_genes.append(gene_key)

    with open('../../Prodigal/Prodigal_' + genome_to_compare + '.gff', 'r') as prodigal_input:
        for line in prodigal_input:
            fields = line.split()
            # Blank or truncated lines cannot be CDS records; skip them
            # instead of failing with IndexError.
            if len(fields) < 5:
                continue
            if "Prodigal" not in fields[1] or "CDS" not in fields[2]:
                continue
            start = int(fields[3])
            stop = int(fields[4])
            for missed, (g_start, g_stop) in missed_genes.items():
                # Size of the intersection of two inclusive intervals;
                # non-positive when they are disjoint.
                overlap = min(stop, g_stop) - max(start, g_start) + 1
                if overlap > 50:
                    print(start)
                    try:
                        non_vitiated_genes.remove(missed)
                    except ValueError:
                        # Already removed by an earlier prediction.
                        continue
                    print(missed)
    print(len(non_vitiated_genes))
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
if __name__ == "__main__":
    # The parsed option names ('missing_genes', 'genome_to_compare') match
    # the parameter names of comparator(), so the namespace is expanded as
    # keyword arguments.
    comparator(**vars(args))
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
import collections
|
|
2
|
+
import logging
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def _make_feature(seqid, source, type_, start, end, score, strand, phase, attributes):
|
|
7
|
+
attrs = []
|
|
8
|
+
for k, v in attributes.items():
|
|
9
|
+
attrs.append(f"{k}={v}")
|
|
10
|
+
return f"{seqid}\t{source}\t{type_}\t{start}\t{end}\t{score}\t{strand}\t{phase}\t{';'.join(attrs)}\n"
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def parse_blast_tab6(path, genome_seq, gene_ident=None):
    """Parse a 12-column tab-separated BLAST file into an ordered hit mapping.

    Blank lines and lines starting with ``#`` are ignored. Lines with fewer
    than 12 tab-separated columns, or with non-integer values in columns 9
    and 10 (``sstart``/``send``), are skipped with a warning.

    Args:
        path: Path to the BLAST tabular file.
        genome_seq: Accepted for a uniform parser signature; not used here.
        gene_ident: Accepted for a uniform parser signature; not used here.

    Returns:
        ``collections.OrderedDict`` mapping ``"start,end"`` (the smaller and
        larger of ``sstart``/``send``) to
        ``[strand, '.', 'similarity', attrs]``. ``strand`` is ``'+'`` when
        ``sstart <= send`` and ``'-'`` otherwise. ``attrs`` holds the keys
        ``ID``, ``Target``, ``pident``, ``length``, ``evalue`` and
        ``bitscore``. A later hit with the same coordinates replaces an
        earlier one.
    """
    results = collections.OrderedDict()
    count = 0
    with open(path, 'r') as fh:
        for i, line in enumerate(fh, 1):
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            parts = line.split('\t')
            if len(parts) < 12:
                logging.warning("Line %d: unexpected BLAST line with %d columns", i, len(parts))
                continue
            # Columns: qseqid sseqid pident length mismatch gapopen
            #          qstart qend sstart send evalue bitscore
            qseqid, pident, length = parts[0], parts[2], parts[3]
            qstart, qend = parts[6], parts[7]
            evalue, bitscore = parts[10], parts[11]
            try:
                sstart = int(parts[8])
                send = int(parts[9])
            except ValueError:
                logging.warning("Line %d: non-integer coordinates in BLAST sstart/send", i)
                continue
            start = min(sstart, send)
            end = max(sstart, send)
            # Subject coordinates given in descending order mark a
            # reverse-strand hit.
            strand = '+' if sstart <= send else '-'
            attrs = {
                'ID': f'blast_hit{count}',
                'Target': f'{qseqid} {qstart} {qend}',
                'pident': pident,
                'length': length,
                'evalue': evalue,
                'bitscore': bitscore,
            }
            results[f"{start},{end}"] = [strand, '.', 'similarity', attrs]
            count += 1
    return results
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def parse_abricate(path, genome_seq, gene_ident=None):
    """Parse a tab-separated Abricate report into an ordered hit mapping.

    The most recent line starting with ``#`` is used as the column header
    (its fields are the row keys, so the first key keeps its leading ``#``).
    Data lines seen before any header are ignored. Data lines whose column
    count differs from the header, or whose ``START``/``END`` are not
    integers, are skipped with a warning.

    Args:
        path: Path to the Abricate report.
        genome_seq: Accepted for a uniform parser signature; not used here.
        gene_ident: Accepted for a uniform parser signature; not used here.

    Returns:
        ``collections.OrderedDict`` mapping ``"start,end"`` to a dict with
        the keys ``seqid``, ``start``, ``end``, ``strand``, ``gene``,
        ``accession``, ``database``, ``identity``, ``coverage``, ``product``
        and ``resistance``. Missing or empty ``ACCESSION``, ``DATABASE``,
        ``PRODUCT`` and ``RESISTANCE`` values become ``'unknown'``. A later
        row with the same coordinates replaces an earlier one.
    """
    results = collections.OrderedDict()
    header = None
    # Plain `with` form: the parenthesised variant is a syntax error on
    # interpreters older than Python 3.9.
    with open(path, 'r') as fh:
        for i, line in enumerate(fh, 1):
            line = line.rstrip('\n')
            if not line:
                continue
            if line.startswith('#'):
                header = line.split('\t')
                continue
            if header is None:
                # skip any pre-header content until a header line is encountered
                continue
            parts = line.split('\t')
            if len(parts) != len(header):
                logging.warning("Line %d: unexpected number of columns in Abricate line", i)
                continue
            row = dict(zip(header, parts))
            try:
                start = int(row.get('START', '0'))
                end = int(row.get('END', '0'))
            except ValueError:
                logging.warning("Line %d: invalid START/END in Abricate line", i)
                continue
            results[f"{start},{end}"] = {
                'seqid': row.get('SEQUENCE'),
                'start': start,
                'end': end,
                'strand': row.get('STRAND'),
                'gene': row.get('GENE'),
                'accession': row.get('ACCESSION') or 'unknown',
                'database': row.get('DATABASE') or 'unknown',
                'identity': row.get('%IDENTITY'),
                'coverage': row.get('%COVERAGE'),
                # Same placeholder as the other optional columns (this was
                # previously misspelled 'unkown').
                'product': row.get('PRODUCT') or 'unknown',
                'resistance': row.get('RESISTANCE') or 'unknown',
            }
    return results
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def parse_genemark(path, genome_seq, gene_ident=None):
    """Parse whitespace-delimited GeneMark rows into an ordered prediction mapping.

    A row is used when it has at least three tokens and its first two tokens
    are integers (start and stop); every other line is silently skipped. The
    third token selects the strand: ``'-'`` when it contains
    ``"complement"``, ``'+'`` otherwise.

    Args:
        path: Path to the GeneMark output file.
        genome_seq: Accepted for a uniform parser signature; not used here.
        gene_ident: Accepted for a uniform parser signature; not used here.

    Returns:
        ``collections.OrderedDict`` mapping ``"start,stop"`` to
        ``[strand, '.', 'CDS', {'ID': 'genemark_hit<n>', 'tool': 'GeneMark'}]``,
        where ``<n>`` counts accepted rows from zero.
    """
    predictions = collections.OrderedDict()
    hit_index = 0
    with open(path, 'r') as handle:
        for raw in handle:
            tokens = raw.split()
            # Covers blank lines as well: splitting them yields no tokens.
            if len(tokens) < 3:
                continue
            try:
                left, right = int(tokens[0]), int(tokens[1])
            except ValueError:
                continue
            strand = '-' if 'complement' in tokens[2] else '+'
            annotation = {'ID': f'genemark_hit{hit_index}', 'tool': 'GeneMark'}
            predictions[f"{left},{right}"] = [strand, '.', 'CDS', annotation]
            hit_index += 1
    return predictions
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def TabToGFF(input_file, genome_seq, gene_ident='CDS', fmt='blast'):
    """Dispatch ``input_file`` to the parser selected by ``fmt``.

    ``fmt`` is matched case-insensitively against the accepted aliases:
    ``blast``/``blast_tab6``/``tab6``,
    ``abricate``/``abricate_tsv``/``abricate_format`` and
    ``genemark``/``gene_mark``.

    Args:
        input_file: Path handed to the selected parser.
        genome_seq: Forwarded unchanged to the selected parser.
        gene_ident: Forwarded unchanged to the selected parser.
        fmt: Name of the input format.

    Returns:
        Whatever the selected parser returns.

    Raises:
        ValueError: If ``fmt`` matches none of the accepted aliases.
    """
    # Should be cleaned up to use consistent format names
    fmt = fmt.lower()
    if fmt in ('blast', 'blast_tab6', 'tab6'):
        selected = parse_blast_tab6
    elif fmt in ('abricate', 'abricate_tsv', 'abricate_format'):
        selected = parse_abricate
    elif fmt in ('genemark', 'gene_mark'):
        selected = parse_genemark
    else:
        raise ValueError(f"Unknown format: {fmt}")
    return selected(input_file, genome_seq, gene_ident)
|
|
File without changes
|
ORForise/Aux/__init__.py
ADDED
|
File without changes
|