ORForise 1.5.0__py3-none-any.whl → 1.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ORForise/Annotation_Compare.py +105 -88
- ORForise/Comparator.py +60 -28
- ORForise/Convert_To_GFF.py +138 -0
- ORForise/Tools/TabToGFF/TabToGFF.py +140 -0
- ORForise/Tools/TabToGFF/__init__.py +0 -0
- ORForise/utils.py +1 -1
- orforise-1.6.0.dist-info/METADATA +1051 -0
- {orforise-1.5.0.dist-info → orforise-1.6.0.dist-info}/RECORD +12 -9
- {orforise-1.5.0.dist-info → orforise-1.6.0.dist-info}/entry_points.txt +2 -0
- orforise-1.5.0.dist-info/METADATA +0 -451
- {orforise-1.5.0.dist-info → orforise-1.6.0.dist-info}/WHEEL +0 -0
- {orforise-1.5.0.dist-info → orforise-1.6.0.dist-info}/licenses/LICENSE +0 -0
- {orforise-1.5.0.dist-info → orforise-1.6.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
import collections
|
|
2
|
+
import logging
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def _make_feature(seqid, source, type_, start, end, score, strand, phase, attributes):
|
|
7
|
+
attrs = []
|
|
8
|
+
for k, v in attributes.items():
|
|
9
|
+
attrs.append(f"{k}={v}")
|
|
10
|
+
return f"{seqid}\t{source}\t{type_}\t{start}\t{end}\t{score}\t{strand}\t{phase}\t{';'.join(attrs)}\n"
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def parse_blast_tab6(path, genome_seq, gene_ident=None):
    """Parse BLAST tabular (12-column) hits into a coordinate-keyed mapping.

    Blank lines and lines starting with ``#`` are skipped.  Lines with fewer
    than 12 tab-separated columns, or whose subject start/end are not
    integers, are skipped with a logged warning.

    Args:
        path: Path of the BLAST tabular file to read.
        genome_seq: Not used by this parser.
        gene_ident: Not used by this parser.

    Returns:
        ``collections.OrderedDict`` mapping ``"start,end"`` (subject
        coordinates, smaller value first) to
        ``[strand, '.', 'similarity', attrs]``.  ``strand`` is ``'-'`` when
        the subject start is greater than the subject end, otherwise ``'+'``.
        ``attrs`` holds ``ID``, ``Target``, ``pident``, ``length``,
        ``evalue`` and ``bitscore``; everything except ``ID`` and ``Target``
        is the raw column text.
    """
    results = collections.OrderedDict()
    count = 0
    with open(path, 'r') as fh:
        for lineno, raw in enumerate(fh, 1):
            line = raw.strip()
            if not line or line.startswith('#'):
                continue
            fields = line.split('\t')
            if len(fields) < 12:
                logging.warning(
                    "Line %d: unexpected BLAST line with %d columns",
                    lineno, len(fields),
                )
                continue
            # Extra trailing columns are ignored; underscore-prefixed names
            # are columns this parser does not use.
            (qseqid, _sseqid, pident, length, _mismatch, _gapopen,
             qstart, qend, sstart, send, evalue, bitscore) = fields[:12]
            try:
                sstart = int(sstart)
                send = int(send)
            except ValueError:
                logging.warning(
                    "Line %d: non-integer coordinates in BLAST sstart/send",
                    lineno,
                )
                continue
            start = min(sstart, send)
            end = max(sstart, send)
            strand = '+' if sstart <= send else '-'
            attrs = {
                'ID': f'blast_hit{count}',
                'Target': f'{qseqid} {qstart} {qend}',
                'pident': pident,
                'length': length,
                'evalue': evalue,
                'bitscore': bitscore,
            }
            key = f"{start},{end}"
            if key in results:
                # The mapping is keyed by coordinates only, so the newer hit
                # wins; report it instead of dropping the earlier one silently.
                logging.warning(
                    "Line %d: hit at %s replaces an earlier hit with the same coordinates",
                    lineno, key,
                )
            results[key] = [strand, '.', 'similarity', attrs]
            count += 1
    return results
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def parse_abricate(path, genome_seq, gene_ident=None):
    """Parse an Abricate tab-separated report into a coordinate-keyed mapping.

    A line starting with ``#`` is taken as the header and (re)defines the
    column names for the rows that follow it.  Empty lines and any rows seen
    before the first header are skipped.  Rows whose column count differs
    from the header, or whose START/END are not integers, are skipped with a
    logged warning.

    Args:
        path: Path of the Abricate report to read.
        genome_seq: Not used by this parser.
        gene_ident: Not used by this parser.

    Returns:
        ``collections.OrderedDict`` mapping ``"start,end"`` to a dict with the
        keys ``seqid``, ``start``, ``end``, ``strand``, ``gene``,
        ``accession``, ``database``, ``identity``, ``coverage``, ``product``
        and ``resistance``.  Missing or empty ACCESSION, DATABASE, PRODUCT and
        RESISTANCE fields become ``'unknown'``.
    """
    # NOTE(review): the BLAST and GeneMark parsers in this module store
    # [strand, '.', type, attrs] per key, whereas this one stores the attrs
    # dict directly.  Left unchanged here; confirm what the caller expects.
    results = collections.OrderedDict()
    header = None
    with open(path, 'r') as fh:
        for lineno, raw in enumerate(fh, 1):
            line = raw.rstrip('\n')
            if not line:
                continue
            if line.startswith('#'):
                header = line.split('\t')
                continue
            if header is None:
                # skip any pre-header content until a header line is encountered
                continue
            fields = line.split('\t')
            if len(fields) != len(header):
                logging.warning(
                    "Line %d: unexpected number of columns in Abricate line",
                    lineno,
                )
                continue
            row = dict(zip(header, fields))
            try:
                start = int(row.get('START', '0'))
                end = int(row.get('END', '0'))
            except ValueError:
                logging.warning(
                    "Line %d: invalid START/END in Abricate line", lineno
                )
                continue
            results[f"{start},{end}"] = {
                'seqid': row.get('SEQUENCE'),
                'start': start,
                'end': end,
                'strand': row.get('STRAND'),
                'gene': row.get('GENE'),
                'accession': row.get('ACCESSION') or 'unknown',
                'database': row.get('DATABASE') or 'unknown',
                'identity': row.get('%IDENTITY'),
                'coverage': row.get('%COVERAGE'),
                'product': row.get('PRODUCT') or 'unknown',
                'resistance': row.get('RESISTANCE') or 'unknown',
            }
    return results
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def parse_genemark(path, genome_seq, gene_ident=None):
    """Read whitespace-delimited GeneMark predictions from ``path``.

    A line is used only when it has at least three whitespace-separated
    tokens and the first two parse as integers; every other line (blank,
    too short, non-numeric) is skipped without any message.

    Args:
        path: Path of the GeneMark output file to read.
        genome_seq: Not used by this parser.
        gene_ident: Not used by this parser.

    Returns:
        ``collections.OrderedDict`` mapping ``"start,stop"`` to
        ``[strand, '.', 'CDS', attrs]``.  ``strand`` is ``'-'`` when the third
        token contains ``complement`` and ``'+'`` otherwise; ``attrs`` holds a
        sequential ``ID`` and ``tool`` set to ``'GeneMark'``.
    """
    predictions = collections.OrderedDict()
    accepted = 0
    with open(path, 'r') as handle:
        for raw in handle:
            # split() with no argument already discards surrounding
            # whitespace, so blank lines simply yield no tokens.
            tokens = raw.split()
            if len(tokens) < 3:
                continue
            try:
                left = int(tokens[0])
                right = int(tokens[1])
            except ValueError:
                continue
            orientation = '-' if 'complement' in tokens[2] else '+'
            tags = {'ID': f'genemark_hit{accepted}', 'tool': 'GeneMark'}
            predictions[f"{left},{right}"] = [orientation, '.', 'CDS', tags]
            accepted += 1
    return predictions
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def TabToGFF(input_file, genome_seq, gene_ident='CDS', fmt='blast'):
    """Dispatch ``input_file`` to the parser matching the format name ``fmt``.

    The format name is compared case-insensitively.  Accepted names:
    ``blast`` / ``blast_tab6`` / ``tab6``; ``abricate`` / ``abricate_tsv`` /
    ``abricate_format``; ``genemark`` / ``gene_mark``.

    Args:
        input_file: Path handed to the selected parser.
        genome_seq: Passed through to the selected parser.
        gene_ident: Passed through to the selected parser.
        fmt: Name of the input format.

    Returns:
        Whatever the selected parser returns.

    Raises:
        ValueError: If ``fmt`` is not one of the accepted names.
    """
    # Should be cleaned up to use consistent format names
    fmt = fmt.lower()
    if fmt in ('blast', 'blast_tab6', 'tab6'):
        parser = parse_blast_tab6
    elif fmt in ('abricate', 'abricate_tsv', 'abricate_format'):
        parser = parse_abricate
    elif fmt in ('genemark', 'gene_mark'):
        parser = parse_genemark
    else:
        raise ValueError(f"Unknown format: {fmt}")
    return parser(input_file, genome_seq, gene_ident)
|
|
File without changes
|