ORForise 1.5.1__py3-none-any.whl → 1.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ORForise/Annotation_Compare.py +14 -48
- ORForise/Convert_To_GFF.py +138 -0
- ORForise/Tools/TabToGFF/TabToGFF.py +140 -0
- ORForise/Tools/TabToGFF/__init__.py +0 -0
- ORForise/utils.py +1 -1
- orforise-1.6.0.dist-info/METADATA +1051 -0
- {orforise-1.5.1.dist-info → orforise-1.6.0.dist-info}/RECORD +11 -8
- {orforise-1.5.1.dist-info → orforise-1.6.0.dist-info}/entry_points.txt +2 -0
- orforise-1.5.1.dist-info/METADATA +0 -427
- {orforise-1.5.1.dist-info → orforise-1.6.0.dist-info}/WHEEL +0 -0
- {orforise-1.5.1.dist-info → orforise-1.6.0.dist-info}/licenses/LICENSE +0 -0
- {orforise-1.5.1.dist-info → orforise-1.6.0.dist-info}/top_level.txt +0 -0
ORForise/Annotation_Compare.py
CHANGED
|
@@ -7,16 +7,15 @@ from datetime import datetime
|
|
|
7
7
|
|
|
8
8
|
|
|
9
9
|
try:
|
|
10
|
+
from utils import *
|
|
10
11
|
from Comparator import tool_comparison
|
|
11
12
|
except ImportError:
|
|
12
13
|
from .Comparator import tool_comparison
|
|
13
|
-
|
|
14
|
-
try:
|
|
15
|
-
from utils import *
|
|
16
|
-
except ImportError:
|
|
17
14
|
from ORForise.utils import *
|
|
18
15
|
|
|
19
16
|
|
|
17
|
+
|
|
18
|
+
|
|
20
19
|
##########################
|
|
21
20
|
|
|
22
21
|
# Consolidate printing and logging into a single block
|
|
@@ -131,16 +130,6 @@ def comparator(options):
|
|
|
131
130
|
print(full_msg)
|
|
132
131
|
options.output_logger.info(full_msg)
|
|
133
132
|
|
|
134
|
-
# print("These are the results for: " + dna_region + '\n')
|
|
135
|
-
# print('Current Contig: ' + str(dna_region))
|
|
136
|
-
# print('Number of Genes: ' + str(num_current_genes))
|
|
137
|
-
# print('Number of ORFs: ' + str(result['pred_metrics']['Number_of_ORFs']))
|
|
138
|
-
# print('Perfect Matches: ' + str(result['pred_metrics']['Number_of_Perfect_Matches']) + ' [' + str(num_current_genes)+ '] - '+ format(100 * result['pred_metrics']['Number_of_Perfect_Matches']/num_current_genes,'.2f')+'%')
|
|
139
|
-
# print('Partial Matches: ' + str(len(result['pred_metrics']['partial_Hits'])) + ' [' + str(num_current_genes)+ '] - '+ format(100 * len(result['pred_metrics']['partial_Hits'])/num_current_genes,'.2f')+'%')
|
|
140
|
-
# print('Missed Genes: ' + str(len(result['rep_metrics']['genes_Undetected'])) + ' [' + str(num_current_genes)+ '] - '+ format(100 * len(result['rep_metrics']['genes_Undetected'])/num_current_genes,'.2f')+'%')
|
|
141
|
-
# print('Unmatched ORFs: ' + str(len(result['pred_metrics']['unmatched_ORFs'])) + ' [' + str(num_current_genes)+ '] - '+ format(100 * len(result['pred_metrics']['unmatched_ORFs'])/num_current_genes,'.2f')+'%')
|
|
142
|
-
# print('Multi-matched ORFs: ' + str(len(result['pred_metrics']['multi_Matched_ORFs'])) + ' [' + str(num_current_genes)+ '] - '+ format(100 * len(result['pred_metrics']['multi_Matched_ORFs'])/num_current_genes,'.2f')+'%')
|
|
143
|
-
|
|
144
133
|
# Prepare output directory and file names for each contig
|
|
145
134
|
contig_save = dna_region.replace('/', '_').replace('\\', '_')
|
|
146
135
|
contig_dir = os.path.join(options.outdir, contig_save)
|
|
@@ -190,24 +179,6 @@ def comparator(options):
|
|
|
190
179
|
tool_out.writerow([''.join(map(str, result['pred_metrics']['orf_Coverage_Genome']))])
|
|
191
180
|
tool_out.writerow(['Matched_Predicted_CDS_Coverage_of_Genome'])
|
|
192
181
|
tool_out.writerow([''.join(map(str, result['pred_metrics']['matched_ORF_Coverage_Genome']))])
|
|
193
|
-
# tool_out.writerow(['Start_Position_Difference:'])
|
|
194
|
-
# tool_out.writerow(result.get('start_Difference', []))
|
|
195
|
-
# tool_out.writerow(['Stop_Position_Difference:'])
|
|
196
|
-
# tool_out.writerow(result.get('stop_Difference', []))
|
|
197
|
-
# tool_out.writerow(['Alternative_Starts_Predicted:'])
|
|
198
|
-
# tool_out.writerow(result.get('other_Starts', []))
|
|
199
|
-
# tool_out.writerow(['Alternative_Stops_Predicted:'])
|
|
200
|
-
# tool_out.writerow(result.get('other_Stops', []))
|
|
201
|
-
# tool_out.writerow(['Undetected_Gene_Metrics:'])
|
|
202
|
-
# tool_out.writerow([
|
|
203
|
-
# 'ATG_Start,GTG_Start,TTG_Start,ATT_Start,CTG_Start,Alternative_Start_Codon,TGA_Stop,TAA_Stop,TAG_Stop,Alternative_Stop_Codon,Median_Length,ORFs_on_Positive_Strand,ORFs_on_Negative_Strand'
|
|
204
|
-
# ])
|
|
205
|
-
# tool_out.writerow(result.get('undetected_Gene_Metrics', []))
|
|
206
|
-
# tool_out.writerow(['\nPredicted_CDSs_Without_Corresponding_Gene_In_Reference_Metrics:'])
|
|
207
|
-
# tool_out.writerow([
|
|
208
|
-
# 'ATG_Start,GTG_Start,TTG_Start,ATT_Start,CTG_Start,Alternative_Start_Codon,TGA_Stop,TAA_Stop,TAG_Stop,Alternative_Stop_Codon,Median_Length,ORFs_on_Positive_Strand,ORFs_on_Negative_Strand'
|
|
209
|
-
# ])
|
|
210
|
-
# tool_out.writerow(result.get('unmatched_ORF_Metrics', []))
|
|
211
182
|
|
|
212
183
|
# Write perfect matches to FASTA
|
|
213
184
|
with open(perfect_fasta, 'w', encoding='utf-8') as f:
|
|
@@ -266,26 +237,21 @@ def comparator(options):
|
|
|
266
237
|
out_file.write('\nOverall Summary:\n')
|
|
267
238
|
out_file.write(f'Number of Genes: {total_genes}\n')
|
|
268
239
|
out_file.write(f'Number of ORFs: {total_orfs}\n')
|
|
269
|
-
out_file.write(
|
|
270
|
-
|
|
271
|
-
out_file.write(
|
|
272
|
-
|
|
273
|
-
out_file.write(
|
|
274
|
-
f'Missed Genes: {total_missed} [{total_genes}] - {format(100 * total_missed / total_genes, ".2f")}%\n')
|
|
275
|
-
out_file.write(
|
|
276
|
-
f'Unmatched ORFs: {total_unmatched} [{total_genes}] - {format(100 * total_unmatched / total_genes, ".2f")}%\n')
|
|
277
|
-
out_file.write(
|
|
278
|
-
f'Multi-matched ORFs: {total_multi} [{total_genes}] - {format(100 * total_multi / total_genes, ".2f")}%\n')
|
|
240
|
+
out_file.write(f'Perfect Matches: {total_perfect} [{total_genes}] - {100 * total_perfect / total_genes:.2f}%\n')
|
|
241
|
+
out_file.write(f'Partial Matches: {total_partial} [{total_genes}] - {100 * total_partial / total_genes:.2f}%\n')
|
|
242
|
+
out_file.write(f'Missed Genes: {total_missed} [{total_genes}] - {100 * total_missed / total_genes:.2f}%\n')
|
|
243
|
+
out_file.write(f'Unmatched ORFs: {total_unmatched} [{total_genes}] - {100 * total_unmatched / total_genes:.2f}%\n')
|
|
244
|
+
out_file.write(f'Multi-matched ORFs: {total_multi} [{total_genes}] - {100 * total_multi / total_genes:.2f}%\n')
|
|
279
245
|
|
|
280
246
|
lines = [
|
|
281
247
|
f"Combined metrics for all contigs:",
|
|
282
248
|
f"Number of Genes: {total_genes}",
|
|
283
249
|
f"Number of ORFs: {total_orfs}",
|
|
284
|
-
f"Perfect Matches: {total_perfect} [{total_genes}] - {
|
|
285
|
-
f"Partial Matches: {total_partial} [{total_genes}] - {
|
|
286
|
-
f"Missed Genes: {total_missed} [{total_genes}] - {
|
|
287
|
-
f"Unmatched ORFs: {total_unmatched} [{total_genes}] - {
|
|
288
|
-
f"Multi-matched ORFs: {total_multi} [{total_genes}] - {
|
|
250
|
+
f"Perfect Matches: {total_perfect} [{total_genes}] - {100 * total_perfect / total_genes:.2f}%",
|
|
251
|
+
f"Partial Matches: {total_partial} [{total_genes}] - {100 * total_partial / total_genes:.2f}%",
|
|
252
|
+
f"Missed Genes: {total_missed} [{total_genes}] - {100 * total_missed / total_genes:.2f}%",
|
|
253
|
+
f"Unmatched ORFs: {total_unmatched} [{total_genes}] - {100 * total_unmatched / total_genes:.2f}%",
|
|
254
|
+
f"Multi-matched ORFs: {total_multi} [{total_genes}] - {100 * total_multi / total_genes:.2f}%"
|
|
289
255
|
]
|
|
290
256
|
|
|
291
257
|
full_msg = '\n'.join(lines) + '\n'
|
|
@@ -350,4 +316,4 @@ def main():
|
|
|
350
316
|
|
|
351
317
|
if __name__ == "__main__":
|
|
352
318
|
main()
|
|
353
|
-
print("Complete")
|
|
319
|
+
print("Complete")
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import logging
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
import os
|
|
5
|
+
import sys
|
|
6
|
+
|
|
7
|
+
try:
|
|
8
|
+
from utils import *
|
|
9
|
+
from Tools.TabToGFF.TabToGFF import TabToGFF
|
|
10
|
+
except ImportError:
|
|
11
|
+
from ORForise.utils import *
|
|
12
|
+
from ORForise.Tools.TabToGFF.TabToGFF import TabToGFF
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def setup_logging(outdir, verbose=False):
    """Configure the root logger for a Convert_To_GFF run.

    A stdout handler is always attached. A timestamped logfile inside
    *outdir* is created only when *verbose* is True.

    Returns the logfile path, or None when no file handler was created.
    """
    root = logging.getLogger()
    root.setLevel(logging.DEBUG if verbose else logging.INFO)
    # Drop handlers left over from a previous call so repeated runs in
    # the same interpreter do not duplicate every log line.
    root.handlers = []
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')

    log_path = None
    if verbose:
        # The logfile (and its file handler) only exist in verbose mode.
        stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        log_path = os.path.join(outdir, f'convert_to_gff_{stamp}.log')
        file_handler = logging.FileHandler(log_path)
        file_handler.setLevel(logging.DEBUG)
        file_handler.setFormatter(formatter)
        root.addHandler(file_handler)

    # Console output is always emitted, at DEBUG when verbose.
    stream_handler = logging.StreamHandler(sys.stdout)
    stream_handler.setLevel(logging.DEBUG if verbose else logging.INFO)
    stream_handler.setFormatter(formatter)
    root.addHandler(stream_handler)
    return log_path
|
|
38
|
+
def write_gff(outpath, genome_ID, genome_DNA, input_annotation, fmt, features):
    """Write *features* to *outpath* as a GFF3-style file.

    *features* maps "start,stop" position strings to per-feature dicts;
    *genome_DNA* may be None, in which case the genome-file header line
    is omitted. *genome_ID* is currently unused but kept for interface
    stability.

    NOTE(review): only fmt == 'abricate' rows are emitted; other formats
    produce a header-only file. The 'abricate_anotation' spelling is
    preserved byte-for-byte for output compatibility.
    """
    header_lines = [
        '##gff-version\t3\n',
        '#\tConvert_To_GFF\n',
        '#\tRun Date: ' + str(datetime.now()) + '\n',
    ]
    # Only include genome DNA line if a path was provided
    if genome_DNA:
        header_lines.append('##Genome DNA File:' + genome_DNA + '\n')
    header_lines.append('##Original File: ' + input_annotation + '\n')

    with open(outpath, 'w') as handle:
        handle.writelines(header_lines)
        for coords, meta in features.items():
            bounds = coords.split(',')
            first = bounds[0]
            last = bounds[-1]
            orientation = meta['strand']
            if fmt == 'abricate':  # Currently only supports abricate format
                info = ';'.join([
                    'abricate_anotation',
                    'accession=' + meta['accession'],
                    'database=' + meta['database'],
                    'identity=' + str(meta['identity']),
                    'coverage=' + str(meta['coverage']),
                    'product=' + meta['product'],
                    'resistance=' + meta['resistance'],
                ])
                handle.write(
                    f"{meta['seqid']}\t{fmt}\tCDS\t{first}\t{last}\t.\t{orientation}\t.\tID={info}\n")
57
|
+
|
|
58
|
+
def load_genome(genome_fasta):
    """Read a FASTA file and return (record_id, concatenated_sequence).

    NOTE(review): for multi-record FASTA input all sequence lines are
    concatenated and only the LAST header's ID is kept — confirm this is
    intended for multi-contig genomes.
    """
    record_id = 'unknown'
    chunks = []
    with open(genome_fasta, 'r') as handle:
        for raw in handle:
            stripped = raw.rstrip('\n')
            if not stripped:
                continue
            if stripped.startswith('>'):
                # ID is the first whitespace-delimited token, minus '>'.
                record_id = stripped.split()[0].lstrip('>')
            else:
                chunks.append(stripped)
    return record_id, ''.join(chunks)
+
|
|
73
|
+
def main():
    """Command-line entry point: convert a tabular annotation to GFF3.

    Parses CLI options, configures logging, optionally loads a genome
    FASTA, dispatches to the format-specific parser via TabToGFF, and
    writes the resulting features with write_gff.
    """
    print("Thank you for using ORForise\nPlease report any issues to: https://github.com/NickJD/ORForise/issues\n#####")

    parser = argparse.ArgumentParser(description='ORForise ' + ORForise_Version + ': Convert-To-GFF Run Parameters')
    # HACK: drops argparse's default positional/optional groups so only
    # the custom groups below appear in --help output.
    parser._action_groups.pop()

    required = parser.add_argument_group('Required Arguments')
    # Make genome DNA optional: if not provided we operate without genome sequence
    required.add_argument('-dna', dest='genome_DNA', required=False, help='Genome DNA file (.fa)')
    required.add_argument('-i', dest='input_annotation', required=True, help='Input annotation file (tabular)')
    required.add_argument('-fmt', dest='format', required=True, help='Input format: blast, abricate, genemark')
    required.add_argument('-o', dest='output_dir', required=True, help='Output directory')

    optional = parser.add_argument_group('Optional Arguments')
    # NOTE(review): -gi is accepted and forwarded to TabToGFF but the
    # current parsers ignore it (help text says "unused").
    optional.add_argument('-gi', dest='gene_ident', default='CDS', required=False, help='Gene identifier types to extract (unused)')
    optional.add_argument('--verbose', dest='verbose', action='store_true', help='Verbose logging with logfile')

    options = parser.parse_args()

    # Ensure the output directory exists before logging writes into it.
    if not os.path.exists(options.output_dir):
        os.makedirs(options.output_dir)
    logfile = setup_logging(options.output_dir, verbose=options.verbose)
    logging.info('Starting Convert_To_GFF')
    # Log genome DNA only if provided
    if options.genome_DNA:
        logging.info('Genome DNA: %s', options.genome_DNA)
    else:
        logging.info('Genome DNA: (not provided)')
    logging.info('Input annotation: %s', options.input_annotation)
    logging.info('Format: %s', options.format)

    # If a genome fasta was provided, load it; otherwise proceed without genome sequence
    if options.genome_DNA:
        if not os.path.exists(options.genome_DNA):
            logging.error('Genome DNA file does not exist: %s', options.genome_DNA)
            sys.exit(1)
        genome_ID, genome_seq = load_genome(options.genome_DNA)
    else:
        # Derive a sensible genome_ID from the annotation filename and leave sequence empty
        genome_ID = os.path.splitext(os.path.basename(options.input_annotation))[0]
        genome_seq = ''

    try:
        # Build genome map expected by TabToGFF: mapping genome_ID -> tuple(sequence, ...)
        # NOTE(review): TabToGFF's parsers do not read this argument
        # today, so the dict-vs-string shape mismatch is harmless —
        # confirm if a parser ever starts using genome_seq.
        genome_map = {genome_ID: (genome_seq,)}
        features = TabToGFF(options.input_annotation, genome_map, options.gene_ident, fmt=options.format)
    except Exception as e:
        # Any parse failure (bad format name, unreadable file) is fatal;
        # the traceback is captured in the log before exiting non-zero.
        logging.exception('Error parsing input annotation')
        sys.exit(1)

    #features = sortORFs(features) - Not sorting for now to preserve original order
    # Output file keeps the input's basename with its extension swapped
    # for '.gff' (or appended when there is no extension).
    basename = os.path.basename(options.input_annotation)
    dot = basename.rfind('.')
    if dot != -1:
        outname = basename[:dot] + '.gff'
    else:
        outname = basename + '.gff'
    outgff = os.path.join(options.output_dir, outname)
    # Pass the original genome path if provided, else pass None so headers adapt
    genome_DNA_path = options.genome_DNA if options.genome_DNA else None
    write_gff(outgff, genome_ID, genome_DNA_path, options.input_annotation, options.format, features)
    logging.info('Wrote GFF to %s', outgff)
    # logfile is None unless --verbose created a file handler.
    logging.info('Logfile: %s', logfile)

if __name__ == '__main__':
    main()
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
import collections
|
|
2
|
+
import logging
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def _make_feature(seqid, source, type_, start, end, score, strand, phase, attributes):
|
|
7
|
+
attrs = []
|
|
8
|
+
for k, v in attributes.items():
|
|
9
|
+
attrs.append(f"{k}={v}")
|
|
10
|
+
return f"{seqid}\t{source}\t{type_}\t{start}\t{end}\t{score}\t{strand}\t{phase}\t{';'.join(attrs)}\n"
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def parse_blast_tab6(path, genome_seq, gene_ident=None):
    """Parse BLAST tabular (outfmt 6) output into an ordered feature map.

    Returns an OrderedDict mapping "start,end" (subject coordinates,
    low-to-high) to [strand, score, type, attrs]. *genome_seq* and
    *gene_ident* are unused; they keep the parser signatures uniform.
    """
    features = collections.OrderedDict()
    hit_index = 0
    with open(path, 'r') as handle:
        for lineno, raw in enumerate(handle, 1):
            text = raw.strip()
            # Skip blanks and comment lines.
            if not text or text.startswith('#'):
                continue
            fields = text.split('\t')
            if len(fields) < 12:
                logging.warning(f"Line {lineno}: unexpected BLAST line with {len(fields)} columns")
                continue
            (qseqid, sseqid, pident, length, mismatch, gapopen,
             qstart, qend, sstart, send, evalue, bitscore) = fields[:12]
            try:
                subj_from = int(sstart)
                subj_to = int(send)
            except ValueError:
                logging.warning(f"Line {lineno}: non-integer coordinates in BLAST sstart/send")
                continue
            # BLAST encodes strand by coordinate order on the subject.
            low, high = sorted((subj_from, subj_to))
            orientation = '+' if subj_from <= subj_to else '-'
            attrs = {
                'ID': f'blast_hit{hit_index}',
                'Target': f'{qseqid} {qstart} {qend}',
                'pident': pident,
                'length': length,
                'evalue': evalue,
                'bitscore': bitscore
            }
            features[f"{low},{high}"] = [orientation, '.', 'similarity', attrs]
            hit_index += 1
    return features
|
47
|
+
|
|
48
|
+
def parse_abricate(path, genome_seq, gene_ident=None):
    """Parse an Abricate TSV report into an ordered feature mapping.

    The report's column names come from its '#'-prefixed header row; any
    content before that header is skipped. Rows whose column count does
    not match the header, or whose START/END are non-integer, are logged
    and skipped.

    Returns an OrderedDict mapping "start,end" to a dict of the row's
    fields (seqid, start, end, strand, gene, accession, database,
    identity, coverage, product, resistance). *genome_seq* and
    *gene_ident* are unused; they keep the parser signatures uniform.
    """
    results = collections.OrderedDict()
    # Plain `with` statement: the original parenthesized
    # `with (open(...) as fh):` form requires Python 3.10+ for no benefit.
    with open(path, 'r') as fh:
        header = None
        for i, line in enumerate(fh, 1):
            line = line.rstrip('\n')
            if not line:
                continue
            if line.startswith('#'):
                # Abricate writes its column names as a '#'-prefixed row.
                header = line.split('\t')
                continue
            if header is None:
                # skip any pre-header content until a header line is encountered
                continue
            parts = line.split('\t')
            if len(parts) != len(header):
                logging.warning(f"Line {i}: unexpected number of columns in Abricate line")
                continue
            row = dict(zip(header, parts))
            # Only the int conversions can raise; dict.get never does.
            try:
                start = int(row.get('START', '0'))
                end = int(row.get('END', '0'))
            except ValueError:
                logging.warning(f"Line {i}: invalid START/END in Abricate line")
                continue
            results[f"{start},{end}"] = {
                'seqid': row.get('SEQUENCE'),
                'start': start,
                'end': end,
                'strand': row.get('STRAND'),
                'gene': row.get('GENE'),
                'accession': row.get('ACCESSION') or 'unknown',
                'database': row.get('DATABASE') or 'unknown',
                'identity': row.get('%IDENTITY'),
                'coverage': row.get('%COVERAGE'),
                'product': row.get('PRODUCT') or 'unknown',  # fixed typo: was 'unkown'
                'resistance': row.get('RESISTANCE') or 'unknown',
            }
    return results
|
103
|
+
|
|
104
|
+
def parse_genemark(path, genome_seq, gene_ident=None):
    """Parse a GeneMark-style whitespace-separated gene list.

    Each usable line provides at least start, stop, and a strand token;
    a token containing 'complement' marks the minus strand. Returns an
    OrderedDict mapping "start,stop" to [strand, score, 'CDS', attrs].
    *genome_seq* and *gene_ident* are unused; they keep the parser
    signatures uniform.
    """
    entries = collections.OrderedDict()
    gene_index = 0
    with open(path, 'r') as handle:
        for raw in handle:
            text = raw.strip()
            if not text:
                continue
            tokens = text.split()
            # Need at least start, stop and a strand token.
            if len(tokens) < 3:
                continue
            try:
                left = int(tokens[0])
                right = int(tokens[1])
            except ValueError:
                continue
            orientation = '-' if 'complement' in tokens[2] else '+'
            meta = {'ID': f'genemark_hit{gene_index}', 'tool': 'GeneMark'}
            entries[f"{left},{right}"] = [orientation, '.', 'CDS', meta]
            gene_index += 1
    return entries
|
130
|
+
|
|
131
|
+
def TabToGFF(input_file, genome_seq, gene_ident='CDS', fmt='blast'):
    """Dispatch *input_file* to the parser matching *fmt* (case-insensitive).

    Accepted format aliases: blast/blast_tab6/tab6, abricate/
    abricate_tsv/abricate_format, genemark/gene_mark. Raises ValueError
    for anything else.
    """
    # Should be cleaned up to use consistent format names
    key = fmt.lower()
    if key in ('blast', 'blast_tab6', 'tab6'):
        parser = parse_blast_tab6
    elif key in ('abricate', 'abricate_tsv', 'abricate_format'):
        parser = parse_abricate
    elif key in ('genemark', 'gene_mark'):
        parser = parse_genemark
    else:
        raise ValueError(f"Unknown format: {key}")
    return parser(input_file, genome_seq, gene_ident)
|
File without changes
|