ORForise 1.5.1__py3-none-any.whl → 1.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,16 +7,15 @@ from datetime import datetime
7
7
 
8
8
 
9
9
  try:
10
+ from utils import *
10
11
  from Comparator import tool_comparison
11
12
  except ImportError:
12
13
  from .Comparator import tool_comparison
13
-
14
- try:
15
- from utils import *
16
- except ImportError:
17
14
  from ORForise.utils import *
18
15
 
19
16
 
17
+
18
+
20
19
  ##########################
21
20
 
22
21
  # Consolidate printing and logging into a single block
@@ -131,16 +130,6 @@ def comparator(options):
131
130
  print(full_msg)
132
131
  options.output_logger.info(full_msg)
133
132
 
134
- # print("These are the results for: " + dna_region + '\n')
135
- # print('Current Contig: ' + str(dna_region))
136
- # print('Number of Genes: ' + str(num_current_genes))
137
- # print('Number of ORFs: ' + str(result['pred_metrics']['Number_of_ORFs']))
138
- # print('Perfect Matches: ' + str(result['pred_metrics']['Number_of_Perfect_Matches']) + ' [' + str(num_current_genes)+ '] - '+ format(100 * result['pred_metrics']['Number_of_Perfect_Matches']/num_current_genes,'.2f')+'%')
139
- # print('Partial Matches: ' + str(len(result['pred_metrics']['partial_Hits'])) + ' [' + str(num_current_genes)+ '] - '+ format(100 * len(result['pred_metrics']['partial_Hits'])/num_current_genes,'.2f')+'%')
140
- # print('Missed Genes: ' + str(len(result['rep_metrics']['genes_Undetected'])) + ' [' + str(num_current_genes)+ '] - '+ format(100 * len(result['rep_metrics']['genes_Undetected'])/num_current_genes,'.2f')+'%')
141
- # print('Unmatched ORFs: ' + str(len(result['pred_metrics']['unmatched_ORFs'])) + ' [' + str(num_current_genes)+ '] - '+ format(100 * len(result['pred_metrics']['unmatched_ORFs'])/num_current_genes,'.2f')+'%')
142
- # print('Multi-matched ORFs: ' + str(len(result['pred_metrics']['multi_Matched_ORFs'])) + ' [' + str(num_current_genes)+ '] - '+ format(100 * len(result['pred_metrics']['multi_Matched_ORFs'])/num_current_genes,'.2f')+'%')
143
-
144
133
  # Prepare output directory and file names for each contig
145
134
  contig_save = dna_region.replace('/', '_').replace('\\', '_')
146
135
  contig_dir = os.path.join(options.outdir, contig_save)
@@ -190,24 +179,6 @@ def comparator(options):
190
179
  tool_out.writerow([''.join(map(str, result['pred_metrics']['orf_Coverage_Genome']))])
191
180
  tool_out.writerow(['Matched_Predicted_CDS_Coverage_of_Genome'])
192
181
  tool_out.writerow([''.join(map(str, result['pred_metrics']['matched_ORF_Coverage_Genome']))])
193
- # tool_out.writerow(['Start_Position_Difference:'])
194
- # tool_out.writerow(result.get('start_Difference', []))
195
- # tool_out.writerow(['Stop_Position_Difference:'])
196
- # tool_out.writerow(result.get('stop_Difference', []))
197
- # tool_out.writerow(['Alternative_Starts_Predicted:'])
198
- # tool_out.writerow(result.get('other_Starts', []))
199
- # tool_out.writerow(['Alternative_Stops_Predicted:'])
200
- # tool_out.writerow(result.get('other_Stops', []))
201
- # tool_out.writerow(['Undetected_Gene_Metrics:'])
202
- # tool_out.writerow([
203
- # 'ATG_Start,GTG_Start,TTG_Start,ATT_Start,CTG_Start,Alternative_Start_Codon,TGA_Stop,TAA_Stop,TAG_Stop,Alternative_Stop_Codon,Median_Length,ORFs_on_Positive_Strand,ORFs_on_Negative_Strand'
204
- # ])
205
- # tool_out.writerow(result.get('undetected_Gene_Metrics', []))
206
- # tool_out.writerow(['\nPredicted_CDSs_Without_Corresponding_Gene_In_Reference_Metrics:'])
207
- # tool_out.writerow([
208
- # 'ATG_Start,GTG_Start,TTG_Start,ATT_Start,CTG_Start,Alternative_Start_Codon,TGA_Stop,TAA_Stop,TAG_Stop,Alternative_Stop_Codon,Median_Length,ORFs_on_Positive_Strand,ORFs_on_Negative_Strand'
209
- # ])
210
- # tool_out.writerow(result.get('unmatched_ORF_Metrics', []))
211
182
 
212
183
  # Write perfect matches to FASTA
213
184
  with open(perfect_fasta, 'w', encoding='utf-8') as f:
@@ -266,26 +237,21 @@ def comparator(options):
266
237
  out_file.write('\nOverall Summary:\n')
267
238
  out_file.write(f'Number of Genes: {total_genes}\n')
268
239
  out_file.write(f'Number of ORFs: {total_orfs}\n')
269
- out_file.write(
270
- f'Perfect Matches: {total_perfect} [{total_genes}] - {format(100 * total_perfect / total_genes, ".2f")}%\n')
271
- out_file.write(
272
- f'Partial Matches: {total_partial} [{total_genes}] - {format(100 * total_partial / total_genes, ".2f")}%\n')
273
- out_file.write(
274
- f'Missed Genes: {total_missed} [{total_genes}] - {format(100 * total_missed / total_genes, ".2f")}%\n')
275
- out_file.write(
276
- f'Unmatched ORFs: {total_unmatched} [{total_genes}] - {format(100 * total_unmatched / total_genes, ".2f")}%\n')
277
- out_file.write(
278
- f'Multi-matched ORFs: {total_multi} [{total_genes}] - {format(100 * total_multi / total_genes, ".2f")}%\n')
240
+ out_file.write(f'Perfect Matches: {total_perfect} [{total_genes}] - {100 * total_perfect / total_genes:.2f}%\n')
241
+ out_file.write(f'Partial Matches: {total_partial} [{total_genes}] - {100 * total_partial / total_genes:.2f}%\n')
242
+ out_file.write(f'Missed Genes: {total_missed} [{total_genes}] - {100 * total_missed / total_genes:.2f}%\n')
243
+ out_file.write(f'Unmatched ORFs: {total_unmatched} [{total_genes}] - {100 * total_unmatched / total_genes:.2f}%\n')
244
+ out_file.write(f'Multi-matched ORFs: {total_multi} [{total_genes}] - {100 * total_multi / total_genes:.2f}%\n')
279
245
 
280
246
  lines = [
281
247
  f"Combined metrics for all contigs:",
282
248
  f"Number of Genes: {total_genes}",
283
249
  f"Number of ORFs: {total_orfs}",
284
- f"Perfect Matches: {total_perfect} [{total_genes}] - {format(100 * total_perfect / total_genes, ".2f")}%",
285
- f"Partial Matches: {total_partial} [{total_genes}] - {format(100 * total_partial / total_genes, ".2f")}%",
286
- f"Missed Genes: {total_missed} [{total_genes}] - {format(100 * total_missed / total_genes, ".2f")}%",
287
- f"Unmatched ORFs: {total_unmatched} [{total_genes}] - {format(100 * total_unmatched / total_genes, ".2f")}%",
288
- f"Multi-matched ORFs: {total_multi} [{total_genes}] - {format(100 * total_multi / total_genes, ".2f")}%"
250
+ f"Perfect Matches: {total_perfect} [{total_genes}] - {100 * total_perfect / total_genes:.2f}%",
251
+ f"Partial Matches: {total_partial} [{total_genes}] - {100 * total_partial / total_genes:.2f}%",
252
+ f"Missed Genes: {total_missed} [{total_genes}] - {100 * total_missed / total_genes:.2f}%",
253
+ f"Unmatched ORFs: {total_unmatched} [{total_genes}] - {100 * total_unmatched / total_genes:.2f}%",
254
+ f"Multi-matched ORFs: {total_multi} [{total_genes}] - {100 * total_multi / total_genes:.2f}%"
289
255
  ]
290
256
 
291
257
  full_msg = '\n'.join(lines) + '\n'
@@ -350,4 +316,4 @@ def main():
350
316
 
351
317
  if __name__ == "__main__":
352
318
  main()
353
- print("Complete")
319
+ print("Complete")
@@ -0,0 +1,138 @@
1
+ import argparse
2
+ import logging
3
+ from datetime import datetime
4
+ import os
5
+ import sys
6
+
7
+ try:
8
+ from utils import *
9
+ from Tools.TabToGFF.TabToGFF import TabToGFF
10
+ except ImportError:
11
+ from ORForise.utils import *
12
+ from ORForise.Tools.TabToGFF.TabToGFF import TabToGFF
13
+
14
+
15
+ def setup_logging(outdir, verbose=False):
16
+ ts = datetime.now().strftime('%Y%m%d_%H%M%S')
17
+ logfile = None
18
+ logger = logging.getLogger()
19
+ logger.setLevel(logging.DEBUG if verbose else logging.INFO)
20
+ # clear existing handlers to avoid duplicates when running repeatedly
21
+ logger.handlers = []
22
+ fmt = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
23
+ # Only create a file handler (and thus the logfile) when verbose is enabled
24
+ if verbose:
25
+ logfile = os.path.join(outdir, f'convert_to_gff_{ts}.log')
26
+ fh = logging.FileHandler(logfile)
27
+ fh.setLevel(logging.DEBUG)
28
+ fh.setFormatter(fmt)
29
+ logger.addHandler(fh)
30
+ # Always add a stdout handler
31
+ sh = logging.StreamHandler(sys.stdout)
32
+ sh.setLevel(logging.DEBUG if verbose else logging.INFO)
33
+ sh.setFormatter(fmt)
34
+ logger.addHandler(sh)
35
+ return logfile
36
+
37
+
38
+ def write_gff(outpath, genome_ID, genome_DNA, input_annotation, fmt, features):
39
+ with open(outpath, 'w') as out:
40
+ out.write('##gff-version\t3\n')
41
+ out.write('#\tConvert_To_GFF\n')
42
+ out.write('#\tRun Date: ' + str(datetime.now()) + '\n')
43
+ # Only include genome DNA line if a path was provided
44
+ if genome_DNA:
45
+ out.write('##Genome DNA File:' + genome_DNA + '\n')
46
+ out.write('##Original File: ' + input_annotation + '\n')
47
+ for pos, data in features.items():
48
+ pos_ = pos.split(',')
49
+ start = pos_[0]
50
+ stop = pos_[-1]
51
+ strand = data['strand']
52
+ if fmt == 'abricate': # Currently only supports abricate format
53
+ info = 'abricate_annotation;accession='+data['accession']+';database='+data['database']+';identity='+str(data['identity'])+';coverage='+str(data['coverage'])+';product='+data['product']+';resistance='+data['resistance']
54
+ entry = f"{data['seqid']}\t{fmt}\t{'CDS'}\t{start}\t{stop}\t.\t{strand}\t.\t{'ID='}{info}\n"
55
+ out.write(entry)
56
+
57
+
58
+ def load_genome(genome_fasta):
59
+ genome_seq = ''
60
+ genome_ID = 'unknown'
61
+ with open(genome_fasta, 'r') as fh:
62
+ for line in fh:
63
+ line = line.rstrip('\n')
64
+ if not line:
65
+ continue
66
+ if line.startswith('>'):
67
+ genome_ID = line.split()[0].lstrip('>')
68
+ else:
69
+ genome_seq += line
70
+ return genome_ID, genome_seq
71
+
72
+
73
+ def main():
74
+ print("Thank you for using ORForise\nPlease report any issues to: https://github.com/NickJD/ORForise/issues\n#####")
75
+
76
+ parser = argparse.ArgumentParser(description='ORForise ' + ORForise_Version + ': Convert-To-GFF Run Parameters')
77
+ parser._action_groups.pop()
78
+
79
+ required = parser.add_argument_group('Required Arguments')
80
+ # Make genome DNA optional: if not provided we operate without genome sequence
81
+ required.add_argument('-dna', dest='genome_DNA', required=False, help='Genome DNA file (.fa)')
82
+ required.add_argument('-i', dest='input_annotation', required=True, help='Input annotation file (tabular)')
83
+ required.add_argument('-fmt', dest='format', required=True, help='Input format: blast, abricate, genemark')
84
+ required.add_argument('-o', dest='output_dir', required=True, help='Output directory')
85
+
86
+ optional = parser.add_argument_group('Optional Arguments')
87
+ optional.add_argument('-gi', dest='gene_ident', default='CDS', required=False, help='Gene identifier types to extract (unused)')
88
+ optional.add_argument('--verbose', dest='verbose', action='store_true', help='Verbose logging with logfile')
89
+
90
+ options = parser.parse_args()
91
+
92
+ if not os.path.exists(options.output_dir):
93
+ os.makedirs(options.output_dir)
94
+ logfile = setup_logging(options.output_dir, verbose=options.verbose)
95
+ logging.info('Starting Convert_To_GFF')
96
+ # Log genome DNA only if provided
97
+ if options.genome_DNA:
98
+ logging.info('Genome DNA: %s', options.genome_DNA)
99
+ else:
100
+ logging.info('Genome DNA: (not provided)')
101
+ logging.info('Input annotation: %s', options.input_annotation)
102
+ logging.info('Format: %s', options.format)
103
+
104
+ # If a genome fasta was provided, load it; otherwise proceed without genome sequence
105
+ if options.genome_DNA:
106
+ if not os.path.exists(options.genome_DNA):
107
+ logging.error('Genome DNA file does not exist: %s', options.genome_DNA)
108
+ sys.exit(1)
109
+ genome_ID, genome_seq = load_genome(options.genome_DNA)
110
+ else:
111
+ # Derive a sensible genome_ID from the annotation filename and leave sequence empty
112
+ genome_ID = os.path.splitext(os.path.basename(options.input_annotation))[0]
113
+ genome_seq = ''
114
+
115
+ try:
116
+ # Build genome map expected by TabToGFF: mapping genome_ID -> tuple(sequence, ...)
117
+ genome_map = {genome_ID: (genome_seq,)}
118
+ features = TabToGFF(options.input_annotation, genome_map, options.gene_ident, fmt=options.format)
119
+ except Exception as e:
120
+ logging.exception('Error parsing input annotation')
121
+ sys.exit(1)
122
+
123
+ #features = sortORFs(features) - Not sorting for now to preserve original order
124
+ basename = os.path.basename(options.input_annotation)
125
+ dot = basename.rfind('.')
126
+ if dot != -1:
127
+ outname = basename[:dot] + '.gff'
128
+ else:
129
+ outname = basename + '.gff'
130
+ outgff = os.path.join(options.output_dir, outname)
131
+ # Pass the original genome path if provided, else pass None so headers adapt
132
+ genome_DNA_path = options.genome_DNA if options.genome_DNA else None
133
+ write_gff(outgff, genome_ID, genome_DNA_path, options.input_annotation, options.format, features)
134
+ logging.info('Wrote GFF to %s', outgff)
135
+ logging.info('Logfile: %s', logfile)
136
+
137
+ if __name__ == '__main__':
138
+ main()
@@ -0,0 +1,140 @@
1
+ import collections
2
+ import logging
3
+
4
+
5
+
6
+ def _make_feature(seqid, source, type_, start, end, score, strand, phase, attributes):
7
+ attrs = []
8
+ for k, v in attributes.items():
9
+ attrs.append(f"{k}={v}")
10
+ return f"{seqid}\t{source}\t{type_}\t{start}\t{end}\t{score}\t{strand}\t{phase}\t{';'.join(attrs)}\n"
11
+
12
+
13
+ def parse_blast_tab6(path, genome_seq, gene_ident=None):
14
+ results = collections.OrderedDict()
15
+ count = 0
16
+ with open(path, 'r') as fh:
17
+ for i, line in enumerate(fh, 1):
18
+ line = line.strip()
19
+ if not line or line.startswith('#'):
20
+ continue
21
+ parts = line.split('\t')
22
+ if len(parts) < 12:
23
+ logging.warning(f"Line {i}: unexpected BLAST line with {len(parts)} columns")
24
+ continue
25
+ qseqid, sseqid, pident, length, mismatch, gapopen, qstart, qend, sstart, send, evalue, bitscore = parts[:12]
26
+ try:
27
+ sstart = int(sstart)
28
+ send = int(send)
29
+ except ValueError:
30
+ logging.warning(f"Line {i}: non-integer coordinates in BLAST sstart/send")
31
+ continue
32
+ start = min(sstart, send)
33
+ end = max(sstart, send)
34
+ strand = '+' if sstart <= send else '-'
35
+ attrs = {
36
+ 'ID': f'blast_hit{count}',
37
+ 'Target': f'{qseqid} {qstart} {qend}',
38
+ 'pident': pident,
39
+ 'length': length,
40
+ 'evalue': evalue,
41
+ 'bitscore': bitscore
42
+ }
43
+ results[f"{start},{end}"] = [strand, '.', 'similarity', attrs]
44
+ count += 1
45
+ return results
46
+
47
+
48
+ def parse_abricate(path, genome_seq, gene_ident=None):
49
+ results = collections.OrderedDict()
50
+ count = 0
51
+ with (open(path, 'r') as fh):
52
+ header = None
53
+ for i, line in enumerate(fh, 1):
54
+ line = line.rstrip('\n')
55
+ if not line:
56
+ continue
57
+ if line.startswith('#'):
58
+ header = line.split('\t')
59
+ continue
60
+ if header is None:
61
+ # skip any pre-header content until a header line is encountered
62
+ continue
63
+ parts = line.split('\t')
64
+ if header and len(parts) == len(header):
65
+ row = dict(zip(header, parts))
66
+
67
+ try:
68
+ start = int(row.get('START', '0'))
69
+ end = int(row.get('END', '0'))
70
+ strand = row.get('STRAND')
71
+ except ValueError:
72
+ logging.warning(f"Line {i}: invalid START/END in Abricate line")
73
+ continue
74
+ seqid = row.get('SEQUENCE')
75
+ gene = row.get('GENE')
76
+ accession = row.get('ACCESSION') or 'unknown'
77
+ db = row.get('DATABASE') or 'unknown'
78
+ identity = row.get('%IDENTITY')
79
+ coverage = row.get('%COVERAGE')
80
+ product = row.get('PRODUCT') or 'unknown'
81
+ resistance = row.get('RESISTANCE') or 'unknown'
82
+
83
+ attrs = {
84
+ 'seqid': seqid,
85
+ 'start': start,
86
+ 'end': end,
87
+ 'strand': strand,
88
+ 'gene': gene,
89
+ 'accession': accession,
90
+ 'database': db,
91
+ 'identity': identity,
92
+ 'coverage': coverage,
93
+ 'product': product,
94
+ 'resistance': resistance
95
+ }
96
+ results[f"{start},{end}"] = attrs
97
+ count += 1
98
+ else:
99
+ logging.warning(f"Line {i}: unexpected number of columns in Abricate line")
100
+ continue
101
+ return results
102
+
103
+
104
+ def parse_genemark(path, genome_seq, gene_ident=None):
105
+ results = collections.OrderedDict()
106
+ count = 0
107
+ with open(path, 'r') as fh:
108
+ for i, line in enumerate(fh, 1):
109
+ line = line.strip()
110
+ if not line:
111
+ continue
112
+ parts = line.split()
113
+ if len(parts) < 3:
114
+ continue
115
+ try:
116
+ start = int(parts[0])
117
+ stop = int(parts[1])
118
+ except ValueError:
119
+ continue
120
+ strand_tok = parts[2]
121
+ if 'complement' in strand_tok:
122
+ strand = '-'
123
+ else:
124
+ strand = '+'
125
+ attrs = {'ID': f'genemark_hit{count}', 'tool': 'GeneMark'}
126
+ results[f"{start},{stop}"] = [strand, '.', 'CDS', attrs]
127
+ count += 1
128
+ return results
129
+
130
+
131
+ def TabToGFF(input_file, genome_seq, gene_ident='CDS', fmt='blast'):
132
+ # Should be cleaned up to use consistent format names
133
+ fmt = fmt.lower()
134
+ if fmt in ('blast', 'blast_tab6', 'tab6'):
135
+ return parse_blast_tab6(input_file, genome_seq, gene_ident)
136
+ if fmt in ('abricate', 'abricate_tsv', 'abricate_format'):
137
+ return parse_abricate(input_file, genome_seq, gene_ident)
138
+ if fmt in ('genemark', 'gene_mark'):
139
+ return parse_genemark(input_file, genome_seq, gene_ident)
140
+ raise ValueError(f"Unknown format: {fmt}")
File without changes
ORForise/utils.py CHANGED
@@ -4,7 +4,7 @@ import collections
4
4
  # Constants
5
5
  SHORT_ORF_LENGTH = 300
6
6
  MIN_COVERAGE = 75
7
- ORForise_Version = 'v1.5.1'
7
+ ORForise_Version = 'v1.6.0'
8
8
 
9
9
 
10
10
  def revCompIterative(watson): # Gets Reverse Complement