ORForise 1.5.1__py3-none-any.whl → 1.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,16 +7,15 @@ from datetime import datetime
7
7
 
8
8
 
9
9
  try:
10
+ from utils import *
10
11
  from Comparator import tool_comparison
11
12
  except ImportError:
12
13
  from .Comparator import tool_comparison
13
-
14
- try:
15
- from utils import *
16
- except ImportError:
17
14
  from ORForise.utils import *
18
15
 
19
16
 
17
+
18
+
20
19
  ##########################
21
20
 
22
21
  # Consolidate printing and logging into a single block
@@ -131,16 +130,6 @@ def comparator(options):
131
130
  print(full_msg)
132
131
  options.output_logger.info(full_msg)
133
132
 
134
- # print("These are the results for: " + dna_region + '\n')
135
- # print('Current Contig: ' + str(dna_region))
136
- # print('Number of Genes: ' + str(num_current_genes))
137
- # print('Number of ORFs: ' + str(result['pred_metrics']['Number_of_ORFs']))
138
- # print('Perfect Matches: ' + str(result['pred_metrics']['Number_of_Perfect_Matches']) + ' [' + str(num_current_genes)+ '] - '+ format(100 * result['pred_metrics']['Number_of_Perfect_Matches']/num_current_genes,'.2f')+'%')
139
- # print('Partial Matches: ' + str(len(result['pred_metrics']['partial_Hits'])) + ' [' + str(num_current_genes)+ '] - '+ format(100 * len(result['pred_metrics']['partial_Hits'])/num_current_genes,'.2f')+'%')
140
- # print('Missed Genes: ' + str(len(result['rep_metrics']['genes_Undetected'])) + ' [' + str(num_current_genes)+ '] - '+ format(100 * len(result['rep_metrics']['genes_Undetected'])/num_current_genes,'.2f')+'%')
141
- # print('Unmatched ORFs: ' + str(len(result['pred_metrics']['unmatched_ORFs'])) + ' [' + str(num_current_genes)+ '] - '+ format(100 * len(result['pred_metrics']['unmatched_ORFs'])/num_current_genes,'.2f')+'%')
142
- # print('Multi-matched ORFs: ' + str(len(result['pred_metrics']['multi_Matched_ORFs'])) + ' [' + str(num_current_genes)+ '] - '+ format(100 * len(result['pred_metrics']['multi_Matched_ORFs'])/num_current_genes,'.2f')+'%')
143
-
144
133
  # Prepare output directory and file names for each contig
145
134
  contig_save = dna_region.replace('/', '_').replace('\\', '_')
146
135
  contig_dir = os.path.join(options.outdir, contig_save)
@@ -190,24 +179,6 @@ def comparator(options):
190
179
  tool_out.writerow([''.join(map(str, result['pred_metrics']['orf_Coverage_Genome']))])
191
180
  tool_out.writerow(['Matched_Predicted_CDS_Coverage_of_Genome'])
192
181
  tool_out.writerow([''.join(map(str, result['pred_metrics']['matched_ORF_Coverage_Genome']))])
193
- # tool_out.writerow(['Start_Position_Difference:'])
194
- # tool_out.writerow(result.get('start_Difference', []))
195
- # tool_out.writerow(['Stop_Position_Difference:'])
196
- # tool_out.writerow(result.get('stop_Difference', []))
197
- # tool_out.writerow(['Alternative_Starts_Predicted:'])
198
- # tool_out.writerow(result.get('other_Starts', []))
199
- # tool_out.writerow(['Alternative_Stops_Predicted:'])
200
- # tool_out.writerow(result.get('other_Stops', []))
201
- # tool_out.writerow(['Undetected_Gene_Metrics:'])
202
- # tool_out.writerow([
203
- # 'ATG_Start,GTG_Start,TTG_Start,ATT_Start,CTG_Start,Alternative_Start_Codon,TGA_Stop,TAA_Stop,TAG_Stop,Alternative_Stop_Codon,Median_Length,ORFs_on_Positive_Strand,ORFs_on_Negative_Strand'
204
- # ])
205
- # tool_out.writerow(result.get('undetected_Gene_Metrics', []))
206
- # tool_out.writerow(['\nPredicted_CDSs_Without_Corresponding_Gene_In_Reference_Metrics:'])
207
- # tool_out.writerow([
208
- # 'ATG_Start,GTG_Start,TTG_Start,ATT_Start,CTG_Start,Alternative_Start_Codon,TGA_Stop,TAA_Stop,TAG_Stop,Alternative_Stop_Codon,Median_Length,ORFs_on_Positive_Strand,ORFs_on_Negative_Strand'
209
- # ])
210
- # tool_out.writerow(result.get('unmatched_ORF_Metrics', []))
211
182
 
212
183
  # Write perfect matches to FASTA
213
184
  with open(perfect_fasta, 'w', encoding='utf-8') as f:
@@ -266,26 +237,21 @@ def comparator(options):
266
237
  out_file.write('\nOverall Summary:\n')
267
238
  out_file.write(f'Number of Genes: {total_genes}\n')
268
239
  out_file.write(f'Number of ORFs: {total_orfs}\n')
269
- out_file.write(
270
- f'Perfect Matches: {total_perfect} [{total_genes}] - {format(100 * total_perfect / total_genes, ".2f")}%\n')
271
- out_file.write(
272
- f'Partial Matches: {total_partial} [{total_genes}] - {format(100 * total_partial / total_genes, ".2f")}%\n')
273
- out_file.write(
274
- f'Missed Genes: {total_missed} [{total_genes}] - {format(100 * total_missed / total_genes, ".2f")}%\n')
275
- out_file.write(
276
- f'Unmatched ORFs: {total_unmatched} [{total_genes}] - {format(100 * total_unmatched / total_genes, ".2f")}%\n')
277
- out_file.write(
278
- f'Multi-matched ORFs: {total_multi} [{total_genes}] - {format(100 * total_multi / total_genes, ".2f")}%\n')
240
+ out_file.write(f'Perfect Matches: {total_perfect} [{total_genes}] - {100 * total_perfect / total_genes:.2f}%\n')
241
+ out_file.write(f'Partial Matches: {total_partial} [{total_genes}] - {100 * total_partial / total_genes:.2f}%\n')
242
+ out_file.write(f'Missed Genes: {total_missed} [{total_genes}] - {100 * total_missed / total_genes:.2f}%\n')
243
+ out_file.write(f'Unmatched ORFs: {total_unmatched} [{total_genes}] - {100 * total_unmatched / total_genes:.2f}%\n')
244
+ out_file.write(f'Multi-matched ORFs: {total_multi} [{total_genes}] - {100 * total_multi / total_genes:.2f}%\n')
279
245
 
280
246
  lines = [
281
247
  f"Combined metrics for all contigs:",
282
248
  f"Number of Genes: {total_genes}",
283
249
  f"Number of ORFs: {total_orfs}",
284
- f"Perfect Matches: {total_perfect} [{total_genes}] - {format(100 * total_perfect / total_genes, ".2f")}%",
285
- f"Partial Matches: {total_partial} [{total_genes}] - {format(100 * total_partial / total_genes, ".2f")}%",
286
- f"Missed Genes: {total_missed} [{total_genes}] - {format(100 * total_missed / total_genes, ".2f")}%",
287
- f"Unmatched ORFs: {total_unmatched} [{total_genes}] - {format(100 * total_unmatched / total_genes, ".2f")}%",
288
- f"Multi-matched ORFs: {total_multi} [{total_genes}] - {format(100 * total_multi / total_genes, ".2f")}%"
250
+ f"Perfect Matches: {total_perfect} [{total_genes}] - {100 * total_perfect / total_genes:.2f}%",
251
+ f"Partial Matches: {total_partial} [{total_genes}] - {100 * total_partial / total_genes:.2f}%",
252
+ f"Missed Genes: {total_missed} [{total_genes}] - {100 * total_missed / total_genes:.2f}%",
253
+ f"Unmatched ORFs: {total_unmatched} [{total_genes}] - {100 * total_unmatched / total_genes:.2f}%",
254
+ f"Multi-matched ORFs: {total_multi} [{total_genes}] - {100 * total_multi / total_genes:.2f}%"
289
255
  ]
290
256
 
291
257
  full_msg = '\n'.join(lines) + '\n'
@@ -350,4 +316,4 @@ def main():
350
316
 
351
317
  if __name__ == "__main__":
352
318
  main()
353
- print("Complete")
319
+ print("Complete")
@@ -0,0 +1,138 @@
1
+ import argparse
2
+ import logging
3
+ from datetime import datetime
4
+ import os
5
+ import sys
6
+
7
+ try:
8
+ from utils import *
9
+ from Tools.TabToGFF.TabToGFF import TabToGFF
10
+ except ImportError:
11
+ from ORForise.utils import *
12
+ from ORForise.Tools.TabToGFF.TabToGFF import TabToGFF
13
+
14
+
15
+ def setup_logging(outdir, verbose=False):
16
+ ts = datetime.now().strftime('%Y%m%d_%H%M%S')
17
+ logfile = None
18
+ logger = logging.getLogger()
19
+ logger.setLevel(logging.DEBUG if verbose else logging.INFO)
20
+ # clear existing handlers to avoid duplicates when running repeatedly
21
+ logger.handlers = []
22
+ fmt = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
23
+ # Only create a file handler (and thus the logfile) when verbose is enabled
24
+ if verbose:
25
+ logfile = os.path.join(outdir, f'convert_to_gff_{ts}.log')
26
+ fh = logging.FileHandler(logfile)
27
+ fh.setLevel(logging.DEBUG)
28
+ fh.setFormatter(fmt)
29
+ logger.addHandler(fh)
30
+ # Always add a stdout handler
31
+ sh = logging.StreamHandler(sys.stdout)
32
+ sh.setLevel(logging.DEBUG if verbose else logging.INFO)
33
+ sh.setFormatter(fmt)
34
+ logger.addHandler(sh)
35
+ return logfile
36
+
37
+
38
+ def write_gff(outpath, genome_ID, genome_DNA, input_annotation, fmt, features):
39
+ with open(outpath, 'w') as out:
40
+ out.write('##gff-version\t3\n')
41
+ out.write('#\tConvert_To_GFF\n')
42
+ out.write('#\tRun Date: ' + str(datetime.now()) + '\n')
43
+ # Only include genome DNA line if a path was provided
44
+ if genome_DNA:
45
+ out.write('##Genome DNA File:' + genome_DNA + '\n')
46
+ out.write('##Original File: ' + input_annotation + '\n')
47
+ for pos, data in features.items():
48
+ pos_ = pos.split(',')
49
+ start = pos_[0]
50
+ stop = pos_[-1]
51
+ strand = data['strand']
52
+ if fmt == 'abricate': # Currently only supports abricate format
53
+ info = 'abricate_annotation;accession='+data['accession']+';database='+data['database']+';identity='+str(data['identity'])+';coverage='+str(data['coverage'])+';product='+data['product']+';resistance='+data['resistance']
54
+ entry = f"{data['seqid']}\t{fmt}\t{'CDS'}\t{start}\t{stop}\t.\t{strand}\t.\t{'ID='}{info}\n"
55
+ out.write(entry)
56
+
57
+
58
+ def load_genome(genome_fasta):
59
+ genome_seq = ''
60
+ genome_ID = 'unknown'
61
+ with open(genome_fasta, 'r') as fh:
62
+ for line in fh:
63
+ line = line.rstrip('\n')
64
+ if not line:
65
+ continue
66
+ if line.startswith('>'):
67
+ genome_ID = line.split()[0].lstrip('>')
68
+ else:
69
+ genome_seq += line
70
+ return genome_ID, genome_seq
71
+
72
+
73
+ def main():
74
+ print("Thank you for using ORForise\nPlease report any issues to: https://github.com/NickJD/ORForise/issues\n#####")
75
+
76
+ parser = argparse.ArgumentParser(description='ORForise ' + ORForise_Version + ': Convert-To-GFF Run Parameters')
77
+ parser._action_groups.pop()
78
+
79
+ required = parser.add_argument_group('Required Arguments')
80
+ # Make genome DNA optional: if not provided we operate without genome sequence
81
+ required.add_argument('-dna', dest='genome_DNA', required=False, help='Genome DNA file (.fa)')
82
+ required.add_argument('-i', dest='input_annotation', required=True, help='Input annotation file (tabular)')
83
+ required.add_argument('-fmt', dest='format', required=True, help='Input format: blast, abricate, genemark')
84
+ required.add_argument('-o', dest='output_dir', required=True, help='Output directory')
85
+
86
+ optional = parser.add_argument_group('Optional Arguments')
87
+ optional.add_argument('-gi', dest='gene_ident', default='CDS', required=False, help='Gene identifier types to extract (unused)')
88
+ optional.add_argument('--verbose', dest='verbose', action='store_true', help='Verbose logging with logfile')
89
+
90
+ options = parser.parse_args()
91
+
92
+ if not os.path.exists(options.output_dir):
93
+ os.makedirs(options.output_dir)
94
+ logfile = setup_logging(options.output_dir, verbose=options.verbose)
95
+ logging.info('Starting Convert_To_GFF')
96
+ # Log genome DNA only if provided
97
+ if options.genome_DNA:
98
+ logging.info('Genome DNA: %s', options.genome_DNA)
99
+ else:
100
+ logging.info('Genome DNA: (not provided)')
101
+ logging.info('Input annotation: %s', options.input_annotation)
102
+ logging.info('Format: %s', options.format)
103
+
104
+ # If a genome fasta was provided, load it; otherwise proceed without genome sequence
105
+ if options.genome_DNA:
106
+ if not os.path.exists(options.genome_DNA):
107
+ logging.error('Genome DNA file does not exist: %s', options.genome_DNA)
108
+ sys.exit(1)
109
+ genome_ID, genome_seq = load_genome(options.genome_DNA)
110
+ else:
111
+ # Derive a sensible genome_ID from the annotation filename and leave sequence empty
112
+ genome_ID = os.path.splitext(os.path.basename(options.input_annotation))[0]
113
+ genome_seq = ''
114
+
115
+ try:
116
+ # Build genome map expected by TabToGFF: mapping genome_ID -> tuple(sequence, ...)
117
+ genome_map = {genome_ID: (genome_seq,)}
118
+ features = TabToGFF(options.input_annotation, genome_map, options.gene_ident, fmt=options.format)
119
+ except Exception as e:
120
+ logging.exception('Error parsing input annotation')
121
+ sys.exit(1)
122
+
123
+ #features = sortORFs(features) - Not sorting for now to preserve original order
124
+ basename = os.path.basename(options.input_annotation)
125
+ dot = basename.rfind('.')
126
+ if dot != -1:
127
+ outname = basename[:dot] + '.gff'
128
+ else:
129
+ outname = basename + '.gff'
130
+ outgff = os.path.join(options.output_dir, outname)
131
+ # Pass the original genome path if provided, else pass None so headers adapt
132
+ genome_DNA_path = options.genome_DNA if options.genome_DNA else None
133
+ write_gff(outgff, genome_ID, genome_DNA_path, options.input_annotation, options.format, features)
134
+ logging.info('Wrote GFF to %s', outgff)
135
+ logging.info('Logfile: %s', logfile)
136
+
137
+ if __name__ == '__main__':
138
+ main()
@@ -0,0 +1,140 @@
1
+ import collections
2
+ import logging
3
+
4
+
5
+
6
+ def _make_feature(seqid, source, type_, start, end, score, strand, phase, attributes):
7
+ attrs = []
8
+ for k, v in attributes.items():
9
+ attrs.append(f"{k}={v}")
10
+ return f"{seqid}\t{source}\t{type_}\t{start}\t{end}\t{score}\t{strand}\t{phase}\t{';'.join(attrs)}\n"
11
+
12
+
13
+ def parse_blast_tab6(path, genome_seq, gene_ident=None):
14
+ results = collections.OrderedDict()
15
+ count = 0
16
+ with open(path, 'r') as fh:
17
+ for i, line in enumerate(fh, 1):
18
+ line = line.strip()
19
+ if not line or line.startswith('#'):
20
+ continue
21
+ parts = line.split('\t')
22
+ if len(parts) < 12:
23
+ logging.warning(f"Line {i}: unexpected BLAST line with {len(parts)} columns")
24
+ continue
25
+ qseqid, sseqid, pident, length, mismatch, gapopen, qstart, qend, sstart, send, evalue, bitscore = parts[:12]
26
+ try:
27
+ sstart = int(sstart)
28
+ send = int(send)
29
+ except ValueError:
30
+ logging.warning(f"Line {i}: non-integer coordinates in BLAST sstart/send")
31
+ continue
32
+ start = min(sstart, send)
33
+ end = max(sstart, send)
34
+ strand = '+' if sstart <= send else '-'
35
+ attrs = {
36
+ 'ID': f'blast_hit{count}',
37
+ 'Target': f'{qseqid} {qstart} {qend}',
38
+ 'pident': pident,
39
+ 'length': length,
40
+ 'evalue': evalue,
41
+ 'bitscore': bitscore
42
+ }
43
+ results[f"{start},{end}"] = [strand, '.', 'similarity', attrs]
44
+ count += 1
45
+ return results
46
+
47
+
48
+ def parse_abricate(path, genome_seq, gene_ident=None):
49
+ results = collections.OrderedDict()
50
+ count = 0
51
+ with (open(path, 'r') as fh):
52
+ header = None
53
+ for i, line in enumerate(fh, 1):
54
+ line = line.rstrip('\n')
55
+ if not line:
56
+ continue
57
+ if line.startswith('#'):
58
+ header = line.split('\t')
59
+ continue
60
+ if header is None:
61
+ # skip any pre-header content until a header line is encountered
62
+ continue
63
+ parts = line.split('\t')
64
+ if header and len(parts) == len(header):
65
+ row = dict(zip(header, parts))
66
+
67
+ try:
68
+ start = int(row.get('START', '0'))
69
+ end = int(row.get('END', '0'))
70
+ strand = row.get('STRAND')
71
+ except ValueError:
72
+ logging.warning(f"Line {i}: invalid START/END in Abricate line")
73
+ continue
74
+ seqid = row.get('SEQUENCE')
75
+ gene = row.get('GENE')
76
+ accession = row.get('ACCESSION') or 'unknown'
77
+ db = row.get('DATABASE') or 'unknown'
78
+ identity = row.get('%IDENTITY')
79
+ coverage = row.get('%COVERAGE')
80
+ product = row.get('PRODUCT') or 'unknown'
81
+ resistance = row.get('RESISTANCE') or 'unknown'
82
+
83
+ attrs = {
84
+ 'seqid': seqid,
85
+ 'start': start,
86
+ 'end': end,
87
+ 'strand': strand,
88
+ 'gene': gene,
89
+ 'accession': accession,
90
+ 'database': db,
91
+ 'identity': identity,
92
+ 'coverage': coverage,
93
+ 'product': product,
94
+ 'resistance': resistance
95
+ }
96
+ results[f"{start},{end}"] = attrs
97
+ count += 1
98
+ else:
99
+ logging.warning(f"Line {i}: unexpected number of columns in Abricate line")
100
+ continue
101
+ return results
102
+
103
+
104
+ def parse_genemark(path, genome_seq, gene_ident=None):
105
+ results = collections.OrderedDict()
106
+ count = 0
107
+ with open(path, 'r') as fh:
108
+ for i, line in enumerate(fh, 1):
109
+ line = line.strip()
110
+ if not line:
111
+ continue
112
+ parts = line.split()
113
+ if len(parts) < 3:
114
+ continue
115
+ try:
116
+ start = int(parts[0])
117
+ stop = int(parts[1])
118
+ except ValueError:
119
+ continue
120
+ strand_tok = parts[2]
121
+ if 'complement' in strand_tok:
122
+ strand = '-'
123
+ else:
124
+ strand = '+'
125
+ attrs = {'ID': f'genemark_hit{count}', 'tool': 'GeneMark'}
126
+ results[f"{start},{stop}"] = [strand, '.', 'CDS', attrs]
127
+ count += 1
128
+ return results
129
+
130
+
131
+ def TabToGFF(input_file, genome_seq, gene_ident='CDS', fmt='blast'):
132
+ # Should be cleaned up to use consistent format names
133
+ fmt = fmt.lower()
134
+ if fmt in ('blast', 'blast_tab6', 'tab6'):
135
+ return parse_blast_tab6(input_file, genome_seq, gene_ident)
136
+ if fmt in ('abricate', 'abricate_tsv', 'abricate_format'):
137
+ return parse_abricate(input_file, genome_seq, gene_ident)
138
+ if fmt in ('genemark', 'gene_mark'):
139
+ return parse_genemark(input_file, genome_seq, gene_ident)
140
+ raise ValueError(f"Unknown format: {fmt}")
File without changes
ORForise/utils.py CHANGED
@@ -4,7 +4,7 @@ import collections
4
4
  # Constants
5
5
  SHORT_ORF_LENGTH = 300
6
6
  MIN_COVERAGE = 75
7
- ORForise_Version = 'v1.5.1'
7
+ ORForise_Version = 'v1.6.0'
8
8
 
9
9
 
10
10
  def revCompIterative(watson): # Gets Reverse Complement