PyPI - ORForise - Versions diffs - 1.5.0__py3-none-any.whl → 1.6.0__py3-none-any.whl - Mend

ORForise 1.5.0py3-none-any.whl → 1.6.0py3-none-any.whl

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (13) hide show

ORForise/Annotation_Compare.py +105 -88
ORForise/Comparator.py +60 -28
ORForise/Convert_To_GFF.py +138 -0
ORForise/Tools/TabToGFF/TabToGFF.py +140 -0
ORForise/Tools/TabToGFF/__init__.py +0 -0
ORForise/utils.py +1 -1
orforise-1.6.0.dist-info/METADATA +1051 -0
{orforise-1.5.0.dist-info → orforise-1.6.0.dist-info}/RECORD +12 -9
{orforise-1.5.0.dist-info → orforise-1.6.0.dist-info}/entry_points.txt +2 -0
orforise-1.5.0.dist-info/METADATA +0 -451
{orforise-1.5.0.dist-info → orforise-1.6.0.dist-info}/WHEEL +0 -0
{orforise-1.5.0.dist-info → orforise-1.6.0.dist-info}/licenses/LICENSE +0 -0
{orforise-1.5.0.dist-info → orforise-1.6.0.dist-info}/top_level.txt +0 -0

ORForise/Annotation_Compare.py CHANGED Viewed

@@ -1,22 +1,35 @@
 from importlib import import_module
 import argparse
-import sys,os
-import gzip,csv
+import sys, os
+import gzip, csv
+import logging
+from datetime import datetime
-try:
-    from Comparator import tool_comparison
-except ImportError:
-    from .Comparator import tool_comparison
 try:
     from utils import *
+    from Comparator import tool_comparison
 except ImportError:
+    from .Comparator import tool_comparison
     from ORForise.utils import *
+##########################
+# Consolidate printing and logging into a single block
+def _pct(n, total):
+    """Return ``n`` as a percentage of ``total``, formatted like ``'12.34%'``.
+
+    Returns ``'N/A'`` when the percentage cannot be computed, for example
+    when ``total`` is zero or either argument is not numeric.
+    """
+    try:
+        return f"{100 * n / total:.2f}%"
+    except (ArithmeticError, TypeError, ValueError):
+        # Division by zero / overflow, non-numeric operands, or a value that
+        # does not accept the '.2f' format all mean "no percentage available".
+        return 'N/A'
 ##########################
 def comparator(options):
     try:
         try:  # Detect whether fasta/gff files are .gz or text and read accordingly
             fasta_in = gzip.open(options.genome_dna, 'rt')
@@ -77,36 +90,46 @@ def comparator(options):
                 'Contig\tGenes\tORFs\tPerfect_Matches\tPartial_Matches\tMissed_Genes\tUnmatched_ORFs\tMulti_Matched_ORFs\n')
     for dna_region, result in results.items():
-        num_current_genes = len(dna_regions[dna_region][2])
-        num_orfs = result['pred_metrics']['Number_of_ORFs']
-        num_perfect = result['pred_metrics']['Number_of_Perfect_Matches']
-        num_partial = len(result['pred_metrics']['partial_Hits'])
-        num_missed = len(result['rep_metrics']['genes_Undetected'])
-        num_unmatched = len(result['pred_metrics']['unmatched_ORFs'])
-        num_multi = len(result['pred_metrics']['multi_Matched_ORFs'])
-        # Collect summary for this contig
-        if options.outdir:
-            contig_summaries.append([
-                dna_region, num_current_genes, num_orfs, num_perfect, num_partial, num_missed, num_unmatched, num_multi
-            ])
-        ###
-        num_current_genes = len(dna_regions[dna_region][2])
-        print("These are the results for: " + dna_region + '\n')
-        ############################################# To get default output filename from input file details
-        genome_name = options.reference_annotation.split('/')[-1].split('.')[0]
-        rep_metric_description, rep_metrics = get_rep_metrics(result)
-        all_metric_description, all_metrics = get_all_metrics(result)
-        print('Current Contig: ' + str(dna_region))
-        print('Number of Genes: ' + str(num_current_genes))
-        print('Number of ORFs: ' + str(result['pred_metrics']['Number_of_ORFs']))
-        print('Perfect Matches: ' + str(result['pred_metrics']['Number_of_Perfect_Matches']) + ' [' + str(num_current_genes)+ '] - '+ format(100 * result['pred_metrics']['Number_of_Perfect_Matches']/num_current_genes,'.2f')+'%')
-        print('Partial Matches: ' + str(len(result['pred_metrics']['partial_Hits'])) + ' [' + str(num_current_genes)+ '] - '+ format(100 * len(result['pred_metrics']['partial_Hits'])/num_current_genes,'.2f')+'%')
-        print('Missed Genes: ' + str(len(result['rep_metrics']['genes_Undetected'])) + ' [' + str(num_current_genes)+ '] - '+ format(100 * len(result['rep_metrics']['genes_Undetected'])/num_current_genes,'.2f')+'%')
-        print('Unmatched ORFs: ' + str(len(result['pred_metrics']['unmatched_ORFs'])) + ' [' + str(num_current_genes)+ '] - '+ format(100 * len(result['pred_metrics']['unmatched_ORFs'])/num_current_genes,'.2f')+'%')
-        print('Multi-matched ORFs: ' + str(len(result['pred_metrics']['multi_Matched_ORFs'])) + ' [' + str(num_current_genes)+ '] - '+ format(100 * len(result['pred_metrics']['multi_Matched_ORFs'])/num_current_genes,'.2f')+'%')
-        if options.outdir:
+        if result:
+            num_current_genes = len(dna_regions[dna_region][2])
+            num_orfs = result['pred_metrics']['Number_of_ORFs']
+            num_perfect = result['pred_metrics']['Number_of_Perfect_Matches']
+            num_partial = len(result['pred_metrics']['partial_Hits'])
+            num_missed = len(result['rep_metrics']['genes_Undetected'])
+            num_unmatched = len(result['pred_metrics']['unmatched_ORFs'])
+            num_multi = len(result['pred_metrics']['multi_Matched_ORFs'])
+            # Collect summary for this contig
+            contig_summaries.append([dna_region, num_current_genes, num_orfs, num_perfect, num_partial, num_missed, num_unmatched, num_multi])
+            num_current_genes = len(dna_regions[dna_region][2])
+            genome_name = options.reference_annotation.split('/')[-1].split('.')[0]
+            rep_metric_description, rep_metrics = get_rep_metrics(result)
+            all_metric_description, all_metrics = get_all_metrics(result)
+             # Safely extract metric values
+            num_orfs = result.get('pred_metrics', {}).get('Number_of_ORFs') if isinstance(result, dict) else 'N/A'
+            perfect = result.get('pred_metrics', {}).get('Number_of_Perfect_Matches') if isinstance(result, dict) else 0
+            partial = len(result.get('pred_metrics', {}).get('partial_Hits', [])) if isinstance(result, dict) else 'N/A'
+            missed = len(result.get('rep_metrics', {}).get('genes_Undetected', [])) if isinstance(result, dict) else 'N/A'
+            unmatched = len(result.get('pred_metrics', {}).get('unmatched_ORFs', [])) if isinstance(result, dict) else 'N/A'
+            multi = len(result.get('pred_metrics', {}).get('multi_Matched_ORFs', [])) if isinstance(result, dict) else 'N/A'
+            lines = [
+                f"These are the results for: {dna_region}",
+                f"Current Contig: {dna_region}",
+                f"Number of Genes: {num_current_genes}",
+                f"Number of ORFs: {num_orfs}",
+                f"Perfect Matches: {perfect} [{num_current_genes}] - {_pct(perfect, num_current_genes) if isinstance(num_current_genes, (int, float)) else 'N/A'}",
+                f"Partial Matches: {partial} [{num_current_genes}] - {_pct(partial, num_current_genes) if isinstance(num_current_genes, (int, float)) else 'N/A'}",
+                f"Missed Genes: {missed} [{num_current_genes}] - {_pct(missed, num_current_genes) if isinstance(num_current_genes, (int, float)) else 'N/A'}",
+                f"Unmatched ORFs: {unmatched} [{num_current_genes}] - {_pct(unmatched, num_current_genes) if isinstance(num_current_genes, (int, float)) else 'N/A'}",
+                f"Multi-matched ORFs: {multi} [{num_current_genes}] - {_pct(multi, num_current_genes) if isinstance(num_current_genes, (int, float)) else 'N/A'}"
+            ]
+            full_msg = '\n'.join(lines) + '\n'
+            if options.verbose:
+                print(full_msg)
+            options.output_logger.info(full_msg)
             # Prepare output directory and file names for each contig
             contig_save = dna_region.replace('/', '_').replace('\\', '_')
             contig_dir = os.path.join(options.outdir, contig_save)
@@ -156,24 +179,6 @@ def comparator(options):
                 tool_out.writerow([''.join(map(str, result['pred_metrics']['orf_Coverage_Genome']))])
                 tool_out.writerow(['Matched_Predicted_CDS_Coverage_of_Genome'])
                 tool_out.writerow([''.join(map(str, result['pred_metrics']['matched_ORF_Coverage_Genome']))])
-                # tool_out.writerow(['Start_Position_Difference:'])
-                # tool_out.writerow(result.get('start_Difference', []))
-                # tool_out.writerow(['Stop_Position_Difference:'])
-                # tool_out.writerow(result.get('stop_Difference', []))
-                # tool_out.writerow(['Alternative_Starts_Predicted:'])
-                # tool_out.writerow(result.get('other_Starts', []))
-                # tool_out.writerow(['Alternative_Stops_Predicted:'])
-                # tool_out.writerow(result.get('other_Stops', []))
-                # tool_out.writerow(['Undetected_Gene_Metrics:'])
-                # tool_out.writerow([
-                #     'ATG_Start,GTG_Start,TTG_Start,ATT_Start,CTG_Start,Alternative_Start_Codon,TGA_Stop,TAA_Stop,TAG_Stop,Alternative_Stop_Codon,Median_Length,ORFs_on_Positive_Strand,ORFs_on_Negative_Strand'
-                # ])
-                # tool_out.writerow(result.get('undetected_Gene_Metrics', []))
-                # tool_out.writerow(['\nPredicted_CDSs_Without_Corresponding_Gene_In_Reference_Metrics:'])
-                # tool_out.writerow([
-                #     'ATG_Start,GTG_Start,TTG_Start,ATT_Start,CTG_Start,Alternative_Start_Codon,TGA_Stop,TAA_Stop,TAG_Stop,Alternative_Stop_Codon,Median_Length,ORFs_on_Positive_Strand,ORFs_on_Negative_Strand'
-                # ])
-                # tool_out.writerow(result.get('unmatched_ORF_Metrics', []))
             # Write perfect matches to FASTA
             with open(perfect_fasta, 'w', encoding='utf-8') as f:
@@ -210,6 +215,11 @@ def comparator(options):
                     key_parts = key.split(',')
                     multi = f">Predicted_CDS:{key_parts[0]}-{key_parts[1]}_Genes:{'|'.join(value)}"
                     f.write(f"{multi}\n")
+        else:
+            if options.verbose:
+                print(f"No results to process for dna region - " + str(dna_region))
+            options.output_logger.info(f"No results to process for dna region - " + str(dna_region))
     # After all contigs, append the summary table to the main summary file
     if options.outdir and contig_summaries:
@@ -227,34 +237,27 @@ def comparator(options):
             out_file.write('\nOverall Summary:\n')
             out_file.write(f'Number of Genes: {total_genes}\n')
             out_file.write(f'Number of ORFs: {total_orfs}\n')
-            out_file.write(
-                f'Perfect Matches: {total_perfect} [{total_genes}] - {format(100 * total_perfect / total_genes, ".2f")}%\n')
-            out_file.write(
-                f'Partial Matches: {total_partial} [{total_genes}] - {format(100 * total_partial / total_genes, ".2f")}%\n')
-            out_file.write(
-                f'Missed Genes: {total_missed} [{total_genes}] - {format(100 * total_missed / total_genes, ".2f")}%\n')
-            out_file.write(
-                f'Unmatched ORFs: {total_unmatched} [{total_genes}] - {format(100 * total_unmatched / total_genes, ".2f")}%\n')
-            out_file.write(
-                f'Multi-matched ORFs: {total_multi} [{total_genes}] - {format(100 * total_multi / total_genes, ".2f")}%\n')
-            # Print combined metrics to stdout
-            print("\nCombined metrics for all contigs:")
-            print(f'Number of Genes: {total_genes}')
-            print(f'Number of ORFs: {total_orfs}')
-            print(
-                f'Perfect Matches: {total_perfect} [{total_genes}] - {format(100 * total_perfect / total_genes, ".2f")}%')
-            print(
-                f'Partial Matches: {total_partial} [{total_genes}] - {format(100 * total_partial / total_genes, ".2f")}%')
-            print(f'Missed Genes: {total_missed} [{total_genes}] - {format(100 * total_missed / total_genes, ".2f")}%')
-            print(
-                f'Unmatched ORFs: {total_unmatched} [{total_genes}] - {format(100 * total_unmatched / total_genes, ".2f")}%')
-            print(
-                f'Multi-matched ORFs: {total_multi} [{total_genes}] - {format(100 * total_multi / total_genes, ".2f")}%')
+            out_file.write(f'Perfect Matches: {total_perfect} [{total_genes}] - {100 * total_perfect / total_genes:.2f}%\n')
+            out_file.write(f'Partial Matches: {total_partial} [{total_genes}] - {100 * total_partial / total_genes:.2f}%\n')
+            out_file.write(f'Missed Genes: {total_missed} [{total_genes}] - {100 * total_missed / total_genes:.2f}%\n')
+            out_file.write(f'Unmatched ORFs: {total_unmatched} [{total_genes}] - {100 * total_unmatched / total_genes:.2f}%\n')
+            out_file.write(f'Multi-matched ORFs: {total_multi} [{total_genes}] - {100 * total_multi / total_genes:.2f}%\n')
+            lines = [
+                f"Combined metrics for all contigs:",
+                f"Number of Genes: {total_genes}",
+                f"Number of ORFs: {total_orfs}",
+                f"Perfect Matches: {total_perfect} [{total_genes}] - {100 * total_perfect / total_genes:.2f}%",
+                f"Partial Matches: {total_partial} [{total_genes}] - {100 * total_partial / total_genes:.2f}%",
+                f"Missed Genes: {total_missed} [{total_genes}] - {100 * total_missed / total_genes:.2f}%",
+                f"Unmatched ORFs: {total_unmatched} [{total_genes}] - {100 * total_unmatched / total_genes:.2f}%",
+                f"Multi-matched ORFs: {total_multi} [{total_genes}] - {100 * total_multi / total_genes:.2f}%"
+            ]
+            full_msg = '\n'.join(lines) + '\n'
+            if options.verbose:
+                print(full_msg)
+            options.output_logger.info(full_msg)
 def main():
@@ -282,21 +285,35 @@ def main():
                                '- Provide tool name to compare output from two tools')
     output = parser.add_argument_group('Output')
-    output.add_argument('-o', dest='outdir', required=False,
-                        help='Define directory where detailed output should be places - If not provided, summary will be printed to std-out')
+    output.add_argument('-o', dest='outdir', required=True,
+                        help='Define directory where detailed output should be places')
     output.add_argument('-n', dest='outname', required=False,
-                        help='Define output file name - Mandatory is -o is provided: <outname>_<contig_id>_ORF_Comparison.csv')
+                        help='Define output filename(s) prefix - If not provided, filename of reference '
+                             'annotation file will be used- <outname>_<contig_id>_ORF_Comparison.csv')
     misc = parser.add_argument_group('Misc')
     misc.add_argument('-v', dest='verbose', default='False', type=eval, choices=[True, False],
                       help='Default - False: Print out runtime status')
     options = parser.parse_args()
-    if options.outdir and not options.outname:
-        sys.exit("Error: If -o (outdir) is provided, you must also provide -n (outname).")
+    options.outname = options.outname if options.outname else options.reference_annotation.split('/')[-1].split('.')[0]
+    # Initialise loggers once and store on options
+    if not getattr(options, 'logger_initialized', False):
+        os.makedirs(options.outdir, exist_ok=True)
+        output_log = os.path.join(options.outdir, f"ORForise_{options.outname}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log")
+        logger = logging.getLogger('ORForise.output')
+        logger.setLevel(logging.INFO)
+        fh_out = logging.FileHandler(output_log, encoding='utf-8')
+        fh_out.setFormatter(logging.Formatter('%(message)s'))
+        logger.addHandler(fh_out)
+        options.output_logger = logger
+        options.logger_initialized = True
     comparator(options)
 if __name__ == "__main__":
     main()
-    print("Complete")
+    print("Complete")

ORForise/Comparator.py CHANGED Viewed

@@ -206,33 +206,53 @@ def start_Codon_Count(orfs):
         else:
             other += 1
             other_Starts.append(codon)
-    atg_P = format(100 * atg / len(orfs), '.2f')
-    gtg_P = format(100 * gtg / len(orfs), '.2f')
-    ttg_P = format(100 * ttg / len(orfs), '.2f')
-    att_P = format(100 * att / len(orfs), '.2f')
-    ctg_P = format(100 * ctg / len(orfs), '.2f')
-    other_Start_P = format(100 * other / len(orfs), '.2f')
-    return atg_P, gtg_P, ttg_P, att_P, ctg_P, other_Start_P, other_Starts
+    total = len(orfs) if orfs is not None else 0
+    if total:
+        atg_P = format(100 * atg / len(orfs), '.2f')
+        gtg_P = format(100 * gtg / len(orfs), '.2f')
+        ttg_P = format(100 * ttg / len(orfs), '.2f')
+        att_P = format(100 * att / len(orfs), '.2f')
+        ctg_P = format(100 * ctg / len(orfs), '.2f')
+        other_Start_P = format(100 * other / len(orfs), '.2f')
+    else:
+        atg_P = ttg_P = gtg_P = ctg_P = att_P = other_Start_P = format(0, '.2f')
+    return {
+        'ATG': (atg, atg_P),
+        'TTG': (ttg, ttg_P),
+        'GTG': (gtg, gtg_P),
+        'CTG': (ctg, ctg_P),
+        'ATT': (att, att_P),
+        'Other': (other, other_Start_P),
+        'total': total
+    }
 def stop_Codon_Count(orfs):
+    """Return the percentage of TAG, TAA, TGA and other stop codons in ``orfs``.
+
+    The codon is read from index 2 of each value of ``orfs``. Percentages are
+    strings formatted to two decimal places; when ``orfs`` is empty or
+    ``None`` every percentage is ``'0.00'`` and no division is attempted.
+
+    Returns:
+        Tuple ``(tag_p, taa_p, tga_p, other_Stop_P, other_Stops)`` where
+        ``other_Stops`` lists every codon that was not TAG, TAA or TGA.
+    """
     tag, taa, tga, other = 0, 0, 0, 0
     other_Stops = []
-    for orf in orfs.values():
-        codon = orf[2]
-        if codon == 'TAG':
-            tag += 1
-        elif codon == 'TAA':
-            taa += 1
-        elif codon == 'TGA':
-            tga += 1
-        else:
-            other += 1
-            other_Stops.append(codon)
-    tag_p = format(100 * tag / len(orfs), '.2f')
-    taa_p = format(100 * taa / len(orfs), '.2f')
-    tga_p = format(100 * tga / len(orfs), '.2f')
-    other_Stop_P = format(100 * other / len(orfs), '.2f')
+    total = len(orfs) if orfs else 0
+    if total:
+        for orf in orfs.values():
+            codon = orf[2]
+            if codon == 'TAG':
+                tag += 1
+            elif codon == 'TAA':
+                taa += 1
+            elif codon == 'TGA':
+                tga += 1
+            else:
+                other += 1
+                other_Stops.append(codon)
+        tag_p = format(100 * tag / len(orfs), '.2f')
+        taa_p = format(100 * taa / len(orfs), '.2f')
+        tga_p = format(100 * tga / len(orfs), '.2f')
+        other_Stop_P = format(100 * other / len(orfs), '.2f')
+    else:
+        tag_p = taa_p = tga_p = other_Stop_P = format(0, '.2f')
     return tag_p, taa_p, tga_p, other_Stop_P, other_Stops
@@ -260,8 +280,8 @@ def candidate_ORF_Selection(gene_Set,
             if len(current_ORF_Difference) > len(candidate_ORF_Difference):
                 pos = c_Pos
                 orf_Details = c_ORF_Details
-        else:
-            print("Match filtered out")
+        #else:
+            #("Match filtered out")
     return pos, orf_Details
@@ -300,6 +320,11 @@ def tool_comparison(all_orfs, dna_regions, verbose):
         ref_genes_list = dna_regions[dna_region][2]
         ref_genes = collections.OrderedDict()
+        if not ref_genes_list:
+            results[dna_region] = {}
+            continue
         for d in ref_genes_list:
             ref_genes.update(d)
         comp.genome_Seq = dna_regions[dna_region][0]
@@ -311,6 +336,10 @@ def tool_comparison(all_orfs, dna_regions, verbose):
         better_pos_orfs_items = [[(int(pos.split(',')[0]), int(pos.split(',')[1])), orf_Details] for pos, orf_Details in current_orfs.items()] #TODO: turn pos into tuple instead of string everywhere
+        if not current_orfs or not better_pos_orfs_items:
+            results[dna_region] = {}
+            continue
         for gene_num, gene_details in ref_genes.items():  # Loop through each gene to compare against predicted ORFs
             g_Start = int(gene_details[0])
             g_Stop = int(gene_details[1])
@@ -477,10 +506,13 @@ def tool_comparison(all_orfs, dna_regions, verbose):
                 comp.gene_Pos_Olap.append(0)
             elif '-' in g_Strand:
                 comp.gene_Neg_Olap.append(0)
-        ####
-        min_Gene_Length = min(comp.gene_Lengths)
-        max_Gene_Length = max(comp.gene_Lengths)
-        median_Gene_Length = np.median(comp.gene_Lengths)
+        #### avoid ValueError
+        if comp.gene_Lengths:
+            min_Gene_Length = min(comp.gene_Lengths)
+            max_Gene_Length = max(comp.gene_Lengths)
+            median_Gene_Length = np.median(comp.gene_Lengths)
+        else:
+            min_Gene_Length = max_Gene_Length = min_Length_Difference = 0
         prev_ORF_Stop = 0
         prev_ORF_Overlapped = False
         for o_Positions, orf_Details in current_orfs.items():

ORForise/Convert_To_GFF.py ADDED Viewed

@@ -0,0 +1,138 @@
+import argparse
+import logging
+from datetime import datetime
+import os
+import sys
+try:
+    from utils import *
+    from Tools.TabToGFF.TabToGFF import TabToGFF
+except ImportError:
+    from ORForise.utils import *
+    from ORForise.Tools.TabToGFF.TabToGFF import TabToGFF
+def setup_logging(outdir, verbose=False):
+    """Configure the root logger for a Convert_To_GFF run.
+
+    A stdout handler is always installed. A timestamped logfile inside
+    ``outdir`` is only created when ``verbose`` is true.
+
+    Args:
+        outdir: Directory in which the logfile is created (verbose only).
+        verbose: When true, log at DEBUG level and also write a logfile;
+            otherwise log at INFO level to stdout only.
+
+    Returns:
+        Path of the logfile, or ``None`` when no logfile was created.
+    """
+    level = logging.DEBUG if verbose else logging.INFO
+    logger = logging.getLogger()
+    logger.setLevel(level)
+    # Remove AND close existing handlers: avoids duplicate output when called
+    # repeatedly and releases any logfile opened by an earlier call.
+    for handler in list(logger.handlers):
+        logger.removeHandler(handler)
+        handler.close()
+    fmt = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
+    logfile = None
+    # Only create a file handler (and thus the logfile) when verbose is enabled
+    if verbose:
+        ts = datetime.now().strftime('%Y%m%d_%H%M%S')
+        logfile = os.path.join(outdir, f'convert_to_gff_{ts}.log')
+        fh = logging.FileHandler(logfile, encoding='utf-8')
+        fh.setLevel(logging.DEBUG)
+        fh.setFormatter(fmt)
+        logger.addHandler(fh)
+    # Always add a stdout handler
+    sh = logging.StreamHandler(sys.stdout)
+    sh.setLevel(level)
+    sh.setFormatter(fmt)
+    logger.addHandler(sh)
+    return logfile
+def write_gff(outpath, genome_ID, genome_DNA, input_annotation, fmt, features):
+    """Write ``features`` to ``outpath`` as a GFF file.
+
+    Args:
+        outpath: Path of the GFF file to create (overwritten if it exists).
+        genome_ID: Not used by this function; kept for caller compatibility.
+        genome_DNA: Genome FASTA path for the header, or a falsy value to
+            omit the "Genome DNA File" header line.
+        input_annotation: Path of the original annotation file (header only).
+        fmt: Input format name, also written as the GFF source column.
+            Only 'abricate' is currently supported.
+        features: Mapping of comma-separated position strings (first field is
+            the start, last field the stop) to dicts providing 'seqid',
+            'strand', 'accession', 'database', 'identity', 'coverage',
+            'product' and 'resistance'.
+
+    Raises:
+        ValueError: If there are features to write and ``fmt`` is not
+            'abricate'. Raised before the output file is opened, so no
+            truncated file is left behind.
+    """
+    # Previously an unsupported format failed with UnboundLocalError on the
+    # first feature, after the header had already been written.
+    if features and fmt != 'abricate':
+        raise ValueError(
+            f"Unsupported format for GFF conversion: {fmt!r} (only 'abricate' is supported)")
+    with open(outpath, 'w', encoding='utf-8') as out:
+        out.write('##gff-version\t3\n')
+        out.write('#\tConvert_To_GFF\n')
+        out.write('#\tRun Date: ' + str(datetime.now()) + '\n')
+        # Only include genome DNA line if a path was provided
+        if genome_DNA:
+            out.write('##Genome DNA File:' + genome_DNA + '\n')
+        out.write('##Original File: ' + input_annotation + '\n')
+        for pos, data in features.items():
+            coords = pos.split(',')
+            start, stop = coords[0], coords[-1]
+            info = (
+                f"abricate_anotation;accession={data['accession']}"
+                f";database={data['database']}"
+                f";identity={data['identity']}"
+                f";coverage={data['coverage']}"
+                f";product={data['product']}"
+                f";resistance={data['resistance']}"
+            )
+            out.write(
+                f"{data['seqid']}\t{fmt}\tCDS\t{start}\t{stop}\t.\t{data['strand']}\t.\tID={info}\n")
+def load_genome(genome_fasta):
+    """Read a FASTA file and return ``(genome_ID, genome_seq)``.
+
+    ``genome_ID`` is the first whitespace-delimited token of the header line
+    with leading '>' characters removed, or ``'unknown'`` when the file has no
+    header. All sequence lines are concatenated into one string; if the file
+    holds several records, the ID of the last header is the one returned.
+    """
+    genome_ID = 'unknown'
+    # Collect pieces and join once instead of growing a string with '+='.
+    seq_parts = []
+    with open(genome_fasta, 'r') as fh:
+        for line in fh:
+            line = line.rstrip('\n')
+            if not line:
+                continue
+            if line.startswith('>'):
+                genome_ID = line.split()[0].lstrip('>')
+            else:
+                seq_parts.append(line)
+    return genome_ID, ''.join(seq_parts)
+def main():
+    """Command-line entry point: convert a tabular annotation file to GFF.
+
+    Parses the command-line options, configures logging, optionally loads the
+    genome FASTA, parses the annotation with ``TabToGFF`` and writes
+    ``<input basename>.gff`` into the output directory. Exits with status 1
+    when the genome file does not exist or the annotation cannot be parsed.
+    """
+    print("Thank you for using ORForise\nPlease report any issues to: https://github.com/NickJD/ORForise/issues\n#####")
+    parser = argparse.ArgumentParser(description='ORForise ' + ORForise_Version + ': Convert-To-GFF Run Parameters')
+    parser._action_groups.pop()
+    required = parser.add_argument_group('Required Arguments')
+    # Make genome DNA optional: if not provided we operate without genome sequence
+    required.add_argument('-dna', dest='genome_DNA', required=False, help='Genome DNA file (.fa)')
+    required.add_argument('-i', dest='input_annotation', required=True, help='Input annotation file (tabular)')
+    required.add_argument('-fmt', dest='format', required=True, help='Input format: blast, abricate, genemark')
+    required.add_argument('-o', dest='output_dir', required=True, help='Output directory')
+    optional = parser.add_argument_group('Optional Arguments')
+    optional.add_argument('-gi', dest='gene_ident', default='CDS', required=False, help='Gene identifier types to extract (unused)')
+    optional.add_argument('--verbose', dest='verbose', action='store_true', help='Verbose logging with logfile')
+    options = parser.parse_args()
+    # exist_ok avoids the check-then-create race of a separate exists() test
+    os.makedirs(options.output_dir, exist_ok=True)
+    logfile = setup_logging(options.output_dir, verbose=options.verbose)
+    logging.info('Starting Convert_To_GFF')
+    logging.info('Genome DNA: %s', options.genome_DNA or '(not provided)')
+    logging.info('Input annotation: %s', options.input_annotation)
+    logging.info('Format: %s', options.format)
+    # If a genome fasta was provided, load it; otherwise proceed without genome sequence
+    if options.genome_DNA:
+        if not os.path.exists(options.genome_DNA):
+            logging.error('Genome DNA file does not exist: %s', options.genome_DNA)
+            sys.exit(1)
+        genome_ID, genome_seq = load_genome(options.genome_DNA)
+    else:
+        # Derive a sensible genome_ID from the annotation filename and leave sequence empty
+        genome_ID = os.path.splitext(os.path.basename(options.input_annotation))[0]
+        genome_seq = ''
+    # Build genome map expected by TabToGFF: mapping genome_ID -> tuple(sequence, ...)
+    genome_map = {genome_ID: (genome_seq,)}
+    try:
+        features = TabToGFF(options.input_annotation, genome_map, options.gene_ident, fmt=options.format)
+    except Exception:
+        # Top-level boundary: record the full traceback, then exit non-zero
+        logging.exception('Error parsing input annotation')
+        sys.exit(1)
+    # Features are deliberately not sorted, to preserve the original order
+    basename = os.path.basename(options.input_annotation)
+    dot = basename.rfind('.')
+    stem = basename[:dot] if dot != -1 else basename
+    outgff = os.path.join(options.output_dir, stem + '.gff')
+    # Pass the original genome path if provided, else pass None so headers adapt
+    write_gff(outgff, genome_ID, options.genome_DNA or None, options.input_annotation, options.format, features)
+    logging.info('Wrote GFF to %s', outgff)
+    # A logfile only exists in verbose mode; do not report "Logfile: None"
+    if logfile:
+        logging.info('Logfile: %s', logfile)
+if __name__ == '__main__':
+    main()

ORForise 1.5.0__py3-none-any.whl → 1.6.0__py3-none-any.whl

ORForise 1.5.0py3-none-any.whl → 1.6.0py3-none-any.whl