PyamilySeq 1.3.1__py3-none-any.whl → 1.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- PyamilySeq/{Cluster_Compare.py → Group_Compare.py} +27 -13
- PyamilySeq/Group_Extractor.py +29 -12
- PyamilySeq/Group_Sizes.py +22 -8
- PyamilySeq/Group_Splitter.py +89 -29
- PyamilySeq/{Cluster_Summary.py → Group_Summary.py} +18 -20
- PyamilySeq/PyamilySeq.py +66 -43
- PyamilySeq/PyamilySeq_Genus.py +1 -1
- PyamilySeq/PyamilySeq_Species.py +30 -63
- PyamilySeq/Seq_Combiner.py +125 -15
- PyamilySeq/Seq_Extractor.py +24 -2
- PyamilySeq/Seq_Finder.py +20 -2
- PyamilySeq/clusterings.py +1 -1
- PyamilySeq/constants.py +142 -1
- PyamilySeq/utils.py +171 -84
- {pyamilyseq-1.3.1.dist-info → pyamilyseq-1.3.3.dist-info}/METADATA +14 -14
- pyamilyseq-1.3.3.dist-info/RECORD +21 -0
- {pyamilyseq-1.3.1.dist-info → pyamilyseq-1.3.3.dist-info}/WHEEL +1 -1
- {pyamilyseq-1.3.1.dist-info → pyamilyseq-1.3.3.dist-info}/entry_points.txt +4 -4
- PyamilySeq/config.py +0 -0
- pyamilyseq-1.3.1.dist-info/RECORD +0 -22
- {pyamilyseq-1.3.1.dist-info → pyamilyseq-1.3.3.dist-info}/licenses/LICENSE +0 -0
- {pyamilyseq-1.3.1.dist-info → pyamilyseq-1.3.3.dist-info}/top_level.txt +0 -0
PyamilySeq/PyamilySeq.py
CHANGED
@@ -1,6 +1,3 @@
-import argparse
-#from config import config_params
-
 try:
     from .PyamilySeq_Species import cluster as species_cluster
     from .PyamilySeq_Genus import cluster as genus_cluster
@@ -12,10 +9,11 @@ except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
     from constants import *
     from utils import *

-
-
+import traceback
+import sys

 def run_cd_hit(options, input_file, clustering_output, clustering_mode):
+    logger = logging.getLogger("PyamilySeq.PyamilySeq")
     cdhit_command = [
         clustering_mode,
         '-i', input_file,
@@ -29,14 +27,25 @@ def run_cd_hit(options, input_file, clustering_output, clustering_mode):
         '-sc', "1",
         '-sf', "1"
     ]
-
-
-
-
+    logger.debug("CD-HIT command: %s", " ".join(cdhit_command))
+    try:
+        if options.verbose:
+            ret = subprocess.run(cdhit_command)
+        else:
+            ret = subprocess.run(cdhit_command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+        if ret.returncode != 0:
+            logger.error("cd-hit returned non-zero exit code %s", ret.returncode)
+        else:
+            logger.info("cd-hit completed successfully: %s", clustering_output)
+    except Exception as e:
+        logger.exception("Failed to run cd-hit: %s", e)


 def main():
-
+    # Initial console-only logger so welcome and parser.description are logged before argparse outputs.
+    early_logger = configure_logger("PyamilySeq.PyamilySeq", enable_file=False, log_dir=None, verbose=False)
+    # Use LoggingArgumentParser so usage/errors are emitted via logger
+    parser = LoggingArgumentParser(logger_name="PyamilySeq.PyamilySeq")#, description="PyamilySeq entrypoint")

     # Add subparsers for Full and Partial modes
     subparsers = parser.add_subparsers(dest="run_mode", required=True, help="Choose a mode: 'Full' or 'Partial'.")
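The bootstrap code above relies on two helpers, configure_logger and LoggingArgumentParser, which arrive via the star imports from constants/utils and whose definitions are not shown in this excerpt (constants.py grows by 142 lines in this release, presumably where they live). A minimal sketch of what their call sites imply, with names and behaviour inferred rather than taken from the package:

    import argparse
    import logging
    import os
    import time

    def configure_logger(name, enable_file=False, log_dir=None, verbose=False):
        # Hypothetical reconstruction: console handler always; optional timestamped logfile.
        logger = logging.getLogger(name)
        logger.setLevel(logging.DEBUG if verbose else logging.INFO)
        logger.handlers.clear()  # keeps the early + post-parse calls from stacking handlers
        fmt = logging.Formatter("%(asctime)s %(name)s %(levelname)s: %(message)s")
        console = logging.StreamHandler()
        console.setFormatter(fmt)
        logger.addHandler(console)
        if enable_file:
            target = log_dir or os.getcwd()
            os.makedirs(target, exist_ok=True)
            logfile = os.path.join(target, name + "_" + time.strftime("%Y%m%d-%H%M%S") + ".log")
            handler = logging.FileHandler(logfile)
            handler.setFormatter(fmt)
            logger.addHandler(handler)
        return logger

    class LoggingArgumentParser(argparse.ArgumentParser):
        # Hypothetical reconstruction: route argparse usage/error text through the logger.
        def __init__(self, *args, logger_name=None, **kwargs):
            super().__init__(*args, **kwargs)
            self._logger = logging.getLogger(logger_name or __name__)

        def error(self, message):
            self._logger.error("%s", message)
            super().error(message)  # prints usage and exits with status 2

Whatever the real implementations do, main() calls configure_logger twice on the same logger name (once before and once after parse_args()), so handler setup needs to be idempotent, as the handlers.clear() line above illustrates.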
@@ -109,28 +118,28 @@ def main():
     subparser.add_argument("-T", type=int, default=8, dest="threads", required=False,
                            help="Number of threads for clustering/alignment - CD-HIT parameter '-T' | MAFFT parameter '--thread'.")

-
-
-
-
-
+    # Miscellaneous Arguments
+    # Global logging options (user controls logfile creation)
+    parser.add_argument("--log", action="store_true", dest="log", help="Create a timestamped logfile for this run.")
+    parser.add_argument("--log-dir", dest="log_dir", default=None,
+                        help="Directory for logfile (default: output dir or cwd).")
+    parser.add_argument("-verbose", action="store_true",
+                        help="Print verbose output.")
+    parser.add_argument("-v", "--version", action="version",
+                        version=f"PyamilySeq {PyamilySeq_Version}: Exiting.")

     # Parse Arguments
     options = parser.parse_args()
-    ## Configuration

+    # Setup logger once we know output paths/options
+    # after we resolve output_path / options.output_dir:
+    resolved_log_dir = options.log_dir if getattr(options, "log_dir", None) else (os.path.abspath(options.output_dir) if getattr(options, "output_dir", None) else os.getcwd())
+    logger = configure_logger("PyamilySeq.PyamilySeq", enable_file=getattr(options, "log", False), log_dir=resolved_log_dir, verbose=options.verbose)
+    logger.info("Running PyamilySeq %s in %s mode", PyamilySeq_Version, getattr(options, "run_mode", "N/A"))
+    if options.verbose:
+        logger.debug("Options: %s", vars(options))

-    if options.write_groups != None and options.write_individual_groups == False:
-        options.write_individual_groups = True

-    # Example of conditional logic based on selected mode
-    print(f"Running PyamilySeq {PyamilySeq_Version} in {options.run_mode} mode:")
-    if options.run_mode == "Full" and options.verbose == True:
-        print("Processing Full mode with options:", vars(options))
-    elif options.run_mode == "Partial" and options.verbose == True:
-        print("Processing Partial mode with options:", vars(options))
-
-    ### Checking all required parameters are provided by user #!!# Doesn't seem to work
     if options.run_mode == 'Full':
         options.clustering_format = 'CD-HIT'
         if getattr(options, 'reclustered', None) is not None:
@@ -145,6 +154,7 @@ def main():
             missing_options = [opt for opt in
                                ['input_type', 'input_dir', 'name_split_gff', 'clustering_format', 'pident', 'len_diff'] if
                                not options.__dict__.get(opt)]
+            logger.error("Missing required options for Full mode: %s", ', '.join(missing_options))
             sys.exit(f"Missing required options for Full mode: {', '.join(missing_options)}")
         if options.align_core:
             options.write_individual_groups = True
@@ -176,34 +186,28 @@ def main():
     ##MAFFT
     if options.align_core == True:
         if is_tool_installed('mafft'):
-
-            print("mafft is installed. Proceeding with alignment.")
+            logger.info("mafft is installed. Proceeding with alignment.")
         else:
+            logger.error("mafft is not installed. Please install mafft to proceed.")
             exit("mafft is not installed. Please install mafft to proceed.")
     ##CD-HIT
     if options.run_mode == 'Full':
         if is_tool_installed('cd-hit'):
-
-            print("cd-hit is installed. Proceeding with clustering.")
+            logger.info("cd-hit is installed. Proceeding with clustering.")
             if options.sequence_type == 'DNA':
                 clustering_mode = 'cd-hit-est'
             elif options.sequence_type == 'AA':
                 clustering_mode = 'cd-hit'
             if options.fast_mode == True:
                 options.fast_mode = 1
-
-                print("Running CD-HIT in fast mode.")
+                logger.info("Running CD-HIT in fast mode.")
             else:
                 options.fast_mode = 0
-
-                print("Running CD-HIT in accurate mode.")
+                logger.info("Running CD-HIT in accurate mode.")
         else:
+            logger.error("cd-hit is not installed. Please install cd-hit to proceed.")
             exit("cd-hit is not installed. Please install cd-hit to proceed.")

-
-    # if options.write_groups != None and options.original_fasta == False:
-    #     exit("-fasta must br provided if -w is used")
-
     if hasattr(options, 'cluster_file') and options.cluster_file:
         options.cluster_file = fix_path(options.cluster_file)
     if hasattr(options, 'reclustered') and options.reclustered:
@@ -308,10 +312,29 @@ def main():


     if options.group_mode == 'Species':
-
+        try:
+            species_cluster(clustering_options)
+            logger.info("Invoked species clustering.")
+        except FileNotFoundError as e:
+            logger.error("File not found during species clustering: %s", e)
+            logger.debug("Traceback:\n%s", traceback.format_exc())
+            sys.exit(1)
+        except Exception as e:
+            logger.error("Unexpected error during species clustering: %s", e)
+            logger.debug("Traceback:\n%s", traceback.format_exc())
+            sys.exit(1)
     elif options.group_mode == 'Genus':
-
-
+        try:
+            genus_cluster(clustering_options)
+            logger.info("Invoked genus clustering.")
+        except FileNotFoundError as e:
+            logger.error("File not found during genus clustering: %s", e)
+            logger.debug("Traceback:\n%s", traceback.format_exc())
+            sys.exit(1)
+        except Exception as e:
+            logger.error("Unexpected error during genus clustering: %s", e)
+            logger.debug("Traceback:\n%s", traceback.format_exc())
+            sys.exit(1)

     # Save arguments to a text file
     from datetime import datetime
@@ -319,9 +342,9 @@ def main():
         outfile.write(f"Timestamp: {datetime.now().isoformat()}\n")
         for arg, value in vars(options).items():
             outfile.write(f"{arg}: {value}\n")
+    logger.info("Saved parameters to %s", os.path.join(output_path, "PyamilySeq_params.txt"))
+

-    print("Thank you for using PyamilySeq -- A detailed user manual can be found at https://github.com/NickJD/PyamilySeq\n"
-          "Please report any issues to: https://github.com/NickJD/PyamilySeq/issues\n#####")

 if __name__ == "__main__":
     main()
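The reworked run_cd_hit above swaps a bare command invocation for a quiet-by-default subprocess.run call (child output only shown under -verbose) with return-code checking. The same pattern, distilled into a standalone sketch that runs without cd-hit installed (the placeholder command is the current Python interpreter, an assumption for demonstration only):

    import logging
    import subprocess
    import sys

    logging.basicConfig(level=logging.DEBUG)
    logger = logging.getLogger("demo")

    def run_quietly(command, verbose=False):
        # Suppress the child's stdout/stderr unless the caller asked for verbose output.
        logger.debug("command: %s", " ".join(command))
        try:
            if verbose:
                ret = subprocess.run(command)
            else:
                ret = subprocess.run(command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
            if ret.returncode != 0:
                logger.error("command returned non-zero exit code %s", ret.returncode)
            else:
                logger.info("command completed successfully")
        except Exception:
            # A FileNotFoundError here typically means the binary is not on PATH.
            logger.exception("failed to run command")

    run_quietly([sys.executable, "--version"])                # child output suppressed
    run_quietly([sys.executable, "--version"], verbose=True)  # child output visible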
PyamilySeq/PyamilySeq_Genus.py
CHANGED
@@ -88,7 +88,7 @@ def calc_single_First_extended_Second_only_core(cluster, First_num, cores, Second_num):
     except KeyError:
         cores['extended_genera_>'].append(cluster)
 #@profile
-def calc_multi_First_extended_Second_only_core(cluster, First_num, cores, Second_num): # Count seperately those gene families extended with
+def calc_multi_First_extended_Second_only_core(cluster, First_num, cores, Second_num): # Count seperately those gene families extended with StORF-Reporter but combined >1 PEP
     group = First_num + Second_num
     try:
         cores['combined_genera_' + str(group)].append(cluster)
PyamilySeq/PyamilySeq_Species.py
CHANGED
@@ -9,7 +9,7 @@ except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
     from utils import *


-def gene_presence_absence_output(options, genome_dict, 
+def gene_presence_absence_output(options, genome_dict,
                                  pangenome_clusters_First_sequences_sorted,
                                  combined_pangenome_clusters_First_Second_clustered=None,
                                  combined_pangenome_clusters_Second_sequences_sorted=None):
@@ -137,48 +137,6 @@ def gene_presence_absence_output(options, genome_dict, pangenome_clusters_First_sequences_sorted,
     if options.reclustered is not None:
         print(f"Merged Second cluster IDs: {len(merged_second_cluster_ids)}")

-    # def gene_presence_absence_output(options, genome_dict, pangenome_clusters_First_sorted, pangenome_clusters_First_sequences_sorted):
-    #     print("Outputting gene_presence_absence file")
-    #     output_dir = os.path.abspath(options.output_dir)
-    #     #in_name = options.clusters.split('.')[0].split('/')[-1]
-    #     gpa_outfile = os.path.join(output_dir, 'gene_presence_absence.csv')
-    #     gpa_outfile = open(gpa_outfile, 'w')
-    #     genome_dict = OrderedDict(sorted(genome_dict.items()))
-    #     gpa_outfile.write('"Gene","Non-unique Gene name","Annotation","No. isolates","No. sequences","Avg sequences per isolate","Genome Fragment","Order within Fragment",'
-    #                       '"Accessory Fragment","Accessory Order with Fragment","QC","Min group size nuc","Max group size nuc","Avg group size nuc","')
-    #     gpa_outfile.write('","'.join(genome_dict.keys()))
-    #     gpa_outfile.write('"\n')
-    #     for cluster, sequences in pangenome_clusters_First_sequences_sorted.items():
-    #         average_sequences_per_genome = len(sequences) / len(pangenome_clusters_First_sorted[cluster])
-    #         gpa_outfile.write('"group_'+str(cluster)+'","","","'+str(len(pangenome_clusters_First_sorted[cluster]))+'","'+str(len(sequences))+'","'+str(average_sequences_per_genome)+
-    #                           '","","","","","","","",""')
-    #
-    #
-    #         for genome in genome_dict.keys():
-    #             full_out = ''
-    #             tmp_list = []
-    #             for value in sequences:
-    #                 if value.split('|')[0] == genome:
-    #                     tmp_list.append(value.split('|')[1])
-    #             if tmp_list:
-    #                 full_out += ',"'+' '.join(tmp_list)+'"'
-    #             else:
-    #                 full_out = ',""'
-    #             gpa_outfile.write(full_out)
-    #         gpa_outfile.write('\n')
-
-    ### Below is some unfinished code
-    # edge_list_outfile = open(in_name+'_edge_list.csv','w')
-    # for cluster, sequences in pangenome_clusters_First_sequences_sorted.items():
-    #     output = []
-    #     for entry in sequences:
-    #         # Split each entry at '|'
-    #         genome, gene = entry.split('|')
-    #         # Format the result as "gene genome"
-    #         output.append(f"{gene}\t{genome}")
-    #     for line in output:
-    #         edge_list_outfile.write(line + '\n')
-



@@ -209,7 +167,7 @@ def get_cores(options,genome_dict):
             cores[only_second_core_group] = []
     return cores, groups

-
+
 def calc_First_only_core(cluster, First_num, groups, cores):
     groups_as_list = list(groups.values())
     for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= int(First_num) <= fir):
@@ -217,7 +175,7 @@ def calc_First_only_core(cluster, First_num, groups, cores):
         family_group = list(groups)[res]
         cores['First_core_'+family_group].append(cluster)

-
+
 def calc_single_First_extended_Second_only_core(cluster, First_num, groups, cores, Second_num): # Count gene families extended with StORFs
     groups_as_list = list(groups.values())
     for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= First_num+Second_num <= fir):
@@ -227,8 +185,8 @@ def calc_single_First_extended_Second_only_core(cluster, First_num, groups, cores, Second_num):
         cores['extended_core_' + family_group].append(cluster)


-
-def calc_multi_First_extended_Second_only_core(cluster, First_num, groups, cores, Second_num): # Count seperately those gene families extended with
+
+def calc_multi_First_extended_Second_only_core(cluster, First_num, groups, cores, Second_num): # Count seperately those gene families extended with StORF-Reporter but combined >1 PEP
     groups_as_list = list(groups.values())
     # Looping through the list to find the matching condition
     for idx, (sec, fir) in enumerate(groups_as_list):
@@ -239,7 +197,7 @@ def calc_multi_First_extended_Second_only_core(cluster, First_num, groups, cores, Second_num):
         cores['combined_core_' + family_group].append(cluster)


-
+
 def calc_Second_only_core(cluster, Second_num, groups, cores):
     groups_as_list = list(groups.values())
     for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= Second_num <= fir):
@@ -247,7 +205,7 @@ def calc_Second_only_core(cluster, Second_num, groups, cores):
         family_group = list(groups)[res]
         cores['Second_core_' + family_group].append(cluster)

-
+
 def calc_only_Second_only_core(cluster, Second_num, groups, cores): # only count the true storf onlies
     try:
         groups_as_list = list(groups.values())
@@ -259,7 +217,7 @@ def calc_only_Second_only_core(cluster, Second_num, groups, cores): # only count the true storf onlies
         sys.exit("Error in calc_only_Second_only_core")


-
+
 def cluster(options):

     if options.cluster_format == 'CD-HIT':
@@ -273,18 +231,17 @@ def cluster(options):
     cores, groups = get_cores(options, genome_dict)
     ###

-    if options.reclustered != None: #
+    if options.reclustered != None: # Combined clustering
         if options.cluster_format == 'CD-HIT':
             combined_pangenome_clusters_First_Second_clustered, not_Second_only_cluster_ids, combined_pangenome_clusters_Second, combined_pangenome_clusters_Second_sequences = combined_clustering_CDHIT(options, genome_dict, '|')
         elif 'TSV' in options.cluster_format or 'CSV' in options.cluster_format:
-            #Fix
             combined_pangenome_clusters_First_Second_clustered, not_Second_only_cluster_ids, combined_pangenome_clusters_Second, combined_pangenome_clusters_Second_sequences = combined_clustering_Edge_List(options, '|')

         pangenome_clusters_Type = combined_clustering_counting(options, pangenome_clusters_First, reps, combined_pangenome_clusters_First_Second_clustered, pangenome_clusters_First_genomes, combined_pangenome_clusters_Second, combined_pangenome_clusters_Second_sequences, '|')

         # Sort First clusters
         sorted_First_keys = sort_keys_by_values(pangenome_clusters_First, pangenome_clusters_First_sequences)
-        pangenome_clusters_First_sorted = reorder_dict_by_keys(pangenome_clusters_First, sorted_First_keys)
+        #pangenome_clusters_First_sorted = reorder_dict_by_keys(pangenome_clusters_First, sorted_First_keys)
         pangenome_clusters_First_sequences_sorted = reorder_dict_by_keys(pangenome_clusters_First_sequences, sorted_First_keys)
         pangenome_clusters_Type_sorted = reorder_dict_by_keys(pangenome_clusters_Type, sorted_First_keys)

@@ -296,7 +253,7 @@ def cluster(options):
     else:
         pangenome_clusters_Type = single_clustering_counting(pangenome_clusters_First, reps)
         sorted_First_keys = sort_keys_by_values(pangenome_clusters_First, pangenome_clusters_First_sequences)
-        pangenome_clusters_First_sorted = reorder_dict_by_keys(pangenome_clusters_First, sorted_First_keys)
+        #pangenome_clusters_First_sorted = reorder_dict_by_keys(pangenome_clusters_First, sorted_First_keys)
         pangenome_clusters_First_sequences_sorted = reorder_dict_by_keys(pangenome_clusters_First_sequences,
                                                                          sorted_First_keys)
         pangenome_clusters_Type_sorted = reorder_dict_by_keys(pangenome_clusters_Type, sorted_First_keys)
@@ -375,17 +332,16 @@ def cluster(options):
     if options.gene_presence_absence_out != False:
         if options.reclustered != None:
             # Pass both First and Second clustering data
-            gene_presence_absence_output(options, genome_dict, 
+            gene_presence_absence_output(options, genome_dict,
                                          pangenome_clusters_First_sequences_sorted,
                                          combined_pangenome_clusters_First_Second_clustered,
                                          combined_pangenome_clusters_Second_sequences_sorted)
         else:
             # Only First clustering data available
-            gene_presence_absence_output(options, genome_dict, 
-                                         pangenome_clusters_First_sequences_sorted)
+            gene_presence_absence_output(options, genome_dict, pangenome_clusters_First_sequences_sorted)


-    ###Need to fix this below. If full/partial the ifs need to be different. If full we first need to output the gfs then align. if -
+    ###Need to fix this below. If full/partial the ifs need to be different. If full we first need to output the gfs then align. if -write-groups not presented then it needs
     # to be done for alignment full anyway...

     genome_list = list(genome_dict.keys())
@@ -400,17 +356,24 @@ def cluster(options):
                 outfile.write('>group_'+str(cluster)+'\n')
                 wrapped_aa_seq = wrap_sequence(sequences[ids[0]], 60)
                 outfile.write(wrapped_aa_seq+'\n')
-        if options.write_groups != 
+        if options.write_groups != False:
             print("Outputting gene group FASTA files")
             #output_dir = os.path.dirname(os.path.abspath(options.output_dir))
             output_dir = os.path.join(options.output_dir, 'Gene_Groups_Output')
             write_groups_func(options,output_dir, key_order, cores, sequences,
                               pangenome_clusters_First_sequences_sorted, combined_pangenome_clusters_Second_sequences)

-        if options.align_core != 
+        if options.align_core != False:
             print("Processing gene group alignment")
             process_gene_groups(options, output_dir, None, None, genome_list, 'core_gene_alignment.aln')

+        if options.write_individual_groups == True:
+            output_dir = os.path.join(options.output_dir, 'Gene_Groups_Output')
+            write_individual_groups(options, output_dir, key_order, cores, sequences,
+                                    pangenome_clusters_First_sequences_sorted,
+                                    combined_pangenome_clusters_Second_sequences)
+
+
     elif options.run_mode == 'Partial':
         sequences = read_fasta(options.fasta)
         if options.reclustered == None:
@@ -432,16 +395,21 @@ def cluster(options):
                 outfile.write('>group_'+str(cluster)+'\n')
                 wrapped_aa_seq = wrap_sequence(sequences[ids[0]], 60)
                 outfile.write(wrapped_aa_seq+'\n')
-        if options.write_groups != 
+        if options.write_groups != False:
             print("Outputting gene group FASTA files")
             output_dir = os.path.join(options.output_dir, 'Gene_Groups_Output')
             write_groups_func(options,output_dir, key_order, cores, sequences,
                               pangenome_clusters_First_sequences_sorted, combined_pangenome_clusters_Second_sequences)

-        if options.align_core != 
+        if options.align_core != False:
             print("Processing gene group alignment")
             process_gene_groups(options, output_dir, None, None, genome_list, 'core_gene_alignment.aln')

+        if options.write_individual_groups == True:
+            output_dir = os.path.join(options.output_dir, 'Gene_Groups_Output')
+            write_individual_groups(options, output_dir, key_order, cores, sequences,
+                                    pangenome_clusters_First_sequences_sorted,
+                                    combined_pangenome_clusters_Second_sequences)


     #
@@ -461,4 +429,3 @@ def cluster(options):
     #
     #
     #
-
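Both branches of cluster() above now comment out pangenome_clusters_First_sorted, keeping only the sequence- and type-sorted views. The helpers involved, sort_keys_by_values and reorder_dict_by_keys, live in utils.py and are not shown in this diff; their apparent role is to order cluster IDs by family size so downstream output walks the largest groups first. A toy illustration with hypothetical stand-in implementations (not the package's own code):

    def sort_keys_by_values(clusters, cluster_sequences):
        # Stand-in: order cluster IDs by how many sequences each family holds, largest first.
        return sorted(clusters, key=lambda c: len(cluster_sequences[c]), reverse=True)

    def reorder_dict_by_keys(d, key_order):
        # Stand-in: rebuild a dict so iteration follows the given key order.
        return {k: d[k] for k in key_order}

    clusters = {'0': ['genomeA'], '1': ['genomeA', 'genomeB', 'genomeC'], '2': ['genomeA', 'genomeB']}
    cluster_seqs = {'0': ['genomeA|x1'],
                    '1': ['genomeA|y1', 'genomeB|y2', 'genomeC|y3'],
                    '2': ['genomeA|z1', 'genomeB|z2']}
    order = sort_keys_by_values(clusters, cluster_seqs)
    print(order)                                            # ['1', '2', '0']
    print(list(reorder_dict_by_keys(cluster_seqs, order)))  # keys now largest-family first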
PyamilySeq/Seq_Combiner.py
CHANGED
@@ -1,6 +1,3 @@
-import argparse
-
-
 try:
     from .constants import *
     from .utils import *
@@ -8,10 +5,94 @@ except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
     from constants import *
     from utils import *

+import threading
+import time
+import os
+from typing import Optional
+import re
+
+def count_matching_files(input_dir: str, name_split: Optional[str], extensions):
+    """
+    Count input files in input_dir that match the provided extensions and, if name_split supplied,
+    contain the name_split substring in the filename. This is used to compute total work units (files).
+    """
+    if not input_dir or not os.path.isdir(input_dir):
+        return 0
+    total = 0
+    for fname in os.listdir(input_dir):
+        low = fname.lower()
+        if any(low.endswith(ext) for ext in extensions):
+            if name_split:
+                if name_split in fname:
+                    total += 1
+            else:
+                total += 1
+    return total
+
+def count_files_present_in_combined(combined_file: str, name_split: Optional[str]) -> int:
+    """
+    Heuristic: count number of distinct input files (genomes) already present in the combined output.
+    Primary approach: parse headers and take the second '|' field (header.split('|')[1]) as genome/file id.
+    If that parsing fails, look for tokens containing name_split inside the header.
+    """
+    if not combined_file or not os.path.exists(combined_file):
+        return 0
+    seen = set()
+    try:
+        with open(combined_file, 'r') as fh:
+            for line in fh:
+                if not line.startswith('>'):
+                    continue
+                header = line[1:].strip()
+                # 1) Prefer headers like ">id|genome|rest" -> take genome (second field)
+                if '|' in header:
+                    parts = header.split('|')
+                    if len(parts) > 1 and parts[1]:
+                        seen.add(parts[1])
+                        continue
+                # 2) If name_split provided, look for a filename-like token that includes it
+                if name_split:
+                    match = re.search(r'([^\s/\\]*' + re.escape(name_split) + r'[^\s/\\]*)', header)
+                    if match:
+                        token = os.path.basename(match.group(1))
+                        seen.add(token)
+                        continue
+                # 3) If nothing matched, skip this header (avoids per-sequence overcounting)
+    except Exception:
+        return 0
+    return len(seen)
+
+# Helpers for progress reporting

+def progress_reporter(stop_event, logger, total_files, combined_file, name_split=None, interval=10):
+    """
+    Periodically log progress. Preference: count headers in combined_file.
+    Falls back to simple heartbeat if combined_file isn't yet created.
+    """
+    start = time.time()
+    while not stop_event.is_set():
+        # Use number of distinct input files represented in the combined output for "processed"
+        processed = count_files_present_in_combined(combined_file, name_split) if combined_file else 0
+        # Cap processed to total_files (prevents >100%)
+        if total_files > 0 and processed > total_files:
+            processed = total_files
+        pct = (processed / total_files * 100) if total_files > 0 else 0.0
+        elapsed = time.time() - start
+        logger.info("Progress: %d/%d processed (%.1f%%). Elapsed: %.0fs", processed, total_files, pct, elapsed)
+        # Wait with early exit support
+        stop_event.wait(interval)
+    # Final log when exiting
+    processed = count_files_present_in_combined(combined_file, name_split) if combined_file else 0
+    if total_files > 0 and processed > total_files:
+        processed = total_files
+    pct = (processed / total_files * 100) if total_files > 0 else 0.0
+    elapsed = time.time() - start
+    logger.info("Final progress: %d/%d processed (%.1f%%). Total elapsed: %.0fs", processed, total_files, pct, elapsed)

 def main():
-
+    # Early console-only logger so parser.description is logged before help/usage.
+    early_logger = configure_logger("PyamilySeq.Seq_Combiner", enable_file=False, log_dir=None, verbose=False)
+    parser = LoggingArgumentParser(logger_name="PyamilySeq.Seq_Combiner", description='Running Seq-Combiner - A tool to extract sequences from GFF/FASTA files and prepare them for PyamilySeq.')
     ### Required Arguments
     required = parser.add_argument_group('Required Arguments')
     required.add_argument('-input_dir', action='store', dest='input_dir',
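The count_files_present_in_combined heuristic above hinges on the header convention of the combined FASTA: the genome/file identifier is expected in the second '|'-delimited field. A small self-contained check of that behaviour (toy data only):

    import tempfile

    records = [
        ">gene_001|genomeA|CDS", "ATGAAA",
        ">gene_002|genomeA|CDS", "ATGCCC",
        ">gene_003|genomeB|CDS", "ATGTTT",
    ]
    with tempfile.NamedTemporaryFile("w", suffix=".fasta", delete=False) as tmp:
        tmp.write("\n".join(records) + "\n")

    # Two distinct genomes appear in the headers, however many sequences each contributes.
    print(count_files_present_in_combined(tmp.name, None))  # -> 2

This is also why the function deliberately skips headers it cannot parse: counting every '>' line instead would report sequences rather than input files, and the progress reporter would overshoot its total.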
@@ -47,37 +128,66 @@ def main():
     misc.add_argument("-v", "--version", action="version",
                       version=f"PyamilySeq: Seq-Combiner version {PyamilySeq_Version} - Exiting",
                       help="Print out version number and exit")
+    parser.add_argument("--log", action="store_true", dest="log", help="Create a timestamped logfile for this run.")
+    parser.add_argument("--log-dir", dest="log_dir", default=None, help="Directory for logfile (default: output_dir).")

     options = parser.parse_args()

+    # Setup logger for Seq-Combiner
+    output_path = os.path.abspath(options.output_dir)
+    if not os.path.exists(output_path):
+        os.makedirs(output_path)
+    log_dir = options.log_dir if getattr(options, "log_dir", None) else output_path
+    logger = configure_logger("PyamilySeq.Seq_Combiner", enable_file=getattr(options, "log", False), log_dir=log_dir, verbose=False)
+
+    # --- Progress reporting setup ------------------------------------------------
+    combined_out_file = os.path.join(output_path, options.output_file)
+    # Determine name_split and extensions per mode and count matching input files as total work units
+    if options.input_type == 'fasta':
+        name_split = options.name_split_fasta
+        exts = ('.fasta', '.fa', '.fna')
+    else:  # 'separate' or 'combined'
+        name_split = options.name_split_gff
+        exts = ('.gff', '.gff3', '.gff.gz', '.gff3.gz')
+
+    total_work = count_matching_files(options.input_dir, name_split, exts)
+    logger.info("Found %d input files (matching pattern) to process in %s", total_work, options.input_dir)
+
+    stop_event = threading.Event()
+    reporter_thread = threading.Thread(target=progress_reporter, args=(stop_event, logger, total_work, combined_out_file, name_split, 10), daemon=True)
+    reporter_thread.start()
+    # ---------------------------------------------------------------------------

     if options.input_type == 'separate' and options.name_split_gff is None:
+        logger.error("Please provide a substring to split the filename and extract the genome name.")
         print("Please provide a substring to split the filename and extract the genome name.")
         exit(1)
     if options.input_type == 'combined' and options.name_split_gff is None:
+        logger.error("Please provide a substring to split the filename and extract the genome name.")
         print("Please provide a substring to split the filename and extract the genome name.")
         exit(1)
     if options.input_type == 'fasta' and options.name_split_fasta is None:
+        logger.error("Please provide a substring to split the filename and extract the genome name.")
         print("Please provide a substring to split the filename and extract the genome name.")
         exit(1)

-    output_path = os.path.abspath(options.output_dir)
-    if not os.path.exists(output_path):
-        os.makedirs(output_path)
-
-    #output_file = options.output_file + '.fasta'
-    if os.path.exists(os.path.join(output_path, options.output_file)):
-        print(f"Output file {options.output_file} already exists in the output directory. Please delete or rename the file and try again.")
-        exit(1)
-
-    combined_out_file = os.path.join(output_path, options.output_file )
-
     if options.input_type == 'separate':
+        logger.info("Processing 'separate' input_type from %s", options.input_dir)
         read_separate_files(options.input_dir, options.name_split_gff, options.name_split_fasta, options.gene_ident, combined_out_file, options.translate, True)
     elif options.input_type == 'combined':
+        logger.info("Processing 'combined' input_type from %s", options.input_dir)
         read_combined_files(options.input_dir, options.name_split_gff, options.gene_ident, combined_out_file, options.translate, True)
     elif options.input_type == 'fasta':
+        logger.info("Processing 'fasta' input_type from %s", options.input_dir)
         read_fasta_files(options.input_dir, options.name_split_fasta, combined_out_file, options.translate, True)
+    logger.info("Seq-Combiner completed.")
+
+    # Stop reporter and wait for final log
+    stop_event.set()
+    reporter_thread.join(timeout=5)
+    # Final summary: count number of input files represented (heuristic)
+    final_files = count_files_present_in_combined(combined_out_file, name_split)
+    logger.info("Completed combining. Final combined file: %s (input files represented: %d)", combined_out_file, final_files)

 if __name__ == "__main__":
     main()
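In main() above, the reporter runs on a daemon thread and is shut down through an Event, with stop_event.wait(interval) doubling as an interruptible sleep so the final progress line appears as soon as combining finishes rather than up to a full interval later. The wiring, reduced to its essentials:

    import logging
    import threading
    import time

    logging.basicConfig(level=logging.INFO)
    log = logging.getLogger("reporter-demo")

    def heartbeat(stop_event, interval=1):
        while not stop_event.is_set():
            log.info("working...")
            stop_event.wait(interval)  # returns early the moment stop_event is set
        log.info("final report")

    stop = threading.Event()
    t = threading.Thread(target=heartbeat, args=(stop,), daemon=True)
    t.start()
    time.sleep(2.5)   # stand-in for the real combining work
    stop.set()        # ask the reporter to emit its final line and exit
    t.join(timeout=5)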
PyamilySeq/Seq_Extractor.py
CHANGED
@@ -1,5 +1,12 @@
-
+
 import copy
+import os
+
+# Use centralised logger factory
+try:
+    from .constants import configure_logger, LoggingArgumentParser
+except Exception:
+    from constants import configure_logger, LoggingArgumentParser

 def find_gene_ids_in_csv(csv_file, group_name):
     """Find gene IDs associated with the specified group name in the CSV file, starting from column 14."""
@@ -39,7 +46,10 @@ def extract_sequences(fasta_file, gene_ids):
     return sequences

 def main():
-
+    # Early console-only logger so parser.description appears in logger output before argparse prints the menu.
+    early_logger = configure_logger("PyamilySeq.Seq_Extractor", enable_file=False, log_dir=None, verbose=False)
+    parser = LoggingArgumentParser(logger_name="PyamilySeq.Seq_Extractor", description="Running Seq-Extractor - A tool to extract sequences for specified group name from CSV file and corresponding FASTA file.")
+
     parser.add_argument("-csv", action='store', dest='csv_file',
                         help="CSV file containing group data", required=True)
     parser.add_argument("-group", action='store', dest='group_name',
@@ -48,22 +58,34 @@ def main():
                         help="Input FASTA file containing sequences", required=True)
     parser.add_argument("-out", action='store', dest='output_file',
                         help="Output FASTA file with extracted sequences", required=True)
+    parser.add_argument("--log", action="store_true", dest="log", help="Create a timestamped logfile for this run.")
+    parser.add_argument("--log-dir", dest="log_dir", default=None, help="Directory for logfile (default: dir of output_file).")

     options = parser.parse_args()

+    # Setup logger
+    out_dir = os.path.abspath(os.path.dirname(options.output_file)) if options.output_file else os.getcwd()
+    log_dir = options.log_dir if getattr(options, "log_dir", None) else out_dir
+    logger = configure_logger("PyamilySeq.Seq_Extractor", enable_file=getattr(options, "log", False), log_dir=log_dir, verbose=False)
+
+    logger.info("Searching for gene IDs in CSV %s for group %s", options.csv_file, options.group_name)
+
     # Find gene IDs in CSV
     gene_ids = find_gene_ids_in_csv(options.csv_file, options.group_name)
     if not gene_ids:
+        logger.warning("No gene IDs found for group name '%s' in the CSV.", options.group_name)
         print(f"No gene IDs found for group name '{options.group_name}' in the CSV.")
         return

     # Extract sequences from the FASTA file
+    logger.info("Extracting sequences from FASTA: %s", options.fasta_file)
    sequences = extract_sequences(options.fasta_file, gene_ids)

     # Write matched sequences to the output FASTA file
     with open(options.output_file, 'w') as output:
         for gene_id, sequence_lines in sequences.items():
             output.write("\n".join(sequence_lines) + "\n")
+    logger.info("Wrote %d sequences to %s", len(sequences), options.output_file)

 if __name__ == "__main__":
     main()