PyamilySeq-1.3.2-py3-none-any.whl → PyamilySeq-1.3.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- PyamilySeq/Group_Compare.py +27 -13
- PyamilySeq/Group_Extractor.py +29 -12
- PyamilySeq/Group_Sizes.py +22 -8
- PyamilySeq/Group_Splitter.py +89 -29
- PyamilySeq/Group_Summary.py +18 -20
- PyamilySeq/PyamilySeq.py +66 -43
- PyamilySeq/PyamilySeq_Genus.py +1 -1
- PyamilySeq/PyamilySeq_Species.py +30 -63
- PyamilySeq/Seq_Combiner.py +125 -15
- PyamilySeq/Seq_Extractor.py +24 -2
- PyamilySeq/Seq_Finder.py +20 -2
- PyamilySeq/clusterings.py +1 -1
- PyamilySeq/constants.py +142 -1
- PyamilySeq/utils.py +171 -84
- {pyamilyseq-1.3.2.dist-info → pyamilyseq-1.3.3.dist-info}/METADATA +11 -11
- pyamilyseq-1.3.3.dist-info/RECORD +21 -0
- {pyamilyseq-1.3.2.dist-info → pyamilyseq-1.3.3.dist-info}/WHEEL +1 -1
- PyamilySeq/config.py +0 -0
- pyamilyseq-1.3.2.dist-info/RECORD +0 -22
- {pyamilyseq-1.3.2.dist-info → pyamilyseq-1.3.3.dist-info}/entry_points.txt +0 -0
- {pyamilyseq-1.3.2.dist-info → pyamilyseq-1.3.3.dist-info}/licenses/LICENSE +0 -0
- {pyamilyseq-1.3.2.dist-info → pyamilyseq-1.3.3.dist-info}/top_level.txt +0 -0
PyamilySeq/Seq_Finder.py
CHANGED
@@ -1,6 +1,12 @@
-import argparse
 import collections
 import csv
+import os
+
+# Use centralised logger factory
+try:
+    from .constants import configure_logger, LoggingArgumentParser
+except Exception:
+    from constants import configure_logger, LoggingArgumentParser


 def parse_fasta_ids(fasta_file):
@@ -29,16 +35,27 @@ def find_ids_in_csv(ids, csv_file):


 def main():
-
+    # Early console-only logger so the parser description and argparse messages are logged via logger.
+    early_logger = configure_logger("PyamilySeq.Seq_Finder", enable_file=False, log_dir=None, verbose=False)
+    parser = LoggingArgumentParser(logger_name="PyamilySeq.Seq_Finder", description="Running Seq-Finder: A tool to extract IDs from a FASTA file and search for them in a CSV file.")
+
     parser.add_argument("-in", action='store', dest='fasta_file',
                         help="Input FASTA file", required=True)
     parser.add_argument("-ids", action='store', dest='csv_file',
                         help="CSV file containing IDs to search for", required=True)
     parser.add_argument("-out", action='store', dest='output_file',
                         help="Output file to save found IDs", required=True)
+    parser.add_argument("--log", action="store_true", dest="log", help="Create a timestamped logfile for this run.")
+    parser.add_argument("--log-dir", dest="log_dir", default=None, help="Directory for logfile (default: dir of output_file).")

     options = parser.parse_args()

+    # Setup logger
+    out_dir = os.path.abspath(os.path.dirname(options.output_file)) if options.output_file else os.getcwd()
+    log_dir = options.log_dir if getattr(options, "log_dir", None) else out_dir
+    logger = configure_logger("PyamilySeq.Seq_Finder", enable_file=getattr(options, "log", False), log_dir=log_dir, verbose=False)
+
+    logger.info("Parsing FASTA IDs from %s", options.fasta_file)
     # Parse IDs from the FASTA file
     ids = parse_fasta_ids(options.fasta_file)

@@ -50,6 +67,7 @@ def main():
             output.write("ID,Found_In_First_Column\n")
             for seq_id, found_in in found_records.items():
                 output.write(f"{seq_id},{found_in}\n")
+    logger.info("Wrote found records for %d IDs to %s", len(found_records), options.output_file)


 if __name__ == "__main__":
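The two-stage setup above — a console-only logger created before the parser so the banner and description are captured, then a second configure_logger call once the output path and --log flags are known — is the pattern this release applies across its tools. A minimal sketch of the flow (the tool name "PyamilySeq.Example_Tool" is illustrative, not part of the package):

    import os
    from PyamilySeq.constants import configure_logger, LoggingArgumentParser

    def main():
        # Stage 1: console-only logger; LoggingArgumentParser logs its description on creation.
        configure_logger("PyamilySeq.Example_Tool", enable_file=False, log_dir=None, verbose=False)
        parser = LoggingArgumentParser(logger_name="PyamilySeq.Example_Tool",
                                       description="Example tool wired to the centralised logger factory.")
        parser.add_argument("-out", dest="output_file", required=True)
        parser.add_argument("--log", action="store_true", dest="log")
        parser.add_argument("--log-dir", dest="log_dir", default=None)
        options = parser.parse_args()

        # Stage 2: reconfigure once the output directory is known; handlers are rebuilt, not stacked.
        out_dir = os.path.abspath(os.path.dirname(options.output_file)) if options.output_file else os.getcwd()
        logger = configure_logger("PyamilySeq.Example_Tool",
                                  enable_file=options.log,
                                  log_dir=options.log_dir if options.log_dir else out_dir)
        logger.info("Ready: file logging %s", "on" if options.log else "off")

    if __name__ == "__main__":
        main()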
PyamilySeq/clusterings.py
CHANGED
@@ -441,7 +441,7 @@ def combined_clustering_Edge_List(options, splitter):
                 combined_pangenome_clusters_Second_sequences[str(cluster_id)].append(child)
             else:
                 if str(cluster_id) not in not_Second_only_cluster_ids:
-                    not_Second_only_cluster_ids.append(str(cluster_id)) # Tell us which
+                    not_Second_only_cluster_ids.append(str(cluster_id)) # Tell us which StORF-Reporter clustered are unmatched to a PEP
                 if child_taxa not in combined_pangenome_clusters_First[str(cluster_id)]:
                     combined_pangenome_clusters_First[str(cluster_id)].append(child_taxa)
                     combined_pangenome_clusters_First_sequences[str(cluster_id)].append(child)
PyamilySeq/constants.py
CHANGED
@@ -1,2 +1,143 @@
-
+import logging
+import os
+import sys
+import argparse
+from datetime import datetime
+from io import StringIO
+import re

+PyamilySeq_Version = 'v1.3.3'
+WELCOME = f"Thank you for using PyamilySeq {PyamilySeq_Version} - A tool for gene clustering and pangenome analysis."
+CITATION = "Please Cite PyamilySeq: https://doi.org/10.1093/nargab/lqaf198"
+ISSUE = "Please report any issues to: https://github.com/NickJD/PyamilySeq/issues"
+
+def configure_logger(logger_name, enable_file=False, log_dir=None, level=logging.INFO, verbose=False):
+    """
+    Create and return a configured logger.
+    - logger_name: full logger name (e.g. "PyamilySeq.Group_Splitter")
+    - enable_file: if True, create a timestamped logfile in log_dir
+    - log_dir: directory for logfile (defaults to cwd)
+    - level: console log level (default INFO)
+    - verbose: if True, sets console level to DEBUG and file to DEBUG
+    """
+    logger = logging.getLogger(logger_name)
+    # Clear previous handlers to avoid duplicate logs on repeated imports/runs
+    if logger.hasHandlers():
+        logger.handlers.clear()
+
+    # Determine levels
+    console_level = logging.DEBUG if verbose else level
+    logger.setLevel(logging.DEBUG if verbose else level)
+
+    # Formatter without logger name (keeps output clean)
+    formatter = logging.Formatter("%(asctime)s %(levelname)s: %(message)s")
+
+    # Console handler -> write to stdout by default
+    ch = logging.StreamHandler(sys.stdout)
+    ch.setLevel(console_level)
+    ch.setFormatter(formatter)
+    logger.addHandler(ch)
+
+    # Optional file handler
+    file_handler = None
+    if enable_file:
+        if not log_dir:
+            log_dir = os.getcwd()
+        os.makedirs(log_dir, exist_ok=True)
+        ts = datetime.now().strftime("%Y%m%d_%H%M%S")
+        # Use short tool name derived from logger_name for the filename
+        safe_name = logger_name.split('.')[-1]
+        file_name = f"{safe_name}-{ts}.log"
+        fh = logging.FileHandler(os.path.join(log_dir, file_name))
+        fh.setLevel(logging.DEBUG) # file always capture debug for diagnostics
+        fh.setFormatter(formatter)
+        logger.addHandler(fh)
+        file_handler = fh
+        logger.debug("File logging enabled: %s", os.path.join(log_dir, file_name))
+
+    # Standard startup banner for all tools (printed once per logger instance)
+    # If banner hasn't been printed at all, log it normally (will go to console and file if present).
+    if not getattr(logger, "_welcome_printed", False):
+        logger.info("%s", WELCOME)
+        logger.info("%s", CITATION)
+        logger.info("%s", ISSUE)
+        setattr(logger, "_welcome_printed", True)
+        # Mark that banner also written to file if file handler exists
+        if file_handler:
+            # Also write formatted lines directly into the file to guarantee presence
+            try:
+                for msg in (WELCOME, CITATION, ISSUE):
+                    rec = logging.LogRecord(logger.name, logging.INFO, "", 0, msg, None, None)
+                    formatted = formatter.format(rec)
+                    # write to file handler's stream and flush
+                    try:
+                        file_handler.stream.write(formatted + "\n")
+                        file_handler.stream.flush()
+                    except Exception:
+                        # Best-effort; ignore write errors
+                        pass
+                setattr(logger, "_welcome_file_written", True)
+            except Exception:
+                pass
+    else:
+        # Banner already printed (likely to console by an early logger). Ensure it is written to file
+        # if file logging was just enabled and it hasn't yet been written to file.
+        if file_handler and not getattr(logger, "_welcome_file_written", False):
+            # Write banner lines directly into the file handler's stream (avoid duplicating console output).
+            try:
+                for msg in (WELCOME, CITATION, ISSUE):
+                    rec = logging.LogRecord(logger.name, logging.INFO, "", 0, msg, None, None)
+                    formatted = formatter.format(rec)
+                    try:
+                        file_handler.stream.write(formatted + "\n")
+                        file_handler.stream.flush()
+                    except Exception:
+                        pass
+                setattr(logger, "_welcome_file_written", True)
+            except Exception:
+                pass
+
+    return logger
+
+# ArgumentParser subclass that logs usage/help/errors via the logger
+class LoggingArgumentParser(argparse.ArgumentParser):
+    def __init__(self, *args, logger_name=None, **kwargs):
+        super().__init__(*args, **kwargs)
+        # If logger_name provided, use that logger; otherwise use root logger
+        self._logger = logging.getLogger(logger_name) if logger_name else logging.getLogger()
+        # Emit the parser description immediately on creation so it appears for normal runs
+        # (tools create an early console-only logger before constructing the parser).
+        if getattr(self, 'description', None):
+            try:
+                self._logger.info("%s", str(self.description))
+            except Exception:
+                # If logging fails for any reason, swallow to avoid breaking parser creation.
+                pass
+
+    def print_usage(self, file=None):
+        # Preserve default usage printing to console; description already logged at init.
+        super().print_usage(file)
+
+    def print_help(self, file=None):
+        # Capture help output, strip description (already logged), and print the rest to console.
+        sio = StringIO()
+        super().print_help(sio)
+        help_text = sio.getvalue()
+        if self.description:
+            pattern = re.escape(str(self.description)) + r'(\r?\n){1,2}'
+            help_text = re.sub(pattern, '', help_text, count=1)
+        out_file = file if file is not None else sys.stdout
+        out_file.write(help_text)

+    def exit(self, status=0, message=None):
+        # Preserve argparse behaviour by writing any exit message to stderr and exiting.
+        if message:
+            sys.stderr.write(message)
+        raise SystemExit(status)
+
+    def error(self, message):
+        # Print usage to stderr (as argparse does) and log a concise error message via logger.
+        super().print_usage(sys.stderr)
+        prog = self.prog if hasattr(self, 'prog') else ''
+        self._logger.error("%s: error: %s", prog, message)
+        self.exit(2)
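Two behaviours of configure_logger are worth noting from the implementation above: repeated calls for the same logger name clear and rebuild handlers rather than stacking them, and the WELCOME/CITATION/ISSUE banner is emitted only once per logger object (via the _welcome_printed / _welcome_file_written flags), even when file logging is enabled by a later call. A short usage sketch (the log directory is illustrative):

    from PyamilySeq.constants import configure_logger

    # First call: console handler only; the banner is logged here, once.
    log = configure_logger("PyamilySeq.Demo", enable_file=False)
    log.info("console only")

    # Second call: handlers are cleared and rebuilt with a timestamped FileHandler;
    # the banner is not re-logged to the console but is written into the new logfile.
    log = configure_logger("PyamilySeq.Demo", enable_file=True, log_dir="logs", verbose=True)
    log.debug("with verbose=True this reaches both the console and the logfile")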
PyamilySeq/utils.py
CHANGED
@@ -7,6 +7,25 @@ from tempfile import NamedTemporaryFile
 import sys
 import re
 import math
+import logging
+
+logger = logging.getLogger("PyamilySeq")  # Use the shared top-level PyamilySeq logger so all utils logs propagate to the same handlers
+
+_startup_messages_pending = []
+
+
+def emit_pending_startup_messages():
+    global _startup_messages_pending
+    if _startup_messages_pending:
+        try:
+            for msg in _startup_messages_pending:
+                try:
+                    logger.info("%s", msg)
+                except Exception:
+                    # swallow any logging errors to avoid breaking flow
+                    pass
+        finally:
+            _startup_messages_pending.clear()

 ####
 # Placeholder for the distance function
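The _startup_messages_pending queue exists because utils.py is imported — and the Levenshtein fallback below is chosen — before any tool has called configure_logger, so a logger.info() at import time would be dropped for lack of handlers. The entry points (run_mafft_on_sequences, the read_* readers, write_groups_func, process_gene_groups) therefore call emit_pending_startup_messages() first, flushing any queued notices once handlers exist. A minimal sketch of the ordering:

    from PyamilySeq import utils
    from PyamilySeq.constants import configure_logger

    # Import time: the queue may already hold the Levenshtein notice, but nothing is printed.
    configure_logger("PyamilySeq")           # attach handlers to the shared "PyamilySeq" logger
    utils.emit_pending_startup_messages()    # queued notices now appear, then the queue is cleared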
@@ -18,7 +37,8 @@ try:
     def levenshtein_distance_calc(seq1, seq2):
         return LV.distance(seq1, seq2)
 except (ModuleNotFoundError, ImportError):
-
+    # Save the notice for later emission (after logger handlers are configured)
+    _startup_messages_pending.append("Levenshtein package not installed - Will fallback to slower Python implementation.")
     # Fallback implementation
     def levenshtein_distance_calc(seq1, seq2):
         # Slower Python implementation of Levenshtein distance
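The fallback body itself is unchanged and not shown in this hunk; for reference, a typical pure-Python dynamic-programming Levenshtein of the kind the comment describes (a sketch, not necessarily the package's exact implementation):

    def levenshtein_distance_calc(seq1, seq2):
        # Classic two-row dynamic programming; O(len(seq1) * len(seq2)) time.
        if len(seq1) < len(seq2):
            seq1, seq2 = seq2, seq1
        previous = list(range(len(seq2) + 1))
        for i, c1 in enumerate(seq1, start=1):
            current = [i]
            for j, c2 in enumerate(seq2, start=1):
                current.append(min(previous[j] + 1,                    # deletion
                                   current[j - 1] + 1,                 # insertion
                                   previous[j - 1] + (c1 != c2)))      # substitution
            previous = current
        return previous[-1]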
@@ -62,6 +82,11 @@ codon_table = {
     'TAC':'Y', 'TAT':'Y', 'TAA':'*', 'TAG':'*',
     'TGC':'C', 'TGT':'C', 'TGA':'*', 'TGG':'W'}

+# Temp fix
+codon_table['TAA'] = ''
+codon_table['TGA'] = ''
+codon_table['TAG'] = ''
+
 def translate_frame(sequence):
     translate = ''.join([codon_table.get(sequence[3 * i:3 * i + 3], 'X') for i in range(len(sequence) // 3)])
     return translate
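The net effect of the "Temp fix" is that translate_frame now silently drops stop codons instead of emitting '*':

    from PyamilySeq.utils import translate_frame

    print(translate_frame('ATGTAAATG'))  # Met, stop (TAA), Met -> 'MM' with the override; 'M*M' without it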
@@ -94,10 +119,15 @@ def translate_dna_to_aa(dna_fasta, aa_fasta):


 def detect_sequence_type(fasta_file):
-
+    import gzip
+    opener = gzip.open if str(fasta_file).lower().endswith('.gz') else open
+    with opener(fasta_file, 'rt') as f:
         for line in f:
             if line.startswith('>'):
                 continue
+            line = line.strip().upper()
+            if not line:
+                continue
             if any(base in line for base in 'EFILPQZ'):
                 return False # Contains amino acids
     return True # Contains DNA
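detect_sequence_type now opens gzipped FASTA transparently (via the .gz suffix check), upper-cases each record line, and skips blanks before applying the heuristic: any of the amino-acid-only letters E, F, I, L, P, Q, Z marks the file as protein (returns False), otherwise it is treated as DNA (returns True). Illustrative calls (filenames hypothetical):

    from PyamilySeq.utils import detect_sequence_type

    detect_sequence_type('genes_dna.fasta')       # True  -> DNA
    detect_sequence_type('proteins_aa.fasta.gz')  # False -> amino acids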
@@ -105,7 +135,6 @@ def detect_sequence_type(fasta_file):

 def is_tool_installed(tool_name):
     """Check if a tool is installed and available in PATH."""
-    # Check if the tool is in the system PATH
     if shutil.which(tool_name) is None:
         return False

@@ -119,7 +148,9 @@ def is_tool_installed(tool_name):
     return False # This shouldn't happen due to the earlier check

 def reverse_complement(seq):
-    complement = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G', 'N': 'N'
+    complement = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G', 'N': 'N','R': 'Y',
+                  'Y': 'R', 'S': 'S', 'W': 'W', 'K': 'M', 'M': 'K', 'V': 'B',
+                  'B': 'V', 'H': 'D', 'D': 'H'}
     return ''.join(complement[base] for base in reversed(seq))

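With the expanded table, IUPAC ambiguity codes are reverse-complemented to their correct partners instead of raising KeyError on anything outside ACGTN, e.g.:

    from PyamilySeq.utils import reverse_complement

    print(reverse_complement('ATGCRYSWKMBDHVN'))  # -> 'NBDHVKMWSRYGCAT'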
@@ -196,8 +227,8 @@ def select_longest_gene(sequences, subgrouped):


 def run_mafft_on_sequences(options, sequences, output_file):
-    #print("Conducting MAFFT alignment.")
     """Run mafft on the given sequences and write to output file."""
+    emit_pending_startup_messages()
     # Create a temporary input file for mafft
     with NamedTemporaryFile('w', delete=False) as temp_input_file:
         for header, sequence in sequences.items():
@@ -207,14 +238,13 @@ def run_mafft_on_sequences(options, sequences, output_file):
     # Run mafft
     try:
         with open(output_file, 'w') as output_f:
-            if options
+            if getattr(options, "verbose", False):
                 subprocess.run(
                     ['mafft', '--auto', '--thread', str(options.threads), temp_input_file_path],
                     stdout=output_f,
                     stderr=sys.stderr,
                     check=True
                 )
-
             else:
                 subprocess.run(
                     ['mafft', '--auto', '--thread', str(options.threads), temp_input_file_path],
@@ -223,22 +253,28 @@ def run_mafft_on_sequences(options, sequences, output_file):
                     check=True
                 )
     finally:
-
-
+        try:
+            os.remove(temp_input_file_path) # Clean up the temporary file
+        except Exception:
+            pass



 def read_separate_files(input_dir, name_split_gff, name_split_fasta, gene_ident, combined_out, translate, run_as_combiner):
+    emit_pending_startup_messages()
     if run_as_combiner == True:
-
+        combined_out_file_aa_path = None
     else:
-
+        combined_out_file_aa_path = combined_out.replace('_dna.fasta','_aa.fasta')

-
+    # Open actual AA file or os.devnull based on whether we need an AA file path
+    aa_handle = open(combined_out_file_aa_path, 'w') if combined_out_file_aa_path else open(os.devnull, 'w')
+    with open(combined_out, 'w') as combined_out_file, aa_handle as combined_out_file_aa:
         paired_files_found = None
         #with open(combined_out, 'w') as combined_out_file, open(combined_out.replace('_dna.fasta','_aa.fasta'), 'w') as combined_out_file_aa:
         gff_files = glob.glob(os.path.join(input_dir, '*' + name_split_gff))
         if not gff_files:
+            logger.error("Error: No GFF files found in %s (pattern: *%s).", input_dir, name_split_gff)
             sys.exit("Error: No GFF files found.")
         for gff_file in gff_files:
             genome_name = os.path.basename(gff_file).split(name_split_gff)[0]
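The aa_handle expression above is this release's recurring idiom for optional amino-acid output: when run_as_combiner is set there is no AA path, so writes are routed to os.devnull rather than guarding every write site with an if. A distilled sketch of the idiom (names hypothetical):

    import os

    def open_optional(path):
        # A real handle when a path is wanted, otherwise a sink that swallows writes.
        return open(path, 'w') if path else open(os.devnull, 'w')

    aa_path = None  # e.g. run_as_combiner == True
    with open('combined_dna.fasta', 'w') as dna_out, open_optional(aa_path) as aa_out:
        dna_out.write('>seq1\nATG\n')
        aa_out.write('>seq1\nM\n')  # silently discarded when aa_path is None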
@@ -251,12 +287,12 @@ def read_separate_files(input_dir, name_split_gff, name_split_fasta, gene_ident,
                     corresponding_fasta_file = temp_file
                     break
             if corresponding_fasta_file is None:
-
+                logger.warning("Corresponding FASTA file for GFF file '%s' not found. Skipping. Try using the -name_split_fasta option.", gff_file)
                 continue
         else:
             corresponding_fasta_file = os.path.join(input_dir, genome_name + name_split_fasta)
             if not os.path.exists(corresponding_fasta_file):
-
+                logger.warning("Corresponding FASTA file for GFF file '%s' not found: expected '%s'. Skipping. Try using the -name_split_fasta option.", gff_file, corresponding_fasta_file)
                 continue

         gff_features = []
@@ -322,25 +358,30 @@ def read_separate_files(input_dir, name_split_gff, name_split_fasta, gene_ident,
                 combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")

     if not paired_files_found:
+        logger.error("Could not find matching GFF/FASTA files. Please check input directory and -name_split_gff / -name_split_fasta parameters.")
         sys.exit("Could not find matching GFF/FASTA files - Please check input directory and -name_split_gff and -name_split_fasta parameters.")
     if translate == False or translate == None:
-        #Clean up unused file
-
-
-        os.
-
-
+        # Clean up unused file only if it was a real file we created (never remove os.devnull)
+        if combined_out_file_aa_path:
+            try:
+                if os.path.exists(combined_out_file_aa_path):
+                    os.remove(combined_out_file_aa_path)
+            except Exception:
+                pass


 def read_combined_files(input_dir, name_split, gene_ident, combined_out, translate, run_as_combiner):
+    emit_pending_startup_messages()
     if run_as_combiner == True:
-
+        combined_out_file_aa_path = None
     else:
-
-
-
+        combined_out_file_aa_path = combined_out.replace('_dna.fasta','_aa.fasta')
+
+    aa_handle = open(combined_out_file_aa_path, 'w') if combined_out_file_aa_path else open(os.devnull, 'w')
+    with open(combined_out, 'w') as combined_out_file, aa_handle as combined_out_file_aa:
         gff_files = glob.glob(os.path.join(input_dir, '*' + name_split))
         if not gff_files:
+            logger.error("Error: No GFF files found in %s (pattern: *%s).", input_dir, name_split)
             sys.exit("Error: No GFF files found - check input directory and -name_split_gff parameter.")
         for gff_file in gff_files:
             genome_name = os.path.basename(gff_file).split(name_split)[0]
@@ -409,24 +450,29 @@ def read_combined_files(input_dir, name_split, gene_ident, combined_out, transla
                 combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")

     if translate == False or translate == None:
-        #Clean up unused file
-
-
-        os.
-
-
+        # Clean up unused file only if it was a real file we created (never remove os.devnull)
+        if combined_out_file_aa_path:
+            try:
+                if os.path.exists(combined_out_file_aa_path):
+                    os.remove(combined_out_file_aa_path)
+            except Exception:
+                pass



 def read_fasta_files(input_dir, name_split_fasta, combined_out, translate, run_as_combiner):
+    emit_pending_startup_messages()
     if run_as_combiner == True:
-
+        combined_out_file_aa_path = None
     else:
-
-
+        combined_out_file_aa_path = combined_out.replace('_dna.fasta','_aa.fasta')
+
+    aa_handle = open(combined_out_file_aa_path, 'w') if combined_out_file_aa_path else open(os.devnull, 'w')
+    with open(combined_out, 'w') as combined_out_file, aa_handle as combined_out_file_aa:
         fasta_files = glob.glob(os.path.join(input_dir, '*' + name_split_fasta))
         if not fasta_files:
-
+            logger.error("Error: No FASTA files found in %s (pattern: *%s).", input_dir, name_split_fasta)
+            sys.exit("Error: No FASTA files found.")
         for fasta_file in fasta_files:
             genome_name = os.path.basename(fasta_file).split(name_split_fasta)[0]
             fasta_dict = collections.defaultdict(str)
@@ -456,31 +502,63 @@ def read_fasta_files(input_dir, name_split_fasta, combined_out, translate, run_a
                 combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")

     if translate == False or translate == None:
-        #Clean up unused file
-
-
-        os.
-
-
+        # Clean up unused file only if it was a real file we created (never remove os.devnull)
+        if combined_out_file_aa_path:
+            try:
+                if os.path.exists(combined_out_file_aa_path):
+                    os.remove(combined_out_file_aa_path)
+            except Exception:
+                pass
+
+def write_individual_groups(options, output_dir, key_order, cores, sequences,
+                            pangenome_clusters_First_sequences_sorted,
+                            combined_pangenome_clusters_Second_sequences):
+    if not getattr(options, "write_individual_groups", False):
+        return
+
+    os.makedirs(output_dir, exist_ok=True)
+
+
+    for key_prefix in key_order:
+        for key, values in cores.items():
+            if not key.startswith(key_prefix):
+                continue

+            for value in values:
+                sequences_to_write = (pangenome_clusters_First_sequences_sorted[value]
+                                      if 'First' in key_prefix
+                                      else combined_pangenome_clusters_Second_sequences[value])
+
+                dna_path = os.path.join(output_dir, f"{key}_{value}_dna.fasta")
+                aa_path = dna_path.replace('_dna.fasta', '_aa.fasta')
+
+                if getattr(options, "sequence_type", None) == 'AA':
+                    with open(dna_path, 'w') as dna_f, open(aa_path, 'w') as aa_f:
+                        for header in sequences_to_write:
+                            if header not in sequences:
+                                if getattr(options, "verbose", False):
+                                    print(f"Sequence {header} not found in original_fasta file.")
+                                continue
+                            seq = sequences[header]
+                            dna_f.write(f">{header}\n{wrap_sequence(seq)}\n")
+                            aa_f.write(f">{header}\n{wrap_sequence(translate_frame(seq))}\n")
+                else:
+                    with open(dna_path, 'w') as dna_f:
+                        for header in sequences_to_write:
+                            if header not in sequences:
+                                if getattr(options, "verbose", False):
+                                    print(f"Sequence {header} not found in original_fasta file.")
+                                continue
+                            seq = sequences[header]
+                            dna_f.write(f">{header}\n{wrap_sequence(seq)}\n")

 def write_groups_func(options, output_dir, key_order, cores, sequences,
                       pangenome_clusters_First_sequences_sorted, combined_pangenome_clusters_Second_sequences):
-
-
-
-    Parameters:
-    - options: Command-line options.
-    - output_dir: Directory where output FASTA files will be saved.
-    - key_order: The order in which to process keys.
-    - cores: Dictionary of core genes.
-    - sequences: Dictionary mapping headers to sequences.
-    - pangenome_clusters_First_sequences_sorted: Dictionary of first sequence clusters.
-    - combined_pangenome_clusters_Second_sequences: Dictionary of second sequence clusters.
-    """
+
+    emit_pending_startup_messages()
     # Create output directory if it doesn't exist
     if not os.path.exists(output_dir):
-
+        os.makedirs(output_dir)

     for group in options.write_groups.split(','):

@@ -514,7 +592,10 @@ def write_groups_func(options, output_dir, key_order, cores, sequences,
                         outfile_aa.write(f">{header}\n")
                         outfile_aa.write(f"{wrapped_sequence_aa}\n")
                     else:
-
+                        try:
+                            os.remove(outfile_aa.name) # Delete individual file if option is disabled
+                        except FileNotFoundError:
+                            pass
                     # Always write to the combined AA file
                     combined_fasta_aa.write(f">Group_{value}|{header}\n")
                     combined_fasta_aa.write(f"{wrapped_sequence_aa}\n")
@@ -530,18 +611,24 @@ def write_groups_func(options, output_dir, key_order, cores, sequences,
                         outfile.write(f">{header}\n")
                         outfile.write(f"{wrapped_sequence}\n")
                     else:
-
+                        try:
+                            os.remove(outfile.name) # Delete individual file if option is disabled
+                        except FileNotFoundError:
+                            pass
                     # Always write to the combined nucleotide file
                     combined_fasta.write(f">Group_{value}|{header}\n")
                     combined_fasta.write(f"{wrapped_sequence}\n")

             else:
                 if options.verbose == True:
-
+                    logger.info("Sequence " + header + " not found in original_fasta file.")
     if options.sequence_type != 'AA':
         #Clean up unused file
-
-
+        try:
+            os.remove(combined_fasta_aa.name)
+        except FileNotFoundError:
+            pass
+    logger.info("Combined FASTA file saved to: " + combined_fasta_filename)


 # def process_gene_groups(options, group_directory, sub_group_directory, paralog_groups, output_file):
@@ -612,38 +699,38 @@ def perform_alignment(gene_path,group_directory, gene_file, options, concatenate
     return concatenated_sequences

 def process_gene_groups(options, group_directory, sub_group_directory, paralog_groups, genome_list, output_file):
+    emit_pending_startup_messages()
     """Process each gene family file to select the longest sequence per genome and concatenate aligned sequences."""
     concatenated_sequences = {genome: "" for genome in genome_list}
     output_file = group_directory.replace('Gene_Groups_Output', output_file)
     if paralog_groups != None:
-
+        threshold_size = math.floor(int(options.align_core) * int(options.genome_num) / 100)

     if options.align_aa == True:
-
+        affix = '_aa.fasta'
     else:
-
+        affix = '_dna.fasta'

     if options.align_core == True:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        # Iterate over each gene family file
+        for gene_file in os.listdir(group_directory):
+            if gene_file.endswith(affix) and not gene_file.startswith('combined_group_sequences'):
+                current_group = int(gene_file.split('_')[3].split('.')[0])
+                gene_path = os.path.join(group_directory, gene_file)
+                # Could add more catches here to work with First and Secondary groups - This ensures only core '99/100' are aligned
+                if 'First_core_99' in gene_file or 'First_core_100' in gene_file:
+                    # Check for matching group in paralog_groups
+                    if sub_group_directory and paralog_groups and '>Group_'+str(current_group) in paralog_groups:
+                        for subgroup, size in enumerate(paralog_groups['>Group_' + str(current_group)]['sizes']):
+                            if size >= threshold_size:
+                                gene_path = os.path.join(sub_group_directory,f"Group_{current_group}_subgroup_{subgroup}{affix}")
+                                concatenated_sequences = perform_alignment(gene_path, group_directory, gene_file, options, concatenated_sequences, True)
+                    else:
+                        concatenated_sequences = perform_alignment(gene_path, group_directory, gene_file, options, concatenated_sequences, False)
+
+    # Write the concatenated sequences to the output file
+    with open(output_file, 'w') as out:
+        for genome, sequence in concatenated_sequences.items():
+            out.write(f">{genome}\n")
+            wrapped_sequence = wrap_sequence(sequence, 60)
+            out.write(f"{wrapped_sequence}\n")