PyamilySeq 1.3.2__py3-none-any.whl → 1.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
PyamilySeq/Seq_Finder.py CHANGED
@@ -1,6 +1,12 @@
1
- import argparse
2
1
  import collections
3
2
  import csv
3
+ import os
4
+
5
+ # Use centralised logger factory
6
+ try:
7
+ from .constants import configure_logger, LoggingArgumentParser
8
+ except Exception:
9
+ from constants import configure_logger, LoggingArgumentParser
4
10
 
5
11
 
6
12
  def parse_fasta_ids(fasta_file):
@@ -29,16 +35,27 @@ def find_ids_in_csv(ids, csv_file):
29
35
 
30
36
 
31
37
  def main():
32
- parser = argparse.ArgumentParser(description="Extract IDs from a FASTA file and search for them in a CSV file.")
38
+ # Early console-only logger so the parser description and argparse messages are logged via logger.
39
+ early_logger = configure_logger("PyamilySeq.Seq_Finder", enable_file=False, log_dir=None, verbose=False)
40
+ parser = LoggingArgumentParser(logger_name="PyamilySeq.Seq_Finder", description="Running Seq-Finder: A tool to extract IDs from a FASTA file and search for them in a CSV file.")
41
+
33
42
  parser.add_argument("-in", action='store', dest='fasta_file',
34
43
  help="Input FASTA file", required=True)
35
44
  parser.add_argument("-ids", action='store', dest='csv_file',
36
45
  help="CSV file containing IDs to search for", required=True)
37
46
  parser.add_argument("-out", action='store', dest='output_file',
38
47
  help="Output file to save found IDs", required=True)
48
+ parser.add_argument("--log", action="store_true", dest="log", help="Create a timestamped logfile for this run.")
49
+ parser.add_argument("--log-dir", dest="log_dir", default=None, help="Directory for logfile (default: dir of output_file).")
39
50
 
40
51
  options = parser.parse_args()
41
52
 
53
+ # Setup logger
54
+ out_dir = os.path.abspath(os.path.dirname(options.output_file)) if options.output_file else os.getcwd()
55
+ log_dir = options.log_dir if getattr(options, "log_dir", None) else out_dir
56
+ logger = configure_logger("PyamilySeq.Seq_Finder", enable_file=getattr(options, "log", False), log_dir=log_dir, verbose=False)
57
+
58
+ logger.info("Parsing FASTA IDs from %s", options.fasta_file)
42
59
  # Parse IDs from the FASTA file
43
60
  ids = parse_fasta_ids(options.fasta_file)
44
61
 
@@ -50,6 +67,7 @@ def main():
50
67
  output.write("ID,Found_In_First_Column\n")
51
68
  for seq_id, found_in in found_records.items():
52
69
  output.write(f"{seq_id},{found_in}\n")
70
+ logger.info("Wrote found records for %d IDs to %s", len(found_records), options.output_file)
53
71
 
54
72
 
55
73
  if __name__ == "__main__":
PyamilySeq/clusterings.py CHANGED
@@ -441,7 +441,7 @@ def combined_clustering_Edge_List(options, splitter):
441
441
  combined_pangenome_clusters_Second_sequences[str(cluster_id)].append(child)
442
442
  else:
443
443
  if str(cluster_id) not in not_Second_only_cluster_ids:
444
- not_Second_only_cluster_ids.append(str(cluster_id)) # Tell us which StORF_Reporter clustered are unmatched to a PEP
444
+ not_Second_only_cluster_ids.append(str(cluster_id)) # Tell us which StORF-Reporter clustered are unmatched to a PEP
445
445
  if child_taxa not in combined_pangenome_clusters_First[str(cluster_id)]:
446
446
  combined_pangenome_clusters_First[str(cluster_id)].append(child_taxa)
447
447
  combined_pangenome_clusters_First_sequences[str(cluster_id)].append(child)
PyamilySeq/constants.py CHANGED
@@ -1,2 +1,143 @@
1
- PyamilySeq_Version = 'v1.3.2'
1
+ import logging
2
+ import os
3
+ import sys
4
+ import argparse
5
+ from datetime import datetime
6
+ from io import StringIO
7
+ import re
2
8
 
9
+ PyamilySeq_Version = 'v1.3.3'
10
+ WELCOME = f"Thank you for using PyamilySeq {PyamilySeq_Version} - A tool for gene clustering and pangenome analysis."
11
+ CITATION = "Please Cite PyamilySeq: https://doi.org/10.1093/nargab/lqaf198"
12
+ ISSUE = "Please report any issues to: https://github.com/NickJD/PyamilySeq/issues"
13
+
14
def configure_logger(logger_name, enable_file=False, log_dir=None, level=logging.INFO, verbose=False):
    """
    Create and return a configured logger.

    - logger_name: full logger name (e.g. "PyamilySeq.Group_Splitter")
    - enable_file: if True, create a timestamped logfile in log_dir
    - log_dir: directory for logfile (defaults to cwd; created if missing)
    - level: console log level (default INFO)
    - verbose: if True, sets console level to DEBUG

    When a logfile is enabled it records DEBUG and above regardless of the
    console level. The startup banner (WELCOME / CITATION / ISSUE) is emitted
    once per logger object; if the banner was already shown on the console by
    an earlier, console-only configuration of the same logger, it is written
    to the new logfile only, so neither destination receives it twice.

    Returns the configured ``logging.Logger``.
    """
    logger = logging.getLogger(logger_name)

    # Remove only this logger's own handlers and close them, so a previous
    # FileHandler does not keep its file descriptor open after reconfiguration.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        try:
            stale_handler.close()
        except OSError:
            # Best-effort: a failed close must not stop logger setup.
            pass

    # The logger's own level gates every handler, so it has to admit DEBUG
    # whenever the logfile (which always records DEBUG) is enabled.
    console_level = logging.DEBUG if verbose else level
    logger.setLevel(min(console_level, logging.DEBUG) if enable_file else console_level)

    # Formatter without logger name (keeps output clean)
    formatter = logging.Formatter("%(asctime)s %(levelname)s: %(message)s")

    # Console handler -> write to stdout by default
    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setLevel(console_level)
    console_handler.setFormatter(formatter)
    logger.addHandler(console_handler)

    # Optional file handler
    file_handler = None
    if enable_file:
        if not log_dir:
            log_dir = os.getcwd()
        os.makedirs(log_dir, exist_ok=True)
        ts = datetime.now().strftime("%Y%m%d_%H%M%S")
        # Use short tool name derived from logger_name for the filename
        safe_name = logger_name.split('.')[-1]
        log_path = os.path.join(log_dir, f"{safe_name}-{ts}.log")
        file_handler = logging.FileHandler(log_path)
        file_handler.setLevel(logging.DEBUG)  # file always captures debug for diagnostics
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)
        logger.debug("File logging enabled: %s", log_path)

    banner = (WELCOME, CITATION, ISSUE)
    if not getattr(logger, "_welcome_printed", False):
        # First configuration of this logger: the normal logging path reaches
        # the console and, when present, the logfile.
        for msg in banner:
            logger.info("%s", msg)
        logger._welcome_printed = True
        if file_handler is not None:
            logger._welcome_file_written = True
    elif file_handler is not None and not getattr(logger, "_welcome_file_written", False):
        # Banner already went to the console via an earlier configuration;
        # hand the records to the file handler only to avoid repeating it there.
        for msg in banner:
            record = logging.LogRecord(logger.name, logging.INFO, "", 0, msg, None, None)
            file_handler.handle(record)
        logger._welcome_file_written = True

    return logger
101
+
102
+ # ArgumentParser subclass that logs usage/help/errors via the logger
103
class LoggingArgumentParser(argparse.ArgumentParser):
    """ArgumentParser that reports its description and parse errors via a logger.

    The description is logged at INFO once, when the parser is constructed.
    Help output therefore omits the description, and parse errors are logged
    at ERROR (after the usage line is printed to stderr) before exiting with
    status 2.

    Parameters (in addition to those of ``argparse.ArgumentParser``):
    - logger_name: keyword-only name of the logger to use; the root logger is
      used when it is omitted or empty.
    """

    def __init__(self, *args, logger_name=None, **kwargs):
        super().__init__(*args, **kwargs)
        # If logger_name provided, use that logger; otherwise use root logger
        self._logger = logging.getLogger(logger_name) if logger_name else logging.getLogger()
        # Emit the parser description immediately on creation so it appears for normal runs.
        if self.description:
            try:
                self._logger.info("%s", str(self.description))
            except Exception:
                # Deliberate best-effort: a logging failure must not break parser creation.
                pass

    def print_usage(self, file=None):
        """Print the usage line exactly as argparse does."""
        super().print_usage(file)

    def print_help(self, file=None):
        """Write the help text without the description to *file* (default stdout).

        The description is cleared while the help is formatted rather than
        stripped from the text afterwards: argparse re-wraps the description to
        the terminal width, so the raw description string does not reliably
        appear verbatim in the formatted help.
        """
        saved_description = self.description
        self.description = None
        try:
            help_text = self.format_help()
        finally:
            self.description = saved_description
        out_file = file if file is not None else sys.stdout
        out_file.write(help_text)

    def exit(self, status=0, message=None):
        """Write *message* (if any) to stderr and raise ``SystemExit(status)``."""
        if message:
            sys.stderr.write(message)
        raise SystemExit(status)

    def error(self, message):
        """Print usage to stderr, log the error via the logger, and exit with status 2."""
        super().print_usage(sys.stderr)
        self._logger.error("%s: error: %s", self.prog, message)
        self.exit(2)
PyamilySeq/utils.py CHANGED
@@ -7,6 +7,25 @@ from tempfile import NamedTemporaryFile
7
7
  import sys
8
8
  import re
9
9
  import math
10
+ import logging
11
+
12
+ logger = logging.getLogger("PyamilySeq") # Use the shared top-level PyamilySeq logger so all utils logs propagate to the same handlers
13
+
14
+ _startup_messages_pending = []
15
+
16
+
def emit_pending_startup_messages():
    """Log every queued startup notice at INFO, then empty the queue.

    Notices are appended to ``_startup_messages_pending`` elsewhere in this
    module; this function sends each one to the module-level ``logger``. The
    queue is cleared even if logging a notice fails, so each notice is
    attempted at most once.
    """
    if not _startup_messages_pending:
        return
    try:
        for notice in _startup_messages_pending:
            try:
                logger.info("%s", notice)
            except Exception:
                # swallow any logging errors to avoid breaking flow
                continue
    finally:
        _startup_messages_pending.clear()
10
29
 
11
30
  ####
12
31
  # Placeholder for the distance function
@@ -18,7 +37,8 @@ try:
18
37
  def levenshtein_distance_calc(seq1, seq2):
19
38
  return LV.distance(seq1, seq2)
20
39
  except (ModuleNotFoundError, ImportError):
21
- print("Levenshtein package not installed - Will fallback to slower Python implementation.")
40
+ # Save the notice for later emission (after logger handlers are configured)
41
+ _startup_messages_pending.append("Levenshtein package not installed - Will fallback to slower Python implementation.")
22
42
  # Fallback implementation
23
43
  def levenshtein_distance_calc(seq1, seq2):
24
44
  # Slower Python implementation of Levenshtein distance
@@ -62,6 +82,11 @@ codon_table = {
62
82
  'TAC':'Y', 'TAT':'Y', 'TAA':'*', 'TAG':'*',
63
83
  'TGC':'C', 'TGT':'C', 'TGA':'*', 'TGG':'W'}
64
84
 
85
+ # Temp fix
86
+ codon_table['TAA'] = ''
87
+ codon_table['TGA'] = ''
88
+ codon_table['TAG'] = ''
89
+
def translate_frame(sequence):
    """Translate *sequence* codon by codon, starting at its first base.

    Each consecutive 3-character slice is looked up in the module-level
    ``codon_table``; a slice that is not a key yields 'X'. Trailing characters
    that do not fill a complete codon are ignored.
    """
    usable_length = len(sequence) - len(sequence) % 3
    residues = []
    for start in range(0, usable_length, 3):
        residues.append(codon_table.get(sequence[start:start + 3], 'X'))
    return ''.join(residues)
@@ -94,10 +119,15 @@ def translate_dna_to_aa(dna_fasta, aa_fasta):
94
119
 
95
120
 
96
121
  def detect_sequence_type(fasta_file):
97
- with open(fasta_file, 'r') as f:
122
+ import gzip
123
+ opener = gzip.open if str(fasta_file).lower().endswith('.gz') else open
124
+ with opener(fasta_file, 'rt') as f:
98
125
  for line in f:
99
126
  if line.startswith('>'):
100
127
  continue
128
+ line = line.strip().upper()
129
+ if not line:
130
+ continue
101
131
  if any(base in line for base in 'EFILPQZ'):
102
132
  return False # Contains amino acids
103
133
  return True # Contains DNA
@@ -105,7 +135,6 @@ def detect_sequence_type(fasta_file):
105
135
 
106
136
  def is_tool_installed(tool_name):
107
137
  """Check if a tool is installed and available in PATH."""
108
- # Check if the tool is in the system PATH
109
138
  if shutil.which(tool_name) is None:
110
139
  return False
111
140
 
@@ -119,7 +148,9 @@ def is_tool_installed(tool_name):
119
148
  return False # This shouldn't happen due to the earlier check
120
149
 
# Complement lookup for the IUPAC nucleotide codes, in both upper and lower case.
_COMPLEMENT_TABLE = str.maketrans(
    'ACGTNRYSWKMBDHVacgtnryswkmbdhv',
    'TGCANYRSWMKVHDBtgcanyrswmkvhdb',
)


def reverse_complement(seq):
    """Return the reverse complement of the nucleotide sequence *seq*.

    All IUPAC nucleotide codes (A, C, G, T, N and the ambiguity codes
    R, Y, S, W, K, M, B, D, H, V) are complemented, and letter case is
    preserved. Any other character (for example an alignment gap '-') is
    kept unchanged in its reversed position instead of raising ``KeyError``.

    *seq* may be a string or any reversible sequence of single-character
    strings; the result is always a ``str``.
    """
    return ''.join(reversed(seq)).translate(_COMPLEMENT_TABLE)
124
155
 
125
156
 
@@ -196,8 +227,8 @@ def select_longest_gene(sequences, subgrouped):
196
227
 
197
228
 
198
229
  def run_mafft_on_sequences(options, sequences, output_file):
199
- #print("Conducting MAFFT alignment.")
200
230
  """Run mafft on the given sequences and write to output file."""
231
+ emit_pending_startup_messages()
201
232
  # Create a temporary input file for mafft
202
233
  with NamedTemporaryFile('w', delete=False) as temp_input_file:
203
234
  for header, sequence in sequences.items():
@@ -207,14 +238,13 @@ def run_mafft_on_sequences(options, sequences, output_file):
207
238
  # Run mafft
208
239
  try:
209
240
  with open(output_file, 'w') as output_f:
210
- if options.verbose == True:
241
+ if getattr(options, "verbose", False):
211
242
  subprocess.run(
212
243
  ['mafft', '--auto', '--thread', str(options.threads), temp_input_file_path],
213
244
  stdout=output_f,
214
245
  stderr=sys.stderr,
215
246
  check=True
216
247
  )
217
-
218
248
  else:
219
249
  subprocess.run(
220
250
  ['mafft', '--auto', '--thread', str(options.threads), temp_input_file_path],
@@ -223,22 +253,28 @@ def run_mafft_on_sequences(options, sequences, output_file):
223
253
  check=True
224
254
  )
225
255
  finally:
226
- os.remove(temp_input_file_path) # Clean up the temporary file
227
-
256
+ try:
257
+ os.remove(temp_input_file_path) # Clean up the temporary file
258
+ except Exception:
259
+ pass
228
260
 
229
261
 
230
262
 
231
263
  def read_separate_files(input_dir, name_split_gff, name_split_fasta, gene_ident, combined_out, translate, run_as_combiner):
264
+ emit_pending_startup_messages()
232
265
  if run_as_combiner == True:
233
- combined_out_file_aa = None
266
+ combined_out_file_aa_path = None
234
267
  else:
235
- combined_out_file_aa = combined_out.replace('_dna.fasta','_aa.fasta')
268
+ combined_out_file_aa_path = combined_out.replace('_dna.fasta','_aa.fasta')
236
269
 
237
- with open(combined_out, 'w') as combined_out_file, (open(combined_out_file_aa, 'w') if combined_out_file_aa else open(os.devnull, 'w')) as combined_out_file_aa:
270
+ # Open actual AA file or os.devnull based on whether we need an AA file path
271
+ aa_handle = open(combined_out_file_aa_path, 'w') if combined_out_file_aa_path else open(os.devnull, 'w')
272
+ with open(combined_out, 'w') as combined_out_file, aa_handle as combined_out_file_aa:
238
273
  paired_files_found = None
239
274
  #with open(combined_out, 'w') as combined_out_file, open(combined_out.replace('_dna.fasta','_aa.fasta'), 'w') as combined_out_file_aa:
240
275
  gff_files = glob.glob(os.path.join(input_dir, '*' + name_split_gff))
241
276
  if not gff_files:
277
+ logger.error("Error: No GFF files found in %s (pattern: *%s).", input_dir, name_split_gff)
242
278
  sys.exit("Error: No GFF files found.")
243
279
  for gff_file in gff_files:
244
280
  genome_name = os.path.basename(gff_file).split(name_split_gff)[0]
@@ -251,12 +287,12 @@ def read_separate_files(input_dir, name_split_gff, name_split_fasta, gene_ident,
251
287
  corresponding_fasta_file = temp_file
252
288
  break
253
289
  if corresponding_fasta_file is None:
254
- print("Corresponding FASTA file for GFF file '" + gff_file + "' not found. Skipping. - Try using the -name_split_fasta option.")
290
+ logger.warning("Corresponding FASTA file for GFF file '%s' not found. Skipping. Try using the -name_split_fasta option.", gff_file)
255
291
  continue
256
292
  else:
257
293
  corresponding_fasta_file = os.path.join(input_dir, genome_name + name_split_fasta)
258
294
  if not os.path.exists(corresponding_fasta_file):
259
- print("Corresponding FASTA file for GFF file '" + gff_file + "' not found. Skipping. - Try using the -name_split_fasta option.")
295
+ logger.warning("Corresponding FASTA file for GFF file '%s' not found: expected '%s'. Skipping. Try using the -name_split_fasta option.", gff_file, corresponding_fasta_file)
260
296
  continue
261
297
 
262
298
  gff_features = []
@@ -322,25 +358,30 @@ def read_separate_files(input_dir, name_split_gff, name_split_fasta, gene_ident,
322
358
  combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")
323
359
 
324
360
  if not paired_files_found:
361
+ logger.error("Could not find matching GFF/FASTA files. Please check input directory and -name_split_gff / -name_split_fasta parameters.")
325
362
  sys.exit("Could not find matching GFF/FASTA files - Please check input directory and -name_split_gff and -name_split_fasta parameters.")
326
363
  if translate == False or translate == None:
327
- #Clean up unused file
328
- try: # Catches is combined_out_file_aa is None
329
- if combined_out_file.name != combined_out_file_aa.name:
330
- os.remove(combined_out_file_aa.name)
331
- except AttributeError:
332
- pass
364
+ # Clean up unused file only if it was a real file we created (never remove os.devnull)
365
+ if combined_out_file_aa_path:
366
+ try:
367
+ if os.path.exists(combined_out_file_aa_path):
368
+ os.remove(combined_out_file_aa_path)
369
+ except Exception:
370
+ pass
333
371
 
334
372
 
335
373
  def read_combined_files(input_dir, name_split, gene_ident, combined_out, translate, run_as_combiner):
374
+ emit_pending_startup_messages()
336
375
  if run_as_combiner == True:
337
- combined_out_file_aa = None
376
+ combined_out_file_aa_path = None
338
377
  else:
339
- combined_out_file_aa = combined_out.replace('_dna.fasta','_aa.fasta')
340
- #with open(combined_out, 'w') as combined_out_file, open(combined_out_file_aa, 'w') if combined_out_file_aa else open(os.devnull, 'w'):
341
- with open(combined_out, 'w') as combined_out_file, (open(combined_out_file_aa, 'w') if combined_out_file_aa else open(os.devnull, 'w')) as combined_out_file_aa:
378
+ combined_out_file_aa_path = combined_out.replace('_dna.fasta','_aa.fasta')
379
+
380
+ aa_handle = open(combined_out_file_aa_path, 'w') if combined_out_file_aa_path else open(os.devnull, 'w')
381
+ with open(combined_out, 'w') as combined_out_file, aa_handle as combined_out_file_aa:
342
382
  gff_files = glob.glob(os.path.join(input_dir, '*' + name_split))
343
383
  if not gff_files:
384
+ logger.error("Error: No GFF files found in %s (pattern: *%s).", input_dir, name_split)
344
385
  sys.exit("Error: No GFF files found - check input directory and -name_split_gff parameter.")
345
386
  for gff_file in gff_files:
346
387
  genome_name = os.path.basename(gff_file).split(name_split)[0]
@@ -409,24 +450,29 @@ def read_combined_files(input_dir, name_split, gene_ident, combined_out, transla
409
450
  combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")
410
451
 
411
452
  if translate == False or translate == None:
412
- #Clean up unused file
413
- try: # Catches is combined_out_file_aa is None
414
- if combined_out_file.name != combined_out_file_aa.name:
415
- os.remove(combined_out_file_aa.name)
416
- except AttributeError:
417
- pass
453
+ # Clean up unused file only if it was a real file we created (never remove os.devnull)
454
+ if combined_out_file_aa_path:
455
+ try:
456
+ if os.path.exists(combined_out_file_aa_path):
457
+ os.remove(combined_out_file_aa_path)
458
+ except Exception:
459
+ pass
418
460
 
419
461
 
420
462
 
421
463
  def read_fasta_files(input_dir, name_split_fasta, combined_out, translate, run_as_combiner):
464
+ emit_pending_startup_messages()
422
465
  if run_as_combiner == True:
423
- combined_out_file_aa = None
466
+ combined_out_file_aa_path = None
424
467
  else:
425
- combined_out_file_aa = combined_out.replace('_dna.fasta','_aa.fasta')
426
- with open(combined_out, 'w') as combined_out_file, (open(combined_out_file_aa, 'w') if combined_out_file_aa else open(os.devnull, 'w')) as combined_out_file_aa:
468
+ combined_out_file_aa_path = combined_out.replace('_dna.fasta','_aa.fasta')
469
+
470
+ aa_handle = open(combined_out_file_aa_path, 'w') if combined_out_file_aa_path else open(os.devnull, 'w')
471
+ with open(combined_out, 'w') as combined_out_file, aa_handle as combined_out_file_aa:
427
472
  fasta_files = glob.glob(os.path.join(input_dir, '*' + name_split_fasta))
428
473
  if not fasta_files:
429
- sys.exit("Error: No GFF files found.")
474
+ logger.error("Error: No FASTA files found in %s (pattern: *%s).", input_dir, name_split_fasta)
475
+ sys.exit("Error: No FASTA files found.")
430
476
  for fasta_file in fasta_files:
431
477
  genome_name = os.path.basename(fasta_file).split(name_split_fasta)[0]
432
478
  fasta_dict = collections.defaultdict(str)
@@ -456,31 +502,63 @@ def read_fasta_files(input_dir, name_split_fasta, combined_out, translate, run_a
456
502
  combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")
457
503
 
458
504
  if translate == False or translate == None:
459
- #Clean up unused file
460
- try: # Catches is combined_out_file_aa is None
461
- if combined_out_file.name != combined_out_file_aa.name:
462
- os.remove(combined_out_file_aa.name)
463
- except AttributeError:
464
- pass
505
+ # Clean up unused file only if it was a real file we created (never remove os.devnull)
506
+ if combined_out_file_aa_path:
507
+ try:
508
+ if os.path.exists(combined_out_file_aa_path):
509
+ os.remove(combined_out_file_aa_path)
510
+ except Exception:
511
+ pass
512
+
def write_individual_groups(options, output_dir, key_order, cores, sequences,
                            pangenome_clusters_First_sequences_sorted,
                            combined_pangenome_clusters_Second_sequences):
    """Write one FASTA file per gene group into *output_dir*.

    Does nothing unless ``options.write_individual_groups`` is truthy.

    Parameters:
    - options: namespace read for ``write_individual_groups``,
      ``sequence_type`` and ``verbose`` (each optional).
    - output_dir: directory for the FASTA files; created if missing.
    - key_order: key prefixes, processed in this order.
    - cores: mapping of key -> iterable of group ids; a key is processed for
      every prefix in key_order that it starts with.
    - sequences: mapping of header -> sequence.
    - pangenome_clusters_First_sequences_sorted: group id -> headers, used
      when the prefix contains 'First'.
    - combined_pangenome_clusters_Second_sequences: group id -> headers, used
      for every other prefix.

    For each group a ``<key>_<group>_dna.fasta`` file is written. When
    ``options.sequence_type`` equals 'AA', a ``<key>_<group>_aa.fasta`` file
    holding ``translate_frame`` of each sequence is written next to it.
    Headers missing from *sequences* are skipped (and reported through the
    module logger when ``options.verbose`` is truthy).
    """
    if not getattr(options, "write_individual_groups", False):
        return

    os.makedirs(output_dir, exist_ok=True)

    write_aa = getattr(options, "sequence_type", None) == 'AA'
    verbose = getattr(options, "verbose", False)

    for key_prefix in key_order:
        if 'First' in key_prefix:
            clusters = pangenome_clusters_First_sequences_sorted
        else:
            clusters = combined_pangenome_clusters_Second_sequences

        for key, values in cores.items():
            if not key.startswith(key_prefix):
                continue

            for value in values:
                # Resolve the headers before creating any file for this group.
                headers = clusters[value]

                # Build both paths from the file stem; deriving the AA path by
                # string replacement on the full path would also rewrite a
                # matching directory name.
                stem = os.path.join(output_dir, f"{key}_{value}")
                dna_path = stem + '_dna.fasta'
                aa_path = stem + '_aa.fasta'

                with open(dna_path, 'w') as dna_f:
                    aa_f = open(aa_path, 'w') if write_aa else None
                    try:
                        for header in headers:
                            if header not in sequences:
                                if verbose:
                                    logger.info("Sequence %s not found in original_fasta file.", header)
                                continue
                            seq = sequences[header]
                            dna_f.write(f">{header}\n{wrap_sequence(seq)}\n")
                            if aa_f is not None:
                                aa_f.write(f">{header}\n{wrap_sequence(translate_frame(seq))}\n")
                    finally:
                        if aa_f is not None:
                            aa_f.close()
466
554
 
467
555
  def write_groups_func(options, output_dir, key_order, cores, sequences,
468
556
  pangenome_clusters_First_sequences_sorted, combined_pangenome_clusters_Second_sequences):
469
- """
470
- Writes individual FASTA files and a combined FASTA file for all sequences.
471
-
472
- Parameters:
473
- - options: Command-line options.
474
- - output_dir: Directory where output FASTA files will be saved.
475
- - key_order: The order in which to process keys.
476
- - cores: Dictionary of core genes.
477
- - sequences: Dictionary mapping headers to sequences.
478
- - pangenome_clusters_First_sequences_sorted: Dictionary of first sequence clusters.
479
- - combined_pangenome_clusters_Second_sequences: Dictionary of second sequence clusters.
480
- """
557
+
558
+ emit_pending_startup_messages()
481
559
  # Create output directory if it doesn't exist
482
560
  if not os.path.exists(output_dir):
483
- os.makedirs(output_dir)
561
+ os.makedirs(output_dir)
484
562
 
485
563
  for group in options.write_groups.split(','):
486
564
 
@@ -514,7 +592,10 @@ def write_groups_func(options, output_dir, key_order, cores, sequences,
514
592
  outfile_aa.write(f">{header}\n")
515
593
  outfile_aa.write(f"{wrapped_sequence_aa}\n")
516
594
  else:
517
- os.remove(outfile_aa.name) # Delete individual file if option is disabled
595
+ try:
596
+ os.remove(outfile_aa.name) # Delete individual file if option is disabled
597
+ except FileNotFoundError:
598
+ pass
518
599
  # Always write to the combined AA file
519
600
  combined_fasta_aa.write(f">Group_{value}|{header}\n")
520
601
  combined_fasta_aa.write(f"{wrapped_sequence_aa}\n")
@@ -530,18 +611,24 @@ def write_groups_func(options, output_dir, key_order, cores, sequences,
530
611
  outfile.write(f">{header}\n")
531
612
  outfile.write(f"{wrapped_sequence}\n")
532
613
  else:
533
- os.remove(outfile.name) # Delete individual file if option is disabled
614
+ try:
615
+ os.remove(outfile.name) # Delete individual file if option is disabled
616
+ except FileNotFoundError:
617
+ pass
534
618
  # Always write to the combined nucleotide file
535
619
  combined_fasta.write(f">Group_{value}|{header}\n")
536
620
  combined_fasta.write(f"{wrapped_sequence}\n")
537
621
 
538
622
  else:
539
623
  if options.verbose == True:
540
- print(f"Sequence {header} not found in original_fasta file.")
624
+ logger.info("Sequence " + header + " not found in original_fasta file.")
541
625
  if options.sequence_type != 'AA':
542
626
  #Clean up unused file
543
- os.remove(combined_fasta_aa.name)
544
- print(f"Combined FASTA file saved to: {combined_fasta_filename}")
627
+ try:
628
+ os.remove(combined_fasta_aa.name)
629
+ except FileNotFoundError:
630
+ pass
631
+ logger.info("Combined FASTA file saved to: " + combined_fasta_filename)
545
632
 
546
633
 
547
634
  # def process_gene_groups(options, group_directory, sub_group_directory, paralog_groups, output_file):
@@ -612,38 +699,38 @@ def perform_alignment(gene_path,group_directory, gene_file, options, concatenate
612
699
  return concatenated_sequences
613
700
 
614
701
  def process_gene_groups(options, group_directory, sub_group_directory, paralog_groups, genome_list, output_file):
702
+ emit_pending_startup_messages()
615
703
  """Process each gene family file to select the longest sequence per genome and concatenate aligned sequences."""
616
704
  concatenated_sequences = {genome: "" for genome in genome_list}
617
705
  output_file = group_directory.replace('Gene_Groups_Output', output_file)
618
706
  if paralog_groups != None:
619
- threshold_size = math.floor(int(options.align_core) * int(options.genome_num) / 100)
707
+ threshold_size = math.floor(int(options.align_core) * int(options.genome_num) / 100)
620
708
 
621
709
  if options.align_aa == True:
622
- affix = '_aa.fasta'
710
+ affix = '_aa.fasta'
623
711
  else:
624
- affix = '_dna.fasta'
712
+ affix = '_dna.fasta'
625
713
 
626
714
  if options.align_core == True:
627
- # Iterate over each gene family file
628
- for gene_file in os.listdir(group_directory):
629
- if gene_file.endswith(affix) and not gene_file.startswith('combined_group_sequences'):
630
- current_group = int(gene_file.split('_')[3].split('.')[0])
631
- gene_path = os.path.join(group_directory, gene_file)
632
- # Could add more catches here to work with First and Secondary groups - This ensures only core '99/100' are aligned
633
- if 'First_core_99' in gene_file or 'First_core_100' in gene_file:
634
- # Check for matching group in paralog_groups
635
- if sub_group_directory and paralog_groups and '>Group_'+str(current_group) in paralog_groups:
636
- for subgroup, size in enumerate(paralog_groups['>Group_' + str(current_group)]['sizes']):
637
- if size >= threshold_size:
638
- gene_path = os.path.join(sub_group_directory,f"Group_{current_group}_subgroup_{subgroup}{affix}")
639
- concatenated_sequences = perform_alignment(gene_path, group_directory, gene_file, options, concatenated_sequences, True)
640
- else:
641
- concatenated_sequences = perform_alignment(gene_path, group_directory, gene_file, options, concatenated_sequences, False)
642
-
643
- # Write the concatenated sequences to the output file
644
- with open(output_file, 'w') as out:
645
- for genome, sequence in concatenated_sequences.items():
646
- out.write(f">{genome}\n")
647
- wrapped_sequence = wrap_sequence(sequence, 60)
648
- out.write(f"{wrapped_sequence}\n")
649
-
715
+ # Iterate over each gene family file
716
+ for gene_file in os.listdir(group_directory):
717
+ if gene_file.endswith(affix) and not gene_file.startswith('combined_group_sequences'):
718
+ current_group = int(gene_file.split('_')[3].split('.')[0])
719
+ gene_path = os.path.join(group_directory, gene_file)
720
+ # Could add more catches here to work with First and Secondary groups - This ensures only core '99/100' are aligned
721
+ if 'First_core_99' in gene_file or 'First_core_100' in gene_file:
722
+ # Check for matching group in paralog_groups
723
+ if sub_group_directory and paralog_groups and '>Group_'+str(current_group) in paralog_groups:
724
+ for subgroup, size in enumerate(paralog_groups['>Group_' + str(current_group)]['sizes']):
725
+ if size >= threshold_size:
726
+ gene_path = os.path.join(sub_group_directory,f"Group_{current_group}_subgroup_{subgroup}{affix}")
727
+ concatenated_sequences = perform_alignment(gene_path, group_directory, gene_file, options, concatenated_sequences, True)
728
+ else:
729
+ concatenated_sequences = perform_alignment(gene_path, group_directory, gene_file, options, concatenated_sequences, False)
730
+
731
+ # Write the concatenated sequences to the output file
732
+ with open(output_file, 'w') as out:
733
+ for genome, sequence in concatenated_sequences.items():
734
+ out.write(f">{genome}\n")
735
+ wrapped_sequence = wrap_sequence(sequence, 60)
736
+ out.write(f"{wrapped_sequence}\n")