PyamilySeq 1.3.1__py3-none-any.whl → 1.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,12 @@
1
- import argparse
2
1
  from collections import defaultdict
2
+ import logging
3
+ import os
4
+
5
+ # Use centralised logger factory from constants
6
+ try:
7
+ from .constants import configure_logger, LoggingArgumentParser
8
+ except Exception:
9
+ from constants import configure_logger, LoggingArgumentParser
3
10
 
4
11
  def read_cd_hit_output(clstr_file):
5
12
  """
@@ -23,10 +30,8 @@ def read_cd_hit_output(clstr_file):
23
30
  return seq_to_cluster
24
31
 
25
32
  def compare_cd_hit_clusters(file1, file2, output_file):
26
- """
27
- Compares two CD-HIT .clstr files to check if clusters are the same.
28
- Writes the results to a TSV file.
29
- """
33
+ logger = logging.getLogger("PyamilySeq.Group_Compare")
34
+ logger.info("Comparing clusters: %s vs %s", file1, file2)
30
35
  # Read both clustering files
31
36
  clusters1 = read_cd_hit_output(file1)
32
37
  clusters2 = read_cd_hit_output(file2)
@@ -80,12 +85,11 @@ def compare_cd_hit_clusters(file1, file2, output_file):
80
85
  tsv_data.append([seq, cluster_id1, cluster_id2, "Cluster name change"])
81
86
 
82
87
  # Print metrics
83
- print("🔢 Clustering Comparison Metrics:")
84
- print(f"Cluster name changes: {cluster_name_changes}")
85
- print(f"Sequence shifts (sequences assigned to different clusters): {sequence_shifts}")
86
- print(f"Sequences only in the first file: {len(only_in_file1)}")
87
- print(f"Sequences only in the second file: {len(only_in_file2)}")
88
- print()
88
+ logger.info("Clustering Comparison Metrics:")
89
+ logger.info("Cluster name changes: %s", cluster_name_changes)
90
+ logger.info("Sequence shifts (sequences assigned to different clusters): %s", sequence_shifts)
91
+ logger.info("Sequences only in the first file: %s", len(only_in_file1))
92
+ logger.info("Sequences only in the second file: %s", len(only_in_file2))
89
93
 
90
94
  # Write the results to a TSV file
91
95
  with open(output_file, 'w') as out_file:
@@ -93,15 +97,25 @@ def compare_cd_hit_clusters(file1, file2, output_file):
93
97
  for row in tsv_data:
94
98
  out_file.write("\t".join(map(str, row)) + "\n")
95
99
 
96
- print(f"Results have been written to {output_file}")
100
+ logger.info("Results have been written to %s", output_file)
97
101
 
98
102
  def main():
99
- parser = argparse.ArgumentParser(description="Compare two CD-HIT .clstr files to check for clustering consistency.")
103
+ # Early console-only logger so parser.description and argparse messages are emitted via logger
104
+ early_logger = configure_logger("PyamilySeq.Group_Compare", enable_file=False, log_dir=None, verbose=False)
105
+ parser = LoggingArgumentParser(logger_name="PyamilySeq.Group_Compare", description="Running Group-Compare - A tool to compare two CD-HIT .clstr files to check for clustering consistency.")
106
+
100
107
  parser.add_argument("-file1", required=True, help="First CD-HIT .clstr file")
101
108
  parser.add_argument("-file2", required=True, help="Second CD-HIT .clstr file")
102
109
  parser.add_argument("-output", required=True, help="Output file (TSV format)")
110
+ parser.add_argument("--log", action="store_true", dest="log", help="Create a timestamped logfile for this run.")
111
+ parser.add_argument("--log-dir", dest="log_dir", default=None, help="Directory for logfile (default: same dir as -output).")
103
112
  args = parser.parse_args()
104
113
 
114
+ # Setup logger
115
+ out_dir = os.path.abspath(os.path.dirname(args.output)) if args.output else os.getcwd()
116
+ log_dir = args.log_dir if args.log_dir else out_dir
117
+ logger = configure_logger("PyamilySeq.Group_Compare", enable_file=args.log, log_dir=log_dir, verbose=False)
118
+
105
119
  compare_cd_hit_clusters(args.file1, args.file2, args.output)
106
120
 
107
121
  if __name__ == "__main__":
@@ -1,6 +1,12 @@
1
- import argparse
2
1
  import os
3
2
  import csv
3
+ import logging
4
+
5
+ # Use centralised logger factory from constants
6
+ try:
7
+ from .constants import configure_logger, LoggingArgumentParser
8
+ except Exception:
9
+ from constants import configure_logger, LoggingArgumentParser
4
10
 
5
11
 
6
12
  def parse_fasta(fasta_file):
@@ -43,9 +49,8 @@ def parse_csv(csv_file):
43
49
 
44
50
 
45
51
  def write_group_fastas(groups, sequences, output_dir):
46
- """
47
- Writes individual FASTA files for each group with the relevant sequences.
48
- """
52
+
53
+ logger = logging.getLogger("PyamilySeq.Group_Extractor")
49
54
  if not os.path.exists(output_dir):
50
55
  os.makedirs(output_dir)
51
56
 
@@ -56,27 +61,39 @@ def write_group_fastas(groups, sequences, output_dir):
56
61
  if gene_id in sequences:
57
62
  f.write(f">{gene_id}\n{sequences[gene_id]}\n")
58
63
  else:
59
- print(f"Warning: Gene ID {gene_id} not found in FASTA file.")
64
+ logger.warning("Warning: Gene ID %s not found in FASTA file.", gene_id)
60
65
 
61
66
 
62
67
  def main():
63
- parser = argparse.ArgumentParser(description="Process FASTA and CSV files to create grouped FASTA outputs.")
68
+ # Early console-only logger so the parser description is logged before argparse outputs.
69
+ early_logger = configure_logger("PyamilySeq.Group_Extractor", enable_file=False, log_dir=None, verbose=False)
70
+ parser = LoggingArgumentParser(logger_name="PyamilySeq.Group_Extractor", description="Running Group-Extractor - A tool to process FASTA and CSV files to create grouped FASTA outputs.")
71
+
64
72
  parser.add_argument("-fasta", required=True, help="Input FASTA file containing gene sequences.")
65
73
  parser.add_argument("-csv", required=True, help="Input CSV file containing group and gene information.")
66
74
  parser.add_argument("-output_dir", required=True, help="Directory to save the grouped FASTA files.")
75
+ parser.add_argument("--log", action="store_true", dest="log", help="Create a timestamped logfile for this run.")
76
+ parser.add_argument("--log-dir", dest="log_dir", default=None, help="Directory for logfile (default: output_dir).")
67
77
 
68
78
  args = parser.parse_args()
69
79
 
70
- # Parse the input files
71
- print("Parsing FASTA file...")
80
+ # Setup logger writing to output_dir (optional file)
81
+ log_dir = os.path.abspath(args.output_dir) if args.output_dir else os.getcwd()
82
+ if hasattr(args, "log_dir") and args.log_dir:
83
+ log_dir = args.log_dir
84
+ # Only create a logfile when --log is provided; default is console (stdout) only.
85
+ logger = configure_logger("PyamilySeq.Group_Extractor", enable_file=getattr(args, "log", False), log_dir=log_dir, verbose=False)
86
+
87
+ logger.info("Parsing FASTA file: %s", args.fasta)
72
88
  sequences = parse_fasta(args.fasta)
73
- print("Parsing CSV file...")
89
+ logger.info("Parsed %d sequences.", len(sequences))
90
+ logger.info("Parsing CSV file: %s", args.csv)
74
91
  groups = parse_csv(args.csv)
92
+ logger.info("Parsed %d groups.", len(groups))
75
93
 
76
- # Write the grouped FASTA files
77
- print("Writing grouped FASTA files...")
94
+ logger.info("Writing grouped FASTA files to %s", args.output_dir)
78
95
  write_group_fastas(groups, sequences, args.output_dir)
79
- print("Process completed successfully.")
96
+ logger.info("Process completed successfully.")
80
97
 
81
98
 
82
99
  if __name__ == "__main__":
PyamilySeq/Group_Sizes.py CHANGED
@@ -1,6 +1,14 @@
1
- import argparse
1
+
2
2
  import os
3
3
  import csv
4
+ import logging
5
+
6
+
7
+ # Use centralised logger factory from constants
8
+ try:
9
+ from .constants import configure_logger, LoggingArgumentParser
10
+ except Exception:
11
+ from constants import configure_logger, LoggingArgumentParser
4
12
 
5
13
 
6
14
  def parse_fasta_stats(fasta_file):
@@ -43,9 +51,7 @@ def parse_fasta_stats(fasta_file):
43
51
 
44
52
 
45
53
  def process_fasta_directory(input_dir, output_csv):
46
- """
47
- Processes a directory of FASTA files and writes statistics to a CSV file.
48
- """
54
+ logger = logging.getLogger("PyamilySeq.Group_Sizes")
49
55
  results = []
50
56
  for filename in os.listdir(input_dir):
51
57
  if filename.endswith(".fasta"):
@@ -68,19 +74,27 @@ def process_fasta_directory(input_dir, output_csv):
68
74
  writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
69
75
  writer.writeheader()
70
76
  writer.writerows(results)
77
+ logger.info("Wrote statistics for %d FASTA files to %s", len(results), output_csv)
71
78
 
72
79
 
73
80
  def main():
74
- parser = argparse.ArgumentParser(description="Summarize sequence statistics for a directory of FASTA files.")
81
+ # Early console-only logger so the parser.description is emitted via logger before argparse prints usage/help.
82
+ early_logger = configure_logger("PyamilySeq.Group_Sizes", enable_file=False, log_dir=None, verbose=False)
83
+ parser = LoggingArgumentParser(logger_name="PyamilySeq.Group_Sizes", description="Group-Sizes - A tool to summarise sequence statistics for a directory of FASTA files.")
75
84
  parser.add_argument("-input_dir", required=True, help="Directory containing FASTA files.")
76
85
  parser.add_argument("-output_csv", required=True, help="Output CSV file to save statistics.")
86
+ parser.add_argument("--log", action="store_true", dest="log", help="Create a timestamped logfile for this run.")
87
+ parser.add_argument("--log-dir", dest="log_dir", default=None, help="Directory for logfile (default: same dir as -output_csv).")
77
88
 
78
89
  args = parser.parse_args()
79
90
 
80
- # Process the directory of FASTA files
81
- print("Processing FASTA files...")
91
+ out_dir = os.path.abspath(os.path.dirname(args.output_csv)) if args.output_csv else os.getcwd()
92
+ log_dir = args.log_dir if args.log_dir else out_dir
93
+ logger = configure_logger("PyamilySeq.Group_Sizes", enable_file=args.log, log_dir=log_dir, verbose=False)
94
+
95
+ logger.info("Processing FASTA files in %s", args.input_dir)
82
96
  process_fasta_directory(args.input_dir, args.output_csv)
83
- print(f"Statistics saved to {args.output_csv}")
97
+ logger.info("Statistics saved to %s", args.output_csv)
84
98
 
85
99
 
86
100
  if __name__ == "__main__":
@@ -1,6 +1,5 @@
1
-
2
- import argparse
3
1
  from collections import defaultdict, OrderedDict
2
+ import sys
4
3
 
5
4
 
6
5
  try:
@@ -11,6 +10,7 @@ except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
11
10
  from utils import *
12
11
 
13
12
  def run_cd_hit(options, input_file, clustering_output, clustering_mode):
13
+ logger = logging.getLogger("PyamilySeq.Group_Splitter")
14
14
  cdhit_command = [
15
15
  clustering_mode,
16
16
  '-i', input_file,
@@ -24,12 +24,17 @@ def run_cd_hit(options, input_file, clustering_output, clustering_mode):
24
24
  '-sc', "1",
25
25
  '-sf', "1"
26
26
  ]
27
- if options.verbose == True:
28
- subprocess.run(cdhit_command)
29
- else:
30
- subprocess.run(cdhit_command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
27
+ logger.debug("Group-Splitter CD-HIT command: %s", " ".join(cdhit_command))
28
+ try:
29
+ if options.verbose:
30
+ subprocess.run(cdhit_command)
31
+ else:
32
+ subprocess.run(cdhit_command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
33
+ logger.info("CD-HIT completed for %s", input_file)
34
+ except Exception:
35
+ logger.exception("Error running CD-HIT for %s", input_file)
36
+
31
37
 
32
- #'@profile
33
38
  def calculate_new_rep_seq(cluster_data, length_weight=1.0, identity_weight=1.0):
34
39
  total_length = sum(entry['length'] for entry in cluster_data)
35
40
  avg_length = total_length / len(cluster_data)
@@ -75,7 +80,27 @@ def read_fasta_groups(options, groups_to_use):
75
80
  else:
76
81
  affix = '_dna.fasta'
77
82
 
78
- combined_groups_fasta = options.input_directory + '/Gene_Groups_Output/combined_group_sequences' + affix
83
+ # Ensure we look for the combined file that includes the requested group level (e.g. "99")
84
+ # groups_to_use[1] contains the numeric group level when using ('groups', <num>)
85
+ group_level = str(groups_to_use[1]) if groups_to_use and len(groups_to_use) > 1 else ''
86
+ combined_groups_fasta = os.path.join(options.input_directory, 'Gene_Groups_Output',
87
+ f"combined_group_sequences_{group_level}{affix}")
88
+
89
+ # Defensive check: combined_group_sequences_* file must exist (was created by PyamilySeq with -write_groups)
90
+ if not os.path.exists(combined_groups_fasta):
91
+ logger = logging.getLogger("PyamilySeq.Group_Splitter")
92
+ logger.error("Required combined group sequences file not found: %s", combined_groups_fasta)
93
+ logger.error("This usually means the upstream PyamilySeq run did not include the -write_groups and -write_individual_groups options.")
94
+ # Helpful debug info: list contents of Gene_Groups_Output if available
95
+ parent_dir = os.path.dirname(combined_groups_fasta)
96
+ if os.path.isdir(parent_dir):
97
+ try:
98
+ files = os.listdir(parent_dir)
99
+ logger.debug("Files in %s: %s", parent_dir, ", ".join(sorted(files)) if files else "(none)")
100
+ except Exception as e:
101
+ logger.debug("Could not list %s: %s", parent_dir, e)
102
+ # Stop further processing
103
+ sys.exit(1)
79
104
 
80
105
  if groups_to_use[0] == 'ids':
81
106
  selected_group_ids = [int(g.strip()) for g in groups_to_use[1].split(',')]
@@ -334,13 +359,16 @@ def separate_groups(options, clustering_mode, groups_to_use):
334
359
 
335
360
 
336
361
  def main():
337
- parser = argparse.ArgumentParser(description='PyamilySeq ' + PyamilySeq_Version + ': Group-Splitter - A tool to split multi-copy gene groups identified by PyamilySeq.')
362
+ # Early console-only logger so parser.description is emitted via logger before argparse prints usage/help.
363
+ early_logger = configure_logger("PyamilySeq.Group_Splitter", enable_file=False, log_dir=None, verbose=False)
364
+ # Use LoggingArgumentParser so usage/errors are emitted via the configured logger
365
+ parser = LoggingArgumentParser(logger_name="PyamilySeq.Group_Splitter", description='Group-Splitter - A tool to split multi-copy gene groups identified by PyamilySeq.')
338
366
  ### Required Arguments
339
367
  required = parser.add_argument_group('Required Parameters')
340
- required.add_argument('-input_directory', action='store', dest='input_directory',
368
+ required.add_argument('-input_dir', action='store', dest='input_directory',
341
369
  help='Provide the directory of a PyamilySeq run.',
342
370
  required=True)
343
- required.add_argument('-sequence_type', action='store', dest='sequence_type', default='AA',choices=['AA', 'DNA'],
371
+ required.add_argument('-seq_type', action='store', dest='sequence_type', default='AA',choices=['AA', 'DNA'],
344
372
  help='Default - AA: Are groups "DNA" or "AA" sequences?',
345
373
  required=True)
346
374
  required.add_argument('-genome_num', action='store', dest='genome_num', type=int,
@@ -350,7 +378,7 @@ def main():
350
378
 
351
379
  ### Regrouping Arguments
352
380
  regrouping_params = parser.add_argument_group('Regrouping Parameters')
353
- regrouping_params.add_argument('-groups', action="store", dest='groups', type=int, default=None,
381
+ regrouping_params.add_argument('-groups', action="store", dest='groups', type=int, default=99,
354
382
  help='Default - 99: groups to be split by pangenome grouping (see -group_threshold). '
355
383
  'Provide "-groups 99" to split specific groups.',
356
384
  required=False)
@@ -403,10 +431,14 @@ def main():
403
431
  help="Print out version number and exit")
404
432
 
405
433
 
434
+ # Optional file logging flags (must be added before parsing)
435
+ parser.add_argument("--log", action="store_true", dest="log", help="Create a timestamped logfile for this run.")
436
+ parser.add_argument("--log-dir", dest="log_dir", default=None, help="Directory for logfile (default: input_directory).")
406
437
  options = parser.parse_args()
407
- print("Running PyamilySeq: Group-Splitter " + PyamilySeq_Version)
408
-
409
-
438
+ # Compute logfile directory (default to input_directory) and only enable file logging when --log is provided.
439
+ log_dir = options.log_dir if getattr(options, "log_dir", None) else os.path.abspath(options.input_directory)
440
+ logger = configure_logger("PyamilySeq.Group_Splitter", enable_file=getattr(options, "log", False), log_dir=log_dir, verbose=options.verbose)
441
+ logger.info("Running Group-Splitter %s", PyamilySeq_Version)
410
442
 
411
443
  ###External tool checks:
412
444
  ##MAFFT
@@ -416,11 +448,10 @@ def main():
416
448
  print("mafft is installed. Proceeding with alignment.")
417
449
  else:
418
450
  exit("mafft is not installed. Please install mafft to proceed.")
419
- ##CD-HIT
420
451
 
452
+ ##CD-HIT
421
453
  if is_tool_installed('cd-hit'):
422
- if options.verbose == True:
423
- print("cd-hit is installed. Proceeding with clustering.")
454
+ logger.info("cd-hit is installed. Proceeding with clustering.")
424
455
  if options.sequence_type == 'DNA':
425
456
  clustering_mode = 'cd-hit-est'
426
457
  else:
@@ -434,6 +465,7 @@ def main():
434
465
  if options.verbose == True:
435
466
  print("Running CD-HIT in slow mode.")
436
467
  else:
468
+ logger.error("cd-hit is not installed. Please install cd-hit to proceed.")
437
469
  exit("cd-hit is not installed. Please install cd-hit to proceed.")
438
470
 
439
471
  ##Alignment
@@ -451,6 +483,9 @@ def main():
451
483
  if not os.path.exists(sub_groups_output):
452
484
  os.makedirs(sub_groups_output)
453
485
 
486
+ logger.info("Gene groups output: %s", gene_groups_output)
487
+ logger.info("Sub groups output: %s", sub_groups_output)
488
+
454
489
  ## Get Summary Stats
455
490
  summary_file = os.path.join(options.input_directory, 'summary_statistics.txt')
456
491
 
@@ -459,10 +494,9 @@ def main():
459
494
  with open(params_out, "w") as outfile:
460
495
  for arg, value in vars(options).items():
461
496
  outfile.write(f"{arg}: {value}\n")
497
+ logger.info("Saved parameters to %s", params_out)
462
498
 
463
-
464
-
465
- ## Group Selction - FIX THIS - currently fails if either are not provided
499
+ ## Group Selection - FIX THIS - currently fails if either are not provided
466
500
  if options.groups != None and options.group_ids != None:
467
501
  sys.exit('Must provide "-group_ids" or "-groups", not both.')
468
502
  elif options.group_ids != None:
@@ -475,12 +509,9 @@ def main():
475
509
 
476
510
 
477
511
  paralog_groups = separate_groups(options, clustering_mode, groups_to_use)
478
- ###
479
- # Print metrics about paralog groups
480
- print(f"Identified {len(paralog_groups)} paralog groups:")
512
+ logger.info("Identified %d paralog groups", len(paralog_groups))
481
513
  for group_id, data in paralog_groups.items():
482
- print(f"Group ID: {group_id}, Number of new groups: {data['count']}, Sizes: {data['sizes']}")
483
- ###
514
+ logger.debug("Group %s -> new groups: %s sizes: %s", group_id, data['count'], data['sizes'])
484
515
 
485
516
 
486
517
  # Read summary statistics
@@ -509,8 +540,37 @@ def main():
509
540
  # Recalculate each *_core_* value
510
541
  for group_id, data in paralog_groups.items():
511
542
  group_id = group_id.replace('>Group_', '')
512
- original_group = next((f for f in os.listdir(gene_groups_output) if f.endswith(f'_{group_id}.fasta')), None)
513
- original_group = int(original_group.split('_')[2])
543
+ # Find the original group filename in gene_groups_output that:
544
+ # - contains the requested group level (options.groups, e.g. '99')
545
+ # - corresponds to this subgroup id (group_id)
546
+ original_group = None
547
+ for fname in os.listdir(gene_groups_output):
548
+ if not fname.endswith('.fasta'):
549
+ continue
550
+ # Require the filename to include the group level token (e.g., '_99_') to avoid false matches
551
+ if f"_{options.groups}_" not in fname:
552
+ continue
553
+ # Accept filenames that end with _<group_id>.fasta or _<group_id>_dna.fasta/_aa.fasta
554
+ if fname.endswith(f"_{group_id}.fasta") or fname.endswith(f"_{group_id}_dna.fasta") or fname.endswith(f"_{group_id}_aa.fasta"):
555
+ original_group = fname
556
+ break
557
+ if original_group is None:
558
+ # fallback: attempt a looser match (preserve previous behavior)
559
+ for fname in os.listdir(gene_groups_output):
560
+ if fname.endswith(f"_{group_id}.fasta") or fname.endswith(f"_{group_id}_dna.fasta") or fname.endswith(f"_{group_id}_aa.fasta"):
561
+ original_group = fname
562
+ break
563
+ if original_group is None:
564
+ # If still not found, skip recalculation for this paralog group
565
+ logger.warning("Could not find original group file for subgroup id %s in %s", group_id, gene_groups_output)
566
+ continue
567
+ # Extract the core-group number from the filename (expected at index 2: First_core_99_3_dna.fasta)
568
+ try:
569
+ original_group_num = int(original_group.split('_')[2])
570
+ except Exception:
571
+ logger.warning("Unexpected filename format for %s; skipping", original_group)
572
+ continue
573
+ original_group = original_group_num
514
574
  if original_group == 99:
515
575
  new_core_99 -= 1
516
576
  elif original_group == 95:
@@ -554,7 +614,7 @@ def main():
554
614
 
555
615
  # Alignment
556
616
  if options.align_core != None:
557
- print("\n\nProcessing gene group alignment")
617
+ logger.info("Processing gene group alignment")
558
618
  group_directory = options.gene_groups_output
559
619
  sub_group_directory = options.sub_groups_output
560
620
  genome_list = read_genomes_from_fasta(options.gene_groups_output + '/combined_group_sequences_dna.fasta')
@@ -1,4 +1,3 @@
1
- import argparse
2
1
  from collections import OrderedDict, defaultdict
3
2
 
4
3
  try:
@@ -10,7 +9,6 @@ except (ModuleNotFoundError, ImportError, NameError, TypeError):
10
9
 
11
10
 
12
11
  def categorise_percentage(percent):
13
- """Categorise the percentage of genomes with multicopy genes."""
14
12
  categories = {
15
13
  (20, 40): "20-40%",
16
14
  (40, 60): "40-60%",
@@ -26,12 +24,9 @@ def categorise_percentage(percent):
26
24
 
27
25
 
28
26
  def read_cd_hit_output(clustering_output):
29
- """Parse CD-HIT .cluster file and extract clustering information."""
30
27
  clusters = OrderedDict()
31
-
32
28
  with open(clustering_output, 'r') as f:
33
29
  current_cluster_id = None
34
-
35
30
  for line in f:
36
31
  line = line.strip()
37
32
  if line.startswith(">Cluster"):
@@ -43,14 +38,12 @@ def read_cd_hit_output(clustering_output):
43
38
  clustered_info = parts[1]
44
39
  length = int(''.join(c for c in clustered_info.split(',')[0] if c.isdigit()))
45
40
  clustered_header = '>' + clustered_info.split('>')[1].split('...')[0]
46
-
47
41
  if 'at ' in clustered_info and '%' in clustered_info.split('at ')[-1]:
48
42
  percent_identity = extract_identity(clustered_info)
49
43
  elif line.endswith('*'):
50
44
  percent_identity = 100.0
51
45
  else:
52
46
  raise ValueError("Percent identity not found in the string.")
53
-
54
47
  clusters[current_cluster_id].append({
55
48
  'header': clustered_header,
56
49
  'length': length,
@@ -61,22 +54,17 @@ def read_cd_hit_output(clustering_output):
61
54
 
62
55
 
63
56
  def summarise_clusters(options, clusters, output):
64
- """Generate a detailed cluster summary report."""
57
+ logger = logging.getLogger("PyamilySeq.Group_Summary")
65
58
  multicopy_groups = defaultdict(int) # Counter for clusters with multicopy genes
66
-
67
59
  with open(output, 'w') as out_f:
68
60
  out_f.write(
69
- "Cluster_ID\tNum_Sequences\tNum_Genomes\tAvg_Length\tLength_Range\tAvg_Identity\tIdentity_Range\tGenomes_With_Multiple_Genes\tMulticopy_Percentage\n"
70
- )
71
-
61
+ "Cluster_ID\tNum_Sequences\tNum_Genomes\tAvg_Length\tLength_Range\tAvg_Identity\tIdentity_Range\tGenomes_With_Multiple_Genes\tMulticopy_Percentage\n")
72
62
  for cluster_id, seqs in clusters.items():
73
63
  num_seqs = len(seqs)
74
64
  lengths = [seq['length'] for seq in seqs]
75
65
  identities = [seq['percent_identity'] for seq in seqs]
76
-
77
66
  avg_length = sum(lengths) / num_seqs if num_seqs > 0 else 0
78
67
  length_range = f"{min(lengths)}-{max(lengths)}" if num_seqs > 0 else "N/A"
79
-
80
68
  avg_identity = sum(identities) / num_seqs if num_seqs > 0 else 0
81
69
  identity_range = f"{min(identities):.2f}-{max(identities):.2f}" if num_seqs > 0 else "N/A"
82
70
 
@@ -90,7 +78,6 @@ def summarise_clusters(options, clusters, output):
90
78
  num_genomes_with_multiple_genes = sum(1 for count in genome_to_gene_count.values() if count > 1)
91
79
  multicopy_percentage = (num_genomes_with_multiple_genes / options.genome_num) * 100 if options.genome_num > 0 else 0
92
80
 
93
- # Categorize multicopy percentage
94
81
  category = categorise_percentage(multicopy_percentage)
95
82
  if category:
96
83
  multicopy_groups[category] += 1
@@ -104,13 +91,14 @@ def summarise_clusters(options, clusters, output):
104
91
  # Define order for multicopy statistics output
105
92
  category_order = ["20-40%", "40-60%", "60-80%", "80-95%", "95-99%", "99-100%"]
106
93
  for category in category_order:
107
- print(f"Clusters with multicopy genes in {category} range: {multicopy_groups[category]}")
94
+ logger.info("Clusters with multicopy genes in %s range: %s", category, multicopy_groups[category])
108
95
 
109
96
 
110
97
  def main():
111
- """Main function to parse arguments and process clustering files."""
112
- parser = argparse.ArgumentParser(
113
- description='PyamilySeq ' + PyamilySeq_Version + ': Cluster-Summary - A tool to summarise CD-HIT clustering files.')
98
+ # Initial logger setup before parsing arguments (use same logger name as summarise_clusters)
99
+ early_logger = configure_logger("PyamilySeq.Group_Summary", enable_file=False, log_dir=None, verbose=False)
100
+ # Use the LoggingArgumentParser so usage/help/error messages are emitted via the same logger
101
+ parser = LoggingArgumentParser(logger_name="PyamilySeq.Group_Summary", description="Running Group-Summary - A tool to summarise CD-HIT clustering files.")
114
102
 
115
103
  # Required Arguments
116
104
  required = parser.add_argument_group('Required Parameters')
@@ -132,9 +120,18 @@ def main():
132
120
  misc.add_argument("-v", "--version", action="version",
133
121
  version=f"PyamilySeq: Group-Summary version {PyamilySeq_Version} - Exiting",
134
122
  help="Print out version number and exit")
123
+ # Add optional logging flags
124
+ parser.add_argument("--log", action="store_true", dest="log", help="Create a timestamped logfile for this run.")
125
+ parser.add_argument("--log-dir", dest="log_dir", default=None, help="Directory for logfile (default: output_dir or input file dir).")
135
126
 
136
127
  options = parser.parse_args()
137
- print("Running PyamilySeq " + PyamilySeq_Version + ": Group-Summary ")
128
+
129
+ # Setup logger once we know output paths/options
130
+ # after we resolve output_path / options.output_dir:
131
+ resolved_log_dir = options.log_dir if getattr(options, "log_dir", None) else (os.path.abspath(options.output_dir) if getattr(options, "output_dir", None) else os.getcwd())
132
+ logger = configure_logger("PyamilySeq.Group_Summary", enable_file=getattr(options, "log", False), log_dir=resolved_log_dir, verbose=options.verbose)
133
+ if options.verbose:
134
+ logger.debug("Options: %s", vars(options))
138
135
 
139
136
  # File handling
140
137
  options.input_cluster = fix_path(options.input_cluster)
@@ -152,6 +149,7 @@ def main():
152
149
  # Process clusters and generate summary
153
150
  clusters = read_cd_hit_output(options.input_cluster)
154
151
  summarise_clusters(options, clusters, output_file_path)
152
+ logger.info("Summary written to %s", output_file_path)
155
153
 
156
154
 
157
155
  if __name__ == "__main__":