PyamilySeq 1.3.1__tar.gz → 1.3.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28) hide show
  1. {pyamilyseq-1.3.1 → pyamilyseq-1.3.3}/PKG-INFO +14 -14
  2. {pyamilyseq-1.3.1 → pyamilyseq-1.3.3}/README.md +13 -13
  3. {pyamilyseq-1.3.1 → pyamilyseq-1.3.3}/pyproject.toml +5 -6
  4. pyamilyseq-1.3.1/src/PyamilySeq/Cluster_Compare.py → pyamilyseq-1.3.3/src/PyamilySeq/Group_Compare.py +27 -13
  5. {pyamilyseq-1.3.1 → pyamilyseq-1.3.3}/src/PyamilySeq/Group_Extractor.py +29 -12
  6. {pyamilyseq-1.3.1 → pyamilyseq-1.3.3}/src/PyamilySeq/Group_Sizes.py +22 -8
  7. {pyamilyseq-1.3.1 → pyamilyseq-1.3.3}/src/PyamilySeq/Group_Splitter.py +89 -29
  8. pyamilyseq-1.3.1/src/PyamilySeq/Cluster_Summary.py → pyamilyseq-1.3.3/src/PyamilySeq/Group_Summary.py +18 -20
  9. {pyamilyseq-1.3.1 → pyamilyseq-1.3.3}/src/PyamilySeq/PyamilySeq.py +66 -43
  10. {pyamilyseq-1.3.1 → pyamilyseq-1.3.3}/src/PyamilySeq/PyamilySeq_Genus.py +1 -1
  11. {pyamilyseq-1.3.1 → pyamilyseq-1.3.3}/src/PyamilySeq/PyamilySeq_Species.py +30 -63
  12. pyamilyseq-1.3.3/src/PyamilySeq/Seq_Combiner.py +193 -0
  13. {pyamilyseq-1.3.1 → pyamilyseq-1.3.3}/src/PyamilySeq/Seq_Extractor.py +24 -2
  14. {pyamilyseq-1.3.1 → pyamilyseq-1.3.3}/src/PyamilySeq/Seq_Finder.py +20 -2
  15. {pyamilyseq-1.3.1 → pyamilyseq-1.3.3}/src/PyamilySeq/clusterings.py +1 -1
  16. pyamilyseq-1.3.3/src/PyamilySeq/constants.py +143 -0
  17. {pyamilyseq-1.3.1 → pyamilyseq-1.3.3}/src/PyamilySeq/utils.py +171 -84
  18. {pyamilyseq-1.3.1 → pyamilyseq-1.3.3}/src/PyamilySeq.egg-info/PKG-INFO +14 -14
  19. {pyamilyseq-1.3.1 → pyamilyseq-1.3.3}/src/PyamilySeq.egg-info/SOURCES.txt +2 -4
  20. {pyamilyseq-1.3.1 → pyamilyseq-1.3.3}/src/PyamilySeq.egg-info/entry_points.txt +4 -4
  21. pyamilyseq-1.3.1/src/PyamilySeq/Seq_Combiner.py +0 -83
  22. pyamilyseq-1.3.1/src/PyamilySeq/config.py +0 -0
  23. pyamilyseq-1.3.1/src/PyamilySeq/constants.py +0 -2
  24. {pyamilyseq-1.3.1 → pyamilyseq-1.3.3}/LICENSE +0 -0
  25. {pyamilyseq-1.3.1 → pyamilyseq-1.3.3}/setup.cfg +0 -0
  26. {pyamilyseq-1.3.1 → pyamilyseq-1.3.3}/src/PyamilySeq/__init__.py +0 -0
  27. {pyamilyseq-1.3.1 → pyamilyseq-1.3.3}/src/PyamilySeq.egg-info/dependency_links.txt +0 -0
  28. {pyamilyseq-1.3.1 → pyamilyseq-1.3.3}/src/PyamilySeq.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: PyamilySeq
3
- Version: 1.3.1
3
+ Version: 1.3.3
4
4
  Summary: PyamilySeq - A tool to investigate sequence-based gene groups identified by clustering methods such as CD-HIT, DIAMOND, BLAST or MMseqs2.
5
5
  Author-email: Nicholas Dimonaco <nicholas@dimonaco.co.uk>
6
6
  License: GNU GENERAL PUBLIC LICENSE
@@ -720,7 +720,7 @@ To update to the newest version add '-U' to end of the pip install command.
720
720
  ```commandline
721
721
  usage: PyamilySeq.py [-h] {Full,Partial} ...
722
722
 
723
- PyamilySeq v1.3.1: A tool for gene clustering and analysis.
723
+ PyamilySeq v1.3.3: A tool for gene clustering and analysis.
724
724
 
725
725
  positional arguments:
726
726
  {Full,Partial} Choose a mode: 'Full' or 'Partial'.
@@ -750,7 +750,7 @@ Escherichia_coli_110957|ENSB_TIZS9kbTvShDvyX Escherichia_coli_110957|ENSB_TIZS9k
750
750
  ```
751
751
  ### Example output:
752
752
  ```
753
- Running PyamilySeq v1.3.1
753
+ Running PyamilySeq v1.3.3
754
754
  Calculating Groups
755
755
  Number of Genomes: 10
756
756
  Gene Groups
@@ -805,7 +805,7 @@ Total Number of First Gene Groups That Had Additional Second Sequences But Not N
805
805
  ## PyamilySeq is separated into two main 'run modes', Full and Partial. They each have their own set of required and optional arguments.
806
806
  ### PyamilySeq - Full Menu:
807
807
  ```
808
- usage: PyamilySeq.py Full [-h] -output_dir OUTPUT_DIR -input_type {separate,combined,fasta} [-input_dir INPUT_DIR] [-input_fasta INPUT_FASTA] [-name_split_gff NAME_SPLIT_GFF] [-name_split_fasta NAME_SPLIT_FASTA] [-sequence_type {AA,DNA}] [-gene_ident GENE_IDENT] [-c PIDENT] [-s LEN_DIFF] [-fast_mode]
808
+ usage: PyamilySeq.py Full [-h] -output_dir OUTPUT_DIR -input_type {separate,combined,fasta} [-input_dir INPUT_DIR] [-input_fasta INPUT_FASTA] [-name_split_gff NAME_SPLIT_GFF] [-name_split_fasta NAME_SPLIT_FASTA] [-seq_type {AA,DNA}] [-gene_ident GENE_IDENT] [-c PIDENT] [-s LEN_DIFF] [-fast_mode]
809
809
  [-group_mode {Species,Genus}] [-species_groups SPECIES_GROUPS] [-genus_groups GENUS_GROUPS] [-write_groups WRITE_GROUPS] [-write_individual_groups] [-align] [-align_aa] [-no_gpa] [-M MEM] [-T THREADS] [-verbose] [-v]
810
810
 
811
811
  options:
@@ -821,7 +821,7 @@ options:
821
821
  Substring to split filenames and extract genome names for gff files (e.g., '_combined.gff3') - Use with -input_type separate/combined.
822
822
  -name_split_fasta NAME_SPLIT_FASTA
823
823
  Substring to split filenames and extract genome names for fasta files if named differently to paired gff files (e.g., '_dna.fasta') - Use with -input_type separate/combined.
824
- -sequence_type {AA,DNA}
824
+ -seq_type {AA,DNA}
825
825
  Clustering mode: 'DNA' or 'AA'.
826
826
  -gene_ident GENE_IDENT
827
827
  Gene identifiers to extract sequences (e.g., 'CDS, tRNA').
@@ -895,7 +895,7 @@ Seq-Combiner -input_dir .../test_data/genomes -name_split_gff .gff3 -output_dir
895
895
  ```
896
896
  usage: Seq_Combiner.py [-h] -input_dir INPUT_DIR -input_type {separate,combined,fasta} [-name_split_gff NAME_SPLIT_GFF] [-name_split_fasta NAME_SPLIT_FASTA] -output_dir OUTPUT_DIR -output_name OUTPUT_FILE [-gene_ident GENE_IDENT] [-translate] [-v]
897
897
 
898
- PyamilySeq v1.3.1: Seq-Combiner - A tool to extract sequences from GFF/FASTA files and prepare them for PyamilySeq.
898
+ PyamilySeq v1.3.3: Seq-Combiner - A tool to extract sequences from GFF/FASTA files and prepare them for PyamilySeq.
899
899
 
900
900
  options:
901
901
  -h, --help show this help message and exit
@@ -927,18 +927,18 @@ Misc Arguments:
927
927
  ## Group-Splitter: This tool can split multi-copy gene groups using CD-HIT after initial PyamilySeq analysis.
928
928
  ### Example:
929
929
  ```bash
930
- Group-Splitter -genome_num 10 -input_fasta .../test/species/ -output_dir .../test/species/ -sequence_type AA
930
+ Group-Splitter -genome_num 10 -input_fasta .../test/species/ -output_dir .../test/species/ -seq_type AA
931
931
  ```
932
932
  ### Group-Splitter Menu:
933
933
  ```
934
- usage: Group_Splitter.py [-h] -input_fasta INPUT_FASTA -sequence_type {AA,DNA}
934
+ usage: Group_Splitter.py [-h] -input_fasta INPUT_FASTA -seq_type {AA,DNA}
935
935
  -genome_num GENOME_NUM -output_dir OUTPUT_DIR
936
936
  [-groups GROUPS] [-group_threshold GROUP_THRESHOLD]
937
937
  [-c PIDENT] [-s LEN_DIFF] [-T CLUSTERING_THREADS]
938
938
  [-M CLUSTERING_MEMORY] [-no_delete_temp_files]
939
939
  [-verbose] [-v]
940
940
 
941
- PyamilySeq v1.3.1: Group-Splitter - A tool to split multi-copy gene groups
941
+ PyamilySeq v1.3.3: Group-Splitter - A tool to split multi-copy gene groups
942
942
  identified by PyamilySeq.
943
943
 
944
944
  options:
@@ -947,7 +947,7 @@ options:
947
947
  Required Parameters:
948
948
  -input_fasta INPUT_FASTA
949
949
  Input FASTA file containing gene groups.
950
- -sequence_type {AA,DNA}
950
+ -seq_type {AA,DNA}
951
951
  Default - DNA: Are groups "DNA" or "AA" sequences?
952
952
  -genome_num GENOME_NUM
953
953
  The total number of genomes must be provided
@@ -981,17 +981,17 @@ Misc Parameters:
981
981
 
982
982
  ```
983
983
 
984
- ## Cluster-Summary menu: This tool can be used to summarise CD-HIT .clstr files:
984
+ ## Group-Summary menu: This tool can be used to summarise CD-HIT .clstr files:
985
985
  ### Example:
986
986
  ```bash
987
- Cluster-Summary -genome_num 10 -input_clstr .../test_data/species/E-coli/E-coli_extracted_pep_cd-hit_80.clstr -output_tsv .../test_data/species/E-coli/E-coli_extracted_pep_cd-hit_80_Summary.tsv
987
+ Group-Summary -genome_num 10 -input_clstr .../test_data/species/E-coli/E-coli_extracted_pep_cd-hit_80.clstr -output_tsv .../test_data/species/E-coli/E-coli_extracted_pep_cd-hit_80_Summary.tsv
988
988
  ```
989
- ### Cluster-Summary Menu:
989
+ ### Group-Summary Menu:
990
990
  ```
991
991
  usage: Cluster_Summary.py [-h] -input_clstr INPUT_CLSTR -output OUTPUT -genome_num GENOME_NUM
992
992
  [-output_dir OUTPUT_DIR] [-verbose] [-v]
993
993
 
994
- PyamilySeq v1.3.1: Cluster-Summary - A tool to summarise CD-HIT clustering files.
994
+ PyamilySeq v1.3.3: Group-Summary - A tool to summarise CD-HIT clustering files.
995
995
 
996
996
  options:
997
997
  -h, --help show this help message and exit
@@ -29,7 +29,7 @@ To update to the newest version add '-U' to end of the pip install command.
29
29
  ```commandline
30
30
  usage: PyamilySeq.py [-h] {Full,Partial} ...
31
31
 
32
- PyamilySeq v1.3.1: A tool for gene clustering and analysis.
32
+ PyamilySeq v1.3.3: A tool for gene clustering and analysis.
33
33
 
34
34
  positional arguments:
35
35
  {Full,Partial} Choose a mode: 'Full' or 'Partial'.
@@ -59,7 +59,7 @@ Escherichia_coli_110957|ENSB_TIZS9kbTvShDvyX Escherichia_coli_110957|ENSB_TIZS9k
59
59
  ```
60
60
  ### Example output:
61
61
  ```
62
- Running PyamilySeq v1.3.1
62
+ Running PyamilySeq v1.3.3
63
63
  Calculating Groups
64
64
  Number of Genomes: 10
65
65
  Gene Groups
@@ -114,7 +114,7 @@ Total Number of First Gene Groups That Had Additional Second Sequences But Not N
114
114
  ## PyamilySeq is separated into two main 'run modes', Full and Partial. They each have their own set of required and optional arguments.
115
115
  ### PyamilySeq - Full Menu:
116
116
  ```
117
- usage: PyamilySeq.py Full [-h] -output_dir OUTPUT_DIR -input_type {separate,combined,fasta} [-input_dir INPUT_DIR] [-input_fasta INPUT_FASTA] [-name_split_gff NAME_SPLIT_GFF] [-name_split_fasta NAME_SPLIT_FASTA] [-sequence_type {AA,DNA}] [-gene_ident GENE_IDENT] [-c PIDENT] [-s LEN_DIFF] [-fast_mode]
117
+ usage: PyamilySeq.py Full [-h] -output_dir OUTPUT_DIR -input_type {separate,combined,fasta} [-input_dir INPUT_DIR] [-input_fasta INPUT_FASTA] [-name_split_gff NAME_SPLIT_GFF] [-name_split_fasta NAME_SPLIT_FASTA] [-seq_type {AA,DNA}] [-gene_ident GENE_IDENT] [-c PIDENT] [-s LEN_DIFF] [-fast_mode]
118
118
  [-group_mode {Species,Genus}] [-species_groups SPECIES_GROUPS] [-genus_groups GENUS_GROUPS] [-write_groups WRITE_GROUPS] [-write_individual_groups] [-align] [-align_aa] [-no_gpa] [-M MEM] [-T THREADS] [-verbose] [-v]
119
119
 
120
120
  options:
@@ -130,7 +130,7 @@ options:
130
130
  Substring to split filenames and extract genome names for gff files (e.g., '_combined.gff3') - Use with -input_type separate/combined.
131
131
  -name_split_fasta NAME_SPLIT_FASTA
132
132
  Substring to split filenames and extract genome names for fasta files if named differently to paired gff files (e.g., '_dna.fasta') - Use with -input_type separate/combined.
133
- -sequence_type {AA,DNA}
133
+ -seq_type {AA,DNA}
134
134
  Clustering mode: 'DNA' or 'AA'.
135
135
  -gene_ident GENE_IDENT
136
136
  Gene identifiers to extract sequences (e.g., 'CDS, tRNA').
@@ -204,7 +204,7 @@ Seq-Combiner -input_dir .../test_data/genomes -name_split_gff .gff3 -output_dir
204
204
  ```
205
205
  usage: Seq_Combiner.py [-h] -input_dir INPUT_DIR -input_type {separate,combined,fasta} [-name_split_gff NAME_SPLIT_GFF] [-name_split_fasta NAME_SPLIT_FASTA] -output_dir OUTPUT_DIR -output_name OUTPUT_FILE [-gene_ident GENE_IDENT] [-translate] [-v]
206
206
 
207
- PyamilySeq v1.3.1: Seq-Combiner - A tool to extract sequences from GFF/FASTA files and prepare them for PyamilySeq.
207
+ PyamilySeq v1.3.3: Seq-Combiner - A tool to extract sequences from GFF/FASTA files and prepare them for PyamilySeq.
208
208
 
209
209
  options:
210
210
  -h, --help show this help message and exit
@@ -236,18 +236,18 @@ Misc Arguments:
236
236
  ## Group-Splitter: This tool can split multi-copy gene groups using CD-HIT after initial PyamilySeq analysis.
237
237
  ### Example:
238
238
  ```bash
239
- Group-Splitter -genome_num 10 -input_fasta .../test/species/ -output_dir .../test/species/ -sequence_type AA
239
+ Group-Splitter -genome_num 10 -input_fasta .../test/species/ -output_dir .../test/species/ -seq_type AA
240
240
  ```
241
241
  ### Group-Splitter Menu:
242
242
  ```
243
- usage: Group_Splitter.py [-h] -input_fasta INPUT_FASTA -sequence_type {AA,DNA}
243
+ usage: Group_Splitter.py [-h] -input_fasta INPUT_FASTA -seq_type {AA,DNA}
244
244
  -genome_num GENOME_NUM -output_dir OUTPUT_DIR
245
245
  [-groups GROUPS] [-group_threshold GROUP_THRESHOLD]
246
246
  [-c PIDENT] [-s LEN_DIFF] [-T CLUSTERING_THREADS]
247
247
  [-M CLUSTERING_MEMORY] [-no_delete_temp_files]
248
248
  [-verbose] [-v]
249
249
 
250
- PyamilySeq v1.3.1: Group-Splitter - A tool to split multi-copy gene groups
250
+ PyamilySeq v1.3.3: Group-Splitter - A tool to split multi-copy gene groups
251
251
  identified by PyamilySeq.
252
252
 
253
253
  options:
@@ -256,7 +256,7 @@ options:
256
256
  Required Parameters:
257
257
  -input_fasta INPUT_FASTA
258
258
  Input FASTA file containing gene groups.
259
- -sequence_type {AA,DNA}
259
+ -seq_type {AA,DNA}
260
260
  Default - DNA: Are groups "DNA" or "AA" sequences?
261
261
  -genome_num GENOME_NUM
262
262
  The total number of genomes must be provided
@@ -290,17 +290,17 @@ Misc Parameters:
290
290
 
291
291
  ```
292
292
 
293
- ## Cluster-Summary menu: This tool can be used to summarise CD-HIT .clstr files:
293
+ ## Group-Summary menu: This tool can be used to summarise CD-HIT .clstr files:
294
294
  ### Example:
295
295
  ```bash
296
- Cluster-Summary -genome_num 10 -input_clstr .../test_data/species/E-coli/E-coli_extracted_pep_cd-hit_80.clstr -output_tsv .../test_data/species/E-coli/E-coli_extracted_pep_cd-hit_80_Summary.tsv
296
+ Group-Summary -genome_num 10 -input_clstr .../test_data/species/E-coli/E-coli_extracted_pep_cd-hit_80.clstr -output_tsv .../test_data/species/E-coli/E-coli_extracted_pep_cd-hit_80_Summary.tsv
297
297
  ```
298
- ### Cluster-Summary Menu:
298
+ ### Group-Summary Menu:
299
299
  ```
300
300
  usage: Cluster_Summary.py [-h] -input_clstr INPUT_CLSTR -output OUTPUT -genome_num GENOME_NUM
301
301
  [-output_dir OUTPUT_DIR] [-verbose] [-v]
302
302
 
303
- PyamilySeq v1.3.1: Cluster-Summary - A tool to summarise CD-HIT clustering files.
303
+ PyamilySeq v1.3.3: Group-Summary - A tool to summarise CD-HIT clustering files.
304
304
 
305
305
  options:
306
306
  -h, --help show this help message and exit
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "PyamilySeq"
7
- version = "1.3.1"
7
+ version = "1.3.3"
8
8
  authors = [
9
9
  {name = "Nicholas Dimonaco", email = "nicholas@dimonaco.co.uk"}
10
10
  ]
@@ -33,10 +33,10 @@ Homepage = "https://github.com/NickJD/PyamilySeq"
33
33
  seq-combiner = "PyamilySeq.Seq_Combiner:main"
34
34
  Group-Splitter = "PyamilySeq.Group_Splitter:main"
35
35
  group-splitter = "PyamilySeq.Group_Splitter:main"
36
- Cluster-Summary = "PyamilySeq.Cluster_Summary:main"
37
- cluster-summary = "PyamilySeq.Cluster_Summary:main"
38
- Cluster-Extractor = "PyamilySeq.Cluster_Extractor:main"
39
- cluster-extractor = "PyamilySeq.Cluster_Extractor:main"
36
+ Group-Extractor = "PyamilySeq.Group_Extractor:main"
37
+ group-extractor = "PyamilySeq.Group_Extractor:main"
38
+ Group-Summary = "PyamilySeq.Group_Summary:main"
39
+ group-summary = "PyamilySeq.Group_Summary:main"
40
40
  Seq-Finder = "PyamilySeq.Seq_Finder:main"
41
41
  seq-finder = "PyamilySeq.Seq_Finder:main"
42
42
  Seq-Extractor = "PyamilySeq.Seq_Extractor:main"
@@ -56,5 +56,4 @@ include = ["PyamilySeq*"]
56
56
 
57
57
  [tool.setuptools.package-data]
58
58
  PyamilySeq = [
59
-
60
59
  ]
@@ -1,5 +1,12 @@
1
- import argparse
2
1
  from collections import defaultdict
2
+ import logging
3
+ import os
4
+
5
+ # Use centralised logger factory from constants
6
+ try:
7
+ from .constants import configure_logger, LoggingArgumentParser
8
+ except Exception:
9
+ from constants import configure_logger, LoggingArgumentParser
3
10
 
4
11
  def read_cd_hit_output(clstr_file):
5
12
  """
@@ -23,10 +30,8 @@ def read_cd_hit_output(clstr_file):
23
30
  return seq_to_cluster
24
31
 
25
32
  def compare_cd_hit_clusters(file1, file2, output_file):
26
- """
27
- Compares two CD-HIT .clstr files to check if clusters are the same.
28
- Writes the results to a TSV file.
29
- """
33
+ logger = logging.getLogger("PyamilySeq.Group_Compare")
34
+ logger.info("Comparing clusters: %s vs %s", file1, file2)
30
35
  # Read both clustering files
31
36
  clusters1 = read_cd_hit_output(file1)
32
37
  clusters2 = read_cd_hit_output(file2)
@@ -80,12 +85,11 @@ def compare_cd_hit_clusters(file1, file2, output_file):
80
85
  tsv_data.append([seq, cluster_id1, cluster_id2, "Cluster name change"])
81
86
 
82
87
  # Print metrics
83
- print("🔢 Clustering Comparison Metrics:")
84
- print(f"Cluster name changes: {cluster_name_changes}")
85
- print(f"Sequence shifts (sequences assigned to different clusters): {sequence_shifts}")
86
- print(f"Sequences only in the first file: {len(only_in_file1)}")
87
- print(f"Sequences only in the second file: {len(only_in_file2)}")
88
- print()
88
+ logger.info("Clustering Comparison Metrics:")
89
+ logger.info("Cluster name changes: %s", cluster_name_changes)
90
+ logger.info("Sequence shifts (sequences assigned to different clusters): %s", sequence_shifts)
91
+ logger.info("Sequences only in the first file: %s", len(only_in_file1))
92
+ logger.info("Sequences only in the second file: %s", len(only_in_file2))
89
93
 
90
94
  # Write the results to a TSV file
91
95
  with open(output_file, 'w') as out_file:
@@ -93,15 +97,25 @@ def compare_cd_hit_clusters(file1, file2, output_file):
93
97
  for row in tsv_data:
94
98
  out_file.write("\t".join(map(str, row)) + "\n")
95
99
 
96
- print(f"Results have been written to {output_file}")
100
+ logger.info("Results have been written to %s", output_file)
97
101
 
98
102
  def main():
99
- parser = argparse.ArgumentParser(description="Compare two CD-HIT .clstr files to check for clustering consistency.")
103
+ # Early console-only logger so parser.description and argparse messages are emitted via logger
104
+ early_logger = configure_logger("PyamilySeq.Group_Compare", enable_file=False, log_dir=None, verbose=False)
105
+ parser = LoggingArgumentParser(logger_name="PyamilySeq.Group_Compare", description="Running Group-Compare - A tool to compare two CD-HIT .clstr files to check for clustering consistency.")
106
+
100
107
  parser.add_argument("-file1", required=True, help="First CD-HIT .clstr file")
101
108
  parser.add_argument("-file2", required=True, help="Second CD-HIT .clstr file")
102
109
  parser.add_argument("-output", required=True, help="Output file (TSV format)")
110
+ parser.add_argument("--log", action="store_true", dest="log", help="Create a timestamped logfile for this run.")
111
+ parser.add_argument("--log-dir", dest="log_dir", default=None, help="Directory for logfile (default: same dir as -output).")
103
112
  args = parser.parse_args()
104
113
 
114
+ # Setup logger
115
+ out_dir = os.path.abspath(os.path.dirname(args.output)) if args.output else os.getcwd()
116
+ log_dir = args.log_dir if args.log_dir else out_dir
117
+ logger = configure_logger("PyamilySeq.Group_Compare", enable_file=args.log, log_dir=log_dir, verbose=False)
118
+
105
119
  compare_cd_hit_clusters(args.file1, args.file2, args.output)
106
120
 
107
121
  if __name__ == "__main__":
@@ -1,6 +1,12 @@
1
- import argparse
2
1
  import os
3
2
  import csv
3
+ import logging
4
+
5
+ # Use centralised logger factory from constants
6
+ try:
7
+ from .constants import configure_logger, LoggingArgumentParser
8
+ except Exception:
9
+ from constants import configure_logger, LoggingArgumentParser
4
10
 
5
11
 
6
12
  def parse_fasta(fasta_file):
@@ -43,9 +49,8 @@ def parse_csv(csv_file):
43
49
 
44
50
 
45
51
  def write_group_fastas(groups, sequences, output_dir):
46
- """
47
- Writes individual FASTA files for each group with the relevant sequences.
48
- """
52
+
53
+ logger = logging.getLogger("PyamilySeq.Group_Extractor")
49
54
  if not os.path.exists(output_dir):
50
55
  os.makedirs(output_dir)
51
56
 
@@ -56,27 +61,39 @@ def write_group_fastas(groups, sequences, output_dir):
56
61
  if gene_id in sequences:
57
62
  f.write(f">{gene_id}\n{sequences[gene_id]}\n")
58
63
  else:
59
- print(f"Warning: Gene ID {gene_id} not found in FASTA file.")
64
+ logger.warning("Warning: Gene ID %s not found in FASTA file.", gene_id)
60
65
 
61
66
 
62
67
  def main():
63
- parser = argparse.ArgumentParser(description="Process FASTA and CSV files to create grouped FASTA outputs.")
68
+ # Early console-only logger so the parser description is logged before argparse outputs.
69
+ early_logger = configure_logger("PyamilySeq.Group_Extractor", enable_file=False, log_dir=None, verbose=False)
70
+ parser = LoggingArgumentParser(logger_name="PyamilySeq.Group_Extractor", description="Running Group-Extractor - A tool to process FASTA and CSV files to create grouped FASTA outputs.")
71
+
64
72
  parser.add_argument("-fasta", required=True, help="Input FASTA file containing gene sequences.")
65
73
  parser.add_argument("-csv", required=True, help="Input CSV file containing group and gene information.")
66
74
  parser.add_argument("-output_dir", required=True, help="Directory to save the grouped FASTA files.")
75
+ parser.add_argument("--log", action="store_true", dest="log", help="Create a timestamped logfile for this run.")
76
+ parser.add_argument("--log-dir", dest="log_dir", default=None, help="Directory for logfile (default: output_dir).")
67
77
 
68
78
  args = parser.parse_args()
69
79
 
70
- # Parse the input files
71
- print("Parsing FASTA file...")
80
+ # Setup logger writing to output_dir (optional file)
81
+ log_dir = os.path.abspath(args.output_dir) if args.output_dir else os.getcwd()
82
+ if hasattr(args, "log_dir") and args.log_dir:
83
+ log_dir = args.log_dir
84
+ # Only create a logfile when --log is provided; default is console (stdout) only.
85
+ logger = configure_logger("PyamilySeq.Group_Extractor", enable_file=getattr(args, "log", False), log_dir=log_dir, verbose=False)
86
+
87
+ logger.info("Parsing FASTA file: %s", args.fasta)
72
88
  sequences = parse_fasta(args.fasta)
73
- print("Parsing CSV file...")
89
+ logger.info("Parsed %d sequences.", len(sequences))
90
+ logger.info("Parsing CSV file: %s", args.csv)
74
91
  groups = parse_csv(args.csv)
92
+ logger.info("Parsed %d groups.", len(groups))
75
93
 
76
- # Write the grouped FASTA files
77
- print("Writing grouped FASTA files...")
94
+ logger.info("Writing grouped FASTA files to %s", args.output_dir)
78
95
  write_group_fastas(groups, sequences, args.output_dir)
79
- print("Process completed successfully.")
96
+ logger.info("Process completed successfully.")
80
97
 
81
98
 
82
99
  if __name__ == "__main__":
@@ -1,6 +1,14 @@
1
- import argparse
1
+
2
2
  import os
3
3
  import csv
4
+ import logging
5
+
6
+
7
+ # Use centralised logger factory from constants
8
+ try:
9
+ from .constants import configure_logger, LoggingArgumentParser
10
+ except Exception:
11
+ from constants import configure_logger, LoggingArgumentParser
4
12
 
5
13
 
6
14
  def parse_fasta_stats(fasta_file):
@@ -43,9 +51,7 @@ def parse_fasta_stats(fasta_file):
43
51
 
44
52
 
45
53
  def process_fasta_directory(input_dir, output_csv):
46
- """
47
- Processes a directory of FASTA files and writes statistics to a CSV file.
48
- """
54
+ logger = logging.getLogger("PyamilySeq.Group_Sizes")
49
55
  results = []
50
56
  for filename in os.listdir(input_dir):
51
57
  if filename.endswith(".fasta"):
@@ -68,19 +74,27 @@ def process_fasta_directory(input_dir, output_csv):
68
74
  writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
69
75
  writer.writeheader()
70
76
  writer.writerows(results)
77
+ logger.info("Wrote statistics for %d FASTA files to %s", len(results), output_csv)
71
78
 
72
79
 
73
80
  def main():
74
- parser = argparse.ArgumentParser(description="Summarize sequence statistics for a directory of FASTA files.")
81
+ # Early console-only logger so the parser.description is emitted via logger before argparse prints usage/help.
82
+ early_logger = configure_logger("PyamilySeq.Group_Sizes", enable_file=False, log_dir=None, verbose=False)
83
+ parser = LoggingArgumentParser(logger_name="PyamilySeq.Group_Sizes", description="Group-Sizes - A tool to summarise sequence statistics for a directory of FASTA files.")
75
84
  parser.add_argument("-input_dir", required=True, help="Directory containing FASTA files.")
76
85
  parser.add_argument("-output_csv", required=True, help="Output CSV file to save statistics.")
86
+ parser.add_argument("--log", action="store_true", dest="log", help="Create a timestamped logfile for this run.")
87
+ parser.add_argument("--log-dir", dest="log_dir", default=None, help="Directory for logfile (default: same dir as -output_csv).")
77
88
 
78
89
  args = parser.parse_args()
79
90
 
80
- # Process the directory of FASTA files
81
- print("Processing FASTA files...")
91
+ out_dir = os.path.abspath(os.path.dirname(args.output_csv)) if args.output_csv else os.getcwd()
92
+ log_dir = args.log_dir if args.log_dir else out_dir
93
+ logger = configure_logger("PyamilySeq.Group_Sizes", enable_file=args.log, log_dir=log_dir, verbose=False)
94
+
95
+ logger.info("Processing FASTA files in %s", args.input_dir)
82
96
  process_fasta_directory(args.input_dir, args.output_csv)
83
- print(f"Statistics saved to {args.output_csv}")
97
+ logger.info("Statistics saved to %s", args.output_csv)
84
98
 
85
99
 
86
100
  if __name__ == "__main__":