PyamilySeq 1.3.2__tar.gz → 1.3.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pyamilyseq-1.3.2 → pyamilyseq-1.3.3}/PKG-INFO +11 -11
- {pyamilyseq-1.3.2 → pyamilyseq-1.3.3}/README.md +10 -10
- {pyamilyseq-1.3.2 → pyamilyseq-1.3.3}/pyproject.toml +1 -2
- {pyamilyseq-1.3.2 → pyamilyseq-1.3.3}/src/PyamilySeq/Group_Compare.py +27 -13
- {pyamilyseq-1.3.2 → pyamilyseq-1.3.3}/src/PyamilySeq/Group_Extractor.py +29 -12
- {pyamilyseq-1.3.2 → pyamilyseq-1.3.3}/src/PyamilySeq/Group_Sizes.py +22 -8
- {pyamilyseq-1.3.2 → pyamilyseq-1.3.3}/src/PyamilySeq/Group_Splitter.py +89 -29
- {pyamilyseq-1.3.2 → pyamilyseq-1.3.3}/src/PyamilySeq/Group_Summary.py +18 -20
- {pyamilyseq-1.3.2 → pyamilyseq-1.3.3}/src/PyamilySeq/PyamilySeq.py +66 -43
- {pyamilyseq-1.3.2 → pyamilyseq-1.3.3}/src/PyamilySeq/PyamilySeq_Genus.py +1 -1
- {pyamilyseq-1.3.2 → pyamilyseq-1.3.3}/src/PyamilySeq/PyamilySeq_Species.py +30 -63
- pyamilyseq-1.3.3/src/PyamilySeq/Seq_Combiner.py +193 -0
- {pyamilyseq-1.3.2 → pyamilyseq-1.3.3}/src/PyamilySeq/Seq_Extractor.py +24 -2
- {pyamilyseq-1.3.2 → pyamilyseq-1.3.3}/src/PyamilySeq/Seq_Finder.py +20 -2
- {pyamilyseq-1.3.2 → pyamilyseq-1.3.3}/src/PyamilySeq/clusterings.py +1 -1
- pyamilyseq-1.3.3/src/PyamilySeq/constants.py +143 -0
- {pyamilyseq-1.3.2 → pyamilyseq-1.3.3}/src/PyamilySeq/utils.py +171 -84
- {pyamilyseq-1.3.2 → pyamilyseq-1.3.3}/src/PyamilySeq.egg-info/PKG-INFO +11 -11
- {pyamilyseq-1.3.2 → pyamilyseq-1.3.3}/src/PyamilySeq.egg-info/SOURCES.txt +0 -1
- pyamilyseq-1.3.2/src/PyamilySeq/Seq_Combiner.py +0 -83
- pyamilyseq-1.3.2/src/PyamilySeq/config.py +0 -0
- pyamilyseq-1.3.2/src/PyamilySeq/constants.py +0 -2
- {pyamilyseq-1.3.2 → pyamilyseq-1.3.3}/LICENSE +0 -0
- {pyamilyseq-1.3.2 → pyamilyseq-1.3.3}/setup.cfg +0 -0
- {pyamilyseq-1.3.2 → pyamilyseq-1.3.3}/src/PyamilySeq/__init__.py +0 -0
- {pyamilyseq-1.3.2 → pyamilyseq-1.3.3}/src/PyamilySeq.egg-info/dependency_links.txt +0 -0
- {pyamilyseq-1.3.2 → pyamilyseq-1.3.3}/src/PyamilySeq.egg-info/entry_points.txt +0 -0
- {pyamilyseq-1.3.2 → pyamilyseq-1.3.3}/src/PyamilySeq.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: PyamilySeq
|
|
3
|
-
Version: 1.3.
|
|
3
|
+
Version: 1.3.3
|
|
4
4
|
Summary: PyamilySeq - A tool to investigate sequence-based gene groups identified by clustering methods such as CD-HIT, DIAMOND, BLAST or MMseqs2.
|
|
5
5
|
Author-email: Nicholas Dimonaco <nicholas@dimonaco.co.uk>
|
|
6
6
|
License: GNU GENERAL PUBLIC LICENSE
|
|
@@ -720,7 +720,7 @@ To update to the newest version add '-U' to end of the pip install command.
|
|
|
720
720
|
```commandline
|
|
721
721
|
usage: PyamilySeq.py [-h] {Full,Partial} ...
|
|
722
722
|
|
|
723
|
-
PyamilySeq v1.3.
|
|
723
|
+
PyamilySeq v1.3.3: A tool for gene clustering and analysis.
|
|
724
724
|
|
|
725
725
|
positional arguments:
|
|
726
726
|
{Full,Partial} Choose a mode: 'Full' or 'Partial'.
|
|
@@ -750,7 +750,7 @@ Escherichia_coli_110957|ENSB_TIZS9kbTvShDvyX Escherichia_coli_110957|ENSB_TIZS9k
|
|
|
750
750
|
```
|
|
751
751
|
### Example output:
|
|
752
752
|
```
|
|
753
|
-
Running PyamilySeq v1.3.
|
|
753
|
+
Running PyamilySeq v1.3.3
|
|
754
754
|
Calculating Groups
|
|
755
755
|
Number of Genomes: 10
|
|
756
756
|
Gene Groups
|
|
@@ -805,7 +805,7 @@ Total Number of First Gene Groups That Had Additional Second Sequences But Not N
|
|
|
805
805
|
## PyamilySeq is separated into two main 'run modes', Full and Partial. They each have their own set of required and optional arguments.
|
|
806
806
|
### PyamilySeq - Full Menu:
|
|
807
807
|
```
|
|
808
|
-
usage: PyamilySeq.py Full [-h] -output_dir OUTPUT_DIR -input_type {separate,combined,fasta} [-input_dir INPUT_DIR] [-input_fasta INPUT_FASTA] [-name_split_gff NAME_SPLIT_GFF] [-name_split_fasta NAME_SPLIT_FASTA] [-
|
|
808
|
+
usage: PyamilySeq.py Full [-h] -output_dir OUTPUT_DIR -input_type {separate,combined,fasta} [-input_dir INPUT_DIR] [-input_fasta INPUT_FASTA] [-name_split_gff NAME_SPLIT_GFF] [-name_split_fasta NAME_SPLIT_FASTA] [-seq_type {AA,DNA}] [-gene_ident GENE_IDENT] [-c PIDENT] [-s LEN_DIFF] [-fast_mode]
|
|
809
809
|
[-group_mode {Species,Genus}] [-species_groups SPECIES_GROUPS] [-genus_groups GENUS_GROUPS] [-write_groups WRITE_GROUPS] [-write_individual_groups] [-align] [-align_aa] [-no_gpa] [-M MEM] [-T THREADS] [-verbose] [-v]
|
|
810
810
|
|
|
811
811
|
options:
|
|
@@ -821,7 +821,7 @@ options:
|
|
|
821
821
|
Substring to split filenames and extract genome names for gff files (e.g., '_combined.gff3') - Use with -input_type separate/combined.
|
|
822
822
|
-name_split_fasta NAME_SPLIT_FASTA
|
|
823
823
|
Substring to split filenames and extract genome names for fasta files if named differently to paired gff files (e.g., '_dna.fasta') - Use with -input_type separate/combined.
|
|
824
|
-
-
|
|
824
|
+
-seq_type {AA,DNA}
|
|
825
825
|
Clustering mode: 'DNA' or 'AA'.
|
|
826
826
|
-gene_ident GENE_IDENT
|
|
827
827
|
Gene identifiers to extract sequences (e.g., 'CDS, tRNA').
|
|
@@ -895,7 +895,7 @@ Seq-Combiner -input_dir .../test_data/genomes -name_split_gff .gff3 -output_dir
|
|
|
895
895
|
```
|
|
896
896
|
usage: Seq_Combiner.py [-h] -input_dir INPUT_DIR -input_type {separate,combined,fasta} [-name_split_gff NAME_SPLIT_GFF] [-name_split_fasta NAME_SPLIT_FASTA] -output_dir OUTPUT_DIR -output_name OUTPUT_FILE [-gene_ident GENE_IDENT] [-translate] [-v]
|
|
897
897
|
|
|
898
|
-
PyamilySeq v1.3.
|
|
898
|
+
PyamilySeq v1.3.3: Seq-Combiner - A tool to extract sequences from GFF/FASTA files and prepare them for PyamilySeq.
|
|
899
899
|
|
|
900
900
|
options:
|
|
901
901
|
-h, --help show this help message and exit
|
|
@@ -927,18 +927,18 @@ Misc Arguments:
|
|
|
927
927
|
## Group-Splitter: This tool can split multi-copy gene groups using CD-HIT after initial PyamilySeq analysis.
|
|
928
928
|
### Example:
|
|
929
929
|
```bash
|
|
930
|
-
Group-Splitter -genome_num 10 -input_fasta .../test/species/ -output_dir .../test/species/ -
|
|
930
|
+
Group-Splitter -genome_num 10 -input_fasta .../test/species/ -output_dir .../test/species/ -seq_type AA
|
|
931
931
|
```
|
|
932
932
|
### Group-Splitter Menu:
|
|
933
933
|
```
|
|
934
|
-
usage: Group_Splitter.py [-h] -input_fasta INPUT_FASTA -
|
|
934
|
+
usage: Group_Splitter.py [-h] -input_fasta INPUT_FASTA -seq_type {AA,DNA}
|
|
935
935
|
-genome_num GENOME_NUM -output_dir OUTPUT_DIR
|
|
936
936
|
[-groups GROUPS] [-group_threshold GROUP_THRESHOLD]
|
|
937
937
|
[-c PIDENT] [-s LEN_DIFF] [-T CLUSTERING_THREADS]
|
|
938
938
|
[-M CLUSTERING_MEMORY] [-no_delete_temp_files]
|
|
939
939
|
[-verbose] [-v]
|
|
940
940
|
|
|
941
|
-
PyamilySeq v1.3.
|
|
941
|
+
PyamilySeq v1.3.3: Group-Splitter - A tool to split multi-copy gene groups
|
|
942
942
|
identified by PyamilySeq.
|
|
943
943
|
|
|
944
944
|
options:
|
|
@@ -947,7 +947,7 @@ options:
|
|
|
947
947
|
Required Parameters:
|
|
948
948
|
-input_fasta INPUT_FASTA
|
|
949
949
|
Input FASTA file containing gene groups.
|
|
950
|
-
-
|
|
950
|
+
-seq_type {AA,DNA}
|
|
951
951
|
Default - AA: Are groups "DNA" or "AA" sequences?
|
|
952
952
|
-genome_num GENOME_NUM
|
|
953
953
|
The total number of genomes must be provided
|
|
@@ -991,7 +991,7 @@ Group-Summary -genome_num 10 -input_clstr .../test_data/species/E-coli/E-coli_ex
|
|
|
991
991
|
usage: Cluster_Summary.py [-h] -input_clstr INPUT_CLSTR -output OUTPUT -genome_num GENOME_NUM
|
|
992
992
|
[-output_dir OUTPUT_DIR] [-verbose] [-v]
|
|
993
993
|
|
|
994
|
-
PyamilySeq v1.3.
|
|
994
|
+
PyamilySeq v1.3.3: Group-Summary - A tool to summarise CD-HIT clustering files.
|
|
995
995
|
|
|
996
996
|
options:
|
|
997
997
|
-h, --help show this help message and exit
|
|
@@ -29,7 +29,7 @@ To update to the newest version add '-U' to end of the pip install command.
|
|
|
29
29
|
```commandline
|
|
30
30
|
usage: PyamilySeq.py [-h] {Full,Partial} ...
|
|
31
31
|
|
|
32
|
-
PyamilySeq v1.3.
|
|
32
|
+
PyamilySeq v1.3.3: A tool for gene clustering and analysis.
|
|
33
33
|
|
|
34
34
|
positional arguments:
|
|
35
35
|
{Full,Partial} Choose a mode: 'Full' or 'Partial'.
|
|
@@ -59,7 +59,7 @@ Escherichia_coli_110957|ENSB_TIZS9kbTvShDvyX Escherichia_coli_110957|ENSB_TIZS9k
|
|
|
59
59
|
```
|
|
60
60
|
### Example output:
|
|
61
61
|
```
|
|
62
|
-
Running PyamilySeq v1.3.
|
|
62
|
+
Running PyamilySeq v1.3.3
|
|
63
63
|
Calculating Groups
|
|
64
64
|
Number of Genomes: 10
|
|
65
65
|
Gene Groups
|
|
@@ -114,7 +114,7 @@ Total Number of First Gene Groups That Had Additional Second Sequences But Not N
|
|
|
114
114
|
## PyamilySeq is separated into two main 'run modes', Full and Partial. They each have their own set of required and optional arguments.
|
|
115
115
|
### PyamilySeq - Full Menu:
|
|
116
116
|
```
|
|
117
|
-
usage: PyamilySeq.py Full [-h] -output_dir OUTPUT_DIR -input_type {separate,combined,fasta} [-input_dir INPUT_DIR] [-input_fasta INPUT_FASTA] [-name_split_gff NAME_SPLIT_GFF] [-name_split_fasta NAME_SPLIT_FASTA] [-
|
|
117
|
+
usage: PyamilySeq.py Full [-h] -output_dir OUTPUT_DIR -input_type {separate,combined,fasta} [-input_dir INPUT_DIR] [-input_fasta INPUT_FASTA] [-name_split_gff NAME_SPLIT_GFF] [-name_split_fasta NAME_SPLIT_FASTA] [-seq_type {AA,DNA}] [-gene_ident GENE_IDENT] [-c PIDENT] [-s LEN_DIFF] [-fast_mode]
|
|
118
118
|
[-group_mode {Species,Genus}] [-species_groups SPECIES_GROUPS] [-genus_groups GENUS_GROUPS] [-write_groups WRITE_GROUPS] [-write_individual_groups] [-align] [-align_aa] [-no_gpa] [-M MEM] [-T THREADS] [-verbose] [-v]
|
|
119
119
|
|
|
120
120
|
options:
|
|
@@ -130,7 +130,7 @@ options:
|
|
|
130
130
|
Substring to split filenames and extract genome names for gff files (e.g., '_combined.gff3') - Use with -input_type separate/combined.
|
|
131
131
|
-name_split_fasta NAME_SPLIT_FASTA
|
|
132
132
|
Substring to split filenames and extract genome names for fasta files if named differently to paired gff files (e.g., '_dna.fasta') - Use with -input_type separate/combined.
|
|
133
|
-
-
|
|
133
|
+
-seq_type {AA,DNA}
|
|
134
134
|
Clustering mode: 'DNA' or 'AA'.
|
|
135
135
|
-gene_ident GENE_IDENT
|
|
136
136
|
Gene identifiers to extract sequences (e.g., 'CDS, tRNA').
|
|
@@ -204,7 +204,7 @@ Seq-Combiner -input_dir .../test_data/genomes -name_split_gff .gff3 -output_dir
|
|
|
204
204
|
```
|
|
205
205
|
usage: Seq_Combiner.py [-h] -input_dir INPUT_DIR -input_type {separate,combined,fasta} [-name_split_gff NAME_SPLIT_GFF] [-name_split_fasta NAME_SPLIT_FASTA] -output_dir OUTPUT_DIR -output_name OUTPUT_FILE [-gene_ident GENE_IDENT] [-translate] [-v]
|
|
206
206
|
|
|
207
|
-
PyamilySeq v1.3.
|
|
207
|
+
PyamilySeq v1.3.3: Seq-Combiner - A tool to extract sequences from GFF/FASTA files and prepare them for PyamilySeq.
|
|
208
208
|
|
|
209
209
|
options:
|
|
210
210
|
-h, --help show this help message and exit
|
|
@@ -236,18 +236,18 @@ Misc Arguments:
|
|
|
236
236
|
## Group-Splitter: This tool can split multi-copy gene groups using CD-HIT after initial PyamilySeq analysis.
|
|
237
237
|
### Example:
|
|
238
238
|
```bash
|
|
239
|
-
Group-Splitter -genome_num 10 -input_fasta .../test/species/ -output_dir .../test/species/ -
|
|
239
|
+
Group-Splitter -genome_num 10 -input_fasta .../test/species/ -output_dir .../test/species/ -seq_type AA
|
|
240
240
|
```
|
|
241
241
|
### Group-Splitter Menu:
|
|
242
242
|
```
|
|
243
|
-
usage: Group_Splitter.py [-h] -input_fasta INPUT_FASTA -
|
|
243
|
+
usage: Group_Splitter.py [-h] -input_fasta INPUT_FASTA -seq_type {AA,DNA}
|
|
244
244
|
-genome_num GENOME_NUM -output_dir OUTPUT_DIR
|
|
245
245
|
[-groups GROUPS] [-group_threshold GROUP_THRESHOLD]
|
|
246
246
|
[-c PIDENT] [-s LEN_DIFF] [-T CLUSTERING_THREADS]
|
|
247
247
|
[-M CLUSTERING_MEMORY] [-no_delete_temp_files]
|
|
248
248
|
[-verbose] [-v]
|
|
249
249
|
|
|
250
|
-
PyamilySeq v1.3.
|
|
250
|
+
PyamilySeq v1.3.3: Group-Splitter - A tool to split multi-copy gene groups
|
|
251
251
|
identified by PyamilySeq.
|
|
252
252
|
|
|
253
253
|
options:
|
|
@@ -256,7 +256,7 @@ options:
|
|
|
256
256
|
Required Parameters:
|
|
257
257
|
-input_fasta INPUT_FASTA
|
|
258
258
|
Input FASTA file containing gene groups.
|
|
259
|
-
-
|
|
259
|
+
-seq_type {AA,DNA}
|
|
260
260
|
Default - AA: Are groups "DNA" or "AA" sequences?
|
|
261
261
|
-genome_num GENOME_NUM
|
|
262
262
|
The total number of genomes must be provided
|
|
@@ -300,7 +300,7 @@ Group-Summary -genome_num 10 -input_clstr .../test_data/species/E-coli/E-coli_ex
|
|
|
300
300
|
usage: Cluster_Summary.py [-h] -input_clstr INPUT_CLSTR -output OUTPUT -genome_num GENOME_NUM
|
|
301
301
|
[-output_dir OUTPUT_DIR] [-verbose] [-v]
|
|
302
302
|
|
|
303
|
-
PyamilySeq v1.3.
|
|
303
|
+
PyamilySeq v1.3.3: Group-Summary - A tool to summarise CD-HIT clustering files.
|
|
304
304
|
|
|
305
305
|
options:
|
|
306
306
|
-h, --help show this help message and exit
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "PyamilySeq"
|
|
7
|
-
version = "1.3.
|
|
7
|
+
version = "1.3.3"
|
|
8
8
|
authors = [
|
|
9
9
|
{name = "Nicholas Dimonaco", email = "nicholas@dimonaco.co.uk"}
|
|
10
10
|
]
|
|
@@ -56,5 +56,4 @@ include = ["PyamilySeq*"]
|
|
|
56
56
|
|
|
57
57
|
[tool.setuptools.package-data]
|
|
58
58
|
PyamilySeq = [
|
|
59
|
-
|
|
60
59
|
]
|
|
@@ -1,5 +1,12 @@
|
|
|
1
|
-
import argparse
|
|
2
1
|
from collections import defaultdict
|
|
2
|
+
import logging
|
|
3
|
+
import os
|
|
4
|
+
|
|
5
|
+
# Use centralised logger factory from constants
|
|
6
|
+
try:
|
|
7
|
+
from .constants import configure_logger, LoggingArgumentParser
|
|
8
|
+
except Exception:
|
|
9
|
+
from constants import configure_logger, LoggingArgumentParser
|
|
3
10
|
|
|
4
11
|
def read_cd_hit_output(clstr_file):
|
|
5
12
|
"""
|
|
@@ -23,10 +30,8 @@ def read_cd_hit_output(clstr_file):
|
|
|
23
30
|
return seq_to_cluster
|
|
24
31
|
|
|
25
32
|
def compare_cd_hit_clusters(file1, file2, output_file):
|
|
26
|
-
""
|
|
27
|
-
|
|
28
|
-
Writes the results to a TSV file.
|
|
29
|
-
"""
|
|
33
|
+
logger = logging.getLogger("PyamilySeq.Group_Compare")
|
|
34
|
+
logger.info("Comparing clusters: %s vs %s", file1, file2)
|
|
30
35
|
# Read both clustering files
|
|
31
36
|
clusters1 = read_cd_hit_output(file1)
|
|
32
37
|
clusters2 = read_cd_hit_output(file2)
|
|
@@ -80,12 +85,11 @@ def compare_cd_hit_clusters(file1, file2, output_file):
|
|
|
80
85
|
tsv_data.append([seq, cluster_id1, cluster_id2, "Cluster name change"])
|
|
81
86
|
|
|
82
87
|
# Print metrics
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
print()
|
|
88
|
+
logger.info("Clustering Comparison Metrics:")
|
|
89
|
+
logger.info("Cluster name changes: %s", cluster_name_changes)
|
|
90
|
+
logger.info("Sequence shifts (sequences assigned to different clusters): %s", sequence_shifts)
|
|
91
|
+
logger.info("Sequences only in the first file: %s", len(only_in_file1))
|
|
92
|
+
logger.info("Sequences only in the second file: %s", len(only_in_file2))
|
|
89
93
|
|
|
90
94
|
# Write the results to a TSV file
|
|
91
95
|
with open(output_file, 'w') as out_file:
|
|
@@ -93,15 +97,25 @@ def compare_cd_hit_clusters(file1, file2, output_file):
|
|
|
93
97
|
for row in tsv_data:
|
|
94
98
|
out_file.write("\t".join(map(str, row)) + "\n")
|
|
95
99
|
|
|
96
|
-
|
|
100
|
+
logger.info("Results have been written to %s", output_file)
|
|
97
101
|
|
|
98
102
|
def main():
|
|
99
|
-
|
|
103
|
+
# Early console-only logger so parser.description and argparse messages are emitted via logger
|
|
104
|
+
early_logger = configure_logger("PyamilySeq.Group_Compare", enable_file=False, log_dir=None, verbose=False)
|
|
105
|
+
parser = LoggingArgumentParser(logger_name="PyamilySeq.Group_Compare", description="Running Group-Compare - A tool to compare two CD-HIT .clstr files to check for clustering consistency.")
|
|
106
|
+
|
|
100
107
|
parser.add_argument("-file1", required=True, help="First CD-HIT .clstr file")
|
|
101
108
|
parser.add_argument("-file2", required=True, help="Second CD-HIT .clstr file")
|
|
102
109
|
parser.add_argument("-output", required=True, help="Output file (TSV format)")
|
|
110
|
+
parser.add_argument("--log", action="store_true", dest="log", help="Create a timestamped logfile for this run.")
|
|
111
|
+
parser.add_argument("--log-dir", dest="log_dir", default=None, help="Directory for logfile (default: same dir as -output).")
|
|
103
112
|
args = parser.parse_args()
|
|
104
113
|
|
|
114
|
+
# Setup logger
|
|
115
|
+
out_dir = os.path.abspath(os.path.dirname(args.output)) if args.output else os.getcwd()
|
|
116
|
+
log_dir = args.log_dir if args.log_dir else out_dir
|
|
117
|
+
logger = configure_logger("PyamilySeq.Group_Compare", enable_file=args.log, log_dir=log_dir, verbose=False)
|
|
118
|
+
|
|
105
119
|
compare_cd_hit_clusters(args.file1, args.file2, args.output)
|
|
106
120
|
|
|
107
121
|
if __name__ == "__main__":
|
|
@@ -1,6 +1,12 @@
|
|
|
1
|
-
import argparse
|
|
2
1
|
import os
|
|
3
2
|
import csv
|
|
3
|
+
import logging
|
|
4
|
+
|
|
5
|
+
# Use centralised logger factory from constants
|
|
6
|
+
try:
|
|
7
|
+
from .constants import configure_logger, LoggingArgumentParser
|
|
8
|
+
except Exception:
|
|
9
|
+
from constants import configure_logger, LoggingArgumentParser
|
|
4
10
|
|
|
5
11
|
|
|
6
12
|
def parse_fasta(fasta_file):
|
|
@@ -43,9 +49,8 @@ def parse_csv(csv_file):
|
|
|
43
49
|
|
|
44
50
|
|
|
45
51
|
def write_group_fastas(groups, sequences, output_dir):
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
"""
|
|
52
|
+
|
|
53
|
+
logger = logging.getLogger("PyamilySeq.Group_Extractor")
|
|
49
54
|
if not os.path.exists(output_dir):
|
|
50
55
|
os.makedirs(output_dir)
|
|
51
56
|
|
|
@@ -56,27 +61,39 @@ def write_group_fastas(groups, sequences, output_dir):
|
|
|
56
61
|
if gene_id in sequences:
|
|
57
62
|
f.write(f">{gene_id}\n{sequences[gene_id]}\n")
|
|
58
63
|
else:
|
|
59
|
-
|
|
64
|
+
logger.warning("Warning: Gene ID %s not found in FASTA file.", gene_id)
|
|
60
65
|
|
|
61
66
|
|
|
62
67
|
def main():
|
|
63
|
-
|
|
68
|
+
# Early console-only logger so the parser description is logged before argparse outputs.
|
|
69
|
+
early_logger = configure_logger("PyamilySeq.Group_Extractor", enable_file=False, log_dir=None, verbose=False)
|
|
70
|
+
parser = LoggingArgumentParser(logger_name="PyamilySeq.Group_Extractor", description="Running Group-Extractor - A tool to process FASTA and CSV files to create grouped FASTA outputs.")
|
|
71
|
+
|
|
64
72
|
parser.add_argument("-fasta", required=True, help="Input FASTA file containing gene sequences.")
|
|
65
73
|
parser.add_argument("-csv", required=True, help="Input CSV file containing group and gene information.")
|
|
66
74
|
parser.add_argument("-output_dir", required=True, help="Directory to save the grouped FASTA files.")
|
|
75
|
+
parser.add_argument("--log", action="store_true", dest="log", help="Create a timestamped logfile for this run.")
|
|
76
|
+
parser.add_argument("--log-dir", dest="log_dir", default=None, help="Directory for logfile (default: output_dir).")
|
|
67
77
|
|
|
68
78
|
args = parser.parse_args()
|
|
69
79
|
|
|
70
|
-
#
|
|
71
|
-
|
|
80
|
+
# Setup logger writing to output_dir (optional file)
|
|
81
|
+
log_dir = os.path.abspath(args.output_dir) if args.output_dir else os.getcwd()
|
|
82
|
+
if hasattr(args, "log_dir") and args.log_dir:
|
|
83
|
+
log_dir = args.log_dir
|
|
84
|
+
# Only create a logfile when --log is provided; default is console (stdout) only.
|
|
85
|
+
logger = configure_logger("PyamilySeq.Group_Extractor", enable_file=getattr(args, "log", False), log_dir=log_dir, verbose=False)
|
|
86
|
+
|
|
87
|
+
logger.info("Parsing FASTA file: %s", args.fasta)
|
|
72
88
|
sequences = parse_fasta(args.fasta)
|
|
73
|
-
|
|
89
|
+
logger.info("Parsed %d sequences.", len(sequences))
|
|
90
|
+
logger.info("Parsing CSV file: %s", args.csv)
|
|
74
91
|
groups = parse_csv(args.csv)
|
|
92
|
+
logger.info("Parsed %d groups.", len(groups))
|
|
75
93
|
|
|
76
|
-
|
|
77
|
-
print("Writing grouped FASTA files...")
|
|
94
|
+
logger.info("Writing grouped FASTA files to %s", args.output_dir)
|
|
78
95
|
write_group_fastas(groups, sequences, args.output_dir)
|
|
79
|
-
|
|
96
|
+
logger.info("Process completed successfully.")
|
|
80
97
|
|
|
81
98
|
|
|
82
99
|
if __name__ == "__main__":
|
|
@@ -1,6 +1,14 @@
|
|
|
1
|
-
|
|
1
|
+
|
|
2
2
|
import os
|
|
3
3
|
import csv
|
|
4
|
+
import logging
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
# Use centralised logger factory from constants
|
|
8
|
+
try:
|
|
9
|
+
from .constants import configure_logger, LoggingArgumentParser
|
|
10
|
+
except Exception:
|
|
11
|
+
from constants import configure_logger, LoggingArgumentParser
|
|
4
12
|
|
|
5
13
|
|
|
6
14
|
def parse_fasta_stats(fasta_file):
|
|
@@ -43,9 +51,7 @@ def parse_fasta_stats(fasta_file):
|
|
|
43
51
|
|
|
44
52
|
|
|
45
53
|
def process_fasta_directory(input_dir, output_csv):
|
|
46
|
-
""
|
|
47
|
-
Processes a directory of FASTA files and writes statistics to a CSV file.
|
|
48
|
-
"""
|
|
54
|
+
logger = logging.getLogger("PyamilySeq.Group_Sizes")
|
|
49
55
|
results = []
|
|
50
56
|
for filename in os.listdir(input_dir):
|
|
51
57
|
if filename.endswith(".fasta"):
|
|
@@ -68,19 +74,27 @@ def process_fasta_directory(input_dir, output_csv):
|
|
|
68
74
|
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
|
|
69
75
|
writer.writeheader()
|
|
70
76
|
writer.writerows(results)
|
|
77
|
+
logger.info("Wrote statistics for %d FASTA files to %s", len(results), output_csv)
|
|
71
78
|
|
|
72
79
|
|
|
73
80
|
def main():
|
|
74
|
-
|
|
81
|
+
# Early console-only logger so the parser.description is emitted via logger before argparse prints usage/help.
|
|
82
|
+
early_logger = configure_logger("PyamilySeq.Group_Sizes", enable_file=False, log_dir=None, verbose=False)
|
|
83
|
+
parser = LoggingArgumentParser(logger_name="PyamilySeq.Group_Sizes", description="Group-Sizes - A tool to summarise sequence statistics for a directory of FASTA files.")
|
|
75
84
|
parser.add_argument("-input_dir", required=True, help="Directory containing FASTA files.")
|
|
76
85
|
parser.add_argument("-output_csv", required=True, help="Output CSV file to save statistics.")
|
|
86
|
+
parser.add_argument("--log", action="store_true", dest="log", help="Create a timestamped logfile for this run.")
|
|
87
|
+
parser.add_argument("--log-dir", dest="log_dir", default=None, help="Directory for logfile (default: same dir as -output_csv).")
|
|
77
88
|
|
|
78
89
|
args = parser.parse_args()
|
|
79
90
|
|
|
80
|
-
|
|
81
|
-
|
|
91
|
+
out_dir = os.path.abspath(os.path.dirname(args.output_csv)) if args.output_csv else os.getcwd()
|
|
92
|
+
log_dir = args.log_dir if args.log_dir else out_dir
|
|
93
|
+
logger = configure_logger("PyamilySeq.Group_Sizes", enable_file=args.log, log_dir=log_dir, verbose=False)
|
|
94
|
+
|
|
95
|
+
logger.info("Processing FASTA files in %s", args.input_dir)
|
|
82
96
|
process_fasta_directory(args.input_dir, args.output_csv)
|
|
83
|
-
|
|
97
|
+
logger.info("Statistics saved to %s", args.output_csv)
|
|
84
98
|
|
|
85
99
|
|
|
86
100
|
if __name__ == "__main__":
|
|
@@ -1,6 +1,5 @@
|
|
|
1
|
-
|
|
2
|
-
import argparse
|
|
3
1
|
from collections import defaultdict, OrderedDict
|
|
2
|
+
import sys
|
|
4
3
|
|
|
5
4
|
|
|
6
5
|
try:
|
|
@@ -11,6 +10,7 @@ except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
|
|
|
11
10
|
from utils import *
|
|
12
11
|
|
|
13
12
|
def run_cd_hit(options, input_file, clustering_output, clustering_mode):
|
|
13
|
+
logger = logging.getLogger("PyamilySeq.Group_Splitter")
|
|
14
14
|
cdhit_command = [
|
|
15
15
|
clustering_mode,
|
|
16
16
|
'-i', input_file,
|
|
@@ -24,12 +24,17 @@ def run_cd_hit(options, input_file, clustering_output, clustering_mode):
|
|
|
24
24
|
'-sc', "1",
|
|
25
25
|
'-sf', "1"
|
|
26
26
|
]
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
27
|
+
logger.debug("Group-Splitter CD-HIT command: %s", " ".join(cdhit_command))
|
|
28
|
+
try:
|
|
29
|
+
if options.verbose:
|
|
30
|
+
subprocess.run(cdhit_command)
|
|
31
|
+
else:
|
|
32
|
+
subprocess.run(cdhit_command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
|
|
33
|
+
logger.info("CD-HIT completed for %s", input_file)
|
|
34
|
+
except Exception:
|
|
35
|
+
logger.exception("Error running CD-HIT for %s", input_file)
|
|
36
|
+
|
|
31
37
|
|
|
32
|
-
#'@profile
|
|
33
38
|
def calculate_new_rep_seq(cluster_data, length_weight=1.0, identity_weight=1.0):
|
|
34
39
|
total_length = sum(entry['length'] for entry in cluster_data)
|
|
35
40
|
avg_length = total_length / len(cluster_data)
|
|
@@ -75,7 +80,27 @@ def read_fasta_groups(options, groups_to_use):
|
|
|
75
80
|
else:
|
|
76
81
|
affix = '_dna.fasta'
|
|
77
82
|
|
|
78
|
-
|
|
83
|
+
# Ensure we look for the combined file that includes the requested group level (e.g. "99")
|
|
84
|
+
# groups_to_use[1] contains the numeric group level when using ('groups', <num>)
|
|
85
|
+
group_level = str(groups_to_use[1]) if groups_to_use and len(groups_to_use) > 1 else ''
|
|
86
|
+
combined_groups_fasta = os.path.join(options.input_directory, 'Gene_Groups_Output',
|
|
87
|
+
f"combined_group_sequences_{group_level}{affix}")
|
|
88
|
+
|
|
89
|
+
# Defensive check: combined_group_sequences_* file must exist (was created by PyamilySeq with -write_groups)
|
|
90
|
+
if not os.path.exists(combined_groups_fasta):
|
|
91
|
+
logger = logging.getLogger("PyamilySeq.Group_Splitter")
|
|
92
|
+
logger.error("Required combined group sequences file not found: %s", combined_groups_fasta)
|
|
93
|
+
logger.error("This usually means the upstream PyamilySeq run did not include the -write_groups and -write_individual_groups options.")
|
|
94
|
+
# Helpful debug info: list contents of Gene_Groups_Output if available
|
|
95
|
+
parent_dir = os.path.dirname(combined_groups_fasta)
|
|
96
|
+
if os.path.isdir(parent_dir):
|
|
97
|
+
try:
|
|
98
|
+
files = os.listdir(parent_dir)
|
|
99
|
+
logger.debug("Files in %s: %s", parent_dir, ", ".join(sorted(files)) if files else "(none)")
|
|
100
|
+
except Exception as e:
|
|
101
|
+
logger.debug("Could not list %s: %s", parent_dir, e)
|
|
102
|
+
# Stop further processing
|
|
103
|
+
sys.exit(1)
|
|
79
104
|
|
|
80
105
|
if groups_to_use[0] == 'ids':
|
|
81
106
|
selected_group_ids = [int(g.strip()) for g in groups_to_use[1].split(',')]
|
|
@@ -334,13 +359,16 @@ def separate_groups(options, clustering_mode, groups_to_use):
|
|
|
334
359
|
|
|
335
360
|
|
|
336
361
|
def main():
|
|
337
|
-
|
|
362
|
+
# Early console-only logger so parser.description is emitted via logger before argparse prints usage/help.
|
|
363
|
+
early_logger = configure_logger("PyamilySeq.Group_Splitter", enable_file=False, log_dir=None, verbose=False)
|
|
364
|
+
# Use LoggingArgumentParser so usage/errors are emitted via the configured logger
|
|
365
|
+
parser = LoggingArgumentParser(logger_name="PyamilySeq.Group_Splitter", description='Group-Splitter - A tool to split multi-copy gene groups identified by PyamilySeq.')
|
|
338
366
|
### Required Arguments
|
|
339
367
|
required = parser.add_argument_group('Required Parameters')
|
|
340
|
-
required.add_argument('-
|
|
368
|
+
required.add_argument('-input_dir', action='store', dest='input_directory',
|
|
341
369
|
help='Provide the directory of a PyamilySeq run.',
|
|
342
370
|
required=True)
|
|
343
|
-
required.add_argument('-
|
|
371
|
+
required.add_argument('-seq_type', action='store', dest='sequence_type', default='AA',choices=['AA', 'DNA'],
|
|
344
372
|
help='Default - AA: Are groups "DNA" or "AA" sequences?',
|
|
345
373
|
required=True)
|
|
346
374
|
required.add_argument('-genome_num', action='store', dest='genome_num', type=int,
|
|
@@ -350,7 +378,7 @@ def main():
|
|
|
350
378
|
|
|
351
379
|
### Regrouping Arguments
|
|
352
380
|
regrouping_params = parser.add_argument_group('Regrouping Parameters')
|
|
353
|
-
regrouping_params.add_argument('-groups', action="store", dest='groups', type=int, default=
|
|
381
|
+
regrouping_params.add_argument('-groups', action="store", dest='groups', type=int, default=99,
|
|
354
382
|
help='Default - 99: groups to be split by pangenome grouping (see -group_threshold). '
|
|
355
383
|
'Provide "-groups 99" to split specific groups.',
|
|
356
384
|
required=False)
|
|
@@ -403,10 +431,14 @@ def main():
|
|
|
403
431
|
help="Print out version number and exit")
|
|
404
432
|
|
|
405
433
|
|
|
434
|
+
# Optional file logging flags (must be added before parsing)
|
|
435
|
+
parser.add_argument("--log", action="store_true", dest="log", help="Create a timestamped logfile for this run.")
|
|
436
|
+
parser.add_argument("--log-dir", dest="log_dir", default=None, help="Directory for logfile (default: input_directory).")
|
|
406
437
|
options = parser.parse_args()
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
438
|
+
# Compute logfile directory (default to input_directory) and only enable file logging when --log is provided.
|
|
439
|
+
log_dir = options.log_dir if getattr(options, "log_dir", None) else os.path.abspath(options.input_directory)
|
|
440
|
+
logger = configure_logger("PyamilySeq.Group_Splitter", enable_file=getattr(options, "log", False), log_dir=log_dir, verbose=options.verbose)
|
|
441
|
+
logger.info("Running Group-Splitter %s", PyamilySeq_Version)
|
|
410
442
|
|
|
411
443
|
###External tool checks:
|
|
412
444
|
##MAFFT
|
|
@@ -416,11 +448,10 @@ def main():
|
|
|
416
448
|
print("mafft is installed. Proceeding with alignment.")
|
|
417
449
|
else:
|
|
418
450
|
exit("mafft is not installed. Please install mafft to proceed.")
|
|
419
|
-
##CD-HIT
|
|
420
451
|
|
|
452
|
+
##CD-HIT
|
|
421
453
|
if is_tool_installed('cd-hit'):
|
|
422
|
-
|
|
423
|
-
print("cd-hit is installed. Proceeding with clustering.")
|
|
454
|
+
logger.info("cd-hit is installed. Proceeding with clustering.")
|
|
424
455
|
if options.sequence_type == 'DNA':
|
|
425
456
|
clustering_mode = 'cd-hit-est'
|
|
426
457
|
else:
|
|
@@ -434,6 +465,7 @@ def main():
|
|
|
434
465
|
if options.verbose == True:
|
|
435
466
|
print("Running CD-HIT in slow mode.")
|
|
436
467
|
else:
|
|
468
|
+
logger.error("cd-hit is not installed. Please install cd-hit to proceed.")
|
|
437
469
|
exit("cd-hit is not installed. Please install cd-hit to proceed.")
|
|
438
470
|
|
|
439
471
|
##Alignment
|
|
@@ -451,6 +483,9 @@ def main():
|
|
|
451
483
|
if not os.path.exists(sub_groups_output):
|
|
452
484
|
os.makedirs(sub_groups_output)
|
|
453
485
|
|
|
486
|
+
logger.info("Gene groups output: %s", gene_groups_output)
|
|
487
|
+
logger.info("Sub groups output: %s", sub_groups_output)
|
|
488
|
+
|
|
454
489
|
## Get Summary Stats
|
|
455
490
|
summary_file = os.path.join(options.input_directory, 'summary_statistics.txt')
|
|
456
491
|
|
|
@@ -459,10 +494,9 @@ def main():
|
|
|
459
494
|
with open(params_out, "w") as outfile:
|
|
460
495
|
for arg, value in vars(options).items():
|
|
461
496
|
outfile.write(f"{arg}: {value}\n")
|
|
497
|
+
logger.info("Saved parameters to %s", params_out)
|
|
462
498
|
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
## Group Selction - FIX THIS - currently fails if either are not provided
|
|
499
|
+
## Group Selection - FIX THIS - currently fails if either are not provided
|
|
466
500
|
if options.groups != None and options.group_ids != None:
|
|
467
501
|
sys.exit('Must provide "-group_ids" or "-groups", not both.')
|
|
468
502
|
elif options.group_ids != None:
|
|
@@ -475,12 +509,9 @@ def main():
|
|
|
475
509
|
|
|
476
510
|
|
|
477
511
|
paralog_groups = separate_groups(options, clustering_mode, groups_to_use)
|
|
478
|
-
|
|
479
|
-
# Print metrics about paralog groups
|
|
480
|
-
print(f"Identified {len(paralog_groups)} paralog groups:")
|
|
512
|
+
logger.info("Identified %d paralog groups", len(paralog_groups))
|
|
481
513
|
for group_id, data in paralog_groups.items():
|
|
482
|
-
|
|
483
|
-
###
|
|
514
|
+
logger.debug("Group %s -> new groups: %s sizes: %s", group_id, data['count'], data['sizes'])
|
|
484
515
|
|
|
485
516
|
|
|
486
517
|
# Read summary statistics
|
|
@@ -509,8 +540,37 @@ def main():
|
|
|
509
540
|
# Recalculate each *_core_* value
|
|
510
541
|
for group_id, data in paralog_groups.items():
|
|
511
542
|
group_id = group_id.replace('>Group_', '')
|
|
512
|
-
|
|
513
|
-
|
|
543
|
+
# Find the original group filename in gene_groups_output that:
|
|
544
|
+
# - contains the requested group level (options.groups, e.g. '99')
|
|
545
|
+
# - corresponds to this subgroup id (group_id)
|
|
546
|
+
original_group = None
|
|
547
|
+
for fname in os.listdir(gene_groups_output):
|
|
548
|
+
if not fname.endswith('.fasta'):
|
|
549
|
+
continue
|
|
550
|
+
# Require the filename to include the group level token (e.g., '_99_') to avoid false matches
|
|
551
|
+
if f"_{options.groups}_" not in fname:
|
|
552
|
+
continue
|
|
553
|
+
# Accept filenames that end with _<group_id>.fasta or _<group_id>_dna.fasta/_aa.fasta
|
|
554
|
+
if fname.endswith(f"_{group_id}.fasta") or fname.endswith(f"_{group_id}_dna.fasta") or fname.endswith(f"_{group_id}_aa.fasta"):
|
|
555
|
+
original_group = fname
|
|
556
|
+
break
|
|
557
|
+
if original_group is None:
|
|
558
|
+
# fallback: attempt a looser match (preserve previous behavior)
|
|
559
|
+
for fname in os.listdir(gene_groups_output):
|
|
560
|
+
if fname.endswith(f"_{group_id}.fasta") or fname.endswith(f"_{group_id}_dna.fasta") or fname.endswith(f"_{group_id}_aa.fasta"):
|
|
561
|
+
original_group = fname
|
|
562
|
+
break
|
|
563
|
+
if original_group is None:
|
|
564
|
+
# If still not found, skip recalculation for this paralog group
|
|
565
|
+
logger.warning("Could not find original group file for subgroup id %s in %s", group_id, gene_groups_output)
|
|
566
|
+
continue
|
|
567
|
+
# Extract the core-group number from the filename (expected at index 2: First_core_99_3_dna.fasta)
|
|
568
|
+
try:
|
|
569
|
+
original_group_num = int(original_group.split('_')[2])
|
|
570
|
+
except Exception:
|
|
571
|
+
logger.warning("Unexpected filename format for %s; skipping", original_group)
|
|
572
|
+
continue
|
|
573
|
+
original_group = original_group_num
|
|
514
574
|
if original_group == 99:
|
|
515
575
|
new_core_99 -= 1
|
|
516
576
|
elif original_group == 95:
|
|
@@ -554,7 +614,7 @@ def main():
|
|
|
554
614
|
|
|
555
615
|
# Alignment
|
|
556
616
|
if options.align_core != None:
|
|
557
|
-
|
|
617
|
+
logger.info("Processing gene group alignment")
|
|
558
618
|
group_directory = options.gene_groups_output
|
|
559
619
|
sub_group_directory = options.sub_groups_output
|
|
560
620
|
genome_list = read_genomes_from_fasta(options.gene_groups_output + '/combined_group_sequences_dna.fasta')
|