PyamilySeq 1.1.0__tar.gz → 1.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pyamilyseq-1.1.0 → pyamilyseq-1.1.1}/PKG-INFO +6 -6
- {pyamilyseq-1.1.0 → pyamilyseq-1.1.1}/README.md +5 -5
- {pyamilyseq-1.1.0 → pyamilyseq-1.1.1}/setup.cfg +1 -1
- pyamilyseq-1.1.1/src/PyamilySeq/Cluster_Compare.py +108 -0
- {pyamilyseq-1.1.0 → pyamilyseq-1.1.1}/src/PyamilySeq/Cluster_Summary.py +59 -64
- {pyamilyseq-1.1.0 → pyamilyseq-1.1.1}/src/PyamilySeq/PyamilySeq.py +13 -10
- {pyamilyseq-1.1.0 → pyamilyseq-1.1.1}/src/PyamilySeq/PyamilySeq_Species.py +8 -6
- {pyamilyseq-1.1.0 → pyamilyseq-1.1.1}/src/PyamilySeq/Seq_Combiner.py +6 -6
- {pyamilyseq-1.1.0 → pyamilyseq-1.1.1}/src/PyamilySeq/clusterings.py +0 -2
- pyamilyseq-1.1.1/src/PyamilySeq/constants.py +2 -0
- {pyamilyseq-1.1.0 → pyamilyseq-1.1.1}/src/PyamilySeq/utils.py +96 -45
- {pyamilyseq-1.1.0 → pyamilyseq-1.1.1}/src/PyamilySeq.egg-info/PKG-INFO +6 -6
- {pyamilyseq-1.1.0 → pyamilyseq-1.1.1}/src/PyamilySeq.egg-info/SOURCES.txt +1 -0
- pyamilyseq-1.1.0/src/PyamilySeq/constants.py +0 -2
- {pyamilyseq-1.1.0 → pyamilyseq-1.1.1}/LICENSE +0 -0
- {pyamilyseq-1.1.0 → pyamilyseq-1.1.1}/pyproject.toml +0 -0
- {pyamilyseq-1.1.0 → pyamilyseq-1.1.1}/src/PyamilySeq/Group_Extractor.py +0 -0
- {pyamilyseq-1.1.0 → pyamilyseq-1.1.1}/src/PyamilySeq/Group_Sizes.py +0 -0
- {pyamilyseq-1.1.0 → pyamilyseq-1.1.1}/src/PyamilySeq/Group_Splitter.py +0 -0
- {pyamilyseq-1.1.0 → pyamilyseq-1.1.1}/src/PyamilySeq/PyamilySeq_Genus.py +0 -0
- {pyamilyseq-1.1.0 → pyamilyseq-1.1.1}/src/PyamilySeq/Seq_Extractor.py +0 -0
- {pyamilyseq-1.1.0 → pyamilyseq-1.1.1}/src/PyamilySeq/Seq_Finder.py +0 -0
- {pyamilyseq-1.1.0 → pyamilyseq-1.1.1}/src/PyamilySeq/__init__.py +0 -0
- {pyamilyseq-1.1.0 → pyamilyseq-1.1.1}/src/PyamilySeq.egg-info/dependency_links.txt +0 -0
- {pyamilyseq-1.1.0 → pyamilyseq-1.1.1}/src/PyamilySeq.egg-info/entry_points.txt +0 -0
- {pyamilyseq-1.1.0 → pyamilyseq-1.1.1}/src/PyamilySeq.egg-info/requires.txt +0 -0
- {pyamilyseq-1.1.0 → pyamilyseq-1.1.1}/src/PyamilySeq.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.2
|
|
2
2
|
Name: PyamilySeq
|
|
3
|
-
Version: 1.1.0
|
|
3
|
+
Version: 1.1.1
|
|
4
4
|
Summary: PyamilySeq - A a tool to investigate sequence-based gene groups identified by clustering methods such as CD-HIT, DIAMOND, BLAST or MMseqs2.
|
|
5
5
|
Home-page: https://github.com/NickJD/PyamilySeq
|
|
6
6
|
Author: Nicholas Dimonaco
|
|
@@ -45,7 +45,7 @@ To update to the newest version add '-U' to end of the pip install command.
|
|
|
45
45
|
```commandline
|
|
46
46
|
usage: PyamilySeq.py [-h] {Full,Partial} ...
|
|
47
47
|
|
|
48
|
-
PyamilySeq v1.1.0: A tool for gene clustering and analysis.
|
|
48
|
+
PyamilySeq v1.1.1: A tool for gene clustering and analysis.
|
|
49
49
|
|
|
50
50
|
positional arguments:
|
|
51
51
|
{Full,Partial} Choose a mode: 'Full' or 'Partial'.
|
|
@@ -75,7 +75,7 @@ Escherichia_coli_110957|ENSB_TIZS9kbTvShDvyX Escherichia_coli_110957|ENSB_TIZS9k
|
|
|
75
75
|
```
|
|
76
76
|
### Example output:
|
|
77
77
|
```
|
|
78
|
-
Running PyamilySeq v1.1.0
|
|
78
|
+
Running PyamilySeq v1.1.1
|
|
79
79
|
Calculating Groups
|
|
80
80
|
Number of Genomes: 10
|
|
81
81
|
Gene Groups
|
|
@@ -220,7 +220,7 @@ Seq-Combiner -input_dir .../test_data/genomes -name_split_gff .gff3 -output_dir
|
|
|
220
220
|
```
|
|
221
221
|
usage: Seq_Combiner.py [-h] -input_dir INPUT_DIR -input_type {separate,combined,fasta} [-name_split_gff NAME_SPLIT_GFF] [-name_split_fasta NAME_SPLIT_FASTA] -output_dir OUTPUT_DIR -output_name OUTPUT_FILE [-gene_ident GENE_IDENT] [-translate] [-v]
|
|
222
222
|
|
|
223
|
-
PyamilySeq v1.1.0: Seq-Combiner - A tool to extract sequences from GFF/FASTA files and prepare them for PyamilySeq.
|
|
223
|
+
PyamilySeq v1.1.1: Seq-Combiner - A tool to extract sequences from GFF/FASTA files and prepare them for PyamilySeq.
|
|
224
224
|
|
|
225
225
|
options:
|
|
226
226
|
-h, --help show this help message and exit
|
|
@@ -263,7 +263,7 @@ usage: Group_Splitter.py [-h] -input_fasta INPUT_FASTA -sequence_type {AA,DNA}
|
|
|
263
263
|
[-M CLUSTERING_MEMORY] [-no_delete_temp_files]
|
|
264
264
|
[-verbose] [-v]
|
|
265
265
|
|
|
266
|
-
PyamilySeq v1.1.0: Group-Splitter - A tool to split multi-copy gene groups
|
|
266
|
+
PyamilySeq v1.1.1: Group-Splitter - A tool to split multi-copy gene groups
|
|
267
267
|
identified by PyamilySeq.
|
|
268
268
|
|
|
269
269
|
options:
|
|
@@ -316,7 +316,7 @@ Cluster-Summary -genome_num 10 -input_clstr .../test_data/species/E-coli/E-coli_
|
|
|
316
316
|
usage: Cluster_Summary.py [-h] -input_clstr INPUT_CLSTR -output OUTPUT -genome_num GENOME_NUM
|
|
317
317
|
[-output_dir OUTPUT_DIR] [-verbose] [-v]
|
|
318
318
|
|
|
319
|
-
PyamilySeq v1.1.0: Cluster-Summary - A tool to summarise CD-HIT clustering files.
|
|
319
|
+
PyamilySeq v1.1.1: Cluster-Summary - A tool to summarise CD-HIT clustering files.
|
|
320
320
|
|
|
321
321
|
options:
|
|
322
322
|
-h, --help show this help message and exit
|
|
@@ -29,7 +29,7 @@ To update to the newest version add '-U' to end of the pip install command.
|
|
|
29
29
|
```commandline
|
|
30
30
|
usage: PyamilySeq.py [-h] {Full,Partial} ...
|
|
31
31
|
|
|
32
|
-
PyamilySeq v1.1.0: A tool for gene clustering and analysis.
|
|
32
|
+
PyamilySeq v1.1.1: A tool for gene clustering and analysis.
|
|
33
33
|
|
|
34
34
|
positional arguments:
|
|
35
35
|
{Full,Partial} Choose a mode: 'Full' or 'Partial'.
|
|
@@ -59,7 +59,7 @@ Escherichia_coli_110957|ENSB_TIZS9kbTvShDvyX Escherichia_coli_110957|ENSB_TIZS9k
|
|
|
59
59
|
```
|
|
60
60
|
### Example output:
|
|
61
61
|
```
|
|
62
|
-
Running PyamilySeq v1.1.0
|
|
62
|
+
Running PyamilySeq v1.1.1
|
|
63
63
|
Calculating Groups
|
|
64
64
|
Number of Genomes: 10
|
|
65
65
|
Gene Groups
|
|
@@ -204,7 +204,7 @@ Seq-Combiner -input_dir .../test_data/genomes -name_split_gff .gff3 -output_dir
|
|
|
204
204
|
```
|
|
205
205
|
usage: Seq_Combiner.py [-h] -input_dir INPUT_DIR -input_type {separate,combined,fasta} [-name_split_gff NAME_SPLIT_GFF] [-name_split_fasta NAME_SPLIT_FASTA] -output_dir OUTPUT_DIR -output_name OUTPUT_FILE [-gene_ident GENE_IDENT] [-translate] [-v]
|
|
206
206
|
|
|
207
|
-
PyamilySeq v1.1.0: Seq-Combiner - A tool to extract sequences from GFF/FASTA files and prepare them for PyamilySeq.
|
|
207
|
+
PyamilySeq v1.1.1: Seq-Combiner - A tool to extract sequences from GFF/FASTA files and prepare them for PyamilySeq.
|
|
208
208
|
|
|
209
209
|
options:
|
|
210
210
|
-h, --help show this help message and exit
|
|
@@ -247,7 +247,7 @@ usage: Group_Splitter.py [-h] -input_fasta INPUT_FASTA -sequence_type {AA,DNA}
|
|
|
247
247
|
[-M CLUSTERING_MEMORY] [-no_delete_temp_files]
|
|
248
248
|
[-verbose] [-v]
|
|
249
249
|
|
|
250
|
-
PyamilySeq v1.1.0: Group-Splitter - A tool to split multi-copy gene groups
|
|
250
|
+
PyamilySeq v1.1.1: Group-Splitter - A tool to split multi-copy gene groups
|
|
251
251
|
identified by PyamilySeq.
|
|
252
252
|
|
|
253
253
|
options:
|
|
@@ -300,7 +300,7 @@ Cluster-Summary -genome_num 10 -input_clstr .../test_data/species/E-coli/E-coli_
|
|
|
300
300
|
usage: Cluster_Summary.py [-h] -input_clstr INPUT_CLSTR -output OUTPUT -genome_num GENOME_NUM
|
|
301
301
|
[-output_dir OUTPUT_DIR] [-verbose] [-v]
|
|
302
302
|
|
|
303
|
-
PyamilySeq v1.1.0: Cluster-Summary - A tool to summarise CD-HIT clustering files.
|
|
303
|
+
PyamilySeq v1.1.1: Cluster-Summary - A tool to summarise CD-HIT clustering files.
|
|
304
304
|
|
|
305
305
|
options:
|
|
306
306
|
-h, --help show this help message and exit
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
from collections import defaultdict
|
|
3
|
+
|
|
4
|
+
def read_cd_hit_output(clstr_file):
|
|
5
|
+
"""
|
|
6
|
+
Reads a CD-HIT .clstr file and extracts sequence clusters.
|
|
7
|
+
Returns a dictionary where keys are sequence headers and values are cluster IDs.
|
|
8
|
+
"""
|
|
9
|
+
seq_to_cluster = {} # Maps sequence header -> cluster ID
|
|
10
|
+
cluster_id = 0 # Generic ID for clusters (since CD-HIT names don't matter)
|
|
11
|
+
|
|
12
|
+
with open(clstr_file, 'r') as f:
|
|
13
|
+
for line in f:
|
|
14
|
+
line = line.strip()
|
|
15
|
+
if line.startswith(">Cluster"):
|
|
16
|
+
cluster_id += 1 # Increment cluster ID
|
|
17
|
+
elif line:
|
|
18
|
+
parts = line.split('\t')
|
|
19
|
+
if len(parts) > 1:
|
|
20
|
+
seq_header = parts[1].split('>')[1].split('...')[0] # Extract sequence header
|
|
21
|
+
seq_to_cluster[seq_header] = cluster_id
|
|
22
|
+
|
|
23
|
+
return seq_to_cluster
|
|
24
|
+
|
|
25
|
+
def compare_cd_hit_clusters(file1, file2, output_file):
|
|
26
|
+
"""
|
|
27
|
+
Compares two CD-HIT .clstr files to check if clusters are the same.
|
|
28
|
+
Writes the results to a TSV file.
|
|
29
|
+
"""
|
|
30
|
+
# Read both clustering files
|
|
31
|
+
clusters1 = read_cd_hit_output(file1)
|
|
32
|
+
clusters2 = read_cd_hit_output(file2)
|
|
33
|
+
|
|
34
|
+
# Reverse mappings: cluster ID -> list of sequences
|
|
35
|
+
grouped_clusters1 = defaultdict(set)
|
|
36
|
+
grouped_clusters2 = defaultdict(set)
|
|
37
|
+
|
|
38
|
+
for seq, cluster_id in clusters1.items():
|
|
39
|
+
grouped_clusters1[cluster_id].add(seq)
|
|
40
|
+
for seq, cluster_id in clusters2.items():
|
|
41
|
+
grouped_clusters2[cluster_id].add(seq)
|
|
42
|
+
|
|
43
|
+
# Initialize metrics counters
|
|
44
|
+
cluster_name_changes = 0
|
|
45
|
+
sequence_shifts = 0
|
|
46
|
+
only_in_file1 = defaultdict(list)
|
|
47
|
+
only_in_file2 = defaultdict(list)
|
|
48
|
+
cluster_mismatches = defaultdict(list)
|
|
49
|
+
|
|
50
|
+
# Prepare data for the TSV output
|
|
51
|
+
tsv_data = []
|
|
52
|
+
|
|
53
|
+
# Track changes
|
|
54
|
+
for seq, cluster_id in clusters1.items():
|
|
55
|
+
if seq not in clusters2:
|
|
56
|
+
only_in_file1[cluster_id].append(seq)
|
|
57
|
+
tsv_data.append([seq, cluster_id, "NA", "Only in file1"])
|
|
58
|
+
elif clusters2[seq] != cluster_id:
|
|
59
|
+
# Sequence shifts: sequence in different clusters between files
|
|
60
|
+
sequence_shifts += 1
|
|
61
|
+
cluster_mismatches[seq].append((cluster_id, clusters2[seq]))
|
|
62
|
+
tsv_data.append([seq, cluster_id, clusters2[seq], "Mismatch"])
|
|
63
|
+
|
|
64
|
+
for seq, cluster_id in clusters2.items():
|
|
65
|
+
if seq not in clusters1:
|
|
66
|
+
only_in_file2[cluster_id].append(seq)
|
|
67
|
+
tsv_data.append([seq, "NA", cluster_id, "Only in file2"])
|
|
68
|
+
elif clusters1[seq] != cluster_id:
|
|
69
|
+
# Sequence shifts: sequence in different clusters between files
|
|
70
|
+
sequence_shifts += 1
|
|
71
|
+
cluster_mismatches[seq].append((clusters1[seq], cluster_id))
|
|
72
|
+
tsv_data.append([seq, clusters1[seq], cluster_id, "Mismatch"])
|
|
73
|
+
|
|
74
|
+
# Track cluster name changes (same sequences in different clusters)
|
|
75
|
+
for cluster_id1, seqs1 in grouped_clusters1.items():
|
|
76
|
+
for cluster_id2, seqs2 in grouped_clusters2.items():
|
|
77
|
+
if seqs1 == seqs2 and cluster_id1 != cluster_id2:
|
|
78
|
+
cluster_name_changes += 1
|
|
79
|
+
for seq in seqs1:
|
|
80
|
+
tsv_data.append([seq, cluster_id1, cluster_id2, "Cluster name change"])
|
|
81
|
+
|
|
82
|
+
# Print metrics
|
|
83
|
+
print("🔢 Clustering Comparison Metrics:")
|
|
84
|
+
print(f"Cluster name changes: {cluster_name_changes}")
|
|
85
|
+
print(f"Sequence shifts (sequences assigned to different clusters): {sequence_shifts}")
|
|
86
|
+
print(f"Sequences only in the first file: {len(only_in_file1)}")
|
|
87
|
+
print(f"Sequences only in the second file: {len(only_in_file2)}")
|
|
88
|
+
print()
|
|
89
|
+
|
|
90
|
+
# Write the results to a TSV file
|
|
91
|
+
with open(output_file, 'w') as out_file:
|
|
92
|
+
out_file.write("Sequence\tCluster ID (File 1)\tCluster ID (File 2)\tChange Type\n")
|
|
93
|
+
for row in tsv_data:
|
|
94
|
+
out_file.write("\t".join(map(str, row)) + "\n")
|
|
95
|
+
|
|
96
|
+
print(f"✅ Results have been written to {output_file}")
|
|
97
|
+
|
|
98
|
+
def main():
|
|
99
|
+
parser = argparse.ArgumentParser(description="Compare two CD-HIT .clstr files to check for clustering consistency.")
|
|
100
|
+
parser.add_argument("-file1", required=True, help="First CD-HIT .clstr file")
|
|
101
|
+
parser.add_argument("-file2", required=True, help="Second CD-HIT .clstr file")
|
|
102
|
+
parser.add_argument("-output", required=True, help="Output file (TSV format)")
|
|
103
|
+
args = parser.parse_args()
|
|
104
|
+
|
|
105
|
+
compare_cd_hit_clusters(args.file1, args.file2, args.output)
|
|
106
|
+
|
|
107
|
+
if __name__ == "__main__":
|
|
108
|
+
main()
|
|
@@ -1,33 +1,32 @@
|
|
|
1
1
|
import argparse
|
|
2
|
-
from collections import OrderedDict
|
|
3
|
-
from collections import defaultdict
|
|
2
|
+
from collections import OrderedDict, defaultdict
|
|
4
3
|
|
|
5
4
|
try:
|
|
6
5
|
from .constants import *
|
|
7
6
|
from .utils import *
|
|
8
|
-
except (ModuleNotFoundError, ImportError, NameError, TypeError)
|
|
7
|
+
except (ModuleNotFoundError, ImportError, NameError, TypeError):
|
|
9
8
|
from constants import *
|
|
10
9
|
from utils import *
|
|
11
10
|
|
|
12
11
|
|
|
13
12
|
def categorise_percentage(percent):
|
|
14
13
|
"""Categorise the percentage of genomes with multicopy genes."""
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
return "99-100%"
|
|
14
|
+
categories = {
|
|
15
|
+
(20, 40): "20-40%",
|
|
16
|
+
(40, 60): "40-60%",
|
|
17
|
+
(60, 80): "60-80%",
|
|
18
|
+
(80, 95): "80-95%",
|
|
19
|
+
(95, 99): "95-99%",
|
|
20
|
+
(99, 100): "99-100%"
|
|
21
|
+
}
|
|
22
|
+
for (low, high), label in categories.items():
|
|
23
|
+
if low <= percent < high:
|
|
24
|
+
return label
|
|
27
25
|
return None
|
|
28
26
|
|
|
29
|
-
|
|
27
|
+
|
|
30
28
|
def read_cd_hit_output(clustering_output):
|
|
29
|
+
"""Parse CD-HIT .cluster file and extract clustering information."""
|
|
31
30
|
clusters = OrderedDict()
|
|
32
31
|
|
|
33
32
|
with open(clustering_output, 'r') as f:
|
|
@@ -42,10 +41,8 @@ def read_cd_hit_output(clustering_output):
|
|
|
42
41
|
parts = line.split('\t')
|
|
43
42
|
if len(parts) > 1:
|
|
44
43
|
clustered_info = parts[1]
|
|
45
|
-
length = clustered_info.split(',')[0]
|
|
46
|
-
|
|
47
|
-
clustered_header = clustered_info.split('>')[1].split('...')[0]
|
|
48
|
-
clustered_header = '>' + clustered_header
|
|
44
|
+
length = int(''.join(c for c in clustered_info.split(',')[0] if c.isdigit()))
|
|
45
|
+
clustered_header = '>' + clustered_info.split('>')[1].split('...')[0]
|
|
49
46
|
|
|
50
47
|
if 'at ' in clustered_info and '%' in clustered_info.split('at ')[-1]:
|
|
51
48
|
percent_identity = extract_identity(clustered_info)
|
|
@@ -63,12 +60,14 @@ def read_cd_hit_output(clustering_output):
|
|
|
63
60
|
return clusters
|
|
64
61
|
|
|
65
62
|
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
multicopy_groups = defaultdict(int) # Counter for
|
|
63
|
+
def summarise_clusters(options, clusters, output):
|
|
64
|
+
"""Generate a detailed cluster summary report."""
|
|
65
|
+
multicopy_groups = defaultdict(int) # Counter for clusters with multicopy genes
|
|
69
66
|
|
|
70
67
|
with open(output, 'w') as out_f:
|
|
71
|
-
out_f.write(
|
|
68
|
+
out_f.write(
|
|
69
|
+
"Cluster_ID\tNum_Sequences\tNum_Genomes\tAvg_Length\tLength_Range\tAvg_Identity\tIdentity_Range\tGenomes_With_Multiple_Genes\tMulticopy_Percentage\n"
|
|
70
|
+
)
|
|
72
71
|
|
|
73
72
|
for cluster_id, seqs in clusters.items():
|
|
74
73
|
num_seqs = len(seqs)
|
|
@@ -81,82 +80,78 @@ def summarise_clusters(options,clusters, output):
|
|
|
81
80
|
avg_identity = sum(identities) / num_seqs if num_seqs > 0 else 0
|
|
82
81
|
identity_range = f"{min(identities):.2f}-{max(identities):.2f}" if num_seqs > 0 else "N/A"
|
|
83
82
|
|
|
84
|
-
|
|
85
|
-
f"{cluster_id}\t{num_seqs}\t{avg_length:.2f}\t{length_range}\t{avg_identity:.2f}\t{identity_range}\n")
|
|
86
|
-
|
|
87
|
-
# Count genomes with more than one gene
|
|
83
|
+
# Count genomes in cluster
|
|
88
84
|
genome_to_gene_count = defaultdict(int)
|
|
89
85
|
for seq in seqs:
|
|
90
|
-
genome = seq['header'].split('|')[0].replace('>','')
|
|
86
|
+
genome = seq['header'].split('|')[0].replace('>', '')
|
|
91
87
|
genome_to_gene_count[genome] += 1
|
|
92
88
|
|
|
89
|
+
num_genomes = len(genome_to_gene_count)
|
|
93
90
|
num_genomes_with_multiple_genes = sum(1 for count in genome_to_gene_count.values() if count > 1)
|
|
91
|
+
multicopy_percentage = (num_genomes_with_multiple_genes / options.genome_num) * 100 if options.genome_num > 0 else 0
|
|
94
92
|
|
|
95
|
-
#
|
|
96
|
-
|
|
97
|
-
multicopy_percentage = (num_genomes_with_multiple_genes / options.genome_num) * 100
|
|
93
|
+
# Categorize multicopy percentage
|
|
98
94
|
category = categorise_percentage(multicopy_percentage)
|
|
99
95
|
if category:
|
|
100
96
|
multicopy_groups[category] += 1
|
|
101
97
|
|
|
102
|
-
|
|
103
|
-
|
|
98
|
+
# Write detailed output for each cluster
|
|
99
|
+
out_f.write(
|
|
100
|
+
f"{cluster_id}\t{num_seqs}\t{num_genomes}\t{avg_length:.2f}\t{length_range}\t{avg_identity:.2f}\t{identity_range}\t"
|
|
101
|
+
f"{num_genomes_with_multiple_genes}\t{multicopy_percentage:.2f}\n"
|
|
102
|
+
)
|
|
104
103
|
|
|
105
|
-
#
|
|
104
|
+
# Define order for multicopy statistics output
|
|
105
|
+
category_order = ["20-40%", "40-60%", "60-80%", "80-95%", "95-99%", "99-100%"]
|
|
106
106
|
for category in category_order:
|
|
107
|
-
print(f"
|
|
107
|
+
print(f"Clusters with multicopy genes in {category} range: {multicopy_groups[category]}")
|
|
108
108
|
|
|
109
109
|
|
|
110
|
-
# Main function to parse arguments and run the analysis
|
|
111
110
|
def main():
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
required.add_argument('-input_clstr', action="store", dest="input_clstr",
|
|
116
|
-
help='Input CD-HIT .clstr file',
|
|
117
|
-
required=True)
|
|
118
|
-
required.add_argument('-output', action="store", dest="output",
|
|
119
|
-
help="Output TSV file to store cluster summaries - Will add '.tsv' if not provided by user",
|
|
120
|
-
required=True)
|
|
121
|
-
required.add_argument('-genome_num', action='store', dest='genome_num', type=int,
|
|
122
|
-
help='The total number of genomes must be provide',
|
|
123
|
-
required=True)
|
|
124
|
-
#required.add_argument("-clustering_format", action="store", dest="clustering_format", choices=['CD-HIT','TSV','CSV'],
|
|
125
|
-
# help="Clustering format to use: CD-HIT or TSV (MMseqs2, BLAST, DIAMOND) / CSV edge-list file (Node1\tNode2).",
|
|
126
|
-
# required=True)
|
|
111
|
+
"""Main function to parse arguments and process clustering files."""
|
|
112
|
+
parser = argparse.ArgumentParser(
|
|
113
|
+
description='PyamilySeq ' + PyamilySeq_Version + ': Cluster-Summary - A tool to summarise CD-HIT clustering files.')
|
|
127
114
|
|
|
115
|
+
# Required Arguments
|
|
116
|
+
required = parser.add_argument_group('Required Parameters')
|
|
117
|
+
required.add_argument('-input_cluster', action="store", dest="input_cluster", required=True,
|
|
118
|
+
help='Input CD-HIT .cluster file')
|
|
119
|
+
required.add_argument('-output', action="store", dest="output", required=True,
|
|
120
|
+
help="Output TSV file to store cluster summaries - Will add '.tsv' if not provided by user")
|
|
121
|
+
required.add_argument('-genome_num', action='store', dest='genome_num', type=int, required=True,
|
|
122
|
+
help='Total number of genomes in dataset')
|
|
123
|
+
|
|
124
|
+
# Optional Arguments
|
|
128
125
|
optional = parser.add_argument_group('Optional Arguments')
|
|
129
126
|
optional.add_argument('-output_dir', action="store", dest="output_dir",
|
|
130
|
-
help='Default: Same as input file',
|
|
131
|
-
required=False)
|
|
127
|
+
help='Default: Same as input file', required=False)
|
|
132
128
|
|
|
133
129
|
misc = parser.add_argument_group("Misc Parameters")
|
|
134
130
|
misc.add_argument("-verbose", action="store_true", dest="verbose",
|
|
135
|
-
help="Print verbose output.",
|
|
136
|
-
required=False)
|
|
131
|
+
help="Print verbose output.", required=False)
|
|
137
132
|
misc.add_argument("-v", "--version", action="version",
|
|
138
133
|
version=f"PyamilySeq: Group-Summary version {PyamilySeq_Version} - Exiting",
|
|
139
134
|
help="Print out version number and exit")
|
|
140
135
|
|
|
141
|
-
|
|
142
136
|
options = parser.parse_args()
|
|
143
|
-
print("Running PyamilySeq " + PyamilySeq_Version+ ": Group-Summary ")
|
|
137
|
+
print("Running PyamilySeq " + PyamilySeq_Version + ": Group-Summary ")
|
|
144
138
|
|
|
145
|
-
|
|
146
|
-
options.
|
|
139
|
+
# File handling
|
|
140
|
+
options.input_cluster = fix_path(options.input_cluster)
|
|
147
141
|
if options.output_dir is None:
|
|
148
|
-
options.output_dir = os.path.dirname(os.path.abspath(options.
|
|
142
|
+
options.output_dir = os.path.dirname(os.path.abspath(options.input_cluster))
|
|
149
143
|
output_path = os.path.abspath(options.output_dir)
|
|
150
144
|
if not os.path.exists(output_path):
|
|
151
145
|
os.makedirs(output_path)
|
|
146
|
+
|
|
152
147
|
output_name = options.output
|
|
153
148
|
if not output_name.endswith('.tsv'):
|
|
154
149
|
output_name += '.tsv'
|
|
155
150
|
output_file_path = os.path.join(output_path, output_name)
|
|
156
|
-
###
|
|
157
151
|
|
|
158
|
-
clusters
|
|
159
|
-
|
|
152
|
+
# Process clusters and generate summary
|
|
153
|
+
clusters = read_cd_hit_output(options.input_cluster)
|
|
154
|
+
summarise_clusters(options, clusters, output_file_path)
|
|
160
155
|
|
|
161
156
|
|
|
162
157
|
if __name__ == "__main__":
|
|
@@ -20,8 +20,8 @@ def run_cd_hit(options, input_file, clustering_output, clustering_mode):
|
|
|
20
20
|
clustering_mode,
|
|
21
21
|
'-i', input_file,
|
|
22
22
|
'-o', clustering_output,
|
|
23
|
-
'-c',
|
|
24
|
-
'-s',
|
|
23
|
+
'-c', f"{float(options.pident):.2f}",
|
|
24
|
+
'-s', f"{float(options.len_diff):.2f}",
|
|
25
25
|
'-T', str(options.threads),
|
|
26
26
|
'-M', str(options.mem),
|
|
27
27
|
'-d', "0",
|
|
@@ -62,10 +62,11 @@ def main():
|
|
|
62
62
|
help="Clustering mode: 'DNA' or 'AA'.")
|
|
63
63
|
full_parser.add_argument("-gene_ident", default="CDS", required=False,
|
|
64
64
|
help="Gene identifiers to extract sequences (e.g., 'CDS, tRNA').")
|
|
65
|
-
full_parser.add_argument("-c", type=
|
|
65
|
+
full_parser.add_argument("-c", type=str, dest="pident", default="0.90", required=False,
|
|
66
66
|
help="Sequence identity threshold for clustering (default: 0.90) - CD-HIT parameter '-c'.")
|
|
67
|
-
full_parser.add_argument("-s", type=
|
|
67
|
+
full_parser.add_argument("-s", type=str, dest="len_diff", default="0.80", required=False,
|
|
68
68
|
help="Length difference threshold for clustering (default: 0.80) - CD-HIT parameter '-s'.")
|
|
69
|
+
|
|
69
70
|
full_parser.add_argument("-fast_mode", action="store_true", required=False,
|
|
70
71
|
help="Enable fast mode for CD-HIT (not recommended) - CD-HIT parameter '-g'.")
|
|
71
72
|
|
|
@@ -98,7 +99,7 @@ def main():
|
|
|
98
99
|
subparser.add_argument("-write_individual_groups", action="store_true", dest="write_individual_groups", required=False,
|
|
99
100
|
help="Output individual FASTA files for each group.")
|
|
100
101
|
subparser.add_argument("-align", action="store_true", dest="align_core", required=False,
|
|
101
|
-
help="Align and concatenate sequences for 'core' groups
|
|
102
|
+
help="Align and concatenate sequences for 'core' groups (those in 99-100% of genomes).")
|
|
102
103
|
subparser.add_argument("-align_aa", action="store_true", required=False,
|
|
103
104
|
help="Align sequences as amino acids.")
|
|
104
105
|
subparser.add_argument("-no_gpa", action="store_false", dest="gene_presence_absence_out", required=False,
|
|
@@ -187,13 +188,13 @@ def main():
|
|
|
187
188
|
elif options.sequence_type == 'AA':
|
|
188
189
|
clustering_mode = 'cd-hit'
|
|
189
190
|
if options.fast_mode == True:
|
|
190
|
-
options.fast_mode =
|
|
191
|
+
options.fast_mode = 1
|
|
191
192
|
if options.verbose == True:
|
|
192
193
|
print("Running CD-HIT in fast mode.")
|
|
193
194
|
else:
|
|
194
|
-
options.fast_mode =
|
|
195
|
+
options.fast_mode = 0
|
|
195
196
|
if options.verbose == True:
|
|
196
|
-
print("Running CD-HIT in
|
|
197
|
+
print("Running CD-HIT in accurate mode.")
|
|
197
198
|
else:
|
|
198
199
|
exit("cd-hit is not installed. Please install cd-hit to proceed.")
|
|
199
200
|
|
|
@@ -239,10 +240,10 @@ def main():
|
|
|
239
240
|
translate = False
|
|
240
241
|
file_to_cluster = combined_out_file
|
|
241
242
|
if options.input_type == 'separate':
|
|
242
|
-
read_separate_files(options.input_dir, options.name_split_gff, options.name_split_fasta, options.gene_ident, combined_out_file, translate)
|
|
243
|
+
read_separate_files(options.input_dir, options.name_split_gff, options.name_split_fasta, options.gene_ident, combined_out_file, translate, False)
|
|
243
244
|
run_cd_hit(options, file_to_cluster, clustering_output, clustering_mode)
|
|
244
245
|
elif options.input_type == 'combined':
|
|
245
|
-
read_combined_files(options.input_dir, options.name_split_gff, options.gene_ident, combined_out_file, translate)
|
|
246
|
+
read_combined_files(options.input_dir, options.name_split_gff, options.gene_ident, combined_out_file, translate, False)
|
|
246
247
|
run_cd_hit(options, file_to_cluster, clustering_output, clustering_mode)
|
|
247
248
|
elif options.input_type == 'fasta':
|
|
248
249
|
combined_out_file = options.input_fasta
|
|
@@ -281,6 +282,8 @@ def main():
|
|
|
281
282
|
clustering_options = clustering_options()
|
|
282
283
|
|
|
283
284
|
elif options.run_mode == 'Partial':
|
|
285
|
+
if not os.path.exists(output_path):
|
|
286
|
+
os.makedirs(output_path)
|
|
284
287
|
class clustering_options:
|
|
285
288
|
def __init__(self):
|
|
286
289
|
self.run_mode = options.run_mode
|
|
@@ -120,12 +120,14 @@ def calc_Second_only_core(cluster, Second_num, groups, cores):
|
|
|
120
120
|
|
|
121
121
|
#@profile
|
|
122
122
|
def calc_only_Second_only_core(cluster, Second_num, groups, cores): # only count the true storf onlies
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
123
|
+
try:
|
|
124
|
+
groups_as_list = list(groups.values())
|
|
125
|
+
for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= Second_num <= fir):
|
|
126
|
+
res = idx
|
|
127
|
+
family_group = list(groups)[res]
|
|
128
|
+
cores['only_Second_core_' + family_group].append(cluster)
|
|
129
|
+
except UnboundLocalError:
|
|
130
|
+
sys.exit("Error in calc_only_Second_only_core")
|
|
129
131
|
|
|
130
132
|
|
|
131
133
|
#@profile
|
|
@@ -65,17 +65,17 @@ def main():
|
|
|
65
65
|
if not os.path.exists(output_path):
|
|
66
66
|
os.makedirs(output_path)
|
|
67
67
|
|
|
68
|
-
output_file = options.output_file + '.fasta'
|
|
69
|
-
if os.path.exists(os.path.join(output_path, output_file)):
|
|
70
|
-
print(f"Output file {output_file} already exists in the output directory. Please delete or rename the file and try again.")
|
|
68
|
+
#output_file = options.output_file + '.fasta'
|
|
69
|
+
if os.path.exists(os.path.join(output_path, options.output_file)):
|
|
70
|
+
print(f"Output file {options.output_file} already exists in the output directory. Please delete or rename the file and try again.")
|
|
71
71
|
exit(1)
|
|
72
72
|
|
|
73
|
-
combined_out_file = os.path.join(output_path, output_file )
|
|
73
|
+
combined_out_file = os.path.join(output_path, options.output_file )
|
|
74
74
|
|
|
75
75
|
if options.input_type == 'separate':
|
|
76
|
-
read_separate_files(options.input_dir, options.name_split_gff, options.name_split_fasta, options.gene_ident, combined_out_file, options.translate)
|
|
76
|
+
read_separate_files(options.input_dir, options.name_split_gff, options.name_split_fasta, options.gene_ident, combined_out_file, options.translate, True)
|
|
77
77
|
elif options.input_type == 'combined':
|
|
78
|
-
read_combined_files(options.input_dir, options.name_split_gff, options.gene_ident, combined_out_file, options.translate)
|
|
78
|
+
read_combined_files(options.input_dir, options.name_split_gff, options.gene_ident, combined_out_file, options.translate, True)
|
|
79
79
|
elif options.input_type == 'fasta':
|
|
80
80
|
read_fasta_files(options.input_dir, options.name_split_fasta, combined_out_file, options.translate)
|
|
81
81
|
|
|
@@ -279,8 +279,6 @@ def combined_clustering_CDHIT(options, taxa_dict, splitter):
|
|
|
279
279
|
first = True
|
|
280
280
|
for line in Second_in:
|
|
281
281
|
if line.startswith('>'):
|
|
282
|
-
if '>Cluster 1997' in line:
|
|
283
|
-
print()
|
|
284
282
|
if first == False:
|
|
285
283
|
cluster_size = len(Combined_clusters[cluster_id])
|
|
286
284
|
Combined_reps.update({rep: cluster_size})
|
|
@@ -228,9 +228,15 @@ def run_mafft_on_sequences(options, sequences, output_file):
|
|
|
228
228
|
|
|
229
229
|
|
|
230
230
|
|
|
231
|
-
def read_separate_files(input_dir, name_split_gff, name_split_fasta, gene_ident, combined_out, translate):
|
|
232
|
-
|
|
233
|
-
|
|
231
|
+
def read_separate_files(input_dir, name_split_gff, name_split_fasta, gene_ident, combined_out, translate, run_as_combiner):
|
|
232
|
+
if run_as_combiner == True:
|
|
233
|
+
combined_out_file_aa = None
|
|
234
|
+
else:
|
|
235
|
+
combined_out_file_aa = combined_out.replace('_dna.fasta','_aa.fasta')
|
|
236
|
+
|
|
237
|
+
with open(combined_out, 'w') as combined_out_file, (open(combined_out_file_aa, 'w') if combined_out_file_aa else open(os.devnull, 'w')) as combined_out_file_aa:
|
|
238
|
+
paired_files_found = None
|
|
239
|
+
#with open(combined_out, 'w') as combined_out_file, open(combined_out.replace('_dna.fasta','_aa.fasta'), 'w') as combined_out_file_aa:
|
|
234
240
|
gff_files = glob.glob(os.path.join(input_dir, '*' + name_split_gff))
|
|
235
241
|
if not gff_files:
|
|
236
242
|
sys.exit("Error: No GFF files found.")
|
|
@@ -299,23 +305,40 @@ def read_separate_files(input_dir, name_split_gff, name_split_fasta, gene_ident,
|
|
|
299
305
|
full_sequence = fasta_dict[contig][1]
|
|
300
306
|
seq = full_sequence[corrected_start:corrected_stop]
|
|
301
307
|
|
|
302
|
-
if
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
+
if run_as_combiner == True:
|
|
309
|
+
if translate == True:
|
|
310
|
+
seq_aa = translate_frame(seq)
|
|
311
|
+
wrapped_sequence_aa = '\n'.join([seq_aa[i:i + 60] for i in range(0, len(seq_aa), 60)])
|
|
312
|
+
combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence_aa}\n")
|
|
313
|
+
else:
|
|
314
|
+
wrapped_sequence = '\n'.join([seq[i:i + 60] for i in range(0, len(seq), 60)])
|
|
315
|
+
combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")
|
|
316
|
+
else:
|
|
317
|
+
if translate == True:
|
|
318
|
+
seq_aa = translate_frame(seq)
|
|
319
|
+
wrapped_sequence_aa = '\n'.join([seq_aa[i:i + 60] for i in range(0, len(seq_aa), 60)])
|
|
320
|
+
combined_out_file_aa.write(f">{genome_name}|{seq_id}\n{wrapped_sequence_aa}\n")
|
|
321
|
+
wrapped_sequence = '\n'.join([seq[i:i + 60] for i in range(0, len(seq), 60)])
|
|
322
|
+
combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")
|
|
308
323
|
|
|
309
324
|
if not paired_files_found:
|
|
310
325
|
sys.exit("Could not find matching GFF/FASTA files - Please check input directory and -name_split_gff and -name_split_fasta parameters.")
|
|
311
326
|
if translate == False or translate == None:
|
|
312
327
|
#Clean up unused file
|
|
313
|
-
|
|
314
|
-
|
|
328
|
+
try: # Catches is combined_out_file_aa is None
|
|
329
|
+
if combined_out_file.name != combined_out_file_aa.name:
|
|
330
|
+
os.remove(combined_out_file_aa.name)
|
|
331
|
+
except AttributeError:
|
|
332
|
+
pass
|
|
315
333
|
|
|
316
334
|
|
|
317
|
-
def read_combined_files(input_dir, name_split, gene_ident, combined_out, translate):
|
|
318
|
-
|
|
335
|
+
def read_combined_files(input_dir, name_split, gene_ident, combined_out, translate, run_as_combiner):
|
|
336
|
+
if run_as_combiner == True:
|
|
337
|
+
combined_out_file_aa = None
|
|
338
|
+
else:
|
|
339
|
+
combined_out_file_aa = combined_out.replace('_dna.fasta','_aa.fasta')
|
|
340
|
+
#with open(combined_out, 'w') as combined_out_file, open(combined_out_file_aa, 'w') if combined_out_file_aa else open(os.devnull, 'w'):
|
|
341
|
+
with open(combined_out, 'w') as combined_out_file, (open(combined_out_file_aa, 'w') if combined_out_file_aa else open(os.devnull, 'w')) as combined_out_file_aa:
|
|
319
342
|
gff_files = glob.glob(os.path.join(input_dir, '*' + name_split))
|
|
320
343
|
if not gff_files:
|
|
321
344
|
sys.exit("Error: No GFF files found - check input directory and -name_split_gff parameter.")
|
|
@@ -355,7 +378,7 @@ def read_combined_files(input_dir, name_split, gene_ident, combined_out, transla
|
|
|
355
378
|
|
|
356
379
|
for contig, fasta in fasta_dict.items():
|
|
357
380
|
reverse_sequence = reverse_complement(fasta[0])
|
|
358
|
-
fasta_dict[contig][1]=reverse_sequence
|
|
381
|
+
fasta_dict[contig][1] = reverse_sequence
|
|
359
382
|
|
|
360
383
|
if fasta_dict and gff_features:
|
|
361
384
|
for contig, start, end, strand, feature, seq_id in gff_features:
|
|
@@ -369,22 +392,38 @@ def read_combined_files(input_dir, name_split, gene_ident, combined_out, transla
|
|
|
369
392
|
full_sequence = fasta_dict[contig][1]
|
|
370
393
|
seq = full_sequence[corrected_start:corrected_stop]
|
|
371
394
|
|
|
372
|
-
if
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
395
|
+
if run_as_combiner == True:
|
|
396
|
+
if translate == True:
|
|
397
|
+
seq_aa = translate_frame(seq)
|
|
398
|
+
wrapped_sequence_aa = '\n'.join([seq_aa[i:i + 60] for i in range(0, len(seq_aa), 60)])
|
|
399
|
+
combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence_aa}\n")
|
|
400
|
+
else:
|
|
401
|
+
wrapped_sequence = '\n'.join([seq[i:i + 60] for i in range(0, len(seq), 60)])
|
|
402
|
+
combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")
|
|
403
|
+
else:
|
|
404
|
+
if translate == True:
|
|
405
|
+
seq_aa = translate_frame(seq)
|
|
406
|
+
wrapped_sequence_aa = '\n'.join([seq_aa[i:i + 60] for i in range(0, len(seq_aa), 60)])
|
|
407
|
+
combined_out_file_aa.write(f">{genome_name}|{seq_id}\n{wrapped_sequence_aa}\n")
|
|
408
|
+
wrapped_sequence = '\n'.join([seq[i:i + 60] for i in range(0, len(seq), 60)])
|
|
409
|
+
combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")
|
|
378
410
|
|
|
379
411
|
if translate == False or translate == None:
|
|
380
412
|
#Clean up unused file
|
|
381
|
-
|
|
382
|
-
|
|
413
|
+
try: # Catches is combined_out_file_aa is None
|
|
414
|
+
if combined_out_file.name != combined_out_file_aa.name:
|
|
415
|
+
os.remove(combined_out_file_aa.name)
|
|
416
|
+
except AttributeError:
|
|
417
|
+
pass
|
|
383
418
|
|
|
384
419
|
|
|
385
420
|
|
|
386
|
-
def read_fasta_files(input_dir, name_split_fasta, combined_out, translate):
|
|
387
|
-
|
|
421
|
+
def read_fasta_files(input_dir, name_split_fasta, combined_out, translate, run_as_combiner):
|
|
422
|
+
if run_as_combiner == True:
|
|
423
|
+
combined_out_file_aa = None
|
|
424
|
+
else:
|
|
425
|
+
combined_out_file_aa = combined_out.replace('_dna.fasta','_aa.fasta')
|
|
426
|
+
with open(combined_out, 'w') as combined_out_file, (open(combined_out_file_aa, 'w') if combined_out_file_aa else open(os.devnull, 'w')) as combined_out_file_aa:
|
|
388
427
|
fasta_files = glob.glob(os.path.join(input_dir, '*' + name_split_fasta))
|
|
389
428
|
if not fasta_files:
|
|
390
429
|
sys.exit("Error: No GFF files found.")
|
|
@@ -400,17 +439,30 @@ def read_fasta_files(input_dir, name_split_fasta, combined_out, translate):
|
|
|
400
439
|
else:
|
|
401
440
|
fasta_dict[current_seq] +=line.strip()
|
|
402
441
|
for seq_id, seq in fasta_dict.items():
|
|
403
|
-
if
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
442
|
+
if run_as_combiner == True:
|
|
443
|
+
if translate == True:
|
|
444
|
+
seq_aa = translate_frame(seq)
|
|
445
|
+
wrapped_sequence_aa = '\n'.join([seq_aa[i:i + 60] for i in range(0, len(seq_aa), 60)])
|
|
446
|
+
combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence_aa}\n")
|
|
447
|
+
else:
|
|
448
|
+
wrapped_sequence = '\n'.join([seq[i:i + 60] for i in range(0, len(seq), 60)])
|
|
449
|
+
combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")
|
|
450
|
+
else:
|
|
451
|
+
if translate == True:
|
|
452
|
+
seq_aa = translate_frame(seq)
|
|
453
|
+
wrapped_sequence_aa = '\n'.join([seq_aa[i:i + 60] for i in range(0, len(seq_aa), 60)])
|
|
454
|
+
combined_out_file_aa.write(f">{genome_name}|{seq_id}\n{wrapped_sequence_aa}\n")
|
|
455
|
+
wrapped_sequence = '\n'.join([seq[i:i + 60] for i in range(0, len(seq), 60)])
|
|
456
|
+
combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")
|
|
409
457
|
|
|
410
458
|
if translate == False or translate == None:
|
|
411
459
|
#Clean up unused file
|
|
412
|
-
|
|
413
|
-
|
|
460
|
+
try: # Catches is combined_out_file_aa is None
|
|
461
|
+
if combined_out_file.name != combined_out_file_aa.name:
|
|
462
|
+
os.remove(combined_out_file_aa.name)
|
|
463
|
+
except AttributeError:
|
|
464
|
+
pass
|
|
465
|
+
|
|
414
466
|
|
|
415
467
|
def write_groups_func(options, output_dir, key_order, cores, sequences,
|
|
416
468
|
pangenome_clusters_First_sequences_sorted, combined_pangenome_clusters_Second_sequences):
|
|
@@ -533,7 +585,8 @@ def write_groups_func(options, output_dir, key_order, cores, sequences,
|
|
|
533
585
|
def perform_alignment(gene_path,group_directory, gene_file, options, concatenated_sequences, subgrouped):
|
|
534
586
|
# Read sequences from the gene family file
|
|
535
587
|
sequences = read_fasta(gene_path)
|
|
536
|
-
|
|
588
|
+
if len(sequences) == 1: # We can't align a single sequence
|
|
589
|
+
return concatenated_sequences
|
|
537
590
|
# Select the longest sequence for each genome
|
|
538
591
|
longest_sequences = select_longest_gene(sequences, subgrouped)
|
|
539
592
|
|
|
@@ -574,20 +627,18 @@ def process_gene_groups(options, group_directory, sub_group_directory, paralog_g
|
|
|
574
627
|
# Iterate over each gene family file
|
|
575
628
|
for gene_file in os.listdir(group_directory):
|
|
576
629
|
if gene_file.endswith(affix) and not gene_file.startswith('combined_group_sequences'):
|
|
577
|
-
#print(gene_file)
|
|
578
630
|
current_group = int(gene_file.split('_')[3].split('.')[0])
|
|
579
631
|
gene_path = os.path.join(group_directory, gene_file)
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
|
|
632
|
+
# Could add more catches here to work with First and Secondary groups - This ensures only core '99/100' are aligned
|
|
633
|
+
if 'First_core_99' in gene_file or 'First_core_100' in gene_file:
|
|
634
|
+
# Check for matching group in paralog_groups
|
|
635
|
+
if sub_group_directory and paralog_groups and '>Group_'+str(current_group) in paralog_groups:
|
|
636
|
+
for subgroup, size in enumerate(paralog_groups['>Group_' + str(current_group)]['sizes']):
|
|
637
|
+
if size >= threshold_size:
|
|
638
|
+
gene_path = os.path.join(sub_group_directory,f"Group_{current_group}_subgroup_{subgroup}{affix}")
|
|
639
|
+
concatenated_sequences = perform_alignment(gene_path, group_directory, gene_file, options, concatenated_sequences, True)
|
|
640
|
+
else:
|
|
641
|
+
concatenated_sequences = perform_alignment(gene_path, group_directory, gene_file, options, concatenated_sequences, False)
|
|
591
642
|
|
|
592
643
|
# Write the concatenated sequences to the output file
|
|
593
644
|
with open(output_file, 'w') as out:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.2
|
|
2
2
|
Name: PyamilySeq
|
|
3
|
-
Version: 1.1.0
|
|
3
|
+
Version: 1.1.1
|
|
4
4
|
Summary: PyamilySeq - A a tool to investigate sequence-based gene groups identified by clustering methods such as CD-HIT, DIAMOND, BLAST or MMseqs2.
|
|
5
5
|
Home-page: https://github.com/NickJD/PyamilySeq
|
|
6
6
|
Author: Nicholas Dimonaco
|
|
@@ -45,7 +45,7 @@ To update to the newest version add '-U' to end of the pip install command.
|
|
|
45
45
|
```commandline
|
|
46
46
|
usage: PyamilySeq.py [-h] {Full,Partial} ...
|
|
47
47
|
|
|
48
|
-
PyamilySeq v1.1.
|
|
48
|
+
PyamilySeq v1.1.1: A tool for gene clustering and analysis.
|
|
49
49
|
|
|
50
50
|
positional arguments:
|
|
51
51
|
{Full,Partial} Choose a mode: 'Full' or 'Partial'.
|
|
@@ -75,7 +75,7 @@ Escherichia_coli_110957|ENSB_TIZS9kbTvShDvyX Escherichia_coli_110957|ENSB_TIZS9k
|
|
|
75
75
|
```
|
|
76
76
|
### Example output:
|
|
77
77
|
```
|
|
78
|
-
Running PyamilySeq v1.1.0
|
|
78
|
+
Running PyamilySeq v1.1.1
|
|
79
79
|
Calculating Groups
|
|
80
80
|
Number of Genomes: 10
|
|
81
81
|
Gene Groups
|
|
@@ -220,7 +220,7 @@ Seq-Combiner -input_dir .../test_data/genomes -name_split_gff .gff3 -output_dir
|
|
|
220
220
|
```
|
|
221
221
|
usage: Seq_Combiner.py [-h] -input_dir INPUT_DIR -input_type {separate,combined,fasta} [-name_split_gff NAME_SPLIT_GFF] [-name_split_fasta NAME_SPLIT_FASTA] -output_dir OUTPUT_DIR -output_name OUTPUT_FILE [-gene_ident GENE_IDENT] [-translate] [-v]
|
|
222
222
|
|
|
223
|
-
PyamilySeq v1.1.
|
|
223
|
+
PyamilySeq v1.1.1: Seq-Combiner - A tool to extract sequences from GFF/FASTA files and prepare them for PyamilySeq.
|
|
224
224
|
|
|
225
225
|
options:
|
|
226
226
|
-h, --help show this help message and exit
|
|
@@ -263,7 +263,7 @@ usage: Group_Splitter.py [-h] -input_fasta INPUT_FASTA -sequence_type {AA,DNA}
|
|
|
263
263
|
[-M CLUSTERING_MEMORY] [-no_delete_temp_files]
|
|
264
264
|
[-verbose] [-v]
|
|
265
265
|
|
|
266
|
-
PyamilySeq v1.1.
|
|
266
|
+
PyamilySeq v1.1.1: Group-Splitter - A tool to split multi-copy gene groups
|
|
267
267
|
identified by PyamilySeq.
|
|
268
268
|
|
|
269
269
|
options:
|
|
@@ -316,7 +316,7 @@ Cluster-Summary -genome_num 10 -input_clstr .../test_data/species/E-coli/E-coli_
|
|
|
316
316
|
usage: Cluster_Summary.py [-h] -input_clstr INPUT_CLSTR -output OUTPUT -genome_num GENOME_NUM
|
|
317
317
|
[-output_dir OUTPUT_DIR] [-verbose] [-v]
|
|
318
318
|
|
|
319
|
-
PyamilySeq v1.1.
|
|
319
|
+
PyamilySeq v1.1.1: Cluster-Summary - A tool to summarise CD-HIT clustering files.
|
|
320
320
|
|
|
321
321
|
options:
|
|
322
322
|
-h, --help show this help message and exit
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|