PyamilySeq 1.0.1__py3-none-any.whl → 1.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- PyamilySeq/Cluster_Compare.py +108 -0
- PyamilySeq/Cluster_Summary.py +59 -64
- PyamilySeq/Group_Extractor.py +83 -0
- PyamilySeq/Group_Sizes.py +87 -0
- PyamilySeq/PyamilySeq.py +26 -18
- PyamilySeq/PyamilySeq_Genus.py +3 -3
- PyamilySeq/PyamilySeq_Species.py +10 -8
- PyamilySeq/Seq_Combiner.py +25 -8
- PyamilySeq/clusterings.py +0 -2
- PyamilySeq/constants.py +1 -1
- PyamilySeq/utils.py +197 -114
- {PyamilySeq-1.0.1.dist-info → pyamilyseq-1.1.1.dist-info}/METADATA +46 -85
- pyamilyseq-1.1.1.dist-info/RECORD +21 -0
- {PyamilySeq-1.0.1.dist-info → pyamilyseq-1.1.1.dist-info}/WHEEL +1 -1
- {PyamilySeq-1.0.1.dist-info → pyamilyseq-1.1.1.dist-info}/entry_points.txt +6 -0
- PyamilySeq-1.0.1.dist-info/RECORD +0 -18
- {PyamilySeq-1.0.1.dist-info → pyamilyseq-1.1.1.dist-info}/LICENSE +0 -0
- {PyamilySeq-1.0.1.dist-info → pyamilyseq-1.1.1.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
from collections import defaultdict
|
|
3
|
+
|
|
4
|
+
def read_cd_hit_output(clstr_file):
    """
    Read a CD-HIT .clstr file and map each sequence header to a cluster ID.

    Cluster IDs are generic counters that are incremented every time a
    ">Cluster" line is met (the names CD-HIT gives its clusters are ignored).

    Args:
        clstr_file: Path to the CD-HIT .clstr file.

    Returns:
        A dictionary whose keys are sequence headers (the text after the
        first '>' of a member line, up to the final '...') and whose values
        are the integer ID of the cluster the line was listed under.
    """
    seq_to_cluster = {}  # Maps sequence header -> cluster ID
    cluster_id = 0  # Generic ID for clusters (since CD-HIT names don't matter)

    with open(clstr_file, 'r') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                continue
            if line.startswith(">Cluster"):
                cluster_id += 1
                continue

            parts = line.split('\t')
            if len(parts) < 2:
                continue

            # The header sits between the first '>' and the trailing '...'.
            # Lines without a '>' are skipped instead of raising IndexError.
            _, marker, after_marker = parts[1].partition('>')
            if not marker:
                continue

            # Cut at the LAST '...' so headers that themselves contain '>' or
            # '...' are kept whole rather than being truncated.
            seq_header = after_marker.rsplit('...', 1)[0]
            seq_to_cluster[seq_header] = cluster_id

    return seq_to_cluster
|
|
24
|
+
|
|
25
|
+
def compare_cd_hit_clusters(file1, file2, output_file):
    """
    Compare two CD-HIT .clstr files and write the differences to a TSV file.

    Every sequence is classified at most once as "Only in file1",
    "Only in file2" or "Mismatch" (present in both files under different
    cluster IDs). In addition, every cluster whose member set is identical in
    both files but whose ID differs is reported as a "Cluster name change",
    one row per member sequence.

    Args:
        file1: Path to the first CD-HIT .clstr file.
        file2: Path to the second CD-HIT .clstr file.
        output_file: Path of the TSV report to write. Columns are
            Sequence, Cluster ID (File 1), Cluster ID (File 2), Change Type.

    Returns:
        None. Summary metrics are printed to standard output.
    """
    # Read both clustering files
    clusters1 = read_cd_hit_output(file1)
    clusters2 = read_cd_hit_output(file2)

    # Reverse mappings: cluster ID -> set of sequences
    grouped_clusters1 = defaultdict(set)
    grouped_clusters2 = defaultdict(set)
    for seq, cluster_id in clusters1.items():
        grouped_clusters1[cluster_id].add(seq)
    for seq, cluster_id in clusters2.items():
        grouped_clusters2[cluster_id].add(seq)

    # Metrics are counted per sequence so the printed numbers match their labels.
    cluster_name_changes = 0
    sequence_shifts = 0
    only_in_file1 = 0
    only_in_file2 = 0

    # Rows for the TSV output
    tsv_data = []

    for seq, id1 in clusters1.items():
        if seq not in clusters2:
            only_in_file1 += 1
            tsv_data.append([seq, id1, "NA", "Only in file1"])
        elif clusters2[seq] != id1:
            sequence_shifts += 1
            tsv_data.append([seq, id1, clusters2[seq], "Mismatch"])

    # Sequences shared by both files were already compared in the loop above;
    # repeating the comparison here would count and report every shift twice.
    for seq, id2 in clusters2.items():
        if seq not in clusters1:
            only_in_file2 += 1
            tsv_data.append([seq, "NA", id2, "Only in file2"])

    # Track cluster name changes (same sequences, different cluster ID).
    # Clusters within one file are disjoint, so a frozenset of members
    # identifies at most one cluster and a dictionary lookup replaces the
    # pairwise comparison of every cluster against every other cluster.
    ids_by_members2 = {frozenset(seqs): cid for cid, seqs in grouped_clusters2.items()}
    for id1, seqs1 in grouped_clusters1.items():
        id2 = ids_by_members2.get(frozenset(seqs1))
        if id2 is not None and id2 != id1:
            cluster_name_changes += 1
            # Sorted so the report does not depend on set iteration order.
            for seq in sorted(seqs1):
                tsv_data.append([seq, id1, id2, "Cluster name change"])

    # Print metrics
    print("🔢 Clustering Comparison Metrics:")
    print(f"Cluster name changes: {cluster_name_changes}")
    print(f"Sequence shifts (sequences assigned to different clusters): {sequence_shifts}")
    print(f"Sequences only in the first file: {only_in_file1}")
    print(f"Sequences only in the second file: {only_in_file2}")
    print()

    # Write the results to a TSV file
    with open(output_file, 'w') as out_file:
        out_file.write("Sequence\tCluster ID (File 1)\tCluster ID (File 2)\tChange Type\n")
        for row in tsv_data:
            out_file.write("\t".join(map(str, row)) + "\n")

    print(f"✅ Results have been written to {output_file}")
|
|
97
|
+
|
|
98
|
+
def main():
    """Command-line entry point: read the three required paths and run the comparison."""
    parser = argparse.ArgumentParser(description="Compare two CD-HIT .clstr files to check for clustering consistency.")

    # All three options are mandatory and differ only in flag name and help text.
    required_flags = (
        ("-file1", "First CD-HIT .clstr file"),
        ("-file2", "Second CD-HIT .clstr file"),
        ("-output", "Output file (TSV format)"),
    )
    for flag, help_text in required_flags:
        parser.add_argument(flag, required=True, help=help_text)

    args = parser.parse_args()
    compare_cd_hit_clusters(args.file1, args.file2, args.output)


if __name__ == "__main__":
    main()
|
PyamilySeq/Cluster_Summary.py
CHANGED
|
@@ -1,33 +1,32 @@
|
|
|
1
1
|
import argparse
|
|
2
|
-
from collections import OrderedDict
|
|
3
|
-
from collections import defaultdict
|
|
2
|
+
from collections import OrderedDict, defaultdict
|
|
4
3
|
|
|
5
4
|
try:
|
|
6
5
|
from .constants import *
|
|
7
6
|
from .utils import *
|
|
8
|
-
except (ModuleNotFoundError, ImportError, NameError, TypeError)
|
|
7
|
+
except (ModuleNotFoundError, ImportError, NameError, TypeError):
|
|
9
8
|
from constants import *
|
|
10
9
|
from utils import *
|
|
11
10
|
|
|
12
11
|
|
|
13
12
|
def categorise_percentage(percent):
    """Categorise the percentage of genomes with multicopy genes.

    Args:
        percent: Percentage value to categorise.

    Returns:
        The label of the band containing ``percent`` ("20-40%", "40-60%",
        "60-80%", "80-95%", "95-99%" or "99-100%"), or None when the value is
        below 20 or above 100. Bands include their lower bound and exclude
        their upper bound, except the last one, which also includes 100.
    """
    categories = (
        (20, 40, "20-40%"),
        (40, 60, "40-60%"),
        (60, 80, "60-80%"),
        (80, 95, "80-95%"),
        (95, 99, "95-99%"),
        (99, 100, "99-100%"),
    )
    for low, high, label in categories:
        if low <= percent < high:
            return label
    # The top band is closed on the right: with a purely half-open test a
    # value of exactly 100% would fall through and be left uncategorised.
    if percent == 100:
        return "99-100%"
    return None
|
|
28
26
|
|
|
29
|
-
|
|
27
|
+
|
|
30
28
|
def read_cd_hit_output(clustering_output):
|
|
29
|
+
"""Parse CD-HIT .cluster file and extract clustering information."""
|
|
31
30
|
clusters = OrderedDict()
|
|
32
31
|
|
|
33
32
|
with open(clustering_output, 'r') as f:
|
|
@@ -42,10 +41,8 @@ def read_cd_hit_output(clustering_output):
|
|
|
42
41
|
parts = line.split('\t')
|
|
43
42
|
if len(parts) > 1:
|
|
44
43
|
clustered_info = parts[1]
|
|
45
|
-
length = clustered_info.split(',')[0]
|
|
46
|
-
|
|
47
|
-
clustered_header = clustered_info.split('>')[1].split('...')[0]
|
|
48
|
-
clustered_header = '>' + clustered_header
|
|
44
|
+
length = int(''.join(c for c in clustered_info.split(',')[0] if c.isdigit()))
|
|
45
|
+
clustered_header = '>' + clustered_info.split('>')[1].split('...')[0]
|
|
49
46
|
|
|
50
47
|
if 'at ' in clustered_info and '%' in clustered_info.split('at ')[-1]:
|
|
51
48
|
percent_identity = extract_identity(clustered_info)
|
|
@@ -63,12 +60,14 @@ def read_cd_hit_output(clustering_output):
|
|
|
63
60
|
return clusters
|
|
64
61
|
|
|
65
62
|
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
multicopy_groups = defaultdict(int) # Counter for
|
|
63
|
+
def summarise_clusters(options, clusters, output):
|
|
64
|
+
"""Generate a detailed cluster summary report."""
|
|
65
|
+
multicopy_groups = defaultdict(int) # Counter for clusters with multicopy genes
|
|
69
66
|
|
|
70
67
|
with open(output, 'w') as out_f:
|
|
71
|
-
out_f.write(
|
|
68
|
+
out_f.write(
|
|
69
|
+
"Cluster_ID\tNum_Sequences\tNum_Genomes\tAvg_Length\tLength_Range\tAvg_Identity\tIdentity_Range\tGenomes_With_Multiple_Genes\tMulticopy_Percentage\n"
|
|
70
|
+
)
|
|
72
71
|
|
|
73
72
|
for cluster_id, seqs in clusters.items():
|
|
74
73
|
num_seqs = len(seqs)
|
|
@@ -81,82 +80,78 @@ def summarise_clusters(options,clusters, output):
|
|
|
81
80
|
avg_identity = sum(identities) / num_seqs if num_seqs > 0 else 0
|
|
82
81
|
identity_range = f"{min(identities):.2f}-{max(identities):.2f}" if num_seqs > 0 else "N/A"
|
|
83
82
|
|
|
84
|
-
|
|
85
|
-
f"{cluster_id}\t{num_seqs}\t{avg_length:.2f}\t{length_range}\t{avg_identity:.2f}\t{identity_range}\n")
|
|
86
|
-
|
|
87
|
-
# Count genomes with more than one gene
|
|
83
|
+
# Count genomes in cluster
|
|
88
84
|
genome_to_gene_count = defaultdict(int)
|
|
89
85
|
for seq in seqs:
|
|
90
|
-
genome = seq['header'].split('|')[0].replace('>','')
|
|
86
|
+
genome = seq['header'].split('|')[0].replace('>', '')
|
|
91
87
|
genome_to_gene_count[genome] += 1
|
|
92
88
|
|
|
89
|
+
num_genomes = len(genome_to_gene_count)
|
|
93
90
|
num_genomes_with_multiple_genes = sum(1 for count in genome_to_gene_count.values() if count > 1)
|
|
91
|
+
multicopy_percentage = (num_genomes_with_multiple_genes / options.genome_num) * 100 if options.genome_num > 0 else 0
|
|
94
92
|
|
|
95
|
-
#
|
|
96
|
-
|
|
97
|
-
multicopy_percentage = (num_genomes_with_multiple_genes / options.genome_num) * 100
|
|
93
|
+
# Categorize multicopy percentage
|
|
98
94
|
category = categorise_percentage(multicopy_percentage)
|
|
99
95
|
if category:
|
|
100
96
|
multicopy_groups[category] += 1
|
|
101
97
|
|
|
102
|
-
|
|
103
|
-
|
|
98
|
+
# Write detailed output for each cluster
|
|
99
|
+
out_f.write(
|
|
100
|
+
f"{cluster_id}\t{num_seqs}\t{num_genomes}\t{avg_length:.2f}\t{length_range}\t{avg_identity:.2f}\t{identity_range}\t"
|
|
101
|
+
f"{num_genomes_with_multiple_genes}\t{multicopy_percentage:.2f}\n"
|
|
102
|
+
)
|
|
104
103
|
|
|
105
|
-
#
|
|
104
|
+
# Define order for multicopy statistics output
|
|
105
|
+
category_order = ["20-40%", "40-60%", "60-80%", "80-95%", "95-99%", "99-100%"]
|
|
106
106
|
for category in category_order:
|
|
107
|
-
print(f"
|
|
107
|
+
print(f"Clusters with multicopy genes in {category} range: {multicopy_groups[category]}")
|
|
108
108
|
|
|
109
109
|
|
|
110
|
-
# Main function to parse arguments and run the analysis
|
|
111
110
|
def main():
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
required.add_argument('-input_clstr', action="store", dest="input_clstr",
|
|
116
|
-
help='Input CD-HIT .clstr file',
|
|
117
|
-
required=True)
|
|
118
|
-
required.add_argument('-output', action="store", dest="output",
|
|
119
|
-
help="Output TSV file to store cluster summaries - Will add '.tsv' if not provided by user",
|
|
120
|
-
required=True)
|
|
121
|
-
required.add_argument('-genome_num', action='store', dest='genome_num', type=int,
|
|
122
|
-
help='The total number of genomes must be provide',
|
|
123
|
-
required=True)
|
|
124
|
-
#required.add_argument("-clustering_format", action="store", dest="clustering_format", choices=['CD-HIT','TSV','CSV'],
|
|
125
|
-
# help="Clustering format to use: CD-HIT or TSV (MMseqs2, BLAST, DIAMOND) / CSV edge-list file (Node1\tNode2).",
|
|
126
|
-
# required=True)
|
|
111
|
+
"""Main function to parse arguments and process clustering files."""
|
|
112
|
+
parser = argparse.ArgumentParser(
|
|
113
|
+
description='PyamilySeq ' + PyamilySeq_Version + ': Cluster-Summary - A tool to summarise CD-HIT clustering files.')
|
|
127
114
|
|
|
115
|
+
# Required Arguments
|
|
116
|
+
required = parser.add_argument_group('Required Parameters')
|
|
117
|
+
required.add_argument('-input_cluster', action="store", dest="input_cluster", required=True,
|
|
118
|
+
help='Input CD-HIT .cluster file')
|
|
119
|
+
required.add_argument('-output', action="store", dest="output", required=True,
|
|
120
|
+
help="Output TSV file to store cluster summaries - Will add '.tsv' if not provided by user")
|
|
121
|
+
required.add_argument('-genome_num', action='store', dest='genome_num', type=int, required=True,
|
|
122
|
+
help='Total number of genomes in dataset')
|
|
123
|
+
|
|
124
|
+
# Optional Arguments
|
|
128
125
|
optional = parser.add_argument_group('Optional Arguments')
|
|
129
126
|
optional.add_argument('-output_dir', action="store", dest="output_dir",
|
|
130
|
-
help='Default: Same as input file',
|
|
131
|
-
required=False)
|
|
127
|
+
help='Default: Same as input file', required=False)
|
|
132
128
|
|
|
133
129
|
misc = parser.add_argument_group("Misc Parameters")
|
|
134
130
|
misc.add_argument("-verbose", action="store_true", dest="verbose",
|
|
135
|
-
help="Print verbose output.",
|
|
136
|
-
required=False)
|
|
131
|
+
help="Print verbose output.", required=False)
|
|
137
132
|
misc.add_argument("-v", "--version", action="version",
|
|
138
133
|
version=f"PyamilySeq: Group-Summary version {PyamilySeq_Version} - Exiting",
|
|
139
134
|
help="Print out version number and exit")
|
|
140
135
|
|
|
141
|
-
|
|
142
136
|
options = parser.parse_args()
|
|
143
|
-
print("Running PyamilySeq " + PyamilySeq_Version+ ": Group-Summary ")
|
|
137
|
+
print("Running PyamilySeq " + PyamilySeq_Version + ": Group-Summary ")
|
|
144
138
|
|
|
145
|
-
|
|
146
|
-
options.
|
|
139
|
+
# File handling
|
|
140
|
+
options.input_cluster = fix_path(options.input_cluster)
|
|
147
141
|
if options.output_dir is None:
|
|
148
|
-
options.output_dir = os.path.dirname(os.path.abspath(options.
|
|
142
|
+
options.output_dir = os.path.dirname(os.path.abspath(options.input_cluster))
|
|
149
143
|
output_path = os.path.abspath(options.output_dir)
|
|
150
144
|
if not os.path.exists(output_path):
|
|
151
145
|
os.makedirs(output_path)
|
|
146
|
+
|
|
152
147
|
output_name = options.output
|
|
153
148
|
if not output_name.endswith('.tsv'):
|
|
154
149
|
output_name += '.tsv'
|
|
155
150
|
output_file_path = os.path.join(output_path, output_name)
|
|
156
|
-
###
|
|
157
151
|
|
|
158
|
-
clusters
|
|
159
|
-
|
|
152
|
+
# Process clusters and generate summary
|
|
153
|
+
clusters = read_cd_hit_output(options.input_cluster)
|
|
154
|
+
summarise_clusters(options, clusters, output_file_path)
|
|
160
155
|
|
|
161
156
|
|
|
162
157
|
if __name__ == "__main__":
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import os
|
|
3
|
+
import csv
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def parse_fasta(fasta_file):
    """
    Parse a FASTA file and return a dictionary of gene IDs and sequences.

    The gene ID is taken from the first whitespace-delimited token of each
    header: the second '|'-separated field with every 'ENSB_' removed. When
    the token has no '|', the whole token is used instead (again with 'ENSB_'
    removed) rather than failing. Records whose resulting ID is empty are
    not stored.

    Args:
        fasta_file: Path to the FASTA file.

    Returns:
        dict mapping gene ID to its sequence, with the sequence lines of the
        record concatenated. A later record with the same ID replaces an
        earlier one.
    """
    sequences = {}
    gene_id = None
    chunks = []

    with open(fasta_file, 'r') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line.startswith(">"):
                chunks.append(line)
                continue

            if gene_id:  # Save the previous gene
                sequences[gene_id] = ''.join(chunks)

            # A bare ">" header has no tokens; treat it as an empty ID.
            tokens = line[1:].split()
            first_token = tokens[0] if tokens else ''
            fields = first_token.split('|')
            # Fall back to the whole token when there is no '|' separator.
            raw_id = fields[1] if len(fields) > 1 else first_token
            gene_id = raw_id.replace('ENSB_', '')
            chunks = []

    if gene_id:  # Save the last gene
        sequences[gene_id] = ''.join(chunks)
    return sequences
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def parse_csv(csv_file, first_gene_column=14):
    """
    Parse a comma-separated file to extract group IDs and their gene IDs.

    The first line is treated as a header and skipped. For every other
    non-empty row, the first column is the group ID and every column from
    ``first_gene_column`` onward holds zero or more gene IDs separated by ';'.

    Args:
        csv_file: Path to the CSV file.
        first_gene_column: Zero-based index of the first column that holds
            gene IDs (default 14, the value previously hard-coded).

    Returns:
        dict mapping group ID to a flat list of whitespace-stripped,
        non-empty gene IDs. A repeated group ID keeps the last row seen.
    """
    groups = {}
    # newline='' is what the csv module asks for when it is handed a file object.
    with open(csv_file, 'r', newline='') as f:
        reader = csv.reader(f, delimiter=',')
        next(reader, None)  # Skip the header line; tolerate an empty file
        for row in reader:
            if not row:  # csv.reader yields [] for blank lines
                continue
            groups[row[0]] = [
                gene.strip()
                for cell in row[first_gene_column:]
                for gene in cell.split(';')
                if gene.strip()
            ]
    return groups
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def write_group_fastas(groups, sequences, output_dir):
    """
    Write one FASTA file per group containing the sequences of its genes.

    Each group produces ``<output_dir>/<group_id>.fasta``. Gene IDs that are
    not present in ``sequences`` are reported with a warning on standard
    output and left out of the file.

    Args:
        groups: Mapping of group ID to an iterable of gene IDs.
        sequences: Mapping of gene ID to sequence string.
        output_dir: Directory to write into; created if it does not exist.
    """
    # exist_ok avoids the check-then-create race of testing os.path.exists first.
    os.makedirs(output_dir, exist_ok=True)

    for group_id, gene_ids in groups.items():
        group_file = os.path.join(output_dir, f"{group_id}.fasta")
        with open(group_file, 'w') as f:
            for gene_id in gene_ids:
                if gene_id not in sequences:
                    print(f"Warning: Gene ID {gene_id} not found in FASTA file.")
                    continue
                f.write(f">{gene_id}\n{sequences[gene_id]}\n")
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def main():
    """Command-line entry point: parse the inputs, then write one FASTA file per group."""
    parser = argparse.ArgumentParser(description="Process FASTA and CSV files to create grouped FASTA outputs.")

    # Every option is mandatory; only the flag and its help text vary.
    required_flags = (
        ("-fasta", "Input FASTA file containing gene sequences."),
        ("-csv", "Input CSV file containing group and gene information."),
        ("-output_dir", "Directory to save the grouped FASTA files."),
    )
    for flag, help_text in required_flags:
        parser.add_argument(flag, required=True, help=help_text)

    args = parser.parse_args()

    # Load both inputs before anything is written.
    print("Parsing FASTA file...")
    sequences = parse_fasta(args.fasta)
    print("Parsing CSV file...")
    groups = parse_csv(args.csv)

    # Emit the per-group FASTA files.
    print("Writing grouped FASTA files...")
    write_group_fastas(groups, sequences, args.output_dir)
    print("Process completed successfully.")


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import os
|
|
3
|
+
import csv
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def parse_fasta_stats(fasta_file):
    """
    Parse a FASTA file and calculate sequence length statistics.

    Every header line ('>' prefix) starts a record, so a record with no
    sequence lines is counted with length 0 instead of being dropped.
    Non-empty sequence text found before the first header is counted as one
    record as well.

    Args:
        fasta_file: Path to the FASTA file.

    Returns:
        dict with the keys "num_sequences", "min_length", "max_length",
        "avg_length", "length_diff" (max - min) and "percent_diff"
        (length_diff as a percentage of min_length, or 0 when min_length
        is 0). All values are 0 when the file holds no records.
    """
    lengths = []
    seen_header = False
    current_length = 0

    with open(fasta_file, 'r') as f:
        for raw_line in f:
            line = raw_line.strip()
            if line.startswith(">"):
                # Close the previous record (or leading header-less sequence).
                if seen_header or current_length:
                    lengths.append(current_length)
                seen_header = True
                current_length = 0
            else:
                # Only the length is needed, so the text is never joined.
                current_length += len(line)

    if seen_header or current_length:  # Save the last sequence length
        lengths.append(current_length)

    # Calculate statistics
    num_sequences = len(lengths)
    if num_sequences > 0:
        avg_length = sum(lengths) / num_sequences
        min_length = min(lengths)
        max_length = max(lengths)
        length_diff = max_length - min_length
        percent_diff = (length_diff / min_length * 100) if min_length > 0 else 0
    else:
        avg_length = min_length = max_length = length_diff = percent_diff = 0

    return {
        "num_sequences": num_sequences,
        "min_length": min_length,
        "max_length": max_length,
        "avg_length": avg_length,
        "length_diff": length_diff,
        "percent_diff": percent_diff
    }
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def process_fasta_directory(input_dir, output_csv):
    """
    Process a directory of FASTA files and write their statistics to a CSV file.

    Only directory entries whose name ends with ".fasta" are read. One row is
    written per file, in sorted file-name order so that repeated runs produce
    the same output regardless of the order the operating system lists the
    directory in.

    Args:
        input_dir: Directory containing the FASTA files.
        output_csv: Path of the CSV file to write.
    """
    stat_fields = ["num_sequences", "min_length", "max_length", "avg_length", "length_diff",
                   "percent_diff"]

    results = []
    for filename in sorted(os.listdir(input_dir)):
        if not filename.endswith(".fasta"):
            continue
        stats = parse_fasta_stats(os.path.join(input_dir, filename))
        row = {"file_name": filename}
        # Copy exactly the columns written below, in header order.
        row.update((field, stats[field]) for field in stat_fields)
        results.append(row)

    # Write results to a CSV file
    with open(output_csv, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=["file_name"] + stat_fields)
        writer.writeheader()
        writer.writerows(results)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def main():
    """Command-line entry point: summarise every FASTA file of a directory into one CSV."""
    parser = argparse.ArgumentParser(description="Summarize sequence statistics for a directory of FASTA files.")

    # Both options are mandatory; only the flag and its help text vary.
    required_flags = (
        ("-input_dir", "Directory containing FASTA files."),
        ("-output_csv", "Output CSV file to save statistics."),
    )
    for flag, help_text in required_flags:
        parser.add_argument(flag, required=True, help=help_text)

    args = parser.parse_args()

    # Run the summary and report where it was written.
    print("Processing FASTA files...")
    process_fasta_directory(args.input_dir, args.output_csv)
    print(f"Statistics saved to {args.output_csv}")


if __name__ == "__main__":
    main()
|
PyamilySeq/PyamilySeq.py
CHANGED
|
@@ -20,8 +20,8 @@ def run_cd_hit(options, input_file, clustering_output, clustering_mode):
|
|
|
20
20
|
clustering_mode,
|
|
21
21
|
'-i', input_file,
|
|
22
22
|
'-o', clustering_output,
|
|
23
|
-
'-c',
|
|
24
|
-
'-s',
|
|
23
|
+
'-c', f"{float(options.pident):.2f}",
|
|
24
|
+
'-s', f"{float(options.len_diff):.2f}",
|
|
25
25
|
'-T', str(options.threads),
|
|
26
26
|
'-M', str(options.mem),
|
|
27
27
|
'-d', "0",
|
|
@@ -54,16 +54,19 @@ def main():
|
|
|
54
54
|
help="Directory containing GFF/FASTA files - Use with -input_type separate/combined.")
|
|
55
55
|
full_parser.add_argument("-input_fasta", required=False,
|
|
56
56
|
help="Input FASTA file - Use with - input_type fasta.")
|
|
57
|
-
full_parser.add_argument("-
|
|
58
|
-
help="Substring to split filenames and extract genome names (e.g., '_combined.gff3') - Use with -input_type separate/combined.")
|
|
57
|
+
full_parser.add_argument("-name_split_gff", required=False,
|
|
58
|
+
help="Substring to split filenames and extract genome names for gff files (e.g., '_combined.gff3') - Use with -input_type separate/combined.")
|
|
59
|
+
full_parser.add_argument("-name_split_fasta", required=False,
|
|
60
|
+
help="Substring to split filenames and extract genome names for fasta files if named differently to paired gff files (e.g., '_dna.fasta') - Use with -input_type separate/combined.")
|
|
59
61
|
full_parser.add_argument("-sequence_type", choices=['AA', 'DNA'], default="AA", required=False,
|
|
60
62
|
help="Clustering mode: 'DNA' or 'AA'.")
|
|
61
63
|
full_parser.add_argument("-gene_ident", default="CDS", required=False,
|
|
62
64
|
help="Gene identifiers to extract sequences (e.g., 'CDS, tRNA').")
|
|
63
|
-
full_parser.add_argument("-c", type=
|
|
65
|
+
full_parser.add_argument("-c", type=str, dest="pident", default="0.90", required=False,
|
|
64
66
|
help="Sequence identity threshold for clustering (default: 0.90) - CD-HIT parameter '-c'.")
|
|
65
|
-
full_parser.add_argument("-s", type=
|
|
67
|
+
full_parser.add_argument("-s", type=str, dest="len_diff", default="0.80", required=False,
|
|
66
68
|
help="Length difference threshold for clustering (default: 0.80) - CD-HIT parameter '-s'.")
|
|
69
|
+
|
|
67
70
|
full_parser.add_argument("-fast_mode", action="store_true", required=False,
|
|
68
71
|
help="Enable fast mode for CD-HIT (not recommended) - CD-HIT parameter '-g'.")
|
|
69
72
|
|
|
@@ -91,12 +94,12 @@ def main():
|
|
|
91
94
|
help="Gene groupings for 'Species' mode (default: '99,95,15').")
|
|
92
95
|
subparser.add_argument("-genus_groups", default="1,2,3,4,5,6,7,8,9,10", required=False,
|
|
93
96
|
help="Gene groupings for 'Genus' mode (default: '1-10').")
|
|
94
|
-
subparser.add_argument("-
|
|
95
|
-
help="Output gene groups as a single FASTA file (specify levels: e.g., '-w 99,95').")
|
|
96
|
-
subparser.add_argument("-
|
|
97
|
+
subparser.add_argument("-write_groups", default=None, dest="write_groups", required=False,
|
|
98
|
+
help="Output gene groups as a single FASTA file (specify levels: e.g., '-w 99,95'). - triggers '-wig'.")
|
|
99
|
+
subparser.add_argument("-write_individual_groups", action="store_true", dest="write_individual_groups", required=False,
|
|
97
100
|
help="Output individual FASTA files for each group.")
|
|
98
|
-
subparser.add_argument("-
|
|
99
|
-
help="Align and concatenate sequences for 'core' groups.")
|
|
101
|
+
subparser.add_argument("-align", action="store_true", dest="align_core", required=False,
|
|
102
|
+
help="Align and concatenate sequences for 'core' groups (those in 99-100% of genomes).")
|
|
100
103
|
subparser.add_argument("-align_aa", action="store_true", required=False,
|
|
101
104
|
help="Align sequences as amino acids.")
|
|
102
105
|
subparser.add_argument("-no_gpa", action="store_false", dest="gene_presence_absence_out", required=False,
|
|
@@ -115,6 +118,9 @@ def main():
|
|
|
115
118
|
# Parse Arguments
|
|
116
119
|
options = parser.parse_args()
|
|
117
120
|
|
|
121
|
+
if options.write_groups != None and options.write_individual_groups == False:
|
|
122
|
+
options.write_individual_groups = True
|
|
123
|
+
|
|
118
124
|
# Example of conditional logic based on selected mode
|
|
119
125
|
print(f"Running PyamilySeq {PyamilySeq_Version} in {options.run_mode} mode:")
|
|
120
126
|
if options.run_mode == "Full" and options.verbose == True:
|
|
@@ -129,13 +135,13 @@ def main():
|
|
|
129
135
|
sys.exit("Currently reclustering only works on Partial Mode.")
|
|
130
136
|
required_full_mode = [options.input_type, options.pident, options.len_diff]
|
|
131
137
|
if options.input_type != 'fasta':
|
|
132
|
-
required_full_mode.extend([options.input_dir, options.
|
|
138
|
+
required_full_mode.extend([options.input_dir, options.name_split_gff])
|
|
133
139
|
if all(required_full_mode):
|
|
134
140
|
# Proceed with the Full mode
|
|
135
141
|
pass
|
|
136
142
|
else:
|
|
137
143
|
missing_options = [opt for opt in
|
|
138
|
-
['input_type', 'input_dir', '
|
|
144
|
+
['input_type', 'input_dir', 'name_split_gff', 'clustering_format', 'pident', 'len_diff'] if
|
|
139
145
|
not options.__dict__.get(opt)]
|
|
140
146
|
sys.exit(f"Missing required options for Full mode: {', '.join(missing_options)}")
|
|
141
147
|
if options.align_core:
|
|
@@ -182,13 +188,13 @@ def main():
|
|
|
182
188
|
elif options.sequence_type == 'AA':
|
|
183
189
|
clustering_mode = 'cd-hit'
|
|
184
190
|
if options.fast_mode == True:
|
|
185
|
-
options.fast_mode =
|
|
191
|
+
options.fast_mode = 1
|
|
186
192
|
if options.verbose == True:
|
|
187
193
|
print("Running CD-HIT in fast mode.")
|
|
188
194
|
else:
|
|
189
|
-
options.fast_mode =
|
|
195
|
+
options.fast_mode = 0
|
|
190
196
|
if options.verbose == True:
|
|
191
|
-
print("Running CD-HIT in
|
|
197
|
+
print("Running CD-HIT in accurate mode.")
|
|
192
198
|
else:
|
|
193
199
|
exit("cd-hit is not installed. Please install cd-hit to proceed.")
|
|
194
200
|
|
|
@@ -234,10 +240,10 @@ def main():
|
|
|
234
240
|
translate = False
|
|
235
241
|
file_to_cluster = combined_out_file
|
|
236
242
|
if options.input_type == 'separate':
|
|
237
|
-
read_separate_files(options.input_dir, options.
|
|
243
|
+
read_separate_files(options.input_dir, options.name_split_gff, options.name_split_fasta, options.gene_ident, combined_out_file, translate, False)
|
|
238
244
|
run_cd_hit(options, file_to_cluster, clustering_output, clustering_mode)
|
|
239
245
|
elif options.input_type == 'combined':
|
|
240
|
-
read_combined_files(options.input_dir, options.
|
|
246
|
+
read_combined_files(options.input_dir, options.name_split_gff, options.gene_ident, combined_out_file, translate, False)
|
|
241
247
|
run_cd_hit(options, file_to_cluster, clustering_output, clustering_mode)
|
|
242
248
|
elif options.input_type == 'fasta':
|
|
243
249
|
combined_out_file = options.input_fasta
|
|
@@ -276,6 +282,8 @@ def main():
|
|
|
276
282
|
clustering_options = clustering_options()
|
|
277
283
|
|
|
278
284
|
elif options.run_mode == 'Partial':
|
|
285
|
+
if not os.path.exists(output_path):
|
|
286
|
+
os.makedirs(output_path)
|
|
279
287
|
class clustering_options:
|
|
280
288
|
def __init__(self):
|
|
281
289
|
self.run_mode = options.run_mode
|
PyamilySeq/PyamilySeq_Genus.py
CHANGED
|
@@ -23,8 +23,8 @@ def gene_presence_absence_output(options, genus_dict, pangenome_clusters_First_s
|
|
|
23
23
|
gpa_outfile.write('"\n')
|
|
24
24
|
for cluster, sequences in pangenome_clusters_First_sequences_sorted.items():
|
|
25
25
|
average_sequences_per_genome = len(sequences) / len(pangenome_clusters_First_sorted[cluster])
|
|
26
|
-
gpa_outfile.write('"group_'+str(cluster)+'","","",'+str(len(pangenome_clusters_First_sorted[cluster]))+'","'+str(len(sequences))+'","'+str(average_sequences_per_genome)+
|
|
27
|
-
'","","","","","","","",""
|
|
26
|
+
gpa_outfile.write('"group_'+str(cluster)+'","","","'+str(len(pangenome_clusters_First_sorted[cluster]))+'","'+str(len(sequences))+'","'+str(average_sequences_per_genome)+
|
|
27
|
+
'","","","","","","","",""')
|
|
28
28
|
|
|
29
29
|
|
|
30
30
|
for genus in genus_dict.keys():
|
|
@@ -34,7 +34,7 @@ def gene_presence_absence_output(options, genus_dict, pangenome_clusters_First_s
|
|
|
34
34
|
if value.split('_')[0] == genus:
|
|
35
35
|
tmp_list.append(value)
|
|
36
36
|
if tmp_list:
|
|
37
|
-
full_out += ',"'+''.join(tmp_list)+'"'
|
|
37
|
+
full_out += ',"'+' '.join(tmp_list)+'"'
|
|
38
38
|
else:
|
|
39
39
|
full_out = ',""'
|
|
40
40
|
gpa_outfile.write(full_out)
|
PyamilySeq/PyamilySeq_Species.py
CHANGED
|
@@ -21,7 +21,7 @@ def gene_presence_absence_output(options, genome_dict, pangenome_clusters_First_
|
|
|
21
21
|
gpa_outfile.write('"\n')
|
|
22
22
|
for cluster, sequences in pangenome_clusters_First_sequences_sorted.items():
|
|
23
23
|
average_sequences_per_genome = len(sequences) / len(pangenome_clusters_First_sorted[cluster])
|
|
24
|
-
gpa_outfile.write('"group_'+str(cluster)+'","","",'+str(len(pangenome_clusters_First_sorted[cluster]))+'","'+str(len(sequences))+'","'+str(average_sequences_per_genome)+
|
|
24
|
+
gpa_outfile.write('"group_'+str(cluster)+'","","","'+str(len(pangenome_clusters_First_sorted[cluster]))+'","'+str(len(sequences))+'","'+str(average_sequences_per_genome)+
|
|
25
25
|
'","","","","","","","","",""')
|
|
26
26
|
|
|
27
27
|
|
|
@@ -32,7 +32,7 @@ def gene_presence_absence_output(options, genome_dict, pangenome_clusters_First_
|
|
|
32
32
|
if value.split('|')[0] == genome:
|
|
33
33
|
tmp_list.append(value.split('|')[1])
|
|
34
34
|
if tmp_list:
|
|
35
|
-
full_out += ',"'+'
|
|
35
|
+
full_out += ',"'+' '.join(tmp_list)+'"'
|
|
36
36
|
else:
|
|
37
37
|
full_out = ',""'
|
|
38
38
|
gpa_outfile.write(full_out)
|
|
@@ -120,12 +120,14 @@ def calc_Second_only_core(cluster, Second_num, groups, cores):
|
|
|
120
120
|
|
|
121
121
|
#@profile
|
|
122
122
|
def calc_only_Second_only_core(cluster, Second_num, groups, cores):  # only count the true storf onlies
    """Record ``cluster`` under the family group whose range contains ``Second_num``.

    Args:
        cluster: Cluster identifier to record.
        Second_num: Value tested against each group's range.
        groups: Mapping of group name (str) to a two-item pair; a group
            matches when ``pair[0] <= Second_num <= pair[1]``.
        cores: Mapping that already holds a list under
            ``'only_Second_core_' + group name``; the cluster is appended to it.

    Exits the program with an error message when no group matches.
    """
    matching_groups = [
        name for name, (sec, fir) in groups.items() if sec <= Second_num <= fir
    ]
    if not matching_groups:
        # Explicit no-match path; previously reached by catching UnboundLocalError.
        sys.exit("Error in calc_only_Second_only_core")

    # When several ranges overlap, the last matching group wins, as before.
    family_group = matching_groups[-1]
    cores['only_Second_core_' + family_group].append(cluster)
|
|
129
131
|
|
|
130
132
|
|
|
131
133
|
#@profile
|