PyamilySeq 1.0.0__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- PyamilySeq/Cluster_Summary.py +163 -0
- PyamilySeq/Group_Splitter.py +571 -0
- PyamilySeq/PyamilySeq.py +316 -0
- PyamilySeq/PyamilySeq_Genus.py +242 -0
- PyamilySeq/PyamilySeq_Species.py +309 -0
- PyamilySeq/Seq_Combiner.py +66 -0
- PyamilySeq/Seq_Extractor.py +64 -0
- PyamilySeq/Seq_Finder.py +56 -0
- PyamilySeq/__init__.py +0 -0
- PyamilySeq/clusterings.py +452 -0
- PyamilySeq/constants.py +2 -0
- PyamilySeq/utils.py +566 -0
- PyamilySeq-1.0.1.dist-info/METADATA +381 -0
- PyamilySeq-1.0.1.dist-info/RECORD +18 -0
- PyamilySeq-1.0.1.dist-info/entry_points.txt +7 -0
- PyamilySeq-1.0.1.dist-info/top_level.txt +1 -0
- PyamilySeq-1.0.0.dist-info/METADATA +0 -17
- PyamilySeq-1.0.0.dist-info/RECORD +0 -6
- PyamilySeq-1.0.0.dist-info/entry_points.txt +0 -2
- PyamilySeq-1.0.0.dist-info/top_level.txt +0 -1
- {PyamilySeq-1.0.0.dist-info → PyamilySeq-1.0.1.dist-info}/LICENSE +0 -0
- {PyamilySeq-1.0.0.dist-info → PyamilySeq-1.0.1.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
from collections import OrderedDict
|
|
3
|
+
from collections import defaultdict
|
|
4
|
+
|
|
5
|
+
try:
|
|
6
|
+
from .constants import *
|
|
7
|
+
from .utils import *
|
|
8
|
+
except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
|
|
9
|
+
from constants import *
|
|
10
|
+
from utils import *
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def categorise_percentage(percent):
    """Map a multicopy-genome percentage onto its reporting band.

    Bands are half-open (lower bound included, upper bound excluded), except
    the top band, which also includes exactly 100.

    Args:
        percent: Percentage of genomes carrying more than one copy.

    Returns:
        The band label (for example ``"40-60%"``), or ``None`` when the value
        is below 20 or above 100.
    """
    if not 20 <= percent <= 100:
        return None

    # Exclusive upper bounds in ascending order; the first bound that
    # exceeds ``percent`` identifies the band.
    upper_bounds = (
        (40, "20-40%"),
        (60, "40-60%"),
        (80, "60-80%"),
        (95, "80-95%"),
        (99, "95-99%"),
    )
    for upper, label in upper_bounds:
        if percent < upper:
            return label
    return "99-100%"
|
|
28
|
+
|
|
29
|
+
# Read cd-hit .clstr file and extract information
|
|
30
|
+
def read_cd_hit_output(clustering_output):
    """Parse a CD-HIT ``.clstr`` file into per-cluster member records.

    Args:
        clustering_output: Path to the ``.clstr`` file to read.

    Returns:
        An ``OrderedDict`` keyed by cluster id (the token following
        ``>Cluster``), in file order. Each value is a list of dicts with the
        keys ``header`` (sequence name prefixed with ``>``), ``length`` (int
        built from the digits of the length token) and ``percent_identity``
        (``100.0`` for lines ending in ``*``, otherwise whatever
        ``extract_identity`` returns).

    Raises:
        ValueError: If a member line has neither an ``at ...%`` identity nor
            a trailing ``*``, or if its length token contains no digits.
    """
    parsed = OrderedDict()
    active_id = None

    with open(clustering_output, 'r') as handle:
        for raw in handle:
            entry = raw.strip()

            if entry.startswith(">Cluster"):
                active_id = entry.split(' ')[1]
                parsed[active_id] = []
                continue

            # Ignore blank lines and anything preceding the first cluster header.
            if not entry or active_id is None:
                continue

            columns = entry.split('\t')
            if len(columns) < 2:
                continue
            info = columns[1]

            # The length token precedes the first comma; keep only its digits.
            length_token = info.partition(',')[0]
            seq_length = int(''.join(filter(str.isdigit, length_token)))

            # The sequence name sits between '>' and the '...' terminator.
            seq_header = '>' + info.split('>')[1].partition('...')[0]

            if 'at ' in info and '%' in info.rsplit('at ', 1)[-1]:
                identity = extract_identity(info)
            elif entry.endswith('*'):
                # '*' marks the entry as 100% identical to itself.
                identity = 100.0
            else:
                raise ValueError("Percent identity not found in the string.")

            record = {
                'header': seq_header,
                'length': seq_length,
                'percent_identity': identity
            }
            parsed[active_id].append(record)

    return parsed
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
# Summarise the information for each cluster
|
|
67
|
+
def summarise_clusters(options,clusters, output):
    """Write a per-cluster summary TSV and print multicopy-cluster counts.

    For every cluster one row is written with the member count, mean length,
    length range, mean identity and identity range. The function then prints,
    for each percentage band, how many clusters had that share of genomes
    contributing more than one sequence.

    Args:
        options: Object exposing ``genome_num``, the denominator used for the
            multicopy percentage.
        clusters: Mapping of cluster id to a list of member dicts carrying the
            keys ``header``, ``length`` and ``percent_identity``.
        output: Path of the TSV file to write (opened in ``'w'`` mode).
    """
    report_bands = ["20-40%", "40-60%", "60-80%", "80-95%", "95-99%", "99-100%"]
    band_totals = defaultdict(int)  # clusters per multicopy percentage band

    with open(output, 'w') as tsv:
        tsv.write("Cluster_ID\tNum_Sequences\tAvg_Length\tLength_Range\tAvg_Identity\tIdentity_Range\n")

        for cluster_id, members in clusters.items():
            member_total = len(members)
            has_members = member_total > 0
            all_lengths = [member['length'] for member in members]
            all_identities = [member['percent_identity'] for member in members]

            # Empty clusters report zero averages and "N/A" ranges.
            if has_members:
                mean_length = sum(all_lengths) / member_total
                length_span = f"{min(all_lengths)}-{max(all_lengths)}"
            else:
                mean_length = 0
                length_span = "N/A"

            if has_members:
                mean_identity = sum(all_identities) / member_total
                identity_span = f"{min(all_identities):.2f}-{max(all_identities):.2f}"
            else:
                mean_identity = 0
                identity_span = "N/A"

            columns = [
                f"{cluster_id}",
                f"{member_total}",
                f"{mean_length:.2f}",
                length_span,
                f"{mean_identity:.2f}",
                identity_span,
            ]
            tsv.write("\t".join(columns) + "\n")

            # The genome label is the header text before the first '|',
            # with any '>' removed.
            copies_per_genome = {}
            for member in members:
                genome = member['header'].split('|')[0].replace('>','')
                copies_per_genome[genome] = copies_per_genome.get(genome, 0) + 1

            multicopy_genomes = len(
                [copies for copies in copies_per_genome.values() if copies > 1])

            # Share of all genomes that hold more than one copy in this cluster.
            multicopy_share = (multicopy_genomes / options.genome_num) * 100
            band = categorise_percentage(multicopy_share)
            if band:
                band_totals[band] += 1

    # Report bands in ascending order.
    for band in report_bands:
        print(f"Number of clusters with multicopy genes in {band} range: {band_totals[band]}")
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
# Main function to parse arguments and run the analysis
|
|
111
|
+
def main():
    """Command-line entry point: summarise a CD-HIT ``.clstr`` file.

    Parses the command line, resolves the output location (defaulting to the
    directory of the input file and appending ``.tsv`` when missing), reads
    the clustering file with ``read_cd_hit_output`` and writes the summary via
    ``summarise_clusters``.

    Exits through ``argparse`` (status 2) when a required argument is missing
    or ``-genome_num`` is not a positive integer.
    """
    # Imported explicitly: this module otherwise only receives ``os`` through
    # its wildcard imports, which is fragile.
    import os

    parser = argparse.ArgumentParser(description='PyamilySeq ' + PyamilySeq_Version + ': Cluster-Summary - A tool to summarise CD-HIT clustering files.')
    ### Required Arguments
    required = parser.add_argument_group('Required Parameters')
    required.add_argument('-input_clstr', action="store", dest="input_clstr",
                          help='Input CD-HIT .clstr file',
                          required=True)
    required.add_argument('-output', action="store", dest="output",
                          help="Output TSV file to store cluster summaries - Will add '.tsv' if not provided by user",
                          required=True)
    required.add_argument('-genome_num', action='store', dest='genome_num', type=int,
                          help='The total number of genomes must be provided',
                          required=True)
    #required.add_argument("-clustering_format", action="store", dest="clustering_format", choices=['CD-HIT','TSV','CSV'],
    #                      help="Clustering format to use: CD-HIT or TSV (MMseqs2, BLAST, DIAMOND) / CSV edge-list file (Node1\tNode2).",
    #                      required=True)

    optional = parser.add_argument_group('Optional Arguments')
    optional.add_argument('-output_dir', action="store", dest="output_dir",
                          help='Default: Same as input file',
                          required=False)

    misc = parser.add_argument_group("Misc Parameters")
    misc.add_argument("-verbose", action="store_true", dest="verbose",
                      help="Print verbose output.",
                      required=False)
    # Tool name matches the parser description ("Cluster-Summary").
    misc.add_argument("-v", "--version", action="version",
                      version=f"PyamilySeq: Cluster-Summary version {PyamilySeq_Version} - Exiting",
                      help="Print out version number and exit")

    options = parser.parse_args()

    # genome_num is the denominator of the multicopy percentage; rejecting
    # non-positive values here avoids a ZeroDivisionError (or meaningless
    # negative percentages) after the output file has been partly written.
    if options.genome_num <= 0:
        parser.error("-genome_num must be a positive integer")

    print("Running PyamilySeq " + PyamilySeq_Version + ": Cluster-Summary ")

    ### File handling
    options.input_clstr = fix_path(options.input_clstr)
    if options.output_dir is None:
        options.output_dir = os.path.dirname(os.path.abspath(options.input_clstr))
    output_path = os.path.abspath(options.output_dir)
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_path, exist_ok=True)
    output_name = options.output
    if not output_name.endswith('.tsv'):
        output_name += '.tsv'
    output_file_path = os.path.join(output_path, output_name)
    ###

    clusters = read_cd_hit_output(options.input_clstr)
    summarise_clusters(options, clusters, output_file_path)


if __name__ == "__main__":
    main()
|