PyamilySeq 1.0.0__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,163 @@
1
import argparse
import os
from collections import Counter
from collections import OrderedDict
from collections import defaultdict

try:
    from .constants import *
    from .utils import *
except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
    from constants import *
    from utils import *
11
+
12
+
13
def categorise_percentage(percent):
    """Return the multicopy band label for *percent*, or ``None`` if unbanded.

    Bands are half-open ``[lower, upper)`` except the top one, which is
    closed at 100 so that exactly 100% is still labelled.  Values below 20
    or above 100 belong to no band.

    Args:
        percent: Percentage of genomes with multicopy genes.

    Returns:
        One of ``"20-40%"``, ``"40-60%"``, ``"60-80%"``, ``"80-95%"``,
        ``"95-99%"`` or ``"99-100%"``; ``None`` when no band matches.
    """
    half_open_bands = (
        (20, 40, "20-40%"),
        (40, 60, "40-60%"),
        (60, 80, "60-80%"),
        (80, 95, "80-95%"),
        (95, 99, "95-99%"),
    )
    for lower, upper, label in half_open_bands:
        if lower <= percent < upper:
            return label

    # The top band includes its upper bound, so it is tested on its own.
    return "99-100%" if 99 <= percent <= 100 else None
28
+
29
def read_cd_hit_output(clustering_output):
    """Read a CD-HIT ``.clstr`` file and collect the members of each cluster.

    Args:
        clustering_output: Path of the ``.clstr`` file to read.

    Returns:
        OrderedDict mapping each cluster ID (the token after ``>Cluster``,
        kept as a string) to the list of its members in file order.  Each
        member is a dict with the keys ``'header'`` (sequence name prefixed
        with ``>``), ``'length'`` (int) and ``'percent_identity'`` (``100.0``
        for a line ending in ``*``, otherwise the value returned by
        ``extract_identity``).

    Raises:
        ValueError: If a member line has neither an ``at ...%`` identity nor
            a trailing ``*``, or if its length field contains no digits.
        IndexError: If a ``>Cluster`` line has no ID after the space, or a
            member line contains no ``>``.
    """
    clusters = OrderedDict()
    # Member list of the cluster currently being filled; stays None until the
    # first ">Cluster" line so that anything before it is ignored.
    members = None

    with open(clustering_output, 'r') as handle:
        for record in map(str.strip, handle):
            if record.startswith(">Cluster"):
                cluster_id = record.split(' ')[1]
                members = clusters[cluster_id] = []
                continue
            if not record or members is None:
                continue

            # Member lines are tab-separated; lines without a tab are skipped.
            fields = record.split('\t')
            if len(fields) < 2:
                continue
            info = fields[1]

            # Keep only the digits of the text before the first comma
            # (e.g. "2799aa" -> 2799).
            length = int(''.join(filter(str.isdigit, info.partition(',')[0])))
            # The name is what follows the first '>' up to the first "...".
            header = '>' + info.split('>')[1].partition('...')[0]

            _, at_marker, after_at = info.rpartition('at ')
            if at_marker and '%' in after_at:
                percent_identity = extract_identity(info)
            elif record.endswith('*'):
                percent_identity = 100.0
            else:
                raise ValueError("Percent identity not found in the string.")

            members.append({
                'header': header,
                'length': length,
                'percent_identity': percent_identity,
            })

    return clusters
64
+
65
+
66
def summarise_clusters(options, clusters, output):
    """Write a per-cluster TSV summary and print multicopy band counts.

    One row per cluster is written to *output*: cluster ID, member count,
    mean length, length range, mean identity and identity range.  For each
    cluster, the number of genomes contributing more than one member is
    expressed as a percentage of ``options.genome_num`` and tallied under
    the label returned by ``categorise_percentage``; percentages for which
    it returns ``None`` are not tallied.  The six band totals are then
    printed to stdout, lowest band first.

    Args:
        options: Object exposing ``genome_num``, the total number of genomes
            used as the denominator of the multicopy percentage.
        clusters: Mapping of cluster ID to a list of member dicts with the
            keys ``'header'``, ``'length'`` and ``'percent_identity'``.
        output: Path of the TSV file to write (opened in ``'w'`` mode).

    Raises:
        ValueError: If ``options.genome_num`` is zero or negative.  Raised
            before *output* is opened, so an existing file is left untouched.
    """
    genome_num = options.genome_num
    if genome_num <= 0:
        # Fail before opening the file: the percentage below divides by this
        # value, and a late ZeroDivisionError would leave a truncated TSV.
        raise ValueError(
            f"genome_num must be a positive integer, got {genome_num!r}")

    # Band label -> number of clusters whose multicopy percentage is in it.
    multicopy_groups = Counter()

    with open(output, 'w') as out_f:
        out_f.write("Cluster_ID\tNum_Sequences\tAvg_Length\tLength_Range\tAvg_Identity\tIdentity_Range\n")

        for cluster_id, seqs in clusters.items():
            num_seqs = len(seqs)
            if num_seqs:
                lengths = [seq['length'] for seq in seqs]
                identities = [seq['percent_identity'] for seq in seqs]
                avg_length = sum(lengths) / num_seqs
                length_range = f"{min(lengths)}-{max(lengths)}"
                avg_identity = sum(identities) / num_seqs
                identity_range = f"{min(identities):.2f}-{max(identities):.2f}"
            else:
                # A cluster without members still gets a row.
                avg_length = avg_identity = 0
                length_range = identity_range = "N/A"

            out_f.write(
                f"{cluster_id}\t{num_seqs}\t{avg_length:.2f}\t{length_range}\t{avg_identity:.2f}\t{identity_range}\n")

            # The genome is the header text before the first '|', without '>'.
            genes_per_genome = Counter(
                seq['header'].split('|')[0].replace('>', '') for seq in seqs)
            num_genomes_with_multiple_genes = sum(
                1 for count in genes_per_genome.values() if count > 1)

            multicopy_percentage = (num_genomes_with_multiple_genes / genome_num) * 100
            category = categorise_percentage(multicopy_percentage)
            if category:
                multicopy_groups[category] += 1

    # Report every band in ascending order, including bands with no clusters.
    for category in ("20-40%", "40-60%", "60-80%", "80-95%", "95-99%", "99-100%"):
        print(f"Number of clusters with multicopy genes in {category} range: {multicopy_groups[category]}")
108
+
109
+
110
def main():
    """Parse command-line arguments and summarise a CD-HIT ``.clstr`` file.

    The clusters read from ``-input_clstr`` are summarised into
    ``<output_dir>/<output>``.  ``.tsv`` is appended to ``-output`` when it
    is missing; ``-output_dir`` defaults to the directory of the input file
    and is created if it does not exist.
    """
    # NOTE(review): the description says "Cluster-Summary" while the version
    # string and the start-up banner say "Group-Summary" - confirm which name
    # is intended before changing any of these user-facing strings.
    parser = argparse.ArgumentParser(
        description='PyamilySeq ' + PyamilySeq_Version
                    + ': Cluster-Summary - A tool to summarise CD-HIT clustering files.')

    required = parser.add_argument_group('Required Parameters')
    required.add_argument('-input_clstr', action="store", dest="input_clstr",
                          help='Input CD-HIT .clstr file',
                          required=True)
    required.add_argument('-output', action="store", dest="output",
                          help="Output TSV file to store cluster summaries - Will add '.tsv' if not provided by user",
                          required=True)
    required.add_argument('-genome_num', action='store', dest='genome_num', type=int,
                          help='The total number of genomes must be provide',
                          required=True)

    optional = parser.add_argument_group('Optional Arguments')
    optional.add_argument('-output_dir', action="store", dest="output_dir",
                          help='Default: Same as input file',
                          required=False)

    misc = parser.add_argument_group("Misc Parameters")
    # NOTE(review): -verbose is accepted but nothing in this module reads
    # options.verbose - confirm whether it is still needed.
    misc.add_argument("-verbose", action="store_true", dest="verbose",
                      help="Print verbose output.",
                      required=False)
    misc.add_argument("-v", "--version", action="version",
                      version=f"PyamilySeq: Group-Summary version {PyamilySeq_Version} - Exiting",
                      help="Print out version number and exit")

    options = parser.parse_args()
    print("Running PyamilySeq " + PyamilySeq_Version + ": Group-Summary ")

    # Work out where the summary file goes.
    options.input_clstr = fix_path(options.input_clstr)
    if options.output_dir is None:
        options.output_dir = os.path.dirname(os.path.abspath(options.input_clstr))
    output_path = os.path.abspath(options.output_dir)
    # exist_ok avoids the check-then-create race of testing os.path.exists
    # first and only then calling makedirs.
    os.makedirs(output_path, exist_ok=True)
    output_name = options.output
    if not output_name.endswith('.tsv'):
        output_name += '.tsv'
    output_file_path = os.path.join(output_path, output_name)

    clusters = read_cd_hit_output(options.input_clstr)
    summarise_clusters(options, clusters, output_file_path)
160
+
161
+
162
# Run the command-line interface only when this file is executed as a script,
# so that importing the module does not call main().
if __name__ == "__main__":
    main()