PyamilySeq 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,163 @@
1
+ import argparse
2
+ from collections import OrderedDict
3
+ from collections import defaultdict
4
+
5
+ try:
6
+ from .constants import *
7
+ from .utils import *
8
+ except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
9
+ from constants import *
10
+ from utils import *
11
+
12
+
13
def categorise_percentage(percent):
    """Return the label of the percentage band that *percent* falls into.

    Bands span 20% up to 100% inclusive; values below 20% (or above 100%)
    belong to no band and yield ``None``.
    """
    half_open_bands = (
        (20, 40, "20-40%"),
        (40, 60, "40-60%"),
        (60, 80, "60-80%"),
        (80, 95, "80-95%"),
        (95, 99, "95-99%"),
    )
    for low, high, label in half_open_bands:
        if low <= percent < high:
            return label
    # The top band is closed on both ends so that exactly 100% is included.
    if 99 <= percent <= 100:
        return "99-100%"
    return None
28
+
29
# Parse a CD-HIT .clstr file into an ordered mapping of cluster id -> members.
def read_cd_hit_output(clustering_output):
    """Read a CD-HIT ``.clstr`` file and return an OrderedDict keyed by
    cluster id; each value is a list of dicts with the member's 'header',
    'length' and 'percent_identity'.

    Raises ValueError when a member line carries neither an "at ...%"
    identity nor the representative marker '*'.
    """
    clusters = OrderedDict()
    active_id = None

    with open(clustering_output, 'r') as handle:
        for raw in handle:
            raw = raw.strip()
            if raw.startswith(">Cluster"):
                # A ">Cluster N" line opens a new (possibly empty) cluster.
                active_id = raw.split(' ')[1]
                clusters[active_id] = []
                continue
            if not raw or active_id is None:
                continue
            fields = raw.split('\t')
            if len(fields) <= 1:
                continue
            member = fields[1]
            # Leading token looks like "123aa," / "456nt," - keep digits only.
            size_token = member.split(',')[0]
            seq_length = int(''.join(ch for ch in size_token if ch.isdigit()))
            header = '>' + member.split('>')[1].split('...')[0]

            if 'at ' in member and '%' in member.split('at ')[-1]:
                identity = extract_identity(member)
            elif raw.endswith('*'):
                # Representative sequences carry no identity; by definition 100%.
                identity = 100.0
            else:
                raise ValueError("Percent identity not found in the string.")

            clusters[active_id].append({
                'header': header,
                'length': seq_length,
                'percent_identity': identity,
            })

    return clusters
64
+
65
+
66
# Summarise the information for each cluster
def summarise_clusters(options,clusters, output):
    """Write a per-cluster summary TSV to *output* and print, per
    percentage band, how many clusters contain genomes with more than
    one gene.

    ``options.genome_num`` is taken as the total number of genomes when
    converting multicopy counts into percentages.
    """
    multicopy_groups = defaultdict(int)  # clusters counted per percentage band

    with open(output, 'w') as out_f:
        out_f.write("Cluster_ID\tNum_Sequences\tAvg_Length\tLength_Range\tAvg_Identity\tIdentity_Range\n")

        for cluster_id, members in clusters.items():
            total = len(members)
            lengths = [member['length'] for member in members]
            identities = [member['percent_identity'] for member in members]

            if total > 0:
                avg_length = sum(lengths) / total
                length_range = f"{min(lengths)}-{max(lengths)}"
                avg_identity = sum(identities) / total
                identity_range = f"{min(identities):.2f}-{max(identities):.2f}"
            else:
                avg_length = 0
                avg_identity = 0
                length_range = "N/A"
                identity_range = "N/A"

            out_f.write(
                f"{cluster_id}\t{total}\t{avg_length:.2f}\t{length_range}\t{avg_identity:.2f}\t{identity_range}\n")

            # Tally how many genes each genome contributes to this cluster;
            # the genome id is the part of the header before the first '|'.
            genes_per_genome = defaultdict(int)
            for member in members:
                genome = member['header'].split('|')[0].replace('>', '')
                genes_per_genome[genome] += 1

            multicopy_genomes = sum(1 for count in genes_per_genome.values() if count > 1)

            # Percentage of all genomes holding multicopy genes in this cluster;
            # categorise_percentage returns None below 20%, which is skipped.
            multicopy_percentage = (multicopy_genomes / options.genome_num) * 100
            band = categorise_percentage(multicopy_percentage)
            if band:
                multicopy_groups[band] += 1

    # Report the bands in ascending order.
    for band in ["20-40%", "40-60%", "60-80%", "80-95%", "95-99%", "99-100%"]:
        print(f"Number of clusters with multicopy genes in {band} range: {multicopy_groups[band]}")
108
+
109
+
110
# Main function to parse arguments and run the analysis
def main():
    """Parse command-line arguments, then read and summarise a CD-HIT
    .clstr file into a TSV report.

    Side effects: may create the output directory, writes the summary TSV,
    and prints multicopy statistics to stdout.
    """
    parser = argparse.ArgumentParser(description='PyamilySeq ' + PyamilySeq_Version + ': Cluster-Summary - A tool to summarise CD-HIT clustering files.')
    ### Required Arguments
    required = parser.add_argument_group('Required Parameters')
    required.add_argument('-input_clstr', action="store", dest="input_clstr",
                          help='Input CD-HIT .clstr file',
                          required=True)
    required.add_argument('-output', action="store", dest="output",
                          help="Output TSV file to store cluster summaries - Will add '.tsv' if not provided by user",
                          required=True)
    required.add_argument('-genome_num', action='store', dest='genome_num', type=int,
                          # fixed typo: "must be provide" -> "must be provided"
                          help='The total number of genomes must be provided',
                          required=True)

    optional = parser.add_argument_group('Optional Arguments')
    optional.add_argument('-output_dir', action="store", dest="output_dir",
                          help='Default: Same as input file',
                          required=False)

    misc = parser.add_argument_group("Misc Parameters")
    misc.add_argument("-verbose", action="store_true", dest="verbose",
                      help="Print verbose output.",
                      required=False)
    misc.add_argument("-v", "--version", action="version",
                      version=f"PyamilySeq: Group-Summary version {PyamilySeq_Version} - Exiting",
                      help="Print out version number and exit")

    options = parser.parse_args()
    # NOTE(review): the description calls this tool "Cluster-Summary" but the
    # version string and banner say "Group-Summary" - confirm intended name.
    print("Running PyamilySeq " + PyamilySeq_Version + ": Group-Summary ")

    ### File handling
    # fix_path and os are assumed to come from the star imports at the top of
    # the module (utils/constants) - TODO confirm.
    options.input_clstr = fix_path(options.input_clstr)
    if options.output_dir is None:
        # Default the output directory to the input file's directory.
        options.output_dir = os.path.dirname(os.path.abspath(options.input_clstr))
    output_path = os.path.abspath(options.output_dir)
    if not os.path.exists(output_path):
        os.makedirs(output_path)
    output_name = options.output
    if not output_name.endswith('.tsv'):
        output_name += '.tsv'
    output_file_path = os.path.join(output_path, output_name)
    ###

    clusters = read_cd_hit_output(options.input_clstr)
    summarise_clusters(options, clusters, output_file_path)


if __name__ == "__main__":
    main()
@@ -0,0 +1,83 @@
1
+ import argparse
2
+ import os
3
+ import csv
4
+
5
+
6
def parse_fasta(fasta_file):
    """
    Parse a FASTA file into a dict mapping gene IDs to sequences.

    The gene ID is the second '|'-separated field of the header's first
    whitespace-delimited token, with any 'ENSB_' prefix removed
    (presumably an Ensembl-style naming scheme - TODO confirm).
    """
    sequences = {}
    with open(fasta_file, 'r') as handle:
        current_id = None
        chunks = []
        for raw in handle:
            raw = raw.strip()
            if not raw.startswith(">"):
                chunks.append(raw)
                continue
            # New header: flush the record collected so far.
            if current_id:
                sequences[current_id] = ''.join(chunks)
            current_id = raw[1:].split()[0].split('|')[1].replace('ENSB_', '')
            chunks = []
        if current_id:  # flush the final record
            sequences[current_id] = ''.join(chunks)
    return sequences
26
+
27
+
28
def parse_csv(csv_file):
    """
    Parse a comma-delimited CSV file into {group_id: [gene_id, ...]},
    skipping the header line.

    Gene IDs are read from index 14 onward (presumably the per-genome
    columns of a PyamilySeq summary - TODO confirm); a single cell may
    hold several ';'-separated IDs, which are flattened and stripped.
    """
    groups = {}
    with open(csv_file, 'r') as handle:
        reader = csv.reader(handle, delimiter=',')
        next(reader)  # skip the header row
        for record in reader:
            group_id = record[0]
            gene_ids = []
            for cell in record[14:]:
                for gene in cell.split(';'):
                    gene = gene.strip()
                    if gene:
                        gene_ids.append(gene)
            groups[group_id] = gene_ids
    return groups
43
+
44
+
45
def write_group_fastas(groups, sequences, output_dir):
    """
    Write one FASTA file per group into *output_dir* (created if missing).

    Gene IDs with no entry in *sequences* are reported on stdout and
    skipped rather than raising.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    for group_id, gene_ids in groups.items():
        target = os.path.join(output_dir, f"{group_id}.fasta")
        with open(target, 'w') as handle:
            for gene_id in gene_ids:
                seq = sequences.get(gene_id)
                if seq is None:
                    print(f"Warning: Gene ID {gene_id} not found in FASTA file.")
                else:
                    handle.write(f">{gene_id}\n{seq}\n")
60
+
61
+
62
def main():
    """Command-line entry point: parse a FASTA file and a group CSV, then
    write one FASTA file per group via write_group_fastas."""
    parser = argparse.ArgumentParser(description="Process FASTA and CSV files to create grouped FASTA outputs.")
    parser.add_argument("-fasta", required=True, help="Input FASTA file containing gene sequences.")
    parser.add_argument("-csv", required=True, help="Input CSV file containing group and gene information.")
    parser.add_argument("-output_dir", required=True, help="Directory to save the grouped FASTA files.")

    args = parser.parse_args()

    # Parse the input files
    print("Parsing FASTA file...")
    sequences = parse_fasta(args.fasta)
    print("Parsing CSV file...")
    groups = parse_csv(args.csv)

    # Write the grouped FASTA files (output_dir is created if missing)
    print("Writing grouped FASTA files...")
    write_group_fastas(groups, sequences, args.output_dir)
    print("Process completed successfully.")


if __name__ == "__main__":
    main()
@@ -0,0 +1,87 @@
1
+ import argparse
2
+ import os
3
+ import csv
4
+
5
+
6
def parse_fasta_stats(fasta_file):
    """
    Scan a FASTA file and return summary statistics of its sequence lengths.

    Returns a dict with num_sequences, min/max/avg length, the absolute
    length difference, and that difference as a percentage of the minimum
    length. All values are 0 for an empty file.
    """
    lengths = []
    chunks = []
    with open(fasta_file, 'r') as handle:
        for raw in handle:
            raw = raw.strip()
            if raw.startswith(">"):
                if chunks:  # close the previous record
                    lengths.append(len(''.join(chunks)))
                    chunks = []
            else:
                chunks.append(raw)
    if chunks:  # close the final record
        lengths.append(len(''.join(chunks)))

    num_sequences = len(lengths)
    if num_sequences:
        min_length, max_length = min(lengths), max(lengths)
        avg_length = sum(lengths) / num_sequences
        length_diff = max_length - min_length
        # Guard against zero-length sequences when computing relative spread.
        percent_diff = (length_diff / min_length * 100) if min_length > 0 else 0
    else:
        avg_length = min_length = max_length = length_diff = percent_diff = 0

    return {
        "num_sequences": num_sequences,
        "min_length": min_length,
        "max_length": max_length,
        "avg_length": avg_length,
        "length_diff": length_diff,
        "percent_diff": percent_diff,
    }
43
+
44
+
45
def process_fasta_directory(input_dir, output_csv):
    """
    Compute per-file sequence statistics for every '.fasta' file in
    *input_dir* and write them as rows of *output_csv*.
    """
    columns = ["file_name", "num_sequences", "min_length", "max_length",
               "avg_length", "length_diff", "percent_diff"]

    rows = []
    for entry in os.listdir(input_dir):
        if not entry.endswith(".fasta"):
            continue
        stats = parse_fasta_stats(os.path.join(input_dir, entry))
        row = {"file_name": entry}
        # Copy the stat fields in the column order used for the CSV.
        row.update({key: stats[key] for key in columns[1:]})
        rows.append(row)

    with open(output_csv, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=columns)
        writer.writeheader()
        writer.writerows(rows)
71
+
72
+
73
def main():
    """Command-line entry point: summarise sequence statistics for every
    FASTA file in a directory into a CSV report."""
    parser = argparse.ArgumentParser(description="Summarize sequence statistics for a directory of FASTA files.")
    parser.add_argument("-input_dir", required=True, help="Directory containing FASTA files.")
    parser.add_argument("-output_csv", required=True, help="Output CSV file to save statistics.")

    args = parser.parse_args()

    # Process the directory of FASTA files (only '*.fasta' entries are read)
    print("Processing FASTA files...")
    process_fasta_directory(args.input_dir, args.output_csv)
    print(f"Statistics saved to {args.output_csv}")


if __name__ == "__main__":
    main()