PyamilySeq 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- PyamilySeq/Cluster_Summary.py +163 -0
- PyamilySeq/Group_Extractor.py +83 -0
- PyamilySeq/Group_Sizes.py +87 -0
- PyamilySeq/Group_Splitter.py +571 -0
- PyamilySeq/PyamilySeq.py +321 -0
- PyamilySeq/PyamilySeq_Genus.py +242 -0
- PyamilySeq/PyamilySeq_Species.py +309 -0
- PyamilySeq/Seq_Combiner.py +83 -0
- PyamilySeq/Seq_Extractor.py +64 -0
- PyamilySeq/Seq_Finder.py +56 -0
- PyamilySeq/__init__.py +0 -0
- PyamilySeq/clusterings.py +452 -0
- PyamilySeq/constants.py +2 -0
- PyamilySeq/utils.py +598 -0
- pyamilyseq-1.1.0.dist-info/METADATA +342 -0
- pyamilyseq-1.1.0.dist-info/RECORD +20 -0
- {PyamilySeq-1.0.0.dist-info → pyamilyseq-1.1.0.dist-info}/WHEEL +1 -1
- pyamilyseq-1.1.0.dist-info/entry_points.txt +13 -0
- pyamilyseq-1.1.0.dist-info/top_level.txt +1 -0
- PyamilySeq-1.0.0.dist-info/METADATA +0 -17
- PyamilySeq-1.0.0.dist-info/RECORD +0 -6
- PyamilySeq-1.0.0.dist-info/entry_points.txt +0 -2
- PyamilySeq-1.0.0.dist-info/top_level.txt +0 -1
- {PyamilySeq-1.0.0.dist-info → pyamilyseq-1.1.0.dist-info}/LICENSE +0 -0
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
from collections import OrderedDict
|
|
3
|
+
from collections import defaultdict
|
|
4
|
+
|
|
5
|
+
try:
|
|
6
|
+
from .constants import *
|
|
7
|
+
from .utils import *
|
|
8
|
+
except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
|
|
9
|
+
from constants import *
|
|
10
|
+
from utils import *
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def categorise_percentage(percent):
    """Return the bucket label for a multicopy-genome percentage.

    Buckets are half-open ([low, high)) except the final one, which is
    closed at 100. Percentages below 20 or above 100 fall outside every
    bucket and yield None.
    """
    half_open_buckets = (
        (20, 40, "20-40%"),
        (40, 60, "40-60%"),
        (60, 80, "60-80%"),
        (80, 95, "80-95%"),
        (95, 99, "95-99%"),
    )
    for low, high, label in half_open_buckets:
        if low <= percent < high:
            return label
    if 99 <= percent <= 100:
        return "99-100%"
    return None
|
|
28
|
+
|
|
29
|
+
# Read cd-hit .clstr file and extract information
def read_cd_hit_output(clustering_output):
    """Parse a CD-HIT ``.clstr`` file into an ordered cluster mapping.

    Returns an OrderedDict keyed by cluster id (as a string). Each value
    is a list of member records, one dict per sequence with keys
    'header' (leading '>' restored), 'length' (int, digits extracted
    from the length field) and 'percent_identity' (float; 100.0 for the
    representative line, which cd-hit marks with a trailing '*').

    Raises ValueError when a member line carries neither an identity
    percentage nor the representative marker.
    """
    clusters = OrderedDict()
    active_id = None

    with open(clustering_output, 'r') as handle:
        for raw in handle:
            raw = raw.strip()
            if raw.startswith(">Cluster"):
                active_id = raw.split(' ')[1]
                clusters[active_id] = []
                continue
            if not raw or active_id is None:
                continue
            fields = raw.split('\t')
            if len(fields) <= 1:
                continue  # malformed member line: no tab-separated info part

            member = fields[1]
            # Length field is e.g. "100nt," / "100aa," — keep the digits only.
            seq_len = int(''.join(ch for ch in member.split(',')[0] if ch.isdigit()))
            header = '>' + member.split('>')[1].split('...')[0]

            if 'at ' in member and '%' in member.split('at ')[-1]:
                identity = extract_identity(member)
            elif raw.endswith('*'):
                identity = 100.0  # representative sequence of the cluster
            else:
                raise ValueError("Percent identity not found in the string.")

            clusters[active_id].append({
                'header': header,
                'length': seq_len,
                'percent_identity': identity,
            })

    return clusters
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
# Summarise the information for each cluster
def summarise_clusters(options, clusters, output):
    """Write a per-cluster TSV summary and print multicopy-gene tallies.

    For every cluster a row with sequence count, average/range of
    lengths and average/range of identities is written to `output`.
    Clusters are also bucketed by the percentage of genomes (out of
    ``options.genome_num``) that contribute more than one gene, and the
    bucket counts are printed in a fixed order at the end.
    """
    multicopy_groups = defaultdict(int)  # bucket label -> number of clusters

    with open(output, 'w') as out_f:
        out_f.write("Cluster_ID\tNum_Sequences\tAvg_Length\tLength_Range\tAvg_Identity\tIdentity_Range\n")

        for cluster_id, members in clusters.items():
            count = len(members)
            lengths = [m['length'] for m in members]
            identities = [m['percent_identity'] for m in members]

            if count > 0:
                mean_len = sum(lengths) / count
                len_span = f"{min(lengths)}-{max(lengths)}"
                mean_ident = sum(identities) / count
                ident_span = f"{min(identities):.2f}-{max(identities):.2f}"
            else:
                mean_len = 0
                len_span = "N/A"
                mean_ident = 0
                ident_span = "N/A"

            out_f.write(
                f"{cluster_id}\t{count}\t{mean_len:.2f}\t{len_span}\t{mean_ident:.2f}\t{ident_span}\n")

            # Count genomes that contribute more than one gene to this cluster.
            # Genome name is the first '|'-separated field of the header.
            per_genome = defaultdict(int)
            for m in members:
                per_genome[m['header'].split('|')[0].replace('>', '')] += 1
            multi_genomes = sum(1 for n in per_genome.values() if n > 1)

            bucket = categorise_percentage((multi_genomes / options.genome_num) * 100)
            if bucket:
                multicopy_groups[bucket] += 1

    # Report buckets in ascending order of percentage range.
    for bucket in ["20-40%", "40-60%", "60-80%", "80-95%", "95-99%", "99-100%"]:
        print(f"Number of clusters with multicopy genes in {bucket} range: {multicopy_groups[bucket]}")
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
# Main function to parse arguments and run the analysis
def main():
    """Command-line entry point for Cluster-Summary.

    Parses arguments, resolves the output directory (defaulting to the
    input file's directory) and file name (appending '.tsv' if missing),
    then reads the CD-HIT clustering and writes the cluster summary.
    """
    parser = argparse.ArgumentParser(description='PyamilySeq ' + PyamilySeq_Version + ': Cluster-Summary - A tool to summarise CD-HIT clustering files.')
    ### Required Arguments
    required = parser.add_argument_group('Required Parameters')
    required.add_argument('-input_clstr', action="store", dest="input_clstr",
                          help='Input CD-HIT .clstr file',
                          required=True)
    required.add_argument('-output', action="store", dest="output",
                          help="Output TSV file to store cluster summaries - Will add '.tsv' if not provided by user",
                          required=True)
    required.add_argument('-genome_num', action='store', dest='genome_num', type=int,
                          help='The total number of genomes must be provided',
                          required=True)

    optional = parser.add_argument_group('Optional Arguments')
    optional.add_argument('-output_dir', action="store", dest="output_dir",
                          help='Default: Same as input file',
                          required=False)

    misc = parser.add_argument_group("Misc Parameters")
    misc.add_argument("-verbose", action="store_true", dest="verbose",
                      help="Print verbose output.",
                      required=False)
    misc.add_argument("-v", "--version", action="version",
                      version=f"PyamilySeq: Group-Summary version {PyamilySeq_Version} - Exiting",
                      help="Print out version number and exit")

    options = parser.parse_args()
    print("Running PyamilySeq " + PyamilySeq_Version + ": Group-Summary ")

    ### File handling
    options.input_clstr = fix_path(options.input_clstr)
    if options.output_dir is None:
        # Default the output directory to wherever the input lives.
        options.output_dir = os.path.dirname(os.path.abspath(options.input_clstr))
    output_path = os.path.abspath(options.output_dir)
    if not os.path.exists(output_path):
        os.makedirs(output_path)
    output_name = options.output
    if not output_name.endswith('.tsv'):
        output_name += '.tsv'
    output_file_path = os.path.join(output_path, output_name)
    ###

    clusters = read_cd_hit_output(options.input_clstr)
    summarise_clusters(options, clusters, output_file_path)


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import os
|
|
3
|
+
import csv
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def parse_fasta(fasta_file):
    """
    Parses a FASTA file and returns a dictionary of gene IDs and sequences.

    The gene ID is the second '|'-separated field of the header's first
    whitespace-delimited token, with any 'ENSB_' prefix removed. Headers
    without a '|' will raise IndexError (strict by design).
    """
    sequences = {}
    current_id = None
    chunks = []
    with open(fasta_file, 'r') as handle:
        for raw in handle:
            raw = raw.strip()
            if not raw.startswith(">"):
                chunks.append(raw)
                continue
            if current_id:  # flush the record collected so far
                sequences[current_id] = ''.join(chunks)
            current_id = raw[1:].split()[0].split('|')[1].replace('ENSB_', '')
            chunks = []
    if current_id:  # flush the final record
        sequences[current_id] = ''.join(chunks)
    return sequences
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def parse_csv(csv_file):
    """
    Parses a CSV file to extract group IDs and gene IDs (skipping the first line).

    Group ID is column 0; gene IDs are read from column 14 onward. Each
    cell may hold several ';'-separated IDs, which are flattened (with
    empties dropped) into a single list per group.
    """
    groups = {}
    with open(csv_file, 'r') as handle:
        rows = csv.reader(handle, delimiter=',')  # comma-delimited CSV
        next(rows)  # drop the header row
        for record in rows:
            collected = []
            for cell in record[14:]:  # gene columns start at index 14
                for gene in cell.split(';'):
                    gene = gene.strip()
                    if gene:
                        collected.append(gene)
            groups[record[0]] = collected
    return groups
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def write_group_fastas(groups, sequences, output_dir):
    """
    Writes individual FASTA files for each group with the relevant sequences.

    One '<group_id>.fasta' file is created per group inside `output_dir`
    (created if absent). Gene IDs missing from `sequences` are reported
    on stdout and skipped.
    """
    os.makedirs(output_dir, exist_ok=True)

    for group_id, gene_ids in groups.items():
        target = os.path.join(output_dir, f"{group_id}.fasta")
        with open(target, 'w') as out:
            for gene_id in gene_ids:
                try:
                    out.write(f">{gene_id}\n{sequences[gene_id]}\n")
                except KeyError:
                    print(f"Warning: Gene ID {gene_id} not found in FASTA file.")
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def main():
    """Command-line entry point: parse inputs, then write grouped FASTAs."""
    cli = argparse.ArgumentParser(description="Process FASTA and CSV files to create grouped FASTA outputs.")
    cli.add_argument("-fasta", required=True, help="Input FASTA file containing gene sequences.")
    cli.add_argument("-csv", required=True, help="Input CSV file containing group and gene information.")
    cli.add_argument("-output_dir", required=True, help="Directory to save the grouped FASTA files.")
    opts = cli.parse_args()

    # Load both inputs before producing any output.
    print("Parsing FASTA file...")
    gene_sequences = parse_fasta(opts.fasta)
    print("Parsing CSV file...")
    gene_groups = parse_csv(opts.csv)

    # Emit one FASTA per group.
    print("Writing grouped FASTA files...")
    write_group_fastas(gene_groups, gene_sequences, opts.output_dir)
    print("Process completed successfully.")


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import os
|
|
3
|
+
import csv
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def parse_fasta_stats(fasta_file):
    """
    Parses a FASTA file and calculates sequence statistics.

    Returns a dict with the sequence count, min/max/average lengths, the
    absolute length spread, and that spread as a percentage of the
    minimum length (0 when the file is empty or the minimum is 0).
    """
    lengths = []
    pending = []
    with open(fasta_file, 'r') as handle:
        for raw in handle:
            raw = raw.strip()
            if raw.startswith(">"):
                if pending:  # close off the previous record
                    lengths.append(len(''.join(pending)))
                    pending = []
            else:
                pending.append(raw)
    if pending:  # close off the final record
        lengths.append(len(''.join(pending)))

    total = len(lengths)
    if total:
        shortest, longest = min(lengths), max(lengths)
        spread = longest - shortest
        return {
            "num_sequences": total,
            "min_length": shortest,
            "max_length": longest,
            "avg_length": sum(lengths) / total,
            "length_diff": spread,
            "percent_diff": (spread / shortest * 100) if shortest > 0 else 0,
        }
    return {
        "num_sequences": 0,
        "min_length": 0,
        "max_length": 0,
        "avg_length": 0,
        "length_diff": 0,
        "percent_diff": 0,
    }
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def process_fasta_directory(input_dir, output_csv):
    """
    Processes a directory of FASTA files and writes statistics to a CSV file.

    Every '*.fasta' file found directly in `input_dir` contributes one
    row (file name plus its length statistics) to `output_csv`.
    """
    columns = ["file_name", "num_sequences", "min_length", "max_length",
               "avg_length", "length_diff", "percent_diff"]

    rows = []
    for entry in os.listdir(input_dir):
        if not entry.endswith(".fasta"):
            continue
        stats = parse_fasta_stats(os.path.join(input_dir, entry))
        record = {"file_name": entry}
        for key in columns[1:]:
            record[key] = stats[key]
        rows.append(record)

    # Persist all collected rows in one pass.
    with open(output_csv, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=columns)
        writer.writeheader()
        writer.writerows(rows)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def main():
    """Command-line entry point: summarise every FASTA file in a directory."""
    cli = argparse.ArgumentParser(description="Summarize sequence statistics for a directory of FASTA files.")
    cli.add_argument("-input_dir", required=True, help="Directory containing FASTA files.")
    cli.add_argument("-output_csv", required=True, help="Output CSV file to save statistics.")
    opts = cli.parse_args()

    # Walk the directory and collect per-file statistics into the CSV.
    print("Processing FASTA files...")
    process_fasta_directory(opts.input_dir, opts.output_csv)
    print(f"Statistics saved to {opts.output_csv}")


if __name__ == "__main__":
    main()
|