PyamilySeq 1.1.0__py3-none-any.whl → 1.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- PyamilySeq/Cluster_Compare.py +108 -0
- PyamilySeq/Cluster_Summary.py +59 -64
- PyamilySeq/PyamilySeq.py +25 -20
- PyamilySeq/PyamilySeq_Genus.py +2 -1
- PyamilySeq/PyamilySeq_Species.py +11 -8
- PyamilySeq/Seq_Combiner.py +6 -6
- PyamilySeq/clusterings.py +0 -2
- PyamilySeq/config.py +0 -0
- PyamilySeq/constants.py +1 -1
- PyamilySeq/utils.py +98 -45
- {pyamilyseq-1.1.0.dist-info → pyamilyseq-1.1.2.dist-info}/METADATA +8 -7
- pyamilyseq-1.1.2.dist-info/RECORD +22 -0
- {pyamilyseq-1.1.0.dist-info → pyamilyseq-1.1.2.dist-info}/WHEEL +1 -1
- {pyamilyseq-1.1.0.dist-info → pyamilyseq-1.1.2.dist-info}/entry_points.txt +2 -0
- pyamilyseq-1.1.0.dist-info/RECORD +0 -20
- {pyamilyseq-1.1.0.dist-info → pyamilyseq-1.1.2.dist-info/licenses}/LICENSE +0 -0
- {pyamilyseq-1.1.0.dist-info → pyamilyseq-1.1.2.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
from collections import defaultdict
|
|
3
|
+
|
|
4
|
+
def read_cd_hit_output(clstr_file):
    """
    Reads a CD-HIT .clstr file and extracts sequence clusters.

    Args:
        clstr_file: Path to the .clstr file to parse.

    Returns:
        A dictionary where keys are sequence headers (the text between the
        first '>' and the final '...' of a member line) and values are
        cluster IDs. IDs are generic counters that increase by one at every
        ">Cluster" line, in file order.

    Raises:
        ValueError: If a tab-separated member line contains no '>' marker.
    """
    seq_to_cluster = {}  # Maps sequence header -> cluster ID
    cluster_id = 0  # Generic ID for clusters (since CD-HIT names don't matter)

    with open(clstr_file, 'r') as f:
        for line_number, line in enumerate(f, start=1):
            line = line.strip()
            if line.startswith(">Cluster"):
                cluster_id += 1  # A new cluster starts here
                continue

            parts = line.split('\t')
            if len(parts) < 2:
                # Blank lines and lines without a tab carry no member record.
                continue

            # Split on the FIRST '>' only, so a header that itself contains
            # '>' is kept whole.
            _, marker, remainder = parts[1].partition('>')
            if not marker:
                raise ValueError(
                    f"Malformed member line {line_number} in {clstr_file}: {line!r}"
                )

            # Split on the LAST '...' only: that is the terminator written
            # after the header, so an earlier '...' inside the header survives.
            seq_header = remainder.rsplit('...', 1)[0]
            seq_to_cluster[seq_header] = cluster_id

    return seq_to_cluster
|
|
24
|
+
|
|
25
|
+
def compare_cd_hit_clusters(file1, file2, output_file):
    """
    Compares two CD-HIT .clstr files to check if clusters are the same.
    Writes the results to a TSV file.

    Each sequence is reported at most once per change type:
      * "Only in file1" / "Only in file2" - present in one file only.
      * "Mismatch" - present in both files but with different cluster IDs.
      * "Cluster name change" - member of a cluster whose exact membership
        exists in both files under different cluster IDs.

    Args:
        file1: Path to the first .clstr file.
        file2: Path to the second .clstr file.
        output_file: Path of the TSV report to write (overwritten if present).

    Returns:
        None. Summary metrics are printed to stdout.
    """
    # Read both clustering files
    clusters1 = read_cd_hit_output(file1)
    clusters2 = read_cd_hit_output(file2)

    # Reverse mappings: cluster ID -> set of sequences
    grouped_clusters1 = defaultdict(set)
    grouped_clusters2 = defaultdict(set)
    for seq, cluster_id in clusters1.items():
        grouped_clusters1[cluster_id].add(seq)
    for seq, cluster_id in clusters2.items():
        grouped_clusters2[cluster_id].add(seq)

    # Metrics. The "only in" collections hold sequences, so that their length
    # is the number of sequences (not the number of clusters they came from).
    cluster_name_changes = 0
    sequence_shifts = 0
    only_in_file1 = []
    only_in_file2 = []

    # Rows for the TSV output
    tsv_data = []

    # Walk file 1: this pass alone sees every sequence shared by both files,
    # so mismatches are recorded here and only here (recording them again
    # while walking file 2 would double-count each one).
    for seq, cluster_id in clusters1.items():
        if seq not in clusters2:
            only_in_file1.append(seq)
            tsv_data.append([seq, cluster_id, "NA", "Only in file1"])
        elif clusters2[seq] != cluster_id:
            sequence_shifts += 1
            tsv_data.append([seq, cluster_id, clusters2[seq], "Mismatch"])

    # Walk file 2: only sequences missing from file 1 are new information.
    for seq, cluster_id in clusters2.items():
        if seq not in clusters1:
            only_in_file2.append(seq)
            tsv_data.append([seq, "NA", cluster_id, "Only in file2"])

    # Track cluster name changes (same sequences in differently numbered
    # clusters). Clusters within one file are disjoint, so membership maps to
    # a single ID and a dictionary lookup replaces a pairwise comparison.
    members_to_id2 = {frozenset(seqs): cid for cid, seqs in grouped_clusters2.items()}
    for cluster_id1, seqs1 in grouped_clusters1.items():
        cluster_id2 = members_to_id2.get(frozenset(seqs1))
        if cluster_id2 is not None and cluster_id2 != cluster_id1:
            cluster_name_changes += 1
            for seq in seqs1:
                tsv_data.append([seq, cluster_id1, cluster_id2, "Cluster name change"])

    # Print metrics
    print("🔢 Clustering Comparison Metrics:")
    print(f"Cluster name changes: {cluster_name_changes}")
    print(f"Sequence shifts (sequences assigned to different clusters): {sequence_shifts}")
    print(f"Sequences only in the first file: {len(only_in_file1)}")
    print(f"Sequences only in the second file: {len(only_in_file2)}")
    print()

    # Write the results to a TSV file
    with open(output_file, 'w') as out_file:
        out_file.write("Sequence\tCluster ID (File 1)\tCluster ID (File 2)\tChange Type\n")
        for row in tsv_data:
            out_file.write("\t".join(map(str, row)) + "\n")

    print(f"✅ Results have been written to {output_file}")
|
|
97
|
+
|
|
98
|
+
def main():
    """Parse the command line and run the .clstr comparison."""
    cli = argparse.ArgumentParser(
        description="Compare two CD-HIT .clstr files to check for clustering consistency."
    )
    # All three options are mandatory and differ only in flag and help text.
    for flag, help_text in (
        ("-file1", "First CD-HIT .clstr file"),
        ("-file2", "Second CD-HIT .clstr file"),
        ("-output", "Output file (TSV format)"),
    ):
        cli.add_argument(flag, required=True, help=help_text)
    parsed = cli.parse_args()

    compare_cd_hit_clusters(parsed.file1, parsed.file2, parsed.output)
|
|
106
|
+
|
|
107
|
+
if __name__ == "__main__":
|
|
108
|
+
main()
|
PyamilySeq/Cluster_Summary.py
CHANGED
|
@@ -1,33 +1,32 @@
|
|
|
1
1
|
import argparse
|
|
2
|
-
from collections import OrderedDict
|
|
3
|
-
from collections import defaultdict
|
|
2
|
+
from collections import OrderedDict, defaultdict
|
|
4
3
|
|
|
5
4
|
try:
|
|
6
5
|
from .constants import *
|
|
7
6
|
from .utils import *
|
|
8
|
-
except (ModuleNotFoundError, ImportError, NameError, TypeError)
|
|
7
|
+
except (ModuleNotFoundError, ImportError, NameError, TypeError):
|
|
9
8
|
from constants import *
|
|
10
9
|
from utils import *
|
|
11
10
|
|
|
12
11
|
|
|
13
12
|
def categorise_percentage(percent):
    """Categorise the percentage of genomes with multicopy genes.

    Args:
        percent: Percentage value to classify.

    Returns:
        The label of the range containing ``percent`` ("20-40%", "40-60%",
        "60-80%", "80-95%", "95-99%" or "99-100%"), or None when the value
        is below 20 or above 100.
    """
    # Lower bound inclusive, upper bound exclusive, so adjacent ranges never
    # overlap.
    half_open_ranges = (
        (20, 40, "20-40%"),
        (40, 60, "40-60%"),
        (60, 80, "60-80%"),
        (80, 95, "80-95%"),
        (95, 99, "95-99%"),
    )
    for low, high, label in half_open_ranges:
        if low <= percent < high:
            return label
    # The top range is closed at both ends: exactly 100% belongs to "99-100%".
    if 99 <= percent <= 100:
        return "99-100%"
    return None
|
|
28
26
|
|
|
29
|
-
|
|
27
|
+
|
|
30
28
|
def read_cd_hit_output(clustering_output):
|
|
29
|
+
"""Parse CD-HIT .cluster file and extract clustering information."""
|
|
31
30
|
clusters = OrderedDict()
|
|
32
31
|
|
|
33
32
|
with open(clustering_output, 'r') as f:
|
|
@@ -42,10 +41,8 @@ def read_cd_hit_output(clustering_output):
|
|
|
42
41
|
parts = line.split('\t')
|
|
43
42
|
if len(parts) > 1:
|
|
44
43
|
clustered_info = parts[1]
|
|
45
|
-
length = clustered_info.split(',')[0]
|
|
46
|
-
|
|
47
|
-
clustered_header = clustered_info.split('>')[1].split('...')[0]
|
|
48
|
-
clustered_header = '>' + clustered_header
|
|
44
|
+
length = int(''.join(c for c in clustered_info.split(',')[0] if c.isdigit()))
|
|
45
|
+
clustered_header = '>' + clustered_info.split('>')[1].split('...')[0]
|
|
49
46
|
|
|
50
47
|
if 'at ' in clustered_info and '%' in clustered_info.split('at ')[-1]:
|
|
51
48
|
percent_identity = extract_identity(clustered_info)
|
|
@@ -63,12 +60,14 @@ def read_cd_hit_output(clustering_output):
|
|
|
63
60
|
return clusters
|
|
64
61
|
|
|
65
62
|
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
multicopy_groups = defaultdict(int) # Counter for
|
|
63
|
+
def summarise_clusters(options, clusters, output):
|
|
64
|
+
"""Generate a detailed cluster summary report."""
|
|
65
|
+
multicopy_groups = defaultdict(int) # Counter for clusters with multicopy genes
|
|
69
66
|
|
|
70
67
|
with open(output, 'w') as out_f:
|
|
71
|
-
out_f.write(
|
|
68
|
+
out_f.write(
|
|
69
|
+
"Cluster_ID\tNum_Sequences\tNum_Genomes\tAvg_Length\tLength_Range\tAvg_Identity\tIdentity_Range\tGenomes_With_Multiple_Genes\tMulticopy_Percentage\n"
|
|
70
|
+
)
|
|
72
71
|
|
|
73
72
|
for cluster_id, seqs in clusters.items():
|
|
74
73
|
num_seqs = len(seqs)
|
|
@@ -81,82 +80,78 @@ def summarise_clusters(options,clusters, output):
|
|
|
81
80
|
avg_identity = sum(identities) / num_seqs if num_seqs > 0 else 0
|
|
82
81
|
identity_range = f"{min(identities):.2f}-{max(identities):.2f}" if num_seqs > 0 else "N/A"
|
|
83
82
|
|
|
84
|
-
|
|
85
|
-
f"{cluster_id}\t{num_seqs}\t{avg_length:.2f}\t{length_range}\t{avg_identity:.2f}\t{identity_range}\n")
|
|
86
|
-
|
|
87
|
-
# Count genomes with more than one gene
|
|
83
|
+
# Count genomes in cluster
|
|
88
84
|
genome_to_gene_count = defaultdict(int)
|
|
89
85
|
for seq in seqs:
|
|
90
|
-
genome = seq['header'].split('|')[0].replace('>','')
|
|
86
|
+
genome = seq['header'].split('|')[0].replace('>', '')
|
|
91
87
|
genome_to_gene_count[genome] += 1
|
|
92
88
|
|
|
89
|
+
num_genomes = len(genome_to_gene_count)
|
|
93
90
|
num_genomes_with_multiple_genes = sum(1 for count in genome_to_gene_count.values() if count > 1)
|
|
91
|
+
multicopy_percentage = (num_genomes_with_multiple_genes / options.genome_num) * 100 if options.genome_num > 0 else 0
|
|
94
92
|
|
|
95
|
-
#
|
|
96
|
-
|
|
97
|
-
multicopy_percentage = (num_genomes_with_multiple_genes / options.genome_num) * 100
|
|
93
|
+
# Categorize multicopy percentage
|
|
98
94
|
category = categorise_percentage(multicopy_percentage)
|
|
99
95
|
if category:
|
|
100
96
|
multicopy_groups[category] += 1
|
|
101
97
|
|
|
102
|
-
|
|
103
|
-
|
|
98
|
+
# Write detailed output for each cluster
|
|
99
|
+
out_f.write(
|
|
100
|
+
f"{cluster_id}\t{num_seqs}\t{num_genomes}\t{avg_length:.2f}\t{length_range}\t{avg_identity:.2f}\t{identity_range}\t"
|
|
101
|
+
f"{num_genomes_with_multiple_genes}\t{multicopy_percentage:.2f}\n"
|
|
102
|
+
)
|
|
104
103
|
|
|
105
|
-
#
|
|
104
|
+
# Define order for multicopy statistics output
|
|
105
|
+
category_order = ["20-40%", "40-60%", "60-80%", "80-95%", "95-99%", "99-100%"]
|
|
106
106
|
for category in category_order:
|
|
107
|
-
print(f"
|
|
107
|
+
print(f"Clusters with multicopy genes in {category} range: {multicopy_groups[category]}")
|
|
108
108
|
|
|
109
109
|
|
|
110
|
-
# Main function to parse arguments and run the analysis
|
|
111
110
|
def main():
    """Main function to parse arguments and process clustering files."""
    parser = argparse.ArgumentParser(
        description='PyamilySeq ' + PyamilySeq_Version + ': Cluster-Summary - A tool to summarise CD-HIT clustering files.')

    # Required Arguments
    required = parser.add_argument_group('Required Parameters')
    required.add_argument('-input_cluster', action="store", dest="input_cluster", required=True,
                          help='Input CD-HIT .cluster file')
    required.add_argument('-output', action="store", dest="output", required=True,
                          help="Output TSV file to store cluster summaries - Will add '.tsv' if not provided by user")
    required.add_argument('-genome_num', action='store', dest='genome_num', type=int, required=True,
                          help='Total number of genomes in dataset')

    # Optional Arguments
    optional = parser.add_argument_group('Optional Arguments')
    optional.add_argument('-output_dir', action="store", dest="output_dir",
                          help='Default: Same as input file', required=False)

    misc = parser.add_argument_group("Misc Parameters")
    misc.add_argument("-verbose", action="store_true", dest="verbose",
                      help="Print verbose output.", required=False)
    misc.add_argument("-v", "--version", action="version",
                      version=f"PyamilySeq: Group-Summary version {PyamilySeq_Version} - Exiting",
                      help="Print out version number and exit")

    options = parser.parse_args()
    print("Running PyamilySeq " + PyamilySeq_Version + ": Group-Summary ")

    # File handling
    options.input_cluster = fix_path(options.input_cluster)
    if options.output_dir is None:
        # Default to the directory that holds the input file.
        options.output_dir = os.path.dirname(os.path.abspath(options.input_cluster))
    output_path = os.path.abspath(options.output_dir)
    # exist_ok avoids the check-then-create race of testing for the directory
    # first and creating it afterwards.
    os.makedirs(output_path, exist_ok=True)

    output_name = options.output
    if not output_name.endswith('.tsv'):
        output_name += '.tsv'
    output_file_path = os.path.join(output_path, output_name)

    # Process clusters and generate summary
    clusters = read_cd_hit_output(options.input_cluster)
    summarise_clusters(options, clusters, output_file_path)
|
|
160
155
|
|
|
161
156
|
|
|
162
157
|
if __name__ == "__main__":
|
PyamilySeq/PyamilySeq.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import argparse
|
|
2
|
-
|
|
2
|
+
#from config import config_params
|
|
3
3
|
|
|
4
4
|
try:
|
|
5
5
|
from .PyamilySeq_Species import cluster as species_cluster
|
|
@@ -20,8 +20,8 @@ def run_cd_hit(options, input_file, clustering_output, clustering_mode):
|
|
|
20
20
|
clustering_mode,
|
|
21
21
|
'-i', input_file,
|
|
22
22
|
'-o', clustering_output,
|
|
23
|
-
'-c',
|
|
24
|
-
'-s',
|
|
23
|
+
'-c', f"{float(options.pident):.2f}",
|
|
24
|
+
'-s', f"{float(options.len_diff):.2f}",
|
|
25
25
|
'-T', str(options.threads),
|
|
26
26
|
'-M', str(options.mem),
|
|
27
27
|
'-d', "0",
|
|
@@ -62,11 +62,12 @@ def main():
|
|
|
62
62
|
help="Clustering mode: 'DNA' or 'AA'.")
|
|
63
63
|
full_parser.add_argument("-gene_ident", default="CDS", required=False,
|
|
64
64
|
help="Gene identifiers to extract sequences (e.g., 'CDS, tRNA').")
|
|
65
|
-
full_parser.add_argument("-c", type=
|
|
65
|
+
full_parser.add_argument("-c", type=str, dest="pident", default="0.90", required=False,
|
|
66
66
|
help="Sequence identity threshold for clustering (default: 0.90) - CD-HIT parameter '-c'.")
|
|
67
|
-
full_parser.add_argument("-s", type=
|
|
67
|
+
full_parser.add_argument("-s", type=str, dest="len_diff", default="0.80", required=False,
|
|
68
68
|
help="Length difference threshold for clustering (default: 0.80) - CD-HIT parameter '-s'.")
|
|
69
|
-
|
|
69
|
+
|
|
70
|
+
full_parser.add_argument("-fast_mode", action="store_true",
|
|
70
71
|
help="Enable fast mode for CD-HIT (not recommended) - CD-HIT parameter '-g'.")
|
|
71
72
|
|
|
72
73
|
|
|
@@ -94,14 +95,14 @@ def main():
|
|
|
94
95
|
subparser.add_argument("-genus_groups", default="1,2,3,4,5,6,7,8,9,10", required=False,
|
|
95
96
|
help="Gene groupings for 'Genus' mode (default: '1-10').")
|
|
96
97
|
subparser.add_argument("-write_groups", default=None, dest="write_groups", required=False,
|
|
97
|
-
help="Output gene groups as a single FASTA file (
|
|
98
|
-
subparser.add_argument("-write_individual_groups", action="store_true", dest="write_individual_groups",
|
|
98
|
+
help="Output gene groups as a single FASTA file (e.g., '99,95'). Triggers writing individual groups.")
|
|
99
|
+
subparser.add_argument("-write_individual_groups", action="store_true", dest="write_individual_groups",
|
|
99
100
|
help="Output individual FASTA files for each group.")
|
|
100
|
-
subparser.add_argument("-align", action="store_true", dest="align_core",
|
|
101
|
-
help="Align and concatenate sequences for 'core' groups
|
|
102
|
-
subparser.add_argument("-align_aa", action="store_true",
|
|
101
|
+
subparser.add_argument("-align", action="store_true", dest="align_core",
|
|
102
|
+
help="Align and concatenate sequences for 'core' groups (those in 99-100%% of genomes).")
|
|
103
|
+
subparser.add_argument("-align_aa", action="store_true",
|
|
103
104
|
help="Align sequences as amino acids.")
|
|
104
|
-
subparser.add_argument("-no_gpa", action="store_false", dest="gene_presence_absence_out",
|
|
105
|
+
subparser.add_argument("-no_gpa", action="store_false", dest="gene_presence_absence_out",
|
|
105
106
|
help="Skip creation of gene_presence_absence.csv.")
|
|
106
107
|
subparser.add_argument("-M", type=int, default=4000, dest="mem", required=False,
|
|
107
108
|
help="Memory allocation for clustering (MB) - CD-HIT parameter '-M'.")
|
|
@@ -109,13 +110,15 @@ def main():
|
|
|
109
110
|
help="Number of threads for clustering/alignment - CD-HIT parameter '-T' | MAFFT parameter '--thread'.")
|
|
110
111
|
|
|
111
112
|
# Miscellaneous Arguments
|
|
112
|
-
subparser.add_argument("-verbose", action="store_true",
|
|
113
|
+
subparser.add_argument("-verbose", action="store_true",
|
|
113
114
|
help="Print verbose output.")
|
|
114
115
|
subparser.add_argument("-v", "--version", action="version",
|
|
115
|
-
version=f"PyamilySeq {PyamilySeq_Version}: Exiting."
|
|
116
|
+
version=f"PyamilySeq {PyamilySeq_Version}: Exiting.")
|
|
116
117
|
|
|
117
118
|
# Parse Arguments
|
|
118
119
|
options = parser.parse_args()
|
|
120
|
+
## Configuration
|
|
121
|
+
|
|
119
122
|
|
|
120
123
|
if options.write_groups != None and options.write_individual_groups == False:
|
|
121
124
|
options.write_individual_groups = True
|
|
@@ -146,7 +149,7 @@ def main():
|
|
|
146
149
|
if options.align_core:
|
|
147
150
|
options.write_individual_groups = True
|
|
148
151
|
if options.write_groups == None:
|
|
149
|
-
sys.exit('Must provide "-
|
|
152
|
+
sys.exit('Must provide "-write_groups" to output gene groups before alignment "-align" can be done.')
|
|
150
153
|
elif options.run_mode == 'Partial':
|
|
151
154
|
required_partial_mode = [options.cluster_file, options.original_fasta]
|
|
152
155
|
if all(required_partial_mode):
|
|
@@ -187,13 +190,13 @@ def main():
|
|
|
187
190
|
elif options.sequence_type == 'AA':
|
|
188
191
|
clustering_mode = 'cd-hit'
|
|
189
192
|
if options.fast_mode == True:
|
|
190
|
-
options.fast_mode =
|
|
193
|
+
options.fast_mode = 1
|
|
191
194
|
if options.verbose == True:
|
|
192
195
|
print("Running CD-HIT in fast mode.")
|
|
193
196
|
else:
|
|
194
|
-
options.fast_mode =
|
|
197
|
+
options.fast_mode = 0
|
|
195
198
|
if options.verbose == True:
|
|
196
|
-
print("Running CD-HIT in
|
|
199
|
+
print("Running CD-HIT in accurate mode.")
|
|
197
200
|
else:
|
|
198
201
|
exit("cd-hit is not installed. Please install cd-hit to proceed.")
|
|
199
202
|
|
|
@@ -239,10 +242,10 @@ def main():
|
|
|
239
242
|
translate = False
|
|
240
243
|
file_to_cluster = combined_out_file
|
|
241
244
|
if options.input_type == 'separate':
|
|
242
|
-
read_separate_files(options.input_dir, options.name_split_gff, options.name_split_fasta, options.gene_ident, combined_out_file, translate)
|
|
245
|
+
read_separate_files(options.input_dir, options.name_split_gff, options.name_split_fasta, options.gene_ident, combined_out_file, translate, False)
|
|
243
246
|
run_cd_hit(options, file_to_cluster, clustering_output, clustering_mode)
|
|
244
247
|
elif options.input_type == 'combined':
|
|
245
|
-
read_combined_files(options.input_dir, options.name_split_gff, options.gene_ident, combined_out_file, translate)
|
|
248
|
+
read_combined_files(options.input_dir, options.name_split_gff, options.gene_ident, combined_out_file, translate, False)
|
|
246
249
|
run_cd_hit(options, file_to_cluster, clustering_output, clustering_mode)
|
|
247
250
|
elif options.input_type == 'fasta':
|
|
248
251
|
combined_out_file = options.input_fasta
|
|
@@ -281,6 +284,8 @@ def main():
|
|
|
281
284
|
clustering_options = clustering_options()
|
|
282
285
|
|
|
283
286
|
elif options.run_mode == 'Partial':
|
|
287
|
+
if not os.path.exists(output_path):
|
|
288
|
+
os.makedirs(output_path)
|
|
284
289
|
class clustering_options:
|
|
285
290
|
def __init__(self):
|
|
286
291
|
self.run_mode = options.run_mode
|
PyamilySeq/PyamilySeq_Genus.py
CHANGED
|
@@ -17,7 +17,8 @@ def gene_presence_absence_output(options, genus_dict, pangenome_clusters_First_s
|
|
|
17
17
|
#in_name = options.clusters.split('.')[0].split('/')[-1]
|
|
18
18
|
gpa_outfile = os.path.join(output_dir, 'gene_presence_absence.csv')
|
|
19
19
|
gpa_outfile = open(gpa_outfile, 'w')
|
|
20
|
-
|
|
20
|
+
genus_dict = OrderedDict(sorted(genus_dict.items()))
|
|
21
|
+
gpa_outfile.write('"Gene","Non-unique Gene name","Annotation","No. isolates","No. sequences","Avg sequences per isolate","Genome Fragment","Order within Fragment",'
|
|
21
22
|
'"Accessory Fragment","Accessory Order with Fragment","QC","Min group size nuc","Max group size nuc","Avg group size nuc","')
|
|
22
23
|
gpa_outfile.write('","'.join(genus_dict.keys()))
|
|
23
24
|
gpa_outfile.write('"\n')
|
PyamilySeq/PyamilySeq_Species.py
CHANGED
|
@@ -15,14 +15,15 @@ def gene_presence_absence_output(options, genome_dict, pangenome_clusters_First_
|
|
|
15
15
|
#in_name = options.clusters.split('.')[0].split('/')[-1]
|
|
16
16
|
gpa_outfile = os.path.join(output_dir, 'gene_presence_absence.csv')
|
|
17
17
|
gpa_outfile = open(gpa_outfile, 'w')
|
|
18
|
-
|
|
18
|
+
genome_dict = OrderedDict(sorted(genome_dict.items()))
|
|
19
|
+
gpa_outfile.write('"Gene","Non-unique Gene name","Annotation","No. isolates","No. sequences","Avg sequences per isolate","Genome Fragment","Order within Fragment",'
|
|
19
20
|
'"Accessory Fragment","Accessory Order with Fragment","QC","Min group size nuc","Max group size nuc","Avg group size nuc","')
|
|
20
21
|
gpa_outfile.write('","'.join(genome_dict.keys()))
|
|
21
22
|
gpa_outfile.write('"\n')
|
|
22
23
|
for cluster, sequences in pangenome_clusters_First_sequences_sorted.items():
|
|
23
24
|
average_sequences_per_genome = len(sequences) / len(pangenome_clusters_First_sorted[cluster])
|
|
24
25
|
gpa_outfile.write('"group_'+str(cluster)+'","","","'+str(len(pangenome_clusters_First_sorted[cluster]))+'","'+str(len(sequences))+'","'+str(average_sequences_per_genome)+
|
|
25
|
-
'","","","","","","","",""
|
|
26
|
+
'","","","","","","","",""')
|
|
26
27
|
|
|
27
28
|
|
|
28
29
|
for genome in genome_dict.keys():
|
|
@@ -120,12 +121,14 @@ def calc_Second_only_core(cluster, Second_num, groups, cores):
|
|
|
120
121
|
|
|
121
122
|
#@profile
|
|
122
123
|
def calc_only_Second_only_core(cluster, Second_num, groups, cores): # only count the true storf onlies
    """Record ``cluster`` under the core group whose range contains ``Second_num``.

    Args:
        cluster: Cluster identifier to record.
        Second_num: Count that is matched against the group ranges.
        groups: Mapping of group name -> (lower, upper) inclusive bounds.
        cores: Mapping that holds a list under the key
            ``'only_Second_core_' + group name`` for each group.

    Raises:
        SystemExit: If no group range contains ``Second_num``.
    """
    family_group = None
    # Scan every group and keep the LAST match, so that overlapping ranges
    # resolve the same way as a full index scan would.
    for group_name, (sec, fir) in groups.items():
        if sec <= Second_num <= fir:
            family_group = group_name

    if family_group is None:
        sys.exit("Error in calc_only_Second_only_core")

    cores['only_Second_core_' + family_group].append(cluster)
|
|
129
132
|
|
|
130
133
|
|
|
131
134
|
#@profile
|
PyamilySeq/Seq_Combiner.py
CHANGED
|
@@ -65,17 +65,17 @@ def main():
|
|
|
65
65
|
if not os.path.exists(output_path):
|
|
66
66
|
os.makedirs(output_path)
|
|
67
67
|
|
|
68
|
-
output_file = options.output_file + '.fasta'
|
|
69
|
-
if os.path.exists(os.path.join(output_path, output_file)):
|
|
70
|
-
print(f"Output file {output_file} already exists in the output directory. Please delete or rename the file and try again.")
|
|
68
|
+
#output_file = options.output_file + '.fasta'
|
|
69
|
+
if os.path.exists(os.path.join(output_path, options.output_file)):
|
|
70
|
+
print(f"Output file {options.output_file} already exists in the output directory. Please delete or rename the file and try again.")
|
|
71
71
|
exit(1)
|
|
72
72
|
|
|
73
|
-
combined_out_file = os.path.join(output_path, output_file )
|
|
73
|
+
combined_out_file = os.path.join(output_path, options.output_file )
|
|
74
74
|
|
|
75
75
|
if options.input_type == 'separate':
|
|
76
|
-
read_separate_files(options.input_dir, options.name_split_gff, options.name_split_fasta, options.gene_ident, combined_out_file, options.translate)
|
|
76
|
+
read_separate_files(options.input_dir, options.name_split_gff, options.name_split_fasta, options.gene_ident, combined_out_file, options.translate, True)
|
|
77
77
|
elif options.input_type == 'combined':
|
|
78
|
-
read_combined_files(options.input_dir, options.name_split_gff, options.gene_ident, combined_out_file, options.translate)
|
|
78
|
+
read_combined_files(options.input_dir, options.name_split_gff, options.gene_ident, combined_out_file, options.translate, True)
|
|
79
79
|
elif options.input_type == 'fasta':
|
|
80
80
|
read_fasta_files(options.input_dir, options.name_split_fasta, combined_out_file, options.translate)
|
|
81
81
|
|
PyamilySeq/clusterings.py
CHANGED
|
@@ -279,8 +279,6 @@ def combined_clustering_CDHIT(options, taxa_dict, splitter):
|
|
|
279
279
|
first = True
|
|
280
280
|
for line in Second_in:
|
|
281
281
|
if line.startswith('>'):
|
|
282
|
-
if '>Cluster 1997' in line:
|
|
283
|
-
print()
|
|
284
282
|
if first == False:
|
|
285
283
|
cluster_size = len(Combined_clusters[cluster_id])
|
|
286
284
|
Combined_reps.update({rep: cluster_size})
|
PyamilySeq/config.py
ADDED
|
File without changes
|
PyamilySeq/constants.py
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
PyamilySeq_Version = 'v1.1.
|
|
1
|
+
PyamilySeq_Version = 'v1.1.2'
|
|
2
2
|
|
PyamilySeq/utils.py
CHANGED
|
@@ -7,6 +7,7 @@ from tempfile import NamedTemporaryFile
|
|
|
7
7
|
import sys
|
|
8
8
|
import re
|
|
9
9
|
import math
|
|
10
|
+
#from config import config_params
|
|
10
11
|
|
|
11
12
|
####
|
|
12
13
|
# Placeholder for the distance function
|
|
@@ -18,6 +19,7 @@ try:
|
|
|
18
19
|
def levenshtein_distance_calc(seq1, seq2):
|
|
19
20
|
return LV.distance(seq1, seq2)
|
|
20
21
|
except (ModuleNotFoundError, ImportError):
|
|
22
|
+
#if config_params.verbose == True: - Not implemented yet
|
|
21
23
|
print("Levenshtein package not installed - Will fallback to slower Python implementation.")
|
|
22
24
|
# Fallback implementation
|
|
23
25
|
def levenshtein_distance_calc(seq1, seq2):
|
|
@@ -228,9 +230,15 @@ def run_mafft_on_sequences(options, sequences, output_file):
|
|
|
228
230
|
|
|
229
231
|
|
|
230
232
|
|
|
231
|
-
def read_separate_files(input_dir, name_split_gff, name_split_fasta, gene_ident, combined_out, translate):
|
|
232
|
-
|
|
233
|
-
|
|
233
|
+
def read_separate_files(input_dir, name_split_gff, name_split_fasta, gene_ident, combined_out, translate, run_as_combiner):
|
|
234
|
+
if run_as_combiner == True:
|
|
235
|
+
combined_out_file_aa = None
|
|
236
|
+
else:
|
|
237
|
+
combined_out_file_aa = combined_out.replace('_dna.fasta','_aa.fasta')
|
|
238
|
+
|
|
239
|
+
with open(combined_out, 'w') as combined_out_file, (open(combined_out_file_aa, 'w') if combined_out_file_aa else open(os.devnull, 'w')) as combined_out_file_aa:
|
|
240
|
+
paired_files_found = None
|
|
241
|
+
#with open(combined_out, 'w') as combined_out_file, open(combined_out.replace('_dna.fasta','_aa.fasta'), 'w') as combined_out_file_aa:
|
|
234
242
|
gff_files = glob.glob(os.path.join(input_dir, '*' + name_split_gff))
|
|
235
243
|
if not gff_files:
|
|
236
244
|
sys.exit("Error: No GFF files found.")
|
|
@@ -299,23 +307,40 @@ def read_separate_files(input_dir, name_split_gff, name_split_fasta, gene_ident,
|
|
|
299
307
|
full_sequence = fasta_dict[contig][1]
|
|
300
308
|
seq = full_sequence[corrected_start:corrected_stop]
|
|
301
309
|
|
|
302
|
-
if
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
310
|
+
if run_as_combiner == True:
|
|
311
|
+
if translate == True:
|
|
312
|
+
seq_aa = translate_frame(seq)
|
|
313
|
+
wrapped_sequence_aa = '\n'.join([seq_aa[i:i + 60] for i in range(0, len(seq_aa), 60)])
|
|
314
|
+
combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence_aa}\n")
|
|
315
|
+
else:
|
|
316
|
+
wrapped_sequence = '\n'.join([seq[i:i + 60] for i in range(0, len(seq), 60)])
|
|
317
|
+
combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")
|
|
318
|
+
else:
|
|
319
|
+
if translate == True:
|
|
320
|
+
seq_aa = translate_frame(seq)
|
|
321
|
+
wrapped_sequence_aa = '\n'.join([seq_aa[i:i + 60] for i in range(0, len(seq_aa), 60)])
|
|
322
|
+
combined_out_file_aa.write(f">{genome_name}|{seq_id}\n{wrapped_sequence_aa}\n")
|
|
323
|
+
wrapped_sequence = '\n'.join([seq[i:i + 60] for i in range(0, len(seq), 60)])
|
|
324
|
+
combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")
|
|
308
325
|
|
|
309
326
|
if not paired_files_found:
|
|
310
327
|
sys.exit("Could not find matching GFF/FASTA files - Please check input directory and -name_split_gff and -name_split_fasta parameters.")
|
|
311
328
|
if translate == False or translate == None:
|
|
312
329
|
#Clean up unused file
|
|
313
|
-
|
|
314
|
-
|
|
330
|
+
try: # Catches is combined_out_file_aa is None
|
|
331
|
+
if combined_out_file.name != combined_out_file_aa.name:
|
|
332
|
+
os.remove(combined_out_file_aa.name)
|
|
333
|
+
except AttributeError:
|
|
334
|
+
pass
|
|
315
335
|
|
|
316
336
|
|
|
317
|
-
def read_combined_files(input_dir, name_split, gene_ident, combined_out, translate):
|
|
318
|
-
|
|
337
|
+
def read_combined_files(input_dir, name_split, gene_ident, combined_out, translate, run_as_combiner):
|
|
338
|
+
if run_as_combiner == True:
|
|
339
|
+
combined_out_file_aa = None
|
|
340
|
+
else:
|
|
341
|
+
combined_out_file_aa = combined_out.replace('_dna.fasta','_aa.fasta')
|
|
342
|
+
#with open(combined_out, 'w') as combined_out_file, open(combined_out_file_aa, 'w') if combined_out_file_aa else open(os.devnull, 'w'):
|
|
343
|
+
with open(combined_out, 'w') as combined_out_file, (open(combined_out_file_aa, 'w') if combined_out_file_aa else open(os.devnull, 'w')) as combined_out_file_aa:
|
|
319
344
|
gff_files = glob.glob(os.path.join(input_dir, '*' + name_split))
|
|
320
345
|
if not gff_files:
|
|
321
346
|
sys.exit("Error: No GFF files found - check input directory and -name_split_gff parameter.")
|
|
@@ -355,7 +380,7 @@ def read_combined_files(input_dir, name_split, gene_ident, combined_out, transla
|
|
|
355
380
|
|
|
356
381
|
for contig, fasta in fasta_dict.items():
|
|
357
382
|
reverse_sequence = reverse_complement(fasta[0])
|
|
358
|
-
fasta_dict[contig][1]=reverse_sequence
|
|
383
|
+
fasta_dict[contig][1] = reverse_sequence
|
|
359
384
|
|
|
360
385
|
if fasta_dict and gff_features:
|
|
361
386
|
for contig, start, end, strand, feature, seq_id in gff_features:
|
|
@@ -369,22 +394,38 @@ def read_combined_files(input_dir, name_split, gene_ident, combined_out, transla
|
|
|
369
394
|
full_sequence = fasta_dict[contig][1]
|
|
370
395
|
seq = full_sequence[corrected_start:corrected_stop]
|
|
371
396
|
|
|
372
|
-
if
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
397
|
+
if run_as_combiner == True:
|
|
398
|
+
if translate == True:
|
|
399
|
+
seq_aa = translate_frame(seq)
|
|
400
|
+
wrapped_sequence_aa = '\n'.join([seq_aa[i:i + 60] for i in range(0, len(seq_aa), 60)])
|
|
401
|
+
combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence_aa}\n")
|
|
402
|
+
else:
|
|
403
|
+
wrapped_sequence = '\n'.join([seq[i:i + 60] for i in range(0, len(seq), 60)])
|
|
404
|
+
combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")
|
|
405
|
+
else:
|
|
406
|
+
if translate == True:
|
|
407
|
+
seq_aa = translate_frame(seq)
|
|
408
|
+
wrapped_sequence_aa = '\n'.join([seq_aa[i:i + 60] for i in range(0, len(seq_aa), 60)])
|
|
409
|
+
combined_out_file_aa.write(f">{genome_name}|{seq_id}\n{wrapped_sequence_aa}\n")
|
|
410
|
+
wrapped_sequence = '\n'.join([seq[i:i + 60] for i in range(0, len(seq), 60)])
|
|
411
|
+
combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")
|
|
378
412
|
|
|
379
413
|
if translate == False or translate == None:
|
|
380
414
|
#Clean up unused file
|
|
381
|
-
|
|
382
|
-
|
|
415
|
+
try: # Catches is combined_out_file_aa is None
|
|
416
|
+
if combined_out_file.name != combined_out_file_aa.name:
|
|
417
|
+
os.remove(combined_out_file_aa.name)
|
|
418
|
+
except AttributeError:
|
|
419
|
+
pass
|
|
383
420
|
|
|
384
421
|
|
|
385
422
|
|
|
386
|
-
def read_fasta_files(input_dir, name_split_fasta, combined_out, translate):
|
|
387
|
-
|
|
423
|
+
def read_fasta_files(input_dir, name_split_fasta, combined_out, translate, run_as_combiner):
|
|
424
|
+
if run_as_combiner == True:
|
|
425
|
+
combined_out_file_aa = None
|
|
426
|
+
else:
|
|
427
|
+
combined_out_file_aa = combined_out.replace('_dna.fasta','_aa.fasta')
|
|
428
|
+
with open(combined_out, 'w') as combined_out_file, (open(combined_out_file_aa, 'w') if combined_out_file_aa else open(os.devnull, 'w')) as combined_out_file_aa:
|
|
388
429
|
fasta_files = glob.glob(os.path.join(input_dir, '*' + name_split_fasta))
|
|
389
430
|
if not fasta_files:
|
|
390
431
|
sys.exit("Error: No GFF files found.")
|
|
@@ -400,17 +441,30 @@ def read_fasta_files(input_dir, name_split_fasta, combined_out, translate):
|
|
|
400
441
|
else:
|
|
401
442
|
fasta_dict[current_seq] +=line.strip()
|
|
402
443
|
for seq_id, seq in fasta_dict.items():
|
|
403
|
-
if
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
444
|
+
if run_as_combiner == True:
|
|
445
|
+
if translate == True:
|
|
446
|
+
seq_aa = translate_frame(seq)
|
|
447
|
+
wrapped_sequence_aa = '\n'.join([seq_aa[i:i + 60] for i in range(0, len(seq_aa), 60)])
|
|
448
|
+
combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence_aa}\n")
|
|
449
|
+
else:
|
|
450
|
+
wrapped_sequence = '\n'.join([seq[i:i + 60] for i in range(0, len(seq), 60)])
|
|
451
|
+
combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")
|
|
452
|
+
else:
|
|
453
|
+
if translate == True:
|
|
454
|
+
seq_aa = translate_frame(seq)
|
|
455
|
+
wrapped_sequence_aa = '\n'.join([seq_aa[i:i + 60] for i in range(0, len(seq_aa), 60)])
|
|
456
|
+
combined_out_file_aa.write(f">{genome_name}|{seq_id}\n{wrapped_sequence_aa}\n")
|
|
457
|
+
wrapped_sequence = '\n'.join([seq[i:i + 60] for i in range(0, len(seq), 60)])
|
|
458
|
+
combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")
|
|
409
459
|
|
|
410
460
|
if translate == False or translate == None:
|
|
411
461
|
#Clean up unused file
|
|
412
|
-
|
|
413
|
-
|
|
462
|
+
try: # Catches is combined_out_file_aa is None
|
|
463
|
+
if combined_out_file.name != combined_out_file_aa.name:
|
|
464
|
+
os.remove(combined_out_file_aa.name)
|
|
465
|
+
except AttributeError:
|
|
466
|
+
pass
|
|
467
|
+
|
|
414
468
|
|
|
415
469
|
def write_groups_func(options, output_dir, key_order, cores, sequences,
|
|
416
470
|
pangenome_clusters_First_sequences_sorted, combined_pangenome_clusters_Second_sequences):
|
|
@@ -533,7 +587,8 @@ def write_groups_func(options, output_dir, key_order, cores, sequences,
|
|
|
533
587
|
def perform_alignment(gene_path,group_directory, gene_file, options, concatenated_sequences, subgrouped):
|
|
534
588
|
# Read sequences from the gene family file
|
|
535
589
|
sequences = read_fasta(gene_path)
|
|
536
|
-
|
|
590
|
+
if len(sequences) == 1: # We can't align a single sequence
|
|
591
|
+
return concatenated_sequences
|
|
537
592
|
# Select the longest sequence for each genome
|
|
538
593
|
longest_sequences = select_longest_gene(sequences, subgrouped)
|
|
539
594
|
|
|
@@ -574,20 +629,18 @@ def process_gene_groups(options, group_directory, sub_group_directory, paralog_g
|
|
|
574
629
|
# Iterate over each gene family file
|
|
575
630
|
for gene_file in os.listdir(group_directory):
|
|
576
631
|
if gene_file.endswith(affix) and not gene_file.startswith('combined_group_sequences'):
|
|
577
|
-
#print(gene_file)
|
|
578
632
|
current_group = int(gene_file.split('_')[3].split('.')[0])
|
|
579
633
|
gene_path = os.path.join(group_directory, gene_file)
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
|
|
634
|
+
# Could add more catches here to work with First and Secondary groups - This ensures only core '99/100' are aligned
|
|
635
|
+
if 'First_core_99' in gene_file or 'First_core_100' in gene_file:
|
|
636
|
+
# Check for matching group in paralog_groups
|
|
637
|
+
if sub_group_directory and paralog_groups and '>Group_'+str(current_group) in paralog_groups:
|
|
638
|
+
for subgroup, size in enumerate(paralog_groups['>Group_' + str(current_group)]['sizes']):
|
|
639
|
+
if size >= threshold_size:
|
|
640
|
+
gene_path = os.path.join(sub_group_directory,f"Group_{current_group}_subgroup_{subgroup}{affix}")
|
|
641
|
+
concatenated_sequences = perform_alignment(gene_path, group_directory, gene_file, options, concatenated_sequences, True)
|
|
642
|
+
else:
|
|
643
|
+
concatenated_sequences = perform_alignment(gene_path, group_directory, gene_file, options, concatenated_sequences, False)
|
|
591
644
|
|
|
592
645
|
# Write the concatenated sequences to the output file
|
|
593
646
|
with open(output_file, 'w') as out:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
2
|
Name: PyamilySeq
|
|
3
|
-
Version: 1.1.0
|
|
3
|
+
Version: 1.1.2
|
|
4
4
|
Summary: PyamilySeq - A tool to investigate sequence-based gene groups identified by clustering methods such as CD-HIT, DIAMOND, BLAST or MMseqs2.
|
|
5
5
|
Home-page: https://github.com/NickJD/PyamilySeq
|
|
6
6
|
Author: Nicholas Dimonaco
|
|
@@ -13,6 +13,7 @@ Requires-Python: >=3.6
|
|
|
13
13
|
Description-Content-Type: text/markdown
|
|
14
14
|
License-File: LICENSE
|
|
15
15
|
Requires-Dist: levenshtein
|
|
16
|
+
Dynamic: license-file
|
|
16
17
|
|
|
17
18
|
# PyamilySeq
|
|
18
19
|
**PyamilySeq** is a Python tool for clustering gene sequences into groups based on sequence similarity identified by tools such as CD-HIT, BLAST, DIAMOND or MMseqs2.
|
|
@@ -45,7 +46,7 @@ To update to the newest version add '-U' to end of the pip install command.
|
|
|
45
46
|
```commandline
|
|
46
47
|
usage: PyamilySeq.py [-h] {Full,Partial} ...
|
|
47
48
|
|
|
48
|
-
PyamilySeq v1.1.
|
|
49
|
+
PyamilySeq v1.1.2: A tool for gene clustering and analysis.
|
|
49
50
|
|
|
50
51
|
positional arguments:
|
|
51
52
|
{Full,Partial} Choose a mode: 'Full' or 'Partial'.
|
|
@@ -75,7 +76,7 @@ Escherichia_coli_110957|ENSB_TIZS9kbTvShDvyX Escherichia_coli_110957|ENSB_TIZS9k
|
|
|
75
76
|
```
|
|
76
77
|
### Example output:
|
|
77
78
|
```
|
|
78
|
-
Running PyamilySeq v1.1.0
|
|
79
|
+
Running PyamilySeq v1.1.2
|
|
79
80
|
Calculating Groups
|
|
80
81
|
Number of Genomes: 10
|
|
81
82
|
Gene Groups
|
|
@@ -220,7 +221,7 @@ Seq-Combiner -input_dir .../test_data/genomes -name_split_gff .gff3 -output_dir
|
|
|
220
221
|
```
|
|
221
222
|
usage: Seq_Combiner.py [-h] -input_dir INPUT_DIR -input_type {separate,combined,fasta} [-name_split_gff NAME_SPLIT_GFF] [-name_split_fasta NAME_SPLIT_FASTA] -output_dir OUTPUT_DIR -output_name OUTPUT_FILE [-gene_ident GENE_IDENT] [-translate] [-v]
|
|
222
223
|
|
|
223
|
-
PyamilySeq v1.1.
|
|
224
|
+
PyamilySeq v1.1.2: Seq-Combiner - A tool to extract sequences from GFF/FASTA files and prepare them for PyamilySeq.
|
|
224
225
|
|
|
225
226
|
options:
|
|
226
227
|
-h, --help show this help message and exit
|
|
@@ -263,7 +264,7 @@ usage: Group_Splitter.py [-h] -input_fasta INPUT_FASTA -sequence_type {AA,DNA}
|
|
|
263
264
|
[-M CLUSTERING_MEMORY] [-no_delete_temp_files]
|
|
264
265
|
[-verbose] [-v]
|
|
265
266
|
|
|
266
|
-
PyamilySeq v1.1.
|
|
267
|
+
PyamilySeq v1.1.2: Group-Splitter - A tool to split multi-copy gene groups
|
|
267
268
|
identified by PyamilySeq.
|
|
268
269
|
|
|
269
270
|
options:
|
|
@@ -316,7 +317,7 @@ Cluster-Summary -genome_num 10 -input_clstr .../test_data/species/E-coli/E-coli_
|
|
|
316
317
|
usage: Cluster_Summary.py [-h] -input_clstr INPUT_CLSTR -output OUTPUT -genome_num GENOME_NUM
|
|
317
318
|
[-output_dir OUTPUT_DIR] [-verbose] [-v]
|
|
318
319
|
|
|
319
|
-
PyamilySeq v1.1.
|
|
320
|
+
PyamilySeq v1.1.2: Cluster-Summary - A tool to summarise CD-HIT clustering files.
|
|
320
321
|
|
|
321
322
|
options:
|
|
322
323
|
-h, --help show this help message and exit
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
PyamilySeq/Cluster_Compare.py,sha256=2jRXBYN8T9TUDLV9bj3SWFQ2pBUH3BAKW1FYrDYSQBw,4421
|
|
2
|
+
PyamilySeq/Cluster_Summary.py,sha256=efXMfGvATERCTxwaqbauhZwt_5Hrf9KpGKY3EgsHVDk,6720
|
|
3
|
+
PyamilySeq/Group_Extractor.py,sha256=oe2VmOVxdvTmAcy8NKwD1F27IdN2utAfczxsyxg96yc,2898
|
|
4
|
+
PyamilySeq/Group_Sizes.py,sha256=3snkAN19o3Y4IY6IqSim1qy415FfQe1Wb8vzWTKF0Wo,3028
|
|
5
|
+
PyamilySeq/Group_Splitter.py,sha256=OcMj9GnAyybs_DaNKRyvfL_nl2dB2gUI4BD_EQrBbWo,25653
|
|
6
|
+
PyamilySeq/PyamilySeq.py,sha256=tdmIDB2ZYCRfMFQSuWrN0Psr5ggSaoUcT2wEv54jWos,17462
|
|
7
|
+
PyamilySeq/PyamilySeq_Genus.py,sha256=KUC0QkCRpKQ9HEgxyTSD7Nc63wSXtriWyIqt_YOy5ys,12470
|
|
8
|
+
PyamilySeq/PyamilySeq_Species.py,sha256=gJy8Pn82Za44l6y9tg7bWJri2k_0OwZiplANIEH2o-c,16289
|
|
9
|
+
PyamilySeq/Seq_Combiner.py,sha256=3iJy7LNp7uBa3sU1F5bmov1ghvbcviOYqgkhbrbV1QQ,4737
|
|
10
|
+
PyamilySeq/Seq_Extractor.py,sha256=KMR0KcTJzrh99HcBN4qb76R2FuBvpYCDf4NwkmwhTPU,2870
|
|
11
|
+
PyamilySeq/Seq_Finder.py,sha256=ht-fSQ_opWKydcoWI9D3nTwLt6Rpgevnf2y0KxVjw4M,1881
|
|
12
|
+
PyamilySeq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
13
|
+
PyamilySeq/clusterings.py,sha256=9t9Q7IYb9x9gXxcv_FxsWqgdMQ-MYa-5OpkBzpgbrXc,22291
|
|
14
|
+
PyamilySeq/config.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
15
|
+
PyamilySeq/constants.py,sha256=WVns7PIMu89mNbb_lhu_Hf8fcX4AiUKiMKWAnwEHBvM,31
|
|
16
|
+
PyamilySeq/utils.py,sha256=aebXIUWIXsL3Zb47ONYqVoF1X504lJ4amewhpO1hNWE,33067
|
|
17
|
+
pyamilyseq-1.1.2.dist-info/licenses/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
|
18
|
+
pyamilyseq-1.1.2.dist-info/METADATA,sha256=YlUvYX1GX0Acoh2V28jq0aMC-reFzEwoUWre8W2eK54,17979
|
|
19
|
+
pyamilyseq-1.1.2.dist-info/WHEEL,sha256=0CuiUZ_p9E4cD6NyLD6UG80LBXYyiSYZOKDm5lp32xk,91
|
|
20
|
+
pyamilyseq-1.1.2.dist-info/entry_points.txt,sha256=mFq5TNzPI_B9vDRGEaT9pNPRGWFAgf_SE3R-dDNf1pM,662
|
|
21
|
+
pyamilyseq-1.1.2.dist-info/top_level.txt,sha256=J6JhugUQTq4rq96yibAlQu3o4KCM9WuYfqr3w1r119M,11
|
|
22
|
+
pyamilyseq-1.1.2.dist-info/RECORD,,
|
|
@@ -1,10 +1,12 @@
|
|
|
1
1
|
[console_scripts]
|
|
2
|
+
Cluster-Extractor = PyamilySeq.Cluster_Extractor:main
|
|
2
3
|
Cluster-Summary = PyamilySeq.Cluster_Summary:main
|
|
3
4
|
Group-Splitter = PyamilySeq.Group_Splitter:main
|
|
4
5
|
PyamilySeq = PyamilySeq.PyamilySeq:main
|
|
5
6
|
Seq-Combiner = PyamilySeq.Seq_Combiner:main
|
|
6
7
|
Seq-Extractor = PyamilySeq.Seq_Extractor:main
|
|
7
8
|
Seq-Finder = PyamilySeq.Seq_Finder:main
|
|
9
|
+
cluster-extractor = PyamilySeq.Cluster_Extractor:main
|
|
8
10
|
cluster-summary = PyamilySeq.Cluster_Summary:main
|
|
9
11
|
group-splitter = PyamilySeq.Group_Splitter:main
|
|
10
12
|
pyamilyseq = PyamilySeq.PyamilySeq:main
|
|
@@ -1,20 +0,0 @@
|
|
|
1
|
-
PyamilySeq/Cluster_Summary.py,sha256=xkzkC9mHRqHNm2bp0gcBWOobKTabltBrVv6ze4nikyg,6982
|
|
2
|
-
PyamilySeq/Group_Extractor.py,sha256=oe2VmOVxdvTmAcy8NKwD1F27IdN2utAfczxsyxg96yc,2898
|
|
3
|
-
PyamilySeq/Group_Sizes.py,sha256=3snkAN19o3Y4IY6IqSim1qy415FfQe1Wb8vzWTKF0Wo,3028
|
|
4
|
-
PyamilySeq/Group_Splitter.py,sha256=OcMj9GnAyybs_DaNKRyvfL_nl2dB2gUI4BD_EQrBbWo,25653
|
|
5
|
-
PyamilySeq/PyamilySeq.py,sha256=iyLU8YYn06ppCBCb-HaARm0ez_uagkj4FRtqr3pFu9Q,17396
|
|
6
|
-
PyamilySeq/PyamilySeq_Genus.py,sha256=mBZNTWWk4a_5gsHzq0cG82Qck-SL9Xgg6jpByH6oKuk,12414
|
|
7
|
-
PyamilySeq/PyamilySeq_Species.py,sha256=yyMHBQhTq1yFUKbiba6vL9nYJ4rlKsmS30hQNimGEB8,16120
|
|
8
|
-
PyamilySeq/Seq_Combiner.py,sha256=wusCDrGwcVqQw3h5iPSY0_E5sauRFNHj0bXMBtELpl0,4700
|
|
9
|
-
PyamilySeq/Seq_Extractor.py,sha256=KMR0KcTJzrh99HcBN4qb76R2FuBvpYCDf4NwkmwhTPU,2870
|
|
10
|
-
PyamilySeq/Seq_Finder.py,sha256=ht-fSQ_opWKydcoWI9D3nTwLt6Rpgevnf2y0KxVjw4M,1881
|
|
11
|
-
PyamilySeq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
12
|
-
PyamilySeq/clusterings.py,sha256=9fXnFZSypeZKtJK8OYONQKVpvLoeJCSxx2BRo5Vir38,22355
|
|
13
|
-
PyamilySeq/constants.py,sha256=7zjA69Blx4ycagDKCvaKdVuXidVVwu7adnmgm53_CTI,31
|
|
14
|
-
PyamilySeq/utils.py,sha256=EwZQP5eFaXiXVGw0Vckhu9tJkm8SOKAgGh7-JnIu_j4,29355
|
|
15
|
-
pyamilyseq-1.1.0.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
|
16
|
-
pyamilyseq-1.1.0.dist-info/METADATA,sha256=f8hZWEdD06WYoflfPls02q_6FVFYL18np1BGzWXwlGA,17957
|
|
17
|
-
pyamilyseq-1.1.0.dist-info/WHEEL,sha256=nn6H5-ilmfVryoAQl3ZQ2l8SH5imPWFpm1A5FgEuFV4,91
|
|
18
|
-
pyamilyseq-1.1.0.dist-info/entry_points.txt,sha256=Qao6g8F37k35MQFkUpGt9xoozRBaTkIKUptXWAUs5-E,554
|
|
19
|
-
pyamilyseq-1.1.0.dist-info/top_level.txt,sha256=J6JhugUQTq4rq96yibAlQu3o4KCM9WuYfqr3w1r119M,11
|
|
20
|
-
pyamilyseq-1.1.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|