plasmidhub 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of plasmidhub might be problematic.
- plasmidhub/__init__.py +1 -0
- plasmidhub/abricate.py +46 -0
- plasmidhub/ani.py +29 -0
- plasmidhub/cluster_color.py +48 -0
- plasmidhub/clustering.py +143 -0
- plasmidhub/filtering.py +54 -0
- plasmidhub/main.py +381 -0
- plasmidhub/network_builder.py +202 -0
- plasmidhub/node_stats.py +69 -0
- plasmidhub/plot.py +169 -0
- plasmidhub/plot_only.py +153 -0
- plasmidhub/preprocessing.py +48 -0
- plasmidhub-1.0.0.dist-info/LICENSE +24 -0
- plasmidhub-1.0.0.dist-info/METADATA +193 -0
- plasmidhub-1.0.0.dist-info/RECORD +18 -0
- plasmidhub-1.0.0.dist-info/WHEEL +5 -0
- plasmidhub-1.0.0.dist-info/entry_points.txt +2 -0
- plasmidhub-1.0.0.dist-info/top_level.txt +1 -0
plasmidhub/__init__.py
ADDED
@@ -0,0 +1 @@
+# plasmidhub package
plasmidhub/abricate.py
ADDED
@@ -0,0 +1,46 @@
+import os
+import subprocess
+import shutil
+import glob
+import logging
+logger = logging.getLogger(__name__)
+
+def run_abricate_bulk(input_dir, results_dir, db_list, threads=None):
+    os.makedirs(results_dir, exist_ok=True)
+
+    # Use default thread count if not provided
+    if threads is None:
+        threads = 4
+
+    # Move into input_dir because wildcard expansion happens here
+    original_dir = os.getcwd()
+    os.chdir(input_dir)
+
+    # Collect all fasta-like files
+    fasta_files = sorted(
+        glob.glob("*.fna") +
+        glob.glob("*.fa") +
+        glob.glob("*.fasta")
+    )
+
+    if not fasta_files:
+        raise RuntimeError(f"No input files found in {input_dir} with .fna/.fa/.fasta extensions.")
+
+    for db in db_list:
+        logger.info(f"Running abricate on database: {db}")
+
+        # Build the shell command with all fasta file names
+        cmd = f"abricate {' '.join(fasta_files)} --db {db} -t {threads}"
+
+        # Output file path (temporary inside input_dir)
+        temp_output = f"{db}.abr"
+        with open(temp_output, "w") as out_f:
+            subprocess.run(cmd, shell=True, stdout=out_f, stderr=subprocess.DEVNULL)
+
+        # Move the output to results_dir
+        final_output_path = os.path.join(results_dir, f"{db}.abr")
+        shutil.move(temp_output, final_output_path)
+        logger.info(f"Saved: {final_output_path}")
+
+    # Return to original directory
+    os.chdir(original_dir)
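run_abricate_bulk builds one shell command per database, redirects ABRicate's stdout into a temporary <db>.abr file inside input_dir, then moves each report into results_dir. A minimal invocation sketch follows; the directory names, database names and thread count are illustrative assumptions rather than values shipped with the package, and results_dir is passed as an absolute path because the function changes the working directory before moving its outputs.

# Hypothetical usage of run_abricate_bulk (paths and databases are examples only).
import logging
import os
from plasmidhub.abricate import run_abricate_bulk

logging.basicConfig(level=logging.INFO)

run_abricate_bulk(
    input_dir="plasmids",                         # directory holding *.fna/*.fa/*.fasta files
    results_dir=os.path.abspath("abricate_out"),  # absolute path: outputs are moved here after os.chdir(input_dir)
    db_list=["resfinder", "plasmidfinder"],       # example ABRicate databases
    threads=8,
)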
plasmidhub/ani.py
ADDED
@@ -0,0 +1,29 @@
+import os
+import subprocess
+import logging
+logger = logging.getLogger(__name__)
+
+def run_fastani(plasmid_list_file, fragLen=1000, minFrag=3, kmer=14, output_dir=".", threads=None):
+    if threads is None:
+        threads = 4
+
+    output_file = os.path.join(output_dir, "fastani_raw_results.tsv")
+    cmd = [
+        "fastANI",
+        "--ql", plasmid_list_file,
+        "--rl", plasmid_list_file,
+        "-o", output_file,
+        "--fragLen", str(fragLen),
+        "--minFraction", str(minFrag),
+        "--kmer", str(kmer),
+        "-t", str(threads)
+    ]
+    logger.info("Running FastANI with command:")
+    logger.info(" ".join(cmd))
+    result = subprocess.run(cmd, capture_output=True, text=True)
+    if result.returncode != 0:
+        logger.error("FastANI failed with error:")
+        logger.error(result.stderr)
+        exit(1)
+    else:
+        logger.info("FastANI completed successfully.")
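run_fastani wraps an all-vs-all FastANI run: the same list file is passed as both --ql and --rl, and the raw table is written to fastani_raw_results.tsv in output_dir. A usage sketch under the assumption of a hypothetical plasmid_list.txt listing one FASTA path per line:

# Hypothetical call to run_fastani; file and directory names are illustrative.
import logging
from plasmidhub.ani import run_fastani

logging.basicConfig(level=logging.INFO)

# All-vs-all comparison of every FASTA listed in plasmid_list.txt;
# writes results/fastani_raw_results.tsv (exits the process if FastANI fails).
run_fastani(
    plasmid_list_file="plasmid_list.txt",
    fragLen=1000,
    minFrag=3,
    kmer=14,
    output_dir="results",
    threads=8,
)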
plasmidhub/cluster_color.py
ADDED
@@ -0,0 +1,48 @@
+import os
+import matplotlib.pyplot as plt
+import random
+import logging
+
+logger = logging.getLogger(__name__)
+
+def assign_cluster_colors(results_dir, mapping_file):
+    cluster_list_path = os.path.join(results_dir, "cluster_list.txt")
+    color_file = os.path.join(results_dir, "cluster_colours.txt")
+
+    clusters = []
+    with open(cluster_list_path) as f:
+        next(f)  # Skip header
+        for line in f:
+            if line.strip():
+                cluster_file, _ = line.strip().split('\t')
+                cluster = cluster_file.replace('.txt', '')
+                clusters.append(cluster)
+
+    n_clusters = len(clusters)
+
+    # Start with base colors from tab20
+    cmap = plt.get_cmap('tab20')
+    base_colors = [
+        '#{:02x}{:02x}{:02x}'.format(int(r * 255), int(g * 255), int(b * 255))
+        for r, g, b in cmap.colors
+    ]
+
+    used_colors = set(base_colors[:min(n_clusters, len(base_colors))])
+    full_color_list = base_colors[:min(n_clusters, len(base_colors))]
+
+    # Generate additional distinct random colors if needed
+    while len(full_color_list) < n_clusters:
+        while True:
+            color = "#{:06x}".format(random.randint(0, 0xFFFFFF))
+            if color not in used_colors:
+                used_colors.add(color)
+                full_color_list.append(color)
+                break
+
+    color_map = dict(zip(clusters, full_color_list))
+
+    with open(color_file, 'w') as out:
+        for cluster, color in color_map.items():
+            out.write(f"{cluster}\t{color}\n")
+
+    logger.info(f"Cluster colors saved to: {color_file}")
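assign_cluster_colors reads cluster_list.txt from results_dir, gives each cluster a hex colour (tab20 colours first, then random non-repeating ones), and writes cluster_colours.txt. A small usage sketch; the directory name is an assumption, and mapping_file is passed as None here only because the function body above never uses it.

# Hypothetical usage of assign_cluster_colors (directory name is an example).
import logging
from plasmidhub.cluster_color import assign_cluster_colors

logging.basicConfig(level=logging.INFO)

# Expects results/cluster_list.txt with a header line followed by
# "cluster_XX.txt<TAB>count" rows; writes results/cluster_colours.txt
# with one "cluster_XX<TAB>#rrggbb" line per cluster.
assign_cluster_colors("results", None)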
plasmidhub/clustering.py
ADDED
@@ -0,0 +1,143 @@
+import os
+import pandas as pd
+from collections import defaultdict
+import argparse
+import logging
+logger = logging.getLogger(__name__)
+
+def find_valid_subclusters(results_dir):
+    valid_subclusters = []
+
+    for filename in sorted(os.listdir(results_dir)):
+        if filename.startswith("subcluster_") and filename.endswith("_plasmids.txt"):
+            filepath = os.path.join(results_dir, filename)
+            with open(filepath, 'r') as f:
+                plasmid_count = sum(1 for _ in f)
+
+            if plasmid_count >= 3:  # Hardcoded rule
+                valid_subclusters.append((filename, plasmid_count))
+
+    valid_subclusters.sort(key=lambda x: x[1], reverse=True)
+    return valid_subclusters
+
+def write_subcluster_list(valid_subclusters, output_path):
+    with open(output_path, "w") as f:
+        f.write("Subcluster\tPlasmids\n")
+        for subcluster, count in valid_subclusters:
+            f.write(f"{subcluster}\t{count}\n")
+
+def extract_clusters(valid_subclusters, results_dir, fastani_path, output_dir):
+    fastani_df = pd.read_csv(fastani_path, sep="\t")
+
+    for subcluster_file, _ in valid_subclusters:
+        full_path = os.path.join(results_dir, subcluster_file)
+        try:
+            with open(full_path, "r") as f:
+                original_plasmids = set(line.strip() for line in f)
+        except FileNotFoundError:
+            logger.warning(f"File {subcluster_file} not found. Skipping.")
+            continue
+
+        subcluster_plasmids = original_plasmids.copy()
+
+        connections = defaultdict(set)
+        for _, row in fastani_df.iterrows():
+            q, r = row["Query"], row["Reference"]
+            if q in subcluster_plasmids and r in subcluster_plasmids:
+                connections[q].add(r)
+                connections[r].add(q)
+
+        # Iteratively remove nodes with the fewest connections until we get a complete subgraph
+        while True:
+            current_nodes = set(connections.keys())
+            if len(current_nodes) < 3:
+                subcluster_plasmids = set()
+                break
+
+            # Check if the current graph is a complete subgraph (clique)
+            complete = all(len(connections[node]) == len(current_nodes) - 1 for node in current_nodes)
+            if complete:
+                subcluster_plasmids = current_nodes
+                break
+
+            # Find the node with the fewest connections (lowest degree)
+            min_node = min(current_nodes, key=lambda x: len(connections[x]))
+
+            # Remove that node from the graph
+            del connections[min_node]
+            for conn in connections.values():
+                conn.discard(min_node)
+
+        # Step 7: Save the refined subcluster to a new file with the desired naming format
+        cluster_number = subcluster_file.split("_")[1]  # Extract the number from subcluster_XX_plasmids.txt
+        output_file = f"cluster_{cluster_number}.txt"
+
+        cluster_path = os.path.join(output_dir, output_file)
+        with open(cluster_path, "w") as f:
+
+            for plasmid in subcluster_plasmids:
+                f.write(plasmid + "\n")
+
+
+def filter_clusters_by_size(output_dir, min_cluster_size):
+    for filename in os.listdir(output_dir):
+        if filename.startswith("cluster_") and filename.endswith(".txt"):
+            path = os.path.join(output_dir, filename)
+            with open(path, "r") as f:
+                lines = f.readlines()
+            if len(lines) < min_cluster_size:
+                os.remove(path)
+
+def write_cluster_list(output_dir, output_path):
+    cluster_files = []
+    for filename in os.listdir(output_dir):
+        if (
+            filename.startswith("cluster_")
+            and filename.endswith(".txt")
+            and filename not in {os.path.basename(output_path), "cluster_colours.txt"}  # exclude output file itself and cluster_colours.txt
+        ):
+            path = os.path.join(output_dir, filename)
+            with open(path, "r") as f:
+                count = sum(1 for _ in f)
+            cluster_files.append((filename, count))
+    cluster_files.sort(key=lambda x: x[1], reverse=True)
+    with open(output_path, "w") as f:
+        f.write("Cluster\tPlasmids\n")
+        for filename, count in cluster_files:
+            f.write(f"{filename}\t{count}\n")
+
+def main(results_dir, min_cluster_size):
+    fastani_path = os.path.join(results_dir, "ANI_results_final.tsv")
+    subcluster_list_output = os.path.join(results_dir, "subcluster_list.txt")
+    cluster_list_output = os.path.join(results_dir, "cluster_list.txt")
+
+    logger.info("Finding valid subclusters (>=3 plasmids)...")
+    valid_subclusters = find_valid_subclusters(results_dir)
+
+    write_subcluster_list(valid_subclusters, subcluster_list_output)
+
+    logger.info("Identifying clusters...")
+    extract_clusters(valid_subclusters, results_dir, fastani_path, results_dir)
+
+    logging.info(f"Keep only clusters with >={min_cluster_size} plasmids...")
+    filter_clusters_by_size(results_dir, min_cluster_size)
+
+    write_cluster_list(results_dir, cluster_list_output)
+
+    # Check: warn user if cluster_list.txt is empty
+    if os.path.exists(cluster_list_output):
+        with open(cluster_list_output, "r") as f:
+            lines = f.readlines()
+        if len(lines) <= 1:
+            logger.warning("No clusters detected with the given parameters!")
+
+
+    # logger.info("Done!")
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Clustering Tool")
+    parser.add_argument("results_dir", help="Path to results directory created by main.py")
+    parser.add_argument("--min_cluster_size", type=int, default=3, help="Minimum number of plasmids in final cluster (default: 3)")
+    args = parser.parse_args()
+
+    main(args.results_dir, args.min_cluster_size)
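The heart of extract_clusters is its refinement loop: the plasmids of a subcluster are treated as graph nodes, qualifying ANI pairs as edges, and the lowest-degree node is removed repeatedly until the remainder forms a complete subgraph (clique) with at least three members, otherwise the subcluster is discarded. A standalone toy sketch of that pruning idea (not part of the package; the plasmid names are invented):

# Toy illustration of the clique-pruning loop used in extract_clusters.
from collections import defaultdict

def prune_to_clique(edges, min_size=3):
    # Build an undirected adjacency map from (a, b) pairs.
    connections = defaultdict(set)
    for a, b in edges:
        connections[a].add(b)
        connections[b].add(a)
    while True:
        nodes = set(connections.keys())
        if len(nodes) < min_size:
            return set()  # too few nodes left: discard the subcluster
        if all(len(connections[n]) == len(nodes) - 1 for n in nodes):
            return nodes  # every node connects to every other: clique found
        # Drop the node with the fewest connections and repeat.
        weakest = min(nodes, key=lambda n: len(connections[n]))
        del connections[weakest]
        for neighbours in connections.values():
            neighbours.discard(weakest)

# p1, p2, p3 are mutually connected; p4 links only to p1, so it is pruned.
print(sorted(prune_to_clique([("p1", "p2"), ("p1", "p3"), ("p2", "p3"), ("p1", "p4")])))
# ['p1', 'p2', 'p3']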
plasmidhub/filtering.py
ADDED
@@ -0,0 +1,54 @@
+import os
+import pandas as pd
+import logging
+logger = logging.getLogger(__name__)
+
+def strip_paths_in_fastani(input_file):
+    """Remove directory paths from plasmid names in FastANI output."""
+    df = pd.read_csv(input_file, sep='\t', header=None)
+    df.columns = ['Query', 'Reference', 'ANI', 'Matching_Frags_Query', 'Matching_Frags_Ref']
+    df['Query'] = df['Query'].apply(os.path.basename)
+    df['Reference'] = df['Reference'].apply(os.path.basename)
+    df.to_csv(input_file, sep='\t', header=False, index=False)
+
+def strip_paths_in_plasmid_list(file_path):
+    """Remove directory paths from plasmid names in Plasmid_list.txt."""
+    with open(file_path, 'r') as f:
+        lines = f.readlines()
+    stripped_lines = [os.path.basename(line.strip()) + '\n' for line in lines]
+    with open(file_path, 'w') as f:
+        f.writelines(stripped_lines)
+
+def filter_self_comparisons(input_file, output_file):
+    df = pd.read_csv(input_file, sep='\t', header=None)
+    df.columns = ['Query', 'Reference', 'ANI', 'Matching_Frags_Query', 'Matching_Frags_Ref']
+    df_filtered = df[df['Query'] != df['Reference']]
+    df_filtered.to_csv(output_file, sep='\t', index=False)
+
+def add_plasmid_sizes(ani_file, sizes_file, output_file):
+    sizes_df = pd.read_csv(sizes_file, sep='\t')
+    size_dict = dict(zip(sizes_df['PlasmidID'], sizes_df['Size']))
+    ani_df = pd.read_csv(ani_file, sep='\t')
+    def get_size(plasmid_id):
+        return size_dict.get(plasmid_id, None)
+    ani_df['Query_size'] = ani_df['Query'].apply(get_size)
+    ani_df['Reference_size'] = ani_df['Reference'].apply(get_size)
+    ani_df.to_csv(output_file, sep='\t', index=False)
+
+def apply_filters(input_file, output_file, frag_len=1000, coverage_threshold=0.5, ani_threshold=95.0):
+    df = pd.read_csv(input_file, sep='\t')
+
+    # Convert matching fragments to base pairs using user-defined fragment length
+    df['Matching_Frags_Query_bp'] = df['Matching_Frags_Query'] * frag_len
+    df['Matching_Frags_Ref_bp'] = df['Matching_Frags_Ref'] * frag_len
+
+    # Apply filtering logic
+    filtered = df[
+        (df['Matching_Frags_Query_bp'] > df['Reference_size'] * coverage_threshold) &
+        (df['Matching_Frags_Ref_bp'] > df['Query_size'] * coverage_threshold) &
+        (df['Matching_Frags_Query_bp'] > df['Query_size'] * coverage_threshold) &
+        (df['ANI'] >= ani_threshold)
+    ]
+
+    filtered.to_csv(output_file, sep='\t', index=False)
+    logger.info(f"[INFO] Applied filters (ANI = {ani_threshold}, coverage = {coverage_threshold*100:.1f}%). ")