plasmidhub 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of plasmidhub might be problematic.

plasmidhub/__init__.py ADDED
@@ -0,0 +1 @@
+ # plasmidhub package
plasmidhub/abricate.py ADDED
@@ -0,0 +1,49 @@
+ import os
+ import subprocess
+ import shutil
+ import glob
+ import logging
+ logger = logging.getLogger(__name__)
+
+ def run_abricate_bulk(input_dir, results_dir, db_list, threads=None):
+     # Resolve results_dir to an absolute path so it stays valid after chdir
+     results_dir = os.path.abspath(results_dir)
+     os.makedirs(results_dir, exist_ok=True)
+
+     # Use default thread count if not provided
+     if threads is None:
+         threads = 4
+
+     # Move into input_dir because wildcard expansion happens there
+     original_dir = os.getcwd()
+     os.chdir(input_dir)
+
+     try:
+         # Collect all FASTA-like files
+         fasta_files = sorted(
+             glob.glob("*.fna") +
+             glob.glob("*.fa") +
+             glob.glob("*.fasta")
+         )
+
+         if not fasta_files:
+             raise RuntimeError(f"No input files found in {input_dir} with .fna/.fa/.fasta extensions.")
+
+         for db in db_list:
+             logger.info(f"Running abricate on database: {db}")
+
+             # Build the shell command with all FASTA file names
+             cmd = f"abricate {' '.join(fasta_files)} --db {db} -t {threads}"
+
+             # Output file path (temporary, inside input_dir)
+             temp_output = f"{db}.abr"
+             with open(temp_output, "w") as out_f:
+                 subprocess.run(cmd, shell=True, stdout=out_f, stderr=subprocess.DEVNULL, check=True)
+
+             # Move the output to results_dir
+             final_output_path = os.path.join(results_dir, f"{db}.abr")
+             shutil.move(temp_output, final_output_path)
+             logger.info(f"Saved: {final_output_path}")
+     finally:
+         # Return to the original directory even if an error occurred
+         os.chdir(original_dir)
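
A minimal usage sketch for the function above (directory names are hypothetical; assumes the abricate CLI is installed and on PATH, with the named databases available):

    from plasmidhub.abricate import run_abricate_bulk

    # Screen every .fna/.fa/.fasta file in plasmids/ against two databases;
    # results land in results/ as resfinder.abr and plasmidfinder.abr.
    run_abricate_bulk(
        input_dir="plasmids",      # hypothetical input directory
        results_dir="results",     # hypothetical output directory
        db_list=["resfinder", "plasmidfinder"],
        threads=8,
    )
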
plasmidhub/ani.py ADDED
@@ -0,0 +1,30 @@
+ import os
+ import sys
+ import subprocess
+ import logging
+ logger = logging.getLogger(__name__)
+
+ def run_fastani(plasmid_list_file, fragLen=1000, minFrag=3, kmer=14, output_dir=".", threads=None):
+     if threads is None:
+         threads = 4
+
+     output_file = os.path.join(output_dir, "fastani_raw_results.tsv")
+     cmd = [
+         "fastANI",
+         "--ql", plasmid_list_file,
+         "--rl", plasmid_list_file,
+         "-o", output_file,
+         "--fragLen", str(fragLen),
+         "--minFraction", str(minFrag),
+         "--kmer", str(kmer),
+         "-t", str(threads)
+     ]
+     logger.info("Running FastANI with command:")
+     logger.info(" ".join(cmd))
+     result = subprocess.run(cmd, capture_output=True, text=True)
+     if result.returncode != 0:
+         logger.error("FastANI failed with error:")
+         logger.error(result.stderr)
+         sys.exit(1)
+     else:
+         logger.info("FastANI completed successfully.")
@@ -0,0 +1,48 @@
+ import os
+ import matplotlib.pyplot as plt
+ import random
+ import logging
+
+ logger = logging.getLogger(__name__)
+
+ def assign_cluster_colors(results_dir, mapping_file):
+     cluster_list_path = os.path.join(results_dir, "cluster_list.txt")
+     color_file = os.path.join(results_dir, "cluster_colours.txt")
+
+     clusters = []
+     with open(cluster_list_path) as f:
+         next(f)  # Skip header
+         for line in f:
+             if line.strip():
+                 cluster_file, _ = line.strip().split('\t')
+                 cluster = cluster_file.replace('.txt', '')
+                 clusters.append(cluster)
+
+     n_clusters = len(clusters)
+
+     # Start with base colors from the tab20 colormap
+     cmap = plt.get_cmap('tab20')
+     base_colors = [
+         '#{:02x}{:02x}{:02x}'.format(int(r * 255), int(g * 255), int(b * 255))
+         for r, g, b in cmap.colors
+     ]
+
+     used_colors = set(base_colors[:min(n_clusters, len(base_colors))])
+     full_color_list = base_colors[:min(n_clusters, len(base_colors))]
+
+     # Generate additional distinct random colors if needed
+     while len(full_color_list) < n_clusters:
+         while True:
+             color = "#{:06x}".format(random.randint(0, 0xFFFFFF))
+             if color not in used_colors:
+                 used_colors.add(color)
+                 full_color_list.append(color)
+                 break
+
+     color_map = dict(zip(clusters, full_color_list))
+
+     with open(color_file, 'w') as out:
+         for cluster, color in color_map.items():
+             out.write(f"{cluster}\t{color}\n")
+
+     logger.info(f"Cluster colors saved to: {color_file}")
@@ -0,0 +1,139 @@
+ import os
+ import pandas as pd
+ from collections import defaultdict
+ import argparse
+ import logging
+ logger = logging.getLogger(__name__)
+
+ def find_valid_subclusters(results_dir):
+     valid_subclusters = []
+
+     for filename in sorted(os.listdir(results_dir)):
+         if filename.startswith("subcluster_") and filename.endswith("_plasmids.txt"):
+             filepath = os.path.join(results_dir, filename)
+             with open(filepath, 'r') as f:
+                 plasmid_count = sum(1 for _ in f)
+
+             if plasmid_count >= 3:  # Hardcoded rule
+                 valid_subclusters.append((filename, plasmid_count))
+
+     valid_subclusters.sort(key=lambda x: x[1], reverse=True)
+     return valid_subclusters
+
+ def write_subcluster_list(valid_subclusters, output_path):
+     with open(output_path, "w") as f:
+         f.write("Subcluster\tPlasmids\n")
+         for subcluster, count in valid_subclusters:
+             f.write(f"{subcluster}\t{count}\n")
+
+ def extract_clusters(valid_subclusters, results_dir, fastani_path, output_dir):
+     fastani_df = pd.read_csv(fastani_path, sep="\t")
+
+     for subcluster_file, _ in valid_subclusters:
+         full_path = os.path.join(results_dir, subcluster_file)
+         try:
+             with open(full_path, "r") as f:
+                 original_plasmids = set(line.strip() for line in f)
+         except FileNotFoundError:
+             logger.warning(f"File {subcluster_file} not found. Skipping.")
+             continue
+
+         subcluster_plasmids = original_plasmids.copy()
+
+         connections = defaultdict(set)
+         for _, row in fastani_df.iterrows():
+             q, r = row["Query"], row["Reference"]
+             if q in subcluster_plasmids and r in subcluster_plasmids:
+                 connections[q].add(r)
+                 connections[r].add(q)
+
+         # Iteratively remove the node with the fewest connections until the
+         # remaining nodes form a complete subgraph (clique)
+         while True:
+             current_nodes = set(connections.keys())
+             if len(current_nodes) < 3:
+                 subcluster_plasmids = set()
+                 break
+
+             # Check if the current graph is already a clique
+             complete = all(len(connections[node]) == len(current_nodes) - 1 for node in current_nodes)
+             if complete:
+                 subcluster_plasmids = current_nodes
+                 break
+
+             # Find the node with the fewest connections (lowest degree)
+             min_node = min(current_nodes, key=lambda x: len(connections[x]))
+
+             # Remove that node from the graph
+             del connections[min_node]
+             for conn in connections.values():
+                 conn.discard(min_node)
+
+         # Save the refined subcluster as cluster_XX.txt
+         cluster_number = subcluster_file.split("_")[1]  # Extract XX from subcluster_XX_plasmids.txt
+         output_file = f"cluster_{cluster_number}.txt"
+
+         cluster_path = os.path.join(output_dir, output_file)
+         with open(cluster_path, "w") as f:
+             for plasmid in subcluster_plasmids:
+                 f.write(plasmid + "\n")
+
+ def filter_clusters_by_size(output_dir, min_cluster_size):
+     for filename in os.listdir(output_dir):
+         if filename.startswith("cluster_") and filename.endswith(".txt"):
+             path = os.path.join(output_dir, filename)
+             with open(path, "r") as f:
+                 lines = f.readlines()
+             if len(lines) < min_cluster_size:
+                 os.remove(path)
+
+ def write_cluster_list(output_dir, output_path):
+     cluster_files = []
+     for filename in os.listdir(output_dir):
+         if (
+             filename.startswith("cluster_")
+             and filename.endswith(".txt")
+             and filename not in {os.path.basename(output_path), "cluster_colours.txt"}  # exclude the output file itself and cluster_colours.txt
+         ):
+             path = os.path.join(output_dir, filename)
+             with open(path, "r") as f:
+                 count = sum(1 for _ in f)
+             cluster_files.append((filename, count))
+     cluster_files.sort(key=lambda x: x[1], reverse=True)
+     with open(output_path, "w") as f:
+         f.write("Cluster\tPlasmids\n")
+         for filename, count in cluster_files:
+             f.write(f"{filename}\t{count}\n")
+
+ def main(results_dir, min_cluster_size):
+     fastani_path = os.path.join(results_dir, "ANI_results_final.tsv")
+     subcluster_list_output = os.path.join(results_dir, "subcluster_list.txt")
+     cluster_list_output = os.path.join(results_dir, "cluster_list.txt")
+
+     logger.info("Finding valid subclusters (>=3 plasmids)...")
+     valid_subclusters = find_valid_subclusters(results_dir)
+
+     write_subcluster_list(valid_subclusters, subcluster_list_output)
+
+     logger.info("Identifying clusters...")
+     extract_clusters(valid_subclusters, results_dir, fastani_path, results_dir)
+
+     logger.info(f"Keeping only clusters with >={min_cluster_size} plasmids...")
+     filter_clusters_by_size(results_dir, min_cluster_size)
+
+     write_cluster_list(results_dir, cluster_list_output)
+
+     # Warn the user if cluster_list.txt contains only the header line
+     if os.path.exists(cluster_list_output):
+         with open(cluster_list_output, "r") as f:
+             lines = f.readlines()
+         if len(lines) <= 1:
+             logger.warning("No clusters detected with the given parameters!")
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(description="Clustering Tool")
+     parser.add_argument("results_dir", help="Path to results directory created by main.py")
+     parser.add_argument("--min_cluster_size", type=int, default=3, help="Minimum number of plasmids in final cluster (default: 3)")
+     args = parser.parse_args()
+
+     main(args.results_dir, args.min_cluster_size)
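
This module is also runnable as a standalone script. A hedged sketch of both entry points (the module's file name is not shown in this hunk, so `clustering` is a placeholder name):

    from plasmidhub import clustering  # hypothetical module name; header missing in this diff

    # Programmatic call, equivalent to:
    #   python clustering.py results/ --min_cluster_size 5
    # Requires ANI_results_final.tsv and subcluster_*_plasmids.txt in results/.
    clustering.main("results", min_cluster_size=5)
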
@@ -0,0 +1,54 @@
+ import os
+ import pandas as pd
+ import logging
+ logger = logging.getLogger(__name__)
+
+ def strip_paths_in_fastani(input_file):
+     """Remove directory paths from plasmid names in FastANI output."""
+     df = pd.read_csv(input_file, sep='\t', header=None)
+     df.columns = ['Query', 'Reference', 'ANI', 'Matching_Frags_Query', 'Matching_Frags_Ref']
+     df['Query'] = df['Query'].apply(os.path.basename)
+     df['Reference'] = df['Reference'].apply(os.path.basename)
+     df.to_csv(input_file, sep='\t', header=False, index=False)
+
+ def strip_paths_in_plasmid_list(file_path):
+     """Remove directory paths from plasmid names in Plasmid_list.txt."""
+     with open(file_path, 'r') as f:
+         lines = f.readlines()
+     stripped_lines = [os.path.basename(line.strip()) + '\n' for line in lines]
+     with open(file_path, 'w') as f:
+         f.writelines(stripped_lines)
+
+ def filter_self_comparisons(input_file, output_file):
+     df = pd.read_csv(input_file, sep='\t', header=None)
+     df.columns = ['Query', 'Reference', 'ANI', 'Matching_Frags_Query', 'Matching_Frags_Ref']
+     df_filtered = df[df['Query'] != df['Reference']]
+     df_filtered.to_csv(output_file, sep='\t', index=False)
+
+ def add_plasmid_sizes(ani_file, sizes_file, output_file):
+     sizes_df = pd.read_csv(sizes_file, sep='\t')
+     size_dict = dict(zip(sizes_df['PlasmidID'], sizes_df['Size']))
+     ani_df = pd.read_csv(ani_file, sep='\t')
+     def get_size(plasmid_id):
+         return size_dict.get(plasmid_id, None)
+     ani_df['Query_size'] = ani_df['Query'].apply(get_size)
+     ani_df['Reference_size'] = ani_df['Reference'].apply(get_size)
+     ani_df.to_csv(output_file, sep='\t', index=False)
+
+ def apply_filters(input_file, output_file, frag_len=1000, coverage_threshold=0.5, ani_threshold=95.0):
+     df = pd.read_csv(input_file, sep='\t')
+
+     # Convert matching fragment counts to base pairs using the user-defined fragment length
+     df['Matching_Frags_Query_bp'] = df['Matching_Frags_Query'] * frag_len
+     df['Matching_Frags_Ref_bp'] = df['Matching_Frags_Ref'] * frag_len
+
+     # Keep pairs that pass the coverage and ANI thresholds
+     filtered = df[
+         (df['Matching_Frags_Query_bp'] > df['Reference_size'] * coverage_threshold) &
+         (df['Matching_Frags_Ref_bp'] > df['Query_size'] * coverage_threshold) &
+         (df['Matching_Frags_Query_bp'] > df['Query_size'] * coverage_threshold) &
+         (df['ANI'] >= ani_threshold)
+     ]
+
+     filtered.to_csv(output_file, sep='\t', index=False)
+     logger.info(f"Applied filters (ANI >= {ani_threshold}, coverage > {coverage_threshold*100:.1f}%).")