plasmidhub-1.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- plasmidhub/__init__.py +1 -0
- plasmidhub/abricate.py +46 -0
- plasmidhub/ani.py +29 -0
- plasmidhub/cluster_color.py +48 -0
- plasmidhub/clustering.py +143 -0
- plasmidhub/filtering.py +54 -0
- plasmidhub/main.py +381 -0
- plasmidhub/network_builder.py +202 -0
- plasmidhub/node_stats.py +69 -0
- plasmidhub/plot.py +169 -0
- plasmidhub/plot_only.py +153 -0
- plasmidhub/preprocessing.py +48 -0
- plasmidhub-1.0.0.dist-info/LICENSE +24 -0
- plasmidhub-1.0.0.dist-info/METADATA +193 -0
- plasmidhub-1.0.0.dist-info/RECORD +18 -0
- plasmidhub-1.0.0.dist-info/WHEEL +5 -0
- plasmidhub-1.0.0.dist-info/entry_points.txt +2 -0
- plasmidhub-1.0.0.dist-info/top_level.txt +1 -0
plasmidhub/main.py
ADDED
@@ -0,0 +1,381 @@
import argparse
from argparse import ArgumentParser, RawTextHelpFormatter
import textwrap
import os
import logging
import shutil
import glob
import subprocess
from datetime import datetime
from plasmidhub import preprocessing, ani, filtering, abricate

VERSION = "1.0.0"

# Setup logging
def setup_logging(log_file_path):
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    # File handler with timestamp
    file_formatter = logging.Formatter('%(asctime)s - %(message)s')
    fh = logging.FileHandler(log_file_path)
    fh.setFormatter(file_formatter)
    logger.addHandler(fh)

    # Stream handler (terminal) without timestamp
    stream_formatter = logging.Formatter('%(message)s')
    sh = logging.StreamHandler()
    sh.setFormatter(stream_formatter)
    logger.addHandler(sh)

    return logger

def write_versions_txt(output_dir):
    import sys
    import importlib.metadata

    def get_tool_version(cmd):
        try:
            result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
            return result.stdout.strip().split('\n')[0]
        except Exception:
            return "Error retrieving version"

    def get_package_version(pkg_name):
        try:
            return importlib.metadata.version(pkg_name)
        except importlib.metadata.PackageNotFoundError:
            return "not installed"

    os.makedirs(output_dir, exist_ok=True)
    with open(os.path.join(output_dir, "versions.txt"), "w") as vf:
        vf.write(f"Plasmidhub version: {VERSION}\n")
        vf.write(f"Python version: {sys.version.split()[0]}\n")
        vf.write(f"FastANI version: {get_tool_version(['fastANI', '--version'])}\n")
        vf.write(f"ABRicate version: {get_tool_version(['abricate', '--version'])}\n\n")

        vf.write("Python package versions:\n")
        for pkg in ["biopython", "pandas", "networkx", "matplotlib", "python-louvain", "numpy"]:
            vf.write(f"{pkg}: {get_package_version(pkg)}\n")

def main():
    parser = argparse.ArgumentParser(
        prog="plasmidhub",
        description=(
            "SYNOPSIS\n"
            " Plasmidhub: Bioinformatic Tool for Plasmid Network Analysis\n\n"
            " Plasmidhub constructs a similarity-based network from plasmid FASTA files.\n"
            " It uses FastANI to calculate pairwise ANI with user-defined parameters,\n"
            " clusters the plasmids, and visualizes the network.\n"
            " Includes optional annotation with ABRicate to identify resistance and virulence genes.\n"
        ),
        epilog=(
            "DOCUMENTATION\n"
            " https://github.com/BALINTESBL/plasmidhub\n\n"
            "Example:\n"
            " plasmidhub path/to/my/plasmid/FASTA/files --fragLen 1000 --kmer 14 --coverage_threshold 0.5 --ani_threshold 95 --min_cluster_size 4 --plot_k 2.0 3.0 -t 32\n"
            " plasmidhub --plot_only path/to/my/results --plot_k 3 3 --plot_node_color white --plot_node_size 500 --plot_node_shape s -t 32\n\n"
            "Developed by Dr. Balint Timmer, Institute of Metagenomics, University of Debrecen\n"
            "Version 1.0.0\n"
            "If you are using Plasmidhub, please reference: https://github.com/BALINTESBL/plasmidhub"
        ),
        formatter_class=argparse.RawTextHelpFormatter
    )

    # Positional argument
    parser.add_argument("input_dir", nargs="?", help="Path to plasmid FASTA files directory")

    parser.add_argument("-v", "--version", action="version", version=f"%(prog)s {VERSION}",
                        help="Show program's version number and exit")

    # ANI
    ani_group = parser.add_argument_group("ANI")
    ani_group.add_argument("--fragLen", type=int, default=1000, help="FastANI fragment length (default: 1000)")
    ani_group.add_argument("--kmer", type=int, default=14, help="FastANI kmer size (default: 14)")
    ani_group.add_argument("--coverage_threshold", type=float, default=0.5, help="Coverage threshold fraction (default: 0.5)")
    ani_group.add_argument("--ani_threshold", type=float, default=95.0, help="ANI threshold (default: 95.0)")

    # CLUSTER
    cluster = parser.add_argument_group("CLUSTER")
    cluster.add_argument("--cluster_off", action="store_true", help="Disable clustering step")
    cluster.add_argument("--min_cluster_size", type=int, default=3,
                         help="Minimum plasmid count for final clusters (default: 3)")

    # ABRicate
    abricate_group = parser.add_argument_group("ABRicate")
    abricate_group.add_argument("--skip_abricate", action="store_true", help="Skip ABRicate analysis step")
    abricate_group.add_argument(
        "--abricate_dbs",
        nargs="+",
        metavar="DB",
        help=(
            "List of ABRicate databases to run (default: plasmidfinder, card, vfdb).\n"
            "Available databases:\n"
            " resfinder\n"
            " megares\n"
            " vfdb\n"
            " card\n"
            " argannot\n"
            " ecoli_vf\n"
            " plasmidfinder\n"
            " ncbi\n"
            " ecoh"
        )
    )

    # PLOT
    plot = parser.add_argument_group("PLOT")
    plot.add_argument("--plot_k", nargs=2, type=float, metavar=('MIN_K', 'MAX_K'),
                      help="Generate network visualizations.\nSpecify minimum and maximum k (e.g.: --plot_k 1.5 3.0)")
    plot.add_argument("--plot_skip", action="store_true", help="Skip network visualization step")
    plot.add_argument("--plot_only", type=str, metavar="DIR",
                      help="Generate plots only from existing files, without running the whole pipeline. Figure parameters can be adjusted:")
    plot.add_argument("--plot_edge_width", nargs=2, type=float, metavar=('MIN_WIDTH', 'MAX_WIDTH'),
                      default=[0.2, 2.0], help="Minimum and maximum edge widths (default: 0.2 2.0)")
    plot.add_argument("--plot_node_size", type=int, default=900, help="Node size (default: 900)")
    plot.add_argument("--plot_node_shape", type=str, default='o', help="Node shape (e.g.: 'o', 's' (square), '>' (triangle), '^', '*' etc., default: o)")
    plot.add_argument("--plot_node_color", type=str, help="Node color (e.g.: 'white', 'blue', default: 'grey')")
    plot.add_argument("--plot_figsize", nargs=2, type=int, metavar=('WIDTH', 'HEIGHT'), default=[25, 25], help="Figure size in inches (default: 25 25)")
    plot.add_argument("--plot_iterations", type=int, default=100, help="Number of iterations (spring layout, default: 100)")

    # THREADS
    threads = parser.add_argument_group("THREADS")
    threads.add_argument("-t", "--threads", type=int, default=4, metavar="", help="Number of threads to use (default: 4)")

    args = parser.parse_args()

    # === Plot-only mode === #
    if args.plot_only:
        import sys
        if not os.path.exists(args.plot_only):
            parser.error("The path provided to --plot_only does not exist.")

        cmd = [sys.executable, os.path.join(os.path.dirname(__file__), "plot_only.py"),
               "--results_dir", args.plot_only]

        if args.plot_k:
            cmd.extend(["--plot_k", str(args.plot_k[0]), str(args.plot_k[1])])
        if args.plot_edge_width:
            cmd += ["--min_edge_width", str(args.plot_edge_width[0]), "--max_edge_width", str(args.plot_edge_width[1])]
        if args.plot_node_size:
            cmd += ["--node_size", str(args.plot_node_size)]
        if args.plot_node_color:
            cmd += ["--node_color", args.plot_node_color]
        if args.plot_node_shape:
            cmd += ["--node_shape", args.plot_node_shape]
        if args.plot_figsize:
            cmd += ["--figsize", str(args.plot_figsize[0]), str(args.plot_figsize[1])]
        if args.plot_iterations:
            cmd += ["--iterations", str(args.plot_iterations)]

        subprocess.run(cmd)
        return

    ## === Full pipeline mode === ##
    if not args.input_dir:
        parser.error("input_dir is required unless --plot_only is used.")

    if not args.plot_only:
        if (
            args.plot_node_size != 900 or
            args.plot_edge_width != [0.2, 2.0] or
            args.plot_node_shape != 'o' or
            args.plot_figsize != [25, 25] or
            args.plot_iterations != 100
        ):
            parser.error(
                "Plot customization options (--plot_node_size, --plot_edge_width, etc.) "
                "can only be used with --plot_only."
            )

    # Create results directory inside input_dir if it doesn't exist
    results_dir = os.path.join(args.input_dir, "results")
    os.makedirs(results_dir, exist_ok=True)

    # Setup logger now that we have results_dir
    log_file = os.path.join(results_dir, "run.log")
    logger = setup_logging(log_file)
    logger.info(f"Starting Plasmidhub v{VERSION}")

    logger.info(f"Input directory: {args.input_dir}")

    write_versions_txt(results_dir)

    # Step 1-4: preprocess (validate, list files, count, size)
    plasmid_list = preprocessing.validate_and_list_plasmids(args.input_dir)
    logger.info(f"Number of plasmids: {len(plasmid_list)}")
    # Write output files inside results_dir
    preprocessing.write_plasmid_list(plasmid_list, output_file=os.path.join(results_dir, "Plasmid_list.txt"))
    preprocessing.write_plasmid_sizes(plasmid_list, output_file=os.path.join(results_dir, "Plasmid_sizes.txt"))

    # Step 5-6: run FastANI - output to results_dir
    logger.info("Running FastANI...")

    cwd = os.getcwd()
    os.chdir(args.input_dir)  # switch to directory with FASTA files

    try:
        ani.run_fastani(
            os.path.join(results_dir, "Plasmid_list.txt"),
            fragLen=args.fragLen,
            minFrag=0.001,  # Hardcoded minimum fraction
            kmer=args.kmer,
            output_dir=results_dir,
            threads=args.threads
        )
    finally:
        os.chdir(cwd)  # revert back to original working directory

    # Step 6.5: Normalize plasmid names (strip paths)
    filtering.strip_paths_in_fastani(os.path.join(results_dir, "fastani_raw_results.tsv"))
    filtering.strip_paths_in_plasmid_list(os.path.join(results_dir, "Plasmid_list.txt"))

    # Step 7: filter self comparisons
    logger.info("Filtering self comparisons...")
    filtering.filter_self_comparisons(
        os.path.join(results_dir, "fastani_raw_results.tsv"),
        os.path.join(results_dir, "fastani_raw_results_filtered.tsv"),
    )

    # Step 8: add sizes - input and output inside results_dir
    logger.info("Adding plasmid sizes to ANI results...")
    filtering.add_plasmid_sizes(
        os.path.join(results_dir, "fastani_raw_results_filtered.tsv"),
        os.path.join(results_dir, "Plasmid_sizes.txt"),
        os.path.join(results_dir, "ANI_results_with_sizes.tsv"),
    )

    # Step 9-10: apply coverage and ANI threshold filters - input/output inside results_dir
    logger.info("Applying coverage and ANI thresholds...")
    filtering.apply_filters(
        os.path.join(results_dir, "ANI_results_with_sizes.tsv"),
        os.path.join(results_dir, "ANI_results_final.tsv"),
        coverage_threshold=args.coverage_threshold,
        ani_threshold=args.ani_threshold,
        frag_len=args.fragLen,
    )

    # Step 11: build network - inputs inside results_dir
    from plasmidhub import network_builder
    logger.info("Building plasmid network...")
    network_builder.build_network(
        os.path.join(results_dir, "ANI_results_final.tsv"),
        os.path.join(results_dir, "Plasmid_list.txt"),
        results_dir
    )

    logger.info("Done!")

    # Step 11.5: compute and save node stats
    from plasmidhub import node_stats
    logger.info("Computing node statistics...")
    node_stats.compute_node_stats(results_dir)

    logger.info("Done!")

    # Step 12: clustering
    if not args.cluster_off:
        logger.info("Clustering plasmids...")
        from plasmidhub import clustering
        clustering.main(results_dir, args.min_cluster_size)

        # Generate plasmid-cluster mapping file
        import glob

        def generate_plasmid_cluster_mapping(results_dir):
            mapping_file = os.path.join(results_dir, "plasmid_cluster_mapping.txt")
            with open(mapping_file, 'w') as outfile:
                for filepath in glob.glob(os.path.join(results_dir, "cluster_*.txt")):
                    if os.path.basename(filepath) == "cluster_list.txt":
                        continue  # Skip the summary file
                    cluster_name = os.path.basename(filepath).replace(".txt", "")
                    with open(filepath) as f:
                        for line in f:
                            plasmid = line.strip()
                            if plasmid:
                                outfile.write(f"{plasmid}\t{cluster_name}\n")

        generate_plasmid_cluster_mapping(results_dir)
    else:
        logger.info("Clustering skipped due to --cluster_off")

    logger.info("Done!")

    # Default databases
    default_dbs = ['plasmidfinder', 'card', 'vfdb']

    if not args.skip_abricate:
        logger.info("Running ABRicate annotation...")

        # If user specified databases, use them; otherwise use the default
        abricate_dbs = args.abricate_dbs if args.abricate_dbs else default_dbs

        abricate_results_dir = os.path.join(results_dir, "abricate_results")
        abricate.run_abricate_bulk(
            input_dir=args.input_dir,
            results_dir=abricate_results_dir,
            db_list=abricate_dbs,
            threads=args.threads
        )
    else:
        logger.info("ABRicate skipped due to --skip_abricate")

    # Network visualization
    if not args.plot_skip:
        logger.info("Generating network visualizations...")
        from plasmidhub import plot
        from plasmidhub.cluster_color import assign_cluster_colors

        if not args.cluster_off:
            mapping_file = os.path.join(results_dir, "plasmid_cluster_mapping.txt")
            assign_cluster_colors(results_dir, mapping_file)

        json_file = os.path.join(results_dir, "network.json")
        G = plot.load_network_from_json(json_file)

        if args.plot_k:
            min_k = int(args.plot_k[0])
            max_k = int(args.plot_k[1])
        else:
            min_k = 3
            max_k = 3

        plot.run_visualizations(results_dir, min_k, max_k + 1)
    else:
        logger.info("Network visualization skipped due to --plot_skip")

    move_files_to_subdirs(results_dir)

    logger.info("Done!")


def move_files_to_subdirs(results_dir):
    # Create subdirectories
    plots_dir = os.path.join(results_dir, "plots")
    stats_dir = os.path.join(results_dir, "statistics")
    os.makedirs(plots_dir, exist_ok=True)
    os.makedirs(stats_dir, exist_ok=True)

    # Move plots
    for plot_file in glob.glob(os.path.join(results_dir, "network_k_*.pdf")) + \
                     glob.glob(os.path.join(results_dir, "network_k_*.svg")):
        shutil.move(plot_file, plots_dir)

    # Move stats files
    stat_files = [
        "degree_centrality.csv",
        "betweenness_centrality.csv",
        "node_degrees.csv",
        "network_metrics.csv",
        "community_partition.json",
        "Node_stats.csv"
    ]
    for fname in stat_files:
        full_path = os.path.join(results_dir, fname)
        if os.path.exists(full_path):
            shutil.move(full_path, stats_dir)


if __name__ == '__main__':
    main()
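For orientation, a minimal sketch of driving this same entry point from Python rather than through the plasmidhub console script. The directory path and option values are placeholders; main() simply parses whatever is on sys.argv, exactly as the CLI examples in the epilog do.

import sys
from plasmidhub import main as plasmidhub_main

# Placeholder invocation, equivalent to running:
#   plasmidhub plasmids/ --ani_threshold 95 --plot_k 2.0 3.0 -t 8
sys.argv = [
    "plasmidhub",
    "plasmids/",                 # directory containing plasmid FASTA files (placeholder)
    "--ani_threshold", "95",
    "--plot_k", "2.0", "3.0",
    "-t", "8",
]
plasmidhub_main.main()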
plasmidhub/network_builder.py
ADDED
@@ -0,0 +1,202 @@
import numpy as np
import networkx as nx
import pandas as pd
import community.community_louvain as community_louvain
from itertools import combinations
import json
import random
import os
import logging

logger = logging.getLogger(__name__)

def load_and_process_fastani_data(file_path, results_dir):
    """Load and process FastANI data to create a similarity matrix."""
    with open(file_path, 'r') as f:
        lines = f.readlines()

    if lines[0].startswith('Query'):
        lines = lines[1:]

    from io import StringIO
    data = StringIO(''.join(lines))
    df = pd.read_csv(data, sep="\t", header=None)

    df.columns = ['Query', 'Reference', 'ANI', 'Matching_Frags_Query', 'Matching_Frags_Ref',
                  'Query_size', 'Reference_size', 'Matching_Frags_Query_bp', 'Matching_Frags_Ref_bp']

    df = df[['Query', 'Reference', 'ANI', 'Matching_Frags_Query', 'Matching_Frags_Ref']]

    df['ANI'] = pd.to_numeric(df['ANI'], errors='coerce')
    df['Matching_Frags_Query'] = pd.to_numeric(df['Matching_Frags_Query'], errors='coerce')
    df['Matching_Frags_Ref'] = pd.to_numeric(df['Matching_Frags_Ref'], errors='coerce')

    labels = pd.concat([df['Query'], df['Reference']]).unique()
    similarity_matrix = pd.DataFrame(0, index=labels, columns=labels, dtype=float)

    for _, row in df.iterrows():
        query = row['Query']
        reference = row['Reference']
        ani = row['ANI']
        if pd.notna(ani) and pd.notna(row['Matching_Frags_Query']) and pd.notna(row['Matching_Frags_Ref']):
            match_frac = min(row['Matching_Frags_Query'] / row['Matching_Frags_Ref'],
                             row['Matching_Frags_Ref'] / row['Matching_Frags_Query'])

            # Use both ANI and matching fragment ratio to determine the weight
            weight = ani * match_frac

            # Use the lower value if a weight is already present
            existing_weight = similarity_matrix.loc[query, reference]
            if existing_weight > 0:
                weight = min(weight, existing_weight)

            # Populate similarity matrix
            similarity_matrix.loc[query, reference] = weight
            similarity_matrix.loc[reference, query] = weight

    np.fill_diagonal(similarity_matrix.values, 1)
    similarity_matrix.to_csv(os.path.join(results_dir, "similarity_matrix.csv"), index=True)

    return labels, similarity_matrix.values

def create_network(labels, similarity_matrix, plasmid_list_path, results_dir):
    G = nx.Graph()

    for label in labels:
        if label not in G:
            G.add_node(label)

    n = len(labels)
    for i in range(n):
        for j in range(i + 1, n):
            similarity = similarity_matrix[i, j]
            if similarity > 5 and similarity <= 100:
                weight = similarity
                if not G.has_edge(labels[i], labels[j]):
                    G.add_edge(labels[i], labels[j], weight=weight)

    # Save edge list after edges are added
    nx.write_weighted_edgelist(G, os.path.join(results_dir, "network_edges.txt"))

    with open(plasmid_list_path, 'r') as f:
        all_plasmids = [line.strip() for line in f.readlines()]

    for plasmid in all_plasmids:
        if plasmid not in G.nodes:
            G.add_node(plasmid, type='singleton')

    singleton_positions = {}
    for plasmid in all_plasmids:
        if plasmid not in G.nodes:
            singleton_positions[plasmid] = (random.uniform(-1, 1), random.uniform(-1, 1))

    logger.info("Total nodes in the network: %d", len(G.nodes()))
    logger.info("Total edges in the network: %d", G.number_of_edges())

    nodes_to_remove = [node for node in G.nodes() if node not in all_plasmids]
    G.remove_nodes_from(nodes_to_remove)

    nx.write_gml(G, os.path.join(results_dir, "network.gml"))

    G_json = nx.cytoscape_data(G)
    with open(os.path.join(results_dir, "network.json"), "w") as f:
        json.dump(G_json, f)

    return G

def detect_communities(G):
    partition = community_louvain.best_partition(G, weight='weight')
    return partition

def calculate_subcluster_distances(G, partition, results_dir):
    subcluster_combinations = list(combinations(set(partition.values()), 2))
    results = []

    for cluster1, cluster2 in subcluster_combinations:
        edges_between_clusters = []
        for node1, node2 in combinations(G.nodes(), 2):
            if partition[node1] == cluster1 and partition[node2] == cluster2 and G.has_edge(node1, node2):
                edges_between_clusters.append(G[node1][node2]['weight'])

        mean_distance = np.mean(edges_between_clusters) if edges_between_clusters else np.nan
        median_distance = np.median(edges_between_clusters) if edges_between_clusters else np.nan
        results.append((cluster1, cluster2, mean_distance, median_distance))

    df_results = pd.DataFrame(results, columns=["Subcluster1", "Subcluster2", "Mean Distance", "Median Distance"])
    df_results.to_csv(os.path.join(results_dir, "subcluster_distances.tsv"), sep="\t", index=False)
    return df_results

def save_plasmids_by_subcluster(partition, results_dir):
    subcluster_plasmids = {cluster: [] for cluster in set(partition.values())}
    for node, cluster in partition.items():
        subcluster_plasmids[cluster].append(node)

    for cluster, plasmids in subcluster_plasmids.items():
        with open(os.path.join(results_dir, f"subcluster_{cluster}_plasmids.txt"), "w") as f:
            f.write("\n".join(plasmids))

def calculate_network_metrics(G, partition):
    num_edges = G.number_of_edges()
    num_nodes = G.number_of_nodes()
    num_possible_edges = num_nodes * (num_nodes - 1) / 2
    connectance = num_edges / num_possible_edges
    modularity = community_louvain.modularity(partition, G, weight='weight')

    def calculate_nestedness():
        adj_matrix = nx.to_numpy_array(G)
        nestedness_value = 0
        count = 0
        for i in range(num_nodes):
            for j in range(i + 1, num_nodes):
                if adj_matrix[i, j] > 0:
                    ki = np.sum(adj_matrix[i, :])
                    kj = np.sum(adj_matrix[j, :])
                    nestedness_value += 1 / min(ki, kj)
                    count += 1
        return nestedness_value / count if count > 0 else np.nan

    nestedness = calculate_nestedness()
    return connectance, modularity, nestedness

def calculate_node_degrees(G, results_dir):
    degrees = dict(G.degree(weight='weight'))
    df_degrees = pd.DataFrame(list(degrees.items()), columns=["Node", "Degree"])
    df_degrees.to_csv(os.path.join(results_dir, "node_degrees.csv"), index=False)
    return df_degrees

def calculate_betweenness_centrality(G, results_dir):
    betweenness = nx.betweenness_centrality(G, weight='weight', normalized=True)
    df_betweenness = pd.DataFrame(list(betweenness.items()), columns=["Node", "Betweenness Centrality"])
    df_betweenness.to_csv(os.path.join(results_dir, "betweenness_centrality.csv"), index=False)
    return df_betweenness

def calculate_degree_centrality(G, results_dir):
    degree_centrality = nx.degree_centrality(G)
    df_degree_centrality = pd.DataFrame(list(degree_centrality.items()), columns=["Node", "Degree Centrality"])
    df_degree_centrality.to_csv(os.path.join(results_dir, "degree_centrality.csv"), index=False)
    return df_degree_centrality

def build_network(file_path="ANI_results_final.tsv", plasmid_list_path="Plasmid_list.txt", results_dir="results"):
    os.makedirs(results_dir, exist_ok=True)

    labels, similarity_matrix = load_and_process_fastani_data(file_path, results_dir)
    G = create_network(labels, similarity_matrix, plasmid_list_path, results_dir)
    partition = detect_communities(G)
    df_results = calculate_subcluster_distances(G, partition, results_dir)
    save_plasmids_by_subcluster(partition, results_dir)

    connectance, modularity, nestedness = calculate_network_metrics(G, partition)
    metrics = {
        "Connectance": [connectance],
        "Modularity": [modularity],
        "Nestedness": [nestedness]
    }

    df_metrics = pd.DataFrame(metrics)
    df_metrics.to_csv(os.path.join(results_dir, "network_metrics.csv"), index=False)

    df_degrees = calculate_node_degrees(G, results_dir)
    df_betweenness = calculate_betweenness_centrality(G, results_dir)
    df_degree_centrality = calculate_degree_centrality(G, results_dir)

    with open(os.path.join(results_dir, "community_partition.json"), "w") as f:
        json.dump(partition, f)
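As a quick illustration of the edge weight defined in load_and_process_fastani_data above (ANI multiplied by the smaller of the two matching-fragment ratios), here is the same arithmetic on made-up FastANI values:

# Made-up values for one plasmid pair: 98.5% ANI, 40 matching fragments for the
# query and 50 for the reference.
ani = 98.5
matching_frags_query = 40
matching_frags_ref = 50

match_frac = min(matching_frags_query / matching_frags_ref,
                 matching_frags_ref / matching_frags_query)  # 0.8
weight = ani * match_frac                                    # 78.8
print(weight)  # create_network() only adds an edge when this value is > 5 and <= 100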
plasmidhub/node_stats.py
ADDED
@@ -0,0 +1,69 @@
import networkx as nx
import pandas as pd
import os
import logging

logger = logging.getLogger(__name__)

def compute_node_stats(results_dir):
    gml_path = os.path.join(results_dir, "network.gml")
    if not os.path.exists(gml_path):
        logger.error(f"Could not find network file at: {gml_path}")
        return  # bail out rather than attempting to read a missing file

    G = nx.read_gml(gml_path)

    # Compute stats
    betweenness = nx.betweenness_centrality(G, normalized=True)
    weighted_betweenness = nx.betweenness_centrality(G, weight="weight", normalized=True)

    closeness = nx.closeness_centrality(G)
    weighted_closeness = nx.closeness_centrality(G, distance="weight")

    degree_centrality = nx.degree_centrality(G)
    weighted_degree_centrality = {
        node: sum(data["weight"] for _, _, data in G.edges(node, data=True))
        for node in G.nodes()
    }

    degree = dict(G.degree())
    weighted_degree = {
        node: sum(data["weight"] for _, _, data in G.edges(node, data=True))
        for node in G.nodes()
    }

    max_degree = max(degree.values()) if degree else 1
    nested_contribution = {
        node: deg / max_degree
        for node, deg in degree.items()
    }

    max_weighted_degree = max(weighted_degree.values()) if weighted_degree else 1
    weighted_nested_contribution = {
        node: wdeg / max_weighted_degree
        for node, wdeg in weighted_degree.items()
    }

    num_nodes = len(G.nodes())
    normalized_degree = {
        node: deg / (num_nodes - 1)
        for node, deg in degree.items()
    }

    # Combine into DataFrame
    df = pd.DataFrame({
        "Node": list(G.nodes()),
        "Betweenness": pd.Series(betweenness),
        "Weighted_Betweenness": pd.Series(weighted_betweenness),
        "Closeness": pd.Series(closeness),
        "Weighted_Closeness": pd.Series(weighted_closeness),
        "Degree_Centrality": pd.Series(degree_centrality),
        "Weighted_Degree_Centrality": pd.Series(weighted_degree_centrality),
        "Nested_Contribution": pd.Series(nested_contribution),
        "Weighted_Nested_Contribution": pd.Series(weighted_nested_contribution),
        "Normalised_Degree": pd.Series(normalized_degree),
    })

    # Save
    output_path = os.path.join(results_dir, "Node_stats.csv")
    df.to_csv(output_path, index=False)

    logger.info("Node statistics saved.")
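A short, hypothetical follow-up showing how the Node_stats.csv written above could be inspected; the path is a placeholder and assumes the default pipeline layout, where move_files_to_subdirs() in main.py relocates the file into results/statistics/.

import pandas as pd

# Illustrative path; adjust to wherever the pipeline wrote Node_stats.csv.
stats = pd.read_csv("plasmids/results/statistics/Node_stats.csv")

# Rank plasmids by weighted degree centrality to highlight candidate hub plasmids.
top_hubs = stats.sort_values("Weighted_Degree_Centrality", ascending=False).head(10)
print(top_hubs[["Node", "Weighted_Degree_Centrality", "Betweenness"]])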