plasmidhub-1.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of plasmidhub might be problematic.

plasmidhub/main.py ADDED
@@ -0,0 +1,381 @@
+ import argparse
+ from argparse import ArgumentParser, RawTextHelpFormatter
+ import textwrap
+ import os
+ import logging
+ import shutil
+ import glob
+ import subprocess
+ from datetime import datetime
+ from plasmidhub import preprocessing, ani, filtering, abricate
+
+ VERSION = "1.0.0"
+
+ # Setup logging
+ def setup_logging(log_file_path):
+     logger = logging.getLogger()
+     logger.setLevel(logging.INFO)
+
+     # File handler with timestamp
+     file_formatter = logging.Formatter('%(asctime)s - %(message)s')
+     fh = logging.FileHandler(log_file_path)
+     fh.setFormatter(file_formatter)
+     logger.addHandler(fh)
+
+     # Stream handler (terminal) without timestamp
+     stream_formatter = logging.Formatter('%(message)s')
+     sh = logging.StreamHandler()
+     sh.setFormatter(stream_formatter)
+     logger.addHandler(sh)
+
+     return logger
+
+ def write_versions_txt(output_dir):
+     import sys
+     import importlib.metadata
+     def get_tool_version(cmd):
+         try:
+             result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
+             return result.stdout.strip().split('\n')[0]
+         except Exception:
+             return "Error retrieving version"
+
+     def get_package_version(pkg_name):
+         try:
+             return importlib.metadata.version(pkg_name)
+         except importlib.metadata.PackageNotFoundError:
+             return "not installed"
+
+     os.makedirs(output_dir, exist_ok=True)
+     with open(os.path.join(output_dir, "versions.txt"), "w") as vf:
+         vf.write(f"Plasmidhub version: {VERSION}\n")
+         vf.write(f"Python version: {sys.version.split()[0]}\n")
+         vf.write(f"FastANI version: {get_tool_version(['fastANI', '--version'])}\n")
+         vf.write(f"ABRicate version: {get_tool_version(['abricate', '--version'])}\n\n")
+
+         vf.write("Python package versions:\n")
+         for pkg in ["biopython", "pandas", "networkx", "matplotlib", "python-louvain", "numpy"]:
+             vf.write(f"{pkg}: {get_package_version(pkg)}\n")
+
+ def main():
+     parser = argparse.ArgumentParser(
+         prog="plasmidhub",
+         description=(
+             "SYNOPSIS\n"
+             " Plasmidhub: Bioinformatic Tool for Plasmid Network Analysis\n\n"
+             " Plasmidhub constructs a similarity-based network from plasmid FASTA files.\n"
+             " It uses FastANI to calculate pairwise ANI with user-defined parameters,\n"
+             " clusters the plasmids, and visualizes the network.\n"
+             " Includes optional annotation with ABRicate to identify resistance and virulence genes.\n"
+         ),
+         epilog=(
+             "DOCUMENTATION \n"
+             " https://github.com/BALINTESBL/plasmidhub \n\n"
+             "Example:\n"
+             " plasmidhub path/to/my/plasmid/FASTA/files --fragLen 1000 --kmer 14 --coverage_threshold 0.5 --ani_threshold 95 --min_cluster_size 4 --plot_k 2.0 3.0 -t 32\n"
+             " plasmidhub --plot_only path/to/my/results --plot_k 3 3 --plot_node_color white --plot_node_size 500 --plot_node_shape s -t 32\n\n"
+             "Developed by Dr. Balint Timmer, Institute of Metagenomics, University of Debrecen\n"
+             "Version 1.0.0\n"
+             "If you are using Plasmidhub, please reference: https://github.com/BALINTESBL/plasmidhub"
+         ),
+         formatter_class=argparse.RawTextHelpFormatter
+     )
+
+     # Positional argument
+     parser.add_argument("input_dir", nargs="?", help="Path to plasmid FASTA files directory")
+
+     parser.add_argument("-v", "--version", action="version", version=f"%(prog)s {VERSION}",
+                         help="Show program's version number and exit")
+
+     # ANI
+     ani_group = parser.add_argument_group("ANI")
+     ani_group.add_argument("--fragLen", type=int, default=1000, help="FastANI fragment length (default: 1000)")
+     ani_group.add_argument("--kmer", type=int, default=14, help="FastANI kmer size (default: 14)")
+     ani_group.add_argument("--coverage_threshold", type=float, default=0.5, help="Coverage threshold fraction (default: 0.5)")
+     ani_group.add_argument("--ani_threshold", type=float, default=95.0, help="ANI threshold (default: 95.0)")
+
+     # CLUSTER
+     cluster = parser.add_argument_group("CLUSTER")
+     cluster.add_argument("--cluster_off", action="store_true", help="Disable clustering step")
+     cluster.add_argument("--min_cluster_size", type=int, default=3,
+                          help="Minimum plasmid count for final clusters (default: 3)")
+
+     # ABRicate
+     abricate_group = parser.add_argument_group("ABRicate")
+     abricate_group.add_argument("--skip_abricate", action="store_true", help="Skip ABRicate analysis step")
+     abricate_group.add_argument(
+         "--abricate_dbs",
+         nargs="+",
+         metavar="DB",
+         help=(
+             "List of ABRicate databases to run (default: plasmidfinder, card, vfdb).\n"
+             "Available databases:\n"
+             " resfinder\n"
+             " megares\n"
+             " vfdb\n"
+             " card\n"
+             " argannot\n"
+             " ecoli_vf\n"
+             " plasmidfinder\n"
+             " ncbi\n"
+             " ecoh"
+         )
+     )
+
+     # PLOT
+     plot = parser.add_argument_group("PLOT")
+     plot.add_argument("--plot_k", nargs=2, type=float, metavar=('MIN_K', 'MAX_K'),
+                       help="Generate network visualizations.\nSpecify minimum and maximum k (e.g.: --plot_k 1.5 3.0)")
+     plot.add_argument("--plot_skip", action="store_true", help="Skip network visualization step")
+     plot.add_argument("--plot_only", type=str, metavar="DIR",
+                       help="Generate plots only from existing files, without running the whole pipeline. Figure parameters can be adjusted:")
+     plot.add_argument("--plot_edge_width", nargs=2, type=float, metavar=('MIN_WIDTH', 'MAX_WIDTH'),
+                       default=[0.2, 2.0], help="Minimum and maximum edge widths (default: 0.2 2.0)")
+     plot.add_argument("--plot_node_size", type=int, default=900, help="Node size (default: 900)")
+     plot.add_argument("--plot_node_shape", type=str, default='o', help="Node shape (e.g.: 'o', 's' (square), '>' (triangle), '^', '*' etc., default: o)")
+     plot.add_argument("--plot_node_color", type=str, help="Node color (e.g.: 'white', 'blue', default: 'grey')")
+     plot.add_argument("--plot_figsize", nargs=2, type=int, metavar=('WIDTH', 'HEIGHT'), default=[25, 25], help="Figure size in inches (default: 25 25)")
+     plot.add_argument("--plot_iterations", type=int, default=100, help="Number of iterations (spring layout, default: 100)")
+
+     # THREADS
+     threads = parser.add_argument_group("THREADS")
+     threads.add_argument("-t", "--threads", type=int, default=4, metavar="", help="Number of threads to use (default: 4)")
+
+     args = parser.parse_args()
+
+
+     # === Plot-only mode === #
+
+     if args.plot_only:
+         import sys
+         if not os.path.exists(args.plot_only):
+             parser.error("The path provided to --plot_only does not exist.")
+
+         cmd = [sys.executable, os.path.join(os.path.dirname(__file__), "plot_only.py"),
+                "--results_dir", args.plot_only]
+
+         if args.plot_k:
+             cmd.extend(["--plot_k", str(args.plot_k[0]), str(args.plot_k[1])])
+         if args.plot_edge_width:
+             cmd += ["--min_edge_width", str(args.plot_edge_width[0]), "--max_edge_width", str(args.plot_edge_width[1])]
+         if args.plot_node_size:
+             cmd += ["--node_size", str(args.plot_node_size)]
+         if args.plot_node_color:
+             cmd += ["--node_color", args.plot_node_color]
+         if args.plot_node_shape:
+             cmd += ["--node_shape", args.plot_node_shape]
+         if args.plot_figsize:
+             cmd += ["--figsize", str(args.plot_figsize[0]), str(args.plot_figsize[1])]
+         if args.plot_iterations:
+             cmd += ["--iterations", str(args.plot_iterations)]
+
+         subprocess.run(cmd)
+         return
+
+     ## === Full pipeline mode === ##
+     if not args.input_dir:
+         parser.error("input_dir is required unless --plot_only is used.")
+
+     if not args.plot_only:
+         if (
+             args.plot_node_size != 900 or
+             args.plot_edge_width != [0.2, 2.0] or
+             args.plot_node_shape != 'o' or
+             args.plot_figsize != [25, 25] or
+             args.plot_iterations != 100
+         ):
+             parser.error(
+                 "Plot customization options (--plot_node_size, --plot_edge_width, etc.) "
+                 "can only be used with --plot_only."
+             )
+
+     # Create results directory inside input_dir if it doesn't exist
+     results_dir = os.path.join(args.input_dir, "results")
+     os.makedirs(results_dir, exist_ok=True)
+
+     # Setup logger now that we have results_dir
+     log_file = os.path.join(results_dir, "run.log")
+     logger = setup_logging(log_file)
+     logger.info(f"Starting Plasmidhub v{VERSION}")
+
+     logger.info(f"Input directory: {args.input_dir}")
+
+     write_versions_txt(results_dir)
+
+     # Step 1-4: preprocess (validate, list files, count, size)
+     plasmid_list = preprocessing.validate_and_list_plasmids(args.input_dir)
+     logger.info(f"Number of plasmids: {len(plasmid_list)}")
+     # Write output files inside results_dir
+     preprocessing.write_plasmid_list(plasmid_list, output_file=os.path.join(results_dir, "Plasmid_list.txt"))
+     preprocessing.write_plasmid_sizes(plasmid_list, output_file=os.path.join(results_dir, "Plasmid_sizes.txt"))
+
+     # Step 5-6: run FastANI - output to results_dir
+     logger.info("Running FastANI...")
+
+     cwd = os.getcwd()
+     os.chdir(args.input_dir) # switch to directory with FASTA files
+
+     try:
+         ani.run_fastani(
+             os.path.join(results_dir, "Plasmid_list.txt"),
+             fragLen=args.fragLen,
+             minFrag=0.001, # Hardcoded minimum fraction
+             kmer=args.kmer,
+             output_dir=results_dir,
+             threads=args.threads
+         )
+
+     finally:
+         os.chdir(cwd) # revert back to original working directory
+
+     # Step 6.5: Normalize plasmid names (strip paths)
+     filtering.strip_paths_in_fastani(os.path.join(results_dir, "fastani_raw_results.tsv"))
+     filtering.strip_paths_in_plasmid_list(os.path.join(results_dir, "Plasmid_list.txt"))
+
+     # Step 7: filter self comparisons
+     logger.info("Filtering self comparisons...")
+     filtering.filter_self_comparisons(
+         os.path.join(results_dir, "fastani_raw_results.tsv"),
+         os.path.join(results_dir, "fastani_raw_results_filtered.tsv"),
+     )
+
+     # Step 8: add sizes - input and output inside results_dir
+     logger.info("Adding plasmid sizes to ANI results...")
+     filtering.add_plasmid_sizes(
+         os.path.join(results_dir, "fastani_raw_results_filtered.tsv"),
+         os.path.join(results_dir, "Plasmid_sizes.txt"),
+         os.path.join(results_dir, "ANI_results_with_sizes.tsv"),
+     )
+
+     # Step 9-10: apply coverage and ANI threshold filters - input/output inside results_dir
+     logger.info("Applying coverage and ANI thresholds...")
+     filtering.apply_filters(
+         os.path.join(results_dir, "ANI_results_with_sizes.tsv"),
+         os.path.join(results_dir, "ANI_results_final.tsv"),
+         coverage_threshold=args.coverage_threshold,
+         ani_threshold=args.ani_threshold,
+         frag_len=args.fragLen,
+     )
+
+     # Step 11: build network - inputs inside results_dir
+     from plasmidhub import network_builder
+     logger.info("Building plasmid network...")
+     network_builder.build_network(
+         os.path.join(results_dir, "ANI_results_final.tsv"),
+         os.path.join(results_dir, "Plasmid_list.txt"),
+         results_dir
+     )
+
+     logger.info("Done!")
+
+     # Step 11.5: compute and save node stats
+     from plasmidhub import node_stats
+     logger.info("Computing node statistics...")
+     node_stats.compute_node_stats(results_dir)
+
+     logger.info("Done!")
+
+     # Step 12: clustering
+     if not args.cluster_off:
+         logger.info("Clustering plasmids...")
+         from plasmidhub import clustering
+         clustering.main(results_dir, args.min_cluster_size)
+
+         # Generate plasmid-cluster mapping file
+         import glob
+         def generate_plasmid_cluster_mapping(results_dir):
+             mapping_file = os.path.join(results_dir, "plasmid_cluster_mapping.txt")
+             with open(mapping_file, 'w') as outfile:
+                 for filepath in glob.glob(os.path.join(results_dir, "cluster_*.txt")):
+                     if os.path.basename(filepath) == "cluster_list.txt":
+                         continue # Skip the summary file
+                     cluster_name = os.path.basename(filepath).replace(".txt", "")
+                     with open(filepath) as f:
+                         for line in f:
+                             plasmid = line.strip()
+                             if plasmid:
+                                 outfile.write(f"{plasmid}\t{cluster_name}\n")
+
+         generate_plasmid_cluster_mapping(results_dir)
+     else:
+         logger.info("Clustering skipped due to --cluster_off")
+
+     logger.info("Done!")
+
+     # Default databases
+     default_dbs = ['plasmidfinder', 'card', 'vfdb']
+
+     if not args.skip_abricate:
+         logger.info("Running ABRicate annotation...")
+
+         # If user specified databases, use them; otherwise use the default
+         abricate_dbs = args.abricate_dbs if args.abricate_dbs else default_dbs
+
+         abricate_results_dir = os.path.join(results_dir, "abricate_results")
+         abricate.run_abricate_bulk(
+             input_dir=args.input_dir,
+             results_dir=abricate_results_dir,
+             db_list=abricate_dbs,
+             threads=args.threads
+         )
+     else:
+         logger.info("ABRicate skipped due to --skip_abricate")
+
+     # Network visualization
+     if not args.plot_skip:
+         logger.info("Generating network visualizations...")
+         from plasmidhub import plot
+         from plasmidhub.cluster_color import assign_cluster_colors
+
+         if not args.cluster_off:
+             mapping_file = os.path.join(results_dir, "plasmid_cluster_mapping.txt")
+             assign_cluster_colors(results_dir, mapping_file)
+
+         json_file = os.path.join(results_dir, "network.json")
+         G = plot.load_network_from_json(json_file)
+
+         if args.plot_k:
+             min_k = int(args.plot_k[0])
+             max_k = int(args.plot_k[1])
+         else:
+             min_k = 3
+             max_k = 3
+
+         plot.run_visualizations(results_dir, min_k, max_k + 1)
+
+     else:
+         logger.info("Network visualization skipped due to --plot_skip")
+
+     move_files_to_subdirs(results_dir)
+
+     logger.info("Done!")
+
+
+ def move_files_to_subdirs(results_dir):
+     # Create subdirectories
+     plots_dir = os.path.join(results_dir, "plots")
+     stats_dir = os.path.join(results_dir, "statistics")
+     os.makedirs(plots_dir, exist_ok=True)
+     os.makedirs(stats_dir, exist_ok=True)
+
+     # Move plots
+     for plot_file in glob.glob(os.path.join(results_dir, "network_k_*.pdf")) + \
+                      glob.glob(os.path.join(results_dir, "network_k_*.svg")):
+         shutil.move(plot_file, plots_dir)
+
+     # Move stats files
+     stat_files = [
+         "degree_centrality.csv",
+         "betweenness_centrality.csv",
+         "node_degrees.csv",
+         "network_metrics.csv",
+         "community_partition.json",
+         "Node_stats.csv"
+     ]
+     for fname in stat_files:
+         full_path = os.path.join(results_dir, fname)
+         if os.path.exists(full_path):
+             shutil.move(full_path, stats_dir)
+
+ if __name__ == '__main__':
+     main()
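For reference, a minimal sketch of driving this pipeline from Python rather than the shell, assuming the wheel exposes a `plasmidhub` console script matching the `prog` name above (the entry-point declaration itself is not part of this diff). The input directory is a placeholder; the flags and defaults are the ones defined in `main()`, and the output paths are the ones the code above writes under `<input_dir>/results`.

import subprocess
from pathlib import Path

fasta_dir = Path("path/to/plasmid_fastas")  # placeholder input directory

# Equivalent to the first epilog example, with the documented defaults spelled out.
subprocess.run(
    ["plasmidhub", str(fasta_dir),
     "--fragLen", "1000", "--kmer", "14",
     "--coverage_threshold", "0.5", "--ani_threshold", "95",
     "--min_cluster_size", "3", "-t", "8"],
    check=True,
)

results = fasta_dir / "results"
print((results / "run.log").exists())                        # pipeline log
print((results / "ANI_results_final.tsv").exists())          # filtered ANI table
print((results / "statistics" / "Node_stats.csv").exists())  # moved there by move_files_to_subdirs()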
plasmidhub/network_builder.py ADDED
@@ -0,0 +1,202 @@
+ import numpy as np
+ import networkx as nx
+ import pandas as pd
+ import community.community_louvain as community_louvain
+ from itertools import combinations
+ import json
+ import random
+ import os
+ import logging
+ logger = logging.getLogger(__name__)
+
+ def load_and_process_fastani_data(file_path, results_dir):
+     """Load and process FastANI data to create a similarity matrix."""
+     with open(file_path, 'r') as f:
+         lines = f.readlines()
+
+     if lines[0].startswith('Query'):
+         lines = lines[1:]
+
+     from io import StringIO
+     data = StringIO(''.join(lines))
+     df = pd.read_csv(data, sep="\t", header=None)
+
+     df.columns = ['Query', 'Reference', 'ANI', 'Matching_Frags_Query', 'Matching_Frags_Ref',
+                   'Query_size', 'Reference_size', 'Matching_Frags_Query_bp', 'Matching_Frags_Ref_bp']
+
+     df = df[['Query', 'Reference', 'ANI', 'Matching_Frags_Query', 'Matching_Frags_Ref']]
+
+     df['ANI'] = pd.to_numeric(df['ANI'], errors='coerce')
+     df['Matching_Frags_Query'] = pd.to_numeric(df['Matching_Frags_Query'], errors='coerce')
+     df['Matching_Frags_Ref'] = pd.to_numeric(df['Matching_Frags_Ref'], errors='coerce')
+
+     labels = pd.concat([df['Query'], df['Reference']]).unique()
+     similarity_matrix = pd.DataFrame(0, index=labels, columns=labels, dtype=float)
+
+     for _, row in df.iterrows():
+         query = row['Query']
+         reference = row['Reference']
+         ani = row['ANI']
+         if pd.notna(ani) and pd.notna(row['Matching_Frags_Query']) and pd.notna(row['Matching_Frags_Ref']):
+             match_frac = min(row['Matching_Frags_Query'] / row['Matching_Frags_Ref'],
+                              row['Matching_Frags_Ref'] / row['Matching_Frags_Query'])
+
+             # Use both ANI and matching fragment ratio to determine the weight
+             weight = ani * match_frac
+
+             # Use the lower value if a weight is already present
+             existing_weight = similarity_matrix.loc[query, reference]
+             if existing_weight > 0:
+                 weight = min(weight, existing_weight)
+
+             # Populate similarity matrix
+             similarity_matrix.loc[query, reference] = weight
+             similarity_matrix.loc[reference, query] = weight
+
+     np.fill_diagonal(similarity_matrix.values, 1)
+     similarity_matrix.to_csv(os.path.join(results_dir, "similarity_matrix.csv"), index=True)
+
+     return labels, similarity_matrix.values
+
+ def create_network(labels, similarity_matrix, plasmid_list_path, results_dir):
+     G = nx.Graph()
+
+     for label in labels:
+         if label not in G:
+             G.add_node(label)
+
+     n = len(labels)
+     for i in range(n):
+         for j in range(i + 1, n):
+             similarity = similarity_matrix[i, j]
+             if similarity > 5 and similarity <= 100:
+                 weight = similarity
+                 if not G.has_edge(labels[i], labels[j]):
+                     G.add_edge(labels[i], labels[j], weight=weight)
+
+     # Save edge list after edges are added
+     nx.write_weighted_edgelist(G, os.path.join(results_dir, "network_edges.txt"))
+
+     with open(plasmid_list_path, 'r') as f:
+         all_plasmids = [line.strip() for line in f.readlines()]
+
+     for plasmid in all_plasmids:
+         if plasmid not in G.nodes:
+             G.add_node(plasmid, type='singleton')
+
+     singleton_positions = {}
+     for plasmid in all_plasmids:
+         if plasmid not in G.nodes:
+             singleton_positions[plasmid] = (random.uniform(-1, 1), random.uniform(-1, 1))
+
+     logger.info("Total nodes in the network: %d", len(G.nodes()))
+     logger.info("Total edges in the network: %d", G.number_of_edges())
+
+     nodes_to_remove = [node for node in G.nodes() if node not in all_plasmids]
+     G.remove_nodes_from(nodes_to_remove)
+
+     nx.write_gml(G, os.path.join(results_dir, "network.gml"))
+
+     G_json = nx.cytoscape_data(G)
+     with open(os.path.join(results_dir, "network.json"), "w") as f:
+         json.dump(G_json, f)
+
+     return G
+
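A small worked example of the edge weights used above, with invented numbers: load_and_process_fastani_data() scales the ANI by the smaller of the two matching-fragment ratios (keeping the lower value if the reciprocal comparison disagrees), and create_network() only adds an edge when the resulting similarity falls in the (5, 100] band.

# Hypothetical FastANI comparison of plasmids A and B (values invented for illustration).
ani = 96.3
frags_query, frags_ref = 40, 50

match_frac = min(frags_query / frags_ref, frags_ref / frags_query)   # 0.8
weight = ani * match_frac                                            # 77.04 -> edge kept (5 < w <= 100)

# The reciprocal row (B vs A) may give a different value; the lower one is stored.
weight_ba = 94.1 * min(38 / 50, 50 / 38)                             # 71.516
print(min(weight, weight_ba))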
+ def detect_communities(G):
+     partition = community_louvain.best_partition(G, weight='weight')
+     return partition
+
+ def calculate_subcluster_distances(G, partition, results_dir):
+     subcluster_combinations = list(combinations(set(partition.values()), 2))
+     results = []
+
+     for cluster1, cluster2 in subcluster_combinations:
+         edges_between_clusters = []
+         for node1, node2 in combinations(G.nodes(), 2):
+             if partition[node1] == cluster1 and partition[node2] == cluster2 and G.has_edge(node1, node2):
+                 edges_between_clusters.append(G[node1][node2]['weight'])
+
+         mean_distance = np.mean(edges_between_clusters) if edges_between_clusters else np.nan
+         median_distance = np.median(edges_between_clusters) if edges_between_clusters else np.nan
+         results.append((cluster1, cluster2, mean_distance, median_distance))
+
+     df_results = pd.DataFrame(results, columns=["Subcluster1", "Subcluster2", "Mean Distance", "Median Distance"])
+     df_results.to_csv(os.path.join(results_dir, "subcluster_distances.tsv"), sep="\t", index=False)
+     return df_results
+
+ def save_plasmids_by_subcluster(partition, results_dir):
+     subcluster_plasmids = {cluster: [] for cluster in set(partition.values())}
+     for node, cluster in partition.items():
+         subcluster_plasmids[cluster].append(node)
+
+     for cluster, plasmids in subcluster_plasmids.items():
+         with open(os.path.join(results_dir, f"subcluster_{cluster}_plasmids.txt"), "w") as f:
+             f.write("\n".join(plasmids))
+
+ def calculate_network_metrics(G, partition):
+     num_edges = G.number_of_edges()
+     num_nodes = G.number_of_nodes()
+     num_possible_edges = num_nodes * (num_nodes - 1) / 2
+     connectance = num_edges / num_possible_edges
+     modularity = community_louvain.modularity(partition, G, weight='weight')
+
+     def calculate_nestedness():
+         adj_matrix = nx.to_numpy_array(G)
+         nestedness_value = 0
+         count = 0
+         for i in range(num_nodes):
+             for j in range(i + 1, num_nodes):
+                 if adj_matrix[i, j] > 0:
+                     ki = np.sum(adj_matrix[i, :])
+                     kj = np.sum(adj_matrix[j, :])
+                     nestedness_value += 1 / min(ki, kj)
+                     count += 1
+         return nestedness_value / count if count > 0 else np.nan
+
+     nestedness = calculate_nestedness()
+     return connectance, modularity, nestedness
+
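To make calculate_network_metrics() concrete, a tiny hand-checked example on a toy graph (values invented): connectance is the fraction of possible edges that are present, and the nestedness term averages 1/min(ki, kj) over connected pairs, where ki and kj are row sums of the weighted adjacency matrix, as in the code above.

import networkx as nx

# Toy graph: three plasmids, two weighted edges.
G = nx.Graph()
G.add_edge("A", "B", weight=80.0)
G.add_edge("B", "C", weight=60.0)

n, e = G.number_of_nodes(), G.number_of_edges()
connectance = e / (n * (n - 1) / 2)          # 2 / 3 ~ 0.667

adj = nx.to_numpy_array(G)                   # weighted adjacency, node order A, B, C
k = adj.sum(axis=1)                          # weighted degrees: 80, 140, 60
terms = [1 / min(k[i], k[j])
         for i in range(n) for j in range(i + 1, n) if adj[i, j] > 0]
nestedness = sum(terms) / len(terms)         # (1/80 + 1/60) / 2 ~ 0.0146

print(round(connectance, 3), round(nestedness, 4))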
+ def calculate_node_degrees(G, results_dir):
+     degrees = dict(G.degree(weight='weight'))
+     df_degrees = pd.DataFrame(list(degrees.items()), columns=["Node", "Degree"])
+     df_degrees.to_csv(os.path.join(results_dir, "node_degrees.csv"), index=False)
+     return df_degrees
+
+ def calculate_betweenness_centrality(G, results_dir):
+     betweenness = nx.betweenness_centrality(G, weight='weight', normalized=True)
+     df_betweenness = pd.DataFrame(list(betweenness.items()), columns=["Node", "Betweenness Centrality"])
+     df_betweenness.to_csv(os.path.join(results_dir, "betweenness_centrality.csv"), index=False)
+     return df_betweenness
+
+ def calculate_degree_centrality(G, results_dir):
+     degree_centrality = nx.degree_centrality(G)
+     df_degree_centrality = pd.DataFrame(list(degree_centrality.items()), columns=["Node", "Degree Centrality"])
+     df_degree_centrality.to_csv(os.path.join(results_dir, "degree_centrality.csv"), index=False)
+     return df_degree_centrality
+
+ def build_network(file_path="ANI_results_final.tsv", plasmid_list_path="Plasmid_list.txt", results_dir="results"):
+     os.makedirs(results_dir, exist_ok=True)
+
+     labels, similarity_matrix = load_and_process_fastani_data(file_path, results_dir)
+     G = create_network(labels, similarity_matrix, plasmid_list_path, results_dir)
+     partition = detect_communities(G)
+     df_results = calculate_subcluster_distances(G, partition, results_dir)
+     save_plasmids_by_subcluster(partition, results_dir)
+
+     connectance, modularity, nestedness = calculate_network_metrics(G, partition)
+     metrics = {
+         "Connectance": [connectance],
+         "Modularity": [modularity],
+         "Nestedness": [nestedness]
+     }
+
+     df_metrics = pd.DataFrame(metrics)
+     df_metrics.to_csv(os.path.join(results_dir, "network_metrics.csv"), index=False)
+
+     df_degrees = calculate_node_degrees(G, results_dir)
+     df_betweenness = calculate_betweenness_centrality(G, results_dir)
+     df_degree_centrality = calculate_degree_centrality(G, results_dir)
+
+     with open(os.path.join(results_dir, "community_partition.json"), "w") as f:
+         json.dump(partition, f)
plasmidhub/node_stats.py ADDED
@@ -0,0 +1,69 @@
+ import networkx as nx
+ import pandas as pd
+ import os
+ import logging
+ logger = logging.getLogger(__name__)
+
+ def compute_node_stats(results_dir):
+     gml_path = os.path.join(results_dir, "network.gml")
+     if not os.path.exists(gml_path):
+         logger.error(f"Could not find network file at: {gml_path}")
+
+     G = nx.read_gml(gml_path)
+
+     # Compute stats
+     betweenness = nx.betweenness_centrality(G, normalized=True)
+     weighted_betweenness = nx.betweenness_centrality(G, weight="weight", normalized=True)
+
+     closeness = nx.closeness_centrality(G)
+     weighted_closeness = nx.closeness_centrality(G, distance="weight")
+
+     degree_centrality = nx.degree_centrality(G)
+     weighted_degree_centrality = {
+         node: sum(data["weight"] for _, _, data in G.edges(node, data=True))
+         for node in G.nodes()
+     }
+
+     degree = dict(G.degree())
+     weighted_degree = {
+         node: sum(data["weight"] for _, _, data in G.edges(node, data=True))
+         for node in G.nodes()
+     }
+
+     max_degree = max(degree.values()) if degree else 1
+     nested_contribution = {
+         node: deg / max_degree
+         for node, deg in degree.items()
+     }
+
+     max_weighted_degree = max(weighted_degree.values()) if weighted_degree else 1
+     weighted_nested_contribution = {
+         node: wdeg / max_weighted_degree
+         for node, wdeg in weighted_degree.items()
+     }
+
+     num_nodes = len(G.nodes())
+     normalized_degree = {
+         node: deg / (num_nodes - 1)
+         for node, deg in degree.items()
+     }
+
+     # Combine into DataFrame
+     df = pd.DataFrame({
+         "Node": list(G.nodes()),
+         "Betweenness": pd.Series(betweenness),
+         "Weighted_Betweenness": pd.Series(weighted_betweenness),
+         "Closeness": pd.Series(closeness),
+         "Weighted_Closeness": pd.Series(weighted_closeness),
+         "Degree_Centrality": pd.Series(degree_centrality),
+         "Weighted_Degree_Centrality": pd.Series(weighted_degree_centrality),
+         "Nested_Contribution": pd.Series(nested_contribution),
+         "Weighted_Nested_Contribution": pd.Series(weighted_nested_contribution),
+         "Normalised_Degree": pd.Series(normalized_degree),
+     })
+
+     # Save
+     output_path = os.path.join(results_dir, "Node_stats.csv")
+     df.to_csv(output_path, index=False)
+
+     logger.info(f"Node statistics saved.")