PyPI - plasmidhub - Versions diffs - 1.0.0__tar.gz - Mend

plasmidhub 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of plasmidhub might be problematic. Click here for more details.

Files changed (24) hide show

plasmidhub-1.0.0/LICENSE +24 -0
plasmidhub-1.0.0/PKG-INFO +193 -0
plasmidhub-1.0.0/README.md +145 -0
plasmidhub-1.0.0/plasmidhub/__init__.py +1 -0
plasmidhub-1.0.0/plasmidhub/abricate.py +46 -0
plasmidhub-1.0.0/plasmidhub/ani.py +29 -0
plasmidhub-1.0.0/plasmidhub/cluster_color.py +48 -0
plasmidhub-1.0.0/plasmidhub/clustering.py +143 -0
plasmidhub-1.0.0/plasmidhub/filtering.py +54 -0
plasmidhub-1.0.0/plasmidhub/main.py +381 -0
plasmidhub-1.0.0/plasmidhub/network_builder.py +202 -0
plasmidhub-1.0.0/plasmidhub/node_stats.py +69 -0
plasmidhub-1.0.0/plasmidhub/plot.py +169 -0
plasmidhub-1.0.0/plasmidhub/plot_only.py +153 -0
plasmidhub-1.0.0/plasmidhub/preprocessing.py +48 -0
plasmidhub-1.0.0/plasmidhub.egg-info/PKG-INFO +193 -0
plasmidhub-1.0.0/plasmidhub.egg-info/SOURCES.txt +22 -0
plasmidhub-1.0.0/plasmidhub.egg-info/dependency_links.txt +1 -0
plasmidhub-1.0.0/plasmidhub.egg-info/entry_points.txt +2 -0
plasmidhub-1.0.0/plasmidhub.egg-info/requires.txt +7 -0
plasmidhub-1.0.0/plasmidhub.egg-info/top_level.txt +1 -0
plasmidhub-1.0.0/pyproject.toml +36 -0
plasmidhub-1.0.0/setup.cfg +4 -0
plasmidhub-1.0.0/setup.py +34 -0

plasmidhub-1.0.0/LICENSE ADDED Viewed

@@ -0,0 +1,24 @@
+MIT License
+Copyright (c) 2025 Dr. Balint Timmer
+Institute of Metagenomics, University of Debrecen, Debrecen, Hungary
+Institute of Medical Microbiology, Faculty of Medicine, University of Pecs, Pecs, Hungary
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense,
+and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

plasmidhub-1.0.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,193 @@
+Metadata-Version: 2.1
+Name: plasmidhub
+Version: 1.0.0
+Summary: A command-line tool for plasmid clustering, analysis, and visualization.
+Home-page: https://github.com/YOUR_USERNAME/Plasmidhub
+Author: Dr. Balint Timmer
+Author-email: "Dr. Balint Timmer" <timmer.balint@med.unideb.hu>
+License: MIT License
+        Copyright (c) 2025 Dr. Balint Timmer
+        Institute of Metagenomics, University of Debrecen, Debrecen, Hungary
+        Institute of Medical Microbiology, Faculty of Medicine, University of Pecs, Pecs, Hungary
+        Permission is hereby granted, free of charge, to any person obtaining a copy
+        of this software and associated documentation files (the "Software"), to deal
+        in the Software without restriction, including without limitation the rights
+        to use, copy, modify, merge, publish, distribute, sublicense,
+        and to permit persons to whom the Software is
+        furnished to do so, subject to the following conditions:
+        The above copyright notice and this permission notice shall be included in all
+        copies or substantial portions of the Software.
+        THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+        IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+        FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+        AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+        LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+        OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+        SOFTWARE.
+Keywords: plasmid,bioinformatics,network,clustering,AMR,virulence,plasmid network
+Classifier: Programming Language :: Python :: 3
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Classifier: Intended Audience :: Science/Research
+Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
+Requires-Python: >=3.8
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: biopython>=1.83
+Requires-Dist: pandas>=2.0
+Requires-Dist: networkx>=3.1
+Requires-Dist: matplotlib>=3.7
+Requires-Dist: python-louvain>=0.16
+Requires-Dist: numpy>=1.24
+Requires-Dist: scipy>=1.8
+ <img src="https://img.shields.io/github/license/BALINTESBL/plasmidhub" alt="License"> <img src="https://img.shields.io/pypi/v/plasmidhub" alt="PyPI">  ![Build Status](https://github.com/BALINTESBL/plasmidhub/actions/workflows/tests.yml/badge.svg)
+# Plasmidhub
+Plasmidhub is a free and open-source command-line tool for comprehensive plasmid network analysis based on nucleotide sequence similarity. It enables researchers to cluster plasmids and identify genetically related groups using a dynamic, database-independent approach. Plasmidhub's approach:
+* Is applicable to any plasmid
+* Provides an unambiguous classification
+* Considers the whole sequence of the plasmids
+Network visualizations, stats and data are provided for further analysis.
+## Download and Installation
+PlasmidHub can be installed easily via PyPI, Bioconda, or directly from GitHub.
+### Pip
+```
+pip install plasmidhub
+```
+**Note:** It's highly recommended to use a virtual environment or conda environment.
+Recommended environment setup:
+```
+conda create -n plasmidhub python=3.8
+conda activate plasmidhub
+```
+### Bioconda
+If you use Conda for environment management:
+```
+conda install -c bioconda plasmidhub
+```
+Make sure you have the Bioconda channels configured. If not, configure them with:
+```
+conda config --add channels defaults
+conda config --add channels bioconda
+conda config --add channels conda-forge
+```
+### GitHub
+To get the latest version:
+```
+git clone https://github.com/BALINTESBL/plasmidhub.git
+cd plasmidhub
+pip install .
+```
+### Dependencies
+This tool requires the following external software to be installed:
+- [FastANI](https://github.com/ParBLiSS/FastANI)
+- [ABRicate](https://github.com/tseemann/abricate)
+## Inputs
+Plasmidhub requires plasmid FASTA files (.fna or .fa or .fasta). Your FASTA files need to be placed in one directory. Ideally, there are no other files in the directory.
+## Usage
+Perform plasmid network analysis with default settings by defining only the directory path of your plasmid FASTA files! Or, you can also adjust parameters.
+Example usage:
+```
+% plasmidhub path/to/my/plasmid/FASTA/files --fragLen 1000 --kmer 14 --coverage_threshold 0.5 --ani_threshold 95 --min_cluster_size 4 --plot_k 2.0 3.0 -t 32
+```
+This command will:
+* Compute pairwise ANI using FastANI
+* Build a plasmid similarity network
+* Save network metrics and statistics (results/statistics)
+* Cluster plasmids
+* Annotate resistance and virulence genes with ABRicate (results/abricate_results)
+* Generate network visualizations (results/plots)
+### Key Options
+| Category       | Flag                   | Description                             | Default                   |
+| -------------- | ---------------------- | --------------------------------------- | ------------------------- |
+| **Input**      | (positional argument)  | Path to folder with plasmid FASTA files | –                         |
+| **FastANI**    | `--fragLen`            | Fragment length                         | `1000`                    |
+|                | `--kmer`               | K-mer size                              | `14`                      |
+|                | `--coverage_threshold` | Minimum proportion of the plasmid lengths| `0.5`                     |
+|                |                        |  covered by the matching fragments      |                           |
+|                | `--ani_threshold`      | Minimum ANI score (after applying       | `95.0`                    |
+|                |                        |  coverage threshold)                    |                           |
+| **Clustering** | `--cluster_off`        | Disable clustering                      | –                         |
+|                | `--min_cluster_size`   | Minimum cluster size (plasmids)         | `3`                       |
+| **ABRicate**   | `--skip_abricate`      | Skip annotation step                    | –                         |
+|                | `--abricate_dbs`       | Databases to use e.g.:                  | `plasmidfinder card vfdb` |
+|                |                        |  --abricate_dbs ncbi ecoli_vf           |                           |
+| **Plotting**   | `--plot_k`             | Range of k values                       |`3` `3`                    |
+|                | `--plot_skip`          | Skips plotting                          |                           |
+| **Threads**    | `-t` or `--threads`    | Number of threads                       | `4`                       |
+### Plot-only mode
+In plot-only mode, network visualizations can be generated directly from existing networks by using the `--plot_only` flag and specifying the directory path. In this mode, several parameters can be adjusted.
+Example usage:
+```
+% plasmidhub --plot_only path/to/my/results  --plot_k 3 5 --plot_node_color blue --plot_node_size 500 --plot_node_shape s --plot_figsize 20 20 -t 32
+```
+| **Plotting**   | Flag                   | Description                             | Default                   |
+| -------------- | ---------------------- | --------------------------------------- | ------------------------- |
+|                | `--plot_node_size`     | Size of nodes                           | `900`                     |
+|                | `--plot_node_shape`    | Shape of nodes (`o`, `s`, `^`, etc.)    | `o` (circle)              |
+|                | `--plot_edge_width`    | Min/max edge width                      | `0.2 2.0`                 |
+|                | `--plot_figsize`       | Figure size in inches                   | `25 25`                   |
+|                | `--plot_iterations`    | Spring layout iterations                | `100`                     |
+Node shapes:
+| Marker | Description                |
+| ------ | -------------------------- |
+| `'o'`  | Circle                     |
+| `'s'`  | Square                     |
+| `'^'`  | Upward-pointing triangle   |
+| `'v'`  | Downward-pointing triangle |
+| `'>'`  | Right-pointing triangle    |
+| `'<'`  | Left-pointing triangle     |
+| `'D'`  | Diamond                    |
+| `'d'`  | Thin diamond               |
+| `'p'`  | Pentagon                   |
+| `'h'`  | Hexagon 1                  |
+| `'H'`  | Hexagon 2                  |
+|  `'*'` | Star                       |
+| `'+'`  | Plus                       |
+| `'x'`  | Cross                      |
+| `'X'`  | Filled X                   |
+Plots generated with Plasmidhub:
+<img width="1668" height="1668" alt="image" src="https://github.com/user-attachments/assets/afed18b8-6dbe-44b8-b539-23aa47b4bfb0" />
+## Overview
+Plasmidhub performs an all-vs-all comparison of input plasmid sequences using FastANI. FastANI results ("raw results") are filtered by coverage (the proportion of the full plasmid sequences that is covered by the matching fragments). The remaining pairs are filtered by the minimum ANI score. ANI scores are further weighted by the proportion of matching fragments, and data are sorted into a similarity matrix. The network is built from the similarity matrix, where:
+- **Nodes** represent plasmids
+- **Edges** represent genetic relatedness (weighted ANI)
+Within the network, communities are detected via the Louvain method (subclusters). Plasmid clusters are complete subgraphs (cliques) detected within the whole network. Clusters comprise highly similar or identical plasmids. If relevant and scientifically appropriate, plasmids of the same cluster may be considered equivalent. This approach is alignment-free, reference-free, database-independent, and uses a relative similarity-based system to overcome the limitations of database dependency (untypeable plasmids, multireplicon/multi-MOB plasmids, mosaic, hybrid plasmids, etc.).
+Network and node statistics are saved to a distinct directory for downstream analyses (connectance, modularity, nestedness, community partition, degree centrality, node degrees, betweenness, closeness, etc.).
+Resistance and virulence genes can be annotated via [ABRicate](https://github.com/tseemann/abricate). The abricate files are saved to a distinct subdirectory. By default, plasmidfinder, vfdb and card databases are used, but optionally other databases can be specified from the databases available with ABRicate.
+To generate custom visualizations, feel free to use and modify the *plot.py*.
+## Troubleshooting
+Users are welcome to report any issue or feedback related to Plasmidhub by posting a [Github issue](https://github.com/BALINTESBL/plasmidhub/issues).
+---
+Developed by **Dr. Bálint Timmer**
+*Institute of Metagenomics, University of Debrecen, Debrecen, Hungary*
+*Department of Medical Microbiology, University of Pécs Medical School, Pécs, Hungary*
+ <img width="33" height="33" alt="image" src="https://github.com/user-attachments/assets/bd9f17e9-e9ce-4edb-8319-ef0091c45f00" /> <img width="99" height="32.054" alt="image" src="https://github.com/user-attachments/assets/5f3d5b6b-cef6-478a-af66-614b2e2860b2" />
+Contact: [timmer.balint@med.unideb.hu](mailto:timmer.balint@med.unideb.hu) , [timmer.balint@pte.hu](mailto:timmer.balint@pte.hu)

plasmidhub-1.0.0/README.md ADDED Viewed

@@ -0,0 +1,145 @@
+ <img src="https://img.shields.io/github/license/BALINTESBL/plasmidhub" alt="License"> <img src="https://img.shields.io/pypi/v/plasmidhub" alt="PyPI">  ![Build Status](https://github.com/BALINTESBL/plasmidhub/actions/workflows/tests.yml/badge.svg)
+# Plasmidhub
+Plasmidhub is a free and open-source command-line tool for comprehensive plasmid network analysis based on nucleotide sequence similarity. It enables researchers to cluster plasmids and identify genetically related groups using a dynamic, database-independent approach. Plasmidhub's approach:
+* Is applicable to any plasmid
+* Provides an unambiguous classification
+* Considers the whole sequence of the plasmids
+Network visualizations, stats and data are provided for further analysis.
+## Download and Installation
+PlasmidHub can be installed easily via PyPI, Bioconda, or directly from GitHub.
+### Pip
+```
+pip install plasmidhub
+```
+**Note:** It's highly recommended to use a virtual environment or conda environment.
+Recommended environment setup:
+```
+conda create -n plasmidhub python=3.8
+conda activate plasmidhub
+```
+### Bioconda
+If you use Conda for environment management:
+```
+conda install -c bioconda plasmidhub
+```
+Make sure you have the Bioconda channels configured. If not, configure them with:
+```
+conda config --add channels defaults
+conda config --add channels bioconda
+conda config --add channels conda-forge
+```
+### GitHub
+To get the latest version:
+```
+git clone https://github.com/BALINTESBL/plasmidhub.git
+cd plasmidhub
+pip install .
+```
+### Dependencies
+This tool requires the following external software to be installed:
+- [FastANI](https://github.com/ParBLiSS/FastANI)
+- [ABRicate](https://github.com/tseemann/abricate)
+## Inputs
+Plasmidhub requires plasmid FASTA files (.fna or .fa or .fasta). Your FASTA files need to be placed in one directory. Ideally, there are no other files in the directory.
+## Usage
+Perform plasmid network analysis with default settings by defining only the directory path of your plasmid FASTA files! Or, you can also adjust parameters.
+Example usage:
+```
+% plasmidhub path/to/my/plasmid/FASTA/files --fragLen 1000 --kmer 14 --coverage_threshold 0.5 --ani_threshold 95 --min_cluster_size 4 --plot_k 2.0 3.0 -t 32
+```
+This command will:
+* Compute pairwise ANI using FastANI
+* Build a plasmid similarity network
+* Save network metrics and statistics (results/statistics)
+* Cluster plasmids
+* Annotate resistance and virulence genes with ABRicate (results/abricate_results)
+* Generate network visualizations (results/plots)
+### Key Options
+| Category       | Flag                   | Description                             | Default                   |
+| -------------- | ---------------------- | --------------------------------------- | ------------------------- |
+| **Input**      | (positional argument)  | Path to folder with plasmid FASTA files | –                         |
+| **FastANI**    | `--fragLen`            | Fragment length                         | `1000`                    |
+|                | `--kmer`               | K-mer size                              | `14`                      |
+|                | `--coverage_threshold` | Minimum proportion of the plasmid lengths| `0.5`                     |
+|                |                        |  covered by the matching fragments      |                           |
+|                | `--ani_threshold`      | Minimum ANI score (after applying       | `95.0`                    |
+|                |                        |  coverage threshold)                    |                           |
+| **Clustering** | `--cluster_off`        | Disable clustering                      | –                         |
+|                | `--min_cluster_size`   | Minimum cluster size (plasmids)         | `3`                       |
+| **ABRicate**   | `--skip_abricate`      | Skip annotation step                    | –                         |
+|                | `--abricate_dbs`       | Databases to use e.g.:                  | `plasmidfinder card vfdb` |
+|                |                        |  --abricate_dbs ncbi ecoli_vf           |                           |
+| **Plotting**   | `--plot_k`             | Range of k values                       |`3` `3`                    |
+|                | `--plot_skip`          | Skips plotting                          |                           |
+| **Threads**    | `-t` or `--threads`    | Number of threads                       | `4`                       |
+### Plot-only mode
+In plot-only mode, network visualizations can be generated directly from existing networks by using the `--plot_only` flag and specifying the directory path. In this mode, several parameters can be adjusted.
+Example usage:
+```
+% plasmidhub --plot_only path/to/my/results  --plot_k 3 5 --plot_node_color blue --plot_node_size 500 --plot_node_shape s --plot_figsize 20 20 -t 32
+```
+| **Plotting**   | Flag                   | Description                             | Default                   |
+| -------------- | ---------------------- | --------------------------------------- | ------------------------- |
+|                | `--plot_node_size`     | Size of nodes                           | `900`                     |
+|                | `--plot_node_shape`    | Shape of nodes (`o`, `s`, `^`, etc.)    | `o` (circle)              |
+|                | `--plot_edge_width`    | Min/max edge width                      | `0.2 2.0`                 |
+|                | `--plot_figsize`       | Figure size in inches                   | `25 25`                   |
+|                | `--plot_iterations`    | Spring layout iterations                | `100`                     |
+Node shapes:
+| Marker | Description                |
+| ------ | -------------------------- |
+| `'o'`  | Circle                     |
+| `'s'`  | Square                     |
+| `'^'`  | Upward-pointing triangle   |
+| `'v'`  | Downward-pointing triangle |
+| `'>'`  | Right-pointing triangle    |
+| `'<'`  | Left-pointing triangle     |
+| `'D'`  | Diamond                    |
+| `'d'`  | Thin diamond               |
+| `'p'`  | Pentagon                   |
+| `'h'`  | Hexagon 1                  |
+| `'H'`  | Hexagon 2                  |
+|  `'*'` | Star                       |
+| `'+'`  | Plus                       |
+| `'x'`  | Cross                      |
+| `'X'`  | Filled X                   |
+Plots generated with Plasmidhub:
+<img width="1668" height="1668" alt="image" src="https://github.com/user-attachments/assets/afed18b8-6dbe-44b8-b539-23aa47b4bfb0" />
+## Overview
+Plasmidhub performs an all-vs-all comparison of input plasmid sequences using FastANI. FastANI results ("raw results") are filtered by coverage (the proportion of the full plasmid sequences that is covered by the matching fragments). The remaining pairs are filtered by the minimum ANI score. ANI scores are further weighted by the proportion of matching fragments, and data are sorted into a similarity matrix. The network is built from the similarity matrix, where:
+- **Nodes** represent plasmids
+- **Edges** represent genetic relatedness (weighted ANI)
+Within the network, communities are detected via the Louvain method (subclusters). Plasmid clusters are complete subgraphs (cliques) detected within the whole network. Clusters comprise highly similar or identical plasmids. If relevant and scientifically appropriate, plasmids of the same cluster may be considered equivalent. This approach is alignment-free, reference-free, database-independent, and uses a relative similarity-based system to overcome the limitations of database dependency (untypeable plasmids, multireplicon/multi-MOB plasmids, mosaic, hybrid plasmids, etc.).
+Network and node statistics are saved to a distinct directory for downstream analyses (connectance, modularity, nestedness, community partition, degree centrality, node degrees, betweenness, closeness, etc.).
+Resistance and virulence genes can be annotated via [ABRicate](https://github.com/tseemann/abricate). The abricate files are saved to a distinct subdirectory. By default, plasmidfinder, vfdb and card databases are used, but optionally other databases can be specified from the databases available with ABRicate.
+To generate custom visualizations, feel free to use and modify the *plot.py*.
+## Troubleshooting
+Users are welcome to report any issue or feedback related to Plasmidhub by posting a [Github issue](https://github.com/BALINTESBL/plasmidhub/issues).
+---
+Developed by **Dr. Bálint Timmer**
+*Institute of Metagenomics, University of Debrecen, Debrecen, Hungary*
+*Department of Medical Microbiology, University of Pécs Medical School, Pécs, Hungary*
+ <img width="33" height="33" alt="image" src="https://github.com/user-attachments/assets/bd9f17e9-e9ce-4edb-8319-ef0091c45f00" /> <img width="99" height="32.054" alt="image" src="https://github.com/user-attachments/assets/5f3d5b6b-cef6-478a-af66-614b2e2860b2" />
+Contact: [timmer.balint@med.unideb.hu](mailto:timmer.balint@med.unideb.hu) , [timmer.balint@pte.hu](mailto:timmer.balint@pte.hu)

plasmidhub-1.0.0/plasmidhub/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ # plasmidhub package

plasmidhub-1.0.0/plasmidhub/abricate.py ADDED Viewed

@@ -0,0 +1,46 @@
+import os
+import subprocess
+import shutil
+import glob
+import logging
+logger = logging.getLogger(__name__)
+def run_abricate_bulk(input_dir, results_dir, db_list, threads=None):
+    os.makedirs(results_dir, exist_ok=True)
+    # Use default thread count if not provided
+    if threads is None:
+        threads = 4
+    # Move into input_dir because wildcard expansion happens here
+    original_dir = os.getcwd()
+    os.chdir(input_dir)
+    # Collect all fasta-like files
+    fasta_files = sorted(
+        glob.glob("*.fna") +
+        glob.glob("*.fa") +
+        glob.glob("*.fasta")
+    )
+    if not fasta_files:
+        raise RuntimeError(f"No input files found in {input_dir} with .fna/.fa/.fasta extensions.")
+    for db in db_list:
+        logger.info(f"Running abricate on database: {db}")
+        # Build the shell command with all fasta file names
+        cmd = f"abricate {' '.join(fasta_files)} --db {db} -t {threads}"
+        # Output file path (temporary inside input_dir)
+        temp_output = f"{db}.abr"
+        with open(temp_output, "w") as out_f:
+            subprocess.run(cmd, shell=True, stdout=out_f, stderr=subprocess.DEVNULL)
+        # Move the output to results_dir
+        final_output_path = os.path.join(results_dir, f"{db}.abr")
+        shutil.move(temp_output, final_output_path)
+        logger.info(f"Saved: {final_output_path}")
+    # Return to original directory
+    os.chdir(original_dir)

plasmidhub-1.0.0/plasmidhub/ani.py ADDED Viewed

@@ -0,0 +1,29 @@
+import os
+import subprocess
+import logging
+logger = logging.getLogger(__name__)
+def run_fastani(plasmid_list_file, fragLen=1000, minFrag=3, kmer=14, output_dir=".", threads=None):
+    if threads is None:
+        threads = 4
+    output_file = os.path.join(output_dir, "fastani_raw_results.tsv")
+    cmd = [
+        "fastANI",
+        "--ql", plasmid_list_file,
+        "--rl", plasmid_list_file,
+        "-o", output_file,
+        "--fragLen", str(fragLen),
+        "--minFraction", str(minFrag),
+        "--kmer", str(kmer),
+        "-t", str(threads)
+    ]
+    logger.info("Running FastANI with command:")
+    logger.info(" ".join(cmd))
+    result = subprocess.run(cmd, capture_output=True, text=True)
+    if result.returncode != 0:
+        logger.error("FastANI failed with error:")
+        logger.error(result.stderr)
+        exit(1)
+    else:
+        logger.info("FastANI completed successfully.")

plasmidhub-1.0.0/plasmidhub/cluster_color.py ADDED Viewed

@@ -0,0 +1,48 @@
+import os
+import matplotlib.pyplot as plt
+import random
+import logging
+logger = logging.getLogger(__name__)
+def assign_cluster_colors(results_dir, mapping_file):
+    cluster_list_path = os.path.join(results_dir, "cluster_list.txt")
+    color_file = os.path.join(results_dir, "cluster_colours.txt")
+    clusters = []
+    with open(cluster_list_path) as f:
+        next(f)  # Skip header
+        for line in f:
+            if line.strip():
+                cluster_file, _ = line.strip().split('\t')
+                cluster = cluster_file.replace('.txt', '')
+                clusters.append(cluster)
+    n_clusters = len(clusters)
+    # Start with base colors from tab20
+    cmap = plt.get_cmap('tab20')
+    base_colors = [
+        '#{:02x}{:02x}{:02x}'.format(int(r * 255), int(g * 255), int(b * 255))
+        for r, g, b in cmap.colors
+    ]
+    used_colors = set(base_colors[:min(n_clusters, len(base_colors))])
+    full_color_list = base_colors[:min(n_clusters, len(base_colors))]
+    # Generate additional distinct random colors if needed
+    while len(full_color_list) < n_clusters:
+        while True:
+            color = "#{:06x}".format(random.randint(0, 0xFFFFFF))
+            if color not in used_colors:
+                used_colors.add(color)
+                full_color_list.append(color)
+                break
+    color_map = dict(zip(clusters, full_color_list))
+    with open(color_file, 'w') as out:
+        for cluster, color in color_map.items():
+            out.write(f"{cluster}\t{color}\n")
+    logger.info(f"Cluster colors saved to: {color_file}")

plasmidhub-1.0.0/plasmidhub/clustering.py ADDED Viewed

@@ -0,0 +1,143 @@
+import os
+import pandas as pd
+from collections import defaultdict
+import argparse
+import logging
+logger = logging.getLogger(__name__)
+def find_valid_subclusters(results_dir):
+    valid_subclusters = []
+    for filename in sorted(os.listdir(results_dir)):
+        if filename.startswith("subcluster_") and filename.endswith("_plasmids.txt"):
+            filepath = os.path.join(results_dir, filename)
+            with open(filepath, 'r') as f:
+                plasmid_count = sum(1 for _ in f)
+            if plasmid_count >= 3:  # Hardcoded rule
+                valid_subclusters.append((filename, plasmid_count))
+    valid_subclusters.sort(key=lambda x: x[1], reverse=True)
+    return valid_subclusters
+def write_subcluster_list(valid_subclusters, output_path):
+    with open(output_path, "w") as f:
+        f.write("Subcluster\tPlasmids\n")
+        for subcluster, count in valid_subclusters:
+            f.write(f"{subcluster}\t{count}\n")
+def extract_clusters(valid_subclusters, results_dir, fastani_path, output_dir):
+    fastani_df = pd.read_csv(fastani_path, sep="\t")
+    for subcluster_file, _ in valid_subclusters:
+        full_path = os.path.join(results_dir, subcluster_file)
+        try:
+            with open(full_path, "r") as f:
+                original_plasmids = set(line.strip() for line in f)
+        except FileNotFoundError:
+            logger.warning(f"File {subcluster_file} not found. Skipping.")
+            continue
+        subcluster_plasmids = original_plasmids.copy()
+        connections = defaultdict(set)
+        for _, row in fastani_df.iterrows():
+            q, r = row["Query"], row["Reference"]
+            if q in subcluster_plasmids and r in subcluster_plasmids:
+                connections[q].add(r)
+                connections[r].add(q)
+        # Iteratively remove nodes with the fewest connections until we get a complete subgraph
+        while True:
+            current_nodes = set(connections.keys())
+            if len(current_nodes) < 3:
+                subcluster_plasmids = set()
+                break
+            # Check if the current graph is a complete subgraph (clique)
+            complete = all(len(connections[node]) == len(current_nodes) - 1 for node in current_nodes)
+            if complete:
+                subcluster_plasmids = current_nodes
+                break
+            # Find the node with the fewest connections (lowest degree)
+            min_node = min(current_nodes, key=lambda x: len(connections[x]))
+            # Remove that node from the graph
+            del connections[min_node]
+            for conn in connections.values():
+                conn.discard(min_node)
+        # Step 7: Save the refined subcluster to a new file with the desired naming format
+        cluster_number = subcluster_file.split("_")[1]  # Extract the number from subcluster_XX_plasmids.txt
+        output_file = f"cluster_{cluster_number}.txt"
+        cluster_path = os.path.join(output_dir, output_file)
+        with open(cluster_path, "w") as f:
+            for plasmid in subcluster_plasmids:
+                f.write(plasmid + "\n")
+def filter_clusters_by_size(output_dir, min_cluster_size):
+    for filename in os.listdir(output_dir):
+        if filename.startswith("cluster_") and filename.endswith(".txt"):
+            path = os.path.join(output_dir, filename)
+            with open(path, "r") as f:
+                lines = f.readlines()
+            if len(lines) < min_cluster_size:
+                os.remove(path)
+def write_cluster_list(output_dir, output_path):
+    cluster_files = []
+    for filename in os.listdir(output_dir):
+        if (
+            filename.startswith("cluster_")
+            and filename.endswith(".txt")
+            and filename not in {os.path.basename(output_path), "cluster_colours.txt"}  # exclude output file itself and cluster_colours.txt
+        ):
+            path = os.path.join(output_dir, filename)
+            with open(path, "r") as f:
+                count = sum(1 for _ in f)
+            cluster_files.append((filename, count))
+    cluster_files.sort(key=lambda x: x[1], reverse=True)
+    with open(output_path, "w") as f:
+        f.write("Cluster\tPlasmids\n")
+        for filename, count in cluster_files:
+            f.write(f"{filename}\t{count}\n")
+def main(results_dir, min_cluster_size):
+    fastani_path = os.path.join(results_dir, "ANI_results_final.tsv")
+    subcluster_list_output = os.path.join(results_dir, "subcluster_list.txt")
+    cluster_list_output = os.path.join(results_dir, "cluster_list.txt")
+    logger.info("Finding valid subclusters (>=3 plasmids)...")
+    valid_subclusters = find_valid_subclusters(results_dir)
+    write_subcluster_list(valid_subclusters, subcluster_list_output)
+    logger.info("Identifying  clusters...")
+    extract_clusters(valid_subclusters, results_dir, fastani_path, results_dir)
+    logging.info(f"Keep only clusters with >={min_cluster_size} plasmids...")
+    filter_clusters_by_size(results_dir, min_cluster_size)
+    write_cluster_list(results_dir, cluster_list_output)
+    # Check: warn user if cluster_list.txt is empty
+    if os.path.exists(cluster_list_output):
+        with open(cluster_list_output, "r") as f:
+            lines = f.readlines()
+            if len(lines) <= 1:
+                logger.warning("No clusters detected with the given parameters!")
+    # logger.info("Done!")
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Clustering Tool")
+    parser.add_argument("results_dir", help="Path to results directory created by main.py")
+    parser.add_argument("--min_cluster_size", type=int, default=3, help="Minimum number of plasmids in final cluster (default: 3)")
+    args = parser.parse_args()
+    main(args.results_dir, args.min_cluster_size)