PyPI - baclast - Versions diffs - 0.1.0__tar.gz - Mend

baclast 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

baclast-0.1.0/.gitignore +16 -0
baclast-0.1.0/PKG-INFO +52 -0
baclast-0.1.0/README.md +41 -0
baclast-0.1.0/baclast/__init__.py +92 -0
baclast-0.1.0/baclast/cli.py +184 -0
baclast-0.1.0/baclast/eskape_classifier.py +172 -0
baclast-0.1.0/baclast/features.py +155 -0
baclast-0.1.0/baclast/model.pkl +0 -0
baclast-0.1.0/baclast/model.py +211 -0
baclast-0.1.0/baclast/utils.py +38 -0
baclast-0.1.0/baclast/viz.py +127 -0
baclast-0.1.0/pyproject.toml +39 -0
baclast-0.1.0/tests/__init__.py +0 -0
baclast-0.1.0/tests/conftest.py +43 -0
baclast-0.1.0/tests/test_features.py +133 -0
baclast-0.1.0/tests/test_model.py +156 -0

baclast-0.1.0/.gitignore ADDED Viewed

@@ -0,0 +1,16 @@
+# Python-generated files
+__pycache__/
+*.py[oc]
+build/
+dist/
+wheels/
+*.egg-info
+# Virtual environments
+.venv/
+# Model files
+*.pkl
+# Notebook cache
+cache/

baclast-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,52 @@
+Metadata-Version: 2.4
+Name: baclast
+Version: 0.1.0
+Summary: Fast ESKAPE bacterial genome classifier using k-mer profiles
+Requires-Python: >=3.12
+Requires-Dist: biopython>=1.81
+Requires-Dist: joblib>=1.3
+Requires-Dist: numpy>=1.24
+Requires-Dist: scikit-learn>=1.3
+Description-Content-Type: text/markdown
+# BaClasT -- Bacterial Classification Tool
+Fast classification of assembled bacterial genomes into ESKAPE pathogen species using k-mer frequency profiling.
+## Install
+```bash
+uv add baclast
+```
+## CLI
+```bash
+baclast --predict genome.fna
+baclast --predict genomes/ -o results.csv
+```
+## Python
+```python
+import baclast
+baclast.predict(file="genome.fna")
+baclast.to_csv(baclast.predict(file="genome.fna"), "results.csv")
+```
+## What it classifies
+ESKAPE pathogens (*E. faecium*, *S. aureus*, *K. pneumoniae*, *A. baumannii*, *P. aeruginosa*, *E. cloacae*) plus an "Other" class for non-ESKAPE bacteria. Includes centroid-based out-of-distribution detection.
+## How it works
+Computes 4-mer frequency profiles (256 features) from genome assemblies and classifies with a Random Forest. A bundled pre-trained model is included -- no training data or setup required.
+## Requirements
+Python >= 3.12, biopython, scikit-learn, joblib, numpy.
+## License
+MIT

baclast-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,41 @@
+# BaClasT -- Bacterial Classification Tool
+Fast classification of assembled bacterial genomes into ESKAPE pathogen species using k-mer frequency profiling.
+## Install
+```bash
+uv add baclast
+```
+## CLI
+```bash
+baclast --predict genome.fna
+baclast --predict genomes/ -o results.csv
+```
+## Python
+```python
+import baclast
+baclast.predict(file="genome.fna")
+baclast.to_csv(baclast.predict(file="genome.fna"), "results.csv")
+```
+## What it classifies
+ESKAPE pathogens (*E. faecium*, *S. aureus*, *K. pneumoniae*, *A. baumannii*, *P. aeruginosa*, *E. cloacae*) plus an "Other" class for non-ESKAPE bacteria. Includes centroid-based out-of-distribution detection.
+## How it works
+Computes 4-mer frequency profiles (256 features) from genome assemblies and classifies with a Random Forest. A bundled pre-trained model is included -- no training data or setup required.
+## Requirements
+Python >= 3.12, biopython, scikit-learn, joblib, numpy.
+## License
+MIT

baclast-0.1.0/baclast/__init__.py ADDED Viewed

@@ -0,0 +1,92 @@
+__version__ = "0.1.0"
+import csv as _csv
+from pathlib import Path as _Path
+import numpy as _np
+_BUNDLED_MODEL = _Path(__file__).resolve().parent / "model.pkl"  # model.pkl in the same directory as this module
+_model_cache = None  # set by _load_model() on its first call and reused afterwards
+_FIELDS = [  # column names, in order, for the CSV written by to_csv()
+    "filepath", "filename", "organism_prediction", "confidence",
+    "confidence_warning", "nearest_centroid", "distance", "threshold",
+    "within_distribution", "baclast_version",
+]
+def _load_model():
+    global _model_cache
+    if _model_cache is None:
+        from baclast.model import load_model
+        _model_cache = load_model(_BUNDLED_MODEL)
+    return _model_cache
+def predict(file: str) -> dict:
+    """Classify a bacterial genome FASTA file.
+    Args:
+        file: Path to a FASTA file (.fasta, .fa, .fna).
+    Returns:
+        Dict with keys: filepath, filename, organism_prediction, confidence,
+        confidence_warning, nearest_centroid, distance, threshold,
+        within_distribution, baclast_version.
+    """
+    from baclast.features import genome_to_vector
+    from baclast.model import novelty_score
+    payload = _load_model()
+    clf = payload["classifier"]
+    label_names = payload["label_names"]
+    k = payload["k"]
+    kmer_vocab = payload["kmer_vocab"]
+    centroids = payload.get("centroids")
+    threshold = payload.get("distance_threshold")
+    fpath = _Path(file)
+    vec = genome_to_vector(fpath, k, kmer_vocab)
+    X_q = _np.array([vec])
+    pred = clf.predict(X_q)[0]
+    proba = clf.predict_proba(X_q)[0]
+    species = label_names[pred]
+    confidence = round(float(proba[pred]) * 100, 2)
+    result = {
+        "filepath": str(fpath),
+        "filename": fpath.name,
+        "organism_prediction": species,
+        "confidence": confidence,
+        "confidence_warning": "LOW" if confidence < 70.0 else "",
+        "baclast_version": __version__,
+    }
+    if centroids and threshold:
+        nearest, dist = novelty_score(vec, centroids)
+        result["nearest_centroid"] = nearest
+        result["distance"] = round(float(dist), 6)
+        result["threshold"] = round(float(threshold), 6)
+        result["within_distribution"] = "Yes" if dist <= threshold else "No"
+    else:
+        result["nearest_centroid"] = ""
+        result["distance"] = ""
+        result["threshold"] = ""
+        result["within_distribution"] = ""
+    return result
+def to_csv(result: dict, path: str) -> None:
+    """Write a prediction result dict to a CSV file.
+    Raises:
+        FileExistsError: If the file already exists.
+    """
+    p = _Path(path)
+    if p.exists():
+        raise FileExistsError(f"Output file already exists: {p}")
+    with open(p, "w", newline="") as f:
+        writer = _csv.DictWriter(f, fieldnames=_FIELDS)
+        writer.writeheader()
+        writer.writerow(result)

baclast-0.1.0/baclast/cli.py ADDED Viewed

@@ -0,0 +1,184 @@
+"""BaClasT CLI — bacterial genome classification tool."""
+import argparse
+import csv
+import io
+import sys
+from pathlib import Path
+import numpy as np
+from baclast import __version__
+from baclast.features import genome_to_vector
+from baclast.model import load_model, novelty_score
+_BUNDLED_MODEL = Path(__file__).resolve().parent / "model.pkl"  # default model: model.pkl next to this module
+_FASTA_EXTENSIONS = {".fasta", ".fa", ".fna"}  # file suffixes accepted as FASTA input
+_MIN_CONFIDENCE = 70.0  # below this, flag as low confidence
+_CSV_FIELDS = [  # column names, in order, of the CSV output
+    "filepath",
+    "filename",
+    "organism_prediction",
+    "confidence",
+    "confidence_warning",
+    "nearest_centroid",
+    "distance",
+    "threshold",
+    "within_distribution",
+    "baclast_version",
+]
+def _find_model(user_path: str | None) -> Path:
+    """Resolve model path: user-provided, or bundled default."""
+    if user_path:
+        p = Path(user_path)
+        if not p.exists():
+            sys.exit(f"Error: Model file not found: {p}")
+        return p
+    if _BUNDLED_MODEL.exists():
+        return _BUNDLED_MODEL
+    sys.exit(f"Error: No model found. Provide --model or install a model to {_BUNDLED_MODEL}")
+def _collect_fastas(target: Path) -> list[Path]:
+    """Return a list of FASTA files from a file path or directory."""
+    if target.is_file():
+        if target.suffix not in _FASTA_EXTENSIONS:
+            sys.exit(f"Error: {target} does not look like a FASTA file. "
+                     f"Expected extensions: {', '.join(sorted(_FASTA_EXTENSIONS))}")
+        return [target]
+    if target.is_dir():
+        fastas = sorted(f for f in target.iterdir() if f.suffix in _FASTA_EXTENSIONS)
+        if not fastas:
+            sys.exit(f"Error: No FASTA files found in {target}")
+        return fastas
+    sys.exit(f"Error: Path not found: {target}")
+def _classify_one(fpath: Path, clf, label_names, k, kmer_vocab, centroids, threshold) -> dict:
+    """Classify a single FASTA and return a result row."""
+    vec = genome_to_vector(fpath, k, kmer_vocab)
+    X_q = np.array([vec])
+    pred = clf.predict(X_q)[0]
+    proba = clf.predict_proba(X_q)[0]
+    species = label_names[pred]
+    confidence = round(proba[pred] * 100, 2)
+    if confidence < _MIN_CONFIDENCE:
+        warning = "LOW"
+    else:
+        warning = ""
+    row = {
+        "filepath": str(fpath),
+        "filename": fpath.name,
+        "organism_prediction": species,
+        "confidence": confidence,
+        "confidence_warning": warning,
+        "baclast_version": __version__,
+    }
+    if centroids and threshold:
+        nearest, dist = novelty_score(vec, centroids)
+        row["nearest_centroid"] = nearest
+        row["distance"] = round(dist, 6)
+        row["threshold"] = round(threshold, 6)
+        row["within_distribution"] = "Yes" if dist <= threshold else "No"
+    else:
+        row["nearest_centroid"] = ""
+        row["distance"] = ""
+        row["threshold"] = ""
+        row["within_distribution"] = ""
+    return row
+def main():
+    parser = argparse.ArgumentParser(
+        prog="baclast",
+        description="BaClasT — fast bacterial genome classification using k-mer profiles",
+    )
+    parser.add_argument(
+        "--predict", required=True, metavar="PATH",
+        help="Path to a FASTA file or directory of FASTAs",
+    )
+    parser.add_argument(
+        "-o", "--output", default=None, metavar="FILE",
+        help="Write results to a CSV file instead of stdout",
+    )
+    parser.add_argument(
+        "--model", default=None, metavar="FILE",
+        help="Path to model .pkl (uses bundled model if omitted)",
+    )
+    args = parser.parse_args()
+    # Check output file doesn't already exist
+    if args.output:
+        out_path = Path(args.output)
+        if out_path.exists():
+            sys.exit(f"Error: Output file already exists: {out_path}")
+    # Load model
+    payload = load_model(_find_model(args.model))
+    clf = payload["classifier"]
+    label_names = payload["label_names"]
+    k = payload["k"]
+    kmer_vocab = payload["kmer_vocab"]
+    centroids = payload.get("centroids")
+    threshold = payload.get("distance_threshold")
+    # Collect input FASTAs
+    target = Path(args.predict)
+    fastas = _collect_fastas(target)
+    # Classify
+    rows = []
+    for i, fpath in enumerate(fastas, 1):
+        if len(fastas) > 1:
+            print(f"  [{i}/{len(fastas)}] {fpath.name} ... ", end="", flush=True, file=sys.stderr)
+        try:
+            row = _classify_one(fpath, clf, label_names, k, kmer_vocab, centroids, threshold)
+            rows.append(row)
+            status = f"{row['organism_prediction']} ({row['confidence']}%)"
+            if row["confidence_warning"]:
+                status += f" [{row['confidence_warning']} CONFIDENCE]"
+            if len(fastas) > 1:
+                print(status, file=sys.stderr)
+        except (ValueError, Exception) as exc:
+            rows.append({
+                "filepath": str(fpath),
+                "filename": fpath.name,
+                "organism_prediction": "SKIPPED",
+                "confidence": "",
+                "confidence_warning": str(exc),
+                "nearest_centroid": "",
+                "distance": "",
+                "threshold": "",
+                "within_distribution": "",
+                "baclast_version": __version__,
+            })
+            if len(fastas) > 1:
+                print(f"SKIPPED: {exc}", file=sys.stderr)
+            else:
+                sys.exit(f"Error: {exc}")
+    # Output
+    if args.output:
+        out_path = Path(args.output)
+        with open(out_path, "w", newline="") as f:
+            writer = csv.DictWriter(f, fieldnames=_CSV_FIELDS)
+            writer.writeheader()
+            writer.writerows(rows)
+        print(f"Results written to {out_path} ({len(rows)} genomes)", file=sys.stderr)
+    else:
+        buf = io.StringIO()
+        writer = csv.DictWriter(buf, fieldnames=_CSV_FIELDS)
+        writer.writeheader()
+        writer.writerows(rows)
+        print(buf.getvalue(), end="")
+if __name__ == "__main__":  # run the CLI only when executed as a script, not on import
+    main()

baclast-0.1.0/baclast/eskape_classifier.py ADDED Viewed

@@ -0,0 +1,172 @@
+"""BaClasT — main CLI entry point for training and prediction."""
+import argparse
+import sys
+from baclast.features import all_kmers, genome_to_vector, load_dataset
+from baclast.model import evaluate, load_model, save_model, train_classifier
+from baclast.utils import print_banner, setup_logging
+def cmd_train(args):
+    """Execute the 'train' sub-command."""
+    logger = setup_logging(args.verbose)
+    print_banner()
+    k = args.k
+    kmer_vocab = all_kmers(k)
+    print(f"Loading genomes from {args.data_dir} (k={k})...")
+    try:
+        X, y, label_names = load_dataset(args.data_dir, k, kmer_vocab)
+    except Exception as exc:
+        sys.exit(f"Error: {exc}")
+    if len(label_names) < 2:
+        sys.exit("Error: Need at least 2 species to train a classifier.")
+    n_genomes = len(y)
+    print(f"Loaded {n_genomes} genomes across {len(label_names)} species.")
+    # 80/20 stratified train/test split
+    from sklearn.model_selection import train_test_split
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, test_size=0.2, stratify=y, random_state=42
+    )
+    print(f"Training Random Forest ({args.n_estimators} trees)...")
+    clf = train_classifier(X_train, y_train, n_estimators=args.n_estimators)
+    print("Evaluating on held-out test set:")
+    evaluate(clf, X_test, y_test, label_names)
+    # Optional cross-validation
+    if args.cv is not None:
+        from sklearn.model_selection import StratifiedKFold, cross_val_score
+        print(f"\nRunning {args.cv}-fold stratified cross-validation...")
+        cv = StratifiedKFold(n_splits=args.cv, shuffle=True, random_state=42)
+        scores = cross_val_score(clf, X, y, cv=cv, scoring="accuracy")
+        print(f"CV accuracy: {scores.mean():.4f} +/- {scores.std():.4f}")
+    # Save model
+    save_model(clf, label_names, k, kmer_vocab, args.output)
+    # Patch in n_genomes (save_model doesn't know the total count)
+    import joblib
+    payload = joblib.load(args.output)
+    payload["n_genomes"] = n_genomes
+    joblib.dump(payload, args.output)
+    print(f"\nModel saved to {args.output}")
+def cmd_predict(args):
+    """Execute the 'predict' sub-command."""
+    logger = setup_logging(args.verbose)
+    print_banner()
+    # Load model
+    try:
+        payload = load_model(args.model)
+    except FileNotFoundError:
+        sys.exit(f"Error: Model file not found: {args.model}")
+    except ValueError as exc:
+        sys.exit(f"Error: {exc}")
+    clf = payload["classifier"]
+    label_names = payload["label_names"]
+    k = payload["k"]
+    kmer_vocab = payload["kmer_vocab"]
+    # Extract features from input FASTA
+    try:
+        vec = genome_to_vector(args.fasta, k, kmer_vocab)
+    except FileNotFoundError:
+        sys.exit(f"Error: FASTA file not found: {args.fasta}")
+    except ValueError as exc:
+        sys.exit(f"Error: {exc}")
+    import numpy as np
+    X_query = np.array([vec])
+    pred = clf.predict(X_query)[0]
+    proba = clf.predict_proba(X_query)[0]
+    species = label_names[pred]
+    confidence = proba[pred] * 100
+    print(f"Predicted species: {species}")
+    print(f"Confidence: {confidence:.1f}%")
+    if args.verbose:
+        print("\nAll species probabilities:")
+        # Sort by probability descending
+        ranked = sorted(
+            zip(label_names, proba), key=lambda x: x[1], reverse=True
+        )
+        max_name_len = max(len(name) for name in label_names)
+        for name, prob in ranked:
+            bar_len = int(prob * 40)
+            bar = "#" * bar_len
+            print(f"  {name:<{max_name_len}}  {prob * 100:5.1f}%  |{bar}")
+def main():
+    """Main entry point — parse arguments and dispatch to sub-commands."""
+    parser = argparse.ArgumentParser(
+        prog="baclasp",
+        description="BaClasT — Bacterial Classification Tool",
+    )
+    subparsers = parser.add_subparsers(dest="command", required=True)
+    # train sub-command
+    train_parser = subparsers.add_parser("train", help="Train a classifier")
+    train_parser.add_argument(
+        "--data_dir", required=True, help="Directory with species sub-folders"
+    )
+    train_parser.add_argument(
+        "--output", default="model.pkl", help="Output model path (default: model.pkl)"
+    )
+    train_parser.add_argument(
+        "--k", type=int, default=4, help="K-mer length (default: 4)"
+    )
+    train_parser.add_argument(
+        "--n_estimators",
+        type=int,
+        default=200,
+        help="Number of trees (default: 200)",
+    )
+    train_parser.add_argument(
+        "--cv", type=int, default=None, help="Number of CV folds (optional)"
+    )
+    train_parser.add_argument(
+        "--verbose", action="store_true", help="Enable verbose output"
+    )
+    # predict sub-command
+    predict_parser = subparsers.add_parser(
+        "predict", help="Predict species for a FASTA file"
+    )
+    predict_parser.add_argument(
+        "--model", required=True, help="Path to trained model .pkl"
+    )
+    predict_parser.add_argument(
+        "--fasta", required=True, help="Path to input FASTA file"
+    )
+    predict_parser.add_argument(
+        "--verbose", action="store_true", help="Show all species probabilities"
+    )
+    args = parser.parse_args()
+    if args.command == "train":
+        cmd_train(args)
+    elif args.command == "predict":
+        cmd_predict(args)
+if __name__ == "__main__":  # run the CLI only when executed as a script, not on import
+    main()