PyPI - imbeval - Versions diffs - 0.1.0__py3-none-any.whl - Mend

imbeval 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

imbeval/__init__.py +27 -0
imbeval/calibration.py +66 -0
imbeval/metrics.py +87 -0
imbeval/report.py +80 -0
imbeval/threshold.py +83 -0
imbeval-0.1.0.dist-info/METADATA +118 -0
imbeval-0.1.0.dist-info/RECORD +9 -0
imbeval-0.1.0.dist-info/WHEEL +4 -0
imbeval-0.1.0.dist-info/licenses/LICENSE +21 -0

imbeval/__init__.py ADDED Viewed

@@ -0,0 +1,27 @@
+"""
+imbeval — Honest evaluation for imbalanced classification models.
+Most metric libraries hand you numbers; they don't tell you whether your
+model is actually safe to ship on imbalanced data (fraud, medical, anomaly
+detection, churn, etc). imbeval combines per-class confidence, calibration
+quality, and cost-sensitive thresholding into one report so you can answer
+the real question: "is this model usable in production?"
+"""
+from .report import evaluation_report
+from .calibration import calibration_score, reliability_curve
+from .threshold import optimal_threshold, cost_sensitive_threshold
+from .metrics import per_class_confidence, minority_class_report
+# Package version string; it is also re-exported through __all__ below.
+__version__ = "0.1.0"
+# Public API of the package: the functions imported above from the
+# report, calibration, threshold and metrics submodules, plus __version__.
+__all__ = [
+    "evaluation_report",
+    "calibration_score",
+    "reliability_curve",
+    "optimal_threshold",
+    "cost_sensitive_threshold",
+    "per_class_confidence",
+    "minority_class_report",
+    "__version__",
+]

imbeval/calibration.py ADDED Viewed

@@ -0,0 +1,66 @@
+"""Calibration quality checks — is the model's confidence trustworthy?"""
+from __future__ import annotations
+import numpy as np
+def reliability_curve(y_true, y_pred_proba, n_bins: int = 10):
+    """
+    Bin predictions by confidence and compare to observed accuracy in
+    each bin. Returns arrays suitable for plotting a reliability diagram.
+    Parameters
+    ----------
+    y_true : array-like, binary (0/1) ground truth.
+    y_pred_proba : array-like, predicted probability of the positive class.
+    n_bins : int
+    Returns
+    -------
+    dict with keys: bin_confidence, bin_accuracy, bin_count
+    """
+    y_true = np.asarray(y_true)
+    y_pred_proba = np.asarray(y_pred_proba)
+    bin_edges = np.linspace(0.0, 1.0, n_bins + 1)
+    bin_confidence, bin_accuracy, bin_count = [], [], []
+    for i in range(n_bins):
+        lo, hi = bin_edges[i], bin_edges[i + 1]
+        mask = (y_pred_proba >= lo) & (y_pred_proba < hi if i < n_bins - 1 else y_pred_proba <= hi)
+        count = int(mask.sum())
+        bin_count.append(count)
+        if count == 0:
+            bin_confidence.append(None)
+            bin_accuracy.append(None)
+        else:
+            bin_confidence.append(float(np.mean(y_pred_proba[mask])))
+            bin_accuracy.append(float(np.mean(y_true[mask])))
+    return {
+        "bin_confidence": bin_confidence,
+        "bin_accuracy": bin_accuracy,
+        "bin_count": bin_count,
+    }
+def calibration_score(y_true, y_pred_proba, n_bins: int = 10) -> float:
+    """
+    Expected Calibration Error (ECE): the weighted average gap between
+    predicted confidence and observed accuracy across bins. Lower is
+    better; 0 is perfect calibration.
+    """
+    curve = reliability_curve(y_true, y_pred_proba, n_bins=n_bins)
+    total = sum(c for c in curve["bin_count"] if c)
+    if total == 0:
+        return 0.0
+    ece = 0.0
+    for conf, acc, count in zip(
+        curve["bin_confidence"], curve["bin_accuracy"], curve["bin_count"]
+    ):
+        if count == 0:
+            continue
+        ece += (count / total) * abs(conf - acc)
+    return float(ece)

imbeval/metrics.py ADDED Viewed

@@ -0,0 +1,87 @@
+"""Per-class confidence and minority-class focused metrics."""
+from __future__ import annotations
+import numpy as np
+from sklearn.metrics import (
+    precision_score,
+    recall_score,
+    f1_score,
+    confusion_matrix,
+)
+def per_class_confidence(y_true, y_pred_proba, classes=None):
+    """
+    Compute the mean predicted-probability "confidence" the model assigns
+    to the correct class, broken down per class.
+    Parameters
+    ----------
+    y_true : array-like of shape (n_samples,)
+        True integer class labels (0..n_classes-1).
+    y_pred_proba : array-like of shape (n_samples, n_classes)
+        Predicted probabilities from `model.predict_proba`.
+    classes : list, optional
+        Labels for each class index, for display purposes.
+    Returns
+    -------
+    dict
+        Mapping of class label -> mean confidence on correctly-attributed
+        probability mass for samples truly belonging to that class.
+    """
+    y_true = np.asarray(y_true)
+    y_pred_proba = np.asarray(y_pred_proba)
+    n_classes = y_pred_proba.shape[1]
+    if classes is None:
+        classes = list(range(n_classes))
+    result = {}
+    for idx, label in enumerate(classes):
+        mask = y_true == idx
+        if mask.sum() == 0:
+            result[label] = None
+            continue
+        result[label] = float(np.mean(y_pred_proba[mask, idx]))
+    return result
+def minority_class_report(y_true, y_pred, classes=None, minority_label=None):
+    """
+    Precision/recall/F1 with explicit emphasis on the minority class.
+    If `minority_label` is not given, the class with the lowest support
+    in `y_true` is auto-detected.
+    Returns
+    -------
+    dict with keys: minority_label, support, precision, recall, f1,
+    confusion_matrix
+    """
+    y_true = np.asarray(y_true)
+    y_pred = np.asarray(y_pred)
+    labels, counts = np.unique(y_true, return_counts=True)
+    if minority_label is None:
+        minority_label = labels[np.argmin(counts)]
+    precision = precision_score(
+        y_true, y_pred, labels=[minority_label], average="macro", zero_division=0
+    )
+    recall = recall_score(
+        y_true, y_pred, labels=[minority_label], average="macro", zero_division=0
+    )
+    f1 = f1_score(
+        y_true, y_pred, labels=[minority_label], average="macro", zero_division=0
+    )
+    support = int(np.sum(y_true == minority_label))
+    return {
+        "minority_label": minority_label,
+        "support": support,
+        "precision": float(precision),
+        "recall": float(recall),
+        "f1": float(f1),
+        "confusion_matrix": confusion_matrix(y_true, y_pred).tolist(),
+    }

imbeval/report.py ADDED Viewed

@@ -0,0 +1,80 @@
+"""The single entry point: one honest report on production-readiness."""
+from __future__ import annotations
+import numpy as np
+from .calibration import calibration_score
+from .threshold import optimal_threshold, cost_sensitive_threshold
+from .metrics import minority_class_report
+def evaluation_report(
+    y_true,
+    y_pred_proba,
+    cost_fp: float = None,
+    cost_fn: float = None,
+    n_bins: int = 10,
+):
+    """
+    Produce one combined evaluation report for a binary classifier on
+    imbalanced data: minority-class performance, calibration quality,
+    a tuned decision threshold, and (optionally) a cost-aware threshold.
+    Parameters
+    ----------
+    y_true : array-like, binary ground truth labels (0/1).
+    y_pred_proba : array-like, predicted probability of the positive (1) class.
+    cost_fp : float, optional. Business cost of one false positive.
+    cost_fn : float, optional. Business cost of one false negative.
+        If both cost_fp and cost_fn are given, a cost-sensitive threshold
+        is included in the report.
+    n_bins : int, bins used for calibration scoring.
+    Returns
+    -------
+    dict
+        {
+          "minority_class": {...},
+          "calibration_error": float,
+          "default_threshold_0.5": {...},
+          "optimal_f1_threshold": {...},
+          "cost_sensitive_threshold": {...} or None,
+          "verdict": str
+        }
+    """
+    y_true = np.asarray(y_true)
+    y_pred_proba = np.asarray(y_pred_proba)
+    preds_at_half = (y_pred_proba >= 0.5).astype(int)
+    minority = minority_class_report(y_true, preds_at_half)
+    ece = calibration_score(y_true, y_pred_proba, n_bins=n_bins)
+    f1_opt = optimal_threshold(y_true, y_pred_proba)
+    cost_result = None
+    if cost_fp is not None and cost_fn is not None:
+        cost_result = cost_sensitive_threshold(y_true, y_pred_proba, cost_fp, cost_fn)
+    verdict = _build_verdict(minority, ece, f1_opt)
+    return {
+        "minority_class": minority,
+        "calibration_error": ece,
+        "optimal_f1_threshold": f1_opt,
+        "cost_sensitive_threshold": cost_result,
+        "verdict": verdict,
+    }
+def _build_verdict(minority: dict, ece: float, f1_opt: dict) -> str:
+    flags = []
+    if minority["recall"] < 0.5:
+        flags.append("minority-class recall is below 50% at the default 0.5 threshold")
+    if ece > 0.1:
+        flags.append(f"calibration error is high (ECE={ece:.3f}); confidence scores are not trustworthy")
+    if f1_opt["score"] - minority["f1"] > 0.15:
+        flags.append("default 0.5 threshold is far from optimal; consider using optimal_f1_threshold")
+    if not flags:
+        return "Looks production-ready on the dimensions checked. Validate further on a held-out set."
+    return "Not yet production-ready: " + "; ".join(flags) + "."

imbeval/threshold.py ADDED Viewed

@@ -0,0 +1,83 @@
+"""Decision threshold tuning for imbalanced problems."""
+from __future__ import annotations
+import numpy as np
+from sklearn.metrics import f1_score
+def optimal_threshold(y_true, y_pred_proba, metric: str = "f1", n_steps: int = 200):
+    """
+    Sweep decision thresholds and return the one that maximizes the
+    chosen metric. Default metric is 0.5-agnostic F1, which is usually
+    a far better default than 0.5 on imbalanced data.
+    Parameters
+    ----------
+    y_true : array-like, binary ground truth.
+    y_pred_proba : array-like, predicted probability of positive class.
+    metric : {"f1"} currently supported.
+    n_steps : int, number of thresholds to test between 0 and 1.
+    Returns
+    -------
+    dict with keys: threshold, score
+    """
+    y_true = np.asarray(y_true)
+    y_pred_proba = np.asarray(y_pred_proba)
+    thresholds = np.linspace(0.01, 0.99, n_steps)
+    best_threshold, best_score = 0.5, -1.0
+    for t in thresholds:
+        preds = (y_pred_proba >= t).astype(int)
+        if metric == "f1":
+            score = f1_score(y_true, preds, zero_division=0)
+        else:
+            raise ValueError(f"Unsupported metric: {metric}")
+        if score > best_score:
+            best_score, best_threshold = score, t
+    return {"threshold": float(best_threshold), "score": float(best_score)}
+def cost_sensitive_threshold(y_true, y_pred_proba, cost_fp: float, cost_fn: float, n_steps: int = 200):
+    """
+    Find the decision threshold that minimizes total business cost,
+    given the real-world cost of a false positive vs a false negative.
+    This is usually what people actually want on imbalanced data
+    (e.g. fraud: missing fraud is far costlier than a false alarm).
+    Parameters
+    ----------
+    y_true : array-like, binary ground truth.
+    y_pred_proba : array-like, predicted probability of positive class.
+    cost_fp : float, cost incurred per false positive.
+    cost_fn : float, cost incurred per false negative.
+    n_steps : int
+    Returns
+    -------
+    dict with keys: threshold, total_cost, false_positives, false_negatives
+    """
+    y_true = np.asarray(y_true)
+    y_pred_proba = np.asarray(y_pred_proba)
+    thresholds = np.linspace(0.01, 0.99, n_steps)
+    best = {"threshold": 0.5, "total_cost": float("inf"), "false_positives": 0, "false_negatives": 0}
+    for t in thresholds:
+        preds = (y_pred_proba >= t).astype(int)
+        fp = int(np.sum((preds == 1) & (y_true == 0)))
+        fn = int(np.sum((preds == 0) & (y_true == 1)))
+        total_cost = fp * cost_fp + fn * cost_fn
+        if total_cost < best["total_cost"]:
+            best = {
+                "threshold": float(t),
+                "total_cost": float(total_cost),
+                "false_positives": fp,
+                "false_negatives": fn,
+            }
+    return best

imbeval-0.1.0.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,118 @@
+Metadata-Version: 2.4
+Name: imbeval
+Version: 0.1.0
+Summary: Honest, production-readiness evaluation for imbalanced classification models.
+Project-URL: Homepage, https://github.com/sricodings
+Project-URL: Repository, https://github.com/sricodings/imbeval
+Project-URL: Issues, https://github.com/sricodings/imbeval/issues
+Project-URL: Documentation, https://github.com/sricodings/imbeval#readme
+Author-email: Srikanth Sridhar <srisrikanthtvs@gmail.com>
+License-Expression: MIT
+License-File: LICENSE
+Keywords: calibration,imbalanced-classification,machine-learning,model-evaluation,threshold-tuning
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Requires-Python: >=3.9
+Requires-Dist: numpy>=1.21
+Requires-Dist: scikit-learn>=1.0
+Provides-Extra: dev
+Requires-Dist: build; extra == 'dev'
+Requires-Dist: pytest>=7.0; extra == 'dev'
+Requires-Dist: twine; extra == 'dev'
+Description-Content-Type: text/markdown
+# imbeval
+**Honest production-readiness evaluation for imbalanced classification models.**
+Standard metric libraries hand you precision/recall/F1 and stop there. On imbalanced
+data (fraud, churn, medical diagnosis, anomaly detection, rare-event prediction) that's
+not enough to know if a model is actually safe to ship. `imbeval` answers the real
+question: **is this model usable in production, and at what threshold?**
+It combines three things most teams check manually and inconsistently:
+1. **Minority-class performance** — not buried inside macro-averages.
+2. **Calibration quality** — is the model's confidence trustworthy, or just confidently wrong?
+3. **Threshold tuning** — the default 0.5 threshold is almost always wrong on imbalanced data; `imbeval` finds a better one, optionally weighted by real business cost (cost of a false positive vs a false negative).
+## Install
+```bash
+pip install imbeval
+```
+(Once published — see the [publishing guide](docs/publishing.md) if you're building this from source.)
+## Quickstart
+```python
+from imbeval import evaluation_report
+# y_true: ground truth labels (0/1)
+# y_pred_proba: predicted probability of the positive class, from model.predict_proba(X)[:, 1]
+report = evaluation_report(
+    y_true,
+    y_pred_proba,
+    cost_fp=1,     # cost of a false alarm
+    cost_fn=25,    # cost of missing a true positive (e.g. missed fraud)
+)
+print(report["verdict"])
+print(report["minority_class"])
+print(report["optimal_f1_threshold"])
+print(report["cost_sensitive_threshold"])
+```
+Example output:
+```
+Not yet production-ready: minority-class recall is below 50% at the default 0.5 threshold;
+default 0.5 threshold is far from optimal; consider using optimal_f1_threshold.
+```
+## What's inside
+| Function | What it does |
+|---|---|
+| `evaluation_report(y_true, y_pred_proba, ...)` | One combined report + plain-English verdict |
+| `minority_class_report(y_true, y_pred)` | Precision/recall/F1 focused on the minority class |
+| `per_class_confidence(y_true, y_pred_proba)` | Mean model confidence per true class |
+| `calibration_score(y_true, y_pred_proba)` | Expected Calibration Error (ECE) |
+| `reliability_curve(y_true, y_pred_proba)` | Data for plotting a reliability diagram |
+| `optimal_threshold(y_true, y_pred_proba)` | Best decision threshold by F1 |
+| `cost_sensitive_threshold(y_true, y_pred_proba, cost_fp, cost_fn)` | Best threshold by real business cost |
+Full API reference: [docs/api.md](docs/api.md)
+Usage guide and recipes: [docs/usage.md](docs/usage.md)
+Publishing this package yourself: [docs/publishing.md](docs/publishing.md)
+## Why this exists
+Most "imbalanced learning" tools (e.g. `imbalanced-learn`) focus on *fixing* the data
+(SMOTE and friends). `imbeval` focuses on the other end of the pipeline: telling you
+honestly whether the *model* you already trained is good enough, and at what threshold,
+once class imbalance is in play. It's meant to sit right before you ship.
+## Status
+Early (v0.1.0). The core API (`evaluation_report`, threshold tools, calibration tools)
+is stable for binary classification. Multi-class support is on the roadmap — see
+[CHANGELOG.md](CHANGELOG.md).
+## Contributing
+Issues and PRs welcome once the repo is public. See [docs/usage.md](docs/usage.md) for
+how the modules fit together if you want to extend it.
+## License
+MIT — see [LICENSE](LICENSE).

imbeval-0.1.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,9 @@
+imbeval/__init__.py,sha256=NymkrvR3Vy6NqLS8-h7dAGRAiE_Lw0Ahr7Tp2kqzN3I,911
+imbeval/calibration.py,sha256=-el42gPBunXNLiTuZKG8GccoM3ln_-wG1jJBSbn6fQY,2092
+imbeval/metrics.py,sha256=qLlpAz4ywASVwzLicgCRQbaVpx_y5RRL33rtkRUoLxQ,2622
+imbeval/report.py,sha256=1BaUaCjHM8CPkNjcFB0jQ1qHgG5GbwUMD6Kas4qA1w4,2803
+imbeval/threshold.py,sha256=ArmCI8qqWjahZ6B9mWdn8uNlFBsyusHVQDQyeZz3UvM,2826
+imbeval-0.1.0.dist-info/METADATA,sha256=YUkBP1vQ5SwKSFcNsVzmguOvQurwk2mFp16VoaluYt8,4756
+imbeval-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
+imbeval-0.1.0.dist-info/licenses/LICENSE,sha256=WUUJ80e95dG_vXeLjj31QjwMviIlbhZLTGRl3bcM_5k,1073
+imbeval-0.1.0.dist-info/RECORD,,

imbeval-0.1.0.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,4 @@
+Wheel-Version: 1.0
+Generator: hatchling 1.30.1
+Root-Is-Purelib: true
+Tag: py3-none-any

imbeval-0.1.0.dist-info/licenses/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026 Srikanth Sridhar
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.