imbeval 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
imbeval/__init__.py ADDED
@@ -0,0 +1,27 @@
1
+ """
2
+ imbeval — Honest evaluation for imbalanced classification models.
3
+
4
+ Most metric libraries hand you numbers; they don't tell you whether your
5
+ model is actually safe to ship on imbalanced data (fraud, medical, anomaly
6
+ detection, churn, etc). imbeval combines per-class confidence, calibration
7
+ quality, and cost-sensitive thresholding into one report so you can answer
8
+ the real question: "is this model usable in production?"
9
+ """
10
+
11
+ from .report import evaluation_report
12
+ from .calibration import calibration_score, reliability_curve
13
+ from .threshold import optimal_threshold, cost_sensitive_threshold
14
+ from .metrics import per_class_confidence, minority_class_report
15
+
16
+ __version__ = "0.1.0"
17
+
18
+ __all__ = [
19
+ "evaluation_report",
20
+ "calibration_score",
21
+ "reliability_curve",
22
+ "optimal_threshold",
23
+ "cost_sensitive_threshold",
24
+ "per_class_confidence",
25
+ "minority_class_report",
26
+ "__version__",
27
+ ]
imbeval/calibration.py ADDED
@@ -0,0 +1,66 @@
1
+ """Calibration quality checks — is the model's confidence trustworthy?"""
2
+
3
+ from __future__ import annotations
4
+
5
+ import numpy as np
6
+
7
+
8
+ def reliability_curve(y_true, y_pred_proba, n_bins: int = 10):
9
+ """
10
+ Bin predictions by confidence and compare to observed accuracy in
11
+ each bin. Returns arrays suitable for plotting a reliability diagram.
12
+
13
+ Parameters
14
+ ----------
15
+ y_true : array-like, binary (0/1) ground truth.
16
+ y_pred_proba : array-like, predicted probability of the positive class.
17
+ n_bins : int
18
+
19
+ Returns
20
+ -------
21
+ dict with keys: bin_confidence, bin_accuracy, bin_count
22
+ """
23
+ y_true = np.asarray(y_true)
24
+ y_pred_proba = np.asarray(y_pred_proba)
25
+
26
+ bin_edges = np.linspace(0.0, 1.0, n_bins + 1)
27
+ bin_confidence, bin_accuracy, bin_count = [], [], []
28
+
29
+ for i in range(n_bins):
30
+ lo, hi = bin_edges[i], bin_edges[i + 1]
31
+ mask = (y_pred_proba >= lo) & (y_pred_proba < hi if i < n_bins - 1 else y_pred_proba <= hi)
32
+ count = int(mask.sum())
33
+ bin_count.append(count)
34
+ if count == 0:
35
+ bin_confidence.append(None)
36
+ bin_accuracy.append(None)
37
+ else:
38
+ bin_confidence.append(float(np.mean(y_pred_proba[mask])))
39
+ bin_accuracy.append(float(np.mean(y_true[mask])))
40
+
41
+ return {
42
+ "bin_confidence": bin_confidence,
43
+ "bin_accuracy": bin_accuracy,
44
+ "bin_count": bin_count,
45
+ }
46
+
47
+
48
+ def calibration_score(y_true, y_pred_proba, n_bins: int = 10) -> float:
49
+ """
50
+ Expected Calibration Error (ECE): the weighted average gap between
51
+ predicted confidence and observed accuracy across bins. Lower is
52
+ better; 0 is perfect calibration.
53
+ """
54
+ curve = reliability_curve(y_true, y_pred_proba, n_bins=n_bins)
55
+ total = sum(c for c in curve["bin_count"] if c)
56
+ if total == 0:
57
+ return 0.0
58
+
59
+ ece = 0.0
60
+ for conf, acc, count in zip(
61
+ curve["bin_confidence"], curve["bin_accuracy"], curve["bin_count"]
62
+ ):
63
+ if count == 0:
64
+ continue
65
+ ece += (count / total) * abs(conf - acc)
66
+ return float(ece)
imbeval/metrics.py ADDED
@@ -0,0 +1,87 @@
1
+ """Per-class confidence and minority-class focused metrics."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import numpy as np
6
+ from sklearn.metrics import (
7
+ precision_score,
8
+ recall_score,
9
+ f1_score,
10
+ confusion_matrix,
11
+ )
12
+
13
+
14
+ def per_class_confidence(y_true, y_pred_proba, classes=None):
15
+ """
16
+ Compute the mean predicted-probability "confidence" the model assigns
17
+ to the correct class, broken down per class.
18
+
19
+ Parameters
20
+ ----------
21
+ y_true : array-like of shape (n_samples,)
22
+ True integer class labels (0..n_classes-1).
23
+ y_pred_proba : array-like of shape (n_samples, n_classes)
24
+ Predicted probabilities from `model.predict_proba`.
25
+ classes : list, optional
26
+ Labels for each class index, for display purposes.
27
+
28
+ Returns
29
+ -------
30
+ dict
31
+ Mapping of class label -> mean confidence on correctly-attributed
32
+ probability mass for samples truly belonging to that class.
33
+ """
34
+ y_true = np.asarray(y_true)
35
+ y_pred_proba = np.asarray(y_pred_proba)
36
+ n_classes = y_pred_proba.shape[1]
37
+ if classes is None:
38
+ classes = list(range(n_classes))
39
+
40
+ result = {}
41
+ for idx, label in enumerate(classes):
42
+ mask = y_true == idx
43
+ if mask.sum() == 0:
44
+ result[label] = None
45
+ continue
46
+ result[label] = float(np.mean(y_pred_proba[mask, idx]))
47
+ return result
48
+
49
+
50
+ def minority_class_report(y_true, y_pred, classes=None, minority_label=None):
51
+ """
52
+ Precision/recall/F1 with explicit emphasis on the minority class.
53
+
54
+ If `minority_label` is not given, the class with the lowest support
55
+ in `y_true` is auto-detected.
56
+
57
+ Returns
58
+ -------
59
+ dict with keys: minority_label, support, precision, recall, f1,
60
+ confusion_matrix
61
+ """
62
+ y_true = np.asarray(y_true)
63
+ y_pred = np.asarray(y_pred)
64
+
65
+ labels, counts = np.unique(y_true, return_counts=True)
66
+ if minority_label is None:
67
+ minority_label = labels[np.argmin(counts)]
68
+
69
+ precision = precision_score(
70
+ y_true, y_pred, labels=[minority_label], average="macro", zero_division=0
71
+ )
72
+ recall = recall_score(
73
+ y_true, y_pred, labels=[minority_label], average="macro", zero_division=0
74
+ )
75
+ f1 = f1_score(
76
+ y_true, y_pred, labels=[minority_label], average="macro", zero_division=0
77
+ )
78
+ support = int(np.sum(y_true == minority_label))
79
+
80
+ return {
81
+ "minority_label": minority_label,
82
+ "support": support,
83
+ "precision": float(precision),
84
+ "recall": float(recall),
85
+ "f1": float(f1),
86
+ "confusion_matrix": confusion_matrix(y_true, y_pred).tolist(),
87
+ }
imbeval/report.py ADDED
@@ -0,0 +1,80 @@
1
+ """The single entry point: one honest report on production-readiness."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import numpy as np
6
+
7
+ from .calibration import calibration_score
8
+ from .threshold import optimal_threshold, cost_sensitive_threshold
9
+ from .metrics import minority_class_report
10
+
11
+
12
+ def evaluation_report(
13
+ y_true,
14
+ y_pred_proba,
15
+ cost_fp: float = None,
16
+ cost_fn: float = None,
17
+ n_bins: int = 10,
18
+ ):
19
+ """
20
+ Produce one combined evaluation report for a binary classifier on
21
+ imbalanced data: minority-class performance, calibration quality,
22
+ a tuned decision threshold, and (optionally) a cost-aware threshold.
23
+
24
+ Parameters
25
+ ----------
26
+ y_true : array-like, binary ground truth labels (0/1).
27
+ y_pred_proba : array-like, predicted probability of the positive (1) class.
28
+ cost_fp : float, optional. Business cost of one false positive.
29
+ cost_fn : float, optional. Business cost of one false negative.
30
+ If both cost_fp and cost_fn are given, a cost-sensitive threshold
31
+ is included in the report.
32
+ n_bins : int, bins used for calibration scoring.
33
+
34
+ Returns
35
+ -------
36
+ dict
37
+ {
38
+ "minority_class": {...},
39
+ "calibration_error": float,
40
+ "default_threshold_0.5": {...},
41
+ "optimal_f1_threshold": {...},
42
+ "cost_sensitive_threshold": {...} or None,
43
+ "verdict": str
44
+ }
45
+ """
46
+ y_true = np.asarray(y_true)
47
+ y_pred_proba = np.asarray(y_pred_proba)
48
+ preds_at_half = (y_pred_proba >= 0.5).astype(int)
49
+
50
+ minority = minority_class_report(y_true, preds_at_half)
51
+ ece = calibration_score(y_true, y_pred_proba, n_bins=n_bins)
52
+ f1_opt = optimal_threshold(y_true, y_pred_proba)
53
+
54
+ cost_result = None
55
+ if cost_fp is not None and cost_fn is not None:
56
+ cost_result = cost_sensitive_threshold(y_true, y_pred_proba, cost_fp, cost_fn)
57
+
58
+ verdict = _build_verdict(minority, ece, f1_opt)
59
+
60
+ return {
61
+ "minority_class": minority,
62
+ "calibration_error": ece,
63
+ "optimal_f1_threshold": f1_opt,
64
+ "cost_sensitive_threshold": cost_result,
65
+ "verdict": verdict,
66
+ }
67
+
68
+
69
+ def _build_verdict(minority: dict, ece: float, f1_opt: dict) -> str:
70
+ flags = []
71
+ if minority["recall"] < 0.5:
72
+ flags.append("minority-class recall is below 50% at the default 0.5 threshold")
73
+ if ece > 0.1:
74
+ flags.append(f"calibration error is high (ECE={ece:.3f}); confidence scores are not trustworthy")
75
+ if f1_opt["score"] - minority["f1"] > 0.15:
76
+ flags.append("default 0.5 threshold is far from optimal; consider using optimal_f1_threshold")
77
+
78
+ if not flags:
79
+ return "Looks production-ready on the dimensions checked. Validate further on a held-out set."
80
+ return "Not yet production-ready: " + "; ".join(flags) + "."
imbeval/threshold.py ADDED
@@ -0,0 +1,83 @@
1
+ """Decision threshold tuning for imbalanced problems."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import numpy as np
6
+ from sklearn.metrics import f1_score
7
+
8
+
9
+ def optimal_threshold(y_true, y_pred_proba, metric: str = "f1", n_steps: int = 200):
10
+ """
11
+ Sweep decision thresholds and return the one that maximizes the
12
+ chosen metric. Default metric is 0.5-agnostic F1, which is usually
13
+ a far better default than 0.5 on imbalanced data.
14
+
15
+ Parameters
16
+ ----------
17
+ y_true : array-like, binary ground truth.
18
+ y_pred_proba : array-like, predicted probability of positive class.
19
+ metric : {"f1"} currently supported.
20
+ n_steps : int, number of thresholds to test between 0 and 1.
21
+
22
+ Returns
23
+ -------
24
+ dict with keys: threshold, score
25
+ """
26
+ y_true = np.asarray(y_true)
27
+ y_pred_proba = np.asarray(y_pred_proba)
28
+
29
+ thresholds = np.linspace(0.01, 0.99, n_steps)
30
+ best_threshold, best_score = 0.5, -1.0
31
+
32
+ for t in thresholds:
33
+ preds = (y_pred_proba >= t).astype(int)
34
+ if metric == "f1":
35
+ score = f1_score(y_true, preds, zero_division=0)
36
+ else:
37
+ raise ValueError(f"Unsupported metric: {metric}")
38
+ if score > best_score:
39
+ best_score, best_threshold = score, t
40
+
41
+ return {"threshold": float(best_threshold), "score": float(best_score)}
42
+
43
+
44
+ def cost_sensitive_threshold(y_true, y_pred_proba, cost_fp: float, cost_fn: float, n_steps: int = 200):
45
+ """
46
+ Find the decision threshold that minimizes total business cost,
47
+ given the real-world cost of a false positive vs a false negative.
48
+
49
+ This is usually what people actually want on imbalanced data
50
+ (e.g. fraud: missing fraud is far costlier than a false alarm).
51
+
52
+ Parameters
53
+ ----------
54
+ y_true : array-like, binary ground truth.
55
+ y_pred_proba : array-like, predicted probability of positive class.
56
+ cost_fp : float, cost incurred per false positive.
57
+ cost_fn : float, cost incurred per false negative.
58
+ n_steps : int
59
+
60
+ Returns
61
+ -------
62
+ dict with keys: threshold, total_cost, false_positives, false_negatives
63
+ """
64
+ y_true = np.asarray(y_true)
65
+ y_pred_proba = np.asarray(y_pred_proba)
66
+
67
+ thresholds = np.linspace(0.01, 0.99, n_steps)
68
+ best = {"threshold": 0.5, "total_cost": float("inf"), "false_positives": 0, "false_negatives": 0}
69
+
70
+ for t in thresholds:
71
+ preds = (y_pred_proba >= t).astype(int)
72
+ fp = int(np.sum((preds == 1) & (y_true == 0)))
73
+ fn = int(np.sum((preds == 0) & (y_true == 1)))
74
+ total_cost = fp * cost_fp + fn * cost_fn
75
+ if total_cost < best["total_cost"]:
76
+ best = {
77
+ "threshold": float(t),
78
+ "total_cost": float(total_cost),
79
+ "false_positives": fp,
80
+ "false_negatives": fn,
81
+ }
82
+
83
+ return best
@@ -0,0 +1,118 @@
1
+ Metadata-Version: 2.4
2
+ Name: imbeval
3
+ Version: 0.1.0
4
+ Summary: Honest, production-readiness evaluation for imbalanced classification models.
5
+ Project-URL: Homepage, https://github.com/sricodings
6
+ Project-URL: Repository, https://github.com/sricodings/imbeval
7
+ Project-URL: Issues, https://github.com/sricodings/imbeval/issues
8
+ Project-URL: Documentation, https://github.com/sricodings/imbeval#readme
9
+ Author-email: Srikanth Sridhar <srisrikanthtvs@gmail.com>
10
+ License-Expression: MIT
11
+ License-File: LICENSE
12
+ Keywords: calibration,imbalanced-classification,machine-learning,model-evaluation,threshold-tuning
13
+ Classifier: Development Status :: 3 - Alpha
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: Intended Audience :: Science/Research
16
+ Classifier: License :: OSI Approved :: MIT License
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.9
19
+ Classifier: Programming Language :: Python :: 3.10
20
+ Classifier: Programming Language :: Python :: 3.11
21
+ Classifier: Programming Language :: Python :: 3.12
22
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
23
+ Requires-Python: >=3.9
24
+ Requires-Dist: numpy>=1.21
25
+ Requires-Dist: scikit-learn>=1.0
26
+ Provides-Extra: dev
27
+ Requires-Dist: build; extra == 'dev'
28
+ Requires-Dist: pytest>=7.0; extra == 'dev'
29
+ Requires-Dist: twine; extra == 'dev'
30
+ Description-Content-Type: text/markdown
31
+
32
+ # imbeval
33
+
34
+ **Honest production-readiness evaluation for imbalanced classification models.**
35
+
36
+ Standard metric libraries hand you precision/recall/F1 and stop there. On imbalanced
37
+ data (fraud, churn, medical diagnosis, anomaly detection, rare-event prediction) that's
38
+ not enough to know if a model is actually safe to ship. `imbeval` answers the real
39
+ question: **is this model usable in production, and at what threshold?**
40
+
41
+ It combines three things most teams check manually and inconsistently:
42
+
43
+ 1. **Minority-class performance** — not buried inside macro-averages.
44
+ 2. **Calibration quality** — is the model's confidence trustworthy, or just confidently wrong?
45
+ 3. **Threshold tuning** — the default 0.5 threshold is almost always wrong on imbalanced data; `imbeval` finds a better one, optionally weighted by real business cost (cost of a false positive vs a false negative).
46
+
47
+ ## Install
48
+
49
+ ```bash
50
+ pip install imbeval
51
+ ```
52
+
53
+ (Once published — see the [publishing guide](docs/publishing.md) if you're building this from source.)
54
+
55
+ ## Quickstart
56
+
57
+ ```python
58
+ from imbeval import evaluation_report
59
+
60
+ # y_true: ground truth labels (0/1)
61
+ # y_pred_proba: predicted probability of the positive class, from model.predict_proba(X)[:, 1]
62
+ report = evaluation_report(
63
+ y_true,
64
+ y_pred_proba,
65
+ cost_fp=1, # cost of a false alarm
66
+ cost_fn=25, # cost of missing a true positive (e.g. missed fraud)
67
+ )
68
+
69
+ print(report["verdict"])
70
+ print(report["minority_class"])
71
+ print(report["optimal_f1_threshold"])
72
+ print(report["cost_sensitive_threshold"])
73
+ ```
74
+
75
+ Example output:
76
+
77
+ ```
78
+ Not yet production-ready: minority-class recall is below 50% at the default 0.5 threshold;
79
+ default 0.5 threshold is far from optimal; consider using optimal_f1_threshold.
80
+ ```
81
+
82
+ ## What's inside
83
+
84
+ | Function | What it does |
85
+ |---|---|
86
+ | `evaluation_report(y_true, y_pred_proba, ...)` | One combined report + plain-English verdict |
87
+ | `minority_class_report(y_true, y_pred)` | Precision/recall/F1 focused on the minority class |
88
+ | `per_class_confidence(y_true, y_pred_proba)` | Mean model confidence per true class |
89
+ | `calibration_score(y_true, y_pred_proba)` | Expected Calibration Error (ECE) |
90
+ | `reliability_curve(y_true, y_pred_proba)` | Data for plotting a reliability diagram |
91
+ | `optimal_threshold(y_true, y_pred_proba)` | Best decision threshold by F1 |
92
+ | `cost_sensitive_threshold(y_true, y_pred_proba, cost_fp, cost_fn)` | Best threshold by real business cost |
93
+
94
+ Full API reference: [docs/api.md](docs/api.md)
95
+ Usage guide and recipes: [docs/usage.md](docs/usage.md)
96
+ Publishing this package yourself: [docs/publishing.md](docs/publishing.md)
97
+
98
+ ## Why this exists
99
+
100
+ Most "imbalanced learning" tools (e.g. `imbalanced-learn`) focus on *fixing* the data
101
+ (SMOTE and friends). `imbeval` focuses on the other end of the pipeline: telling you
102
+ honestly whether the *model* you already trained is good enough, and at what threshold,
103
+ once class imbalance is in play. It's meant to sit right before you ship.
104
+
105
+ ## Status
106
+
107
+ Early (v0.1.0). The core API (`evaluation_report`, threshold tools, calibration tools)
108
+ is stable for binary classification. Multi-class support is on the roadmap — see
109
+ [CHANGELOG.md](CHANGELOG.md).
110
+
111
+ ## Contributing
112
+
113
+ Issues and PRs welcome once the repo is public. See [docs/usage.md](docs/usage.md) for
114
+ how the modules fit together if you want to extend it.
115
+
116
+ ## License
117
+
118
+ MIT — see [LICENSE](LICENSE).
@@ -0,0 +1,9 @@
1
+ imbeval/__init__.py,sha256=NymkrvR3Vy6NqLS8-h7dAGRAiE_Lw0Ahr7Tp2kqzN3I,911
2
+ imbeval/calibration.py,sha256=-el42gPBunXNLiTuZKG8GccoM3ln_-wG1jJBSbn6fQY,2092
3
+ imbeval/metrics.py,sha256=qLlpAz4ywASVwzLicgCRQbaVpx_y5RRL33rtkRUoLxQ,2622
4
+ imbeval/report.py,sha256=1BaUaCjHM8CPkNjcFB0jQ1qHgG5GbwUMD6Kas4qA1w4,2803
5
+ imbeval/threshold.py,sha256=ArmCI8qqWjahZ6B9mWdn8uNlFBsyusHVQDQyeZz3UvM,2826
6
+ imbeval-0.1.0.dist-info/METADATA,sha256=YUkBP1vQ5SwKSFcNsVzmguOvQurwk2mFp16VoaluYt8,4756
7
+ imbeval-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
8
+ imbeval-0.1.0.dist-info/licenses/LICENSE,sha256=WUUJ80e95dG_vXeLjj31QjwMviIlbhZLTGRl3bcM_5k,1073
9
+ imbeval-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.30.1
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Srikanth Sridhar
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.