imbeval 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project are documented here.
4
+ Format follows [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), versioning follows [SemVer](https://semver.org/).
5
+
6
+ ## [0.1.0] - 2026-06-26
7
+
8
+ ### Added
9
+ - `evaluation_report`: combined production-readiness report with plain-English verdict.
10
+ - `minority_class_report`: precision/recall/F1/confusion matrix focused on the minority class.
11
+ - `per_class_confidence`: mean predicted confidence per true class.
12
+ - `calibration_score` / `reliability_curve`: Expected Calibration Error and reliability diagram data.
13
+ - `optimal_threshold`: F1-maximizing decision threshold sweep.
14
+ - `cost_sensitive_threshold`: business-cost-minimizing decision threshold sweep.
15
+ - Initial test suite (8 tests, all passing).
16
+ - Full docs: README, usage guide, API reference, publishing guide.
17
+
18
+ ### Roadmap (not yet implemented)
19
+ - Multi-class cost-sensitive thresholding (currently binary only).
20
+ - Built-in matplotlib plotting helper for `reliability_curve`.
21
+ - Bootstrap confidence intervals on all reported metrics.
imbeval-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Srikanth Sridhar
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
imbeval-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,118 @@
1
+ Metadata-Version: 2.4
2
+ Name: imbeval
3
+ Version: 0.1.0
4
+ Summary: Honest, production-readiness evaluation for imbalanced classification models.
5
+ Project-URL: Homepage, https://github.com/sricodings
6
+ Project-URL: Repository, https://github.com/sricodings/imbeval
7
+ Project-URL: Issues, https://github.com/sricodings/imbeval/issues
8
+ Project-URL: Documentation, https://github.com/sricodings/imbeval#readme
9
+ Author-email: Srikanth Sridhar <srisrikanthtvs@gmail.com>
10
+ License-Expression: MIT
11
+ License-File: LICENSE
12
+ Keywords: calibration,imbalanced-classification,machine-learning,model-evaluation,threshold-tuning
13
+ Classifier: Development Status :: 3 - Alpha
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: Intended Audience :: Science/Research
16
+ Classifier: License :: OSI Approved :: MIT License
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.9
19
+ Classifier: Programming Language :: Python :: 3.10
20
+ Classifier: Programming Language :: Python :: 3.11
21
+ Classifier: Programming Language :: Python :: 3.12
22
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
23
+ Requires-Python: >=3.9
24
+ Requires-Dist: numpy>=1.21
25
+ Requires-Dist: scikit-learn>=1.0
26
+ Provides-Extra: dev
27
+ Requires-Dist: build; extra == 'dev'
28
+ Requires-Dist: pytest>=7.0; extra == 'dev'
29
+ Requires-Dist: twine; extra == 'dev'
30
+ Description-Content-Type: text/markdown
31
+
32
+ # imbeval
33
+
34
+ **Honest production-readiness evaluation for imbalanced classification models.**
35
+
36
+ Standard metric libraries hand you precision/recall/F1 and stop there. On imbalanced
37
+ data (fraud, churn, medical diagnosis, anomaly detection, rare-event prediction) that's
38
+ not enough to know if a model is actually safe to ship. `imbeval` answers the real
39
+ question: **is this model usable in production, and at what threshold?**
40
+
41
+ It combines three things most teams check manually and inconsistently:
42
+
43
+ 1. **Minority-class performance** — not buried inside macro-averages.
44
+ 2. **Calibration quality** — is the model's confidence trustworthy, or just confidently wrong?
45
+ 3. **Threshold tuning** — the default 0.5 threshold is almost always wrong on imbalanced data; `imbeval` finds a better one, optionally weighted by real business cost (cost of a false positive vs a false negative).
46
+
47
+ ## Install
48
+
49
+ ```bash
50
+ pip install imbeval
51
+ ```
52
+
53
+ (Once published — see the [publishing guide](docs/publishing.md) if you're building this from source.)
54
+
55
+ ## Quickstart
56
+
57
+ ```python
58
+ from imbeval import evaluation_report
59
+
60
+ # y_true: ground truth labels (0/1)
61
+ # y_pred_proba: predicted probability of the positive class, from model.predict_proba(X)[:, 1]
62
+ report = evaluation_report(
63
+ y_true,
64
+ y_pred_proba,
65
+ cost_fp=1, # cost of a false alarm
66
+ cost_fn=25, # cost of missing a true positive (e.g. missed fraud)
67
+ )
68
+
69
+ print(report["verdict"])
70
+ print(report["minority_class"])
71
+ print(report["optimal_f1_threshold"])
72
+ print(report["cost_sensitive_threshold"])
73
+ ```
74
+
75
+ Example output:
76
+
77
+ ```
78
+ Not yet production-ready: minority-class recall is below 50% at the default 0.5 threshold;
79
+ default 0.5 threshold is far from optimal; consider using optimal_f1_threshold.
80
+ ```
81
+
82
+ ## What's inside
83
+
84
+ | Function | What it does |
85
+ |---|---|
86
+ | `evaluation_report(y_true, y_pred_proba, ...)` | One combined report + plain-English verdict |
87
+ | `minority_class_report(y_true, y_pred)` | Precision/recall/F1 focused on the minority class |
88
+ | `per_class_confidence(y_true, y_pred_proba)` | Mean model confidence per true class |
89
+ | `calibration_score(y_true, y_pred_proba)` | Expected Calibration Error (ECE) |
90
+ | `reliability_curve(y_true, y_pred_proba)` | Data for plotting a reliability diagram |
91
+ | `optimal_threshold(y_true, y_pred_proba)` | Best decision threshold by F1 |
92
+ | `cost_sensitive_threshold(y_true, y_pred_proba, cost_fp, cost_fn)` | Best threshold by real business cost |
93
+
94
+ Full API reference: [docs/api.md](docs/api.md)
95
+ Usage guide and recipes: [docs/usage.md](docs/usage.md)
96
+ Publishing this package yourself: [docs/publishing.md](docs/publishing.md)
97
+
98
+ ## Why this exists
99
+
100
+ Most "imbalanced learning" tools (e.g. `imbalanced-learn`) focus on *fixing* the data
101
+ (SMOTE and friends). `imbeval` focuses on the other end of the pipeline: telling you
102
+ honestly whether the *model* you already trained is good enough, and at what threshold,
103
+ once class imbalance is in play. It's meant to sit right before you ship.
104
+
105
+ ## Status
106
+
107
+ Early (v0.1.0). The core API (`evaluation_report`, threshold tools, calibration tools)
108
+ is stable for binary classification. Multi-class support is on the roadmap — see
109
+ [CHANGELOG.md](CHANGELOG.md).
110
+
111
+ ## Contributing
112
+
113
+ Issues and PRs welcome once the repo is public. See [docs/usage.md](docs/usage.md) for
114
+ how the modules fit together if you want to extend it.
115
+
116
+ ## License
117
+
118
+ MIT — see [LICENSE](LICENSE).
@@ -0,0 +1,87 @@
1
+ # imbeval
2
+
3
+ **Honest production-readiness evaluation for imbalanced classification models.**
4
+
5
+ Standard metric libraries hand you precision/recall/F1 and stop there. On imbalanced
6
+ data (fraud, churn, medical diagnosis, anomaly detection, rare-event prediction) that's
7
+ not enough to know if a model is actually safe to ship. `imbeval` answers the real
8
+ question: **is this model usable in production, and at what threshold?**
9
+
10
+ It combines three things most teams check manually and inconsistently:
11
+
12
+ 1. **Minority-class performance** — not buried inside macro-averages.
13
+ 2. **Calibration quality** — is the model's confidence trustworthy, or just confidently wrong?
14
+ 3. **Threshold tuning** — the default 0.5 threshold is almost always wrong on imbalanced data; `imbeval` finds a better one, optionally weighted by real business cost (cost of a false positive vs a false negative).
15
+
16
+ ## Install
17
+
18
+ ```bash
19
+ pip install imbeval
20
+ ```
21
+
22
+ (Once published — see the [publishing guide](docs/publishing.md) if you're building this from source.)
23
+
24
+ ## Quickstart
25
+
26
+ ```python
27
+ from imbeval import evaluation_report
28
+
29
+ # y_true: ground truth labels (0/1)
30
+ # y_pred_proba: predicted probability of the positive class, from model.predict_proba(X)[:, 1]
31
+ report = evaluation_report(
32
+ y_true,
33
+ y_pred_proba,
34
+ cost_fp=1, # cost of a false alarm
35
+ cost_fn=25, # cost of missing a true positive (e.g. missed fraud)
36
+ )
37
+
38
+ print(report["verdict"])
39
+ print(report["minority_class"])
40
+ print(report["optimal_f1_threshold"])
41
+ print(report["cost_sensitive_threshold"])
42
+ ```
43
+
44
+ Example output:
45
+
46
+ ```
47
+ Not yet production-ready: minority-class recall is below 50% at the default 0.5 threshold;
48
+ default 0.5 threshold is far from optimal; consider using optimal_f1_threshold.
49
+ ```
50
+
51
+ ## What's inside
52
+
53
+ | Function | What it does |
54
+ |---|---|
55
+ | `evaluation_report(y_true, y_pred_proba, ...)` | One combined report + plain-English verdict |
56
+ | `minority_class_report(y_true, y_pred)` | Precision/recall/F1 focused on the minority class |
57
+ | `per_class_confidence(y_true, y_pred_proba)` | Mean model confidence per true class |
58
+ | `calibration_score(y_true, y_pred_proba)` | Expected Calibration Error (ECE) |
59
+ | `reliability_curve(y_true, y_pred_proba)` | Data for plotting a reliability diagram |
60
+ | `optimal_threshold(y_true, y_pred_proba)` | Best decision threshold by F1 |
61
+ | `cost_sensitive_threshold(y_true, y_pred_proba, cost_fp, cost_fn)` | Best threshold by real business cost |
62
+
63
+ Full API reference: [docs/api.md](docs/api.md)
64
+ Usage guide and recipes: [docs/usage.md](docs/usage.md)
65
+ Publishing this package yourself: [docs/publishing.md](docs/publishing.md)
66
+
67
+ ## Why this exists
68
+
69
+ Most "imbalanced learning" tools (e.g. `imbalanced-learn`) focus on *fixing* the data
70
+ (SMOTE and friends). `imbeval` focuses on the other end of the pipeline: telling you
71
+ honestly whether the *model* you already trained is good enough, and at what threshold,
72
+ once class imbalance is in play. It's meant to sit right before you ship.
73
+
74
+ ## Status
75
+
76
+ Early (v0.1.0). The core API (`evaluation_report`, threshold tools, calibration tools)
77
+ is stable for binary classification. Multi-class support is on the roadmap — see
78
+ [CHANGELOG.md](CHANGELOG.md).
79
+
80
+ ## Contributing
81
+
82
+ Issues and PRs welcome once the repo is public. See [docs/usage.md](docs/usage.md) for
83
+ how the modules fit together if you want to extend it.
84
+
85
+ ## License
86
+
87
+ MIT — see [LICENSE](LICENSE).
@@ -0,0 +1,92 @@
1
+ # API Reference
2
+
3
+ All public functions are importable directly from `imbeval`.
4
+
5
+ ---
6
+
7
+ ### `evaluation_report(y_true, y_pred_proba, cost_fp=None, cost_fn=None, n_bins=10)`
8
+
9
+ Combined production-readiness report.
10
+
11
+ **Parameters**
12
+ - `y_true` (array-like): binary ground truth labels (0/1).
13
+ - `y_pred_proba` (array-like): predicted probability of the positive class.
14
+ - `cost_fp` (float, optional): cost of one false positive. Required together with `cost_fn` to get a cost-sensitive threshold.
15
+ - `cost_fn` (float, optional): cost of one false negative.
16
+ - `n_bins` (int, default 10): bins used for calibration scoring.
17
+
18
+ **Returns** `dict`:
19
+ ```python
20
+ {
21
+ "minority_class": {...}, # see minority_class_report
22
+ "calibration_error": 0.04, # float, ECE
23
+ "optimal_f1_threshold": {...}, # see optimal_threshold
24
+ "cost_sensitive_threshold": {...} or None,
25
+ "verdict": "..." # plain-English summary
26
+ }
27
+ ```
28
+
29
+ ---
30
+
31
+ ### `minority_class_report(y_true, y_pred, classes=None, minority_label=None)`
32
+
33
+ Precision/recall/F1/support/confusion matrix focused on the minority class.
34
+
35
+ **Parameters**
36
+ - `y_true`, `y_pred` (array-like): true and predicted integer labels.
37
+ - `minority_label`: explicit label to treat as minority. If omitted, auto-detected as the class with lowest support.
38
+
39
+ **Returns** `dict` with keys: `minority_label`, `support`, `precision`, `recall`, `f1`, `confusion_matrix`.
40
+
41
+ ---
42
+
43
+ ### `per_class_confidence(y_true, y_pred_proba, classes=None)`
44
+
45
+ Mean predicted probability the model assigns to the *true* class, per class.
46
+
47
+ **Parameters**
48
+ - `y_true` (array-like): integer class labels.
49
+ - `y_pred_proba` (array-like, shape `(n_samples, n_classes)`): predicted probabilities.
50
+ - `classes` (list, optional): display labels for each class index.
51
+
52
+ **Returns** `dict` mapping class label → mean confidence (or `None` if no samples for that class).
53
+
54
+ ---
55
+
56
+ ### `calibration_score(y_true, y_pred_proba, n_bins=10)`
57
+
58
+ Expected Calibration Error (ECE). 0 = perfectly calibrated.
59
+
60
+ **Returns** `float`.
61
+
62
+ ---
63
+
64
+ ### `reliability_curve(y_true, y_pred_proba, n_bins=10)`
65
+
66
+ Binned data for plotting a reliability diagram.
67
+
68
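+ **Returns** `dict` with keys `bin_confidence`, `bin_accuracy`, `bin_count` (each a list of length `n_bins`). For a bin that receives no predictions, `bin_count` is `0` and both `bin_confidence` and `bin_accuracy` are `None`, so filter those entries out before plotting.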
69
+
70
+ ---
71
+
72
+ ### `optimal_threshold(y_true, y_pred_proba, metric="f1", n_steps=200)`
73
+
74
+ Sweeps thresholds and returns the one maximizing the chosen metric.
75
+
76
+ **Returns** `dict`: `{"threshold": float, "score": float}`.
77
+
78
+ ---
79
+
80
+ ### `cost_sensitive_threshold(y_true, y_pred_proba, cost_fp, cost_fn, n_steps=200)`
81
+
82
+ Finds the threshold minimizing `fp * cost_fp + fn * cost_fn`.
83
+
84
+ **Returns** `dict`: `{"threshold": float, "total_cost": float, "false_positives": int, "false_negatives": int}`.
85
+
86
+ ---
87
+
88
+ ## Type conventions
89
+
90
+ - All `y_true` / `y_pred` for binary functions use `0`/`1` integer labels.
91
+ - All `y_pred_proba` for binary functions is the probability of the **positive (1)** class — i.e. `model.predict_proba(X)[:, 1]`.
92
+ - `per_class_confidence` is the only function expecting a full `(n_samples, n_classes)` probability matrix, for multi-class use.
@@ -0,0 +1,120 @@
1
+ # Publishing Guide (for first-time package authors)
2
+
3
+ This walks you through every step to get `imbeval` (or any Python package you build
4
+ this way) onto GitHub and PyPI, properly licensed, with zero prior packaging experience.
5
+
6
+ ## 0. Before you publish anything
7
+
8
+ - Double-check the name isn't taken: https://pypi.org/project/imbeval/ — if it's taken, rename it everywhere (`pyproject.toml`'s `name`, the folder under `src/`, imports in tests/docs).
9
+ - Make sure tests pass locally (we already verified 8/8 pass for this package).
10
+ - Fill in real values in `pyproject.toml`: your name, email, and GitHub URL (currently placeholders).
11
+
12
+ ## 1. Choose and apply a license
13
+
14
+ You already have an MIT `LICENSE` file in this package — MIT is the most permissive,
15
+ common choice for libraries because it lets anyone use, modify, and redistribute your
16
+ code (even commercially) as long as they keep your copyright notice. This maximizes
17
+ adoption, which is what you want for a library you hope many people use.
18
+
19
+ If you want an explicit patent grant while staying permissive, use Apache 2.0 (popular
20
+ for ML tooling). If you'd rather require derivative works to stay open-source, use GPLv3
21
+ (strongest copyleft, but it discourages commercial adoption — usually a bad choice for "I want everyone to use this").
22
+
23
+ You do **not** need to register or pay for anything to apply a license — putting the
24
+ `LICENSE` file in your repo root and referencing it in `pyproject.toml` (`license = "MIT"`)
25
+ is legally sufficient. GitHub will also detect and display it automatically.
26
+
27
+ ## 2. Put the code on GitHub
28
+
29
+ ```bash
30
+ cd imbeval
31
+ git init
32
+ git add .
33
+ git commit -m "Initial commit: imbeval v0.1.0"
34
+ gh repo create imbeval --public --source=. --remote=origin
35
+ git push -u origin main
36
+ ```
37
+
38
+ (No `gh` CLI? Create the repo manually on github.com, then `git remote add origin <url>` and `git push -u origin main`.)
39
+
40
+ ## 3. Create a PyPI account and API token
41
+
42
+ 1. Register at https://pypi.org/account/register/ (and verify your email).
43
+ 2. Enable 2FA (PyPI requires it for publishing as of recent policy).
44
+ 3. Go to Account Settings → API tokens → "Add API token". Scope it to "Entire account" for your first upload (you can scope it to just this project after the first release).
45
+ 4. Save the token somewhere safe — it's shown once.
46
+
47
+ It's good practice to also register on **TestPyPI** (https://test.pypi.org) first, to do a dry run without polluting the real index.
48
+
49
+ ## 4. Build the distribution files
50
+
51
+ From the project root:
52
+
53
+ ```bash
54
+ pip install --upgrade build twine
55
+ python -m build
56
+ ```
57
+
58
+ This creates a `dist/` folder with a `.tar.gz` (source distribution) and a `.whl`
59
+ (wheel) — these are the files that actually get uploaded to PyPI.
60
+
61
+ ## 5. (Recommended) Upload to TestPyPI first
62
+
63
+ ```bash
64
+ python -m twine upload --repository testpypi dist/*
65
+ ```
66
+
67
+ It will prompt for a username (`__token__`) and password (your TestPyPI token).
68
+ Then verify it installs cleanly:
69
+
70
+ ```bash
71
+ pip install --index-url https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple/ imbeval
72
+ ```
73
+
74
+ ## 6. Publish to the real PyPI
75
+
76
+ ```bash
77
+ python -m twine upload dist/*
78
+ ```
79
+
80
+ Username: `__token__`. Password: your PyPI API token (starts with `pypi-`).
81
+
82
+ Once this succeeds, anyone in the world can run:
83
+
84
+ ```bash
85
+ pip install imbeval
86
+ ```
87
+
88
+ ## 7. Tag the release on GitHub
89
+
90
+ ```bash
91
+ git tag v0.1.0
92
+ git push origin v0.1.0
93
+ ```
94
+
95
+ Then create a "Release" on GitHub from that tag with short release notes — this is
96
+ what shows up when people check "is this actively maintained."
97
+
98
+ ## 8. For every future update
99
+
100
+ 1. Make your changes, add/update tests, run `pytest` — all green before proceeding.
101
+ 2. Bump the version in `pyproject.toml` (and `__version__` in `__init__.py`) — follow [semantic versioning](https://semver.org/): patch (0.1.1) for fixes, minor (0.2.0) for new backward-compatible features, major (1.0.0) for breaking changes.
102
+ 3. Update `CHANGELOG.md`.
103
+ 4. `rm -rf dist/ && python -m build && python -m twine upload dist/*`
104
+ 5. Tag and release on GitHub as in step 7.
105
+
106
+ ## 9. Getting people to actually use it
107
+
108
+ A license and a PyPI listing alone won't get adoption — that comes from people
109
+ finding it useful and finding out it exists:
110
+
111
+ - Write one clear, narrow blog post: "Why your imbalanced classifier's 99% accuracy is lying to you" — demonstrating the exact problem this library solves, ending with a 3-line code example.
112
+ - Post it where the target audience already is: relevant subreddits (r/MachineLearning, r/datascience), a Show HN on Hacker News, relevant Discord/Slack ML communities.
113
+ - Add a badge-rich README (build status, PyPI version, license) — people skim-judge trust from this in seconds. Shields.io provides free badges once you have CI set up.
114
+ - Respond to every GitHub issue quickly for the first few months — early responsiveness is the single biggest driver of whether a library gets a second look.
115
+
116
+ ## Common first-publish mistakes to avoid
117
+
118
+ - Forgetting to bump the version before re-uploading — PyPI rejects re-uploading the same version number, even for a typo fix.
119
+ - Uploading secrets/API tokens accidentally committed in the repo — double check `git log` and add a `.gitignore` (covering files such as `.env` and `.pypirc`) before your first push.
120
+ - Publishing without a `README.md` that has a working code example — this is the #1 reason people bounce off a new package's PyPI page.
@@ -0,0 +1,107 @@
1
+ # Usage Guide
2
+
3
+ ## 1. The core workflow
4
+
5
+ ```python
6
+ from sklearn.linear_model import LogisticRegression
7
+ from imbeval import evaluation_report
8
+
9
+ model = LogisticRegression().fit(X_train, y_train)
10
+ y_pred_proba = model.predict_proba(X_test)[:, 1]
11
+
12
+ report = evaluation_report(y_test, y_pred_proba, cost_fp=1, cost_fn=20)
13
+ print(report["verdict"])
14
+ ```
15
+
16
+ `evaluation_report` is the one function most people need. It is intentionally
17
+ opinionated: it tells you in plain English whether your model has a problem,
18
+ not just a pile of numbers to interpret yourself.
19
+
20
+ ## 2. Reading the minority-class report
21
+
22
+ ```python
23
+ from imbeval import minority_class_report
24
+
25
+ preds = (y_pred_proba >= 0.5).astype(int)
26
+ report = minority_class_report(y_test, preds)
27
+ print(report)
28
+ # {'minority_label': 1, 'support': 48, 'precision': 0.62, 'recall': 0.41, 'f1': 0.5, ...}
29
+ ```
30
+
31
+ If you already know which label is the minority/rare class, pass it explicitly:
32
+
33
+ ```python
34
+ minority_class_report(y_test, preds, minority_label=1)
35
+ ```
36
+
37
+ ## 3. Checking calibration
38
+
39
+ A model can have great accuracy and still be miscalibrated — e.g. it says
40
+ "90% confident" but is only right 60% of the time at that confidence level.
41
+ This matters a lot if you use the probability output for downstream decisions
42
+ (e.g. ranking leads, setting alert priority).
43
+
44
+ ```python
45
+ from imbeval import calibration_score, reliability_curve
46
+
47
+ ece = calibration_score(y_test, y_pred_proba)
48
+ print(f"Expected Calibration Error: {ece:.3f}") # 0 = perfect, >0.1 = concerning
49
+
50
+ curve = reliability_curve(y_test, y_pred_proba, n_bins=10)
51
+ # Plot curve["bin_confidence"] vs curve["bin_accuracy"] for a reliability diagram
52
+ ```
53
+
54
+ ## 4. Picking a better threshold than 0.5
55
+
56
+ The 0.5 default threshold assumes balanced classes and equal costs. Neither
57
+ is usually true for imbalanced problems.
58
+
59
+ ```python
60
+ from imbeval import optimal_threshold
61
+
62
+ result = optimal_threshold(y_test, y_pred_proba)
63
+ print(result) # {'threshold': 0.27, 'score': 0.71}
64
+ ```
65
+
66
+ ## 5. Cost-sensitive thresholding (the most useful function for real decisions)
67
+
68
+ If you know roughly what a false positive and a false negative cost your
69
+ business, use this instead — it directly minimizes cost rather than an
70
+ abstract metric.
71
+
72
+ ```python
73
+ from imbeval import cost_sensitive_threshold
74
+
75
+ # Example: fraud detection. A false alarm costs $1 in review time.
76
+ # A missed fraud case costs $200 on average.
77
+ result = cost_sensitive_threshold(y_test, y_pred_proba, cost_fp=1, cost_fn=200)
78
+ print(result)
79
+ # {'threshold': 0.08, 'total_cost': 1340.0, 'false_positives': 210, 'false_negatives': 6}
80
+ ```
81
+
82
+ This typically pushes the threshold much lower than 0.5 when false negatives
83
+ are expensive — which is the common case in fraud, medical screening, and
84
+ safety-critical anomaly detection.
85
+
86
+ ## 6. Multi-class confidence breakdown
87
+
88
+ ```python
89
+ from imbeval import per_class_confidence
90
+
91
+ confidences = per_class_confidence(y_test, y_pred_proba_matrix, classes=["normal", "fraud", "abuse"])
92
+ print(confidences)
93
+ # {'normal': 0.91, 'fraud': 0.58, 'abuse': 0.44}
94
+ ```
95
+
96
+ A low number for a specific class tells you the model is unsure specifically
97
+ about that class, even if overall accuracy looks fine.
98
+
99
+ ## Common pitfalls this library is built to catch
100
+
101
+ - **"99% accuracy" on a 1%-positive-rate dataset** — `minority_class_report`
102
+ exposes that this can mean the model just predicts the majority class always.
103
+ - **High AUC, useless probabilities** — `calibration_score` catches this even
104
+ when ranking metrics look great.
105
+ - **Using 0.5 by default out of habit** — `optimal_threshold` and
106
+ `cost_sensitive_threshold` replace that habit with a number backed by your
107
+ actual data and actual costs.
@@ -0,0 +1,43 @@
1
+ """
2
+ Quickstart example for imbeval.
3
+
4
+ Run with: python examples/quickstart.py
5
+ """
6
+
7
+ import numpy as np
8
+ from sklearn.linear_model import LogisticRegression
9
+ from sklearn.model_selection import train_test_split
10
+ from sklearn.datasets import make_classification
11
+
12
+ from imbeval import evaluation_report
13
+
14
# Synthetic stand-in for a rare-event problem such as fraud detection:
# the class weights put roughly 5% of the samples in the positive class.
X, y = make_classification(
    random_state=42,
    flip_y=0.02,
    weights=[0.95, 0.05],
    n_features=20,
    n_samples=5000,
)

# Stratify on the labels so train and test keep the same class ratio.
split = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = split

classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train, y_train)

# Column 1 of predict_proba is the positive-class probability that
# evaluation_report expects.
positive_proba = classifier.predict_proba(X_test)[:, 1]

# A missed positive is priced 25x higher than a false alarm.
report = evaluation_report(y_test, positive_proba, cost_fp=1, cost_fn=25)

banner = "=" * 60
print(banner)
print("VERDICT:", report["verdict"])
print(banner)
print("Minority class report:", report["minority_class"])
print("Calibration error (ECE):", round(report["calibration_error"], 4))
print("Optimal F1 threshold:", report["optimal_f1_threshold"])
print("Cost-sensitive threshold:", report["cost_sensitive_threshold"])
@@ -0,0 +1,43 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "imbeval"
7
+ version = "0.1.0"
8
+ description = "Honest, production-readiness evaluation for imbalanced classification models."
9
+ readme = "README.md"
10
+ license = "MIT"
11
+ requires-python = ">=3.9"
12
+ authors = [
13
+ { name = "Srikanth Sridhar", email = "srisrikanthtvs@gmail.com" }
14
+ ]
15
+ keywords = ["machine-learning", "imbalanced-classification", "model-evaluation", "calibration", "threshold-tuning"]
16
+ classifiers = [
17
+ "Development Status :: 3 - Alpha",
18
+ "Intended Audience :: Developers",
19
+ "Intended Audience :: Science/Research",
20
+ "License :: OSI Approved :: MIT License",
21
+ "Programming Language :: Python :: 3",
22
+ "Programming Language :: Python :: 3.9",
23
+ "Programming Language :: Python :: 3.10",
24
+ "Programming Language :: Python :: 3.11",
25
+ "Programming Language :: Python :: 3.12",
26
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
27
+ ]
28
+ dependencies = [
29
+ "numpy>=1.21",
30
+ "scikit-learn>=1.0",
31
+ ]
32
+
33
+ [project.urls]
34
+ Homepage = "https://github.com/sricodings"
35
+ Repository = "https://github.com/sricodings/imbeval"
36
+ Issues = "https://github.com/sricodings/imbeval/issues"
37
+ Documentation = "https://github.com/sricodings/imbeval#readme"
38
+
39
+ [project.optional-dependencies]
40
+ dev = ["pytest>=7.0", "build", "twine"]
41
+
42
+ [tool.hatch.build.targets.wheel]
43
+ packages = ["src/imbeval"]
@@ -0,0 +1,27 @@
1
+ """
2
+ imbeval — Honest evaluation for imbalanced classification models.
3
+
4
+ Most metric libraries hand you numbers; they don't tell you whether your
5
+ model is actually safe to ship on imbalanced data (fraud, medical, anomaly
6
+ detection, churn, etc). imbeval combines per-class confidence, calibration
7
+ quality, and cost-sensitive thresholding into one report so you can answer
8
+ the real question: "is this model usable in production?"
9
+ """
10
+
11
+ from .report import evaluation_report
12
+ from .calibration import calibration_score, reliability_curve
13
+ from .threshold import optimal_threshold, cost_sensitive_threshold
14
+ from .metrics import per_class_confidence, minority_class_report
15
+
16
+ __version__ = "0.1.0"
17
+
18
+ __all__ = [
19
+ "evaluation_report",
20
+ "calibration_score",
21
+ "reliability_curve",
22
+ "optimal_threshold",
23
+ "cost_sensitive_threshold",
24
+ "per_class_confidence",
25
+ "minority_class_report",
26
+ "__version__",
27
+ ]
@@ -0,0 +1,66 @@
1
+ """Calibration quality checks — is the model's confidence trustworthy?"""
2
+
3
+ from __future__ import annotations
4
+
5
+ import numpy as np
6
+
7
+
8
def reliability_curve(y_true, y_pred_proba, n_bins: int = 10):
    """
    Group predictions into equal-width confidence bins over [0, 1] and
    report, per bin, the mean predicted probability next to the mean of
    the true labels. The result is the raw data for a reliability diagram.

    Every bin is half-open ``[lo, hi)`` except the last one, which also
    includes its upper edge so that a probability of exactly 1.0 is
    counted. Values that satisfy none of the bin conditions (for example
    anything outside [0, 1]) are not counted in any bin.

    Parameters
    ----------
    y_true : array-like, binary (0/1) ground truth.
    y_pred_proba : array-like, predicted probability of the positive class.
    n_bins : int
        Number of equal-width bins.

    Returns
    -------
    dict with keys: bin_confidence, bin_accuracy, bin_count
        Three lists with one entry per bin. For an empty bin the count is
        0 and both the confidence and accuracy entries are ``None``.
    """
    labels = np.asarray(y_true)
    probs = np.asarray(y_pred_proba)

    edges = np.linspace(0.0, 1.0, n_bins + 1)
    final_bin = n_bins - 1

    confidences = []
    accuracies = []
    counts = []

    for position, (lower, upper) in enumerate(zip(edges[:-1], edges[1:])):
        # Only the final bin is closed on the right.
        if position < final_bin:
            under_upper = probs < upper
        else:
            under_upper = probs <= upper
        members = (probs >= lower) & under_upper

        size = int(members.sum())
        counts.append(size)
        if size != 0:
            confidences.append(float(np.mean(probs[members])))
            accuracies.append(float(np.mean(labels[members])))
        else:
            confidences.append(None)
            accuracies.append(None)

    return {
        "bin_confidence": confidences,
        "bin_accuracy": accuracies,
        "bin_count": counts,
    }
46
+
47
+
48
def calibration_score(y_true, y_pred_proba, n_bins: int = 10) -> float:
    """
    Expected Calibration Error (ECE).

    Each non-empty bin of the reliability curve contributes the absolute
    difference between its mean predicted probability and its observed
    accuracy, weighted by the share of binned samples it holds. Lower is
    better; 0 is perfect calibration. Returns 0.0 when no sample landed
    in any bin.
    """
    bins = reliability_curve(y_true, y_pred_proba, n_bins=n_bins)
    counts = bins["bin_count"]

    n_binned = sum(size for size in counts if size)
    if n_binned == 0:
        return 0.0

    weighted_gap = 0.0
    rows = zip(bins["bin_confidence"], bins["bin_accuracy"], counts)
    for confidence, accuracy, size in rows:
        # Empty bins carry None for confidence/accuracy and add nothing.
        if size != 0:
            weighted_gap += (size / n_binned) * abs(confidence - accuracy)
    return float(weighted_gap)
@@ -0,0 +1,87 @@
1
+ """Per-class confidence and minority-class focused metrics."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import numpy as np
6
+ from sklearn.metrics import (
7
+ precision_score,
8
+ recall_score,
9
+ f1_score,
10
+ confusion_matrix,
11
+ )
12
+
13
+
14
def per_class_confidence(y_true, y_pred_proba, classes=None):
    """
    Compute the mean predicted-probability "confidence" the model assigns
    to the correct class, broken down per class.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        True integer class labels (0..n_classes-1). Each label is used
        as a column index into ``y_pred_proba``.
    y_pred_proba : array-like of shape (n_samples, n_classes)
        Predicted probabilities from `model.predict_proba`.
    classes : list, optional
        Display label for each class index; must contain exactly
        ``n_classes`` entries. Defaults to ``[0, 1, ..., n_classes - 1]``.

    Returns
    -------
    dict
        Mapping of class label -> mean probability assigned to that class
        over the samples truly belonging to it, or ``None`` for a class
        with no samples in ``y_true``.

    Raises
    ------
    ValueError
        If ``y_pred_proba`` is not 2-D, the sample counts disagree, or
        ``classes`` does not have one entry per probability column.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred_proba = np.asarray(y_pred_proba)

    if y_pred_proba.ndim != 2:
        raise ValueError(
            "y_pred_proba must be 2-D with shape (n_samples, n_classes), "
            f"got {y_pred_proba.ndim} dimension(s)"
        )
    n_samples, n_classes = y_pred_proba.shape
    if y_true.shape[0] != n_samples:
        raise ValueError(
            f"y_true has {y_true.shape[0]} samples but y_pred_proba has {n_samples}"
        )

    if classes is None:
        classes = list(range(n_classes))
    elif len(classes) != n_classes:
        # A shorter list would silently drop classes; a longer one would
        # index past the last probability column.
        raise ValueError(
            f"classes has {len(classes)} entries but y_pred_proba has "
            f"{n_classes} columns"
        )

    result = {}
    for idx, label in enumerate(classes):
        mask = y_true == idx
        if not mask.any():
            result[label] = None
            continue
        result[label] = float(np.mean(y_pred_proba[mask, idx]))
    return result
48
+
49
+
50
def minority_class_report(y_true, y_pred, classes=None, minority_label=None):
    """
    Precision/recall/F1 with explicit emphasis on the minority class.

    If `minority_label` is not given, the class with the lowest support
    in `y_true` is auto-detected (ties resolve to the smallest label,
    because `np.unique` returns labels sorted and `np.argmin` takes the
    first minimum).

    Parameters
    ----------
    y_true : array-like, ground-truth labels.
    y_pred : array-like, predicted labels.
    classes : optional
        Currently unused; accepted so existing call sites keep working.
    minority_label : optional
        Label to report on. Auto-detected from `y_true` when omitted.

    Returns
    -------
    dict with keys: minority_label, support, precision, recall, f1,
    confusion_matrix
        All values are native Python objects (the confusion matrix is a
        nested list over every label), so the report can be serialised
        directly.

    Raises
    ------
    ValueError
        If `minority_label` is omitted and `y_true` is empty.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    if minority_label is None:
        if y_true.size == 0:
            raise ValueError("cannot auto-detect the minority class: y_true is empty")
        labels, counts = np.unique(y_true, return_counts=True)
        minority_label = labels[np.argmin(counts)]

    # Auto-detection yields a NumPy scalar; unwrap it so the report holds
    # only native Python values, like every other field.
    if isinstance(minority_label, np.generic):
        minority_label = minority_label.item()

    # Restricting `labels` to one class makes the macro average equal to
    # that class's own score.
    only_minority = {
        "labels": [minority_label],
        "average": "macro",
        "zero_division": 0,
    }
    precision = precision_score(y_true, y_pred, **only_minority)
    recall = recall_score(y_true, y_pred, **only_minority)
    f1 = f1_score(y_true, y_pred, **only_minority)
    support = int(np.sum(y_true == minority_label))

    return {
        "minority_label": minority_label,
        "support": support,
        "precision": float(precision),
        "recall": float(recall),
        "f1": float(f1),
        "confusion_matrix": confusion_matrix(y_true, y_pred).tolist(),
    }
@@ -0,0 +1,80 @@
1
+ """The single entry point: one honest report on production-readiness."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import numpy as np
6
+
7
+ from .calibration import calibration_score
8
+ from .threshold import optimal_threshold, cost_sensitive_threshold
9
+ from .metrics import minority_class_report
10
+
11
+
12
def evaluation_report(
    y_true,
    y_pred_proba,
    cost_fp: float | None = None,
    cost_fn: float | None = None,
    n_bins: int = 10,
):
    """
    Produce one combined evaluation report for a binary classifier on
    imbalanced data: minority-class performance, calibration quality,
    a tuned decision threshold, and (optionally) a cost-aware threshold.

    Parameters
    ----------
    y_true : array-like, binary ground truth labels (0/1).
    y_pred_proba : array-like, predicted probability of the positive (1) class.
    cost_fp : float, optional. Business cost of one false positive.
    cost_fn : float, optional. Business cost of one false negative.
        A cost-sensitive threshold is included in the report only when
        BOTH cost_fp and cost_fn are given; supplying just one of them
        leaves that entry as None.
    n_bins : int, bins used for calibration scoring.

    Returns
    -------
    dict
        {
            "minority_class": {...},            # metrics at the 0.5 threshold
            "calibration_error": float,
            "optimal_f1_threshold": {...},
            "cost_sensitive_threshold": {...} or None,
            "verdict": str
        }
    """
    y_true = np.asarray(y_true)
    y_pred_proba = np.asarray(y_pred_proba)

    # Minority-class metrics are computed on hard predictions at the
    # conventional 0.5 cut-off.
    preds_at_half = (y_pred_proba >= 0.5).astype(int)
    minority = minority_class_report(y_true, preds_at_half)

    ece = calibration_score(y_true, y_pred_proba, n_bins=n_bins)
    f1_opt = optimal_threshold(y_true, y_pred_proba)

    cost_result = None
    if cost_fp is not None and cost_fn is not None:
        cost_result = cost_sensitive_threshold(y_true, y_pred_proba, cost_fp, cost_fn)

    verdict = _build_verdict(minority, ece, f1_opt)

    return {
        "minority_class": minority,
        "calibration_error": ece,
        "optimal_f1_threshold": f1_opt,
        "cost_sensitive_threshold": cost_result,
        "verdict": verdict,
    }
67
+
68
+
69
def _build_verdict(minority: dict, ece: float, f1_opt: dict) -> str:
    """
    Turn the computed metrics into a one-sentence readiness verdict.

    Three checks are made: minority recall below 0.5, calibration error
    above 0.1, and the swept F1 exceeding the minority F1 by more than
    0.15. Each failed check contributes one clause to the message.
    """
    recall_too_low = minority["recall"] < 0.5
    poorly_calibrated = ece > 0.1
    threshold_off = f1_opt["score"] - minority["f1"] > 0.15

    concerns = []
    if recall_too_low:
        concerns += ["minority-class recall is below 50% at the default 0.5 threshold"]
    if poorly_calibrated:
        concerns += [f"calibration error is high (ECE={ece:.3f}); confidence scores are not trustworthy"]
    if threshold_off:
        concerns += ["default 0.5 threshold is far from optimal; consider using optimal_f1_threshold"]

    if concerns:
        return "Not yet production-ready: " + "; ".join(concerns) + "."
    return "Looks production-ready on the dimensions checked. Validate further on a held-out set."
@@ -0,0 +1,83 @@
1
+ """Decision threshold tuning for imbalanced problems."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import numpy as np
6
+ from sklearn.metrics import f1_score
7
+
8
+
9
def optimal_threshold(y_true, y_pred_proba, metric: str = "f1", n_steps: int = 200):
    """
    Sweep decision thresholds and return the one that maximizes the
    chosen metric. On imbalanced data the F1-maximizing threshold is
    usually a far better operating point than the conventional 0.5.

    Parameters
    ----------
    y_true : array-like, binary ground truth.
    y_pred_proba : array-like, predicted probability of positive class.
    metric : {"f1"} currently supported.
    n_steps : int, number of evenly spaced thresholds tested between
        0.01 and 0.99 (inclusive); must be at least 1.

    Returns
    -------
    dict with keys: threshold, score
        When several thresholds tie for the best score, the lowest one
        is returned.

    Raises
    ------
    ValueError
        If ``metric`` is not supported or ``n_steps`` is below 1.
    """
    # Validate up front: inside the sweep these checks would be skipped
    # entirely whenever the threshold grid is empty.
    if metric != "f1":
        raise ValueError(f"Unsupported metric: {metric}")
    if n_steps < 1:
        raise ValueError(f"n_steps must be at least 1, got {n_steps!r}")

    y_true = np.asarray(y_true)
    y_pred_proba = np.asarray(y_pred_proba)

    best_threshold, best_score = 0.5, -1.0
    for t in np.linspace(0.01, 0.99, n_steps):
        preds = (y_pred_proba >= t).astype(int)
        score = f1_score(y_true, preds, zero_division=0)
        # Strict comparison keeps the lowest threshold among ties.
        if score > best_score:
            best_score, best_threshold = score, t

    return {"threshold": float(best_threshold), "score": float(best_score)}
42
+
43
+
44
def cost_sensitive_threshold(y_true, y_pred_proba, cost_fp: float, cost_fn: float, n_steps: int = 200):
    """
    Find the decision threshold that minimizes total business cost,
    given the real-world cost of a false positive vs a false negative.

    This is usually what people actually want on imbalanced data
    (e.g. fraud: missing fraud is far costlier than a false alarm).

    Parameters
    ----------
    y_true : array-like, binary ground truth (labels 0 and 1).
    y_pred_proba : array-like, predicted probability of positive class.
        Must hold as many elements as ``y_true``.
    cost_fp : float, cost incurred per false positive.
    cost_fn : float, cost incurred per false negative.
    n_steps : int, number of evenly spaced thresholds tested between
        0.01 and 0.99 (inclusive); must be at least 1.

    Returns
    -------
    dict with keys: threshold, total_cost, false_positives, false_negatives
        When several thresholds tie for the lowest cost, the lowest
        threshold is returned.

    Raises
    ------
    ValueError
        If ``n_steps`` is below 1 or the inputs differ in size.
    """
    if n_steps < 1:
        raise ValueError(f"n_steps must be at least 1, got {n_steps!r}")

    # Flatten both inputs: a column-vector y_true of shape (n, 1) would
    # otherwise broadcast against (n,) predictions into an n-by-n matrix
    # and silently inflate the error counts.
    y_true = np.asarray(y_true).ravel()
    y_pred_proba = np.asarray(y_pred_proba).ravel()
    if y_true.shape != y_pred_proba.shape:
        raise ValueError(
            "y_true and y_pred_proba must have the same number of elements, "
            f"got {y_true.size} and {y_pred_proba.size}"
        )

    # These masks do not depend on the threshold, so build them once.
    is_negative = y_true == 0
    is_positive = y_true == 1

    best = {"threshold": 0.5, "total_cost": float("inf"), "false_positives": 0, "false_negatives": 0}

    for t in np.linspace(0.01, 0.99, n_steps):
        flagged = y_pred_proba >= t
        fp = int(np.count_nonzero(flagged & is_negative))
        fn = int(np.count_nonzero(~flagged & is_positive))
        total_cost = fp * cost_fp + fn * cost_fn
        # Strict comparison keeps the lowest threshold among ties.
        if total_cost < best["total_cost"]:
            best = {
                "threshold": float(t),
                "total_cost": float(total_cost),
                "false_positives": fp,
                "false_negatives": fn,
            }

    return best
@@ -0,0 +1,94 @@
1
+ import numpy as np
2
+ import pytest
3
+
4
+ from imbeval import (
5
+ evaluation_report,
6
+ calibration_score,
7
+ reliability_curve,
8
+ optimal_threshold,
9
+ cost_sensitive_threshold,
10
+ per_class_confidence,
11
+ minority_class_report,
12
+ )
13
+
14
+
15
@pytest.fixture
def imbalanced_binary_data():
    """Seeded 1000-sample binary dataset with 5% positives and noisy scores."""
    n_samples, n_positive = 1000, 50
    rng = np.random.default_rng(42)

    labels = np.zeros(n_samples, dtype=int)
    labels[:n_positive] = 1  # 5% minority class
    rng.shuffle(labels)

    # Every sample gets a low baseline score; true positives get an extra
    # boost on top so the two classes separate.
    baseline = rng.uniform(0, 0.3, size=n_samples)
    boost = rng.uniform(0.3, 0.6, size=n_samples)
    scores = np.where(labels == 1, baseline + boost, baseline)
    return labels, np.clip(scores, 0, 1)
28
+
29
+
30
def test_calibration_score_perfect_calibration():
    """Outcomes drawn with the predicted probability should give a low ECE."""
    n_samples = 5000
    rng = np.random.default_rng(0)
    predicted = rng.uniform(0, 1, n_samples)
    # Each outcome is positive with exactly its predicted probability.
    draws = rng.uniform(0, 1, n_samples)
    outcomes = (draws < predicted).astype(int)

    # Roughly well calibrated by construction.
    assert calibration_score(outcomes, predicted, n_bins=10) < 0.08
36
+
37
+
38
def test_reliability_curve_shape():
    """Each output list has one entry per bin and counts cover all samples."""
    samples = [(0, 0.1), (1, 0.9), (0, 0.2), (1, 0.8), (1, 0.7), (0, 0.3)]
    y_true = [label for label, _ in samples]
    proba = [score for _, score in samples]

    curve = reliability_curve(y_true, proba, n_bins=5)

    for key in ("bin_confidence", "bin_accuracy", "bin_count"):
        assert len(curve[key]) == 5
    assert sum(curve["bin_count"]) == len(y_true)
46
+
47
+
48
def test_optimal_threshold_beats_or_matches_default(imbalanced_binary_data):
    """The swept threshold is valid and scores at least as well as 0.5."""
    y_true, y_pred_proba = imbalanced_binary_data
    result = optimal_threshold(y_true, y_pred_proba)
    assert 0.0 < result["threshold"] < 1.0
    assert 0.0 <= result["score"] <= 1.0

    # Positive-class F1 at the default 0.5 cut-off, computed independently
    # of the package so the comparison promised by the test name is real.
    preds = y_pred_proba >= 0.5
    tp = int(np.sum(preds & (y_true == 1)))
    fp = int(np.sum(preds & (y_true == 0)))
    fn = int(np.sum(~preds & (y_true == 1)))
    denom = 2 * tp + fp + fn
    default_f1 = 2 * tp / denom if denom else 0.0

    # NOTE: the sweep grid does not contain 0.5 exactly, so this is not a
    # mathematical guarantee in general; the fixture's classes separate
    # near 0.3, which leaves a wide margin here.
    assert result["score"] >= default_f1
53
+
54
+
55
def test_cost_sensitive_threshold_favors_recall_when_fn_costly(imbalanced_binary_data):
    """Raising the false-negative cost must not raise the chosen threshold."""
    y_true, y_pred_proba = imbalanced_binary_data

    equal_costs = cost_sensitive_threshold(y_true, y_pred_proba, cost_fp=1, cost_fn=1)
    fn_heavy = cost_sensitive_threshold(y_true, y_pred_proba, cost_fp=1, cost_fn=50)

    # Penalizing false negatives heavily should push the threshold down.
    assert fn_heavy["threshold"] <= equal_costs["threshold"]
61
+
62
+
63
def test_per_class_confidence_basic():
    """Mean confidence on the true class is reported under each display label."""
    y_true = [0, 0, 1, 1]
    proba = np.array(
        [
            [0.9, 0.1],
            [0.6, 0.4],
            [0.3, 0.7],
            [0.2, 0.8],
        ]
    )

    result = per_class_confidence(y_true, proba, classes=["neg", "pos"])

    # neg: mean(0.9, 0.6); pos: mean(0.7, 0.8)
    assert result["neg"] == pytest.approx(0.75)
    assert result["pos"] == pytest.approx(0.75)
69
+
70
+
71
def test_minority_class_report_autodetects_minority(imbalanced_binary_data):
    """Without an explicit label, the rarer class (1) is picked and scored."""
    y_true, y_pred_proba = imbalanced_binary_data
    hard_preds = (y_pred_proba >= 0.5).astype(int)

    report = minority_class_report(y_true, hard_preds)

    assert report["minority_label"] == 1
    assert report["support"] == int(np.sum(y_true == 1))
    assert 0.0 <= report["precision"] <= 1.0
    assert 0.0 <= report["recall"] <= 1.0
79
+
80
+
81
def test_evaluation_report_end_to_end(imbalanced_binary_data):
    """With both costs supplied, every section of the report is populated."""
    y_true, y_pred_proba = imbalanced_binary_data

    report = evaluation_report(y_true, y_pred_proba, cost_fp=1, cost_fn=20)

    for section in ("minority_class", "calibration_error", "optimal_f1_threshold"):
        assert section in report
    assert report["cost_sensitive_threshold"] is not None
    assert isinstance(report["verdict"], str)
89
+
90
+
91
def test_evaluation_report_without_costs(imbalanced_binary_data):
    """Omitting the costs leaves the cost-sensitive section as None."""
    labels, scores = imbalanced_binary_data
    cost_section = evaluation_report(labels, scores)["cost_sensitive_threshold"]
    assert cost_section is None