imbeval 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- imbeval-0.1.0/CHANGELOG.md +21 -0
- imbeval-0.1.0/LICENSE +21 -0
- imbeval-0.1.0/PKG-INFO +118 -0
- imbeval-0.1.0/README.md +87 -0
- imbeval-0.1.0/docs/api.md +92 -0
- imbeval-0.1.0/docs/publishing.md +120 -0
- imbeval-0.1.0/docs/usage.md +107 -0
- imbeval-0.1.0/examples/quickstart.py +43 -0
- imbeval-0.1.0/pyproject.toml +43 -0
- imbeval-0.1.0/src/imbeval/__init__.py +27 -0
- imbeval-0.1.0/src/imbeval/calibration.py +66 -0
- imbeval-0.1.0/src/imbeval/metrics.py +87 -0
- imbeval-0.1.0/src/imbeval/report.py +80 -0
- imbeval-0.1.0/src/imbeval/threshold.py +83 -0
- imbeval-0.1.0/tests/test_imbeval.py +94 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project are documented here.
|
|
4
|
+
Format follows [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), versioning follows [SemVer](https://semver.org/).
|
|
5
|
+
|
|
6
|
+
## [0.1.0] - 2026-06-26
|
|
7
|
+
|
|
8
|
+
### Added
|
|
9
|
+
- `evaluation_report`: combined production-readiness report with plain-English verdict.
|
|
10
|
+
- `minority_class_report`: precision/recall/F1/confusion matrix focused on the minority class.
|
|
11
|
+
- `per_class_confidence`: mean predicted confidence per true class.
|
|
12
|
+
- `calibration_score` / `reliability_curve`: Expected Calibration Error and reliability diagram data.
|
|
13
|
+
- `optimal_threshold`: F1-maximizing decision threshold sweep.
|
|
14
|
+
- `cost_sensitive_threshold`: business-cost-minimizing decision threshold sweep.
|
|
15
|
+
- Initial test suite (8 tests, all passing).
|
|
16
|
+
- Full docs: README, usage guide, API reference, publishing guide.
|
|
17
|
+
|
|
18
|
+
### Roadmap (not yet implemented)
|
|
19
|
+
- Multi-class cost-sensitive thresholding (currently binary only).
|
|
20
|
+
- Built-in matplotlib plotting helper for `reliability_curve`.
|
|
21
|
+
- Bootstrap confidence intervals on all reported metrics.
|
imbeval-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Srikanth Sridhar
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
imbeval-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: imbeval
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Honest, production-readiness evaluation for imbalanced classification models.
|
|
5
|
+
Project-URL: Homepage, https://github.com/sricodings
|
|
6
|
+
Project-URL: Repository, https://github.com/sricodings/imbeval
|
|
7
|
+
Project-URL: Issues, https://github.com/sricodings/imbeval/issues
|
|
8
|
+
Project-URL: Documentation, https://github.com/sricodings/imbeval#readme
|
|
9
|
+
Author-email: Srikanth Sridhar <srisrikanthtvs@gmail.com>
|
|
10
|
+
License-Expression: MIT
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Keywords: calibration,imbalanced-classification,machine-learning,model-evaluation,threshold-tuning
|
|
13
|
+
Classifier: Development Status :: 3 - Alpha
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: Intended Audience :: Science/Research
|
|
16
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
22
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
23
|
+
Requires-Python: >=3.9
|
|
24
|
+
Requires-Dist: numpy>=1.21
|
|
25
|
+
Requires-Dist: scikit-learn>=1.0
|
|
26
|
+
Provides-Extra: dev
|
|
27
|
+
Requires-Dist: build; extra == 'dev'
|
|
28
|
+
Requires-Dist: pytest>=7.0; extra == 'dev'
|
|
29
|
+
Requires-Dist: twine; extra == 'dev'
|
|
30
|
+
Description-Content-Type: text/markdown
|
|
31
|
+
|
|
32
|
+
# imbeval
|
|
33
|
+
|
|
34
|
+
**Honest production-readiness evaluation for imbalanced classification models.**
|
|
35
|
+
|
|
36
|
+
Standard metric libraries hand you precision/recall/F1 and stop there. On imbalanced
|
|
37
|
+
data (fraud, churn, medical diagnosis, anomaly detection, rare-event prediction) that's
|
|
38
|
+
not enough to know if a model is actually safe to ship. `imbeval` answers the real
|
|
39
|
+
question: **is this model usable in production, and at what threshold?**
|
|
40
|
+
|
|
41
|
+
It combines three things most teams check manually and inconsistently:
|
|
42
|
+
|
|
43
|
+
1. **Minority-class performance** — not buried inside macro-averages.
|
|
44
|
+
2. **Calibration quality** — is the model's confidence trustworthy, or just confidently wrong?
|
|
45
|
+
3. **Threshold tuning** — the default 0.5 threshold is almost always wrong on imbalanced data; `imbeval` finds a better one, optionally weighted by real business cost (cost of a false positive vs a false negative).
|
|
46
|
+
|
|
47
|
+
## Install
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
pip install imbeval
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
(Once published — see the [publishing guide](docs/publishing.md) if you're building this from source.)
|
|
54
|
+
|
|
55
|
+
## Quickstart
|
|
56
|
+
|
|
57
|
+
```python
|
|
58
|
+
from imbeval import evaluation_report
|
|
59
|
+
|
|
60
|
+
# y_true: ground truth labels (0/1)
|
|
61
|
+
# y_pred_proba: predicted probability of the positive class, from model.predict_proba(X)[:, 1]
|
|
62
|
+
report = evaluation_report(
|
|
63
|
+
y_true,
|
|
64
|
+
y_pred_proba,
|
|
65
|
+
cost_fp=1, # cost of a false alarm
|
|
66
|
+
cost_fn=25, # cost of missing a true positive (e.g. missed fraud)
|
|
67
|
+
)
|
|
68
|
+
|
|
69
|
+
print(report["verdict"])
|
|
70
|
+
print(report["minority_class"])
|
|
71
|
+
print(report["optimal_f1_threshold"])
|
|
72
|
+
print(report["cost_sensitive_threshold"])
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
Example output:
|
|
76
|
+
|
|
77
|
+
```
|
|
78
|
+
Not yet production-ready: minority-class recall is below 50% at the default 0.5 threshold;
|
|
79
|
+
default 0.5 threshold is far from optimal; consider using optimal_f1_threshold.
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
## What's inside
|
|
83
|
+
|
|
84
|
+
| Function | What it does |
|
|
85
|
+
|---|---|
|
|
86
|
+
| `evaluation_report(y_true, y_pred_proba, ...)` | One combined report + plain-English verdict |
|
|
87
|
+
| `minority_class_report(y_true, y_pred)` | Precision/recall/F1 focused on the minority class |
|
|
88
|
+
| `per_class_confidence(y_true, y_pred_proba)` | Mean model confidence per true class |
|
|
89
|
+
| `calibration_score(y_true, y_pred_proba)` | Expected Calibration Error (ECE) |
|
|
90
|
+
| `reliability_curve(y_true, y_pred_proba)` | Data for plotting a reliability diagram |
|
|
91
|
+
| `optimal_threshold(y_true, y_pred_proba)` | Best decision threshold by F1 |
|
|
92
|
+
| `cost_sensitive_threshold(y_true, y_pred_proba, cost_fp, cost_fn)` | Best threshold by real business cost |
|
|
93
|
+
|
|
94
|
+
Full API reference: [docs/api.md](docs/api.md)
|
|
95
|
+
Usage guide and recipes: [docs/usage.md](docs/usage.md)
|
|
96
|
+
Publishing this package yourself: [docs/publishing.md](docs/publishing.md)
|
|
97
|
+
|
|
98
|
+
## Why this exists
|
|
99
|
+
|
|
100
|
+
Most "imbalanced learning" tools (e.g. `imbalanced-learn`) focus on *fixing* the data
|
|
101
|
+
(SMOTE and friends). `imbeval` focuses on the other end of the pipeline: telling you
|
|
102
|
+
honestly whether the *model* you already trained is good enough, and at what threshold,
|
|
103
|
+
once class imbalance is in play. It's meant to sit right before you ship.
|
|
104
|
+
|
|
105
|
+
## Status
|
|
106
|
+
|
|
107
|
+
Early (v0.1.0). The core API (`evaluation_report`, threshold tools, calibration tools)
|
|
108
|
+
is stable for binary classification. Multi-class support is on the roadmap — see
|
|
109
|
+
[CHANGELOG.md](CHANGELOG.md).
|
|
110
|
+
|
|
111
|
+
## Contributing
|
|
112
|
+
|
|
113
|
+
Issues and PRs welcome once the repo is public. See [docs/usage.md](docs/usage.md) for
|
|
114
|
+
how the modules fit together if you want to extend it.
|
|
115
|
+
|
|
116
|
+
## License
|
|
117
|
+
|
|
118
|
+
MIT — see [LICENSE](LICENSE).
|
imbeval-0.1.0/README.md
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
# imbeval
|
|
2
|
+
|
|
3
|
+
**Honest production-readiness evaluation for imbalanced classification models.**
|
|
4
|
+
|
|
5
|
+
Standard metric libraries hand you precision/recall/F1 and stop there. On imbalanced
|
|
6
|
+
data (fraud, churn, medical diagnosis, anomaly detection, rare-event prediction) that's
|
|
7
|
+
not enough to know if a model is actually safe to ship. `imbeval` answers the real
|
|
8
|
+
question: **is this model usable in production, and at what threshold?**
|
|
9
|
+
|
|
10
|
+
It combines three things most teams check manually and inconsistently:
|
|
11
|
+
|
|
12
|
+
1. **Minority-class performance** — not buried inside macro-averages.
|
|
13
|
+
2. **Calibration quality** — is the model's confidence trustworthy, or just confidently wrong?
|
|
14
|
+
3. **Threshold tuning** — the default 0.5 threshold is almost always wrong on imbalanced data; `imbeval` finds a better one, optionally weighted by real business cost (cost of a false positive vs a false negative).
|
|
15
|
+
|
|
16
|
+
## Install
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
pip install imbeval
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
(Once published — see the [publishing guide](docs/publishing.md) if you're building this from source.)
|
|
23
|
+
|
|
24
|
+
## Quickstart
|
|
25
|
+
|
|
26
|
+
```python
|
|
27
|
+
from imbeval import evaluation_report
|
|
28
|
+
|
|
29
|
+
# y_true: ground truth labels (0/1)
|
|
30
|
+
# y_pred_proba: predicted probability of the positive class, from model.predict_proba(X)[:, 1]
|
|
31
|
+
report = evaluation_report(
|
|
32
|
+
y_true,
|
|
33
|
+
y_pred_proba,
|
|
34
|
+
cost_fp=1, # cost of a false alarm
|
|
35
|
+
cost_fn=25, # cost of missing a true positive (e.g. missed fraud)
|
|
36
|
+
)
|
|
37
|
+
|
|
38
|
+
print(report["verdict"])
|
|
39
|
+
print(report["minority_class"])
|
|
40
|
+
print(report["optimal_f1_threshold"])
|
|
41
|
+
print(report["cost_sensitive_threshold"])
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
Example output:
|
|
45
|
+
|
|
46
|
+
```
|
|
47
|
+
Not yet production-ready: minority-class recall is below 50% at the default 0.5 threshold;
|
|
48
|
+
default 0.5 threshold is far from optimal; consider using optimal_f1_threshold.
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
## What's inside
|
|
52
|
+
|
|
53
|
+
| Function | What it does |
|
|
54
|
+
|---|---|
|
|
55
|
+
| `evaluation_report(y_true, y_pred_proba, ...)` | One combined report + plain-English verdict |
|
|
56
|
+
| `minority_class_report(y_true, y_pred)` | Precision/recall/F1 focused on the minority class |
|
|
57
|
+
| `per_class_confidence(y_true, y_pred_proba)` | Mean model confidence per true class |
|
|
58
|
+
| `calibration_score(y_true, y_pred_proba)` | Expected Calibration Error (ECE) |
|
|
59
|
+
| `reliability_curve(y_true, y_pred_proba)` | Data for plotting a reliability diagram |
|
|
60
|
+
| `optimal_threshold(y_true, y_pred_proba)` | Best decision threshold by F1 |
|
|
61
|
+
| `cost_sensitive_threshold(y_true, y_pred_proba, cost_fp, cost_fn)` | Best threshold by real business cost |
|
|
62
|
+
|
|
63
|
+
Full API reference: [docs/api.md](docs/api.md)
|
|
64
|
+
Usage guide and recipes: [docs/usage.md](docs/usage.md)
|
|
65
|
+
Publishing this package yourself: [docs/publishing.md](docs/publishing.md)
|
|
66
|
+
|
|
67
|
+
## Why this exists
|
|
68
|
+
|
|
69
|
+
Most "imbalanced learning" tools (e.g. `imbalanced-learn`) focus on *fixing* the data
|
|
70
|
+
(SMOTE and friends). `imbeval` focuses on the other end of the pipeline: telling you
|
|
71
|
+
honestly whether the *model* you already trained is good enough, and at what threshold,
|
|
72
|
+
once class imbalance is in play. It's meant to sit right before you ship.
|
|
73
|
+
|
|
74
|
+
## Status
|
|
75
|
+
|
|
76
|
+
Early (v0.1.0). The core API (`evaluation_report`, threshold tools, calibration tools)
|
|
77
|
+
is stable for binary classification. Multi-class support is on the roadmap — see
|
|
78
|
+
[CHANGELOG.md](CHANGELOG.md).
|
|
79
|
+
|
|
80
|
+
## Contributing
|
|
81
|
+
|
|
82
|
+
Issues and PRs welcome once the repo is public. See [docs/usage.md](docs/usage.md) for
|
|
83
|
+
how the modules fit together if you want to extend it.
|
|
84
|
+
|
|
85
|
+
## License
|
|
86
|
+
|
|
87
|
+
MIT — see [LICENSE](LICENSE).
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
# API Reference
|
|
2
|
+
|
|
3
|
+
All public functions are importable directly from `imbeval`.
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
### `evaluation_report(y_true, y_pred_proba, cost_fp=None, cost_fn=None, n_bins=10)`
|
|
8
|
+
|
|
9
|
+
Combined production-readiness report.
|
|
10
|
+
|
|
11
|
+
**Parameters**
|
|
12
|
+
- `y_true` (array-like): binary ground truth labels (0/1).
|
|
13
|
+
- `y_pred_proba` (array-like): predicted probability of the positive class.
|
|
14
|
+
- `cost_fp` (float, optional): cost of one false positive. Required together with `cost_fn` to get a cost-sensitive threshold.
|
|
15
|
+
- `cost_fn` (float, optional): cost of one false negative.
|
|
16
|
+
- `n_bins` (int, default 10): bins used for calibration scoring.
|
|
17
|
+
|
|
18
|
+
**Returns** `dict`:
|
|
19
|
+
```python
|
|
20
|
+
{
|
|
21
|
+
"minority_class": {...}, # see minority_class_report
|
|
22
|
+
"calibration_error": 0.04, # float, ECE
|
|
23
|
+
"optimal_f1_threshold": {...}, # see optimal_threshold
|
|
24
|
+
"cost_sensitive_threshold": {...} or None,
|
|
25
|
+
"verdict": "..." # plain-English summary
|
|
26
|
+
}
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
---
|
|
30
|
+
|
|
31
|
+
### `minority_class_report(y_true, y_pred, classes=None, minority_label=None)`
|
|
32
|
+
|
|
33
|
+
Precision/recall/F1/support/confusion matrix focused on the minority class.
|
|
34
|
+
|
|
35
|
+
**Parameters**
|
|
36
|
+
- `y_true`, `y_pred` (array-like): true and predicted integer labels.
|
|
37
|
+
- `minority_label`: explicit label to treat as minority. If omitted, auto-detected as the class with lowest support.
|
|
38
|
+
|
|
39
|
+
**Returns** `dict` with keys: `minority_label`, `support`, `precision`, `recall`, `f1`, `confusion_matrix`.
|
|
40
|
+
|
|
41
|
+
---
|
|
42
|
+
|
|
43
|
+
### `per_class_confidence(y_true, y_pred_proba, classes=None)`
|
|
44
|
+
|
|
45
|
+
Mean predicted probability the model assigns to the *true* class, per class.
|
|
46
|
+
|
|
47
|
+
**Parameters**
|
|
48
|
+
- `y_true` (array-like): integer class labels.
|
|
49
|
+
- `y_pred_proba` (array-like, shape `(n_samples, n_classes)`): predicted probabilities.
|
|
50
|
+
- `classes` (list, optional): display labels for each class index.
|
|
51
|
+
|
|
52
|
+
**Returns** `dict` mapping class label → mean confidence (or `None` if no samples for that class).
|
|
53
|
+
|
|
54
|
+
---
|
|
55
|
+
|
|
56
|
+
### `calibration_score(y_true, y_pred_proba, n_bins=10)`
|
|
57
|
+
|
|
58
|
+
Expected Calibration Error (ECE). 0 = perfectly calibrated.
|
|
59
|
+
|
|
60
|
+
**Returns** `float`.
|
|
61
|
+
|
|
62
|
+
---
|
|
63
|
+
|
|
64
|
+
### `reliability_curve(y_true, y_pred_proba, n_bins=10)`
|
|
65
|
+
|
|
66
|
+
Binned data for plotting a reliability diagram.
|
|
67
|
+
|
|
68
|
+
**Returns** `dict` with keys `bin_confidence`, `bin_accuracy`, `bin_count` (each a list of length `n_bins`).
|
|
69
|
+
|
|
70
|
+
---
|
|
71
|
+
|
|
72
|
+
### `optimal_threshold(y_true, y_pred_proba, metric="f1", n_steps=200)`
|
|
73
|
+
|
|
74
|
+
Sweeps thresholds and returns the one maximizing the chosen metric.
|
|
75
|
+
|
|
76
|
+
**Returns** `dict`: `{"threshold": float, "score": float}`.
|
|
77
|
+
|
|
78
|
+
---
|
|
79
|
+
|
|
80
|
+
### `cost_sensitive_threshold(y_true, y_pred_proba, cost_fp, cost_fn, n_steps=200)`
|
|
81
|
+
|
|
82
|
+
Finds the threshold minimizing `fp * cost_fp + fn * cost_fn`.
|
|
83
|
+
|
|
84
|
+
**Returns** `dict`: `{"threshold": float, "total_cost": float, "false_positives": int, "false_negatives": int}`.
|
|
85
|
+
|
|
86
|
+
---
|
|
87
|
+
|
|
88
|
+
## Type conventions
|
|
89
|
+
|
|
90
|
+
- All `y_true` / `y_pred` for binary functions use `0`/`1` integer labels.
|
|
91
|
+
- Every `y_pred_proba` argument of a binary function is the probability of the **positive (1)** class — i.e. `model.predict_proba(X)[:, 1]`.
|
|
92
|
+
- `per_class_confidence` is the only function expecting a full `(n_samples, n_classes)` probability matrix, for multi-class use.
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
# Publishing Guide (for first-time package authors)
|
|
2
|
+
|
|
3
|
+
This walks you through every step to get `imbeval` (or any Python package you build
|
|
4
|
+
this way) onto GitHub and PyPI, properly licensed, with zero prior packaging experience.
|
|
5
|
+
|
|
6
|
+
## 0. Before you publish anything
|
|
7
|
+
|
|
8
|
+
- Double-check the name isn't taken: https://pypi.org/project/imbeval/ — if it's taken, rename it everywhere (`pyproject.toml`'s `name`, the folder under `src/`, imports in tests/docs).
|
|
9
|
+
- Make sure tests pass locally (we already verified 8/8 pass for this package).
|
|
10
|
+
- Check the values in `pyproject.toml`: the author name, email, and GitHub URLs must be your own (replace them if you are reusing this layout for another package).
|
|
11
|
+
|
|
12
|
+
## 1. Choose and apply a license
|
|
13
|
+
|
|
14
|
+
You already have an MIT `LICENSE` file in this package — MIT is the most permissive,
|
|
15
|
+
common choice for libraries because it lets anyone use, modify, and redistribute your
|
|
16
|
+
code (even commercially) as long as they keep your copyright notice. This maximizes
|
|
17
|
+
adoption, which is what you want for a library you hope many people use.
|
|
18
|
+
|
|
19
|
+
If you want an explicit patent grant while staying permissive, use Apache 2.0 (popular
|
|
20
|
+
for ML tooling). If you'd rather require derivative works to stay open-source, use GPLv3
|
|
21
|
+
(strong copyleft, but it discourages commercial adoption — usually a bad choice for "I want everyone to use this").
|
|
22
|
+
|
|
23
|
+
You do **not** need to register or pay for anything to apply a license — putting the
|
|
24
|
+
`LICENSE` file in your repo root and referencing it in `pyproject.toml` (`license = "MIT"`)
|
|
25
|
+
is legally sufficient. GitHub will also detect and display it automatically.
|
|
26
|
+
|
|
27
|
+
## 2. Put the code on GitHub
|
|
28
|
+
|
|
29
|
+
```bash
|
|
30
|
+
cd imbeval
|
|
31
|
+
git init
|
|
32
|
+
git add .
|
|
33
|
+
git commit -m "Initial commit: imbeval v0.1.0"
|
|
34
|
+
gh repo create imbeval --public --source=. --remote=origin
|
|
35
|
+
git push -u origin main
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
(No `gh` CLI? Create the repo manually on github.com, then `git remote add origin <url>` and `git push -u origin main`.)
|
|
39
|
+
|
|
40
|
+
## 3. Create a PyPI account and API token
|
|
41
|
+
|
|
42
|
+
1. Register at https://pypi.org/account/register/ (and verify your email).
|
|
43
|
+
2. Enable 2FA (PyPI requires it on all accounts before you can upload).
|
|
44
|
+
3. Go to Account Settings → API tokens → "Add API token". Scope it to "Entire account" for your first upload (you can scope it to just this project after the first release).
|
|
45
|
+
4. Save the token somewhere safe — it's shown once.
|
|
46
|
+
|
|
47
|
+
It's good practice to also register on **TestPyPI** (https://test.pypi.org) first, to do a dry run without polluting the real index.
|
|
48
|
+
|
|
49
|
+
## 4. Build the distribution files
|
|
50
|
+
|
|
51
|
+
From the project root:
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
pip install --upgrade build twine
|
|
55
|
+
python -m build
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
This creates a `dist/` folder with a `.tar.gz` (source distribution) and a `.whl`
|
|
59
|
+
(wheel) — these are the files that actually get uploaded to PyPI.
|
|
60
|
+
|
|
61
|
+
## 5. (Recommended) Upload to TestPyPI first
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
python -m twine upload --repository testpypi dist/*
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
It will prompt for a username (`__token__`) and password (your TestPyPI token).
|
|
68
|
+
Then verify it installs cleanly:
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
pip install --index-url https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple/ imbeval
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
## 6. Publish to the real PyPI
|
|
75
|
+
|
|
76
|
+
```bash
|
|
77
|
+
python -m twine upload dist/*
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
Username: `__token__`. Password: your PyPI API token (starts with `pypi-`).
|
|
81
|
+
|
|
82
|
+
Once this succeeds, anyone in the world can run:
|
|
83
|
+
|
|
84
|
+
```bash
|
|
85
|
+
pip install imbeval
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
## 7. Tag the release on GitHub
|
|
89
|
+
|
|
90
|
+
```bash
|
|
91
|
+
git tag v0.1.0
|
|
92
|
+
git push origin v0.1.0
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
Then create a "Release" on GitHub from that tag with short release notes — this is
|
|
96
|
+
what shows up when people check "is this actively maintained."
|
|
97
|
+
|
|
98
|
+
## 8. For every future update
|
|
99
|
+
|
|
100
|
+
1. Make your changes, add/update tests, run `pytest` — all green before proceeding.
|
|
101
|
+
2. Bump the version in `pyproject.toml` (and `__version__` in `__init__.py`) — follow [semantic versioning](https://semver.org/): patch (0.1.1) for fixes, minor (0.2.0) for new backward-compatible features, major (1.0.0) for breaking changes.
|
|
102
|
+
3. Update `CHANGELOG.md`.
|
|
103
|
+
4. `rm -rf dist/ && python -m build && python -m twine upload dist/*`
|
|
104
|
+
5. Tag and release on GitHub as in step 7.
|
|
105
|
+
|
|
106
|
+
## 9. Getting people to actually use it
|
|
107
|
+
|
|
108
|
+
A license and a PyPI listing alone won't get adoption — that comes from people
|
|
109
|
+
finding it useful and finding out it exists:
|
|
110
|
+
|
|
111
|
+
- Write one clear, narrow blog post: "Why your imbalanced classifier's 99% accuracy is lying to you" — demonstrating the exact problem this library solves, ending with a 3-line code example.
|
|
112
|
+
- Post it where the target audience already is: relevant subreddits (r/MachineLearning, r/datascience), a Show HN on Hacker News, relevant Discord/Slack ML communities.
|
|
113
|
+
- Add a badge-rich README (build status, PyPI version, license) — people skim-judge trust from this in seconds. Shields.io provides free badges once you have CI set up.
|
|
114
|
+
- Respond to every GitHub issue quickly for the first few months — early responsiveness is the single biggest driver of whether a library gets a second look.
|
|
115
|
+
|
|
116
|
+
## Common first-publish mistakes to avoid
|
|
117
|
+
|
|
118
|
+
- Forgetting to bump the version before re-uploading — PyPI rejects re-uploading the same version number, even for a typo fix.
|
|
119
|
+
- Uploading secrets/API tokens accidentally committed in the repo — double check `git log` and add a `.gitignore` (covering files such as `.env` and `dist/`) before your first push.
|
|
120
|
+
- Publishing without a `README.md` that has a working code example — this is the #1 reason people bounce off a new package's PyPI page.
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
# Usage Guide
|
|
2
|
+
|
|
3
|
+
## 1. The core workflow
|
|
4
|
+
|
|
5
|
+
```python
|
|
6
|
+
from sklearn.linear_model import LogisticRegression
|
|
7
|
+
from imbeval import evaluation_report
|
|
8
|
+
|
|
9
|
+
model = LogisticRegression().fit(X_train, y_train)
|
|
10
|
+
y_pred_proba = model.predict_proba(X_test)[:, 1]
|
|
11
|
+
|
|
12
|
+
report = evaluation_report(y_test, y_pred_proba, cost_fp=1, cost_fn=20)
|
|
13
|
+
print(report["verdict"])
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
`evaluation_report` is the one function most people need. It is intentionally
|
|
17
|
+
opinionated: it tells you in plain English whether your model has a problem,
|
|
18
|
+
not just a pile of numbers to interpret yourself.
|
|
19
|
+
|
|
20
|
+
## 2. Reading the minority-class report
|
|
21
|
+
|
|
22
|
+
```python
|
|
23
|
+
from imbeval import minority_class_report
|
|
24
|
+
|
|
25
|
+
preds = (y_pred_proba >= 0.5).astype(int)
|
|
26
|
+
report = minority_class_report(y_test, preds)
|
|
27
|
+
print(report)
|
|
28
|
+
# {'minority_label': 1, 'support': 48, 'precision': 0.62, 'recall': 0.41, 'f1': 0.5, ...}
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
If you already know which label is the minority/rare class, pass it explicitly:
|
|
32
|
+
|
|
33
|
+
```python
|
|
34
|
+
minority_class_report(y_test, preds, minority_label=1)
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## 3. Checking calibration
|
|
38
|
+
|
|
39
|
+
A model can have great accuracy and still be miscalibrated — e.g. it says
|
|
40
|
+
"90% confident" but is only right 60% of the time at that confidence level.
|
|
41
|
+
This matters a lot if you use the probability output for downstream decisions
|
|
42
|
+
(e.g. ranking leads, setting alert priority).
|
|
43
|
+
|
|
44
|
+
```python
|
|
45
|
+
from imbeval import calibration_score, reliability_curve
|
|
46
|
+
|
|
47
|
+
ece = calibration_score(y_test, y_pred_proba)
|
|
48
|
+
print(f"Expected Calibration Error: {ece:.3f}") # 0 = perfect, >0.1 = concerning
|
|
49
|
+
|
|
50
|
+
curve = reliability_curve(y_test, y_pred_proba, n_bins=10)
|
|
51
|
+
# Plot curve["bin_confidence"] vs curve["bin_accuracy"] for a reliability diagram
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## 4. Picking a better threshold than 0.5
|
|
55
|
+
|
|
56
|
+
The 0.5 default threshold assumes balanced classes and equal costs. Neither
|
|
57
|
+
is usually true for imbalanced problems.
|
|
58
|
+
|
|
59
|
+
```python
|
|
60
|
+
from imbeval import optimal_threshold
|
|
61
|
+
|
|
62
|
+
result = optimal_threshold(y_test, y_pred_proba)
|
|
63
|
+
print(result) # {'threshold': 0.27, 'score': 0.71}
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
## 5. Cost-sensitive thresholding (the most useful function for real decisions)
|
|
67
|
+
|
|
68
|
+
If you know roughly what a false positive and a false negative cost your
|
|
69
|
+
business, use this instead — it directly minimizes cost rather than an
|
|
70
|
+
abstract metric.
|
|
71
|
+
|
|
72
|
+
```python
|
|
73
|
+
from imbeval import cost_sensitive_threshold
|
|
74
|
+
|
|
75
|
+
# Example: fraud detection. A false alarm costs $1 in review time.
|
|
76
|
+
# A missed fraud case costs $200 on average.
|
|
77
|
+
result = cost_sensitive_threshold(y_test, y_pred_proba, cost_fp=1, cost_fn=200)
|
|
78
|
+
print(result)
|
|
79
|
+
# {'threshold': 0.08, 'total_cost': 1410.0, 'false_positives': 210, 'false_negatives': 6}
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
This typically pushes the threshold much lower than 0.5 when false negatives
|
|
83
|
+
are expensive — which is the common case in fraud, medical screening, and
|
|
84
|
+
safety-critical anomaly detection.
|
|
85
|
+
|
|
86
|
+
## 6. Multi-class confidence breakdown
|
|
87
|
+
|
|
88
|
+
```python
|
|
89
|
+
from imbeval import per_class_confidence
|
|
90
|
+
|
|
91
|
+
confidences = per_class_confidence(y_test, y_pred_proba_matrix, classes=["normal", "fraud", "abuse"])
|
|
92
|
+
print(confidences)
|
|
93
|
+
# {'normal': 0.91, 'fraud': 0.58, 'abuse': 0.44}
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
A low number for a specific class tells you the model is unsure specifically
|
|
97
|
+
about that class, even if overall accuracy looks fine.
|
|
98
|
+
|
|
99
|
+
## Common pitfalls this library is built to catch
|
|
100
|
+
|
|
101
|
+
- **"99% accuracy" on a 1%-positive-rate dataset** — `minority_class_report`
|
|
102
|
+
exposes that this can mean the model just predicts the majority class always.
|
|
103
|
+
- **High AUC, useless probabilities** — `calibration_score` catches this even
|
|
104
|
+
when ranking metrics look great.
|
|
105
|
+
- **Using 0.5 by default out of habit** — `optimal_threshold` and
|
|
106
|
+
`cost_sensitive_threshold` replace that habit with a number backed by your
|
|
107
|
+
actual data and actual costs.
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Quickstart example for imbeval.
|
|
3
|
+
|
|
4
|
+
Run with: python examples/quickstart.py
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
from sklearn.linear_model import LogisticRegression
|
|
9
|
+
from sklearn.model_selection import train_test_split
|
|
10
|
+
from sklearn.datasets import make_classification
|
|
11
|
+
|
|
12
|
+
from imbeval import evaluation_report
|
|
13
|
+
|
|
14
|
+
# Simulate a realistic imbalanced dataset (5% positive class — e.g. fraud)
|
|
15
|
+
X, y = make_classification(
|
|
16
|
+
n_samples=5000,
|
|
17
|
+
n_features=20,
|
|
18
|
+
weights=[0.95, 0.05],
|
|
19
|
+
flip_y=0.02,
|
|
20
|
+
random_state=42,
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
X_train, X_test, y_train, y_test = train_test_split(
|
|
24
|
+
X, y, test_size=0.3, stratify=y, random_state=42
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
|
|
28
|
+
y_pred_proba = model.predict_proba(X_test)[:, 1]
|
|
29
|
+
|
|
30
|
+
report = evaluation_report(
|
|
31
|
+
y_test,
|
|
32
|
+
y_pred_proba,
|
|
33
|
+
cost_fp=1, # cost of a false alarm
|
|
34
|
+
cost_fn=25, # cost of a missed positive case
|
|
35
|
+
)
|
|
36
|
+
|
|
37
|
+
print("=" * 60)
|
|
38
|
+
print("VERDICT:", report["verdict"])
|
|
39
|
+
print("=" * 60)
|
|
40
|
+
print("Minority class report:", report["minority_class"])
|
|
41
|
+
print("Calibration error (ECE):", round(report["calibration_error"], 4))
|
|
42
|
+
print("Optimal F1 threshold:", report["optimal_f1_threshold"])
|
|
43
|
+
print("Cost-sensitive threshold:", report["cost_sensitive_threshold"])
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "imbeval"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Honest, production-readiness evaluation for imbalanced classification models."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = "MIT"
|
|
11
|
+
requires-python = ">=3.9"
|
|
12
|
+
authors = [
|
|
13
|
+
{ name = "Srikanth Sridhar", email = "srisrikanthtvs@gmail.com" }
|
|
14
|
+
]
|
|
15
|
+
keywords = ["machine-learning", "imbalanced-classification", "model-evaluation", "calibration", "threshold-tuning"]
|
|
16
|
+
classifiers = [
|
|
17
|
+
"Development Status :: 3 - Alpha",
|
|
18
|
+
"Intended Audience :: Developers",
|
|
19
|
+
"Intended Audience :: Science/Research",
|
|
20
|
+
"License :: OSI Approved :: MIT License",
|
|
21
|
+
"Programming Language :: Python :: 3",
|
|
22
|
+
"Programming Language :: Python :: 3.9",
|
|
23
|
+
"Programming Language :: Python :: 3.10",
|
|
24
|
+
"Programming Language :: Python :: 3.11",
|
|
25
|
+
"Programming Language :: Python :: 3.12",
|
|
26
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
27
|
+
]
|
|
28
|
+
dependencies = [
|
|
29
|
+
"numpy>=1.21",
|
|
30
|
+
"scikit-learn>=1.0",
|
|
31
|
+
]
|
|
32
|
+
|
|
33
|
+
[project.urls]
|
|
34
|
+
Homepage = "https://github.com/sricodings"
|
|
35
|
+
Repository = "https://github.com/sricodings/imbeval"
|
|
36
|
+
Issues = "https://github.com/sricodings/imbeval/issues"
|
|
37
|
+
Documentation = "https://github.com/sricodings/imbeval#readme"
|
|
38
|
+
|
|
39
|
+
[project.optional-dependencies]
|
|
40
|
+
dev = ["pytest>=7.0", "build", "twine"]
|
|
41
|
+
|
|
42
|
+
[tool.hatch.build.targets.wheel]
|
|
43
|
+
packages = ["src/imbeval"]
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
"""
|
|
2
|
+
imbeval — Honest evaluation for imbalanced classification models.
|
|
3
|
+
|
|
4
|
+
Most metric libraries hand you numbers; they don't tell you whether your
|
|
5
|
+
model is actually safe to ship on imbalanced data (fraud, medical, anomaly
|
|
6
|
+
detection, churn, etc). imbeval combines per-class confidence, calibration
|
|
7
|
+
quality, and cost-sensitive thresholding into one report so you can answer
|
|
8
|
+
the real question: "is this model usable in production?"
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from .report import evaluation_report
|
|
12
|
+
from .calibration import calibration_score, reliability_curve
|
|
13
|
+
from .threshold import optimal_threshold, cost_sensitive_threshold
|
|
14
|
+
from .metrics import per_class_confidence, minority_class_report
|
|
15
|
+
|
|
16
|
+
__version__ = "0.1.0"
|
|
17
|
+
|
|
18
|
+
__all__ = [
|
|
19
|
+
"evaluation_report",
|
|
20
|
+
"calibration_score",
|
|
21
|
+
"reliability_curve",
|
|
22
|
+
"optimal_threshold",
|
|
23
|
+
"cost_sensitive_threshold",
|
|
24
|
+
"per_class_confidence",
|
|
25
|
+
"minority_class_report",
|
|
26
|
+
"__version__",
|
|
27
|
+
]
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
"""Calibration quality checks — is the model's confidence trustworthy?"""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def reliability_curve(y_true, y_pred_proba, n_bins: int = 10):
    """
    Bin predictions by confidence and compare them to the observed positive
    rate in each bin. Returns lists suitable for plotting a reliability
    diagram.

    Parameters
    ----------
    y_true : array-like, binary (0/1) ground truth.
    y_pred_proba : array-like, predicted probability of the positive class.
        Must have the same shape as ``y_true`` and lie within [0, 1].
    n_bins : int
        Number of equal-width bins spanning [0, 1]; must be at least 1.

    Returns
    -------
    dict with keys: bin_confidence, bin_accuracy, bin_count
        Each value is a list of length ``n_bins``. ``bin_confidence`` holds
        the mean predicted probability per bin, ``bin_accuracy`` the mean of
        ``y_true`` per bin, and ``bin_count`` the number of samples per bin.
        Empty bins report ``None`` for confidence and accuracy.

    Raises
    ------
    ValueError
        If ``n_bins`` is below 1, the inputs differ in shape, or any
        probability is NaN or outside [0, 1].
    """
    y_true = np.asarray(y_true)
    y_pred_proba = np.asarray(y_pred_proba)

    if n_bins < 1:
        raise ValueError(f"n_bins must be at least 1, got {n_bins}")
    if y_true.shape != y_pred_proba.shape:
        raise ValueError(
            "y_true and y_pred_proba must have the same shape, got "
            f"{y_true.shape} and {y_pred_proba.shape}"
        )
    # Positive range test so NaN (which fails every comparison) is rejected
    # too; such values would otherwise silently fall outside every bin.
    if not np.all((y_pred_proba >= 0.0) & (y_pred_proba <= 1.0)):
        raise ValueError("y_pred_proba must contain probabilities within [0, 1]")

    bin_edges = np.linspace(0.0, 1.0, n_bins + 1)
    last_bin = n_bins - 1
    bin_confidence, bin_accuracy, bin_count = [], [], []

    for i, (lo, hi) in enumerate(zip(bin_edges[:-1], bin_edges[1:])):
        # Bins are half-open [lo, hi); only the last one is closed so that a
        # probability of exactly 1.0 is still counted.
        below_upper = y_pred_proba <= hi if i == last_bin else y_pred_proba < hi
        mask = (y_pred_proba >= lo) & below_upper
        count = int(mask.sum())
        bin_count.append(count)
        if count == 0:
            bin_confidence.append(None)
            bin_accuracy.append(None)
        else:
            bin_confidence.append(float(np.mean(y_pred_proba[mask])))
            bin_accuracy.append(float(np.mean(y_true[mask])))

    return {
        "bin_confidence": bin_confidence,
        "bin_accuracy": bin_accuracy,
        "bin_count": bin_count,
    }
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def calibration_score(y_true, y_pred_proba, n_bins: int = 10) -> float:
    """
    Expected Calibration Error (ECE).

    Each non-empty bin of the reliability curve contributes the absolute gap
    between its mean predicted confidence and its observed accuracy, weighted
    by the share of samples that landed in that bin. Lower is better; 0 is
    perfect calibration. Returns 0.0 when no sample falls into any bin.
    """
    bins = reliability_curve(y_true, y_pred_proba, n_bins=n_bins)
    sizes = bins["bin_count"]
    n_binned = sum(sizes)
    if not n_binned:
        return 0.0

    weighted_gap = 0.0
    for mean_conf, observed, size in zip(
        bins["bin_confidence"], bins["bin_accuracy"], sizes
    ):
        # Empty bins carry None for confidence/accuracy and add nothing.
        if size:
            weighted_gap += (size / n_binned) * abs(mean_conf - observed)
    return float(weighted_gap)
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
"""Per-class confidence and minority-class focused metrics."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
from sklearn.metrics import (
|
|
7
|
+
precision_score,
|
|
8
|
+
recall_score,
|
|
9
|
+
f1_score,
|
|
10
|
+
confusion_matrix,
|
|
11
|
+
)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def per_class_confidence(y_true, y_pred_proba, classes=None):
    """
    Compute the mean predicted-probability "confidence" the model assigns
    to the correct class, broken down per class.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        True integer class labels (0..n_classes-1).
    y_pred_proba : array-like of shape (n_samples, n_classes) or (n_samples,)
        Predicted probabilities from `model.predict_proba`. A 1-D array is
        treated as the positive-class probability of a binary problem (the
        format the rest of this package accepts) and expanded to the two
        columns ``[1 - p, p]``.
    classes : list, optional
        Labels for each class index, for display purposes. Defaults to the
        column indices ``0..n_classes-1``.

    Returns
    -------
    dict
        Mapping of class label -> mean probability assigned to that class
        over the samples truly belonging to it, or ``None`` for a class with
        no samples in ``y_true``.
    """
    y_true = np.asarray(y_true)
    y_pred_proba = np.asarray(y_pred_proba)
    if y_pred_proba.ndim == 1:
        # Binary shortcut: column 0 is the negative class, column 1 the positive.
        y_pred_proba = np.column_stack((1.0 - y_pred_proba, y_pred_proba))

    n_classes = y_pred_proba.shape[1]
    if classes is None:
        classes = list(range(n_classes))

    result = {}
    for idx, label in enumerate(classes):
        # y_true is matched against the column index, not the display label.
        mask = y_true == idx
        if not mask.any():
            result[label] = None
            continue
        result[label] = float(np.mean(y_pred_proba[mask, idx]))
    return result
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def minority_class_report(y_true, y_pred, classes=None, minority_label=None):
    """
    Precision/recall/F1 with explicit emphasis on the minority class.

    If `minority_label` is not given, the class with the lowest support
    in `y_true` is auto-detected.

    Parameters
    ----------
    y_true : array-like, ground-truth labels.
    y_pred : array-like, predicted labels.
    classes : optional
        Accepted for API compatibility; not used by this function.
    minority_label : optional
        Label to report on. Auto-detected from `y_true` when omitted.

    Returns
    -------
    dict with keys: minority_label, support, precision, recall, f1,
        confusion_matrix
        All values are plain Python objects (the auto-detected label is
        unwrapped from its NumPy scalar type and the confusion matrix is a
        nested list).

    Raises
    ------
    ValueError
        If `minority_label` must be auto-detected but `y_true` is empty.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    if minority_label is None:
        if y_true.size == 0:
            raise ValueError(
                "y_true is empty; cannot auto-detect the minority class."
            )
        labels, counts = np.unique(y_true, return_counts=True)
        minority_label = labels[np.argmin(counts)]

    # Keep the report free of NumPy scalar types, matching the float()/int()
    # casts and .tolist() applied to every other field.
    if isinstance(minority_label, np.generic):
        minority_label = minority_label.item()

    # Restricting `labels` to one class makes the macro average equal to
    # that class's own score.
    scoring = {"labels": [minority_label], "average": "macro", "zero_division": 0}
    precision = precision_score(y_true, y_pred, **scoring)
    recall = recall_score(y_true, y_pred, **scoring)
    f1 = f1_score(y_true, y_pred, **scoring)
    support = int(np.sum(y_true == minority_label))

    return {
        "minority_label": minority_label,
        "support": support,
        "precision": float(precision),
        "recall": float(recall),
        "f1": float(f1),
        "confusion_matrix": confusion_matrix(y_true, y_pred).tolist(),
    }
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
"""The single entry point: one honest report on production-readiness."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
|
|
7
|
+
from .calibration import calibration_score
|
|
8
|
+
from .threshold import optimal_threshold, cost_sensitive_threshold
|
|
9
|
+
from .metrics import minority_class_report
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def evaluation_report(
|
|
13
|
+
y_true,
|
|
14
|
+
y_pred_proba,
|
|
15
|
+
cost_fp: float = None,
|
|
16
|
+
cost_fn: float = None,
|
|
17
|
+
n_bins: int = 10,
|
|
18
|
+
):
|
|
19
|
+
"""
|
|
20
|
+
Produce one combined evaluation report for a binary classifier on
|
|
21
|
+
imbalanced data: minority-class performance, calibration quality,
|
|
22
|
+
a tuned decision threshold, and (optionally) a cost-aware threshold.
|
|
23
|
+
|
|
24
|
+
Parameters
|
|
25
|
+
----------
|
|
26
|
+
y_true : array-like, binary ground truth labels (0/1).
|
|
27
|
+
y_pred_proba : array-like, predicted probability of the positive (1) class.
|
|
28
|
+
cost_fp : float, optional. Business cost of one false positive.
|
|
29
|
+
cost_fn : float, optional. Business cost of one false negative.
|
|
30
|
+
If both cost_fp and cost_fn are given, a cost-sensitive threshold
|
|
31
|
+
is included in the report.
|
|
32
|
+
n_bins : int, bins used for calibration scoring.
|
|
33
|
+
|
|
34
|
+
Returns
|
|
35
|
+
-------
|
|
36
|
+
dict
|
|
37
|
+
{
|
|
38
|
+
"minority_class": {...},
|
|
39
|
+
"calibration_error": float,
|
|
40
|
+
"default_threshold_0.5": {...},
|
|
41
|
+
"optimal_f1_threshold": {...},
|
|
42
|
+
"cost_sensitive_threshold": {...} or None,
|
|
43
|
+
"verdict": str
|
|
44
|
+
}
|
|
45
|
+
"""
|
|
46
|
+
y_true = np.asarray(y_true)
|
|
47
|
+
y_pred_proba = np.asarray(y_pred_proba)
|
|
48
|
+
preds_at_half = (y_pred_proba >= 0.5).astype(int)
|
|
49
|
+
|
|
50
|
+
minority = minority_class_report(y_true, preds_at_half)
|
|
51
|
+
ece = calibration_score(y_true, y_pred_proba, n_bins=n_bins)
|
|
52
|
+
f1_opt = optimal_threshold(y_true, y_pred_proba)
|
|
53
|
+
|
|
54
|
+
cost_result = None
|
|
55
|
+
if cost_fp is not None and cost_fn is not None:
|
|
56
|
+
cost_result = cost_sensitive_threshold(y_true, y_pred_proba, cost_fp, cost_fn)
|
|
57
|
+
|
|
58
|
+
verdict = _build_verdict(minority, ece, f1_opt)
|
|
59
|
+
|
|
60
|
+
return {
|
|
61
|
+
"minority_class": minority,
|
|
62
|
+
"calibration_error": ece,
|
|
63
|
+
"optimal_f1_threshold": f1_opt,
|
|
64
|
+
"cost_sensitive_threshold": cost_result,
|
|
65
|
+
"verdict": verdict,
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def _build_verdict(minority: dict, ece: float, f1_opt: dict) -> str:
|
|
70
|
+
flags = []
|
|
71
|
+
if minority["recall"] < 0.5:
|
|
72
|
+
flags.append("minority-class recall is below 50% at the default 0.5 threshold")
|
|
73
|
+
if ece > 0.1:
|
|
74
|
+
flags.append(f"calibration error is high (ECE={ece:.3f}); confidence scores are not trustworthy")
|
|
75
|
+
if f1_opt["score"] - minority["f1"] > 0.15:
|
|
76
|
+
flags.append("default 0.5 threshold is far from optimal; consider using optimal_f1_threshold")
|
|
77
|
+
|
|
78
|
+
if not flags:
|
|
79
|
+
return "Looks production-ready on the dimensions checked. Validate further on a held-out set."
|
|
80
|
+
return "Not yet production-ready: " + "; ".join(flags) + "."
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
"""Decision threshold tuning for imbalanced problems."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
from sklearn.metrics import f1_score
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def optimal_threshold(y_true, y_pred_proba, metric: str = "f1", n_steps: int = 200):
    """
    Sweep decision thresholds and return the one that maximizes the
    chosen metric. The F1-maximizing threshold is usually a far better
    choice than the conventional 0.5 cut-off on imbalanced data.

    Parameters
    ----------
    y_true : array-like, binary ground truth.
    y_pred_proba : array-like, predicted probability of positive class.
    metric : {"f1"} currently supported.
    n_steps : int, number of evenly spaced thresholds tested between 0.01
        and 0.99 (inclusive); must be at least 1.

    Returns
    -------
    dict with keys: threshold, score
        The first (lowest) threshold achieving the best score wins ties.

    Raises
    ------
    ValueError
        If ``metric`` is not supported or ``n_steps`` is below 1.
    """
    # Validate once up front: inside the sweep an unsupported metric would
    # go unnoticed whenever the threshold grid is empty.
    if metric != "f1":
        raise ValueError(f"Unsupported metric: {metric}")
    if n_steps < 1:
        raise ValueError(f"n_steps must be at least 1, got {n_steps}")

    y_true = np.asarray(y_true)
    y_pred_proba = np.asarray(y_pred_proba)

    best_threshold, best_score = 0.5, -1.0
    for t in np.linspace(0.01, 0.99, n_steps):
        preds = (y_pred_proba >= t).astype(int)
        score = f1_score(y_true, preds, zero_division=0)
        # Strict comparison keeps the earliest threshold on ties.
        if score > best_score:
            best_score, best_threshold = score, t

    return {"threshold": float(best_threshold), "score": float(best_score)}
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def cost_sensitive_threshold(y_true, y_pred_proba, cost_fp: float, cost_fn: float, n_steps: int = 200):
    """
    Find the decision threshold that minimizes total business cost,
    given the real-world cost of a false positive vs a false negative.

    This is usually what people actually want on imbalanced data
    (e.g. fraud: missing fraud is far costlier than a false alarm).

    Parameters
    ----------
    y_true : array-like, binary ground truth.
    y_pred_proba : array-like, predicted probability of positive class.
        Must have the same shape as ``y_true``.
    cost_fp : float, cost incurred per false positive.
    cost_fn : float, cost incurred per false negative.
    n_steps : int, number of evenly spaced thresholds tested between 0.01
        and 0.99 (inclusive); must be at least 1.

    Returns
    -------
    dict with keys: threshold, total_cost, false_positives, false_negatives
        The first (lowest) threshold achieving the minimum cost wins ties.

    Raises
    ------
    ValueError
        If ``n_steps`` is below 1 or the inputs differ in shape.
    """
    y_true = np.asarray(y_true)
    y_pred_proba = np.asarray(y_pred_proba)

    if n_steps < 1:
        raise ValueError(f"n_steps must be at least 1, got {n_steps}")
    if y_true.shape != y_pred_proba.shape:
        # Differing shapes could otherwise broadcast and inflate the counts.
        raise ValueError(
            "y_true and y_pred_proba must have the same shape, got "
            f"{y_true.shape} and {y_pred_proba.shape}"
        )

    # The class masks do not depend on the threshold; build them once.
    is_negative = y_true == 0
    is_positive = y_true == 1

    best = {"threshold": 0.5, "total_cost": float("inf"), "false_positives": 0, "false_negatives": 0}

    for t in np.linspace(0.01, 0.99, n_steps):
        flagged = y_pred_proba >= t
        fp = int(np.count_nonzero(flagged & is_negative))
        fn = int(np.count_nonzero(~flagged & is_positive))
        total_cost = fp * cost_fp + fn * cost_fn
        # Strict comparison keeps the earliest threshold on ties.
        if total_cost < best["total_cost"]:
            best = {
                "threshold": float(t),
                "total_cost": float(total_cost),
                "false_positives": fp,
                "false_negatives": fn,
            }

    return best
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import pytest
|
|
3
|
+
|
|
4
|
+
from imbeval import (
|
|
5
|
+
evaluation_report,
|
|
6
|
+
calibration_score,
|
|
7
|
+
reliability_curve,
|
|
8
|
+
optimal_threshold,
|
|
9
|
+
cost_sensitive_threshold,
|
|
10
|
+
per_class_confidence,
|
|
11
|
+
minority_class_report,
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@pytest.fixture
def imbalanced_binary_data():
    """Seeded 1000-sample binary dataset with a 5% positive (minority) class."""
    n_samples, n_positive = 1000, 50
    rng = np.random.default_rng(42)

    labels = np.zeros(n_samples, dtype=int)
    labels[:n_positive] = 1
    rng.shuffle(labels)

    # Every sample gets a low base score; true positives get an extra boost
    # so they score higher than negatives.
    noise = rng.uniform(0, 0.3, size=n_samples)
    boost = rng.uniform(0.3, 0.6, size=n_samples)
    scores = np.clip(np.where(labels == 1, noise + boost, noise), 0, 1)
    return labels, scores
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def test_calibration_score_perfect_calibration():
    """ECE stays small when labels are sampled from the predicted probabilities."""
    n_samples = 5000
    rng = np.random.default_rng(0)
    proba = rng.uniform(0, 1, n_samples)
    draws = rng.uniform(0, 1, n_samples)
    # A label is 1 with probability equal to its predicted score, so the
    # scores are well calibrated by construction.
    y_true = (draws < proba).astype(int)

    assert calibration_score(y_true, proba, n_bins=10) < 0.08
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def test_reliability_curve_shape():
    """Every returned list has one entry per bin and all samples are binned."""
    n_bins = 5
    y_true = [0, 1, 0, 1, 1, 0]
    proba = [0.1, 0.9, 0.2, 0.8, 0.7, 0.3]

    curve = reliability_curve(y_true, proba, n_bins=n_bins)

    for key in ("bin_confidence", "bin_accuracy", "bin_count"):
        assert len(curve[key]) == n_bins
    assert sum(curve["bin_count"]) == len(y_true)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def test_optimal_threshold_beats_or_matches_default(imbalanced_binary_data):
    """The tuned threshold's F1 is valid and no worse than F1 at the 0.5 cut-off."""
    y_true, y_pred_proba = imbalanced_binary_data
    result = optimal_threshold(y_true, y_pred_proba)
    assert 0.0 < result["threshold"] < 1.0
    assert 0.0 <= result["score"] <= 1.0

    # Positive-class F1 at the default threshold, from raw counts. The
    # fixture always contains positives, so the denominator is non-zero.
    preds = y_pred_proba >= 0.5
    tp = int(np.sum(preds & (y_true == 1)))
    fp = int(np.sum(preds & (y_true == 0)))
    fn = int(np.sum(~preds & (y_true == 1)))
    default_f1 = 2 * tp / (2 * tp + fp + fn)

    # Small tolerance absorbs floating-point differences between the two
    # F1 computations.
    assert result["score"] >= default_f1 - 1e-9
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def test_cost_sensitive_threshold_favors_recall_when_fn_costly(imbalanced_binary_data):
    """Raising the false-negative cost must not raise the chosen threshold."""
    y_true, y_pred_proba = imbalanced_binary_data

    equal_costs = cost_sensitive_threshold(y_true, y_pred_proba, cost_fp=1, cost_fn=1)
    fn_heavy = cost_sensitive_threshold(y_true, y_pred_proba, cost_fp=1, cost_fn=50)

    # Penalizing false negatives heavily should push the threshold down.
    assert fn_heavy["threshold"] <= equal_costs["threshold"]
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def test_per_class_confidence_basic():
    """Mean probability on the true class is reported under each display label."""
    y_true = [0, 0, 1, 1]
    proba = np.array(
        [
            [0.9, 0.1],
            [0.6, 0.4],
            [0.3, 0.7],
            [0.2, 0.8],
        ]
    )

    result = per_class_confidence(y_true, proba, classes=["neg", "pos"])

    # neg: mean(0.9, 0.6); pos: mean(0.7, 0.8)
    for label in ("neg", "pos"):
        assert result[label] == pytest.approx(0.75)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def test_minority_class_report_autodetects_minority(imbalanced_binary_data):
    """Without an explicit label, the rarer class (1) is picked and scored."""
    y_true, y_pred_proba = imbalanced_binary_data
    hard_preds = (y_pred_proba >= 0.5).astype(int)

    report = minority_class_report(y_true, hard_preds)

    assert report["minority_label"] == 1
    assert report["support"] == int(np.sum(y_true == 1))
    for metric in ("precision", "recall"):
        assert 0.0 <= report[metric] <= 1.0
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def test_evaluation_report_end_to_end(imbalanced_binary_data):
    """With both costs supplied, the report carries every section plus a verdict."""
    y_true, y_pred_proba = imbalanced_binary_data

    report = evaluation_report(y_true, y_pred_proba, cost_fp=1, cost_fn=20)

    for section in ("minority_class", "calibration_error", "optimal_f1_threshold"):
        assert section in report
    assert report["cost_sensitive_threshold"] is not None
    assert isinstance(report["verdict"], str)
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def test_evaluation_report_without_costs(imbalanced_binary_data):
    """Omitting the costs leaves the cost-sensitive section unset."""
    labels, scores = imbalanced_binary_data
    cost_section = evaluation_report(labels, scores)["cost_sensitive_threshold"]
    assert cost_section is None
|