pearsonify 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pearsonify/__init__.py +5 -0
- pearsonify/utils.py +22 -0
- pearsonify/wrapper.py +93 -0
- pearsonify-0.1.0.dist-info/METADATA +104 -0
- pearsonify-0.1.0.dist-info/RECORD +7 -0
- pearsonify-0.1.0.dist-info/WHEEL +4 -0
- pearsonify-0.1.0.dist-info/licenses/LICENSE.md +9 -0
pearsonify/__init__.py
ADDED
pearsonify/utils.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
"""Utilities for Pearsonify: Pearson residuals, confidence intervals, coverage."""
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def compute_pearson_residuals(y_true, y_pred_proba, eps=1e-10):
    """Compute Pearson residuals for binary classification.

    The residual of one observation is
    ``(y - p) / sqrt(p * (1 - p))`` where ``p`` is the predicted probability.

    Parameters:
    - y_true: Array-like of observed labels (0/1).
    - y_pred_proba: Array-like of predicted probabilities.
    - eps: Clipping margin; probabilities are clipped to ``[eps, 1 - eps]``
      so the denominator never becomes zero. Should be a small value in
      ``[0, 0.5)``. Defaults to ``1e-10``.

    Returns:
    - NumPy array of Pearson residuals, one per observation.
    """
    # Coerce both inputs so plain Python sequences behave like arrays.
    observed = np.asarray(y_true, dtype=float)
    # Clipping keeps p * (1 - p) strictly positive for p == 0 or p == 1.
    proba = np.clip(np.asarray(y_pred_proba, dtype=float), eps, 1 - eps)
    return (observed - proba) / np.sqrt(proba * (1 - proba))
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def compute_confidence_intervals(y_pred_proba, q_alpha):
    """Compute confidence intervals based on Pearson residuals.

    Each interval is ``p +/- q_alpha * sqrt(p * (1 - p))``, truncated to the
    unit interval.

    Parameters:
    - y_pred_proba: Array-like of predicted probabilities.
    - q_alpha: Multiplier applied to the per-sample standard error
      (the calibrated residual quantile).

    Returns:
    - Tuple ``(lower_bounds, upper_bounds)`` with values in ``[0, 1]``.
    """
    # Coerce first: ``1 - y_pred_proba`` raises TypeError for a plain list.
    proba = np.asarray(y_pred_proba, dtype=float)
    std_error = np.sqrt(proba * (1 - proba))
    half_width = q_alpha * std_error
    # Probabilities cannot leave [0, 1], so truncate both ends.
    lower_bounds = np.maximum(0, proba - half_width)
    upper_bounds = np.minimum(1, proba + half_width)
    return lower_bounds, upper_bounds
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def calculate_coverage(y_true, lower_bounds, upper_bounds):
    """Calculate the empirical coverage of confidence intervals.

    Parameters:
    - y_true: Array-like of observed values.
    - lower_bounds: Array-like of interval lower bounds.
    - upper_bounds: Array-like of interval upper bounds.

    Returns:
    - Fraction of observations with ``lower <= y_true <= upper``.
    """
    # Coerce to arrays: comparing two Python lists is lexicographic and
    # yields a single bool, which would silently give a wrong coverage.
    observed = np.asarray(y_true)
    lower = np.asarray(lower_bounds)
    upper = np.asarray(upper_bounds)
    inside = (observed >= lower) & (observed <= upper)
    return np.mean(inside)
|
pearsonify/wrapper.py
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
import matplotlib.pyplot as plt
|
|
2
|
+
import numpy as np
|
|
3
|
+
from sklearn.base import BaseEstimator
|
|
4
|
+
from sklearn.utils.validation import NotFittedError, check_is_fitted
|
|
5
|
+
|
|
6
|
+
from .utils import (
|
|
7
|
+
calculate_coverage,
|
|
8
|
+
compute_confidence_intervals,
|
|
9
|
+
compute_pearson_residuals,
|
|
10
|
+
)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class Pearsonify:
    """Pearson-residual intervals around a binary classifier's probabilities.

    The wrapped estimator supplies ``predict_proba``; a held-out calibration
    set is used to pick the quantile of absolute Pearson residuals that
    scales the interval width.
    """

    def __init__(self, estimator: BaseEstimator, alpha=0.05):
        """
        Initialize with a model that implements `fit` and `predict_proba`.

        Parameters:
        - estimator: A scikit-learn-like classifier with `fit` and `predict_proba` methods.
        - alpha: Significance level (e.g., 0.05 for 95% intervals).
        """
        self.estimator = estimator
        self.alpha = alpha
        # Calibrated residual quantile; stays None until `fit` has run.
        self.q_alpha = None

    def fit(self, X_train, y_train, X_cal, y_cal):
        """Fit the model and compute Pearson residual-based quantile from calibration data.

        Parameters:
        - X_train, y_train: Training data; only used when the estimator is
          not fitted yet. An already fitted estimator is left untouched.
        - X_cal, y_cal: Calibration data used to compute the residual quantile.

        Returns:
        - The ``1 - alpha`` quantile of the absolute calibration residuals
          (also stored on ``self.q_alpha``).

        Raises:
        - TypeError: If the estimator fails validation or has no callable
          `predict_proba` method.
        """
        # Train the model only if it is not already fitted.
        try:
            check_is_fitted(self.estimator)
        except NotFittedError:
            self.estimator.fit(X_train, y_train)
        except TypeError as e:
            raise TypeError(f"Estimator validation failed: {e}") from e

        # Validate on both paths (pre-fitted and freshly fitted) so a missing
        # `predict_proba` surfaces as a clear TypeError, not an AttributeError.
        if not callable(getattr(self.estimator, "predict_proba", None)):
            raise TypeError(
                "Estimator validation failed: "
                "The estimator must have a callable 'predict_proba' method."
            )

        # Compute residuals on the calibration set; column 1 is taken as the
        # positive-class probability.
        y_cal_pred_proba = self.estimator.predict_proba(X_cal)[:, 1]
        residuals = compute_pearson_residuals(y_cal, y_cal_pred_proba)
        # NOTE(review): this is the plain empirical (1 - alpha) quantile;
        # split conformal prediction often uses the finite-sample level
        # ceil((n + 1) * (1 - alpha)) / n - confirm which one is intended.
        self.q_alpha = np.quantile(np.abs(residuals), 1 - self.alpha)
        return self.q_alpha

    def predict_intervals(self, X_test):
        """Generate prediction intervals for new data.

        Returns:
        - Tuple ``(y_test_pred_proba, lower_bounds, upper_bounds)``.

        Raises:
        - ValueError: If `fit` has not been called yet.
        """
        if self.q_alpha is None:
            raise ValueError(
                "The model needs to be fitted before predicting intervals."
            )

        # Generate predicted probabilities for the test set
        y_test_pred_proba = self.estimator.predict_proba(X_test)[:, 1]
        lower_bounds, upper_bounds = compute_confidence_intervals(
            y_test_pred_proba, self.q_alpha
        )
        return y_test_pred_proba, lower_bounds, upper_bounds

    def evaluate_coverage(self, y_test, lower_bounds, upper_bounds):
        """Evaluate the empirical coverage."""
        return calculate_coverage(y_test, lower_bounds, upper_bounds)

    def plot_intervals(
        self, y_test_pred_proba, lower_bounds, upper_bounds, y_test=None
    ):
        """Plot the predicted probabilities with their confidence intervals.

        Samples are drawn in ascending order of predicted probability. When
        `y_test` is given, the empirical coverage is added to the legend.
        """
        # Coerce to arrays so the index-array selection below also works for
        # lists and for pandas objects with a non-default index.
        y_test_pred_proba = np.asarray(y_test_pred_proba)
        lower_bounds = np.asarray(lower_bounds)
        upper_bounds = np.asarray(upper_bounds)

        label = "Predicted Probability"
        if y_test is not None:
            coverage = self.evaluate_coverage(y_test, lower_bounds, upper_bounds)
            label += f" (Coverage: {coverage:.0%})"

        sorted_indices = np.argsort(y_test_pred_proba)
        plt.figure(figsize=(10, 6))
        plt.plot(
            y_test_pred_proba[sorted_indices],
            color="dodgerblue",
            label=label,
        )
        plt.fill_between(
            range(len(y_test_pred_proba)),
            lower_bounds[sorted_indices],
            upper_bounds[sorted_indices],
            color="lightblue",
            alpha=0.4,
        )
        plt.title(
            f"Confidence Intervals with Pearsonify\n"
            f"Confidence Level: {(1 - self.alpha):.0%}"
        )
        plt.xlabel("Sorted Test Sample Index")
        plt.ylabel("Probability")
        plt.legend()
        plt.show()
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pearsonify
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A lightweight package for computing confidence intervals for classification tasks using conformal prediction and Pearson residuals.
|
|
5
|
+
Project-URL: Repository, https://github.com/xRiskLab/pearsonify
|
|
6
|
+
Project-URL: Homepage, https://github.com/xRiskLab/pearsonify
|
|
7
|
+
Author-email: xRiskLab <contact@xrisklab.ai>
|
|
8
|
+
License-Expression: MIT
|
|
9
|
+
License-File: LICENSE.md
|
|
10
|
+
Keywords: classification,confidence intervals,conformal prediction,machine learning,pearson residuals
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
20
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
21
|
+
Requires-Python: >=3.9
|
|
22
|
+
Requires-Dist: matplotlib<4.0.0,>=3.7.0
|
|
23
|
+
Requires-Dist: numpy<2.0.0,>=1.24.0
|
|
24
|
+
Requires-Dist: scikit-learn<2.0.0,>=1.2.0
|
|
25
|
+
Description-Content-Type: text/markdown
|
|
26
|
+
|
|
27
|
+
# 💡 Pearsonify
|
|
28
|
+
## Probabilistic Classification with Conformalized Intervals
|
|
29
|
+
|
|
30
|
+
**Pearsonify** is a lightweight 🐍 Python package for generating **classification intervals** around predicted probabilities in binary classification tasks.
|
|
31
|
+
|
|
32
|
+
It uses **Pearson residuals** and **principles of conformal prediction** to quantify uncertainty without making strong distributional assumptions.
|
|
33
|
+
|
|
34
|
+

|
|
35
|
+
|
|
36
|
+
### 🚀 Why Pearsonify?
|
|
37
|
+
|
|
38
|
+
* 📊 **Intuitive Classification Intervals**: Get reliable intervals for binary classification predictions.
|
|
39
|
+
* 🧠 **Statistically Grounded**: Uses Pearson residuals, a well-established metric from classical statistics.
|
|
40
|
+
* ⚡ **Model-Agnostic**: Works with any model that provides probability estimates.
|
|
41
|
+
* 🛠️ **Lightweight**: Minimal dependencies, easy to integrate into existing projects.
|
|
42
|
+
|
|
43
|
+
### 📦 How to install?
|
|
44
|
+
|
|
45
|
+
Use `pip` to install the package from PyPI, or directly from GitHub:
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
pip install pearsonify
|
|
49
|
+
# or from GitHub:
|
|
50
|
+
pip install git+https://github.com/xRiskLab/pearsonify.git
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
### 💻 How to use?
|
|
54
|
+
|
|
55
|
+
```python
|
|
56
|
+
import numpy as np
|
|
57
|
+
from pearsonify import Pearsonify
|
|
58
|
+
from sklearn.svm import SVC
|
|
59
|
+
from sklearn.datasets import make_classification
|
|
60
|
+
from sklearn.model_selection import train_test_split
|
|
61
|
+
|
|
62
|
+
# Generate synthetic classification data
|
|
63
|
+
np.random.seed(42)
|
|
64
|
+
X, y = make_classification(
|
|
65
|
+
n_samples=1000, n_features=20, n_informative=10, n_classes=2, random_state=42
|
|
66
|
+
)
|
|
67
|
+
|
|
68
|
+
# Split data into train, calibration, and test sets
|
|
69
|
+
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
|
|
70
|
+
X_cal, X_test, y_cal, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
|
|
71
|
+
|
|
72
|
+
# Initialize Pearsonify with an SVC model
|
|
73
|
+
clf = SVC(probability=True, random_state=42)
|
|
74
|
+
model = Pearsonify(estimator=clf, alpha=0.05)
|
|
75
|
+
|
|
76
|
+
# Fit the model on training and calibration sets
|
|
77
|
+
model.fit(X_train, y_train, X_cal, y_cal)
|
|
78
|
+
|
|
79
|
+
# Generate prediction intervals for test set
|
|
80
|
+
y_test_pred_proba, lower_bounds, upper_bounds = model.predict_intervals(X_test)
|
|
81
|
+
|
|
82
|
+
# Calculate coverage
|
|
83
|
+
coverage = model.evaluate_coverage(y_test, lower_bounds, upper_bounds)
|
|
84
|
+
print(f"Coverage: {coverage:.2%}")
|
|
85
|
+
|
|
86
|
+
# Plot the intervals
|
|
87
|
+
model.plot_intervals(y_test_pred_proba, lower_bounds, upper_bounds)
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
Running `example.py` will generate the following plot:
|
|
91
|
+
|
|
92
|
+

|
|
93
|
+
|
|
94
|
+
This plot shows predicted probabilities with 95% confidence intervals, sorted by prediction score.
|
|
95
|
+
|
|
96
|
+
### 📖 References
|
|
97
|
+
|
|
98
|
+
Hosmer, D. W., Lemeshow, S., & Sturdivant, R. X. (2013). Applied Logistic Regression. John Wiley & Sons.
|
|
99
|
+
|
|
100
|
+
Tibshirani, R. (2023). Conformal Prediction. Advanced Topics in Statistical Learning, Spring 2023.
|
|
101
|
+
|
|
102
|
+
### 📝 License
|
|
103
|
+
|
|
104
|
+
This project is licensed under the MIT License - see the [LICENSE](LICENSE.md) file for details.
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
pearsonify/__init__.py,sha256=Xh8BsTaKUxBvRKO0KsLmLgqc0uhf-wD-brpH3htXu6c,96
|
|
2
|
+
pearsonify/utils.py,sha256=2M5bFg19PpTx8nkVZwhckN8My1r4TJ6bijRoekF8Rgs,923
|
|
3
|
+
pearsonify/wrapper.py,sha256=XlhXCsK1CKeZTdRCge7xx0vryakWtiL-fzZUkPUpZN0,3571
|
|
4
|
+
pearsonify-0.1.0.dist-info/METADATA,sha256=neNsmEZTHd9XP7Y7iBjBv4nuYsiT3WkG_BEB9hkyEX0,3980
|
|
5
|
+
pearsonify-0.1.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
6
|
+
pearsonify-0.1.0.dist-info/licenses/LICENSE.md,sha256=u_UGp1lzWTU8z40NWFG1EkMBuw5Z4OdPcoFrxjupouk,1073
|
|
7
|
+
pearsonify-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) [2026] [Denis Burakov]
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
|
6
|
+
|
|
7
|
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
|
8
|
+
|
|
9
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|