cbps 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cbps/__init__.py +3462 -0
- cbps/constants.py +46 -0
- cbps/core/__init__.py +93 -0
- cbps/core/cbps_binary.py +1943 -0
- cbps/core/cbps_continuous.py +945 -0
- cbps/core/cbps_multitreat.py +1123 -0
- cbps/core/cbps_optimal.py +507 -0
- cbps/core/results.py +1447 -0
- cbps/data/Blackwell.csv +571 -0
- cbps/data/LaLonde.csv +3213 -0
- cbps/data/npcbps_continuous_sim.csv +501 -0
- cbps/data/nsw.csv +723 -0
- cbps/data/nsw_dw.csv +446 -0
- cbps/data/political_ads_urban_niebler.csv +16266 -0
- cbps/data/psid_controls.csv +2491 -0
- cbps/data/psid_controls2.csv +254 -0
- cbps/data/psid_controls3.csv +129 -0
- cbps/data/simulation_dgp1_seed12345.csv +201 -0
- cbps/data/simulation_dgp2_seed12345.csv +201 -0
- cbps/data/simulation_dgp3_seed12345.csv +201 -0
- cbps/data/simulation_dgp4_seed12345.csv +201 -0
- cbps/datasets/__init__.py +78 -0
- cbps/datasets/blackwell.py +112 -0
- cbps/datasets/continuous.py +223 -0
- cbps/datasets/lalonde.py +272 -0
- cbps/datasets/npcbps_sim.py +101 -0
- cbps/diagnostics/__init__.py +101 -0
- cbps/diagnostics/balance.py +760 -0
- cbps/diagnostics/balance_cbmsm_addon.py +162 -0
- cbps/diagnostics/continuous_diagnostics.py +259 -0
- cbps/diagnostics/normality.py +173 -0
- cbps/diagnostics/ocbps_conditions.py +197 -0
- cbps/diagnostics/overlap.py +198 -0
- cbps/diagnostics/plots.py +1193 -0
- cbps/diagnostics/weights_diag.py +205 -0
- cbps/highdim/__init__.py +84 -0
- cbps/highdim/gmm_loss.py +340 -0
- cbps/highdim/hdcbps.py +1078 -0
- cbps/highdim/lasso_utils.py +498 -0
- cbps/highdim/weight_funcs.py +298 -0
- cbps/inference/__init__.py +42 -0
- cbps/inference/asyvar.py +621 -0
- cbps/inference/vcov_outcome.py +217 -0
- cbps/iv/__init__.py +48 -0
- cbps/iv/cbiv.py +2603 -0
- cbps/logging_config.py +45 -0
- cbps/msm/__init__.py +45 -0
- cbps/msm/cbmsm.py +1871 -0
- cbps/msm/rank_diagnostics.py +112 -0
- cbps/nonparametric/__init__.py +58 -0
- cbps/nonparametric/cholesky_whitening.py +232 -0
- cbps/nonparametric/empirical_likelihood.py +339 -0
- cbps/nonparametric/npcbps.py +1036 -0
- cbps/nonparametric/taylor_approx.py +207 -0
- cbps/py.typed +0 -0
- cbps/sklearn/__init__.py +42 -0
- cbps/sklearn/estimator.py +378 -0
- cbps/utils/__init__.py +82 -0
- cbps/utils/formula.py +415 -0
- cbps/utils/helpers.py +378 -0
- cbps/utils/numerics.py +438 -0
- cbps/utils/r_compat.py +109 -0
- cbps/utils/validation.py +224 -0
- cbps/utils/variance_transform.py +483 -0
- cbps/utils/weights.py +586 -0
- cbps-0.2.0.dist-info/METADATA +1090 -0
- cbps-0.2.0.dist-info/RECORD +70 -0
- cbps-0.2.0.dist-info/WHEEL +5 -0
- cbps-0.2.0.dist-info/licenses/LICENSE +661 -0
- cbps-0.2.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,207 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Taylor Approximation Functions for Empirical Likelihood.
|
|
3
|
+
|
|
4
|
+
This module provides modified logarithm functions with second-order Taylor
|
|
5
|
+
series approximation for numerical stability in empirical likelihood
|
|
6
|
+
optimization. When the argument falls below a threshold (typically 1/N),
|
|
7
|
+
the Taylor approximation prevents log(0) singularities.
|
|
8
|
+
|
|
9
|
+
The key functions are:
|
|
10
|
+
|
|
11
|
+
- ``llog``: Modified log with Taylor branch for small arguments
|
|
12
|
+
- ``llogp``: Derivative of llog for gradient-based optimization
|
|
13
|
+
|
|
14
|
+
Mathematical Background
|
|
15
|
+
-----------------------
|
|
16
|
+
During empirical likelihood optimization, the objective involves
|
|
17
|
+
:math:`\\sum_i \\log w_i` where weights :math:`w_i = 1/(1 - \\gamma^T g_i)`.
|
|
18
|
+
When the denominator approaches zero, the logarithm diverges. The Taylor
|
|
19
|
+
approximation around :math:`\\epsilon = 1/N` ensures smooth optimization:
|
|
20
|
+
|
|
21
|
+
.. math::
|
|
22
|
+
|
|
23
|
+
\\log(z) \\approx \\log(\\epsilon) - 1.5 + 2(z/\\epsilon) - 0.5(z/\\epsilon)^2
|
|
24
|
+
\\quad \\text{for } z < \\epsilon
|
|
25
|
+
|
|
26
|
+
This approximation:
|
|
27
|
+
|
|
28
|
+
1. Matches the true log at :math:`z = \\epsilon`
|
|
29
|
+
2. Has continuous first and second derivatives at the boundary
|
|
30
|
+
3. Prevents numerical overflow during BFGS iterations
|
|
31
|
+
|
|
32
|
+
References
|
|
33
|
+
----------
|
|
34
|
+
Fong, C., Hazlett, C., and Imai, K. (2018). Covariate balancing propensity
|
|
35
|
+
score for a continuous treatment: Application to the efficacy of political
|
|
36
|
+
advertisements. The Annals of Applied Statistics, 12(1), 156-177.
|
|
37
|
+
https://doi.org/10.1214/17-AOAS1101
|
|
38
|
+
|
|
39
|
+
See Section 3.3.2: "when the argument to the logarithmic function falls
|
|
40
|
+
below 1/N, we instead use the second order Taylor series approximation
|
|
41
|
+
to the log around the point 1/N."
|
|
42
|
+
"""
|
|
43
|
+
|
|
44
|
+
import numpy as np
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def llog(z: np.ndarray, eps: float) -> np.ndarray:
    """
    Modified logarithm with second-order Taylor approximation for small values.

    Returns :math:`\\log(z)` element-wise where :math:`z \\geq \\epsilon`, and
    the second-order Taylor expansion of the logarithm around
    :math:`\\epsilon` where :math:`z < \\epsilon`. The Taylor branch is finite
    for zero and negative arguments, which keeps the empirical likelihood
    objective well defined during optimization.

    Parameters
    ----------
    z : np.ndarray
        Input values. Any array-like (including a scalar) is accepted.
        Integer and boolean input is promoted to ``float64`` so that results
        are not truncated; floating-point dtypes are preserved. NaN values
        are preserved in the output. The input is never modified.
    eps : float
        Threshold below which the Taylor approximation is used. In npCBPS
        this is typically :math:`1/N`, with N the sample size.

    Returns
    -------
    np.ndarray
        Element-wise modified log values with the same shape as the input
        (a 0-d array for scalar input).

    Notes
    -----
    For :math:`z < \\epsilon`:

    .. math::

        \\text{llog}(z) = \\log(\\epsilon) - 1.5 + 2\\frac{z}{\\epsilon}
                        - 0.5\\left(\\frac{z}{\\epsilon}\\right)^2

    which is the expansion
    :math:`\\log(a) + (z-a)/a - (z-a)^2/(2a^2)` at :math:`a = \\epsilon`
    after collecting terms. For :math:`z \\geq \\epsilon` the function is
    the ordinary logarithm. At :math:`z = \\epsilon` the Taylor branch gives
    :math:`\\log(\\epsilon) - 1.5 + 2 - 0.5 = \\log(\\epsilon)`, so the two
    branches join continuously.

    References
    ----------
    Fong, C., Hazlett, C., and Imai, K. (2018). Section 3.3.2.

    Examples
    --------
    >>> import numpy as np
    >>> z = np.array([0.005, 0.01, 0.1, 1.0])
    >>> eps = 0.01
    >>> result = llog(z, eps)
    >>> np.isclose(result[1], np.log(eps))
    True
    >>> bool(np.allclose(llog(np.array([1, 2]), 0.5), np.log([1.0, 2.0])))
    True
    """
    z = np.asarray(z)
    if not np.issubdtype(z.dtype, np.inexact):
        # Integer/bool buffers would truncate the values written below.
        z = z.astype(float)

    # Boolean-mask assignment needs at least one axis; remember scalar input
    # so the original (0-d) shape can be restored on return.
    scalar_input = z.ndim == 0
    if scalar_input:
        z = z.reshape(1)

    ans = z.copy()
    # NaN compares False with everything, but keep the mask explicit so NaN
    # entries are routed to the log branch, where they stay NaN.
    lo = (z < eps) & ~np.isnan(z)

    # Taylor approximation branch (z < eps)
    ans[lo] = np.log(eps) - 1.5 + 2 * z[lo]/eps - 0.5 * (z[lo]/eps)**2

    # Standard log branch (z >= eps, plus NaN)
    ans[~lo] = np.log(z[~lo])

    return ans.reshape(()) if scalar_input else ans
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def llogp(z: np.ndarray, eps: float) -> np.ndarray:
    """
    Derivative of the modified logarithm function.

    Computes the element-wise derivative of :func:`llog` with respect to its
    argument, for use in gradient-based optimization algorithms such as BFGS.

    Parameters
    ----------
    z : np.ndarray
        Input values. Any array-like (including a scalar) is accepted.
        Integer and boolean input is promoted to ``float64`` so that results
        are not truncated; floating-point dtypes are preserved. NaN values
        are preserved in the output. The input is never modified.
    eps : float
        Threshold matching the one used in :func:`llog`.

    Returns
    -------
    np.ndarray
        Element-wise derivative values with the same shape as the input
        (a 0-d array for scalar input).

    Notes
    -----
    For :math:`z < \\epsilon` (derivative of the Taylor branch):

    .. math::

        \\frac{d}{dz}\\text{llog}(z) = \\frac{2}{\\epsilon}
                                     - \\frac{z}{\\epsilon^2}

    For :math:`z \\geq \\epsilon`:

    .. math::

        \\frac{d}{dz}\\text{llog}(z) = \\frac{1}{z}

    At :math:`z = \\epsilon` both branches yield :math:`1/\\epsilon`.

    References
    ----------
    Fong, C., Hazlett, C., and Imai, K. (2018). Section 3.3.2.

    Examples
    --------
    >>> import numpy as np
    >>> z = np.array([0.005, 0.01, 0.1])
    >>> eps = 0.01
    >>> deriv = llogp(z, eps)
    >>> bool(np.allclose(deriv, [150.0, 100.0, 10.0]))
    True
    """
    z = np.asarray(z)
    if not np.issubdtype(z.dtype, np.inexact):
        # Integer/bool buffers would truncate 1/z and the Taylor slope.
        z = z.astype(float)

    # Boolean-mask assignment needs at least one axis; remember scalar input
    # so the original (0-d) shape can be restored on return.
    scalar_input = z.ndim == 0
    if scalar_input:
        z = z.reshape(1)

    ans = z.copy()
    # NaN entries are excluded from the Taylor branch and fall through to
    # 1/z, where they remain NaN.
    lo = (z < eps) & ~np.isnan(z)

    # Taylor derivative branch (z < eps)
    ans[lo] = 2/eps - z[lo]/eps**2

    # Standard derivative branch (z >= eps, plus NaN)
    ans[~lo] = 1/z[~lo]

    return ans.reshape(()) if scalar_input else ans
|
cbps/py.typed
ADDED
|
File without changes
|
cbps/sklearn/__init__.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
"""
|
|
2
|
+
scikit-learn Integration
|
|
3
|
+
========================
|
|
4
|
+
|
|
5
|
+
This module provides scikit-learn compatible wrappers for CBPS estimators,
|
|
6
|
+
enabling seamless integration with the sklearn ecosystem.
|
|
7
|
+
|
|
8
|
+
Classes
|
|
9
|
+
-------
|
|
10
|
+
CBPSEstimator
|
|
11
|
+
A scikit-learn compatible wrapper for discrete treatment CBPS that
|
|
12
|
+
inherits from ``BaseEstimator`` and ``ClassifierMixin``.
|
|
13
|
+
|
|
14
|
+
Features
|
|
15
|
+
--------
|
|
16
|
+
- Full compatibility with sklearn's ``Pipeline`` and ``FeatureUnion``
|
|
17
|
+
- Hyperparameter tuning via ``GridSearchCV`` and ``RandomizedSearchCV``
|
|
18
|
+
- Cross-validation support for model selection
|
|
19
|
+
- Access to CBPS weights through ``get_weights()`` for downstream analysis
|
|
20
|
+
|
|
21
|
+
Limitations
|
|
22
|
+
-----------
|
|
23
|
+
- Supports discrete treatments with 2-4 levels; for continuous treatments
|
|
24
|
+
use ``cbps.CBPS()`` directly
|
|
25
|
+
- Out-of-sample prediction via ``predict_proba()`` is not implemented;
|
|
26
|
+
for prediction on new data, use ``cbps.CBPS().predict(newdata=...)``
|
|
27
|
+
- Only array interface is available; formula interface requires ``cbps.CBPS()``
|
|
28
|
+
|
|
29
|
+
See Also
|
|
30
|
+
--------
|
|
31
|
+
cbps.CBPS : Main CBPS function with full feature support.
|
|
32
|
+
|
|
33
|
+
References
|
|
34
|
+
----------
|
|
35
|
+
.. [1] Imai, K. and Ratkovic, M. (2014). Covariate Balancing Propensity Score.
|
|
36
|
+
Journal of the Royal Statistical Society, Series B, 76(1), 243-263.
|
|
37
|
+
https://doi.org/10.1111/rssb.12027
|
|
38
|
+
"""
|
|
39
|
+
|
|
40
|
+
from cbps.sklearn.estimator import CBPSEstimator
|
|
41
|
+
|
|
42
|
+
__all__ = ['CBPSEstimator']
|
|
@@ -0,0 +1,378 @@
|
|
|
1
|
+
"""
|
|
2
|
+
scikit-learn Compatible CBPS Estimator
|
|
3
|
+
======================================
|
|
4
|
+
|
|
5
|
+
This module provides a scikit-learn compatible wrapper for the CBPS estimator,
|
|
6
|
+
enabling seamless integration with the sklearn ecosystem including Pipeline,
|
|
7
|
+
GridSearchCV, and cross-validation utilities.
|
|
8
|
+
|
|
9
|
+
The wrapper exposes CBPS functionality through the standard sklearn API
|
|
10
|
+
(fit, predict, predict_proba) while preserving access to CBPS-specific
|
|
11
|
+
outputs such as propensity score weights for inverse probability weighting.
|
|
12
|
+
|
|
13
|
+
References
|
|
14
|
+
----------
|
|
15
|
+
.. [1] Imai, K. and Ratkovic, M. (2014). Covariate Balancing Propensity Score.
|
|
16
|
+
Journal of the Royal Statistical Society, Series B, 76(1), 243-263.
|
|
17
|
+
https://doi.org/10.1111/rssb.12027
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from typing import Optional
|
|
21
|
+
import warnings
|
|
22
|
+
import numpy as np
|
|
23
|
+
from sklearn.base import BaseEstimator, ClassifierMixin
|
|
24
|
+
from sklearn.utils.validation import check_is_fitted
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class CBPSEstimator(BaseEstimator, ClassifierMixin):
    """Covariate Balancing Propensity Score behind the scikit-learn API.

    Wraps ``cbps.CBPS`` as a classifier-style estimator so that it can be
    placed in sklearn's Pipeline, GridSearchCV, and cross-validation
    utilities. CBPS estimates propensity scores by jointly optimizing
    treatment prediction and covariate balance within the Generalized
    Method of Moments (GMM) framework.

    Parameters
    ----------
    att : {0, 1, 2}, default=1
        Target estimand:

        - 0: Average Treatment Effect (ATE)
        - 1: Average Treatment Effect on the Treated (ATT), with the second
          level taken as treated
        - 2: ATT with the first level taken as treated

        Treatments with 3-4 levels only support ``att=0``.
    method : {'over', 'exact'}, default='over'
        GMM estimation method:

        - 'over': over-identified GMM combining the score function and the
          balance conditions
        - 'exact': just-identified GMM using the balance conditions only
    two_step : bool, default=True
        True selects two-step GMM with a pre-computed weight matrix
        (faster); False selects continuous-updating GMM (better
        finite-sample properties).
    iterations : int, default=1000
        Maximum number of optimization iterations.
    standardize : bool, default=True
        True normalizes weights to sum to 1 within each treatment group;
        False returns Horvitz-Thompson weights.
    sample_weights : array-like of shape (n_samples,), optional
        Survey sampling weights. Defaults to uniform weights.

    Attributes
    ----------
    fitted_ : bool
        Set to True once :meth:`fit` has run.
    cbps_result_ : CBPSResults
        Full CBPS result object (coefficients, diagnostics, convergence
        information).
    classes_ : ndarray of shape (n_classes,)
        Unique treatment levels observed during fitting.
    n_features_in_ : int
        Number of columns of ``X`` seen during fit (the auto-added
        intercept is not counted).
    intercept_ : float or ndarray
        First row of the fitted coefficient matrix.
    coef_ : ndarray
        Remaining rows of the fitted coefficient matrix.

    Notes
    -----
    **Limitations**

    - Array interface only; use ``cbps.CBPS()`` for the formula interface.
    - Discrete treatments with 2-4 levels only; use ``cbps.CBPS()``
      directly for continuous treatments.
    - ``predict_proba()`` returns the fitted values stored at training
      time; for new data use ``cbps_result_.predict(newdata=...)``.

    **Propensity score output**

    - Binary treatment: ``fitted_values`` is a 1D array of shape (n,)
      holding P(T=1).
    - Multi-valued treatment (3-4 levels): ``fitted_values`` is a 2D array
      of shape (n, K), each row a probability distribution over the K
      treatment levels.

    **Multi-valued treatment**

    With 3-4 levels the treatment is converted to ``pd.Categorical``
    before fitting so that multi-valued discrete CBPS is used (multinomial
    logistic regression per Imai and Ratkovic 2014, Section 4.1).

    References
    ----------
    .. [1] Imai, K. and Ratkovic, M. (2014). Covariate Balancing Propensity Score.
       Journal of the Royal Statistical Society, Series B, 76(1), 243-263.
       https://doi.org/10.1111/rssb.12027

    Examples
    --------
    Basic usage with binary treatment:

    >>> from cbps.sklearn import CBPSEstimator
    >>> from cbps.datasets import load_lalonde
    >>> df = load_lalonde()
    >>> X = df[['age', 'educ', 're74', 're75']].values
    >>> y = df['treat'].values
    >>> est = CBPSEstimator(att=1, method='over')
    >>> est.fit(X, y)  # doctest: +ELLIPSIS
    CBPSEstimator(...)
    >>> weights = est.get_weights()
    >>> weights.shape
    (445,)

    Integration with sklearn Pipeline:

    >>> from sklearn.pipeline import Pipeline
    >>> from sklearn.preprocessing import StandardScaler
    >>> pipe = Pipeline([
    ...     ('scaler', StandardScaler()),
    ...     ('cbps', CBPSEstimator(att=1))
    ... ])
    >>> pipe.fit(X, y)  # doctest: +ELLIPSIS
    Pipeline(...)
    """

    def __init__(
        self,
        att: int = 1,
        method: str = 'over',
        two_step: bool = True,
        iterations: int = 1000,
        standardize: bool = True,
        sample_weights: Optional[np.ndarray] = None
    ):
        # Constructor arguments are stored untouched under their own names;
        # all validation is deferred to fit().
        self.att = att
        self.method = method
        self.two_step = two_step
        self.iterations = iterations
        self.standardize = standardize
        self.sample_weights = sample_weights

    def fit(self, X, y):
        """Estimate the CBPS model from covariates and treatment.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Covariate matrix. An intercept column is added automatically
            if not present.

        y : array-like of shape (n_samples,)
            Treatment assignment with 2-4 distinct discrete values. With
            3-4 levels, input that is not already a ``pd.Categorical`` is
            converted to one so that multi-valued discrete CBPS is used.

        Returns
        -------
        self : CBPSEstimator
            The fitted estimator.

        Raises
        ------
        ValueError
            If X is not 2-dimensional.
            If y is not 1-dimensional.
            If X and y have different numbers of samples.
            If y has fewer than 2 or more than 4 unique values.
            If ``att != 0`` for treatments with 3-4 levels.
        """
        # Imported at call time rather than at module import time
        # (presumably to avoid a circular import with the cbps package;
        # confirm against the package layout).
        from cbps import CBPS
        import pandas as pd

        X = np.asarray(X)

        # The array view of y is used for validation and class discovery;
        # the object handed to CBPS keeps its original type unless it has
        # to be made categorical below.
        y_array = np.asarray(y)
        treatment = y

        if X.ndim != 2:
            raise ValueError(f"X must be a 2D array, got {X.ndim}D")
        if y_array.ndim != 1:
            raise ValueError(f"y must be a 1D array, got {y_array.ndim}D")
        if X.shape[0] != len(y_array):
            raise ValueError(
                f"Sample count mismatch: X has {X.shape[0]} samples, "
                f"y has {len(y_array)} samples"
            )

        levels = np.unique(y_array)
        n_unique = len(levels)

        if n_unique < 2:
            raise ValueError(
                "Treatment variable must have at least 2 unique values"
            )
        if n_unique > 4:
            raise ValueError(
                f"CBPSEstimator supports discrete treatments with 2-4 levels. "
                f"Received {n_unique} unique values. For continuous treatments, "
                f"use cbps.CBPS() directly."
            )

        multi_valued = n_unique >= 3
        if multi_valued and self.att != 0:
            raise ValueError(
                f"Multi-valued treatment ({n_unique} levels) requires att=0 (ATE). "
                f"ATT estimation is only available for binary treatments."
            )

        # A categorical treatment selects multi-valued discrete CBPS
        # instead of the continuous-treatment path.
        if multi_valued and not isinstance(treatment, pd.Categorical):
            treatment = pd.Categorical(treatment)

        self.n_features_in_ = X.shape[1]
        self.classes_ = levels

        self.cbps_result_ = CBPS(
            treatment=treatment,
            covariates=X,
            att=self.att,
            method=self.method,
            two_step=self.two_step,
            iterations=self.iterations,
            standardize=self.standardize,
            sample_weights=self.sample_weights
        )
        self.fitted_ = True

        # Split the coefficient matrix into sklearn-style attributes:
        # row 0 is the intercept, the remaining rows are the slopes.
        coefs = self.cbps_result_.coefficients
        is_matrix = coefs.ndim == 2
        if is_matrix and coefs.shape[1] > 1:
            # Multi-valued treatment: (k, J-1)
            intercept, slopes = coefs[0, :], coefs[1:, :]
        elif is_matrix and coefs.shape[1] == 1:
            # Binary treatment: (k, 1)
            intercept, slopes = float(coefs[0, 0]), coefs[1:, 0]
        else:
            flat = coefs.ravel()
            intercept, slopes = float(flat[0]), flat[1:]

        self.intercept_ = intercept
        self.coef_ = slopes

        return self

    def predict_proba(self, X):
        """Return the propensity scores stored at fit time.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Covariate matrix. Only its number of rows is checked, and it
            must equal the training sample count; the values themselves
            are not used. The parameter exists for sklearn API
            compatibility.

        Returns
        -------
        proba : ndarray of shape (n_samples, n_classes)
            For binary treatment, column 0 is P(T=0) and column 1 is
            P(T=1). For multi-valued treatment the stored fitted-value
            matrix is returned as is, column k holding P(T=k).

        Raises
        ------
        ValueError
            If the number of rows in X differs from the training set size.

        Warns
        -----
        UserWarning
            Always issued, as a reminder that stored fitted values are
            returned rather than predictions for new data.
        """
        check_is_fitted(self, 'fitted_')

        fitted_values = self.cbps_result_.fitted_values
        n_samples_X = np.asarray(X).shape[0]
        n_samples_train = len(fitted_values)

        if n_samples_X != n_samples_train:
            raise ValueError(
                f"Sample count mismatch: X has {n_samples_X} samples, but the "
                f"model was fitted on {n_samples_train} samples. "
                f"predict_proba() only returns fitted values for training data."
            )

        # stacklevel=2 attributes the warning to the caller of this method.
        warnings.warn(
            "predict_proba() returns stored fitted values from training. "
            "For prediction on new data, use self.cbps_result_.predict(newdata=...).",
            UserWarning,
            stacklevel=2
        )

        if len(self.classes_) != 2:
            # Multi-valued: fitted values already form the (n, K) matrix.
            return fitted_values

        # Binary: fitted values are P(T=1); prepend the complement.
        return np.column_stack([1 - fitted_values, fitted_values])

    def predict(self, X):
        """Return the treatment level with the highest propensity score.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Covariate matrix. Must match the training data sample count.

        Returns
        -------
        y_pred : ndarray of shape (n_samples,)
            For each observation, the entry of ``classes_`` whose column
            in :meth:`predict_proba` is largest.

        Notes
        -----
        Provided for sklearn API compatibility. Its practical use is
        limited because CBPS propensity scores are estimated for
        weighting, not for classification. The call goes through
        :meth:`predict_proba`, so the same sample-count check and
        UserWarning apply.

        See Also
        --------
        predict_proba : Return probability estimates.
        get_weights : Return IPW weights (primary CBPS output).
        """
        scores = self.predict_proba(X)
        winners = np.argmax(scores, axis=1)
        return self.classes_[winners]

    def get_weights(self):
        """Return the covariate balancing weights of the fitted model.

        Returns
        -------
        weights : ndarray of shape (n_samples,)
            The ``weights`` attribute of the stored CBPS result. With
            ``standardize=True`` (default) the weights sum to 1 within
            each treatment group; otherwise Horvitz-Thompson weights are
            returned.

        Notes
        -----
        The weights are the primary output of CBPS estimation and are
        meant for weighted outcome regressions or Horvitz-Thompson
        estimators of causal effects.

        For ATE estimation (``att=0``) all observations receive positive
        weights. For ATT estimation (``att=1`` or ``att=2``) the control
        group is reweighted to match the covariate distribution of the
        treated group.

        Examples
        --------
        >>> est = CBPSEstimator(att=1).fit(X, y)  # doctest: +SKIP
        >>> weights = est.get_weights()  # doctest: +SKIP
        >>> # Use weights in outcome regression
        >>> from sklearn.linear_model import LinearRegression
        >>> outcome_model = LinearRegression()  # doctest: +SKIP
        >>> outcome_model.fit(X, outcome, sample_weight=weights)  # doctest: +SKIP
        """
        check_is_fitted(self, 'fitted_')
        return self.cbps_result_.weights
|
|
378
|
+
|