cbps 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. cbps/__init__.py +3462 -0
  2. cbps/constants.py +46 -0
  3. cbps/core/__init__.py +93 -0
  4. cbps/core/cbps_binary.py +1943 -0
  5. cbps/core/cbps_continuous.py +945 -0
  6. cbps/core/cbps_multitreat.py +1123 -0
  7. cbps/core/cbps_optimal.py +507 -0
  8. cbps/core/results.py +1447 -0
  9. cbps/data/Blackwell.csv +571 -0
  10. cbps/data/LaLonde.csv +3213 -0
  11. cbps/data/npcbps_continuous_sim.csv +501 -0
  12. cbps/data/nsw.csv +723 -0
  13. cbps/data/nsw_dw.csv +446 -0
  14. cbps/data/political_ads_urban_niebler.csv +16266 -0
  15. cbps/data/psid_controls.csv +2491 -0
  16. cbps/data/psid_controls2.csv +254 -0
  17. cbps/data/psid_controls3.csv +129 -0
  18. cbps/data/simulation_dgp1_seed12345.csv +201 -0
  19. cbps/data/simulation_dgp2_seed12345.csv +201 -0
  20. cbps/data/simulation_dgp3_seed12345.csv +201 -0
  21. cbps/data/simulation_dgp4_seed12345.csv +201 -0
  22. cbps/datasets/__init__.py +78 -0
  23. cbps/datasets/blackwell.py +112 -0
  24. cbps/datasets/continuous.py +223 -0
  25. cbps/datasets/lalonde.py +272 -0
  26. cbps/datasets/npcbps_sim.py +101 -0
  27. cbps/diagnostics/__init__.py +101 -0
  28. cbps/diagnostics/balance.py +760 -0
  29. cbps/diagnostics/balance_cbmsm_addon.py +162 -0
  30. cbps/diagnostics/continuous_diagnostics.py +259 -0
  31. cbps/diagnostics/normality.py +173 -0
  32. cbps/diagnostics/ocbps_conditions.py +197 -0
  33. cbps/diagnostics/overlap.py +198 -0
  34. cbps/diagnostics/plots.py +1193 -0
  35. cbps/diagnostics/weights_diag.py +205 -0
  36. cbps/highdim/__init__.py +84 -0
  37. cbps/highdim/gmm_loss.py +340 -0
  38. cbps/highdim/hdcbps.py +1078 -0
  39. cbps/highdim/lasso_utils.py +498 -0
  40. cbps/highdim/weight_funcs.py +298 -0
  41. cbps/inference/__init__.py +42 -0
  42. cbps/inference/asyvar.py +621 -0
  43. cbps/inference/vcov_outcome.py +217 -0
  44. cbps/iv/__init__.py +48 -0
  45. cbps/iv/cbiv.py +2603 -0
  46. cbps/logging_config.py +45 -0
  47. cbps/msm/__init__.py +45 -0
  48. cbps/msm/cbmsm.py +1871 -0
  49. cbps/msm/rank_diagnostics.py +112 -0
  50. cbps/nonparametric/__init__.py +58 -0
  51. cbps/nonparametric/cholesky_whitening.py +232 -0
  52. cbps/nonparametric/empirical_likelihood.py +339 -0
  53. cbps/nonparametric/npcbps.py +1036 -0
  54. cbps/nonparametric/taylor_approx.py +207 -0
  55. cbps/py.typed +0 -0
  56. cbps/sklearn/__init__.py +42 -0
  57. cbps/sklearn/estimator.py +378 -0
  58. cbps/utils/__init__.py +82 -0
  59. cbps/utils/formula.py +415 -0
  60. cbps/utils/helpers.py +378 -0
  61. cbps/utils/numerics.py +438 -0
  62. cbps/utils/r_compat.py +109 -0
  63. cbps/utils/validation.py +224 -0
  64. cbps/utils/variance_transform.py +483 -0
  65. cbps/utils/weights.py +586 -0
  66. cbps-0.2.0.dist-info/METADATA +1090 -0
  67. cbps-0.2.0.dist-info/RECORD +70 -0
  68. cbps-0.2.0.dist-info/WHEEL +5 -0
  69. cbps-0.2.0.dist-info/licenses/LICENSE +661 -0
  70. cbps-0.2.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,207 @@
1
+ """
2
+ Taylor Approximation Functions for Empirical Likelihood.
3
+
4
+ This module provides modified logarithm functions with second-order Taylor
5
+ series approximation for numerical stability in empirical likelihood
6
+ optimization. When the argument falls below a threshold (typically 1/N),
7
+ the Taylor approximation prevents log(0) singularities.
8
+
9
+ The key functions are:
10
+
11
+ - ``llog``: Modified log with Taylor branch for small arguments
12
+ - ``llogp``: Derivative of llog for gradient-based optimization
13
+
14
+ Mathematical Background
15
+ -----------------------
16
+ During empirical likelihood optimization, the objective involves
17
+ :math:`\\sum_i \\log w_i` where weights :math:`w_i = 1/(1 - \\gamma^T g_i)`.
18
+ When the denominator approaches zero, the logarithm diverges. The Taylor
19
+ approximation around :math:`\\epsilon = 1/N` ensures smooth optimization:
20
+
21
+ .. math::
22
+
23
+ \\log(z) \\approx \\log(\\epsilon) - 1.5 + 2(z/\\epsilon) - 0.5(z/\\epsilon)^2
24
+ \\quad \\text{for } z < \\epsilon
25
+
26
+ This approximation:
27
+
28
+ 1. Matches the true log at :math:`z = \\epsilon`
29
+ 2. Has continuous first derivative at the boundary
30
+ 3. Prevents numerical overflow during BFGS iterations
31
+
32
+ References
33
+ ----------
34
+ Fong, C., Hazlett, C., and Imai, K. (2018). Covariate balancing propensity
35
+ score for a continuous treatment: Application to the efficacy of political
36
+ advertisements. The Annals of Applied Statistics, 12(1), 156-177.
37
+ https://doi.org/10.1214/17-AOAS1101
38
+
39
+ See Section 3.3.2: "when the argument to the logarithmic function falls
40
+ below 1/N, we instead use the second order Taylor series approximation
41
+ to the log around the point 1/N."
42
+ """
43
+
44
+ import numpy as np
45
+
46
+
47
def llog(z: np.ndarray, eps: float) -> np.ndarray:
    """
    Modified logarithm with second-order Taylor approximation for small values.

    This function returns :math:`\\log(z)` when :math:`z \\geq \\epsilon`, and a
    second-order Taylor series approximation when :math:`z < \\epsilon`. The
    approximation prevents numerical issues when optimizing the empirical
    likelihood objective.

    Parameters
    ----------
    z : np.ndarray
        Input array (any array-like of real numbers is accepted and is
        converted to floating point). NaN values are preserved in the output.
        The input is never modified.
    eps : float
        Threshold below which Taylor approximation is used. In npCBPS,
        this is typically set to :math:`1/N` where N is the sample size.

    Returns
    -------
    np.ndarray
        Element-wise modified log values, as a new floating-point array with
        the same shape as the input.

    Notes
    -----
    **Taylor expansion formula:**

    For :math:`z < \\epsilon`:

    .. math::

        \\text{llog}(z) = \\log(\\epsilon) - 1.5 + 2\\frac{z}{\\epsilon}
                          - 0.5\\left(\\frac{z}{\\epsilon}\\right)^2

    For :math:`z \\geq \\epsilon`:

    .. math::

        \\text{llog}(z) = \\log(z)

    **Derivation:**

    The standard second-order Taylor expansion of :math:`\\log(z)` around
    :math:`a = \\epsilon` is:

    .. math::

        \\log(z) \\approx \\log(a) + \\frac{z-a}{a} - \\frac{(z-a)^2}{2a^2}

    Expanding and simplifying yields the coefficients -1.5, 2, and 0.5.

    **Boundary continuity:**

    At :math:`z = \\epsilon`, the Taylor branch evaluates to
    :math:`\\log(\\epsilon) - 1.5 + 2 - 0.5 = \\log(\\epsilon)`, matching
    the standard log branch exactly.

    References
    ----------
    Fong, C., Hazlett, C., and Imai, K. (2018). Section 3.3.2.

    Examples
    --------
    >>> import numpy as np
    >>> z = np.array([0.005, 0.01, 0.1, 1.0])
    >>> eps = 0.01
    >>> result = llog(z, eps)
    >>> # z < eps uses Taylor approximation
    >>> # z >= eps uses standard log
    >>> bool(np.isclose(result[1], np.log(eps)))
    True
    """
    # Coerce to float before allocating the output buffer: copying the input
    # as-is would inherit an integer dtype and silently truncate every
    # result, and would fail outright for plain Python sequences.
    values = np.asarray(z, dtype=float)
    ans = values.copy()

    # NaNs are excluded from the Taylor branch; they fall through to the log
    # branch, where log(NaN) keeps them as NaN.
    avoid_na = ~np.isnan(values)
    lo = (values < eps) & avoid_na

    # Taylor approximation branch (z < eps)
    ans[lo] = (
        np.log(eps) - 1.5 + 2 * values[lo] / eps - 0.5 * (values[lo] / eps) ** 2
    )

    # Standard log branch (z >= eps, plus NaN pass-through)
    ans[~lo] = np.log(values[~lo])

    return ans
129
+
130
+
131
def llogp(z: np.ndarray, eps: float) -> np.ndarray:
    """
    Derivative of the modified logarithm function.

    Computes the exact derivative of :func:`llog` for use in gradient-based
    optimization algorithms such as BFGS.

    Parameters
    ----------
    z : np.ndarray
        Input array (any array-like of real numbers is accepted and is
        converted to floating point). NaN values are preserved in the output.
        The input is never modified.
    eps : float
        Threshold matching the one used in :func:`llog`.

    Returns
    -------
    np.ndarray
        Element-wise derivative values, as a new floating-point array with
        the same shape as the input.

    Notes
    -----
    **Derivative formula:**

    For :math:`z < \\epsilon`:

    .. math::

        \\frac{d}{dz}\\text{llog}(z) = \\frac{2}{\\epsilon}
                                       - \\frac{z}{\\epsilon^2}

    For :math:`z \\geq \\epsilon`:

    .. math::

        \\frac{d}{dz}\\text{llog}(z) = \\frac{1}{z}

    **Derivation:**

    Taking the derivative of the Taylor branch:

    .. math::

        \\frac{d}{dz}\\left[\\log(\\epsilon) - 1.5 + \\frac{2z}{\\epsilon}
        - \\frac{z^2}{2\\epsilon^2}\\right]
        = \\frac{2}{\\epsilon} - \\frac{z}{\\epsilon^2}

    **Boundary continuity:**

    At :math:`z = \\epsilon`, both branches yield :math:`1/\\epsilon`.

    References
    ----------
    Fong, C., Hazlett, C., and Imai, K. (2018). Section 3.3.2.

    Examples
    --------
    >>> import numpy as np
    >>> z = np.array([0.005, 0.01, 0.1])
    >>> eps = 0.01
    >>> deriv = llogp(z, eps)
    >>> # Verify numerically
    >>> h = 1e-8
    >>> numerical = (llog(z + h, eps) - llog(z - h, eps)) / (2 * h)
    >>> np.allclose(deriv, numerical, rtol=1e-5)
    True
    """
    # Coerce to float before allocating the output buffer: copying the input
    # as-is would inherit an integer dtype and truncate 1/z to whole numbers,
    # and would fail outright for plain Python sequences.
    values = np.asarray(z, dtype=float)
    ans = values.copy()

    # NaNs are excluded from the Taylor branch; they fall through to the
    # reciprocal branch, where 1/NaN keeps them as NaN.
    avoid_na = ~np.isnan(values)
    lo = (values < eps) & avoid_na

    # Taylor derivative branch (z < eps)
    ans[lo] = 2 / eps - values[lo] / eps**2

    # Standard derivative branch (z >= eps, plus NaN pass-through)
    ans[~lo] = 1 / values[~lo]

    return ans
cbps/py.typed ADDED
File without changes
@@ -0,0 +1,42 @@
1
+ """
2
+ scikit-learn Integration
3
+ ========================
4
+
5
+ This module provides scikit-learn compatible wrappers for CBPS estimators,
6
+ enabling seamless integration with the sklearn ecosystem.
7
+
8
+ Classes
9
+ -------
10
+ CBPSEstimator
11
+ A scikit-learn compatible wrapper for discrete treatment CBPS that
12
+ inherits from ``BaseEstimator`` and ``ClassifierMixin``.
13
+
14
+ Features
15
+ --------
16
+ - Full compatibility with sklearn's ``Pipeline`` and ``FeatureUnion``
17
+ - Hyperparameter tuning via ``GridSearchCV`` and ``RandomizedSearchCV``
18
+ - Cross-validation support for model selection
19
+ - Access to CBPS weights through ``get_weights()`` for downstream analysis
20
+
21
+ Limitations
22
+ -----------
23
+ - Supports discrete treatments with 2-4 levels; for continuous treatments
24
+ use ``cbps.CBPS()`` directly
25
+ - Out-of-sample prediction via ``predict_proba()`` is not implemented;
26
+ for prediction on new data, use ``cbps.CBPS().predict(newdata=...)``
27
+ - Only array interface is available; formula interface requires ``cbps.CBPS()``
28
+
29
+ See Also
30
+ --------
31
+ cbps.CBPS : Main CBPS function with full feature support.
32
+
33
+ References
34
+ ----------
35
+ .. [1] Imai, K. and Ratkovic, M. (2014). Covariate Balancing Propensity Score.
36
+ Journal of the Royal Statistical Society, Series B, 76(1), 243-263.
37
+ https://doi.org/10.1111/rssb.12027
38
+ """
39
+
40
+ from cbps.sklearn.estimator import CBPSEstimator
41
+
42
+ __all__ = ['CBPSEstimator']
@@ -0,0 +1,378 @@
1
+ """
2
+ scikit-learn Compatible CBPS Estimator
3
+ ======================================
4
+
5
+ This module provides a scikit-learn compatible wrapper for the CBPS estimator,
6
+ enabling seamless integration with the sklearn ecosystem including Pipeline,
7
+ GridSearchCV, and cross-validation utilities.
8
+
9
+ The wrapper exposes CBPS functionality through the standard sklearn API
10
+ (fit, predict, predict_proba) while preserving access to CBPS-specific
11
+ outputs such as propensity score weights for inverse probability weighting.
12
+
13
+ References
14
+ ----------
15
+ .. [1] Imai, K. and Ratkovic, M. (2014). Covariate Balancing Propensity Score.
16
+ Journal of the Royal Statistical Society, Series B, 76(1), 243-263.
17
+ https://doi.org/10.1111/rssb.12027
18
+ """
19
+
20
+ from typing import Optional
21
+ import warnings
22
+ import numpy as np
23
+ from sklearn.base import BaseEstimator, ClassifierMixin
24
+ from sklearn.utils.validation import check_is_fitted
25
+
26
+
27
class CBPSEstimator(BaseEstimator, ClassifierMixin):
    """scikit-learn compatible wrapper for Covariate Balancing Propensity Score.

    This estimator wraps the CBPS methodology as a scikit-learn compatible
    classifier, enabling integration with sklearn's Pipeline, GridSearchCV,
    and cross-validation utilities.

    CBPS estimates propensity scores by simultaneously optimizing treatment
    prediction and covariate balance through the Generalized Method of Moments
    (GMM) framework.

    Parameters
    ----------
    att : {0, 1, 2}, default=1
        Target estimand for causal inference:

        - 0: Average Treatment Effect (ATE)
        - 1: Average Treatment Effect on the Treated (ATT), second level as treated
        - 2: ATT with first level as treated

        Multi-valued treatments (3-4 levels) only support att=0 (ATE).
    method : {'over', 'exact'}, default='over'
        GMM estimation method:

        - 'over': Over-identified GMM combining score function and balance conditions
        - 'exact': Just-identified GMM using balance conditions only
    two_step : bool, default=True
        If True, uses two-step GMM with pre-computed weight matrix (faster).
        If False, uses continuous updating GMM (better finite-sample properties).
    iterations : int, default=1000
        Maximum number of optimization iterations.
    standardize : bool, default=True
        If True, normalizes weights to sum to 1 within each treatment group.
        If False, returns Horvitz-Thompson weights.
    sample_weights : array-like of shape (n_samples,), optional
        Survey sampling weights. Defaults to uniform weights.

    Attributes
    ----------
    fitted_ : bool
        Indicates whether the model has been fitted.
    cbps_result_ : CBPSResults
        Complete CBPS result object containing coefficients, diagnostics,
        and convergence information.
    classes_ : ndarray of shape (n_classes,)
        Unique treatment levels observed during fitting.
    n_features_in_ : int
        Number of features seen during fit (excludes auto-added intercept).
    intercept_ : float or ndarray
        First row of ``cbps_result_.coefficients``: a float when the
        coefficient matrix has a single column, otherwise one value per
        column.
    coef_ : ndarray
        Remaining rows of ``cbps_result_.coefficients`` (everything after
        the first row).

    Notes
    -----
    **Limitations**

    - Supports array interface only; for formula interface use ``cbps.CBPS()``
    - Supports discrete treatments with 2-4 levels; for continuous treatments
      use ``cbps.CBPS()`` directly
    - ``predict_proba()`` returns stored training fitted values only; for
      prediction on new data, access ``cbps_result_.predict(newdata=...)``

    **Propensity Score Output**

    - Binary treatment: ``fitted_values`` is 1D array of shape (n,) representing P(T=1)
    - Multi-valued treatment (3-4 levels): ``fitted_values`` is 2D array of shape
      (n, K) where each row is a probability distribution over K treatment levels

    **Multi-valued Treatment**

    For treatments with 3-4 levels, the wrapper automatically converts numeric
    arrays to ``pd.Categorical`` to trigger multi-valued discrete CBPS (using
    multinomial logistic regression per Imai and Ratkovic 2014, Section 4.1).

    References
    ----------
    .. [1] Imai, K. and Ratkovic, M. (2014). Covariate Balancing Propensity Score.
       Journal of the Royal Statistical Society, Series B, 76(1), 243-263.
       https://doi.org/10.1111/rssb.12027

    Examples
    --------
    Basic usage with binary treatment:

    >>> from cbps.sklearn import CBPSEstimator
    >>> from cbps.datasets import load_lalonde
    >>> df = load_lalonde()
    >>> X = df[['age', 'educ', 're74', 're75']].values
    >>> y = df['treat'].values
    >>> est = CBPSEstimator(att=1, method='over')
    >>> est.fit(X, y)  # doctest: +ELLIPSIS
    CBPSEstimator(...)
    >>> weights = est.get_weights()
    >>> weights.shape
    (445,)

    Integration with sklearn Pipeline:

    >>> from sklearn.pipeline import Pipeline
    >>> from sklearn.preprocessing import StandardScaler
    >>> pipe = Pipeline([
    ...     ('scaler', StandardScaler()),
    ...     ('cbps', CBPSEstimator(att=1))
    ... ])
    >>> pipe.fit(X, y)  # doctest: +ELLIPSIS
    Pipeline(...)
    """

    def __init__(
        self,
        att: int = 1,
        method: str = 'over',
        two_step: bool = True,
        iterations: int = 1000,
        standardize: bool = True,
        sample_weights: Optional[np.ndarray] = None
    ):
        # CBPS core parameters (array interface only).
        # Stored verbatim, with no validation, so that sklearn's
        # get_params()/set_params()/clone() round-trip them unchanged.
        self.att = att
        self.method = method
        self.two_step = two_step
        self.iterations = iterations
        self.standardize = standardize
        self.sample_weights = sample_weights

    def fit(self, X, y):
        """Fit the CBPS model to the training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Covariate matrix. An intercept column is automatically added
            if not present.

        y : array-like of shape (n_samples,)
            Treatment assignment vector with 2-4 unique discrete values.
            For 3-4 levels, numeric arrays are automatically converted to
            ``pd.Categorical`` to use multi-valued discrete CBPS.

        Returns
        -------
        self : CBPSEstimator
            Fitted estimator.

        Raises
        ------
        ValueError
            If X is not 2-dimensional.
            If y is not 1-dimensional.
            If X and y have different numbers of samples.
            If y has fewer than 2 or more than 4 unique values.
            If ``att != 0`` for treatments with 3-4 levels.

        Notes
        -----
        Learned attributes are assigned only after the underlying CBPS fit
        and the coefficient extraction have both succeeded, so a failed
        re-fit leaves the previously fitted state fully intact instead of a
        mix of old results and new ``classes_`` / ``n_features_in_``.
        """
        # Imported lazily, as in the original implementation, so importing
        # this module does not pull in the full cbps package or pandas.
        from cbps import CBPS
        import pandas as pd

        X = np.asarray(X)

        # Preserve original y for CBPS (may be pd.Categorical for multi-valued)
        y_original = y
        y_array = np.asarray(y)  # For validation only

        if X.ndim != 2:
            raise ValueError(f"X must be a 2D array, got {X.ndim}D")

        if y_array.ndim != 1:
            raise ValueError(f"y must be a 1D array, got {y_array.ndim}D")

        if X.shape[0] != len(y_array):
            raise ValueError(
                f"Sample count mismatch: X has {X.shape[0]} samples, "
                f"y has {len(y_array)} samples"
            )

        # Computed once and reused both for validation and for classes_.
        classes = np.unique(y_array)
        n_unique = len(classes)

        if n_unique < 2:
            raise ValueError(
                "Treatment variable must have at least 2 unique values"
            )

        if n_unique > 4:
            raise ValueError(
                f"CBPSEstimator supports discrete treatments with 2-4 levels. "
                f"Received {n_unique} unique values. For continuous treatments, "
                f"use cbps.CBPS() directly."
            )

        if n_unique >= 3 and self.att != 0:
            raise ValueError(
                f"Multi-valued treatment ({n_unique} levels) requires att=0 (ATE). "
                f"ATT estimation is only available for binary treatments."
            )

        # For multi-valued treatment, ensure categorical type
        # This triggers multi-valued discrete CBPS instead of continuous
        if n_unique >= 3 and not isinstance(y_original, pd.Categorical):
            y_original = pd.Categorical(y_original)

        # Fit CBPS model (pass original y to preserve Categorical type).
        # Kept in a local until everything below has succeeded.
        result = CBPS(
            treatment=y_original,
            covariates=X,
            att=self.att,
            method=self.method,
            two_step=self.two_step,
            iterations=self.iterations,
            standardize=self.standardize,
            sample_weights=self.sample_weights
        )

        # Split the coefficient matrix into sklearn-style intercept / coef.
        coefs = result.coefficients
        if coefs.ndim == 2 and coefs.shape[1] == 1:
            # Binary treatment: (k, 1) -> intercept + coef_
            intercept = float(coefs[0, 0])
            coef = coefs[1:, 0]
        elif coefs.ndim == 2 and coefs.shape[1] > 1:
            # Multi-valued treatment: (k, J-1)
            intercept = coefs[0, :]
            coef = coefs[1:, :]
        else:
            flat = coefs.ravel()
            intercept = float(flat[0])
            coef = flat[1:]

        # Publish the fitted state in one go (sklearn convention: trailing
        # underscore attributes are set by fit). fitted_ goes last so that
        # check_is_fitted never sees a partially populated estimator.
        self.n_features_in_ = X.shape[1]
        self.classes_ = classes
        self.cbps_result_ = result
        self.intercept_ = intercept
        self.coef_ = coef
        self.fitted_ = True

        return self

    def predict_proba(self, X):
        """Return estimated propensity scores for observations.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Covariate matrix. Must have the same number of samples as the
            training data. The actual values are not used; this parameter
            exists for sklearn API compatibility.

        Returns
        -------
        proba : ndarray of shape (n_samples, n_classes)
            Propensity score matrix. For binary treatment, column 0 contains
            P(T=0) and column 1 contains P(T=1). For multi-valued treatment,
            each column k contains P(T=k).

        Raises
        ------
        ValueError
            If the number of samples in X differs from the training set size.

        Warns
        -----
        UserWarning
            Always issued to remind that this method returns stored fitted
            values rather than predictions on new data.
        """
        check_is_fitted(self, 'fitted_')

        fitted_values = self.cbps_result_.fitted_values

        # Only the row count of X is inspected; its values are ignored.
        X = np.asarray(X)
        n_samples_X = X.shape[0]
        n_samples_train = len(fitted_values)

        if n_samples_X != n_samples_train:
            raise ValueError(
                f"Sample count mismatch: X has {n_samples_X} samples, but the "
                f"model was fitted on {n_samples_train} samples. "
                f"predict_proba() only returns fitted values for training data."
            )

        warnings.warn(
            "predict_proba() returns stored fitted values from training. "
            "For prediction on new data, use self.cbps_result_.predict(newdata=...).",
            UserWarning,
            stacklevel=2
        )

        if len(self.classes_) == 2:
            # Binary: fitted_values is P(T=1), convert to (n, 2) matrix
            proba = np.column_stack([1 - fitted_values, fitted_values])
        else:
            # Multi-valued: fitted_values is already (n, K) matrix
            proba = fitted_values

        return proba

    def predict(self, X):
        """Predict treatment assignment based on maximum propensity score.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Covariate matrix. Must match the training data sample count.

        Returns
        -------
        y_pred : ndarray of shape (n_samples,)
            Predicted treatment class for each observation, determined by
            the treatment level with highest estimated propensity.

        Notes
        -----
        This method returns the treatment level with the maximum estimated
        propensity score for each observation. It is provided for sklearn
        API compatibility but has limited practical utility since CBPS
        propensity scores are estimated for weighting purposes, not
        classification.

        Because it delegates to :meth:`predict_proba`, it raises the same
        ``ValueError`` on a sample-count mismatch and emits the same
        ``UserWarning``.

        See Also
        --------
        predict_proba : Return probability estimates.
        get_weights : Return IPW weights (primary CBPS output).
        """
        proba = self.predict_proba(X)
        # Map the column index of the largest propensity back to its label.
        return self.classes_[np.argmax(proba, axis=1)]

    def get_weights(self):
        """Return inverse probability weights for causal effect estimation.

        Returns
        -------
        weights : ndarray of shape (n_samples,)
            Covariate balancing weights. When ``standardize=True`` (default),
            weights sum to 1 within each treatment group. Otherwise,
            Horvitz-Thompson weights are returned.

        Notes
        -----
        These weights are the primary output of CBPS estimation, designed
        for use in weighted outcome regressions or Horvitz-Thompson estimators
        to obtain unbiased estimates of causal effects.

        For ATE estimation (``att=0``), all observations receive positive
        weights. For ATT estimation (``att=1`` or ``att=2``), control group
        observations are reweighted to match the treated group's covariate
        distribution.

        Examples
        --------
        >>> est = CBPSEstimator(att=1).fit(X, y)  # doctest: +SKIP
        >>> weights = est.get_weights()  # doctest: +SKIP
        >>> # Use weights in outcome regression
        >>> from sklearn.linear_model import LinearRegression
        >>> outcome_model = LinearRegression()  # doctest: +SKIP
        >>> outcome_model.fit(X, outcome, sample_weight=weights)  # doctest: +SKIP
        """
        check_is_fitted(self, 'fitted_')
        # The weights are stored on the CBPS result object; nothing is
        # recomputed here.
        return self.cbps_result_.weights
378
+