cbps 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cbps/__init__.py +3462 -0
- cbps/constants.py +46 -0
- cbps/core/__init__.py +93 -0
- cbps/core/cbps_binary.py +1943 -0
- cbps/core/cbps_continuous.py +945 -0
- cbps/core/cbps_multitreat.py +1123 -0
- cbps/core/cbps_optimal.py +507 -0
- cbps/core/results.py +1447 -0
- cbps/data/Blackwell.csv +571 -0
- cbps/data/LaLonde.csv +3213 -0
- cbps/data/npcbps_continuous_sim.csv +501 -0
- cbps/data/nsw.csv +723 -0
- cbps/data/nsw_dw.csv +446 -0
- cbps/data/political_ads_urban_niebler.csv +16266 -0
- cbps/data/psid_controls.csv +2491 -0
- cbps/data/psid_controls2.csv +254 -0
- cbps/data/psid_controls3.csv +129 -0
- cbps/data/simulation_dgp1_seed12345.csv +201 -0
- cbps/data/simulation_dgp2_seed12345.csv +201 -0
- cbps/data/simulation_dgp3_seed12345.csv +201 -0
- cbps/data/simulation_dgp4_seed12345.csv +201 -0
- cbps/datasets/__init__.py +78 -0
- cbps/datasets/blackwell.py +112 -0
- cbps/datasets/continuous.py +223 -0
- cbps/datasets/lalonde.py +272 -0
- cbps/datasets/npcbps_sim.py +101 -0
- cbps/diagnostics/__init__.py +101 -0
- cbps/diagnostics/balance.py +760 -0
- cbps/diagnostics/balance_cbmsm_addon.py +162 -0
- cbps/diagnostics/continuous_diagnostics.py +259 -0
- cbps/diagnostics/normality.py +173 -0
- cbps/diagnostics/ocbps_conditions.py +197 -0
- cbps/diagnostics/overlap.py +198 -0
- cbps/diagnostics/plots.py +1193 -0
- cbps/diagnostics/weights_diag.py +205 -0
- cbps/highdim/__init__.py +84 -0
- cbps/highdim/gmm_loss.py +340 -0
- cbps/highdim/hdcbps.py +1078 -0
- cbps/highdim/lasso_utils.py +498 -0
- cbps/highdim/weight_funcs.py +298 -0
- cbps/inference/__init__.py +42 -0
- cbps/inference/asyvar.py +621 -0
- cbps/inference/vcov_outcome.py +217 -0
- cbps/iv/__init__.py +48 -0
- cbps/iv/cbiv.py +2603 -0
- cbps/logging_config.py +45 -0
- cbps/msm/__init__.py +45 -0
- cbps/msm/cbmsm.py +1871 -0
- cbps/msm/rank_diagnostics.py +112 -0
- cbps/nonparametric/__init__.py +58 -0
- cbps/nonparametric/cholesky_whitening.py +232 -0
- cbps/nonparametric/empirical_likelihood.py +339 -0
- cbps/nonparametric/npcbps.py +1036 -0
- cbps/nonparametric/taylor_approx.py +207 -0
- cbps/py.typed +0 -0
- cbps/sklearn/__init__.py +42 -0
- cbps/sklearn/estimator.py +378 -0
- cbps/utils/__init__.py +82 -0
- cbps/utils/formula.py +415 -0
- cbps/utils/helpers.py +378 -0
- cbps/utils/numerics.py +438 -0
- cbps/utils/r_compat.py +109 -0
- cbps/utils/validation.py +224 -0
- cbps/utils/variance_transform.py +483 -0
- cbps/utils/weights.py +586 -0
- cbps-0.2.0.dist-info/METADATA +1090 -0
- cbps-0.2.0.dist-info/RECORD +70 -0
- cbps-0.2.0.dist-info/WHEEL +5 -0
- cbps-0.2.0.dist-info/licenses/LICENSE +661 -0
- cbps-0.2.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,1036 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Nonparametric Covariate Balancing Propensity Score (npCBPS).
|
|
3
|
+
|
|
4
|
+
This module implements the nonparametric covariate balancing generalized
|
|
5
|
+
propensity score (npCBGPS) estimator from Section 3.3 of Fong, Hazlett,
|
|
6
|
+
and Imai (2018). Unlike parametric CBPS, this approach does not specify
|
|
7
|
+
a functional form for the propensity score. Instead, it directly estimates
|
|
8
|
+
inverse probability weights by maximizing the empirical likelihood subject
|
|
9
|
+
to covariate balance constraints.
|
|
10
|
+
|
|
11
|
+
Key Features
|
|
12
|
+
------------
|
|
13
|
+
- **Model-free**: No parametric assumptions about treatment assignment.
|
|
14
|
+
- **Empirical likelihood**: Weights chosen to maximize data likelihood.
|
|
15
|
+
- **Penalized imbalance**: Allows controlled finite-sample imbalance
|
|
16
|
+
via the ``corprior`` parameter.
|
|
17
|
+
|
|
18
|
+
Main API
|
|
19
|
+
--------
|
|
20
|
+
- :func:`npCBPS`: High-level function accepting formula and DataFrame.
|
|
21
|
+
- :class:`NPCBPSResults`: Container for estimated weights and diagnostics.
|
|
22
|
+
|
|
23
|
+
Algorithm Overview
|
|
24
|
+
------------------
|
|
25
|
+
1. Whiten covariates: :math:`X^* = S_X^{-1/2}(X - \\bar{X})`.
|
|
26
|
+
2. Standardize treatment: :math:`T^* = (T - \\bar{T})/s_T`.
|
|
27
|
+
3. Construct constraint matrix: :math:`g_i = (X_i^* T_i^*, X_i^*, T_i^*)^T`.
|
|
28
|
+
4. Line search over :math:`\\alpha \\in [0, 1]` to maximize penalized
|
|
29
|
+
likelihood (Equation 10).
|
|
30
|
+
5. Recover weights: :math:`w_i = 1/(1 - \\gamma^T(g_i - \\eta))`.
|
|
31
|
+
|
|
32
|
+
References
|
|
33
|
+
----------
|
|
34
|
+
Fong, C., Hazlett, C., and Imai, K. (2018). Covariate balancing propensity
|
|
35
|
+
score for a continuous treatment: Application to the efficacy of political
|
|
36
|
+
advertisements. The Annals of Applied Statistics, 12(1), 156-177.
|
|
37
|
+
https://doi.org/10.1214/17-AOAS1101
|
|
38
|
+
"""
|
|
39
|
+
|
|
40
|
+
from typing import Optional, Any, Union
|
|
41
|
+
import numpy as np
|
|
42
|
+
import pandas as pd
|
|
43
|
+
import scipy.optimize
|
|
44
|
+
|
|
45
|
+
from ..utils.formula import parse_formula
|
|
46
|
+
from .cholesky_whitening import cholesky_whitening
|
|
47
|
+
from .empirical_likelihood import get_w, log_post
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class NPCBPSSummary:
    """Summary object for NPCBPSResults.

    Returned by :meth:`NPCBPSResults.summary`. Provides a structured
    representation of npCBPS estimation results that can be printed
    via ``print()`` or ``str()``.

    Attributes
    ----------
    call : str or None
        String representation of the function call.
    n : int
        Total sample size (``len(y)``, or 0 when ``y`` is None).
    n_treat : int
        Number of treated units. Only meaningful when the treatment is
        coded 0/1 (or boolean); 0 for any other treatment coding.
    n_control : int
        Number of control units (``n - n_treat``). Only meaningful when the
        treatment is coded 0/1; 0 for any other treatment coding.
    converged : bool or None
        Whether the optimization converged.
    iterations : int or None
        Number of iterations used.
    sumw0 : float or None
        Sum of unnormalized weights (should ≈ 1.0).
    log_el : float or None
        Log empirical likelihood at the optimum.
    log_p_eta : float or None
        Log prior density at the optimum.
    par : float or None
        Optimal scaling parameter alpha.
    eta : np.ndarray or None
        Weighted correlations.
    weights : np.ndarray or None
        Final normalized weights.
    """

    def __init__(
        self,
        call: Optional[str],
        y: Optional[np.ndarray],
        converged: Optional[bool],
        iterations: Optional[int],
        sumw0: Optional[float],
        par: Optional[float],
        log_el: Optional[float],
        log_p_eta: Optional[float],
        eta: Optional[np.ndarray],
        weights: Optional[np.ndarray],
    ):
        self.call = call
        self.converged = converged
        self.iterations = iterations
        self.sumw0 = sumw0
        self.par = par
        self.log_el = log_el
        self.log_p_eta = log_p_eta
        self.eta = eta
        self.weights = weights

        # Derived sample info. Group counts are only defined for a 0/1
        # treatment; summing a continuous treatment would yield a
        # meaningless (possibly negative) "number of treated units".
        self.n = 0
        self.n_treat = 0
        self.n_control = 0
        self._binary_treatment = False
        if y is not None:
            self.n = len(y)
            n_treated = self._count_treated(y)
            if n_treated is not None:
                self._binary_treatment = True
                self.n_treat = n_treated
                self.n_control = self.n - n_treated

    @staticmethod
    def _count_treated(y) -> Optional[int]:
        """Return the number of treated units, or None if ``y`` is not 0/1 coded.

        ``y`` counts as binary when it is numeric or boolean and every entry
        equals 0 or 1. NaN entries compare unequal to both and therefore
        make the treatment non-binary.
        """
        values = np.asarray(y)
        if values.dtype.kind not in "biuf":
            return None
        if not np.all((values == 0) | (values == 1)):
            return None
        return int(values.sum())

    @staticmethod
    def _effective_sample_size(weights: np.ndarray) -> float:
        """Return ``(sum w)^2 / sum(w^2)`` for the given weights."""
        return (weights.sum() ** 2) / (weights ** 2).sum()

    def __str__(self) -> str:
        """Return formatted summary text.

        For a 0/1 treatment the output matches the legacy
        ``NPCBPSResults.summary()`` string. For any other treatment the
        treatment/control breakdown is omitted because it is undefined.
        """
        lines = []
        lines.append("\n" + "=" * 70)
        lines.append("npCBPS: Nonparametric Covariate Balancing Propensity Score")
        lines.append("=" * 70)
        lines.append("")

        # Call
        lines.append("Call:")
        lines.append(f" {self.call or 'npCBPS()'}")
        lines.append("")

        # Sample information
        if self.n > 0:
            lines.append(f"Sample size: {self.n}")
            if self._binary_treatment:
                lines.append(f" Treatment group: {self.n_treat} ({100*self.n_treat/self.n:.1f}%)")
                lines.append(f" Control group: {self.n_control} ({100*self.n_control/self.n:.1f}%)")
            lines.append("")

        # Convergence diagnostics
        lines.append("Convergence Diagnostics:")
        lines.append("-" * 70)
        if self.converged is not None:
            conv_status = "✓ Yes" if self.converged else "✗ No"
            lines.append(f" Converged: {conv_status}")

        if self.iterations is not None:
            lines.append(f" Iterations used: {self.iterations}")

        if self.sumw0 is not None:
            # sumw0 should be approximately 1.0 (key diagnostic from Fong et al. 2018)
            deviation = abs(self.sumw0 - 1.0)
            if deviation < 0.01:
                status = "✓ Excellent (within 1%)"
            elif deviation < 0.05:
                status = "✓ Good (within 5%)"
            elif deviation < 0.10:
                status = "⚠ Acceptable (within 10%)"
            else:
                status = "✗ Warning (>10% deviation)"

            lines.append(f" Sum of weights (sumw0): {self.sumw0:.6f} {status}")
            lines.append(" Theoretical value: 1.0")
            lines.append(f" Deviation: {deviation:.4f}")

        lines.append("")

        # Optimization results
        lines.append("Optimization Results:")
        lines.append("-" * 70)

        if self.par is not None:
            lines.append(f" Optimization parameter (alpha): {self.par:.6f}")

        if self.log_el is not None:
            lines.append(f" Log Empirical Likelihood: {self.log_el:.6f}")

        if self.log_p_eta is not None:
            lines.append(f" Log Prior Density p(η): {self.log_p_eta:.6f}")

        if self.log_el is not None and self.log_p_eta is not None:
            total_obj = self.log_el + self.log_p_eta
            lines.append(f" Total objective: {total_obj:.6f}")

        lines.append("")

        # Weighted correlations (key statistics)
        if self.eta is not None:
            lines.append("Weighted Correlations (η):")
            lines.append("-" * 70)

            eta_array = np.atleast_1d(self.eta)
            if len(eta_array) == 1:
                lines.append(f" η = {eta_array[0]:.6f}")
            else:
                lines.append(f" Number of correlations: {len(eta_array)}")
                lines.append(f" Mean: {eta_array.mean():.6f}")
                lines.append(f" Range: [{eta_array.min():.6f}, {eta_array.max():.6f}]")
                if len(eta_array) <= 10:
                    lines.append(" Values:")
                    for i, val in enumerate(eta_array):
                        lines.append(f" η[{i}] = {val:.6f}")

            lines.append("")

        # Coerce once so list-like weights work; an empty vector has no
        # min/max/ESS, so the weight sections are skipped instead of raising.
        weights = None
        if self.weights is not None:
            candidate = np.asarray(self.weights, dtype=float)
            if candidate.size > 0:
                weights = candidate
        ess = self._effective_sample_size(weights) if weights is not None else None

        # Weight statistics
        if weights is not None:
            lines.append("Weight Distribution:")
            lines.append("-" * 70)
            lines.append(f" Min: {weights.min():.6f}")
            lines.append(f" Q1: {np.percentile(weights, 25):.6f}")
            lines.append(f" Median: {np.median(weights):.6f}")
            lines.append(f" Mean: {weights.mean():.6f}")
            lines.append(f" Q3: {np.percentile(weights, 75):.6f}")
            lines.append(f" Max: {weights.max():.6f}")
            lines.append(f" Sum: {weights.sum():.6f}")

            # Effective sample size
            lines.append(f" Effective sample size: {ess:.1f}")
            if self.n > 0:
                efficiency = ess / self.n
                lines.append(f" Efficiency: {100*efficiency:.1f}%")

            lines.append("")

        # Diagnostic recommendations
        lines.append("Diagnostics:")
        lines.append("-" * 70)

        diagnostics = []

        if self.converged is False:
            diagnostics.append("⚠ Optimization did not converge - results may be unreliable")

        if self.sumw0 is not None and abs(self.sumw0 - 1.0) > 0.10:
            diagnostics.append("⚠ sumw0 deviates >10% from 1.0 - check optimization quality")

        if weights is not None:
            # Check weight range (infinite when the smallest weight is not positive)
            smallest = weights.min()
            weight_range = weights.max() / smallest if smallest > 0 else float('inf')
            if weight_range > 100:
                diagnostics.append(f"⚠ Large weight range ({weight_range:.1f}x) - may indicate overlap issues")

            # Check effective sample size
            if self.n > 0:
                efficiency = ess / self.n
                if efficiency < 0.5:
                    eff_pct = 100 * efficiency
                    diagnostics.append(
                        f"⚠ Low weighting efficiency ({eff_pct:.1f}%) - consider different corprior"
                    )

        if diagnostics:
            for diag in diagnostics:
                lines.append(f" {diag}")
        else:
            lines.append(" ✓ All diagnostics passed")

        lines.append("")
        lines.append("=" * 70)

        return "\n".join(lines)

    def __repr__(self) -> str:
        return f"NPCBPSSummary(n={self.n}, converged={self.converged})"
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
class NPCBPSResults:
    """
    Result container returned by :func:`npCBPS`.

    Holds the estimated weights, the optimization diagnostics, and the data
    the fit was run on. There are no model coefficients: npCBPS estimates
    weights directly rather than a parametric propensity score model.

    Attributes
    ----------
    weights : np.ndarray of shape (n,)
        Final normalized weights summing to n, for use in weighted outcome
        analysis.
    sumw0 : float
        Sum of the weights before normalization. Expected to be close to
        1.0 (within 5%); values far from 1 point to convergence problems.
    eta : np.ndarray of shape (K,)
        Optimal weighted correlation (alpha times eta_0), with K the number
        of covariates; the covariate-treatment association remaining after
        weighting.
    par : float
        Optimal scaling parameter alpha in [0, 1] from the line search.
        Near 0 means tight balance, near 1 means relaxed balance.
    log_el : float
        Log empirical likelihood at the optimum.
    log_p_eta : float
        Log prior density of eta at the optimum.
    y : np.ndarray of shape (n,)
        Treatment variable.
    x : np.ndarray of shape (n, K)
        Original covariate matrix (before whitening).
    converged : bool
        Whether the optimization converged successfully.
    iterations : int or None
        Number of iterations used in the optimization.
    formula : str
        Model formula used for fitting.
    data : pd.DataFrame
        Original input DataFrame.
    call : str
        String representation of the function call.
    terms : object
        patsy DesignInfo object for formula parsing (used by diagnostics).
    na_action : dict or None
        Information about missing value handling.

    See Also
    --------
    npCBPS : Function that creates this results object.
    CBPSResults : Results container for parametric CBPS.
    """

    def __init__(self):
        # Every field starts out unset; the fitting routine fills them in.

        # Weights and the data they were estimated from
        self.weights: Optional[np.ndarray] = None
        self.y: Optional[np.ndarray] = None
        self.x: Optional[np.ndarray] = None

        # Quantities specific to the npCBPS optimization
        self.par: Optional[float] = None
        self.eta: Optional[np.ndarray] = None
        self.log_el: Optional[float] = None
        self.log_p_eta: Optional[float] = None
        self.sumw0: Optional[float] = None

        # Convergence diagnostics
        self.converged: Optional[bool] = None
        self.iterations: Optional[int] = None

        # Provenance of the fit
        self.call: Optional[str] = None
        self.formula: Optional[str] = None
        self.data: Optional[pd.DataFrame] = None

        # Kept for compatibility with predict() and model diagnostics
        self.terms: Optional[object] = None  # patsy DesignInfo object
        self.na_action: Optional[dict] = None  # missing-value handling info

    def __repr__(self) -> str:
        """Return a one-line description for interactive sessions."""
        if self.y is None:
            sample_size = 0
        else:
            sample_size = len(self.y)

        if self.converged:
            status = "Yes"
        elif self.converged is None:
            status = "Unknown"
        else:
            status = "No"

        if self.sumw0 is None:
            sumw0_text = "N/A"
        else:
            sumw0_text = f"{self.sumw0:.4f}"

        return f"NPCBPSResults(n={sample_size}, converged={status}, sumw0={sumw0_text})"

    def __str__(self) -> str:
        """Return the multi-line report shown by ``print(result)``."""
        call_text = self.call or "npCBPS()"
        parts = ["\nCall:\n " + call_text + "\n\n"]

        # Sample information
        if self.y is not None:
            parts.append(f"Sample size: {len(self.y)}\n")

        # Convergence information
        if self.converged is not None:
            verdict = 'Yes' if self.converged else 'No'
            parts.append(f"Converged: {verdict}\n")
        if self.iterations is not None:
            parts.append(f"Iterations: {self.iterations}\n")

        # Key statistics
        if self.sumw0 is not None:
            within_tolerance = abs(self.sumw0 - 1.0) < 0.05
            sumw0_status = "Good" if within_tolerance else "Check"
            parts.append(
                f"Sum of weights (sumw0): {self.sumw0:.6f} ({sumw0_status}: should ≈ 1.0 ± 5%)\n"
            )

        if self.log_el is not None:
            parts.append(f"Log Empirical Likelihood: {self.log_el:.6f}\n")

        if self.log_p_eta is not None:
            parts.append(f"Log Prior Density: {self.log_p_eta:.6f}\n")

        if self.par is not None:
            parts.append(f"Optimization parameter (alpha): {self.par:.6f}\n")

        # Weight information
        if self.weights is not None:
            w = self.weights
            parts.extend([
                "\nWeights:\n",
                f" Min: {w.min():.6f}\n",
                f" Max: {w.max():.6f}\n",
                f" Mean: {w.mean():.6f}\n",
                f" Sum: {w.sum():.6f}\n",
            ])

        return "".join(parts)

    def summary(self) -> 'NPCBPSSummary':
        """
        Build a detailed diagnostic summary of the npCBPS fit.

        Returns
        -------
        NPCBPSSummary
            Summary object whose ``__str__`` produces the formatted report.
            Use ``print(result.summary())`` to display it.

        Notes
        -----
        The summary is more detailed than ``__str__()``. It covers:

        - Convergence diagnostics (whether sumw0 ≈ 1)
        - Optimization parameters
        - Empirical likelihood and priors
        - Weighted correlation eta
        - Weight distribution statistics

        Examples
        --------
        >>> from cbps import npCBPS
        >>> fit = npCBPS('treat ~ x1 + x2', data=df)
        >>> print(fit.summary())

        References
        ----------
        .. [1] Fong, C., Hazlett, C., and Imai, K. (2018). Covariate balancing
           propensity score for a continuous treatment. The Annals of Applied
           Statistics, 12(1), 156-177. https://doi.org/10.1214/17-AOAS1101
        """
        summary_fields = dict(
            call=self.call,
            y=self.y,
            converged=self.converged,
            iterations=self.iterations,
            sumw0=self.sumw0,
            par=self.par,
            log_el=self.log_el,
            log_p_eta=self.log_p_eta,
            eta=self.eta,
            weights=self.weights,
        )
        return NPCBPSSummary(**summary_fields)

    def balance(self, **kwargs):
        """
        Compute covariate balance statistics for this fit.

        Object-oriented shortcut for the package-level ``balance()``
        function: ``fit.balance(**kwargs)`` forwards to
        ``cbps.balance(fit, **kwargs)`` and returns its result unchanged.

        Parameters
        ----------
        **kwargs
            Additional arguments passed to the balance() function.
            Supported keys: enhanced (bool), threshold (float),
            covariate_names (list).

        Returns
        -------
        dict
            Dictionary containing balance statistics:
            - balanced: weighted covariate balance statistics
            - original/unweighted: unweighted baseline statistics
            See cbps.balance() documentation for details

        Notes
        -----
        Mirrors CBPSResults.balance() and CBMSMResults.balance() so all
        result objects share the same interface. According to the package
        documentation, the package-level function reports weighted
        correlations for continuous treatments and standardized mean
        differences for discrete treatments.

        Examples
        --------
        >>> fit = npCBPS('treat ~ age + educ', data=df)
        >>>
        >>> # Method 1: Standalone function
        >>> from cbps import balance
        >>> bal = balance(fit)
        >>>
        >>> # Method 2: Object method
        >>> bal = fit.balance()
        >>>
        >>> # Both methods return identical results
        """
        # Imported lazily: importing at module load would be circular.
        from cbps import balance as balance_func

        result = balance_func(self, **kwargs)
        return result

    def vcov(self):
        """
        Variance-covariance matrix of coefficients (not available).

        Notes
        -----
        npCBPS is nonparametric and estimates no model coefficients, so no
        variance-covariance matrix exists. The method is present only to
        fail with an explanatory error.

        Raises
        ------
        ValueError
            Always raised because npCBPS does not estimate coefficients.
        """
        message = (
            "npCBPS is a nonparametric method and does not estimate "
            "coefficients or their variance-covariance matrix."
        )
        raise ValueError(message)
|
|
518
|
+
|
|
519
|
+
|
|
520
|
+
|
|
521
|
+
|
|
522
|
+
def npCBPS(
|
|
523
|
+
formula: str,
|
|
524
|
+
data: pd.DataFrame,
|
|
525
|
+
na_action: Optional[str] = None,
|
|
526
|
+
corprior: Optional[float] = None,
|
|
527
|
+
print_level: int = 0,
|
|
528
|
+
seed: Optional[int] = None,
|
|
529
|
+
**kwargs: Any
|
|
530
|
+
) -> NPCBPSResults:
|
|
531
|
+
"""
|
|
532
|
+
Estimate nonparametric covariate balancing weights.
|
|
533
|
+
|
|
534
|
+
Implements the nonparametric CBGPS estimator from Section 3.3 of Fong,
|
|
535
|
+
Hazlett, and Imai (2018). This method estimates inverse probability
|
|
536
|
+
weights directly via empirical likelihood without specifying a
|
|
537
|
+
parametric propensity score model.
|
|
538
|
+
|
|
539
|
+
Parameters
|
|
540
|
+
----------
|
|
541
|
+
formula : str
|
|
542
|
+
Model formula specifying the treatment and covariates, e.g.,
|
|
543
|
+
``'treat ~ age + educ + income'``. The left-hand side is the
|
|
544
|
+
treatment variable; the right-hand side lists covariates.
|
|
545
|
+
data : pd.DataFrame
|
|
546
|
+
DataFrame containing all variables referenced in the formula.
|
|
547
|
+
na_action : {'warn', 'fail', 'ignore'}, optional
|
|
548
|
+
How to handle missing values:
|
|
549
|
+
|
|
550
|
+
- ``'warn'`` (default): Drop rows with missing values and warn.
|
|
551
|
+
- ``'fail'``: Raise ValueError if missing values are present.
|
|
552
|
+
- ``'ignore'``: Silently drop rows with missing values.
|
|
553
|
+
|
|
554
|
+
corprior : float, optional
|
|
555
|
+
Prior standard deviation :math:`\\sigma` for the allowed weighted
|
|
556
|
+
correlation :math:`\\eta \\sim N(0, \\sigma^2 I_K)`. Controls the
|
|
557
|
+
tradeoff between exact balance and stable weights.
|
|
558
|
+
|
|
559
|
+
If ``None`` (default), set to ``0.1/n`` following the paper's
|
|
560
|
+
recommendation in Section 3.3.4.
|
|
561
|
+
|
|
562
|
+
Interpretation:
|
|
563
|
+
|
|
564
|
+
- Smaller values enforce tighter balance but may produce extreme
|
|
565
|
+
weights or fail to converge.
|
|
566
|
+
- Larger values allow more imbalance but ensure convergence.
|
|
567
|
+
- The default ``0.1/n`` generally provides good balance while
|
|
568
|
+
ensuring convergence.
|
|
569
|
+
|
|
570
|
+
print_level : int, default=0
|
|
571
|
+
Verbosity level. If > 0, prints optimization diagnostics including
|
|
572
|
+
``log_post``, ``log_el``, ``log_p_eta``, and ``sumw0``.
|
|
573
|
+
seed : int, optional
|
|
574
|
+
Random seed for reproducibility.
|
|
575
|
+
**kwargs
|
|
576
|
+
Reserved for future extensions.
|
|
577
|
+
|
|
578
|
+
Returns
|
|
579
|
+
-------
|
|
580
|
+
NPCBPSResults
|
|
581
|
+
Fitted result object with attributes:
|
|
582
|
+
|
|
583
|
+
- **weights**: Final weights normalized to sum to n.
|
|
584
|
+
- **sumw0**: Sum of unnormalized weights (should be close to 1).
|
|
585
|
+
- **eta**: Optimal weighted correlations.
|
|
586
|
+
- **par**: Optimal scaling parameter :math:`\\alpha`.
|
|
587
|
+
- **log_el**: Log empirical likelihood.
|
|
588
|
+
- **log_p_eta**: Log prior density.
|
|
589
|
+
- **converged**: Whether optimization converged.
|
|
590
|
+
- **y**, **x**: Treatment and covariate arrays.
|
|
591
|
+
- **formula**, **data**, **call**: Metadata.
|
|
592
|
+
|
|
593
|
+
Raises
|
|
594
|
+
------
|
|
595
|
+
ValueError
|
|
596
|
+
If ``na_action='fail'`` and missing values are present, or if
|
|
597
|
+
``corprior`` is outside [0, 10].
|
|
598
|
+
RuntimeError
|
|
599
|
+
If optimization produces NaN weights.
|
|
600
|
+
|
|
601
|
+
Notes
|
|
602
|
+
-----
|
|
603
|
+
**Algorithm (Section 3.3 of Fong et al., 2018):**
|
|
604
|
+
|
|
605
|
+
1. Parse formula and extract treatment :math:`T` and covariates :math:`X`.
|
|
606
|
+
2. Whiten covariates via Cholesky decomposition (Section 3.1).
|
|
607
|
+
3. Construct constraint matrix :math:`g = (X^* T^*, X^*, T^*)^T`.
|
|
608
|
+
4. Line search over :math:`\\alpha \\in [0, 1]` to maximize the penalized
|
|
609
|
+
likelihood (Equation 10).
|
|
610
|
+
5. Recover weights :math:`w_i = 1/(1 - \\gamma^T(g_i - \\eta))`.
|
|
611
|
+
6. Normalize weights so that :math:`\\sum w_i = n`.
|
|
612
|
+
|
|
613
|
+
**Non-convexity (Section 3.3.2):**
|
|
614
|
+
|
|
615
|
+
The empirical likelihood objective is not generally convex, so there is
|
|
616
|
+
no guarantee of finding the global optimum. Results may vary slightly
|
|
617
|
+
between runs, which is expected behavior.
|
|
618
|
+
|
|
619
|
+
**Convergence diagnostic:**
|
|
620
|
+
|
|
621
|
+
The key diagnostic is ``sumw0``, the sum of unnormalized weights.
|
|
622
|
+
Values within 5% of 1.0 indicate successful convergence. Large
|
|
623
|
+
deviations suggest adjusting the ``corprior`` parameter.
|
|
624
|
+
|
|
625
|
+
References
|
|
626
|
+
----------
|
|
627
|
+
Fong, C., Hazlett, C., and Imai, K. (2018). Covariate balancing
|
|
628
|
+
propensity score for a continuous treatment: Application to the
|
|
629
|
+
efficacy of political advertisements. The Annals of Applied
|
|
630
|
+
Statistics, 12(1), 156-177. https://doi.org/10.1214/17-AOAS1101
|
|
631
|
+
|
|
632
|
+
Examples
|
|
633
|
+
--------
|
|
634
|
+
Basic usage with the LaLonde dataset:
|
|
635
|
+
|
|
636
|
+
>>> import pandas as pd
|
|
637
|
+
>>> from cbps import npCBPS
|
|
638
|
+
>>> from cbps.datasets import load_lalonde
|
|
639
|
+
>>> df = load_lalonde()
|
|
640
|
+
>>> fit = npCBPS('treat ~ age + educ + black + hisp + married + nodegr',
|
|
641
|
+
... data=df)
|
|
642
|
+
>>> print(f"Sum of weights: {fit.weights.sum():.1f}")
|
|
643
|
+
>>> print(f"sumw0 (should be ~1): {fit.sumw0:.4f}")
|
|
644
|
+
|
|
645
|
+
Adjusting the balance-variance tradeoff:
|
|
646
|
+
|
|
647
|
+
>>> # Tighter balance (may not converge for all datasets)
|
|
648
|
+
>>> fit_tight = npCBPS('treat ~ age + educ', data=df, corprior=0.001)
|
|
649
|
+
>>> # Looser balance (ensures convergence)
|
|
650
|
+
>>> fit_loose = npCBPS('treat ~ age + educ', data=df, corprior=0.1)
|
|
651
|
+
"""
|
|
652
|
+
# Set random seed for reproducibility
|
|
653
|
+
if seed is not None:
|
|
654
|
+
np.random.seed(seed)
|
|
655
|
+
|
|
656
|
+
# Handle na_action parameter
|
|
657
|
+
# Set default value
|
|
658
|
+
if na_action is None:
|
|
659
|
+
na_action = 'warn' # Default: warn and drop missing values
|
|
660
|
+
|
|
661
|
+
# Validate na_action parameter value
|
|
662
|
+
valid_na_actions = {'warn', 'fail', 'ignore'}
|
|
663
|
+
if na_action not in valid_na_actions:
|
|
664
|
+
raise ValueError(
|
|
665
|
+
f"Invalid na_action='{na_action}'. "
|
|
666
|
+
f"Valid options are: {', '.join(repr(x) for x in valid_na_actions)}. "
|
|
667
|
+
f"Note: use 'warn' (not 'drop') to remove missing values with a warning."
|
|
668
|
+
)
|
|
669
|
+
|
|
670
|
+
# Missing value handling (before formula parsing)
|
|
671
|
+
# Extract columns involved in formula
|
|
672
|
+
treat_col = formula.split('~')[0].strip()
|
|
673
|
+
covar_part = formula.split('~')[1]
|
|
674
|
+
# Simple variable name extraction (handles basic formulas, complex ones handled by patsy)
|
|
675
|
+
import re
|
|
676
|
+
covar_cols = re.findall(r'\b[a-zA-Z_]\w*\b', covar_part)
|
|
677
|
+
relevant_cols = [treat_col] + [col for col in covar_cols if col in data.columns]
|
|
678
|
+
|
|
679
|
+
# Check for missing values
|
|
680
|
+
n_missing = data[relevant_cols].isna().any(axis=1).sum()
|
|
681
|
+
na_action_info = None
|
|
682
|
+
if n_missing > 0:
|
|
683
|
+
if na_action == 'fail':
|
|
684
|
+
raise ValueError(
|
|
685
|
+
f"npCBPS: Missing values detected in {n_missing} observations. "
|
|
686
|
+
f"Set na_action='warn' to remove them, or handle missing values before calling npCBPS()."
|
|
687
|
+
)
|
|
688
|
+
elif na_action == 'warn':
|
|
689
|
+
import warnings
|
|
690
|
+
data_clean = data.dropna(subset=relevant_cols)
|
|
691
|
+
n_dropped = len(data) - len(data_clean)
|
|
692
|
+
warnings.warn(
|
|
693
|
+
f"npCBPS: Removed {n_dropped} observations with missing values. "
|
|
694
|
+
f"Remaining sample size: {len(data_clean)}.",
|
|
695
|
+
UserWarning
|
|
696
|
+
)
|
|
697
|
+
data = data_clean
|
|
698
|
+
na_action_info = {'method': 'warn', 'n_dropped': n_dropped}
|
|
699
|
+
elif na_action == 'ignore':
|
|
700
|
+
# Ignore mode: silently drop missing values
|
|
701
|
+
data_clean = data.dropna(subset=relevant_cols)
|
|
702
|
+
n_dropped = len(data) - len(data_clean)
|
|
703
|
+
data = data_clean
|
|
704
|
+
na_action_info = {'method': 'ignore', 'n_dropped': n_dropped}
|
|
705
|
+
|
|
706
|
+
# Sample-size adaptive corprior default value
|
|
707
|
+
# Paper recommendation: ρ = 0.1/N (Fong et al. 2018, Section 3.3.4)
|
|
708
|
+
n_obs = len(data)
|
|
709
|
+
if corprior is None:
|
|
710
|
+
corprior = 0.1 / n_obs
|
|
711
|
+
if print_level > 0:
|
|
712
|
+
print(f"npCBPS: Using paper-recommended corprior = 0.1/n = {corprior:.6f}")
|
|
713
|
+
|
|
714
|
+
# Input validation
|
|
715
|
+
# Validate corprior range (based on paper recommendation and experience)
|
|
716
|
+
# Allow corprior=0 but issue a warning
|
|
717
|
+
if not (0.0 <= corprior <= 10.0):
|
|
718
|
+
raise ValueError(
|
|
719
|
+
f"corprior={corprior} is outside the valid range [0.0, 10.0]. "
|
|
720
|
+
f"The paper recommends corprior ≈ 0.1/n (for this dataset: {0.1/n_obs:.6f}). "
|
|
721
|
+
f"Values >>1 often lead to NaN weights."
|
|
722
|
+
)
|
|
723
|
+
|
|
724
|
+
# Special warning for corprior=0
|
|
725
|
+
if corprior == 0.0:
|
|
726
|
+
import warnings
|
|
727
|
+
warnings.warn(
|
|
728
|
+
"corprior=0 removes all correlation prior penalty, which may lead to "
|
|
729
|
+
"unstable or extreme weights in small samples. "
|
|
730
|
+
"The paper recommends corprior ≈ 0.1/n for most applications. "
|
|
731
|
+
"Use corprior=0 only for specific purposes like sensitivity analysis.",
|
|
732
|
+
UserWarning
|
|
733
|
+
)
|
|
734
|
+
|
|
735
|
+
# Validate sample size (avoid numerical instability with small samples)
|
|
736
|
+
if n_obs < 30:
|
|
737
|
+
import warnings
|
|
738
|
+
warnings.warn(
|
|
739
|
+
f"Small sample size (n={n_obs}). npCBPS may be unstable with n<30. "
|
|
740
|
+
f"Consider using standard CBPS for small samples.",
|
|
741
|
+
UserWarning
|
|
742
|
+
)
|
|
743
|
+
|
|
744
|
+
# Formula parsing
|
|
745
|
+
# parse_formula returns (y, X), where y is the treatment vector and X is the covariate matrix (with intercept)
|
|
746
|
+
# preserve_categorical=True to maintain factor semantics
|
|
747
|
+
# Also extract terms object for model diagnostics
|
|
748
|
+
from patsy import dmatrices
|
|
749
|
+
|
|
750
|
+
# Save original data for metadata
|
|
751
|
+
data_original = data.copy()
|
|
752
|
+
|
|
753
|
+
# Use dmatrices to parse and obtain terms information
|
|
754
|
+
_, X_design = dmatrices(formula, data, return_type='dataframe')
|
|
755
|
+
terms_obj = X_design.design_info # patsy DesignInfo object
|
|
756
|
+
|
|
757
|
+
treat, X_mat = parse_formula(formula, data, preserve_categorical=True)
|
|
758
|
+
|
|
759
|
+
# Remove zero-variance columns
|
|
760
|
+
non_zero_var_cols = X_mat.std(axis=0) > 0
|
|
761
|
+
X_mat = X_mat[:, non_zero_var_cols]
|
|
762
|
+
|
|
763
|
+
# Call core fitting function
|
|
764
|
+
fit_result = npCBPS_fit(
|
|
765
|
+
treat=treat,
|
|
766
|
+
X=X_mat,
|
|
767
|
+
corprior=corprior,
|
|
768
|
+
print_level=print_level
|
|
769
|
+
)
|
|
770
|
+
|
|
771
|
+
# Append metadata
|
|
772
|
+
fit_result.call = f"npCBPS(formula={formula}, data=..., corprior={corprior})"
|
|
773
|
+
fit_result.formula = formula
|
|
774
|
+
fit_result.data = data_original # Save original data
|
|
775
|
+
|
|
776
|
+
# Add terms and na_action for model diagnostics
|
|
777
|
+
fit_result.terms = terms_obj
|
|
778
|
+
fit_result.na_action = na_action_info
|
|
779
|
+
|
|
780
|
+
return fit_result
|
|
781
|
+
|
|
782
|
+
|
|
783
|
+
def npCBPS_fit(
    treat: Union[np.ndarray, pd.Series],
    X: np.ndarray,
    corprior: float,
    print_level: int
) -> NPCBPSResults:
    """
    Core fitting procedure for nonparametric CBPS.

    This is the internal implementation called by :func:`npCBPS` after
    formula parsing and data preprocessing. It performs the empirical
    likelihood optimization to estimate covariate balancing weights.

    Parameters
    ----------
    treat : np.ndarray or pd.Series of shape (n,)
        Treatment variable. If a pandas Categorical Series, it is treated
        as a factor with J levels. Otherwise, it is treated as continuous
        (this includes numeric 0/1 treatments).
    X : np.ndarray of shape (n, K)
        Covariate matrix with zero-variance columns removed.
    corprior : float
        Prior standard deviation :math:`\\sigma` for the allowed weighted
        correlation (see Section 3.3.4 of Fong et al., 2018).
    print_level : int
        Verbosity level for diagnostic output. Values greater than 0 print
        progress and diagnostic information to stdout.

    Returns
    -------
    NPCBPSResults
        Fitted result object with ``par``, ``log_p_eta``, ``log_el``,
        ``eta``, ``sumw0``, ``weights``, ``y``, ``x``, ``converged`` and
        ``iterations`` populated. Note that ``formula``, ``data``, ``call``,
        ``terms``, and ``na_action`` attributes are populated by the caller.

    Raises
    ------
    RuntimeError
        If the optimal weights (or their sum) contain NaN.

    Notes
    -----
    **Treatment types:**

    - *Continuous*: Constraint matrix has K correlation constraints.
    - *Factor (J levels)*: Constraint matrix has K*(J-1) correlation
      constraints built from a normalized one-hot encoding.

    **Implementation details:**

    - Covariates are whitened using :func:`cholesky_whitening`.
    - The continuous treatment is standardized to zero mean and unit
      variance (sample standard deviation, ``ddof=1``).
    - Whitened covariates are re-oriented to correlate non-negatively with
      a continuous treatment. A covariate whose sample correlation is
      exactly zero keeps its orientation instead of being multiplied by
      ``sign(0) == 0``, which would erase the column and turn its initial
      correlation into NaN.
    - The line search is bounded to :math:`\\alpha \\in [0, 1]`.
    - The returned weights are normalized to sum to ``n``.

    See Also
    --------
    npCBPS : High-level interface with formula parsing.
    """
    # Initialization
    # A pandas Categorical treatment selects the factor path; everything
    # else (including numeric 0/1) goes through the continuous path.
    is_factor_treat = isinstance(treat, pd.Series) and isinstance(treat.dtype, pd.CategoricalDtype)

    if is_factor_treat:
        # Work on the integer category codes as floats.
        D = treat.cat.codes.values.astype(np.float64)
    else:
        # Copy so that standardization below never touches the caller's data.
        D = treat.copy() if isinstance(treat, np.ndarray) else treat.values.copy()

    # Keep the untransformed covariates for the result object.
    orig_X = X.copy()

    # Preprocessing: Cholesky whitening
    X = cholesky_whitening(X, verify=True)

    n = X.shape[0]

    # Numerical tolerance passed through to the empirical likelihood routines
    eps = 1.0 / n

    # Construct constraint matrix z
    if not is_factor_treat:
        # Continuous treatment path
        if print_level > 0:
            print("Estimating npCBPS as a continuous treatment.")

        # Re-orient each whitened covariate to correlate positively with D.
        correlations = np.array([np.corrcoef(X[:, j], D)[0, 1] for j in range(X.shape[1])])
        signs = np.sign(correlations)
        # np.sign(0.0) == 0.0 would zero out the whole column (and make the
        # later corrcoef on it NaN); leave such columns in their orientation.
        signs[signs == 0] = 1.0
        X = X * signs

        # Standardize treatment
        D = (D - D.mean()) / D.std(ddof=1)

        # Constraint matrix: [X * D | X | D]
        D_col = D[:, None]
        z = np.column_stack([X * D_col, X, D_col])

        # Only the first K columns (X * D) are correlation constraints.
        ncon_cor = X.shape[1]

        # Initial eta: correlation between each oriented covariate and the
        # standardized treatment.
        eta_init = np.array([np.corrcoef(X[:, j], D)[0, 1] for j in range(X.shape[1])])

    else:
        # Factor treatment path
        if print_level > 0:
            print("Estimating npCBPS as a factor treatment.")

        dimX = X.shape[1]

        # One-hot encode the observed levels, one column per level.
        unique_levels = np.unique(D)
        Td = (D[:, None] == unique_levels[None, :]).astype(float)

        # Scale each indicator column by the reciprocal of its level count.
        Td = Td / Td.sum(axis=0)

        # Contrast every level against the last one, then drop the last column.
        Td = Td[:, :-1] - Td[:, -1:]

        # Center and scale
        Td = (Td - Td.mean(axis=0)) / Td.std(axis=0, ddof=1)

        n_contrasts = Td.shape[1]

        # Row-wise Kronecker product kron(Td[i], X[i]) for all rows at once:
        # column index is contrast * dimX + covariate.
        z = (Td[:, :, None] * X[:, None, :]).reshape(n, n_contrasts * dimX)

        # Initial eta: correlation of every contrast with every covariate,
        # flattened in the same (contrast-major) order as the columns of z.
        eta_init = np.array([
            np.corrcoef(Td[:, i], X[:, j])[0, 1]
            for i in range(n_contrasts)
            for j in range(dimX)
        ])

        # Record the number of correlation constraints, then append the
        # mean constraints on X.
        ncon_cor = z.shape[1]
        z = np.column_stack([z, X])

    # Optimization preparation
    # eta_prior_sd = corprior (standard deviation, not variance)
    eta_prior_sd = np.full(ncon_cor, corprior)

    # The scalar search variable scales the initial correlations.
    eta_to_be_scaled = eta_init

    # Main optimization: line search over alpha in [0, 1]
    # (Fong et al. 2018, Equation 10)
    def neg_log_post(par_scalar):
        # minimize_scalar minimizes, so negate the log posterior.
        # NOTE(review): 0.001 is presumably the sum-of-weights tolerance
        # expected by log_post -- confirm against its signature.
        return -log_post(par_scalar, eta_to_be_scaled, eta_prior_sd, z, eps, 0.001, ncon_cor, n)

    result = scipy.optimize.minimize_scalar(
        neg_log_post,
        bounds=(0, 1),
        method='bounded',
        options={'xatol': 1e-10, 'maxiter': 2000}
    )
    par_opt = result.x

    # Report non-convergence only in verbose mode; the flag is always stored
    # on the result object below.
    if not result.success and print_level > 0:
        print(f"Warning: optimization may not have converged: {result.message}")

    # Compute optimal eta
    eta_opt = par_opt * eta_to_be_scaled

    # Compute optimal weights
    # NOTE(review): 0.05 is presumably a looser sum-of-weights tolerance than
    # the 0.001 used during the search -- confirm against get_w.
    el_out_opt = get_w(eta_opt, z, 0.05, eps, ncon_cor, n)
    w_opt = el_out_opt['w']
    sumw0 = el_out_opt['sumw']

    # Weight normalization (Fong et al. 2018, Equation 8): sum(w) = n
    w = w_opt * n / sumw0

    # Check for NaN weights and raise meaningful error
    if np.isnan(w).any() or np.isnan(sumw0):
        raise RuntimeError(
            f"npCBPS optimization failed and produced NaN weights. "
            f"This usually indicates:\n"
            f"  1. corprior is too large (current: {corprior}, try < 1.0)\n"
            f"  2. corprior is too small (current: {corprior}, try > 0.0001)\n"
            f"  3. Sample size is too small (current: n={n}, recommend n>=30)\n"
            f"  4. Covariate-treatment correlation is extreme\n"
            f"Suggestion: Try adjusting corprior or using standard CBPS instead."
        )

    # Log density of independent zero-mean normal priors on eta
    log_p_eta_opt = np.sum(
        -0.5 * np.log(2 * np.pi * eta_prior_sd**2)
        - eta_opt**2 / (2 * eta_prior_sd**2)
    )
    log_el_opt = el_out_opt['log_el']

    # Construct result object
    result_obj = NPCBPSResults()
    result_obj.par = par_opt
    result_obj.log_p_eta = log_p_eta_opt
    result_obj.log_el = log_el_opt
    result_obj.eta = eta_opt
    result_obj.sumw0 = sumw0
    result_obj.weights = w
    result_obj.y = treat  # Original treatment variable
    result_obj.x = orig_X

    # Convergence diagnostic fields
    result_obj.converged = result.success
    result_obj.iterations = getattr(result, 'nit', None)

    # Diagnostic output (optional)
    if print_level > 0:
        print(f"par: {par_opt:.6f}")
        print(f"log_post: {-(log_el_opt + log_p_eta_opt):.6f}")
        print(f"log_el: {log_el_opt:.6f}")
        print(f"log_p_eta: {log_p_eta_opt:.6f}")
        print(f"sumw0: {sumw0:.6f}")
        print(f"converged: {result_obj.converged}")
        if result_obj.iterations is not None:
            print(f"iterations: {result_obj.iterations}")

    return result_obj
|