cbps 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. cbps/__init__.py +3462 -0
  2. cbps/constants.py +46 -0
  3. cbps/core/__init__.py +93 -0
  4. cbps/core/cbps_binary.py +1943 -0
  5. cbps/core/cbps_continuous.py +945 -0
  6. cbps/core/cbps_multitreat.py +1123 -0
  7. cbps/core/cbps_optimal.py +507 -0
  8. cbps/core/results.py +1447 -0
  9. cbps/data/Blackwell.csv +571 -0
  10. cbps/data/LaLonde.csv +3213 -0
  11. cbps/data/npcbps_continuous_sim.csv +501 -0
  12. cbps/data/nsw.csv +723 -0
  13. cbps/data/nsw_dw.csv +446 -0
  14. cbps/data/political_ads_urban_niebler.csv +16266 -0
  15. cbps/data/psid_controls.csv +2491 -0
  16. cbps/data/psid_controls2.csv +254 -0
  17. cbps/data/psid_controls3.csv +129 -0
  18. cbps/data/simulation_dgp1_seed12345.csv +201 -0
  19. cbps/data/simulation_dgp2_seed12345.csv +201 -0
  20. cbps/data/simulation_dgp3_seed12345.csv +201 -0
  21. cbps/data/simulation_dgp4_seed12345.csv +201 -0
  22. cbps/datasets/__init__.py +78 -0
  23. cbps/datasets/blackwell.py +112 -0
  24. cbps/datasets/continuous.py +223 -0
  25. cbps/datasets/lalonde.py +272 -0
  26. cbps/datasets/npcbps_sim.py +101 -0
  27. cbps/diagnostics/__init__.py +101 -0
  28. cbps/diagnostics/balance.py +760 -0
  29. cbps/diagnostics/balance_cbmsm_addon.py +162 -0
  30. cbps/diagnostics/continuous_diagnostics.py +259 -0
  31. cbps/diagnostics/normality.py +173 -0
  32. cbps/diagnostics/ocbps_conditions.py +197 -0
  33. cbps/diagnostics/overlap.py +198 -0
  34. cbps/diagnostics/plots.py +1193 -0
  35. cbps/diagnostics/weights_diag.py +205 -0
  36. cbps/highdim/__init__.py +84 -0
  37. cbps/highdim/gmm_loss.py +340 -0
  38. cbps/highdim/hdcbps.py +1078 -0
  39. cbps/highdim/lasso_utils.py +498 -0
  40. cbps/highdim/weight_funcs.py +298 -0
  41. cbps/inference/__init__.py +42 -0
  42. cbps/inference/asyvar.py +621 -0
  43. cbps/inference/vcov_outcome.py +217 -0
  44. cbps/iv/__init__.py +48 -0
  45. cbps/iv/cbiv.py +2603 -0
  46. cbps/logging_config.py +45 -0
  47. cbps/msm/__init__.py +45 -0
  48. cbps/msm/cbmsm.py +1871 -0
  49. cbps/msm/rank_diagnostics.py +112 -0
  50. cbps/nonparametric/__init__.py +58 -0
  51. cbps/nonparametric/cholesky_whitening.py +232 -0
  52. cbps/nonparametric/empirical_likelihood.py +339 -0
  53. cbps/nonparametric/npcbps.py +1036 -0
  54. cbps/nonparametric/taylor_approx.py +207 -0
  55. cbps/py.typed +0 -0
  56. cbps/sklearn/__init__.py +42 -0
  57. cbps/sklearn/estimator.py +378 -0
  58. cbps/utils/__init__.py +82 -0
  59. cbps/utils/formula.py +415 -0
  60. cbps/utils/helpers.py +378 -0
  61. cbps/utils/numerics.py +438 -0
  62. cbps/utils/r_compat.py +109 -0
  63. cbps/utils/validation.py +224 -0
  64. cbps/utils/variance_transform.py +483 -0
  65. cbps/utils/weights.py +586 -0
  66. cbps-0.2.0.dist-info/METADATA +1090 -0
  67. cbps-0.2.0.dist-info/RECORD +70 -0
  68. cbps-0.2.0.dist-info/WHEEL +5 -0
  69. cbps-0.2.0.dist-info/licenses/LICENSE +661 -0
  70. cbps-0.2.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,339 @@
1
+ """
2
+ Empirical Likelihood Optimization for Nonparametric CBPS.
3
+
4
+ This module implements the dual optimization approach for empirical
5
+ likelihood estimation described in Section 3.3.2 of Fong, Hazlett,
6
+ and Imai (2018).
7
+
8
+ The key insight is dimension reduction: instead of optimizing over
9
+ n weights directly, we optimize over (2K+1) Lagrange multipliers
10
+ :math:`\\gamma`, then recover weights via:
11
+
12
+ .. math::
13
+
14
+ w_i = \\frac{1}{1 - \\gamma^T g(X_i^*, T_i^*)}
15
+
16
+ Key Functions
17
+ -------------
18
+ - :func:`log_elgiven_eta`: Objective function for :math:`\\gamma`
19
+ optimization given the weighted correlation :math:`\\eta`.
20
+ - :func:`get_w`: Recover weights and check convergence.
21
+ - :func:`log_post`: Penalized likelihood for the outer :math:`\\alpha`
22
+ line search.
23
+
24
+ Mathematical Background
25
+ -----------------------
26
+ The Lagrangian for the constrained likelihood maximization (Section 3.3.2)
27
+ leads to the dual problem:
28
+
29
+ .. math::
30
+
31
+ \\underset{\\gamma}{\\text{argmax}} \\sum_{i=1}^n
32
+ \\log(1 - \\gamma^T(g_i - \\eta))
33
+
34
+ where :math:`g_i = (X_i^* T_i^*, X_i^*, T_i^*)^T` is the constraint vector
35
+ and :math:`\\eta` is the allowed finite-sample imbalance.
36
+
37
+ Note: The ordering of components in :math:`g_i` follows the implementation
38
+ rather than the paper's notation :math:`(X_i^*, T_i^*, X_i^* T_i^*)^T`.
39
+
40
+ References
41
+ ----------
42
+ Fong, C., Hazlett, C., and Imai, K. (2018). Covariate balancing propensity
43
+ score for a continuous treatment: Application to the efficacy of political
44
+ advertisements. The Annals of Applied Statistics, 12(1), 156-177.
45
+ https://doi.org/10.1214/17-AOAS1101
46
+
47
+ Owen, A.B. (2001). Empirical Likelihood. Chapman & Hall/CRC.
48
+ """
49
+
50
+ from typing import Dict, Union
51
+ import numpy as np
52
+ import scipy.optimize
53
+
54
+ from .taylor_approx import llog
55
+
56
+
57
def log_elgiven_eta(
    gamma: np.ndarray,
    eta: np.ndarray,
    z: np.ndarray,
    eps: float,
    ncon_cor: int,
    n: int
) -> float:
    """
    Evaluate the negated dual empirical-likelihood objective at ``gamma``.

    ``eta`` is zero-padded up to the number of columns of ``z`` and the
    function returns

    .. math::

        -\\sum_{i=1}^n \\text{llog}(n + \\gamma^T(\\eta - z_i),\\ \\text{eps})

    where ``llog`` is the helper imported from ``.taylor_approx``.
    The value is meant to be minimized over ``gamma`` (see :func:`get_w`).

    Parameters
    ----------
    gamma : np.ndarray of shape (ncon,)
        Lagrange multiplier vector at which the objective is evaluated.
    eta : np.ndarray of shape (ncon_cor,)
        Allowed imbalance for the first ``ncon_cor`` constraints; the
        remaining ``ncon - ncon_cor`` constraints get an implicit zero.
    z : np.ndarray of shape (n, ncon)
        Constraint matrix, one row per observation.
    eps : float
        Second argument forwarded unchanged to ``llog``.
        NOTE(review): presumably the Taylor-approximation threshold
        (typically 1/n) -- confirm against ``taylor_approx.llog``.
    ncon_cor : int
        Number of leading constraints that ``eta`` applies to.
    n : int
        Sample size; used both as the additive constant inside ``llog``
        and as the number of columns when replicating ``eta``.

    Returns
    -------
    float
        Negative sum of ``llog`` over all observations.
    """
    # Zero-pad eta so that it lines up with every column of z.
    zero_tail = np.zeros(z.shape[1] - ncon_cor)
    eta_full = np.concatenate((eta, zero_tail))

    # Replicate the padded eta across n columns -> (ncon, n) matrix.
    eta_cols = eta_full[:, np.newaxis] @ np.ones((1, n))

    # Per-observation gap between allowed imbalance and the constraints.
    shifted = eta_cols - z.T

    # One llog argument per observation: n + gamma' (eta - z_i).
    llog_input = n + gamma.T @ shifted

    return -np.sum(llog(llog_input, eps))
134
+
135
+
136
def get_w(
    eta: np.ndarray,
    z: np.ndarray,
    sumw_tol: float,
    eps: float,
    ncon_cor: int,
    n: int
) -> Dict[str, Union[np.ndarray, float]]:
    """
    Solve the inner dual problem for a fixed ``eta`` and recover weights.

    Starting from ``gamma = 0``, :func:`log_elgiven_eta` is minimized with
    SciPy's BFGS routine.  The weights are then the reciprocals of
    ``n + gamma_hat^T (eta - z_i)``, with ``eta`` zero-padded to the number
    of columns of ``z``.  The optimizer's success flag is not inspected;
    instead the sum of the recovered weights is compared against 1.

    Parameters
    ----------
    eta : np.ndarray of shape (ncon_cor,)
        Allowed imbalance for the first ``ncon_cor`` constraints.
    z : np.ndarray of shape (n, ncon)
        Constraint matrix, one row per observation.
    sumw_tol : float
        Largest accepted value of ``|1 - sum(w)|``.  Beyond it a penalty
        is subtracted from the returned ``log_el``.
    eps : float
        Forwarded to :func:`log_elgiven_eta`.
    ncon_cor : int
        Number of leading constraints that ``eta`` applies to.
    n : int
        Sample size.

    Returns
    -------
    dict
        Dictionary with keys:

        - **w** : np.ndarray
            Reciprocal weights exactly as recovered (not rescaled).
        - **sumw** : float
            Sum of ``w``.
        - **log_el** : float
            ``-sum(log(w / sumw))``; when ``|1 - sumw|`` is not within
            ``sumw_tol`` the amount ``10**4 * (1 + |1 - sumw|)`` is
            subtracted from it.
        - **el_gamma** : np.ndarray
            First ``ncon_cor`` entries of the optimized ``gamma``.
    """
    n_constraints = z.shape[1]

    # Inner optimization over gamma, started at the origin.
    fit = scipy.optimize.minimize(
        log_elgiven_eta,
        np.zeros(n_constraints),
        args=(eta, z, eps, ncon_cor, n),
        method='BFGS'
    )
    gamma_hat = fit.x

    # Rebuild the per-observation denominators at the optimum.
    eta_full = np.concatenate((eta, np.zeros(n_constraints - ncon_cor)))
    eta_cols = eta_full[:, np.newaxis] @ np.ones((1, n))
    denom = n + gamma_hat.T @ (eta_cols - z.T)

    raw_w = 1 / denom.flatten()
    total_w = raw_w.sum()

    # The weight sum should land near 1 when the dual solve succeeded.
    within_tol = abs(1 - total_w) <= sumw_tol

    log_el = -np.sum(np.log(raw_w / total_w))
    if not within_tol:
        # Penalize eta values for which the weights could not be recovered
        # cleanly, steering the outer search away from them.
        log_el = log_el - 10**4 * (1 + abs(1 - total_w))

    return {
        'w': raw_w,
        'sumw': total_w,
        'log_el': log_el,
        'el_gamma': gamma_hat[:ncon_cor]
    }
248
+
249
+
250
def log_post(
    par: float,
    eta_to_be_scaled: np.ndarray,
    eta_prior_sd: np.ndarray,
    z: np.ndarray,
    eps: float,
    sumw_tol: float,
    ncon_cor: int,
    n: int
) -> float:
    """
    Penalized objective used by the outer search over the scale ``par``.

    The candidate imbalance is ``eta = par * eta_to_be_scaled``.  The
    returned value is the ``log_el`` entry produced by :func:`get_w` for
    that ``eta`` plus the log density of ``eta`` under independent
    zero-mean normal priors with standard deviation ``eta_prior_sd``:

    .. math::

        \\sum_k \\left[ -\\tfrac{1}{2}\\log(2\\pi\\sigma_k^2)
        - \\frac{\\eta_k^2}{2\\sigma_k^2} \\right]

    Parameters
    ----------
    par : float
        Scalar multiplier applied to ``eta_to_be_scaled``.
        NOTE(review): callers presumably restrict it to [0, 1]; no range
        check is performed here.
    eta_to_be_scaled : np.ndarray of shape (ncon_cor,)
        Base imbalance vector that ``par`` rescales.
    eta_prior_sd : np.ndarray of shape (ncon_cor,)
        Prior standard deviation(s) for ``eta``.
    z : np.ndarray of shape (n, ncon)
        Constraint matrix, forwarded to :func:`get_w`.
    eps : float
        Forwarded to :func:`get_w`.
    sumw_tol : float
        Weight-sum tolerance, forwarded to :func:`get_w`.
    ncon_cor : int
        Number of correlation constraints, forwarded to :func:`get_w`.
    n : int
        Sample size, forwarded to :func:`get_w`.

    Returns
    -------
    float
        ``log_el`` from :func:`get_w` plus the prior log density; intended
        to be maximized by the caller.
    """
    scaled_eta = par * eta_to_be_scaled

    # Gaussian log density of the scaled eta, summed over components.
    prior_var = eta_prior_sd**2
    log_prior = np.sum(
        -0.5 * np.log(2 * np.pi * prior_var)
        - scaled_eta**2 / (2 * prior_var)
    )

    # Empirical-likelihood term (already penalized by get_w if needed).
    el_fit = get_w(scaled_eta, z, sumw_tol, eps, ncon_cor, n)

    # Relative weight of the prior term; fixed at one.
    prior_weight = 1
    return el_fit['log_el'] + prior_weight * log_prior