cbps 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. cbps/__init__.py +3462 -0
  2. cbps/constants.py +46 -0
  3. cbps/core/__init__.py +93 -0
  4. cbps/core/cbps_binary.py +1943 -0
  5. cbps/core/cbps_continuous.py +945 -0
  6. cbps/core/cbps_multitreat.py +1123 -0
  7. cbps/core/cbps_optimal.py +507 -0
  8. cbps/core/results.py +1447 -0
  9. cbps/data/Blackwell.csv +571 -0
  10. cbps/data/LaLonde.csv +3213 -0
  11. cbps/data/npcbps_continuous_sim.csv +501 -0
  12. cbps/data/nsw.csv +723 -0
  13. cbps/data/nsw_dw.csv +446 -0
  14. cbps/data/political_ads_urban_niebler.csv +16266 -0
  15. cbps/data/psid_controls.csv +2491 -0
  16. cbps/data/psid_controls2.csv +254 -0
  17. cbps/data/psid_controls3.csv +129 -0
  18. cbps/data/simulation_dgp1_seed12345.csv +201 -0
  19. cbps/data/simulation_dgp2_seed12345.csv +201 -0
  20. cbps/data/simulation_dgp3_seed12345.csv +201 -0
  21. cbps/data/simulation_dgp4_seed12345.csv +201 -0
  22. cbps/datasets/__init__.py +78 -0
  23. cbps/datasets/blackwell.py +112 -0
  24. cbps/datasets/continuous.py +223 -0
  25. cbps/datasets/lalonde.py +272 -0
  26. cbps/datasets/npcbps_sim.py +101 -0
  27. cbps/diagnostics/__init__.py +101 -0
  28. cbps/diagnostics/balance.py +760 -0
  29. cbps/diagnostics/balance_cbmsm_addon.py +162 -0
  30. cbps/diagnostics/continuous_diagnostics.py +259 -0
  31. cbps/diagnostics/normality.py +173 -0
  32. cbps/diagnostics/ocbps_conditions.py +197 -0
  33. cbps/diagnostics/overlap.py +198 -0
  34. cbps/diagnostics/plots.py +1193 -0
  35. cbps/diagnostics/weights_diag.py +205 -0
  36. cbps/highdim/__init__.py +84 -0
  37. cbps/highdim/gmm_loss.py +340 -0
  38. cbps/highdim/hdcbps.py +1078 -0
  39. cbps/highdim/lasso_utils.py +498 -0
  40. cbps/highdim/weight_funcs.py +298 -0
  41. cbps/inference/__init__.py +42 -0
  42. cbps/inference/asyvar.py +621 -0
  43. cbps/inference/vcov_outcome.py +217 -0
  44. cbps/iv/__init__.py +48 -0
  45. cbps/iv/cbiv.py +2603 -0
  46. cbps/logging_config.py +45 -0
  47. cbps/msm/__init__.py +45 -0
  48. cbps/msm/cbmsm.py +1871 -0
  49. cbps/msm/rank_diagnostics.py +112 -0
  50. cbps/nonparametric/__init__.py +58 -0
  51. cbps/nonparametric/cholesky_whitening.py +232 -0
  52. cbps/nonparametric/empirical_likelihood.py +339 -0
  53. cbps/nonparametric/npcbps.py +1036 -0
  54. cbps/nonparametric/taylor_approx.py +207 -0
  55. cbps/py.typed +0 -0
  56. cbps/sklearn/__init__.py +42 -0
  57. cbps/sklearn/estimator.py +378 -0
  58. cbps/utils/__init__.py +82 -0
  59. cbps/utils/formula.py +415 -0
  60. cbps/utils/helpers.py +378 -0
  61. cbps/utils/numerics.py +438 -0
  62. cbps/utils/r_compat.py +109 -0
  63. cbps/utils/validation.py +224 -0
  64. cbps/utils/variance_transform.py +483 -0
  65. cbps/utils/weights.py +586 -0
  66. cbps-0.2.0.dist-info/METADATA +1090 -0
  67. cbps-0.2.0.dist-info/RECORD +70 -0
  68. cbps-0.2.0.dist-info/WHEEL +5 -0
  69. cbps-0.2.0.dist-info/licenses/LICENSE +661 -0
  70. cbps-0.2.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1036 @@
1
+ """
2
+ Nonparametric Covariate Balancing Propensity Score (npCBPS).
3
+
4
+ This module implements the nonparametric covariate balancing generalized
5
+ propensity score (npCBGPS) estimator from Section 3.3 of Fong, Hazlett,
6
+ and Imai (2018). Unlike parametric CBPS, this approach does not specify
7
+ a functional form for the propensity score. Instead, it directly estimates
8
+ inverse probability weights by maximizing the empirical likelihood subject
9
+ to covariate balance constraints.
10
+
11
+ Key Features
12
+ ------------
13
+ - **Model-free**: No parametric assumptions about treatment assignment.
14
+ - **Empirical likelihood**: Weights chosen to maximize data likelihood.
15
+ - **Penalized imbalance**: Allows controlled finite-sample imbalance
16
+ via the ``corprior`` parameter.
17
+
18
+ Main API
19
+ --------
20
+ - :func:`npCBPS`: High-level function accepting formula and DataFrame.
21
+ - :class:`NPCBPSResults`: Container for estimated weights and diagnostics.
22
+
23
+ Algorithm Overview
24
+ ------------------
25
+ 1. Whiten covariates: :math:`X^* = S_X^{-1/2}(X - \\bar{X})`.
26
+ 2. Standardize treatment: :math:`T^* = (T - \\bar{T})/s_T`.
27
+ 3. Construct constraint matrix: :math:`g_i = (X_i^* T_i^*, X_i^*, T_i^*)^T`.
28
+ 4. Line search over :math:`\\alpha \\in [0, 1]` to maximize penalized
29
+ likelihood (Equation 10).
30
+ 5. Recover weights: :math:`w_i = 1/(1 - \\gamma^T(g_i - \\eta))`.
31
+
32
+ References
33
+ ----------
34
+ Fong, C., Hazlett, C., and Imai, K. (2018). Covariate balancing propensity
35
+ score for a continuous treatment: Application to the efficacy of political
36
+ advertisements. The Annals of Applied Statistics, 12(1), 156-177.
37
+ https://doi.org/10.1214/17-AOAS1101
38
+ """
39
+
40
+ from typing import Optional, Any, Union
41
+ import numpy as np
42
+ import pandas as pd
43
+ import scipy.optimize
44
+
45
+ from ..utils.formula import parse_formula
46
+ from .cholesky_whitening import cholesky_whitening
47
+ from .empirical_likelihood import get_w, log_post
48
+
49
+
50
class NPCBPSSummary:
    """Summary object for NPCBPSResults.

    Returned by :meth:`NPCBPSResults.summary`. Provides a structured
    representation of npCBPS estimation results that can be printed
    via ``print()`` or ``str()``.

    Attributes
    ----------
    call : str or None
        String representation of the function call.
    n : int
        Total sample size (0 when no treatment vector was supplied).
    binary_treat : bool
        ``True`` when every treatment value is exactly 0 or 1. Group
        counts are only meaningful (and only printed) in that case.
    n_treat : int
        Number of treated units. 0 when the treatment is not binary.
    n_control : int
        Number of control units. 0 when the treatment is not binary.
    converged : bool or None
        Whether the optimization converged.
    iterations : int or None
        Number of iterations used.
    sumw0 : float or None
        Sum of unnormalized weights (should ≈ 1.0).
    log_el : float or None
        Log empirical likelihood at the optimum.
    log_p_eta : float or None
        Log prior density at the optimum.
    par : float or None
        Optimal scaling parameter alpha.
    eta : np.ndarray or None
        Weighted correlations.
    weights : np.ndarray or None
        Final normalized weights.
    """

    def __init__(
        self,
        call: Optional[str],
        y: Optional[np.ndarray],
        converged: Optional[bool],
        iterations: Optional[int],
        sumw0: Optional[float],
        par: Optional[float],
        log_el: Optional[float],
        log_p_eta: Optional[float],
        eta: Optional[np.ndarray],
        weights: Optional[np.ndarray],
    ):
        self.call = call
        self.converged = converged
        self.iterations = iterations
        self.sumw0 = sumw0
        self.par = par
        self.log_el = log_el
        self.log_p_eta = log_p_eta
        self.eta = eta
        self.weights = weights

        # Derived sample info. Treated/control counts are only defined for a
        # 0/1 treatment; summing a continuous treatment would report a
        # meaningless "group size" (or crash on NaN), so those stay at 0.
        self.n = 0
        self.n_treat = 0
        self.n_control = 0
        self.binary_treat = False
        if y is not None:
            self.n = len(y)
            treated = self._count_treated(y)
            if treated is not None:
                self.binary_treat = True
                self.n_treat = treated
                self.n_control = self.n - treated

    @staticmethod
    def _count_treated(y) -> Optional[int]:
        """Return the number of ones in ``y``, or ``None`` if ``y`` is not 0/1-coded."""
        try:
            values = np.asarray(y, dtype=float).ravel()
        except (TypeError, ValueError):
            # Non-numeric treatment (e.g. string labels): not a 0/1 coding.
            return None
        if values.size == 0:
            return None
        # NaN compares unequal to both 0 and 1, so it also disables the counts.
        if not np.all((values == 0.0) | (values == 1.0)):
            return None
        return int(values.sum())

    def _weights_array(self) -> Optional[np.ndarray]:
        """Return the weights as a float array, or ``None`` when absent or empty."""
        if self.weights is None:
            return None
        arr = np.asarray(self.weights, dtype=float)
        return arr if arr.size > 0 else None

    @staticmethod
    def _effective_sample_size(weights: np.ndarray) -> float:
        """Return Kish's effective sample size ``(sum w)^2 / sum(w^2)``."""
        sum_sq = float((weights ** 2).sum())
        if sum_sq == 0.0:
            # All-zero weights: the ratio is undefined.
            return float('nan')
        return float(weights.sum()) ** 2 / sum_sq

    def _diagnostic_messages(self, weights: Optional[np.ndarray], ess: Optional[float]) -> list:
        """Collect the warning lines shown in the final "Diagnostics" section."""
        messages = []

        if self.converged is False:
            messages.append("⚠ Optimization did not converge - results may be unreliable")

        if self.sumw0 is not None and abs(self.sumw0 - 1.0) > 0.10:
            messages.append("⚠ sumw0 deviates >10% from 1.0 - check optimization quality")

        if weights is not None:
            # A non-positive minimum makes the max/min ratio undefined; report
            # it as infinite so that it is always flagged.
            smallest = weights.min()
            weight_range = weights.max() / smallest if smallest > 0 else float('inf')
            if weight_range > 100:
                messages.append(
                    f"⚠ Large weight range ({weight_range:.1f}x) - may indicate overlap issues"
                )

            if self.n > 0 and ess is not None:
                efficiency = ess / self.n
                if efficiency < 0.5:
                    eff_pct = 100 * efficiency
                    messages.append(
                        f"⚠ Low weighting efficiency ({eff_pct:.1f}%) - consider different corprior"
                    )

        return messages

    def __str__(self) -> str:
        """Return formatted summary text.

        The layout follows the legacy ``NPCBPSResults.summary()`` string.
        The treated/control breakdown is printed only for a 0/1 treatment,
        because those counts are undefined for continuous treatments.
        """
        rule = "=" * 70
        subrule = "-" * 70
        weights = self._weights_array()
        ess = self._effective_sample_size(weights) if weights is not None else None

        lines = []
        lines.append("\n" + rule)
        lines.append("npCBPS: Nonparametric Covariate Balancing Propensity Score")
        lines.append(rule)
        lines.append("")

        # Call
        lines.append("Call:")
        lines.append(f" {self.call or 'npCBPS()'}")
        lines.append("")

        # Sample information
        if self.n > 0:
            lines.append(f"Sample size: {self.n}")
            if self.binary_treat:
                lines.append(f" Treatment group: {self.n_treat} ({100*self.n_treat/self.n:.1f}%)")
                lines.append(f" Control group: {self.n_control} ({100*self.n_control/self.n:.1f}%)")
            lines.append("")

        # Convergence diagnostics
        lines.append("Convergence Diagnostics:")
        lines.append(subrule)
        if self.converged is not None:
            conv_status = "✓ Yes" if self.converged else "✗ No"
            lines.append(f" Converged: {conv_status}")

        if self.iterations is not None:
            lines.append(f" Iterations used: {self.iterations}")

        if self.sumw0 is not None:
            # sumw0 should be approximately 1.0 (key diagnostic from Fong et al. 2018)
            deviation = abs(self.sumw0 - 1.0)
            if deviation < 0.01:
                status = "✓ Excellent (within 1%)"
            elif deviation < 0.05:
                status = "✓ Good (within 5%)"
            elif deviation < 0.10:
                status = "⚠ Acceptable (within 10%)"
            else:
                status = "✗ Warning (>10% deviation)"

            lines.append(f" Sum of weights (sumw0): {self.sumw0:.6f} {status}")
            lines.append(" Theoretical value: 1.0")
            lines.append(f" Deviation: {deviation:.4f}")

        lines.append("")

        # Optimization results
        lines.append("Optimization Results:")
        lines.append(subrule)

        if self.par is not None:
            lines.append(f" Optimization parameter (alpha): {self.par:.6f}")

        if self.log_el is not None:
            lines.append(f" Log Empirical Likelihood: {self.log_el:.6f}")

        if self.log_p_eta is not None:
            lines.append(f" Log Prior Density p(η): {self.log_p_eta:.6f}")

        if self.log_el is not None and self.log_p_eta is not None:
            total_obj = self.log_el + self.log_p_eta
            lines.append(f" Total objective: {total_obj:.6f}")

        lines.append("")

        # Weighted correlations (key statistics)
        if self.eta is not None:
            lines.append("Weighted Correlations (η):")
            lines.append(subrule)

            eta_array = np.atleast_1d(self.eta)
            if len(eta_array) == 1:
                lines.append(f" η = {eta_array[0]:.6f}")
            else:
                lines.append(f" Number of correlations: {len(eta_array)}")
                # min()/max() raise on an empty array, so skip the statistics.
                if len(eta_array) > 0:
                    lines.append(f" Mean: {eta_array.mean():.6f}")
                    lines.append(f" Range: [{eta_array.min():.6f}, {eta_array.max():.6f}]")
                    if len(eta_array) <= 10:
                        lines.append(" Values:")
                        for i, val in enumerate(eta_array):
                            lines.append(f" η[{i}] = {val:.6f}")

            lines.append("")

        # Weight statistics
        if weights is not None:
            lines.append("Weight Distribution:")
            lines.append(subrule)
            lines.append(f" Min: {weights.min():.6f}")
            lines.append(f" Q1: {np.percentile(weights, 25):.6f}")
            lines.append(f" Median: {np.median(weights):.6f}")
            lines.append(f" Mean: {weights.mean():.6f}")
            lines.append(f" Q3: {np.percentile(weights, 75):.6f}")
            lines.append(f" Max: {weights.max():.6f}")
            lines.append(f" Sum: {weights.sum():.6f}")

            # Effective sample size
            lines.append(f" Effective sample size: {ess:.1f}")
            if self.n > 0:
                efficiency = ess / self.n
                lines.append(f" Efficiency: {100*efficiency:.1f}%")

            lines.append("")

        # Diagnostic recommendations
        lines.append("Diagnostics:")
        lines.append(subrule)

        diagnostics = self._diagnostic_messages(weights, ess)
        if diagnostics:
            for diag in diagnostics:
                lines.append(f" {diag}")
        else:
            lines.append(" ✓ All diagnostics passed")

        lines.append("")
        lines.append(rule)

        return "\n".join(lines)

    def __repr__(self) -> str:
        return f"NPCBPSSummary(n={self.n}, converged={self.converged})"
270
+
271
+
272
class NPCBPSResults:
    """
    Result container returned by :func:`npCBPS`.

    Holds the estimated balancing weights together with the optimization
    diagnostics and the inputs used for fitting. npCBPS is nonparametric,
    so there are no propensity score model coefficients to store.

    Attributes
    ----------
    weights : np.ndarray of shape (n,)
        Final normalized weights summing to n. Use these for weighted
        outcome analysis.
    sumw0 : float
        Sum of the weights before normalization. Should be close to 1.0
        (within 5%); values far from 1 point to convergence problems.
    eta : np.ndarray of shape (K,)
        Optimal weighted correlation :math:`\\eta = \\alpha \\cdot \\eta_0`,
        where K is the number of covariates. Measures the covariate-treatment
        association that remains after weighting.
    par : float
        Optimal scaling parameter :math:`\\alpha \\in [0, 1]` from the line
        search. Near 0 means tight balance; near 1 means relaxed balance.
    log_el : float
        Log empirical likelihood at the optimum.
    log_p_eta : float
        Log prior density :math:`\\log f(\\eta)` at the optimum.
    y : np.ndarray of shape (n,)
        Treatment variable.
    x : np.ndarray of shape (n, K)
        Original covariate matrix (before whitening).
    converged : bool
        Whether the optimization converged successfully.
    iterations : int or None
        Number of iterations used in the optimization.
    formula : str
        Model formula used for fitting.
    data : pd.DataFrame
        Original input DataFrame.
    call : str
        String representation of the function call.
    terms : object
        patsy DesignInfo object for formula parsing (used by diagnostics).
    na_action : dict or None
        Information about missing value handling.

    See Also
    --------
    npCBPS : Function that creates this results object.
    CBPSResults : Results container for parametric CBPS.
    """

    def __init__(self):
        # Quantities specific to the empirical-likelihood fit
        self.par: Optional[float] = None
        self.log_p_eta: Optional[float] = None
        self.log_el: Optional[float] = None
        self.eta: Optional[np.ndarray] = None
        self.sumw0: Optional[float] = None

        # Fields shared with the parametric CBPS results
        self.weights: Optional[np.ndarray] = None
        self.y: Optional[np.ndarray] = None
        self.x: Optional[np.ndarray] = None

        # Optimizer status
        self.converged: Optional[bool] = None
        self.iterations: Optional[int] = None

        # Provenance of the fit
        self.call: Optional[str] = None
        self.formula: Optional[str] = None
        self.data: Optional[pd.DataFrame] = None

        # Extra metadata kept for predict() and model diagnostics
        self.terms: Optional[object] = None  # patsy DesignInfo object
        self.na_action: Optional[dict] = None  # Missing value handling information

    def __repr__(self) -> str:
        """Return a one-line description for interactive sessions."""
        sample_size = 0 if self.y is None else len(self.y)

        if self.converged:
            status = "Yes"
        elif self.converged is not None:
            status = "No"
        else:
            status = "Unknown"

        sumw0_text = "N/A" if self.sumw0 is None else f"{self.sumw0:.4f}"
        return f"NPCBPSResults(n={sample_size}, converged={status}, sumw0={sumw0_text})"

    def __str__(self) -> str:
        """Return the multi-line report shown by ``print()``."""
        parts = ["\nCall:\n " + (self.call or "npCBPS()") + "\n\n"]

        # Sample information
        if self.y is not None:
            parts.append(f"Sample size: {len(self.y)}\n")

        # Convergence information
        if self.converged is not None:
            parts.append(f"Converged: {'Yes' if self.converged else 'No'}\n")
        if self.iterations is not None:
            parts.append(f"Iterations: {self.iterations}\n")

        # Key statistics
        if self.sumw0 is not None:
            within_tolerance = abs(self.sumw0 - 1.0) < 0.05
            sumw0_status = "Good" if within_tolerance else "Check"
            parts.append(
                f"Sum of weights (sumw0): {self.sumw0:.6f} ({sumw0_status}: should ≈ 1.0 ± 5%)\n"
            )

        if self.log_el is not None:
            parts.append(f"Log Empirical Likelihood: {self.log_el:.6f}\n")

        if self.log_p_eta is not None:
            parts.append(f"Log Prior Density: {self.log_p_eta:.6f}\n")

        if self.par is not None:
            parts.append(f"Optimization parameter (alpha): {self.par:.6f}\n")

        # Weight information
        if self.weights is not None:
            w = self.weights
            parts.extend([
                "\nWeights:\n",
                f" Min: {w.min():.6f}\n",
                f" Max: {w.max():.6f}\n",
                f" Mean: {w.mean():.6f}\n",
                f" Sum: {w.sum():.6f}\n",
            ])

        return "".join(parts)

    def summary(self) -> 'NPCBPSSummary':
        """
        Build the detailed diagnostic summary of this fit.

        Returns
        -------
        NPCBPSSummary
            Summary object whose ``__str__`` renders the formatted report.
            Use ``print(result.summary())`` to display it.

        Notes
        -----
        The summary is more detailed than ``__str__()`` and covers:

        - Convergence diagnostics (whether sumw0 ≈ 1)
        - Optimization parameters
        - Empirical likelihood and priors
        - Weighted correlation eta (if multiple covariates)
        - Weight distribution statistics

        Examples
        --------
        >>> from cbps import npCBPS
        >>> fit = npCBPS('treat ~ x1 + x2', data=df)
        >>> print(fit.summary())

        References
        ----------
        .. [1] Fong, C., Hazlett, C., and Imai, K. (2018). Covariate balancing
           propensity score for a continuous treatment. The Annals of Applied
           Statistics, 12(1), 156-177. https://doi.org/10.1214/17-AOAS1101
        """
        summary_fields = dict(
            call=self.call,
            y=self.y,
            converged=self.converged,
            iterations=self.iterations,
            sumw0=self.sumw0,
            par=self.par,
            log_el=self.log_el,
            log_p_eta=self.log_p_eta,
            eta=self.eta,
            weights=self.weights,
        )
        return NPCBPSSummary(**summary_fields)

    def balance(self, **kwargs):
        """
        Compute covariate balance statistics for this fit.

        Thin object-oriented wrapper: it forwards ``self`` and all keyword
        arguments to the package-level ``cbps.balance`` function and returns
        whatever that function returns.

        Parameters
        ----------
        **kwargs
            Additional arguments passed to the balance() function.
            Supported keys: enhanced (bool), threshold (float), covariate_names (list).

        Returns
        -------
        dict
            Dictionary containing balance statistics:
            - balanced: weighted covariate balance statistics
            - original/unweighted: unweighted baseline statistics
            See cbps.balance() documentation for details

        Notes
        -----
        Provided for API consistency with CBPSResults.balance() and
        CBMSMResults.balance(). Per the package documentation, the standalone
        function selects weighted correlations for continuous treatments and
        standardized mean differences for discrete treatments.

        Examples
        --------
        >>> fit = npCBPS('treat ~ age + educ', data=df)
        >>>
        >>> # Method 1: Standalone function
        >>> from cbps import balance
        >>> bal = balance(fit)
        >>>
        >>> # Method 2: Object method
        >>> bal = fit.balance()
        >>>
        >>> # Both methods return identical results
        """
        # Imported here rather than at module level to avoid a circular import.
        from cbps import balance as package_balance

        return package_balance(self, **kwargs)

    def vcov(self):
        """
        Return the variance-covariance matrix of coefficients.

        Notes
        -----
        npCBPS is nonparametric: it estimates no model coefficients, so no
        variance-covariance matrix exists. The method is kept so that the
        limitation is reported explicitly.

        Raises
        ------
        ValueError
            Always raised because npCBPS does not estimate coefficients.
        """
        message = (
            "npCBPS is a nonparametric method and does not estimate "
            "coefficients or their variance-covariance matrix."
        )
        raise ValueError(message)
518
+
519
+
520
+
521
+
522
def _formula_data_columns(formula: str, data: pd.DataFrame) -> list:
    """Return the columns of ``data`` referenced by ``formula``, treatment first."""
    import re

    lhs, tilde, rhs = formula.partition('~')
    treat_expr = lhs.strip()
    if not tilde or not treat_expr:
        raise ValueError(
            f"Invalid formula {formula!r}: expected the form 'treatment ~ covariates'."
        )

    name_pattern = r'\b[a-zA-Z_]\w*\b'
    if treat_expr.isidentifier():
        # Plain column name: keep it even if absent from ``data`` so that the
        # usual KeyError for an unknown treatment column is preserved.
        treat_cols = [treat_expr]
    else:
        # Expression such as ``np.log(dose)`` or ``C(treat)``: only the names
        # that are real columns matter for the missing-value check.
        treat_cols = [
            name for name in re.findall(name_pattern, treat_expr)
            if name in data.columns
        ]

    # Simple variable name extraction (handles basic formulas; complex ones
    # are handled by patsy during parsing).
    covar_cols = [
        name for name in re.findall(name_pattern, rhs)
        if name in data.columns
    ]

    # Deduplicate while keeping first-seen order.
    return list(dict.fromkeys(treat_cols + covar_cols))


def npCBPS(
    formula: str,
    data: pd.DataFrame,
    na_action: Optional[str] = None,
    corprior: Optional[float] = None,
    print_level: int = 0,
    seed: Optional[int] = None,
    **kwargs: Any
) -> NPCBPSResults:
    """
    Estimate nonparametric covariate balancing weights.

    Implements the nonparametric CBGPS estimator from Section 3.3 of Fong,
    Hazlett, and Imai (2018). This method estimates inverse probability
    weights directly via empirical likelihood without specifying a
    parametric propensity score model.

    Parameters
    ----------
    formula : str
        Model formula specifying the treatment and covariates, e.g.,
        ``'treat ~ age + educ + income'``. The left-hand side is the
        treatment variable; the right-hand side lists covariates.
    data : pd.DataFrame
        DataFrame containing all variables referenced in the formula.
    na_action : {'warn', 'fail', 'ignore'}, optional
        How to handle missing values:

        - ``'warn'`` (default): Drop rows with missing values and warn.
        - ``'fail'``: Raise ValueError if missing values are present.
        - ``'ignore'``: Silently drop rows with missing values.

    corprior : float, optional
        Prior standard deviation :math:`\\sigma` for the allowed weighted
        correlation :math:`\\eta \\sim N(0, \\sigma^2 I_K)`. Controls the
        tradeoff between exact balance and stable weights.

        If ``None`` (default), set to ``0.1/n`` following the paper's
        recommendation in Section 3.3.4.

        Interpretation:

        - Smaller values enforce tighter balance but may produce extreme
          weights or fail to converge.
        - Larger values allow more imbalance but ensure convergence.
        - The default ``0.1/n`` generally provides good balance while
          ensuring convergence.

    print_level : int, default=0
        Verbosity level. If > 0, prints optimization diagnostics including
        ``log_post``, ``log_el``, ``log_p_eta``, and ``sumw0``.
    seed : int, optional
        Random seed for reproducibility. When given, it is passed to
        ``np.random.seed`` (this sets NumPy's global random state).
    **kwargs
        Reserved for future extensions.

    Returns
    -------
    NPCBPSResults
        Fitted result object with attributes:

        - **weights**: Final weights normalized to sum to n.
        - **sumw0**: Sum of unnormalized weights (should be close to 1).
        - **eta**: Optimal weighted correlations.
        - **par**: Optimal scaling parameter :math:`\\alpha`.
        - **log_el**: Log empirical likelihood.
        - **log_p_eta**: Log prior density.
        - **converged**: Whether optimization converged.
        - **y**, **x**: Treatment and covariate arrays.
        - **formula**, **data**, **call**: Metadata.

    Raises
    ------
    ValueError
        If ``na_action`` is not one of the valid options, if ``formula``
        has no ``~`` or no left-hand side, if ``na_action='fail'`` and
        missing values are present, if no observations remain for
        estimation, or if ``corprior`` is outside [0, 10].
    RuntimeError
        If optimization produces NaN weights.

    Notes
    -----
    **Algorithm (Section 3.3 of Fong et al., 2018):**

    1. Parse formula and extract treatment :math:`T` and covariates :math:`X`.
    2. Whiten covariates via Cholesky decomposition (Section 3.1).
    3. Construct constraint matrix :math:`g = (X^* T^*, X^*, T^*)^T`.
    4. Line search over :math:`\\alpha \\in [0, 1]` to maximize the penalized
       likelihood (Equation 10).
    5. Recover weights :math:`w_i = 1/(1 - \\gamma^T(g_i - \\eta))`.
    6. Normalize weights so that :math:`\\sum w_i = n`.

    **Non-convexity (Section 3.3.2):**

    The empirical likelihood objective is not generally convex, so there is
    no guarantee of finding the global optimum. Results may vary slightly
    between runs, which is expected behavior.

    **Convergence diagnostic:**

    The key diagnostic is ``sumw0``, the sum of unnormalized weights.
    Values within 5% of 1.0 indicate successful convergence. Large
    deviations suggest adjusting the ``corprior`` parameter.

    References
    ----------
    Fong, C., Hazlett, C., and Imai, K. (2018). Covariate balancing
    propensity score for a continuous treatment: Application to the
    efficacy of political advertisements. The Annals of Applied
    Statistics, 12(1), 156-177. https://doi.org/10.1214/17-AOAS1101

    Examples
    --------
    Basic usage with the LaLonde dataset:

    >>> import pandas as pd
    >>> from cbps import npCBPS
    >>> from cbps.datasets import load_lalonde
    >>> df = load_lalonde()
    >>> fit = npCBPS('treat ~ age + educ + black + hisp + married + nodegr',
    ...              data=df)
    >>> print(f"Sum of weights: {fit.weights.sum():.1f}")
    >>> print(f"sumw0 (should be ~1): {fit.sumw0:.4f}")

    Adjusting the balance-variance tradeoff:

    >>> # Tighter balance (may not converge for all datasets)
    >>> fit_tight = npCBPS('treat ~ age + educ', data=df, corprior=0.001)
    >>> # Looser balance (ensures convergence)
    >>> fit_loose = npCBPS('treat ~ age + educ', data=df, corprior=0.1)
    """
    import warnings

    # Set random seed for reproducibility
    if seed is not None:
        np.random.seed(seed)

    # Handle na_action parameter: default is to warn and drop missing values
    if na_action is None:
        na_action = 'warn'

    # A tuple (not a set) keeps the option order in the error message stable.
    valid_na_actions = ('warn', 'fail', 'ignore')
    if na_action not in valid_na_actions:
        raise ValueError(
            f"Invalid na_action='{na_action}'. "
            f"Valid options are: {', '.join(repr(x) for x in valid_na_actions)}. "
            f"Note: use 'warn' (not 'drop') to remove missing values with a warning."
        )

    # Missing value handling (before formula parsing), restricted to the
    # columns the formula actually uses.
    relevant_cols = _formula_data_columns(formula, data)
    n_missing = data[relevant_cols].isna().any(axis=1).sum()
    na_action_info = None
    if n_missing > 0:
        if na_action == 'fail':
            raise ValueError(
                f"npCBPS: Missing values detected in {n_missing} observations. "
                f"Set na_action='warn' to remove them, or handle missing values before calling npCBPS()."
            )

        # 'warn' and 'ignore' both drop incomplete rows; only 'warn' reports it.
        data_clean = data.dropna(subset=relevant_cols)
        n_dropped = len(data) - len(data_clean)
        if na_action == 'warn':
            warnings.warn(
                f"npCBPS: Removed {n_dropped} observations with missing values. "
                f"Remaining sample size: {len(data_clean)}.",
                UserWarning
            )
        data = data_clean
        na_action_info = {'method': na_action, 'n_dropped': n_dropped}

    n_obs = len(data)
    if n_obs == 0:
        # Fail early with a clear message instead of dividing by zero below.
        raise ValueError(
            "npCBPS: No observations available for estimation "
            "(the data is empty or every row contained missing values)."
        )

    # Sample-size adaptive corprior default value
    # Paper recommendation: ρ = 0.1/N (Fong et al. 2018, Section 3.3.4)
    if corprior is None:
        corprior = 0.1 / n_obs
        if print_level > 0:
            print(f"npCBPS: Using paper-recommended corprior = 0.1/n = {corprior:.6f}")

    # Validate corprior range (based on paper recommendation and experience).
    # corprior=0 is allowed but triggers a warning below. A NaN corprior
    # fails this comparison and is rejected as well.
    if not (0.0 <= corprior <= 10.0):
        raise ValueError(
            f"corprior={corprior} is outside the valid range [0.0, 10.0]. "
            f"The paper recommends corprior ≈ 0.1/n (for this dataset: {0.1/n_obs:.6f}). "
            f"Values >>1 often lead to NaN weights."
        )

    # Special warning for corprior=0
    if corprior == 0.0:
        warnings.warn(
            "corprior=0 removes all correlation prior penalty, which may lead to "
            "unstable or extreme weights in small samples. "
            "The paper recommends corprior ≈ 0.1/n for most applications. "
            "Use corprior=0 only for specific purposes like sensitivity analysis.",
            UserWarning
        )

    # Validate sample size (avoid numerical instability with small samples)
    if n_obs < 30:
        warnings.warn(
            f"Small sample size (n={n_obs}). npCBPS may be unstable with n<30. "
            f"Consider using standard CBPS for small samples.",
            UserWarning
        )

    # Formula parsing
    from patsy import dmatrices

    # Snapshot of the data used for fitting (after any missing-value removal)
    data_original = data.copy()

    # dmatrices is used only to obtain the design information for diagnostics
    _, X_design = dmatrices(formula, data, return_type='dataframe')
    terms_obj = X_design.design_info  # patsy DesignInfo object

    # parse_formula returns (treatment, covariate matrix);
    # preserve_categorical=True keeps factor semantics for the treatment.
    treat, X_mat = parse_formula(formula, data, preserve_categorical=True)

    # Remove zero-variance columns
    non_zero_var_cols = X_mat.std(axis=0) > 0
    X_mat = X_mat[:, non_zero_var_cols]

    # Call core fitting function
    fit_result = npCBPS_fit(
        treat=treat,
        X=X_mat,
        corprior=corprior,
        print_level=print_level
    )

    # Append metadata
    fit_result.call = f"npCBPS(formula={formula}, data=..., corprior={corprior})"
    fit_result.formula = formula
    fit_result.data = data_original

    # Add terms and na_action for model diagnostics
    fit_result.terms = terms_obj
    fit_result.na_action = na_action_info

    return fit_result
781
+
782
+
783
def npCBPS_fit(
    treat: Union[np.ndarray, pd.Series],
    X: np.ndarray,
    corprior: float,
    print_level: int
) -> NPCBPSResults:
    """
    Core fitting procedure for nonparametric CBPS.

    This is the internal implementation called by :func:`npCBPS` after
    formula parsing and data preprocessing. It performs the empirical
    likelihood optimization to estimate covariate balancing weights.

    Parameters
    ----------
    treat : np.ndarray or pd.Series of shape (n,)
        Treatment variable. If a pandas Categorical Series, it is treated
        as a factor with J levels. Otherwise, it is treated as continuous
        (this includes numeric 0/1 treatments).
    X : np.ndarray of shape (n, K)
        Covariate matrix with zero-variance columns removed.
    corprior : float
        Prior standard deviation :math:`\\sigma` for the allowed weighted
        correlation (see Section 3.3.4 of Fong et al., 2018).
    print_level : int
        Verbosity level for diagnostic output. Values greater than 0 print
        the treatment type and the optimization summary.

    Returns
    -------
    NPCBPSResults
        Fitted result object. Note that ``formula``, ``data``, ``call``,
        ``terms``, and ``na_action`` attributes are populated by the caller.

    Raises
    ------
    RuntimeError
        If the optimization yields NaN weights or a NaN weight sum.

    Warns
    -----
    UserWarning
        If the bounded scalar optimizer reports that it did not succeed.
        The outcome is also stored in the ``converged`` attribute.

    Notes
    -----
    **Treatment types:**

    - *Continuous*: Constraint matrix has K correlation constraints.
    - *Factor (J levels)*: Constraint matrix has K*(J-1) correlation
      constraints using one-hot encoding.

    **Implementation details:**

    - Covariates are whitened using :func:`cholesky_whitening`.
    - The treatment is standardized to zero mean and unit variance.
    - The line search is bounded to :math:`\\alpha \\in [0, 1]`.

    See Also
    --------
    npCBPS : High-level interface with formula parsing.
    """
    import warnings

    # Only a pandas Categorical Series selects the factor path; every other
    # input (including binary 0/1 numerics) is handled as continuous.
    is_factor_treat = (
        isinstance(treat, pd.Series)
        and isinstance(treat.dtype, pd.CategoricalDtype)
    )

    if is_factor_treat:
        D = treat.cat.codes.values.astype(np.float64)
    else:
        # np.array always copies, so the caller's treatment is never mutated,
        # and it accepts ndarray, Series and plain sequences alike.
        D = np.array(treat, dtype=np.float64)

    # Keep the untransformed covariates for the result object.
    orig_X = X.copy()

    # Preprocessing: Cholesky whitening
    X = cholesky_whitening(X, verify=True)

    n = X.shape[0]

    # Numerical tolerance passed to the empirical likelihood routines
    eps = 1.0 / n

    if not is_factor_treat:
        # Continuous treatment path
        if print_level > 0:
            print("Estimating npCBPS as a continuous treatment.")

        # Orient each whitened covariate so it correlates non-negatively
        # with the treatment. Only -1/+1 are used: np.sign would return 0
        # for an exactly-zero correlation and wipe out the whole column.
        correlations = np.array(
            [np.corrcoef(X[:, j], D)[0, 1] for j in range(X.shape[1])]
        )
        signs = np.where(correlations < 0, -1.0, 1.0)
        X = X * signs

        # Standardize treatment
        D = (D - D.mean()) / D.std(ddof=1)

        # Constraint matrix: z = cbind(X*D, X, D)
        D_col = D[:, None]
        z = np.column_stack([X * D_col, X, D_col])

        # Number of correlation constraints (K)
        ncon_cor = X.shape[1]

        # eta is initialized at cor(X, D), computed on the oriented X and
        # the standardized D.
        eta_init = np.array(
            [np.corrcoef(X[:, j], D)[0, 1] for j in range(X.shape[1])]
        )

    else:
        # Factor treatment path
        if print_level > 0:
            print("Estimating npCBPS as a factor treatment.")

        # One-hot encode the observed levels: shape (n, J)
        unique_levels = np.unique(D)
        Td = (D[:, None] == unique_levels[None, :]).astype(float)

        dimX = X.shape[1]

        # Normalize each indicator column by its level count
        Td = Td * (1.0 / Td.sum(axis=0))

        # Contrast every level against the last one, then drop the last
        Td = (Td - Td[:, -1:])[:, :-1]

        # Center and scale
        Td = (Td - Td.mean(axis=0)) / Td.std(axis=0, ddof=1)

        # Row-wise Kronecker product kron(Td[i, :], X[i, :]) for all rows at
        # once; the C-order reshape reproduces np.kron's column ordering
        # (contrast index outer, covariate index inner).
        z = (Td[:, :, None] * X[:, None, :]).reshape(n, -1)

        # eta initialization: cor(Td[:, i], X[:, j]) laid out in the same
        # (contrast outer, covariate inner) order as the columns of z.
        eta_init = np.array([
            np.corrcoef(Td[:, i], X[:, j])[0, 1]
            for i in range(Td.shape[1])
            for j in range(dimX)
        ])

        # Record the number of correlation constraints, then append the
        # mean constraints.
        ncon_cor = z.shape[1]
        z = np.column_stack([z, X])

    # Prior standard deviation (not variance) for every correlation constraint
    eta_prior_sd = np.full(ncon_cor, corprior)

    # The line search scales the initial correlations: eta = par * eta_init
    eta_to_be_scaled = eta_init

    # Main optimization: line search over alpha in [0, 1]
    # (Fong et al. 2018, Equation 10). minimize_scalar minimizes, so the
    # log posterior is negated.
    def negative_log_post(par_scalar):
        return -log_post(
            par_scalar, eta_to_be_scaled, eta_prior_sd, z, eps, 0.001,
            ncon_cor, n
        )

    result = scipy.optimize.minimize_scalar(
        negative_log_post,
        bounds=(0, 1),
        method='bounded',
        options={'xatol': 1e-10, 'maxiter': 2000}
    )
    par_opt = result.x

    # Surface a failed line search even at the default verbosity
    if not result.success:
        warnings.warn(
            f"npCBPS optimization may not have converged: {result.message}",
            UserWarning,
            stacklevel=2
        )
        if print_level > 0:
            print(f"Warning: optimization may not have converged: {result.message}")

    # Compute optimal eta
    eta_opt = par_opt * eta_to_be_scaled

    # Compute optimal weights
    el_out_opt = get_w(eta_opt, z, 0.05, eps, ncon_cor, n)
    w_opt = el_out_opt['w']
    sumw0 = el_out_opt['sumw']

    # Weight normalization (Fong et al. 2018, Equation 8): sum(w) = n
    w = w_opt * n / sumw0

    # Check for NaN weights and raise meaningful error
    if np.isnan(w).any() or np.isnan(sumw0):
        raise RuntimeError(
            f"npCBPS optimization failed and produced NaN weights. "
            f"This usually indicates:\n"
            f" 1. corprior is too large (current: {corprior}, try < 1.0)\n"
            f" 2. corprior is too small (current: {corprior}, try > 0.0001)\n"
            f" 3. Sample size is too small (current: n={n}, recommend n>=30)\n"
            f" 4. Covariate-treatment correlation is extreme\n"
            f"Suggestion: Try adjusting corprior or using standard CBPS instead."
        )

    # Compute log prior density
    log_p_eta_opt = np.sum(
        -0.5 * np.log(2 * np.pi * eta_prior_sd**2)
        - eta_opt**2 / (2 * eta_prior_sd**2)
    )
    log_el_opt = el_out_opt['log_el']

    # Construct result object
    result_obj = NPCBPSResults()
    result_obj.par = par_opt
    result_obj.log_p_eta = log_p_eta_opt
    result_obj.log_el = log_el_opt
    result_obj.eta = eta_opt
    result_obj.sumw0 = sumw0
    result_obj.weights = w
    result_obj.y = treat  # Original treatment variable
    result_obj.x = orig_X

    # Convergence diagnostic fields
    result_obj.converged = result.success
    result_obj.iterations = getattr(result, 'nit', None)

    # Diagnostic output (optional)
    if print_level > 0:
        print(f"par: {par_opt:.6f}")
        print(f"log_post: {-(log_el_opt + log_p_eta_opt):.6f}")
        print(f"log_el: {log_el_opt:.6f}")
        print(f"log_p_eta: {log_p_eta_opt:.6f}")
        print(f"sumw0: {sumw0:.6f}")
        print(f"converged: {result_obj.converged}")
        if result_obj.iterations is not None:
            print(f"iterations: {result_obj.iterations}")

    return result_obj