cbps 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. cbps/__init__.py +3462 -0
  2. cbps/constants.py +46 -0
  3. cbps/core/__init__.py +93 -0
  4. cbps/core/cbps_binary.py +1943 -0
  5. cbps/core/cbps_continuous.py +945 -0
  6. cbps/core/cbps_multitreat.py +1123 -0
  7. cbps/core/cbps_optimal.py +507 -0
  8. cbps/core/results.py +1447 -0
  9. cbps/data/Blackwell.csv +571 -0
  10. cbps/data/LaLonde.csv +3213 -0
  11. cbps/data/npcbps_continuous_sim.csv +501 -0
  12. cbps/data/nsw.csv +723 -0
  13. cbps/data/nsw_dw.csv +446 -0
  14. cbps/data/political_ads_urban_niebler.csv +16266 -0
  15. cbps/data/psid_controls.csv +2491 -0
  16. cbps/data/psid_controls2.csv +254 -0
  17. cbps/data/psid_controls3.csv +129 -0
  18. cbps/data/simulation_dgp1_seed12345.csv +201 -0
  19. cbps/data/simulation_dgp2_seed12345.csv +201 -0
  20. cbps/data/simulation_dgp3_seed12345.csv +201 -0
  21. cbps/data/simulation_dgp4_seed12345.csv +201 -0
  22. cbps/datasets/__init__.py +78 -0
  23. cbps/datasets/blackwell.py +112 -0
  24. cbps/datasets/continuous.py +223 -0
  25. cbps/datasets/lalonde.py +272 -0
  26. cbps/datasets/npcbps_sim.py +101 -0
  27. cbps/diagnostics/__init__.py +101 -0
  28. cbps/diagnostics/balance.py +760 -0
  29. cbps/diagnostics/balance_cbmsm_addon.py +162 -0
  30. cbps/diagnostics/continuous_diagnostics.py +259 -0
  31. cbps/diagnostics/normality.py +173 -0
  32. cbps/diagnostics/ocbps_conditions.py +197 -0
  33. cbps/diagnostics/overlap.py +198 -0
  34. cbps/diagnostics/plots.py +1193 -0
  35. cbps/diagnostics/weights_diag.py +205 -0
  36. cbps/highdim/__init__.py +84 -0
  37. cbps/highdim/gmm_loss.py +340 -0
  38. cbps/highdim/hdcbps.py +1078 -0
  39. cbps/highdim/lasso_utils.py +498 -0
  40. cbps/highdim/weight_funcs.py +298 -0
  41. cbps/inference/__init__.py +42 -0
  42. cbps/inference/asyvar.py +621 -0
  43. cbps/inference/vcov_outcome.py +217 -0
  44. cbps/iv/__init__.py +48 -0
  45. cbps/iv/cbiv.py +2603 -0
  46. cbps/logging_config.py +45 -0
  47. cbps/msm/__init__.py +45 -0
  48. cbps/msm/cbmsm.py +1871 -0
  49. cbps/msm/rank_diagnostics.py +112 -0
  50. cbps/nonparametric/__init__.py +58 -0
  51. cbps/nonparametric/cholesky_whitening.py +232 -0
  52. cbps/nonparametric/empirical_likelihood.py +339 -0
  53. cbps/nonparametric/npcbps.py +1036 -0
  54. cbps/nonparametric/taylor_approx.py +207 -0
  55. cbps/py.typed +0 -0
  56. cbps/sklearn/__init__.py +42 -0
  57. cbps/sklearn/estimator.py +378 -0
  58. cbps/utils/__init__.py +82 -0
  59. cbps/utils/formula.py +415 -0
  60. cbps/utils/helpers.py +378 -0
  61. cbps/utils/numerics.py +438 -0
  62. cbps/utils/r_compat.py +109 -0
  63. cbps/utils/validation.py +224 -0
  64. cbps/utils/variance_transform.py +483 -0
  65. cbps/utils/weights.py +586 -0
  66. cbps-0.2.0.dist-info/METADATA +1090 -0
  67. cbps-0.2.0.dist-info/RECORD +70 -0
  68. cbps-0.2.0.dist-info/WHEEL +5 -0
  69. cbps-0.2.0.dist-info/licenses/LICENSE +661 -0
  70. cbps-0.2.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1090 @@
1
+ Metadata-Version: 2.4
2
+ Name: cbps
3
+ Version: 0.2.0
4
+ Summary: Covariate Balancing Propensity Score (CBPS) for causal inference in Python
5
+ Author: Cai Xuanyu, Xu Wenli
6
+ Maintainer: Cai Xuanyu, Xu Wenli
7
+ License-Expression: AGPL-3.0-only
8
+ Project-URL: Homepage, https://cbps.readthedocs.io
9
+ Project-URL: Repository, https://github.com/gorgeousfish/CBPS-py
10
+ Project-URL: Documentation, https://cbps.readthedocs.io
11
+ Project-URL: Changelog, https://github.com/gorgeousfish/CBPS-py/blob/main/CHANGELOG.md
12
+ Project-URL: Bug Tracker, https://github.com/gorgeousfish/CBPS-py/issues
13
+ Keywords: causal-inference,propensity-score,covariate-balancing,treatment-effects,observational-studies,inverse-probability-weighting,generalized-method-of-moments,doubly-robust
14
+ Classifier: Development Status :: 5 - Production/Stable
15
+ Classifier: Intended Audience :: Science/Research
16
+ Classifier: Intended Audience :: Education
17
+ Classifier: Natural Language :: English
18
+ Classifier: Programming Language :: Python :: 3
19
+ Classifier: Programming Language :: Python :: 3.10
20
+ Classifier: Programming Language :: Python :: 3.11
21
+ Classifier: Programming Language :: Python :: 3.12
22
+ Classifier: Programming Language :: Python :: 3.13
23
+ Classifier: Programming Language :: Python :: 3 :: Only
24
+ Classifier: Topic :: Scientific/Engineering
25
+ Classifier: Topic :: Scientific/Engineering :: Mathematics
26
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
27
+ Classifier: Framework :: Pytest
28
+ Classifier: Typing :: Typed
29
+ Classifier: Operating System :: OS Independent
30
+ Classifier: Operating System :: MacOS
31
+ Classifier: Operating System :: Microsoft :: Windows
32
+ Classifier: Operating System :: POSIX :: Linux
33
+ Requires-Python: >=3.10
34
+ Description-Content-Type: text/markdown
35
+ License-File: LICENSE
36
+ Requires-Dist: numpy>=1.22.0
37
+ Requires-Dist: scipy>=1.8.0
38
+ Requires-Dist: pandas>=1.4.0
39
+ Requires-Dist: statsmodels>=0.13.0
40
+ Requires-Dist: patsy>=0.5.0
41
+ Provides-Extra: plots
42
+ Requires-Dist: matplotlib>=3.5.0; extra == "plots"
43
+ Provides-Extra: sklearn
44
+ Requires-Dist: scikit-learn>=1.0.0; extra == "sklearn"
45
+ Provides-Extra: hdcbps
46
+ Requires-Dist: glmnetforpython>=1.0.0; extra == "hdcbps"
47
+ Provides-Extra: dev
48
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
49
+ Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
50
+ Requires-Dist: black>=23.0.0; extra == "dev"
51
+ Requires-Dist: isort>=5.12.0; extra == "dev"
52
+ Requires-Dist: flake8>=6.0.0; extra == "dev"
53
+ Requires-Dist: mypy>=1.0.0; extra == "dev"
54
+ Requires-Dist: pre-commit>=3.0.0; extra == "dev"
55
+ Provides-Extra: test
56
+ Requires-Dist: pytest>=7.0.0; extra == "test"
57
+ Requires-Dist: pytest-cov>=4.0.0; extra == "test"
58
+ Provides-Extra: docs
59
+ Requires-Dist: sphinx>=5.0.0; extra == "docs"
60
+ Requires-Dist: sphinx-rtd-theme>=1.0.0; extra == "docs"
61
+ Requires-Dist: sphinx-copybutton>=0.5.0; extra == "docs"
62
+ Requires-Dist: nbsphinx>=0.9.0; extra == "docs"
63
+ Requires-Dist: numpydoc>=1.5.0; extra == "docs"
64
+ Requires-Dist: myst-parser>=1.0.0; extra == "docs"
65
+ Requires-Dist: ipykernel>=6.0.0; extra == "docs"
66
+ Provides-Extra: all
67
+ Requires-Dist: cbps[hdcbps,plots,sklearn]; extra == "all"
68
+ Dynamic: license-file
69
+
70
+ # cbps
71
+
72
+ **Covariate Balancing Propensity Score for Python**
73
+
74
+ [![PyPI version](https://img.shields.io/pypi/v/cbps.svg)](https://pypi.org/project/cbps/)
75
+ [![Python 3.10+](https://img.shields.io/badge/Python-3.10%2B-blue.svg)](https://www.python.org/)
76
+ [![License: AGPL-3.0](https://img.shields.io/badge/License-AGPL--3.0-blue.svg)](LICENSE)
77
+ [![Documentation Status](https://readthedocs.org/projects/cbps/badge/?version=latest)](https://cbps.readthedocs.io/en/latest/?badge=latest)
78
+ [![CITATION.cff](https://img.shields.io/badge/citation-cff-blue.svg)](CITATION.cff)
79
+
80
+ ![cbps](image/image.jpg)
81
+
82
+ ## Overview
83
+
84
+ Traditional propensity score estimation faces a fundamental challenge known as the **propensity score tautology**: researchers iterate between fitting models and checking covariate balance, yet the estimated propensity score is considered appropriate only if it achieves balance. Even slight model misspecification can result in substantial bias in treatment effect estimates.
85
+
86
+ **CBPS solves this problem** by directly incorporating covariate balance conditions into propensity score estimation through the Generalized Method of Moments (GMM) framework. Instead of solely maximizing likelihood, CBPS simultaneously optimizes:
87
+
88
+ 1. **Predictive accuracy** of treatment assignment (score condition)
89
+ 2. **Covariate balance** between treatment groups (balance condition)
90
+
91
+ This dual optimization yields propensity scores that are more robust to model misspecification while maintaining theoretical guarantees for consistent causal effect estimation.
92
+
93
+ ## Features
94
+
95
+ - **Binary & Multi-valued Treatments** - Standard CBPS for discrete treatments with ATE/ATT estimation
96
+ - **Continuous Treatments** - Generalized propensity scores (CBGPS) for dose-response analysis
97
+ - **Longitudinal Data** - Marginal structural models (CBMSM) for time-varying treatments
98
+ - **High-dimensional Settings** - Regularized estimation (hdCBPS) when the number of covariates exceeds the sample size
99
+ - **Nonparametric Methods** - Empirical likelihood approach (npCBPS) without distributional assumptions
100
+ - **Doubly Robust Estimation** - Optimal CBPS (oCBPS) with improved efficiency
101
+ - **Instrumental Variables** - CBPS for IV settings with treatment noncompliance
102
+ - **Model Diagnostics** - Hansen's J-test, balance statistics, and visualization tools
103
+ - **R Package Compatibility** - Numerical accuracy within ±1e-6 of CBPS R package v0.23
104
+
105
+ ## Installation
106
+
107
+ ### From PyPI
108
+
109
+ ```bash
110
+ pip install cbps
111
+ ```
112
+
113
+ ### With Optional Dependencies
114
+
115
+ ```bash
116
+ # High-dimensional CBPS support
117
+ pip install 'cbps[hdcbps]'
118
+
119
+ # Visualization tools
120
+ pip install 'cbps[plots]'
121
+
122
+ # scikit-learn integration
123
+ pip install 'cbps[sklearn]'
124
+
125
+ # All features
126
+ pip install 'cbps[all]'
127
+ ```
128
+
129
+ ### Development Installation
130
+
131
+ ```bash
132
+ git clone https://github.com/gorgeousfish/CBPS-py.git
133
+ cd CBPS-py
134
+ pip install -e ".[dev]"
135
+ ```
136
+
137
+ > **Note for Apple Silicon Users**: The `glmnetforpython` package (required for hdCBPS) needs compilation from source:
138
+ > ```bash
139
+ > brew install gcc
140
+ > git clone https://github.com/thierrymoudiki/glmnetforpython.git
141
+ > cd glmnetforpython && pip install -e .
142
+ > ```
143
+
144
+ ## Quick Start
145
+
146
+ Replicating the LaLonde (1986) analysis from Imai and Ratkovic (2014, Section 3.2):
147
+
148
+ ```python
149
+ from cbps import CBPS, balance
150
+ from cbps.datasets import load_lalonde
151
+
152
+ # Load LaLonde (1986) NSW job training data (445 observations)
153
+ data = load_lalonde()
154
+
155
+ # Over-identified CBPS for ATE estimation (Imai & Ratkovic, 2014)
156
+ fit = CBPS(
157
+ formula='treat ~ age + educ + black + hisp + married + nodegr + re74 + re75',
158
+ data=data,
159
+ att=0, # Average Treatment Effect
160
+ method='over' # Over-identified GMM (combines score + balance conditions)
161
+ )
162
+
163
+ # View results
164
+ print(fit.summary())
165
+ # Coefficients:
166
+ # Estimate Std. Error z value Pr(>|z|)
167
+ # (Intercept) 0.740178 0.002239 330.625 0.000e+00 ***
168
+ # age 0.007589 0.085471 0.089 9.292e-01
169
+ # educ -0.048260 0.060982 -0.791 4.287e-01
170
+ # black -0.199980 0.000624 -320.226 0.000e+00 ***
171
+ # hisp -0.756848 0.000051 -14837.961 0.000e+00 ***
172
+ # married 0.103707 0.000092 1129.734 0.000e+00 ***
173
+ # nodegr -0.676270 0.000224 -3013.458 0.000e+00 ***
174
+ # re74 -0.000022 0.124988 -0.000 9.999e-01
175
+ # re75 0.000032 0.128498 0.000 9.998e-01
176
+ #
177
+ # J - statistic: 5.09e-03
178
+ # Log-Likelihood: -294.2790
179
+ #
180
+ # Diagnostics:
181
+ # Converged: Yes
182
+ # Weight Summary:
183
+ # Min: 0.0029 Max: 0.0090 Mean: 0.0045
184
+ # Effective Sample Size: 420.7
185
+
186
+ # Check covariate balance improvement
187
+ bal = balance(fit)
188
+ print(bal['balanced']) # Balance after CBPS weighting
189
+ print(bal['original']) # Balance before weighting (baseline)
190
+ ```
191
+
192
+ ## Method Family
193
+
194
+ The CBPS methodology has been extended to address various causal inference challenges:
195
+
196
+ ```
197
+ ┌─────────────────────┐
198
+ │ CBPS │
199
+ │ Binary/Multi-valued │
200
+ └──────────┬───────────┘
201
+ ┌───────────┬──────┼──────┬───────────┐
202
+ ▼ ▼ ▼ ▼ ▼
203
+ ┌─────────┐ ┌────────┐ ┌────┐ ┌──────┐ ┌──────┐
204
+ │ CBGPS │ │ CBMSM │ │hdCB│ │ CBIV │ │oCBPS │
205
+ │Continuo-│ │Longitu-│ │PS │ │Instr-│ │Optim-│
206
+ │us Treat.│ │dinal │ │High│ │ument-│ │al/DR │
207
+ └────┬────┘ └────────┘ │Dim │ │al IV │ └──────┘
208
+ ▼ └────┘ └──────┘
209
+ ┌─────────┐
210
+ │ npCBPS │
211
+ │Nonparam.│
212
+ └─────────┘
213
+ ```
214
+
215
+ ### Method Selection Guide
216
+
217
+ | Scenario | Method | Function | Key Reference |
218
+ |:---------|:-------|:---------|:--------------|
219
+ | Binary treatment, cross-sectional | CBPS | `CBPS()` | Imai & Ratkovic (2014) |
220
+ | Multi-valued treatment (3-4 levels) | CBPS | `CBPS()` | Imai & Ratkovic (2014) |
221
+ | Continuous treatment (parametric) | CBGPS | `CBPS()` | Fong et al. (2018) |
222
+ | Continuous treatment (nonparametric) | npCBPS | `npCBPS()` | Fong et al. (2018) |
223
+ | Longitudinal/panel data | CBMSM | `CBMSM()` | Imai & Ratkovic (2015) |
224
+ | High-dimensional (d >> n) | hdCBPS | `hdCBPS()` | Ning et al. (2020) |
225
+ | Doubly robust estimation | oCBPS | `CBPS(..., baseline_formula, diff_formula)` | Fan et al. (2022) |
226
+ | Instrumental variables | CBIV | `CBIV()` | Imai & Ratkovic (2014) |
227
+
228
+ ---
229
+
230
+ ## CBPS: Binary and Multi-valued Treatments
231
+
232
+ The core CBPS method estimates propensity scores for binary or multi-valued discrete treatments by solving GMM moment conditions that combine the score function with covariate balance constraints.
233
+
234
+ ### When to Use
235
+
236
+ - Cross-sectional observational studies with binary treatment (0/1)
237
+ - Categorical treatments with 3-4 levels
238
+ - When model diagnostics (J-test) are desired
239
+
240
+ ### Key Concepts
241
+
242
+ **For ATE estimation**, the balance condition ensures:
243
+
244
+ $$E\left[\frac{T \cdot X}{\pi(X)} - \frac{(1-T) \cdot X}{1-\pi(X)}\right] = 0$$
245
+
246
+ **For ATT estimation**, control observations are reweighted to match the treated:
247
+
248
+ $$E\left[T \cdot X - \frac{\pi(X)(1-T) \cdot X}{1-\pi(X)}\right] = 0$$
249
+
250
+ ### Syntax
251
+
252
+ ```python
253
+ CBPS(formula, data, att=1, method='over', two_step=True, standardize=True)
254
+ ```
255
+
256
+ | Parameter | Default | Description |
257
+ |:----------|:--------|:------------|
258
+ | `formula` | - | R-style formula: `'treatment ~ x1 + x2 + ...'` |
259
+ | `data` | - | pandas DataFrame |
260
+ | `att` | `1` | Estimand: 0 = ATE, 1 = ATT, 2 = ATT (reversed encoding) |
261
+ | `method` | `'over'` | `'exact'` (just-identified) or `'over'` (over-identified) |
262
+ | `two_step` | `True` | Two-step GMM (`True`) or continuous updating (`False`) |
263
+ | `standardize` | `True` | Standardize weights to sum to 1 within each treatment group |
264
+
265
+ ### Example
266
+
267
+ Replicating the LaLonde analysis from Imai and Ratkovic (2014, Section 3.2):
268
+
269
+ ```python
270
+ from cbps import CBPS, balance
271
+ from cbps.datasets import load_lalonde
272
+ from scipy import stats
273
+
274
+ data = load_lalonde()
275
+
276
+ # ATE estimation with over-identified GMM (CBPS2 in the paper)
277
+ fit_ate = CBPS(
278
+ formula='treat ~ age + educ + black + hisp + married + nodegr + re74 + re75',
279
+ data=data,
280
+ att=0,
281
+ method='over'
282
+ )
283
+ print(fit_ate.summary())
284
+
285
+ # ATT estimation
286
+ fit_att = CBPS(
287
+ formula='treat ~ age + educ + black + hisp + married + nodegr + re74 + re75',
288
+ data=data,
289
+ att=1,
290
+ method='over'
291
+ )
292
+
293
+ # Hansen's J-test for model specification (cf. Section 2.3)
294
+ # Paper reports J = 6.8 (df=22) for the linear specification
295
+ k = fit_ate.coefficients.shape[0] # number of parameters
296
+ j_pvalue = 1 - stats.chi2.cdf(fit_ate.J, k)
297
+ print(f"J-statistic: {fit_ate.J:.4e}, df: {k}, p-value: {j_pvalue:.4f}")
298
+
299
+ # Covariate balance comparison
300
+ bal = balance(fit_ate)
301
+ print(bal['balanced']) # Weighted balance (should show improvement)
302
+ print(bal['original']) # Unweighted baseline
303
+ ```
304
+
305
+ ---
306
+
307
+ ## CBGPS: Continuous Treatments
308
+
309
+ CBGPS extends the covariate balancing framework to continuous treatments by minimizing the weighted correlation between treatment and covariates.
310
+
311
+ ### When to Use
312
+
313
+ - Continuous treatment variable (e.g., dosage, intensity, duration)
314
+ - Dose-response curve estimation
315
+ - When parametric assumptions about the treatment distribution are acceptable
316
+
317
+ ### Key Concept
318
+
319
+ For a continuous treatment T with generalized propensity score f(T|X), the balance condition requires the weighted cross-moment of treatment and covariates to be zero:
320
+
321
+ $$E\left[\frac{f(T)}{f(T|X)} \cdot T^* \cdot X^*\right] = 0$$
322
+
323
+ where T* and X* are centered and scaled versions of treatment and covariates.
324
+
325
+ ### Syntax
326
+
327
+ ```python
328
+ CBPS(formula, data, method='over') # Auto-detects continuous treatment
329
+ ```
330
+
331
+ ### Example
332
+
333
+ Replicating the empirical application from Fong, Hazlett, and Imai (2018, Section 5) — the effect of political advertising on campaign contributions using the Urban and Niebler (2014) dataset:
334
+
335
+ ```python
336
+ import numpy as np
337
+ from cbps import CBPS
338
+ from cbps.datasets import load_political_ads
339
+
340
+ # Load Urban & Niebler (2014) political advertising data (n=16,265)
341
+ df_raw, meta = load_political_ads()
342
+
343
+ # Box-Cox transform treatment variable (lambda = -0.16, as in the paper)
344
+ work = df_raw.copy()
345
+ lam = meta["boxcox_lambda"] # -0.16
346
+ work["T_bc"] = ((work["TotAds"].values + 1).clip(min=1e-10) ** lam - 1.0) / lam
347
+ work["logPop"] = np.log(work["TotalPop"].values.clip(min=1))
348
+ work["logInc"] = np.log(work["Inc"].values.clip(min=0) + 1)
349
+
350
+ # Add squared terms for non-binary covariates (paper p. 171)
351
+ work["logPop_sq"] = work["logPop"] ** 2
352
+ work["density_sq"] = work["density"] ** 2
353
+ work["logInc_sq"] = work["logInc"] ** 2
354
+ work["PercentHispanic_sq"] = work["PercentHispanic"] ** 2
355
+ work["PercentBlack_sq"] = work["PercentBlack"] ** 2
356
+ work["PercentOver65_sq"] = work["PercentOver65"] ** 2
357
+ work["per_collegegrads_sq"] = work["per_collegegrads"] ** 2
358
+
359
+ cov_cols = ["logPop", "density", "logInc", "PercentHispanic",
360
+ "PercentBlack", "PercentOver65", "per_collegegrads", "CanCommute",
361
+ "logPop_sq", "density_sq", "logInc_sq", "PercentHispanic_sq",
362
+ "PercentBlack_sq", "PercentOver65_sq", "per_collegegrads_sq"]
363
+ work = work.dropna(subset=["T_bc"] + cov_cols).reset_index(drop=True)
364
+
365
+ # CBPS auto-detects continuous treatment
366
+ formula = "T_bc ~ " + " + ".join(cov_cols)
367
+ fit = CBPS(formula=formula, data=work, att=0, method="over")
368
+ print(fit.summary())
369
+ # Converged: True, J-statistic ≈ 0.0000
370
+ # CBGPS reduces covariate-treatment correlations
371
+ # (cf. Table 1 and Figure 3 in the paper, 15 covariates)
372
+ # Note: Over-identified GMM may fall back to exact method for this dataset
373
+ # due to high condition number of the covariate matrix
374
+ ```
375
+
376
+ ---
377
+
378
+ ## npCBPS: Nonparametric CBPS
379
+
380
+ npCBPS uses empirical likelihood to estimate balancing weights without parametric assumptions about the generalized propensity score.
381
+
382
+ ### When to Use
383
+
384
+ - Continuous treatment with unknown distribution
385
+ - When parametric assumptions may be violated
386
+ - When flexible, assumption-light estimation is preferred
387
+
388
+ ### Key Concept
389
+
390
+ Weights are chosen to maximize empirical likelihood subject to balance constraints:
391
+
392
+ $$\max \prod_{i=1}^n w_i \quad \text{s.t.} \quad \sum_i w_i X_i^* T_i^* = 0, \quad \sum_i w_i = n$$
393
+
394
+ ### Syntax
395
+
396
+ ```python
397
+ npCBPS(formula, data, corprior=0.01)
398
+ ```
399
+
400
+ | Parameter | Default | Description |
401
+ |:----------|:--------|:------------|
402
+ | `corprior` | None (auto: 0.1/n) | Prior penalty for correlation (larger = more tolerance for imbalance) |
403
+
404
+ ### Example
405
+
406
+ This example uses the Urban and Niebler (2014) data, as in Fong et al. (2018, Section 5). For faster execution, we use a random subset and the 8 base covariates (without squared terms):
407
+
408
+ > **Note**: npCBPS uses iterative empirical likelihood optimization. With the full dataset (n=16,265) and 15 covariates, computation may take several minutes. The subset below (n=2,000) runs in ~30 seconds and demonstrates the same methodology.
409
+
410
+ ```python
411
+ import numpy as np
412
+ from cbps import npCBPS
413
+ from cbps.datasets import load_political_ads
414
+
415
+ # Data preparation (same as CBGPS example)
416
+ df_raw, meta = load_political_ads()
417
+ work = df_raw.copy()
418
+ lam = meta["boxcox_lambda"]
419
+ work["T_bc"] = ((work["TotAds"].values + 1).clip(min=1e-10) ** lam - 1.0) / lam
420
+ work["logPop"] = np.log(work["TotalPop"].values.clip(min=1))
421
+ work["logInc"] = np.log(work["Inc"].values.clip(min=0) + 1)
422
+
423
+ cov_cols = ["logPop", "density", "logInc", "PercentHispanic",
424
+ "PercentBlack", "PercentOver65", "per_collegegrads", "CanCommute"]
425
+ work = work.dropna(subset=["T_bc"] + cov_cols).reset_index(drop=True)
426
+
427
+ # Random subset for demonstration (full dataset also supported)
428
+ np.random.seed(42)
429
+ idx = np.random.choice(len(work), 2000, replace=False)
430
+ subset = work.iloc[idx].reset_index(drop=True)
431
+
432
+ formula = "T_bc ~ " + " + ".join(cov_cols)
433
+ fit = npCBPS(formula=formula, data=subset, corprior=0.01)
434
+ print(fit.summary())
435
+ # Converged: ✓ Yes
436
+ # Weighted Correlations: Mean=0.000101, all < 0.0002 (near-perfect balance)
437
+ # Weight Distribution: Min=0.533, Max=1.670, Mean=1.000
438
+ # Effective sample size: 1965.2
439
+ # Efficiency: 98.3%
440
+ ```
441
+
442
+ ---
443
+
444
+ ## CBMSM: Marginal Structural Models
445
+
446
+ CBMSM extends CBPS to longitudinal settings with time-varying treatments and confounders, addressing the challenge that standard regression cannot properly adjust for time-dependent confounders affected by prior treatment.
447
+
448
+ ### When to Use
449
+
450
+ - Panel/longitudinal data with repeated measurements
451
+ - Time-varying treatments
452
+ - Time-dependent confounders affected by past treatment
453
+
454
+ ### Key Concept
455
+
456
+ At each time period, weights balance covariates across all potential future treatment sequences, conditional on observed treatment history:
457
+
458
+ $$E\left[w_i(\bar{T}_J, \bar{X}_J) \cdot X_{ij} \mid \bar{T}_{j-1}\right] = E[X_{ij} \mid \bar{T}_{j-1}]$$
459
+
460
+ ### Syntax
461
+
462
+ ```python
463
+ CBMSM(formula, id, time, data, type='MSM', time_vary=False)
464
+ ```
465
+
466
+ | Parameter | Description |
467
+ |:----------|:------------|
468
+ | `formula` | Treatment model formula |
469
+ | `id` | Unit identifier variable name |
470
+ | `time` | Time period variable name |
471
+ | `type` | `'MSM'` (marginal structural) or `'MultiBin'` (multiple binary) |
472
+ | `twostep` | Two-step GMM (`True`, default) or continuous updating (`False`) |
473
+ | `msm_variance` | `'approx'` (default) or `'full'` variance estimation |
474
+ | `time_vary` | Whether covariates are time-varying |
475
+
476
+ ### Example
477
+
478
+ Replicating the empirical application from Imai and Ratkovic (2015, Section 5) — the effect of negative campaign advertising on Democratic vote share using the Blackwell (2013) dataset:
479
+
480
+ ```python
481
+ import numpy as np
482
+ import statsmodels.api as sm
483
+ from cbps import CBMSM
484
+ from cbps.datasets import load_blackwell
485
+
486
+ # Blackwell (2013): 114 U.S. Senate/gubernatorial races, J=5 weekly periods
487
+ data = load_blackwell()
488
+
489
+ # Full treatment model from Section 5 (1548 balancing conditions)
490
+ fit = CBMSM(
491
+ formula='d.gone.neg ~ d.gone.neg.l1 + d.gone.neg.l2 + d.neg.frac.l3 + '
492
+ 'camp.length + deminc + base.poll + '
493
+ 'year.2002 + year.2004 + year.2006 + base.und + office',
494
+ id='demName',
495
+ time='time',
496
+ data=data,
497
+ type='MSM',
498
+ time_vary=True,
499
+ twostep=True,
500
+ msm_variance='approx'
501
+ )
502
+ print(fit.summary())
503
+
504
+ # Estimate cumulative effect of negative advertising on vote share
505
+ # (cf. Table 3, CBPS-Approx column: Cumulative effect = -1.43)
506
+ outcome = data.loc[data["time"] == data["time"].min(), "demprcnt"].values
507
+ X_cum = sm.add_constant(fit.treat_cum.reshape(-1, 1))
508
+ m_cum = sm.WLS(outcome, X_cum, weights=fit.fitted_values).fit()
509
+ print(f"Cumulative effect: {m_cum.params[1]:.2f} (SE: {m_cum.bse[1]:.2f})")
510
+ # Expected output: Cumulative effect: -1.44 (SE: 0.43)
511
+ # Paper Table 3 reports -1.43 (0.43) for CBPS-Approx
512
+ # Note: Convergence may show False due to optimizer tolerance settings,
513
+ # but the estimates closely match the published results.
514
+ ```
515
+
516
+ ---
517
+
518
+ ## hdCBPS: High-dimensional CBPS
519
+
520
+ hdCBPS uses LASSO regularization to handle settings where the number of covariates exceeds the sample size, while maintaining doubly robust properties.
521
+
522
+ ### When to Use
523
+
524
+ - High-dimensional settings (d >> n)
525
+ - Many potential confounders with unknown importance
526
+ - When variable selection is needed
527
+ - Doubly robust estimation desired
528
+
529
+ ### Key Concept
530
+
531
+ hdCBPS achieves the **weak covariate balancing property**:
532
+
533
+ $$\sum_i \left(\frac{T_i}{\tilde{\pi}_i} - 1\right) \alpha^{*\top} X_i \approx 0$$
534
+
535
+ where α* are outcome model coefficients. This enables root-n consistency even when d >> n.
536
+
537
+ ### Syntax
538
+
539
+ ```python
540
+ hdCBPS(formula, data, y, ATT=0)
541
+ ```
542
+
543
+ | Parameter | Description |
544
+ |:----------|:------------|
545
+ | `formula` | Propensity score model (can include many covariates) |
546
+ | `y` | Outcome variable name (required for variable selection) |
547
+ | `ATT` | 0 = ATE, 1 = ATT |
548
+
549
+ ### Example
550
+
551
+ ```python
552
+ from cbps import hdCBPS
553
+ import pandas as pd
554
+ import numpy as np
555
+
556
+ # Simulate high-dimensional data (d > n)
557
+ np.random.seed(42)
558
+ n, d = 200, 300
559
+ X = np.random.normal(0, 1, (n, d))
560
+ beta_true = np.zeros(d)
561
+ beta_true[:5] = [1, 0.5, 0.25, 0.1, 0.05] # Sparse true model
562
+ T = (X @ beta_true + np.random.normal(0, 1, n) > 0).astype(int)
563
+ Y = T + X[:, :3] @ [1, 0.5, 0.25] + np.random.normal(0, 1, n)
564
+
565
+ data = pd.DataFrame(X, columns=[f'X{i}' for i in range(d)])
566
+ data['T'] = T
567
+ data['Y'] = Y
568
+
569
+ # hdCBPS with automatic variable selection
570
+ import sys
571
+ sys.setrecursionlimit(5000) # Needed for patsy with many covariates
572
+ fit = hdCBPS(
573
+ formula='T ~ ' + ' + '.join([f'X{i}' for i in range(d)]),
574
+ data=data,
575
+ y='Y',
576
+ ATT=0
577
+ )
578
+ print(f"ATE estimate: {fit.ATE:.4f}")
579
+ print(f"SE: {fit.s:.4f}")
580
+ print(f"Selected variables (treated): {fit.n_selected_treat}")
581
+ print(f"Selected variables (control): {fit.n_selected_control}")
582
+ ```
583
+
584
+ > **Note**: Debug attributes (e.g., `result.debug_r_yhat1`) are now stored internally in the `result._debug` dict. Direct attribute access still works but emits a `DeprecationWarning`; use `result._debug['debug_r_yhat1']` instead.
585
+
586
+ ---
587
+
588
+ ## oCBPS: Optimal CBPS
589
+
590
+ Optimal CBPS improves upon standard CBPS by incorporating the outcome model structure, achieving doubly robust estimation with improved efficiency (Fan et al. 2022).
591
+
592
+ ### When to Use
593
+
594
+ - When doubly robust estimation is desired
595
+ - Outcome model structure is known or estimable
596
+ - Maximum efficiency is important
597
+
598
+ ### Key Concept
599
+
600
+ oCBPS solves for optimal balancing conditions that minimize asymptotic variance while maintaining consistency under either propensity score or outcome model misspecification. The optimal balancing function satisfies:
601
+
602
+ $$\alpha^\top f(X) = \pi(X) E[Y(0)|X] + (1-\pi(X)) E[Y(1)|X]$$
603
+
604
+ This gives greater weight to determinants of the mean potential outcome that is less likely to be realized.
605
+
606
+ ### Syntax
607
+
608
+ Optimal CBPS is accessed through the `CBPS()` function by specifying both baseline and difference formulas:
609
+
610
+ ```python
611
+ CBPS(formula, data, baseline_formula, diff_formula, att=0)
612
+ ```
613
+
614
+ | Parameter | Description |
615
+ |:----------|:------------|
616
+ | `formula` | Propensity score model formula |
617
+ | `baseline_formula` | Outcome model baseline covariates (K(X)) |
618
+ | `diff_formula` | Treatment effect covariates (L(X)) |
619
+ | `att` | Must be 0 (ATE only for oCBPS) |
620
+
621
+ ### Example
622
+
623
+ The baseline and diff formulas must satisfy the dimension constraint: m1 + m2 + 1 ≥ k, where m1 = number of baseline covariates, m2 = number of diff covariates, and k = number of propensity score parameters (including intercept).
624
+
625
+ ```python
626
+ from cbps import CBPS
627
+ from cbps.datasets import load_lalonde
628
+
629
+ data = load_lalonde()
630
+
631
+ # Optimal CBPS with outcome model specification
632
+ # Propensity score model: 9 parameters (intercept + 8 covariates), k=9
633
+ # Baseline K(X): 8 covariates, m1=8
634
+ # Diff L(X): 2 covariates, m2=2
635
+ # Dimension check: m1 + m2 + 1 = 11 >= 9 = k ✓ (over-identified)
636
+ fit = CBPS(
637
+ formula='treat ~ age + educ + black + hisp + married + nodegr + re74 + re75',
638
+ data=data,
639
+ baseline_formula='~ age + educ + black + hisp + married + nodegr + re74 + re75',
640
+ diff_formula='~ age + educ',
641
+ att=0
642
+ )
643
+ print(fit.summary())
644
+ # Coefficients:
645
+ # Estimate Std. Error z value Pr(>|z|)
646
+ # (Intercept) 1.175647 0.023190 50.696 0.000e+00 ***
647
+ # age 0.004057 0.137242 0.030 9.764e-01
648
+ # educ -0.069238 0.357867 -0.193 8.466e-01
649
+ # black -0.224812 0.017131 -13.123 0.000e+00 ***
650
+ # hisp -0.856508 0.002234 -383.434 0.000e+00 ***
651
+ # married 0.165491 0.002664 62.114 0.000e+00 ***
652
+ # nodegr -0.916259 0.005290 -173.202 0.000e+00 ***
653
+ # re74 -0.000035 0.000265 -0.130 8.964e-01
654
+ # re75 0.000068 0.000438 0.155 8.766e-01
655
+ #
656
+ # J - statistic: 3.94e-10
657
+ # Log-Likelihood: -293.6243
658
+ # Note: Convergence may show False due to optimizer tolerance,
659
+ # but the J-statistic near zero indicates excellent balance.
660
+ ```
661
+
662
+ ---
663
+
664
+ ## CBIV: Instrumental Variables
665
+
666
+ CBIV extends the covariate balancing framework to instrumental variable settings where treatment noncompliance exists.
667
+
668
+ ### When to Use
669
+
670
+ - Randomized experiments with noncompliance
671
+ - Observational studies with valid instruments
672
+ - Estimating local average treatment effects (LATE)
673
+
674
+ ### Syntax
675
+
676
+ ```python
677
+ # Formula interface (recommended)
678
+ CBIV(formula='treatment ~ covariates | instruments', data=df)
679
+
680
+ # Matrix interface
681
+ CBIV(Tr=treatment, Z=instrument, X=covariates)
682
+ ```
683
+
684
+ | Parameter | Default | Description |
685
+ |:----------|:--------|:------------|
686
+ | `formula` | None | IV formula: `'treat ~ x1 + x2 \| z'` (pipe separates covariates from instruments) |
687
+ | `data` | None | pandas DataFrame (required with formula) |
688
+ | `Tr` | None | Treatment array (matrix interface) |
689
+ | `Z` | None | Instrument array (matrix interface) |
690
+ | `X` | None | Covariate array (matrix interface) |
691
+ | `method` | `'over'` | `'exact'` or `'over'` (over-identified GMM) |
692
+ | `twostep` | `True` | Two-step GMM (`True`) or continuous updating (`False`) |
693
+ | `twosided` | `True` | Two-sided noncompliance (both always-takers and never-takers) |
694
+
695
+ ### Example
696
+
697
+ ```python
698
+ import numpy as np
699
+ import pandas as pd
700
+ from cbps import CBIV
701
+
702
+ # Simulate IV data with one-sided noncompliance
703
+ np.random.seed(42)
704
+ n = 500
705
+ X = np.random.randn(n, 2)
706
+ Z = np.random.binomial(1, 0.5, n) # Randomized instrument
707
+ p_comply = 1 / (1 + np.exp(-0.5 - 0.3 * X[:, 0]))
708
+ comply = np.random.binomial(1, p_comply, n)
709
+ Tr = Z * comply # Treatment = instrument × compliance
710
+
711
+ # Formula interface
712
+ df = pd.DataFrame({
713
+ 'treat': Tr, 'z': Z, 'x1': X[:, 0], 'x2': X[:, 1]
714
+ })
715
+ fit = CBIV(formula="treat ~ x1 + x2 | z", data=df,
716
+ method='over', twosided=False)
717
+ print(fit.summary())
718
+ # CBIV Estimation Results
719
+ # ===============================
720
+ # Sample size: 500
721
+ # Method: over
722
+ # Two-sided noncompliance: No
723
+ # Converged: Yes
724
+ #
725
+ # Model Statistics:
726
+ # J-statistic: 0.021656
727
+ # Complier Probabilities (π_c): Mean=0.6098
728
+ # Complier Weights (1/π_c): Mean=1.6431
729
+
730
+ # Matrix interface (equivalent)
731
+ fit2 = CBIV(Tr=Tr, Z=Z, X=X, method='over', twosided=False)
732
+ print(f"Converged: {fit2.converged}") # True
733
+ print(f"J-statistic: {fit2.J:.4f}") # 0.0217
734
+ ```
735
+
736
+ ---
737
+
738
+ ## Diagnostics
739
+
740
+ ### Balance Assessment
741
+
742
+ ```python
743
+ from cbps import CBPS, balance
744
+ from cbps.datasets import load_lalonde
745
+
746
+ data = load_lalonde()
747
+ fit = CBPS(
748
+ formula='treat ~ age + educ + black + hisp + married + nodegr + re74 + re75',
749
+ data=data, att=0, method='over'
750
+ )
751
+
752
+ # Balance assessment (cf. Table 3 in Imai & Ratkovic, 2014)
753
+ bal = balance(fit)
754
+ print(bal['balanced']) # Balance statistics after CBPS weighting
755
+ print(bal['original']) # Baseline unweighted statistics
756
+ # DataFrames have covariate names as row index for all estimator types
757
+ ```
758
+
759
+ ### Asymptotic Variance (AsyVar)
760
+
761
+ ```python
762
+ from cbps import CBPS, AsyVar
763
+ from cbps.datasets import load_lalonde
764
+
765
+ data = load_lalonde()
766
+ fit = CBPS(
767
+ formula='treat ~ age + educ + black + hisp + married + nodegr + re74 + re75',
768
+ data=data, att=0, method='over'
769
+ )
770
+
771
+ # Asymptotic variance for ATE
772
+ result = AsyVar(Y=data['re78'].values, CBPS_obj=fit, method='oCBPS')
773
+
774
+ # Preferred: snake_case keys
775
+ print(f"ATE: {result['mu_hat']:.3f} (SE: {result['std_err']:.3f})")
776
+ print(f"95% CI: [{result['ci_mu_hat'][0]:.1f}, {result['ci_mu_hat'][1]:.1f}]")
777
+
778
+ # Backward compatible: R-style keys also work
779
+ print(f"ATE: {result['mu.hat']:.3f}") # same value as result['mu_hat']
780
+ ```
781
+
782
+ ### Visualization
783
+
784
+ ```python
785
+ from cbps import CBPS, plot_cbps, plot_cbps_continuous
786
+ from cbps.datasets import load_lalonde, load_political_ads
787
+ import numpy as np
788
+
789
+ data = load_lalonde()
790
+ fit = CBPS(
791
+ formula='treat ~ age + educ + black + hisp + married + nodegr + re74 + re75',
792
+ data=data, att=0, method='over'
793
+ )
794
+
795
+ # Love plot for binary treatment balance (conceptually similar to Figure 1 in Imai & Ratkovic, 2014)
796
+ plot_cbps(fit)
797
+
798
+ # For continuous treatment (Fong et al. 2018, Figure 3 concept)
799
+ df_raw, meta = load_political_ads()
800
+ work = df_raw.copy()
801
+ lam = meta["boxcox_lambda"]
802
+ work["T_bc"] = ((work["TotAds"].values + 1).clip(min=1e-10) ** lam - 1.0) / lam
803
+ work["logPop"] = np.log(work["TotalPop"].values.clip(min=1))
804
+ work["logInc"] = np.log(work["Inc"].values.clip(min=0) + 1)
805
+ work["logPop_sq"] = work["logPop"] ** 2
806
+ work["density_sq"] = work["density"] ** 2
807
+ work["logInc_sq"] = work["logInc"] ** 2
808
+ work["PercentHispanic_sq"] = work["PercentHispanic"] ** 2
809
+ work["PercentBlack_sq"] = work["PercentBlack"] ** 2
810
+ work["PercentOver65_sq"] = work["PercentOver65"] ** 2
811
+ work["per_collegegrads_sq"] = work["per_collegegrads"] ** 2
812
+ cov_cols = ["logPop", "density", "logInc", "PercentHispanic",
813
+ "PercentBlack", "PercentOver65", "per_collegegrads", "CanCommute",
814
+ "logPop_sq", "density_sq", "logInc_sq", "PercentHispanic_sq",
815
+ "PercentBlack_sq", "PercentOver65_sq", "per_collegegrads_sq"]
816
+ work = work.dropna(subset=["T_bc"] + cov_cols).reset_index(drop=True)
817
+ fit_cont = CBPS(formula="T_bc ~ " + " + ".join(cov_cols), data=work, att=0, method="over")
818
+ plot_cbps_continuous(fit_cont)
819
+ ```
820
+
821
+ ### J-Statistic (Specification Test)
822
+
823
+ For over-identified CBPS, Hansen's J-statistic tests model specification:
824
+
825
+ $$J = n \cdot \bar{g}(\hat{\beta})' \hat{\Sigma}^{-1} \bar{g}(\hat{\beta}) \xrightarrow{d} \chi^2_{k}$$
826
+
827
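+ Here $n$ is the sample size, $\bar{g}(\hat{\beta})$ is the sample average of the moment conditions evaluated at the estimated coefficients, $\hat{\Sigma}$ is the estimated covariance matrix of those moment conditions, and $k$ is the degrees of freedom of the limiting chi-squared distribution (the example below uses the number of coefficients). A large J-statistic (small p-value) suggests potential model misspecification.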
828
+
829
+ ```python
830
+ from scipy import stats
831
+
832
+ # J-statistic is stored in fit.J
833
+ print(f"J-statistic: {fit.J:.4f}")
834
+
835
+ # Compute p-value manually
836
+ k = fit.coefficients.shape[0] # number of parameters
837
+ j_pvalue = 1 - stats.chi2.cdf(fit.J, k)
838
+ print(f"Degrees of freedom: {k}")
839
+ print(f"p-value: {j_pvalue:.4f}")
840
+ ```
841
+
842
+ ---
843
+
844
+ ## API Reference
845
+
846
+ ### Core Estimators
847
+
848
+ | Function | Treatment Type | Description |
849
+ |:---------|:---------------|:------------|
850
+ | `CBPS()` | Binary, Multi-valued, Continuous | Main estimator with automatic detection; also supports oCBPS via `baseline_formula`/`diff_formula` |
851
+ | `cbps_fit()` | Binary, Multi-valued, Continuous | Low-level array interface for CBPS |
852
+ | `npCBPS()` | Continuous | Nonparametric empirical likelihood |
853
+ | `npCBPS_fit()` | Continuous | Low-level array interface for npCBPS |
854
+ | `CBMSM()` | Time-varying Binary | Marginal structural models (formula interface) |
855
+ | `cbmsm_fit()` | Time-varying Binary | Low-level array interface for CBMSM |
856
+ | `hdCBPS()` | Binary | High-dimensional with LASSO regularization |
857
+ | `CBIV()` | Binary | Instrumental variables |
858
+
859
+ ### Diagnostics and Inference
860
+
861
+ | Function | Description |
862
+ |:---------|:------------|
863
+ | `balance()` | Covariate balance statistics: standardized mean differences (SMD) for discrete treatments, treatment-covariate correlations for continuous treatments |
864
+ | `AsyVar()` | Asymptotic variance estimation for ATE (returns both snake_case and R-style keys) |
865
+ | `vcov_outcome()` | Variance-covariance matrix for weighted outcome regression |
866
+ | `plot_cbps()` | Love plot for binary/multi-valued treatments |
867
+ | `plot_cbps_continuous()` | Correlation plot for continuous treatments |
868
+ | `plot_cbmsm()` | Balance plot for marginal structural models |
869
+ | `plot_npcbps()` | Balance plot for nonparametric CBPS |
870
+
871
+ ### Result Attributes (CBPSResults)
872
+
873
+ | Attribute | Type | Description |
874
+ |:----------|:-----|:------------|
875
+ | `coefficients` | ndarray (k, 1) or (k, m) | Propensity score model coefficients |
876
+ | `coef` | ndarray (k,) | Flattened coefficient vector (alias) |
877
+ | `weights` | ndarray (n,) | Balancing weights |
878
+ | `fitted_values` | ndarray (n,) | Predicted propensity scores |
879
+ | `fitted` | ndarray (n,) | Alias for `fitted_values` |
880
+ | `linear_predictor` | ndarray (n,) | X @ beta before link transformation |
881
+ | `J` | float | Hansen's J-statistic |
882
+ | `J_stat` | float | Alias for `J` |
883
+ | `deviance` | float | Model deviance (-2 * log-likelihood) |
884
+ | `var` | ndarray (k, k) | Variance-covariance matrix of coefficients |
885
+ | `converged` | bool | Optimization convergence status |
886
+ | `residuals` | ndarray (n,) | Deviance residuals |
887
+ | `pseudo_r2` | float | McFadden's pseudo R-squared |
888
+ | `sigmasq` | float or None | Residual variance (continuous treatment only) |
889
+
890
+ > **Note**: `str(result)` and `str(result.summary())` now include a Diagnostics block showing convergence status, weight distribution summary (Min/Max/Mean), and Effective Sample Size (ESS).
891
+
892
+ ### Result Methods
893
+
894
+ | Method | Description |
895
+ |:-------|:------------|
896
+ | `.summary()` | Returns `CBPSSummary` with coefficient table, SEs, z-values, p-values |
897
+ | `.vcov()` | Returns variance-covariance matrix |
898
+ | `.balance(**kwargs)` | Computes covariate balance diagnostics |
899
+ | `.predict(newdata, type)` | Predicts propensity scores; `type='response'` or `'link'` |
900
+ | `.plot(kind)` | Diagnostic plots; `kind='deviance'` or `'residuals'` |
901
+
902
+ ### Summary Methods for All Result Classes
903
+
904
+ All result classes now provide a consistent `summary()` method that returns a dedicated summary object (not a string). Use `print(result.summary())` for formatted output.
905
+
906
+ | Result Class | Summary Class | Key Contents |
907
+ |:-------------|:--------------|:-------------|
908
+ | `CBPSResults` | `CBPSSummary` | Coefficients, SEs, z-values, p-values, diagnostics |
909
+ | `CBMSMResults` | `CBMSMSummary` | Propensity scores, MSM weights, coefficients |
910
+ | `NPCBPSResults` | `NPCBPSSummary` | Convergence, optimization, weighted correlations, weight distribution |
911
+ | `HDCBPSResults` | `HDCBPSSummary` | ATE/ATT, variable selection, convergence |
912
+ | `CBIVResults` | `CBIVSummary` | Coefficients, J-statistic, balance |
913
+
914
+ ### Summary Attributes (CBPSSummary)
915
+
916
+ | Attribute | Type | Description |
917
+ |:----------|:-----|:------------|
918
+ | `coef` | ndarray | Coefficient estimates |
919
+ | `se` | ndarray | Standard errors |
920
+ | `zvalues` | ndarray | z-statistics |
921
+ | `pvalues` | ndarray | Two-sided p-values |
922
+
923
+ ---
924
+
925
+ ## Numerical Accuracy
926
+
927
+ This package maintains high numerical precision validated against the R CBPS package (v0.23):
928
+
929
+ | Component | Precision | Notes |
930
+ |:----------|:----------|:------|
931
+ | Coefficients | ±1e-6 | Core propensity score parameters |
932
+ | Weights | ±1e-6 | IPW weights |
933
+ | J-statistic | ±1e-4 | Specification test |
934
+ | Standard errors | ±1e-5 | Asymptotic variance |
935
+
936
+ Numerical accuracy is verified through extensive benchmark tests against R outputs using the LaLonde and Blackwell datasets.
937
+
938
+ ---
939
+
940
+ ## Datasets
941
+
942
+ The package includes classic datasets for causal inference research:
943
+
944
+ | Function | Description | Treatment Type | Reference |
945
+ |:---------|:------------|:---------------|:----------|
946
+ | `load_lalonde()` | NSW job training program evaluation | Binary | LaLonde (1986) |
947
+ | `load_lalonde_psid_combined()` | NSW experimental + PSID control data | Binary | Dehejia & Wahba (1999) |
948
+ | `load_blackwell()` | Longitudinal political campaign data | Time-varying Binary | Blackwell (2013) |
949
+ | `load_continuous_simulation()` | Simulated dose-response data (requires external data) | Continuous | Fong et al. (2018) |
950
+ | `load_political_ads()` | Political advertising efficacy | Continuous | Urban & Niebler (2014) |
951
+ | `load_npcbps_continuous_sim()` | Nonparametric CBPS validation data | Continuous | Fong et al. (2018) |
952
+
953
+ ### Example Usage
954
+
955
+ ```python
956
+ from cbps.datasets import (
957
+ load_lalonde,
958
+ load_lalonde_psid_combined,
959
+ load_blackwell,
960
+ load_political_ads
961
+ )
962
+
963
+ # LaLonde (1986) job training data - 445 observations
964
+ lalonde = load_lalonde()
965
+
966
+ # Combined NSW + PSID data for selection bias studies
967
+ lalonde_psid = load_lalonde_psid_combined()
968
+
969
+ # Blackwell (2013) negative campaign advertising - longitudinal data
970
+ blackwell = load_blackwell()
971
+
972
+ # Political ads efficacy data (Urban & Niebler 2014)
973
+ df_raw, meta = load_political_ads()
974
+ ```
975
+
976
+ ---
977
+
978
+ ## References
979
+
980
+ Imai, K., & Ratkovic, M. (2014). Covariate balancing propensity score. *Journal of the Royal Statistical Society Series B: Statistical Methodology*, 76(1), 243-263. [doi:10.1111/rssb.12027](https://doi.org/10.1111/rssb.12027)
981
+
982
+ Imai, K., & Ratkovic, M. (2015). Robust estimation of inverse probability weights for marginal structural models. *Journal of the American Statistical Association*, 110(511), 1013-1023. [doi:10.1080/01621459.2014.956872](https://doi.org/10.1080/01621459.2014.956872)
983
+
984
+ Fong, C., Hazlett, C., & Imai, K. (2018). Covariate balancing propensity score for a continuous treatment: Application to the efficacy of political advertisements. *The Annals of Applied Statistics*, 12(1), 156-177. [doi:10.1214/17-AOAS1101](https://doi.org/10.1214/17-AOAS1101)
985
+
986
+ Ning, Y., Peng, S., & Imai, K. (2020). Robust estimation of causal effects via a high-dimensional covariate balancing propensity score. *Biometrika*, 107(3), 533-554. [doi:10.1093/biomet/asaa020](https://doi.org/10.1093/biomet/asaa020)
987
+
988
+ Fan, J., Imai, K., Lee, I., Liu, H., Ning, Y., & Yang, X. (2022). Optimal covariate balancing conditions in propensity score estimation. *Journal of Business & Economic Statistics*, 41(1), 97-110. [doi:10.1080/07350015.2021.2002159](https://doi.org/10.1080/07350015.2021.2002159)
989
+
990
+ ## Authors
991
+
992
+ **Python Implementation:**
993
+
994
+ - **Xuanyu Cai**, City University of Macau
995
+ Email: [xuanyuCAI@outlook.com](mailto:xuanyuCAI@outlook.com)
996
+ - **Wenli Xu**, City University of Macau
997
+ Email: [wlxu@cityu.edu.mo](mailto:wlxu@cityu.edu.mo)
998
+
999
+ **Methodology:**
1000
+
1001
+ - **Kosuke Imai**, Harvard University
1002
+ - **Marc Ratkovic**, Princeton University
1003
+ - **Christian Fong**, Stanford University
1004
+ - **Chad Hazlett**, UCLA
1005
+ - **Yang Ning**, Cornell University
1006
+ - **Jianqing Fan**, Princeton University
1007
+
1008
+ ## License
1009
+
1010
+ AGPL-3.0. See [LICENSE](LICENSE) for details.
1011
+
1012
+ ## Citation
1013
+
1014
+ If you use this package in your research, please cite both the methodology papers and the Python implementation:
1015
+
1016
+ **APA Format:**
1017
+
1018
+ > Cai, X., & Xu, W. (2026). *cbps: Covariate Balancing Propensity Score for Python* (Version 0.2.0) [Computer software]. GitHub. https://github.com/gorgeousfish/CBPS-py
1019
+ >
1020
+ > Imai, K., & Ratkovic, M. (2014). Covariate balancing propensity score. *Journal of the Royal Statistical Society Series B: Statistical Methodology*, 76(1), 243-263.
1021
+ >
1022
+ > Imai, K., & Ratkovic, M. (2015). Robust estimation of inverse probability weights for marginal structural models. *Journal of the American Statistical Association*, 110(511), 1013-1023.
1023
+ >
1024
+ > Fong, C., Hazlett, C., & Imai, K. (2018). Covariate balancing propensity score for a continuous treatment: Application to the efficacy of political advertisements. *The Annals of Applied Statistics*, 12(1), 156-177.
1025
+ >
1026
+ > Ning, Y., Peng, S., & Imai, K. (2020). Robust estimation of causal effects via a high-dimensional covariate balancing propensity score. *Biometrika*, 107(3), 533-554.
1027
+ >
1028
+ > Fan, J., Imai, K., Lee, I., Liu, H., Ning, Y., & Yang, X. (2022). Optimal covariate balancing conditions in propensity score estimation. *Journal of Business & Economic Statistics*, 41(1), 97-110.
1029
+
1030
+ **BibTeX:**
1031
+
1032
+ ```bibtex
1033
+ @software{cbps2026python,
1034
+ title={cbps: Covariate Balancing Propensity Score for Python},
1035
+ author={Cai, Xuanyu and Xu, Wenli},
1036
+ year={2026},
1037
+ version={0.2.0},
1038
+ url={https://github.com/gorgeousfish/CBPS-py}
1039
+ }
1040
+
1041
+ @article{imai2014covariate,
1042
+ title={Covariate Balancing Propensity Score},
1043
+ author={Imai, Kosuke and Ratkovic, Marc},
1044
+ journal={Journal of the Royal Statistical Society Series B: Statistical Methodology},
1045
+ volume={76}, number={1}, pages={243--263},
1046
+ year={2014},
1047
+ doi={10.1111/rssb.12027}
1048
+ }
1049
+
1050
+ @article{imai2015robust,
1051
+ title={Robust Estimation of Inverse Probability Weights for Marginal Structural Models},
1052
+ author={Imai, Kosuke and Ratkovic, Marc},
1053
+ journal={Journal of the American Statistical Association},
1054
+ volume={110}, number={511}, pages={1013--1023},
1055
+ year={2015},
1056
+ doi={10.1080/01621459.2014.956872}
1057
+ }
1058
+
1059
+ @article{fong2018covariate,
1060
+ title={Covariate Balancing Propensity Score for a Continuous Treatment: Application to the Efficacy of Political Advertisements},
1061
+ author={Fong, Christian and Hazlett, Chad and Imai, Kosuke},
1062
+ journal={The Annals of Applied Statistics},
1063
+ volume={12}, number={1}, pages={156--177},
1064
+ year={2018},
1065
+ doi={10.1214/17-AOAS1101}
1066
+ }
1067
+
1068
+ @article{ning2020robust,
1069
+ title={Robust Estimation of Causal Effects via a High-Dimensional Covariate Balancing Propensity Score},
1070
+ author={Ning, Yang and Peng, Sida and Imai, Kosuke},
1071
+ journal={Biometrika},
1072
+ volume={107}, number={3}, pages={533--554},
1073
+ year={2020},
1074
+ doi={10.1093/biomet/asaa020}
1075
+ }
1076
+
1077
+ @article{fan2022optimal,
1078
+ title={Optimal Covariate Balancing Conditions in Propensity Score Estimation},
1079
+ author={Fan, Jianqing and Imai, Kosuke and Lee, Inbeom and Liu, Han and Ning, Yang and Yang, Xiaolin},
1080
+ journal={Journal of Business \& Economic Statistics},
1081
+ volume={41}, number={1}, pages={97--110},
1082
+ year={2022},
1083
+ doi={10.1080/07350015.2021.2002159}
1084
+ }
1085
+ ```
1086
+
1087
+ ## See Also
1088
+
1089
+ - Original R package by Fong, Ratkovic, Imai, Hazlett, and Yang: https://CRAN.R-project.org/package=CBPS
1090
+ - Paper: Imai, K., & Ratkovic, M. (2014). Covariate balancing propensity score. https://doi.org/10.1111/rssb.12027