cbps 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cbps/__init__.py +3462 -0
- cbps/constants.py +46 -0
- cbps/core/__init__.py +93 -0
- cbps/core/cbps_binary.py +1943 -0
- cbps/core/cbps_continuous.py +945 -0
- cbps/core/cbps_multitreat.py +1123 -0
- cbps/core/cbps_optimal.py +507 -0
- cbps/core/results.py +1447 -0
- cbps/data/Blackwell.csv +571 -0
- cbps/data/LaLonde.csv +3213 -0
- cbps/data/npcbps_continuous_sim.csv +501 -0
- cbps/data/nsw.csv +723 -0
- cbps/data/nsw_dw.csv +446 -0
- cbps/data/political_ads_urban_niebler.csv +16266 -0
- cbps/data/psid_controls.csv +2491 -0
- cbps/data/psid_controls2.csv +254 -0
- cbps/data/psid_controls3.csv +129 -0
- cbps/data/simulation_dgp1_seed12345.csv +201 -0
- cbps/data/simulation_dgp2_seed12345.csv +201 -0
- cbps/data/simulation_dgp3_seed12345.csv +201 -0
- cbps/data/simulation_dgp4_seed12345.csv +201 -0
- cbps/datasets/__init__.py +78 -0
- cbps/datasets/blackwell.py +112 -0
- cbps/datasets/continuous.py +223 -0
- cbps/datasets/lalonde.py +272 -0
- cbps/datasets/npcbps_sim.py +101 -0
- cbps/diagnostics/__init__.py +101 -0
- cbps/diagnostics/balance.py +760 -0
- cbps/diagnostics/balance_cbmsm_addon.py +162 -0
- cbps/diagnostics/continuous_diagnostics.py +259 -0
- cbps/diagnostics/normality.py +173 -0
- cbps/diagnostics/ocbps_conditions.py +197 -0
- cbps/diagnostics/overlap.py +198 -0
- cbps/diagnostics/plots.py +1193 -0
- cbps/diagnostics/weights_diag.py +205 -0
- cbps/highdim/__init__.py +84 -0
- cbps/highdim/gmm_loss.py +340 -0
- cbps/highdim/hdcbps.py +1078 -0
- cbps/highdim/lasso_utils.py +498 -0
- cbps/highdim/weight_funcs.py +298 -0
- cbps/inference/__init__.py +42 -0
- cbps/inference/asyvar.py +621 -0
- cbps/inference/vcov_outcome.py +217 -0
- cbps/iv/__init__.py +48 -0
- cbps/iv/cbiv.py +2603 -0
- cbps/logging_config.py +45 -0
- cbps/msm/__init__.py +45 -0
- cbps/msm/cbmsm.py +1871 -0
- cbps/msm/rank_diagnostics.py +112 -0
- cbps/nonparametric/__init__.py +58 -0
- cbps/nonparametric/cholesky_whitening.py +232 -0
- cbps/nonparametric/empirical_likelihood.py +339 -0
- cbps/nonparametric/npcbps.py +1036 -0
- cbps/nonparametric/taylor_approx.py +207 -0
- cbps/py.typed +0 -0
- cbps/sklearn/__init__.py +42 -0
- cbps/sklearn/estimator.py +378 -0
- cbps/utils/__init__.py +82 -0
- cbps/utils/formula.py +415 -0
- cbps/utils/helpers.py +378 -0
- cbps/utils/numerics.py +438 -0
- cbps/utils/r_compat.py +109 -0
- cbps/utils/validation.py +224 -0
- cbps/utils/variance_transform.py +483 -0
- cbps/utils/weights.py +586 -0
- cbps-0.2.0.dist-info/METADATA +1090 -0
- cbps-0.2.0.dist-info/RECORD +70 -0
- cbps-0.2.0.dist-info/WHEEL +5 -0
- cbps-0.2.0.dist-info/licenses/LICENSE +661 -0
- cbps-0.2.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,217 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Variance Adjustment for Weighted Outcome Regression
|
|
3
|
+
====================================================
|
|
4
|
+
|
|
5
|
+
Sandwich variance estimator for weighted least squares regression using
|
|
6
|
+
CBPS weights from continuous treatment models. Adjusts standard errors to
|
|
7
|
+
account for estimation uncertainty in the generalized propensity score.
|
|
8
|
+
|
|
9
|
+
The methodology follows Section 3.2 of Fong, Hazlett, and Imai (2018),
|
|
10
|
+
which derives the asymptotic variance of the weighted least squares
|
|
11
|
+
estimator by viewing it as a method of moments estimator based on the
|
|
12
|
+
combined moment conditions for propensity score estimation and outcome
|
|
13
|
+
regression.
|
|
14
|
+
|
|
15
|
+
References
|
|
16
|
+
----------
|
|
17
|
+
Fong, C., Hazlett, C., and Imai, K. (2018). Covariate balancing propensity
|
|
18
|
+
score for a continuous treatment. The Annals of Applied Statistics,
|
|
19
|
+
12(1), 156-177. https://doi.org/10.1214/17-AOAS1101
|
|
20
|
+
|
|
21
|
+
Newey, W. K. and McFadden, D. (1994). Large sample estimation and
|
|
22
|
+
hypothesis testing. In Handbook of Econometrics, Vol. IV, 2111-2245.
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
from typing import Union
|
|
26
|
+
import numpy as np
|
|
27
|
+
from cbps.core.results import CBPSResults
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def vcov_outcome(
    cbps_fit: "CBPSResults",
    Y: np.ndarray,
    Z: np.ndarray,
    delta: np.ndarray,
    tol: float = 1e-5,
    lambda_: float = 0.01
) -> np.ndarray:
    """
    Compute adjusted variance-covariance matrix for weighted outcome regression.

    Adjusts standard errors to account for uncertainty in CBPS weight
    estimation when using continuous treatment weights. Implements the
    asymptotic variance formula from Section 3.2 of Fong, Hazlett, and
    Imai (2018), treating the weighted regression as a GMM estimator.

    Parameters
    ----------
    cbps_fit : CBPSResults
        Fitted continuous treatment CBPS object with attributes: Ttilde
        (standardized treatment), Xtilde (whitened covariates), beta_tilde,
        sigmasq_tilde, and weights. ``Ttilde``, ``weights`` and
        ``beta_tilde`` are flattened to 1-D, so column vectors are accepted;
        ``sigmasq_tilde`` must hold exactly one value.
    Y : array-like of shape (n,)
        Outcome variable.
    Z : array-like of shape (n, q)
        Outcome model design matrix (including treatment and intercept).
        A (q, n) matrix is transposed automatically when its orientation
        is unambiguous.
    delta : array-like of shape (q,)
        WLS coefficients from the weighted outcome regression.
    tol : float, default=1e-5
        Condition number tolerance. If the smallest singular value of M
        divided by the largest is below tol, regularization is applied.
    lambda_ : float, default=0.01
        Ridge regularization constant added to diagonal of M when
        ill-conditioned.

    Returns
    -------
    V : ndarray of shape (q, q)
        Adjusted variance-covariance matrix for delta.

    Raises
    ------
    ValueError
        If cbps_fit lacks continuous treatment attributes, dimensions
        are incompatible, or ``tol`` / ``lambda_`` are out of range.
    numpy.linalg.LinAlgError
        If the (possibly regularized) M matrix or the weighted Gram
        matrix of Z is singular.

    See Also
    --------
    asy_var : Variance estimation for binary treatment ATE.

    Notes
    -----
    The variance formula accounts for estimation uncertainty in both the
    propensity score parameters (beta, sigma^2) and the outcome regression
    coefficients (delta). The sandwich estimator follows Newey and McFadden
    (1994, Theorem 6.1).

    Examples
    --------
    >>> import statsmodels.api as sm
    >>> from cbps import CBPS, vcov_outcome
    >>> fit = CBPS('T ~ X1 + X2 + X3', data=df, att=False)
    >>> Z = sm.add_constant(df[['T', 'X1', 'X2']])
    >>> wls = sm.WLS(df['Y'], Z, weights=fit.weights).fit()
    >>> V_adj = vcov_outcome(fit, df['Y'], Z, wls.params)
    >>> se_adj = np.sqrt(np.diag(V_adj))
    """
    # Input validation: the fit must carry the continuous-treatment attributes.
    if getattr(cbps_fit, 'Ttilde', None) is None:
        raise ValueError(
            "cbps_fit must be a continuous treatment CBPS object with Ttilde "
            "attribute. For binary treatments, use asy_var() instead."
        )
    for attr_name in ('Xtilde', 'beta_tilde', 'sigmasq_tilde'):
        if getattr(cbps_fit, attr_name, None) is None:
            raise ValueError(f"cbps_fit missing {attr_name} attribute")

    # Convert everything to float arrays with a predictable rank, so that a
    # column-vector beta_tilde / Ttilde cannot silently broadcast to (N, N).
    Y = np.asarray(Y, dtype=float).ravel()
    Z = np.asarray(Z, dtype=float)
    delta = np.asarray(delta, dtype=float).ravel()
    Ttilde = np.asarray(cbps_fit.Ttilde, dtype=float).ravel()
    Xtilde = np.asarray(cbps_fit.Xtilde, dtype=float)
    w = np.asarray(cbps_fit.weights, dtype=float).ravel()
    beta_tilde = np.asarray(cbps_fit.beta_tilde, dtype=float).ravel()
    sigmasq_arr = np.asarray(cbps_fit.sigmasq_tilde, dtype=float)

    # Dimension validation and shape normalization
    N = len(Y)
    if N == 0:
        raise ValueError("Y must contain at least one observation")
    if Xtilde.ndim != 2:
        raise ValueError("Xtilde must be a 2D matrix")
    if Z.ndim != 2:
        raise ValueError("Z must be a 2D matrix")
    # Accept matrices supplied as (columns, N) when the orientation is clear.
    if Xtilde.shape[0] != N and Xtilde.shape[1] == N:
        Xtilde = Xtilde.T
    if Z.shape[0] != N and Z.shape[1] == N:
        Z = Z.T
    if len(Ttilde) != N:
        raise ValueError(f"Ttilde length ({len(Ttilde)}) does not match Y length ({N})")
    if Xtilde.shape[0] != N:
        raise ValueError(f"Xtilde row count ({Xtilde.shape[0]}) does not match Y length ({N})")
    if Z.shape[0] != N:
        raise ValueError(f"Z row count ({Z.shape[0]}) does not match Y length ({N})")
    if len(delta) != Z.shape[1]:
        raise ValueError(f"delta length ({len(delta)}) does not match Z column count ({Z.shape[1]})")
    if len(w) != N:
        raise ValueError(f"weights length ({len(w)}) does not match Y length ({N})")
    if len(beta_tilde) != Xtilde.shape[1]:
        raise ValueError(
            f"beta_tilde length ({len(beta_tilde)}) does not match "
            f"Xtilde column count ({Xtilde.shape[1]})"
        )
    if sigmasq_arr.size != 1:
        raise ValueError(
            f"sigmasq_tilde must be a scalar, got {sigmasq_arr.size} values"
        )
    sigmasq_tilde = float(sigmasq_arr.item())

    # Parameter validation
    if tol <= 0:
        raise ValueError(f"tol must be positive, got {tol}")
    if tol > 1.0:
        import warnings
        warnings.warn(
            f"tol={tol} > 1 triggers regularization unconditionally",
            UserWarning,
            stacklevel=2,
        )
    if lambda_ < 0:
        raise ValueError(f"lambda_ must be >= 0, got {lambda_}")

    # K = number of columns in the propensity design, P = outcome design.
    K = Xtilde.shape[1]
    inv_sigmasq = 1.0 / sigmasq_tilde

    # Residuals from propensity and outcome models
    eps_beta = Ttilde - Xtilde @ beta_tilde
    eps_delta = Y - Z @ delta

    # Per-observation factor shared by M22 and the last column of Stheta.
    sigma_factor = 0.5 * inv_sigmasq * w * (1.0 - inv_sigmasq * eps_beta**2)

    # M-matrix: Jacobian of moment conditions (Section 3.2, Fong et al. 2018).
    # Each block is a sample average; sums of outer products are written as
    # matrix products instead of per-observation Python loops.
    M11 = np.mean(-2.0 * inv_sigmasq * eps_beta[:, None] * Xtilde, axis=0)
    M12 = np.mean(-inv_sigmasq**2 * eps_beta**2)
    M21 = -inv_sigmasq * (Xtilde * (w * Ttilde * eps_beta)[:, None]).T @ Xtilde / N
    M22 = np.mean((sigma_factor * Ttilde)[:, None] * Xtilde, axis=0)
    M = np.vstack([
        np.hstack([M11, [M12]]),
        np.hstack([M21, M22[:, None]])
    ])

    # Sdelta: negative weighted Gram matrix of Z (P x P).
    Sdelta = -(Z * w[:, None]).T @ Z / N
    # Stheta: P x (K + 1); first K columns pair with beta, the last with sigma^2.
    Stheta = np.hstack([
        -inv_sigmasq * (Z * (w * eps_beta * eps_delta)[:, None]).T @ Xtilde,
        ((sigma_factor * eps_delta) @ Z)[:, None]
    ]) / N

    # Ridge regularization if M is ill-conditioned. Comparing
    # sv_min < tol * sv_max is the same test as sv_max / sv_min > 1 / tol
    # but does not divide by zero when M is exactly singular.
    sv = np.linalg.svd(M, compute_uv=False)
    if sv[-1] < tol * sv[0]:
        M = M + lambda_ * np.eye(K + 1)

    # Sandwich variance estimator (Section 3.2, Fong et al. 2018)
    s = (w * eps_delta)[:, None] * Z
    mtheta = np.hstack([
        (inv_sigmasq * eps_beta**2 - 1.0)[:, None],
        (w * Ttilde)[:, None] * Xtilde
    ])

    # Row i of `adjusted` is s[i] - Stheta @ inv(M) @ mtheta[i]; solving
    # M.T X = Stheta.T yields (Stheta @ inv(M)).T without forming inv(M).
    adjusted = s - mtheta @ np.linalg.solve(M.T, Stheta.T)
    inner = adjusted.T @ adjusted / N

    Sdelta_inv = np.linalg.inv(Sdelta)
    V = Sdelta_inv @ inner @ Sdelta_inv.T / N

    return V
|
|
217
|
+
|
cbps/iv/__init__.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Covariate Balancing Propensity Score for Instrumental Variables (CBIV)
|
|
3
|
+
=======================================================================
|
|
4
|
+
|
|
5
|
+
This module implements the Covariate Balancing Propensity Score (CBPS) methodology
|
|
6
|
+
for instrumental variable (IV) settings with treatment noncompliance. CBIV estimates
|
|
7
|
+
compliance type probabilities using generalized method of moments (GMM), simultaneously
|
|
8
|
+
optimizing covariate balance among compliers and prediction of treatment assignment.
|
|
9
|
+
|
|
10
|
+
In IV settings with noncompliance, units can be classified into principal strata
|
|
11
|
+
based on their potential treatment status under different instrument values:
|
|
12
|
+
|
|
13
|
+
- **Compliers**: Units who take treatment when encouraged (Z=1) and do not
|
|
14
|
+
take treatment when not encouraged (Z=0).
|
|
15
|
+
- **Always-takers**: Units who take treatment regardless of encouragement.
|
|
16
|
+
- **Never-takers**: Units who do not take treatment regardless of encouragement.
|
|
17
|
+
|
|
18
|
+
The local average treatment effect (LATE) is identified among compliers. CBIV
|
|
19
|
+
provides weights (inverse of complier probability) that can be used for
|
|
20
|
+
downstream causal effect estimation.
|
|
21
|
+
|
|
22
|
+
Key Components
|
|
23
|
+
--------------
|
|
24
|
+
- ``CBIV``: Main function for estimating compliance type propensity scores
|
|
25
|
+
- ``CBIVResults``: Result container with fitted compliance probabilities and weights
|
|
26
|
+
- ``CBIVNumericalWarning``: Warning class for numerical stability issues
|
|
27
|
+
|
|
28
|
+
Noncompliance Models
|
|
29
|
+
--------------------
|
|
30
|
+
- **Two-sided noncompliance** (default): Models compliers, always-takers, and
|
|
31
|
+
never-takers using multinomial logistic regression with three compliance types.
|
|
32
|
+
- **One-sided noncompliance**: Models compliers and never-takers only (assumes
|
|
33
|
+
no always-takers), using binary logistic regression.
|
|
34
|
+
|
|
35
|
+
References
|
|
36
|
+
----------
|
|
37
|
+
Imai, K. and Ratkovic, M. (2014). Covariate Balancing Propensity Score.
|
|
38
|
+
Journal of the Royal Statistical Society: Series B (Statistical Methodology),
|
|
39
|
+
76(1), 243-263. https://doi.org/10.1111/rssb.12027
|
|
40
|
+
|
|
41
|
+
Angrist, J. D., Imbens, G. W., and Rubin, D. B. (1996). Identification of
|
|
42
|
+
Causal Effects Using Instrumental Variables. Journal of the American
|
|
43
|
+
Statistical Association, 91(434), 444-455. https://doi.org/10.1080/01621459.1996.10476902
|
|
44
|
+
"""
|
|
45
|
+
|
|
46
|
+
from .cbiv import CBIV, CBIVResults, CBIVNumericalWarning
|
|
47
|
+
|
|
48
|
+
# Public API of this package: the three names imported from ``.cbiv`` above.
__all__ = ["CBIV", "CBIVResults", "CBIVNumericalWarning"]
|