cbps 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cbps/__init__.py +3462 -0
- cbps/constants.py +46 -0
- cbps/core/__init__.py +93 -0
- cbps/core/cbps_binary.py +1943 -0
- cbps/core/cbps_continuous.py +945 -0
- cbps/core/cbps_multitreat.py +1123 -0
- cbps/core/cbps_optimal.py +507 -0
- cbps/core/results.py +1447 -0
- cbps/data/Blackwell.csv +571 -0
- cbps/data/LaLonde.csv +3213 -0
- cbps/data/npcbps_continuous_sim.csv +501 -0
- cbps/data/nsw.csv +723 -0
- cbps/data/nsw_dw.csv +446 -0
- cbps/data/political_ads_urban_niebler.csv +16266 -0
- cbps/data/psid_controls.csv +2491 -0
- cbps/data/psid_controls2.csv +254 -0
- cbps/data/psid_controls3.csv +129 -0
- cbps/data/simulation_dgp1_seed12345.csv +201 -0
- cbps/data/simulation_dgp2_seed12345.csv +201 -0
- cbps/data/simulation_dgp3_seed12345.csv +201 -0
- cbps/data/simulation_dgp4_seed12345.csv +201 -0
- cbps/datasets/__init__.py +78 -0
- cbps/datasets/blackwell.py +112 -0
- cbps/datasets/continuous.py +223 -0
- cbps/datasets/lalonde.py +272 -0
- cbps/datasets/npcbps_sim.py +101 -0
- cbps/diagnostics/__init__.py +101 -0
- cbps/diagnostics/balance.py +760 -0
- cbps/diagnostics/balance_cbmsm_addon.py +162 -0
- cbps/diagnostics/continuous_diagnostics.py +259 -0
- cbps/diagnostics/normality.py +173 -0
- cbps/diagnostics/ocbps_conditions.py +197 -0
- cbps/diagnostics/overlap.py +198 -0
- cbps/diagnostics/plots.py +1193 -0
- cbps/diagnostics/weights_diag.py +205 -0
- cbps/highdim/__init__.py +84 -0
- cbps/highdim/gmm_loss.py +340 -0
- cbps/highdim/hdcbps.py +1078 -0
- cbps/highdim/lasso_utils.py +498 -0
- cbps/highdim/weight_funcs.py +298 -0
- cbps/inference/__init__.py +42 -0
- cbps/inference/asyvar.py +621 -0
- cbps/inference/vcov_outcome.py +217 -0
- cbps/iv/__init__.py +48 -0
- cbps/iv/cbiv.py +2603 -0
- cbps/logging_config.py +45 -0
- cbps/msm/__init__.py +45 -0
- cbps/msm/cbmsm.py +1871 -0
- cbps/msm/rank_diagnostics.py +112 -0
- cbps/nonparametric/__init__.py +58 -0
- cbps/nonparametric/cholesky_whitening.py +232 -0
- cbps/nonparametric/empirical_likelihood.py +339 -0
- cbps/nonparametric/npcbps.py +1036 -0
- cbps/nonparametric/taylor_approx.py +207 -0
- cbps/py.typed +0 -0
- cbps/sklearn/__init__.py +42 -0
- cbps/sklearn/estimator.py +378 -0
- cbps/utils/__init__.py +82 -0
- cbps/utils/formula.py +415 -0
- cbps/utils/helpers.py +378 -0
- cbps/utils/numerics.py +438 -0
- cbps/utils/r_compat.py +109 -0
- cbps/utils/validation.py +224 -0
- cbps/utils/variance_transform.py +483 -0
- cbps/utils/weights.py +586 -0
- cbps-0.2.0.dist-info/METADATA +1090 -0
- cbps-0.2.0.dist-info/RECORD +70 -0
- cbps-0.2.0.dist-info/WHEEL +5 -0
- cbps-0.2.0.dist-info/licenses/LICENSE +661 -0
- cbps-0.2.0.dist-info/top_level.txt +1 -0
cbps/core/results.py
ADDED
|
@@ -0,0 +1,1447 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Result Classes for Covariate Balancing Propensity Score Estimation
|
|
3
|
+
|
|
4
|
+
==================================================================
|
|
5
|
+
|
|
6
|
+
This module implements the primary result containers for CBPS estimators,
|
|
7
|
+
providing a unified interface for accessing estimation results, conducting
|
|
8
|
+
statistical inference, and performing diagnostic assessments.
|
|
9
|
+
|
|
10
|
+
The module contains two main classes:
|
|
11
|
+
|
|
12
|
+
- :class:`CBPSResults`: Main result object containing all fitted model components
|
|
13
|
+
- :class:`CBPSSummary`: Statistical summary with coefficient table and diagnostics
|
|
14
|
+
|
|
15
|
+
These classes implement a comprehensive statistical modeling interface with
|
|
16
|
+
methods for inference, prediction, and diagnostic evaluation, maintaining
|
|
17
|
+
compatibility with established statistical software conventions while following
|
|
18
|
+
Python best practices.
|
|
19
|
+
|
|
20
|
+
Mathematical Framework
|
|
21
|
+
----------------------
|
|
22
|
+
|
|
23
|
+
The CBPS estimator solves the generalized method of moments (GMM) optimization
|
|
24
|
+
problem:
|
|
25
|
+
|
|
26
|
+
min_β ḡ(β)' Σ^(-1) ḡ(β)
|
|
27
|
+
|
|
28
|
+
where ḡ(β) is the sample average of moment conditions combining:
|
|
29
|
+
|
|
30
|
+
1. Score function: ∂ℓ(β)/∂β for treatment prediction
|
|
31
|
+
2. Balance conditions: (T_i - e(X_i,β))X_i for covariate balance
|
|
32
|
+
|
|
33
|
+
The resulting weights are:
|
|
34
|
+
|
|
35
|
+
w_i = T_i/e(X_i,β) - (1-T_i)/(1-e(X_i,β))
|
|
36
|
+
|
|
37
|
+
which satisfy the moment conditions E[w_i X_i] = 0 when correctly specified.
|
|
38
|
+
|
|
39
|
+
References
|
|
40
|
+
----------
|
|
41
|
+
Imai, K. and Ratkovic, M. (2014). Covariate balancing propensity score.
|
|
42
|
+
Journal of the Royal Statistical Society, Series B 76(1), 243-263.
|
|
43
|
+
https://doi.org/10.1111/rssb.12027
|
|
44
|
+
"""
|
|
45
|
+
|
|
46
|
+
from typing import Optional, List, Union
|
|
47
|
+
|
|
48
|
+
import numpy as np
|
|
49
|
+
import pandas as pd
|
|
50
|
+
import scipy.stats
|
|
51
|
+
import warnings
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def j_test_pvalue(J, n_moment_conditions, n_parameters):
    """Compute asymptotic p-value for Hansen's J-test of overidentification.

    Under H0 (model correctly specified):
        J ~ chi2(df), where df = n_moment_conditions - n_parameters

    Parameters
    ----------
    J : float
        Hansen's J-statistic value. Must be non-negative (quadratic form).
    n_moment_conditions : int
        Number of moment conditions in the GMM system.
    n_parameters : int
        Number of estimated parameters.

    Returns
    -------
    float or None
        Upper-tail probability ``P(chi2(df) >= J)``.
        Returns None when ``df <= 0`` (no over-identifying restrictions,
        so there is nothing to test).

    Raises
    ------
    ValueError
        If J is negative or NaN (indicates upstream computation error).

    References
    ----------
    Imai, K. & Ratkovic, M. (2014), JRSSB. Section 3.
    Hansen, L.P. (1982), Econometrica.
    """
    # Validate J-statistic
    if np.isnan(J):
        raise ValueError(
            "J-statistic is NaN. This indicates an error in the GMM "
            "objective computation. Check for numerical issues in estimation."
        )
    if J < 0:
        raise ValueError(
            f"J-statistic must be non-negative (it is a quadratic form), "
            f"got J={J:.6g}. This indicates an error in the GMM objective "
            f"computation."
        )

    df = n_moment_conditions - n_parameters
    if df <= 0:
        return None  # Just-identified: no overidentification test

    # Use the survival function instead of ``1 - cdf``: for large J the cdf
    # rounds to exactly 1.0 and the subtraction collapses to 0.0, whereas
    # ``sf`` evaluates the upper tail directly and keeps its precision.
    return scipy.stats.chi2.sf(J, df)
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
class CBPSResults:
|
|
106
|
+
"""
|
|
107
|
+
Result object from CBPS estimation.
|
|
108
|
+
|
|
109
|
+
This class encapsulates all outputs from the CBPS fitting procedure,
|
|
110
|
+
providing a unified interface for accessing coefficients, weights,
|
|
111
|
+
fitted propensity scores, and diagnostic statistics.
|
|
112
|
+
|
|
113
|
+
Attributes
|
|
114
|
+
----------
|
|
115
|
+
coefficients : ndarray, shape (k, 1) or (k, n_treats-1)
|
|
116
|
+
Estimated propensity score model coefficients.
|
|
117
|
+
For binary treatment: (k, 1) matrix.
|
|
118
|
+
For multi-valued treatment: (k, n_treats-1) matrix.
|
|
119
|
+
fitted_values : ndarray, shape (n,)
|
|
120
|
+
Fitted propensity scores for each observation.
|
|
121
|
+
weights : ndarray, shape (n,)
|
|
122
|
+
Optimal inverse probability weights for causal effect estimation.
|
|
123
|
+
linear_predictor : ndarray, shape (n,)
|
|
124
|
+
Linear predictor X @ beta before link function transformation.
|
|
125
|
+
y : ndarray, shape (n,)
|
|
126
|
+
Treatment assignment vector.
|
|
127
|
+
x : ndarray, shape (n, k)
|
|
128
|
+
Covariate matrix including intercept.
|
|
129
|
+
J : float
|
|
130
|
+
Hansen J-statistic for the GMM over-identification test.
|
|
131
|
+
mle_J : float
|
|
132
|
+
J-statistic evaluated at MLE estimates (baseline comparison).
|
|
133
|
+
deviance : float
|
|
134
|
+
Model deviance (-2 * log-likelihood).
|
|
135
|
+
converged : bool
|
|
136
|
+
Whether the optimization algorithm converged successfully.
|
|
137
|
+
var : ndarray, shape (k, k)
|
|
138
|
+
Variance-covariance matrix of coefficients (sandwich estimator).
|
|
139
|
+
coef_names : list of str
|
|
140
|
+
Names of coefficients extracted from the model formula.
|
|
141
|
+
call_info : str
|
|
142
|
+
String representation of the function call.
|
|
143
|
+
formula : str or None
|
|
144
|
+
The model formula used for fitting (if formula interface was used).
|
|
145
|
+
att : int or None
|
|
146
|
+
Target estimand: 0 for ATE, 1 for ATT.
|
|
147
|
+
method : str or None
|
|
148
|
+
Estimation method: 'over' (over-identified) or 'exact' (just-identified).
|
|
149
|
+
standardize : bool or None
|
|
150
|
+
Whether weights are standardized to sum to sample size.
|
|
151
|
+
two_step : bool or None
|
|
152
|
+
Whether two-step GMM estimator was used.
|
|
153
|
+
sigmasq : float or None
|
|
154
|
+
Residual variance estimate (continuous treatment only).
|
|
155
|
+
Ttilde : ndarray or None
|
|
156
|
+
Standardized treatment (zero mean, unit variance) for continuous treatment.
|
|
157
|
+
Used by vcov_outcome for variance estimation.
|
|
158
|
+
Xtilde : ndarray or None
|
|
159
|
+
Cholesky-whitened covariates for continuous treatment.
|
|
160
|
+
Used by vcov_outcome for variance estimation.
|
|
161
|
+
beta_tilde : ndarray or None
|
|
162
|
+
Coefficients in whitened space for continuous treatment.
|
|
163
|
+
sigmasq_tilde : float or None
|
|
164
|
+
Variance in whitened space for continuous treatment.
|
|
165
|
+
treat_names : list of str or None
|
|
166
|
+
Treatment level names for multi-valued treatment.
|
|
167
|
+
Example: ['Control', 'Low', 'High'] for 3-valued treatment.
|
|
168
|
+
na_action : dict or None
|
|
169
|
+
Missing value handling information containing:
|
|
170
|
+
- 'method': handling method ('omit', 'fail', 'ignore')
|
|
171
|
+
- 'n_dropped': number of dropped observations (only for method='omit')
|
|
172
|
+
|
|
173
|
+
Examples
|
|
174
|
+
--------
|
|
175
|
+
>>> fit = CBPS('treat ~ age + educ', data=lalonde, att=1)
|
|
176
|
+
>>> summ = fit.summary() # Compute summary statistics
|
|
177
|
+
>>> print(summ) # Print full coefficient table
|
|
178
|
+
>>> vcov_mat = fit.vcov() # Get variance-covariance matrix
|
|
179
|
+
>>> print(fit) # Concise output
|
|
180
|
+
"""
|
|
181
|
+
|
|
182
|
+
def __init__(
|
|
183
|
+
self,
|
|
184
|
+
# Core estimation results
|
|
185
|
+
coefficients: np.ndarray,
|
|
186
|
+
fitted_values: np.ndarray,
|
|
187
|
+
weights: np.ndarray,
|
|
188
|
+
linear_predictor: np.ndarray,
|
|
189
|
+
y: np.ndarray,
|
|
190
|
+
x: np.ndarray,
|
|
191
|
+
J: float,
|
|
192
|
+
mle_J: float,
|
|
193
|
+
deviance: float,
|
|
194
|
+
converged: bool,
|
|
195
|
+
var: np.ndarray,
|
|
196
|
+
nulldeviance: Optional[float] = None,
|
|
197
|
+
|
|
198
|
+
# Metadata
|
|
199
|
+
coef_names: Optional[List[str]] = None,
|
|
200
|
+
call_info: Optional[str] = None,
|
|
201
|
+
formula: Optional[str] = None,
|
|
202
|
+
data: Optional[pd.DataFrame] = None,
|
|
203
|
+
terms: Optional[str] = None,
|
|
204
|
+
model: Optional[np.ndarray] = None,
|
|
205
|
+
xlevels: Optional[dict] = None,
|
|
206
|
+
|
|
207
|
+
# Input parameters
|
|
208
|
+
att: Optional[int] = None,
|
|
209
|
+
method: Optional[str] = None,
|
|
210
|
+
standardize: Optional[bool] = None,
|
|
211
|
+
two_step: Optional[bool] = None,
|
|
212
|
+
|
|
213
|
+
# Continuous treatment specific
|
|
214
|
+
sigmasq: Optional[float] = None,
|
|
215
|
+
Ttilde: Optional[np.ndarray] = None,
|
|
216
|
+
Xtilde: Optional[np.ndarray] = None,
|
|
217
|
+
beta_tilde: Optional[np.ndarray] = None,
|
|
218
|
+
sigmasq_tilde: Optional[float] = None,
|
|
219
|
+
stabilizers: Optional[np.ndarray] = None,
|
|
220
|
+
|
|
221
|
+
# Multi-valued treatment specific
|
|
222
|
+
treat_names: Optional[List[str]] = None,
|
|
223
|
+
|
|
224
|
+
# Missing data handling
|
|
225
|
+
na_action: Optional[dict] = None
|
|
226
|
+
):
|
|
227
|
+
"""
|
|
228
|
+
Initialize CBPS result object.
|
|
229
|
+
|
|
230
|
+
Parameters
|
|
231
|
+
----------
|
|
232
|
+
coefficients : ndarray
|
|
233
|
+
Coefficient matrix, shape (k, 1) for binary or (k, n_treats-1) for multi-valued.
|
|
234
|
+
fitted_values : ndarray
|
|
235
|
+
Fitted propensity scores, shape (n,).
|
|
236
|
+
weights : ndarray
|
|
237
|
+
Optimal IPW weights, shape (n,).
|
|
238
|
+
linear_predictor : ndarray
|
|
239
|
+
Linear predictor X @ beta, shape (n,).
|
|
240
|
+
y : ndarray
|
|
241
|
+
Treatment vector (original), shape (n,).
|
|
242
|
+
x : ndarray
|
|
243
|
+
Covariate matrix (with intercept), shape (n, k).
|
|
244
|
+
J : float
|
|
245
|
+
Hansen J-statistic (over-identification test).
|
|
246
|
+
mle_J : float
|
|
247
|
+
MLE baseline J-statistic.
|
|
248
|
+
deviance : float
|
|
249
|
+
Negative 2 times log-likelihood.
|
|
250
|
+
converged : bool
|
|
251
|
+
Optimization convergence status.
|
|
252
|
+
var : ndarray
|
|
253
|
+
Coefficient variance-covariance matrix, shape (k, k).
|
|
254
|
+
nulldeviance : float, optional
|
|
255
|
+
Null model deviance for pseudo R-squared calculation.
|
|
256
|
+
coef_names : list, optional
|
|
257
|
+
Coefficient names from formula.
|
|
258
|
+
call_info : str, optional
|
|
259
|
+
Call information string.
|
|
260
|
+
formula : str, optional
|
|
261
|
+
Model formula string.
|
|
262
|
+
|
|
263
|
+
Notes
|
|
264
|
+
-----
|
|
265
|
+
All parameters are typically passed from internal fitting routines
|
|
266
|
+
and should not be constructed manually by users.
|
|
267
|
+
|
|
268
|
+
The coefficients must be a 2D matrix:
|
|
269
|
+
- Binary treatment: (k, 1)
|
|
270
|
+
- 3-valued treatment: (k, 2)
|
|
271
|
+
- 4-valued treatment: (k, 3)
|
|
272
|
+
"""
|
|
273
|
+
# Core estimation results
|
|
274
|
+
self.coefficients = coefficients
|
|
275
|
+
self.fitted_values = fitted_values
|
|
276
|
+
self.weights = weights
|
|
277
|
+
self.linear_predictor = linear_predictor
|
|
278
|
+
self.y = y
|
|
279
|
+
self.x = x
|
|
280
|
+
self.J = J
|
|
281
|
+
self.mle_J = mle_J
|
|
282
|
+
self.deviance = deviance
|
|
283
|
+
self.nulldeviance = nulldeviance
|
|
284
|
+
self.converged = converged
|
|
285
|
+
self.var = var
|
|
286
|
+
|
|
287
|
+
# Metadata
|
|
288
|
+
self.call_info = call_info or "CBPS()"
|
|
289
|
+
self.call = call_info or "CBPS()"
|
|
290
|
+
self.coef_names = coef_names or self._default_coef_names()
|
|
291
|
+
self.formula = formula
|
|
292
|
+
self.data = data
|
|
293
|
+
self.terms = terms
|
|
294
|
+
self.model = model
|
|
295
|
+
self.xlevels = xlevels
|
|
296
|
+
|
|
297
|
+
# Input parameters
|
|
298
|
+
self.att = att
|
|
299
|
+
self.method = method
|
|
300
|
+
self.standardize = standardize
|
|
301
|
+
self.two_step = two_step
|
|
302
|
+
|
|
303
|
+
# Continuous treatment specific
|
|
304
|
+
self.sigmasq = sigmasq
|
|
305
|
+
self.Ttilde = Ttilde
|
|
306
|
+
self.Xtilde = Xtilde
|
|
307
|
+
self.beta_tilde = beta_tilde
|
|
308
|
+
self.sigmasq_tilde = sigmasq_tilde
|
|
309
|
+
self.stabilizers = stabilizers
|
|
310
|
+
|
|
311
|
+
# Multi-valued treatment specific
|
|
312
|
+
self.treat_names = treat_names
|
|
313
|
+
|
|
314
|
+
# Missing data handling
|
|
315
|
+
self.na_action = na_action
|
|
316
|
+
|
|
317
|
+
# Validate coefficients shape
|
|
318
|
+
if self.coefficients.ndim != 2:
|
|
319
|
+
raise ValueError(
|
|
320
|
+
f"coefficients must be 2D array, got shape {self.coefficients.shape}. "
|
|
321
|
+
f"Expected (k, 1) for binary or (k, n_treats-1) for multi-valued."
|
|
322
|
+
)
|
|
323
|
+
|
|
324
|
+
def _default_coef_names(self) -> List[str]:
|
|
325
|
+
"""Generate default coefficient names when none are provided."""
|
|
326
|
+
k = self.coefficients.shape[0]
|
|
327
|
+
if k == 0:
|
|
328
|
+
return []
|
|
329
|
+
return ["Intercept"] + [f"X{i}" for i in range(1, k)]
|
|
330
|
+
|
|
331
|
+
def __getstate__(self):
|
|
332
|
+
"""Support pickle serialization.
|
|
333
|
+
|
|
334
|
+
Excludes unpicklable objects (e.g., patsy DesignInfo) by
|
|
335
|
+
setting them to None in the serialized state.
|
|
336
|
+
"""
|
|
337
|
+
state = self.__dict__.copy()
|
|
338
|
+
# patsy DesignInfo does not support pickle
|
|
339
|
+
# (see https://github.com/pydata/patsy/issues/26)
|
|
340
|
+
for key in ('terms', 'data'):
|
|
341
|
+
if key in state:
|
|
342
|
+
try:
|
|
343
|
+
pickle_test = state[key]
|
|
344
|
+
if pickle_test is not None:
|
|
345
|
+
import pickle as _pkl
|
|
346
|
+
_pkl.dumps(pickle_test)
|
|
347
|
+
except (NotImplementedError, TypeError, AttributeError):
|
|
348
|
+
state[key] = None
|
|
349
|
+
return state
|
|
350
|
+
|
|
351
|
+
def __setstate__(self, state):
|
|
352
|
+
"""Support pickle deserialization."""
|
|
353
|
+
self.__dict__.update(state)
|
|
354
|
+
|
|
355
|
+
def vcov(self) -> np.ndarray:
|
|
356
|
+
"""
|
|
357
|
+
Return the variance-covariance matrix of the estimated coefficients.
|
|
358
|
+
|
|
359
|
+
Returns
|
|
360
|
+
-------
|
|
361
|
+
ndarray, shape (k, k)
|
|
362
|
+
Variance-covariance matrix computed using the sandwich estimator.
|
|
363
|
+
|
|
364
|
+
Raises
|
|
365
|
+
------
|
|
366
|
+
ValueError
|
|
367
|
+
If the variance matrix was not computed during fitting (var is None).
|
|
368
|
+
|
|
369
|
+
Warns
|
|
370
|
+
-----
|
|
371
|
+
UserWarning
|
|
372
|
+
If the condition number exceeds 1e10, indicating potential
|
|
373
|
+
near-collinearity that may affect standard error reliability.
|
|
374
|
+
|
|
375
|
+
Notes
|
|
376
|
+
-----
|
|
377
|
+
This method directly returns the stored variance matrix computed during
|
|
378
|
+
fitting using the sandwich formula. It does not recompute the matrix.
|
|
379
|
+
|
|
380
|
+
The variance matrix is computed as:
|
|
381
|
+
``vcov = (G' W G)^{-1} G' W Omega W' G (G' W G)^{-1}``
|
|
382
|
+
|
|
383
|
+
where G is the gradient matrix, W is the weighting matrix, and Omega
|
|
384
|
+
is the covariance of the moment conditions.
|
|
385
|
+
"""
|
|
386
|
+
if self.var is None:
|
|
387
|
+
raise ValueError(
|
|
388
|
+
"Variance-covariance matrix not computed. "
|
|
389
|
+
"This may indicate a fitting error."
|
|
390
|
+
)
|
|
391
|
+
|
|
392
|
+
# Check condition number to detect near-collinearity
|
|
393
|
+
try:
|
|
394
|
+
cond_number = np.linalg.cond(self.var)
|
|
395
|
+
if cond_number > 1e10:
|
|
396
|
+
warnings.warn(
|
|
397
|
+
f"Variance-covariance matrix has high condition number ({cond_number:.2e}). "
|
|
398
|
+
f"This suggests near-collinearity among covariates. "
|
|
399
|
+
f"Standard errors may be unreliable. "
|
|
400
|
+
f"Consider:\n"
|
|
401
|
+
f" 1. Removing highly correlated covariates\n"
|
|
402
|
+
f" 2. Using regularization (e.g., hdCBPS)\n"
|
|
403
|
+
f" 3. Checking for perfect collinearity with np.linalg.matrix_rank(X)",
|
|
404
|
+
UserWarning,
|
|
405
|
+
stacklevel=2
|
|
406
|
+
)
|
|
407
|
+
except np.linalg.LinAlgError:
|
|
408
|
+
warnings.warn(
|
|
409
|
+
"Failed to compute condition number of variance-covariance matrix. "
|
|
410
|
+
"Matrix may be singular or near-singular.",
|
|
411
|
+
UserWarning,
|
|
412
|
+
stacklevel=2
|
|
413
|
+
)
|
|
414
|
+
|
|
415
|
+
return self.var
|
|
416
|
+
|
|
417
|
+
@property
|
|
418
|
+
def residuals(self) -> np.ndarray:
|
|
419
|
+
"""
|
|
420
|
+
Model residuals (observed minus fitted values).
|
|
421
|
+
|
|
422
|
+
Returns
|
|
423
|
+
-------
|
|
424
|
+
ndarray
|
|
425
|
+
Residual vector or matrix depending on treatment type:
|
|
426
|
+
|
|
427
|
+
- Binary: y - fitted_values, shape (n,)
|
|
428
|
+
- Continuous: standardized residuals in whitened space, shape (n,)
|
|
429
|
+
- Multi-valued: one-hot encoded y minus fitted probabilities, shape (n, k)
|
|
430
|
+
"""
|
|
431
|
+
# Continuous treatment
|
|
432
|
+
if self.Ttilde is not None:
|
|
433
|
+
return self.Ttilde - self.linear_predictor.ravel()
|
|
434
|
+
|
|
435
|
+
# Binary treatment
|
|
436
|
+
if self.fitted_values.ndim == 1 or (self.fitted_values.ndim == 2 and self.fitted_values.shape[1] == 1):
|
|
437
|
+
return self.y - self.fitted_values.ravel()
|
|
438
|
+
|
|
439
|
+
# Multi-valued treatment
|
|
440
|
+
n_samples = len(self.y)
|
|
441
|
+
n_classes = self.fitted_values.shape[1]
|
|
442
|
+
y_onehot = np.zeros((n_samples, n_classes))
|
|
443
|
+
|
|
444
|
+
try:
|
|
445
|
+
y_int = self.y.astype(int)
|
|
446
|
+
if y_int.min() >= 0 and y_int.max() < n_classes:
|
|
447
|
+
y_onehot[np.arange(n_samples), y_int] = 1
|
|
448
|
+
return y_onehot - self.fitted_values
|
|
449
|
+
except Exception:
|
|
450
|
+
pass
|
|
451
|
+
|
|
452
|
+
raise NotImplementedError(
|
|
453
|
+
"Residuals not supported for this multi-valued treatment format"
|
|
454
|
+
)
|
|
455
|
+
|
|
456
|
+
@property
|
|
457
|
+
def pseudo_r2(self) -> Optional[float]:
|
|
458
|
+
"""
|
|
459
|
+
McFadden's pseudo R-squared measure of model fit.
|
|
460
|
+
|
|
461
|
+
Returns
|
|
462
|
+
-------
|
|
463
|
+
float or None
|
|
464
|
+
Pseudo R² = 1 - deviance / null_deviance.
|
|
465
|
+
Returns None if null deviance is unavailable or zero.
|
|
466
|
+
|
|
467
|
+
Notes
|
|
468
|
+
-----
|
|
469
|
+
The pseudo R² measures improvement over the null (intercept-only) model:
|
|
470
|
+
|
|
471
|
+
- 0: No improvement over null model
|
|
472
|
+
- 1: Perfect fit
|
|
473
|
+
- Typical range: 0.05-0.40 for logistic models
|
|
474
|
+
|
|
475
|
+
The null model contains only the intercept and predicts all observations
|
|
476
|
+
with probability equal to the sample mean.
|
|
477
|
+
|
|
478
|
+
Examples
|
|
479
|
+
--------
|
|
480
|
+
>>> fit = CBPS('treat ~ age + educ', data=data)
|
|
481
|
+
>>> print(f"Pseudo R²: {fit.pseudo_r2:.4f}")
|
|
482
|
+
"""
|
|
483
|
+
if self.nulldeviance is None or self.nulldeviance == 0:
|
|
484
|
+
return None
|
|
485
|
+
return 1.0 - self.deviance / self.nulldeviance
|
|
486
|
+
|
|
487
|
+
def balance(self, **kwargs):
    """
    Compute covariate balance statistics for this fit.

    Thin object-oriented wrapper around the package-level ``balance()``
    function: the fit's weights, covariates, treatment, fitted values and
    coefficients are bundled into a dict and passed on, so
    ``fit.balance()`` can be used in place of ``balance(fit)``.

    Parameters
    ----------
    **kwargs
        Forwarded unchanged to ``cbps.balance()``. Per the original
        documentation of this method these include:

        - enhanced : bool, default=False
            Whether to return enhanced diagnostic information.
        - threshold : float, default=0.1
            Imbalance threshold (SMD or correlation) for flagging covariates.
        - covariate_names : list, optional
            Covariate names for enhanced output.

    Returns
    -------
    object
        Whatever ``cbps.balance()`` returns (documented upstream as a dict
        with 'balanced' and 'original'/'unweighted' entries); see that
        function's documentation for details.

    Examples
    --------
    >>> fit = CBPS('treat ~ age + educ', data=df, att=1)
    >>> bal = fit.balance()
    """
    # Imported lazily, at call time rather than at module import.
    from cbps import balance as balance_func

    exported = ('weights', 'x', 'y', 'fitted_values', 'coefficients')
    payload = {field: getattr(self, field) for field in exported}
    return balance_func(payload, **kwargs)
|
|
541
|
+
|
|
542
|
+
@property
|
|
543
|
+
def coef(self) -> np.ndarray:
|
|
544
|
+
"""
|
|
545
|
+
Coefficient vector (1D convenience accessor).
|
|
546
|
+
|
|
547
|
+
Returns
|
|
548
|
+
-------
|
|
549
|
+
ndarray, shape (k,)
|
|
550
|
+
Coefficient vector (1D), extracted from the coefficients matrix.
|
|
551
|
+
|
|
552
|
+
Notes
|
|
553
|
+
-----
|
|
554
|
+
This is a convenience property providing a 1D view of coefficients.
|
|
555
|
+
|
|
556
|
+
- For binary treatment: returns ``coefficients[:, 0]`` (1D)
|
|
557
|
+
- For multi-valued treatment: returns ``coefficients[:, 0]`` (first contrast)
|
|
558
|
+
|
|
559
|
+
The full coefficient matrix is still accessible via ``fit.coefficients``.
|
|
560
|
+
|
|
561
|
+
Comparison with other Python packages:
|
|
562
|
+
|
|
563
|
+
- statsmodels: ``result.params`` (1D)
|
|
564
|
+
- sklearn: ``model.coef_`` (may be 2D)
|
|
565
|
+
- CBPS: ``fit.coef`` (1D, this property) + ``fit.coefficients`` (full 2D)
|
|
566
|
+
|
|
567
|
+
Examples
|
|
568
|
+
--------
|
|
569
|
+
>>> fit = CBPS('treat ~ age + educ', data=df, att=1)
|
|
570
|
+
>>> fit.coef # Convenient 1D access
|
|
571
|
+
array([0.123, 0.456, -0.789])
|
|
572
|
+
>>> fit.coefficients # Full 2D matrix
|
|
573
|
+
array([[0.123],
|
|
574
|
+
[0.456],
|
|
575
|
+
[-0.789]])
|
|
576
|
+
"""
|
|
577
|
+
if self.coefficients.ndim == 1:
|
|
578
|
+
return self.coefficients
|
|
579
|
+
else:
|
|
580
|
+
return self.coefficients[:, 0] if self.coefficients.shape[1] == 1 else self.coefficients.ravel()
|
|
581
|
+
|
|
582
|
+
@property
|
|
583
|
+
def fitted(self) -> np.ndarray:
|
|
584
|
+
"""
|
|
585
|
+
Alias for fitted_values (alternative accessor).
|
|
586
|
+
|
|
587
|
+
Returns
|
|
588
|
+
-------
|
|
589
|
+
ndarray
|
|
590
|
+
Fitted propensity scores, equivalent to ``fitted_values``.
|
|
591
|
+
|
|
592
|
+
Notes
|
|
593
|
+
-----
|
|
594
|
+
This is an alias for ``fitted_values`` for convenience.
|
|
595
|
+
|
|
596
|
+
Examples
|
|
597
|
+
--------
|
|
598
|
+
>>> fit = CBPS('treat ~ age + educ', data=df)
|
|
599
|
+
>>> # The following are equivalent
|
|
600
|
+
>>> fv1 = fit.fitted_values
|
|
601
|
+
>>> fv2 = fit.fitted
|
|
602
|
+
>>> np.allclose(fv1, fv2)
|
|
603
|
+
True
|
|
604
|
+
"""
|
|
605
|
+
return self.fitted_values
|
|
606
|
+
|
|
607
|
+
@property
|
|
608
|
+
def J_stat(self) -> float:
|
|
609
|
+
"""
|
|
610
|
+
Alias for J (Hansen's J-statistic).
|
|
611
|
+
|
|
612
|
+
Returns
|
|
613
|
+
-------
|
|
614
|
+
float
|
|
615
|
+
The GMM over-identification test statistic.
|
|
616
|
+
|
|
617
|
+
Notes
|
|
618
|
+
-----
|
|
619
|
+
The J-statistic is used for the GMM over-identification test.
|
|
620
|
+
Under the null hypothesis of correct model specification, J is
|
|
621
|
+
asymptotically chi-squared distributed with degrees of freedom
|
|
622
|
+
equal to the number of over-identifying restrictions.
|
|
623
|
+
|
|
624
|
+
Examples
|
|
625
|
+
--------
|
|
626
|
+
>>> fit = CBPS('treat ~ age + educ', data=df)
|
|
627
|
+
>>> j1 = fit.J # Original attribute
|
|
628
|
+
>>> j2 = fit.J_stat # Alias
|
|
629
|
+
>>> assert j1 == j2
|
|
630
|
+
"""
|
|
631
|
+
return self.J
|
|
632
|
+
|
|
633
|
+
@property
|
|
634
|
+
def sigma_squared(self) -> Optional[float]:
|
|
635
|
+
"""
|
|
636
|
+
Residual variance estimate (continuous treatment only).
|
|
637
|
+
|
|
638
|
+
Returns
|
|
639
|
+
-------
|
|
640
|
+
float or None
|
|
641
|
+
Variance estimate for continuous treatment models.
|
|
642
|
+
Returns None for binary or multi-valued treatments.
|
|
643
|
+
|
|
644
|
+
Notes
|
|
645
|
+
-----
|
|
646
|
+
Only available for continuous treatment CBPS. For binary and
|
|
647
|
+
multi-valued treatments, this property returns None.
|
|
648
|
+
|
|
649
|
+
Examples
|
|
650
|
+
--------
|
|
651
|
+
>>> # Continuous treatment
|
|
652
|
+
>>> fit_cont = CBPS('dose ~ age + educ', data=df)
|
|
653
|
+
>>> sigma2 = fit_cont.sigma_squared
|
|
654
|
+
>>>
|
|
655
|
+
>>> # Binary treatment
|
|
656
|
+
>>> fit_bin = CBPS('treat ~ age + educ', data=df)
|
|
657
|
+
>>> assert fit_bin.sigma_squared is None
|
|
658
|
+
"""
|
|
659
|
+
return getattr(self, 'sigmasq', None)
|
|
660
|
+
|
|
661
|
+
def predict(self, newdata: Optional[Union[pd.DataFrame, np.ndarray]] = None, type: str = 'response') -> np.ndarray:
|
|
662
|
+
"""
|
|
663
|
+
Predict propensity scores for new data.
|
|
664
|
+
|
|
665
|
+
Parameters
|
|
666
|
+
----------
|
|
667
|
+
newdata : DataFrame, ndarray, or None
|
|
668
|
+
New data for prediction. If None, returns fitted values from
|
|
669
|
+
the training data.
|
|
670
|
+
|
|
671
|
+
- DataFrame: Required when using formula interface
|
|
672
|
+
- ndarray: Shape (n_new, k) matching the training covariates
|
|
673
|
+
|
|
674
|
+
type : {'response', 'link'}, default='response'
|
|
675
|
+
Type of prediction:
|
|
676
|
+
|
|
677
|
+
- 'response': Probabilities/expected values (after link function)
|
|
678
|
+
- 'link': Linear predictor X @ beta (before transformation)
|
|
679
|
+
|
|
680
|
+
Returns
|
|
681
|
+
-------
|
|
682
|
+
ndarray
|
|
683
|
+
Predicted values. Shape depends on treatment type:
|
|
684
|
+
|
|
685
|
+
- Binary: (n_new,) probabilities
|
|
686
|
+
- Multi-valued: (n_new, n_levels) probabilities
|
|
687
|
+
- Continuous: (n_new,) conditional means
|
|
688
|
+
|
|
689
|
+
Raises
|
|
690
|
+
------
|
|
691
|
+
ValueError
|
|
692
|
+
If type is invalid or newdata dimensions do not match.
|
|
693
|
+
|
|
694
|
+
Notes
|
|
695
|
+
-----
|
|
696
|
+
**Treatment type handling:**
|
|
697
|
+
|
|
698
|
+
- Binary treatment: logistic link (expit)
|
|
699
|
+
- Continuous treatment: identity link
|
|
700
|
+
- Multi-valued treatment: multinomial logistic (softmax)
|
|
701
|
+
|
|
702
|
+
**Formula vs array interface:**
|
|
703
|
+
|
|
704
|
+
- Formula interface: uses patsy DesignInfo to rebuild design matrix
|
|
705
|
+
- Array interface: directly uses newdata as covariate matrix
|
|
706
|
+
|
|
707
|
+
Examples
|
|
708
|
+
--------
|
|
709
|
+
>>> # Train model
|
|
710
|
+
>>> fit = CBPS('treat ~ x1 + x2', data=train_df)
|
|
711
|
+
>>>
|
|
712
|
+
>>> # Predict new data
|
|
713
|
+
>>> pred = fit.predict(test_df)
|
|
714
|
+
>>>
|
|
715
|
+
>>> # Predict linear predictor
|
|
716
|
+
>>> linear_pred = fit.predict(test_df, type='link')
|
|
717
|
+
>>>
|
|
718
|
+
>>> # Get training data fitted values
|
|
719
|
+
>>> fitted = fit.predict() # Equivalent to fit.fitted_values
|
|
720
|
+
"""
|
|
721
|
+
valid_types = {'response', 'link'}
|
|
722
|
+
if type not in valid_types:
|
|
723
|
+
raise ValueError(
|
|
724
|
+
f"Invalid type: '{type}'. Must be one of {valid_types}."
|
|
725
|
+
)
|
|
726
|
+
|
|
727
|
+
if newdata is None:
|
|
728
|
+
if type == 'response':
|
|
729
|
+
return self.fitted_values
|
|
730
|
+
elif type == 'link':
|
|
731
|
+
return self.linear_predictor
|
|
732
|
+
|
|
733
|
+
X_new = self._prepare_newdata(newdata)
|
|
734
|
+
linear_pred = X_new @ self.coefficients
|
|
735
|
+
|
|
736
|
+
if type == 'link':
|
|
737
|
+
if linear_pred.ndim == 2 and linear_pred.shape[1] == 1:
|
|
738
|
+
return linear_pred.ravel()
|
|
739
|
+
return linear_pred
|
|
740
|
+
elif type == 'response':
|
|
741
|
+
return self._apply_link_function(linear_pred)
|
|
742
|
+
|
|
743
|
+
def _prepare_newdata(self, newdata: Union[pd.DataFrame, np.ndarray]) -> np.ndarray:
    """
    Prepare design matrix from new data for prediction.

    Parameters
    ----------
    newdata : DataFrame or array-like
        Formula interface (``self.formula`` and ``self.terms`` are both
        set): must be a DataFrame; it is handed to ``patsy.dmatrix``
        together with ``self.terms``.
        Array interface: converted with ``np.asarray``. A 1-D input is
        interpreted as a single observation (one row).

    Returns
    -------
    ndarray
        Two-dimensional design matrix.

    Raises
    ------
    TypeError
        If the formula interface is active and ``newdata`` is not a
        DataFrame.
    ValueError
        If patsy cannot build the design matrix, if array input is not
        1-D or 2-D, or if its column count differs from ``self.x``.
    """
    import pandas as pd

    if self.formula is not None and self.terms is not None:
        if not isinstance(newdata, pd.DataFrame):
            raise TypeError(
                f"When using formula interface, newdata must be a DataFrame. "
                f"Got {type(newdata).__name__}."
            )

        # Any failure while rebuilding the matrix (including a missing
        # patsy install) is reported as a ValueError, chained to its cause.
        try:
            from patsy import dmatrix
            X_new_df = dmatrix(self.terms, newdata, return_type='dataframe')
            X_new = X_new_df.values
        except Exception as e:
            raise ValueError(
                f"Failed to build design matrix from newdata using formula '{self.formula}'. "
                f"Error: {e}\n"
                f"Make sure newdata contains all variables used in the formula."
            ) from e
        return X_new

    X_new = np.asarray(newdata)
    if X_new.ndim == 1:
        X_new = X_new.reshape(1, -1)

    # Reject scalars and >2-D input explicitly: indexing shape[1] would
    # otherwise raise IndexError (0-d) or let a 3-D array slip through
    # whenever its middle dimension happens to match.
    if X_new.ndim != 2:
        raise ValueError(
            f"newdata must be a 1-D or 2-D array, got {X_new.ndim}-D input."
        )

    n_expected = self.x.shape[1]
    if X_new.shape[1] != n_expected:
        raise ValueError(
            f"newdata has {X_new.shape[1]} columns, "
            f"but model was trained with {n_expected} columns. "
            f"Expected shape: (n_new, {n_expected})"
        )

    return X_new
|
|
777
|
+
|
|
778
|
+
def _apply_link_function(self, linear_pred: np.ndarray) -> np.ndarray:
    """
    Apply inverse link function to convert linear predictor to response scale.

    Parameters
    ----------
    linear_pred : ndarray
        Linear predictor, i.e. design matrix times ``self.coefficients``.

    Returns
    -------
    ndarray
        - Continuous treatment (``self.sigmasq`` is set): the linear
          predictor itself, flattened to 1-D.
        - Binary treatment (coefficients of shape ``(k, 1)``): logistic
          probabilities, flattened to 1-D.
        - Multi-valued treatment (coefficients of shape ``(k, m)``,
          ``m > 1``): array with ``m + 1`` columns; the first column is the
          baseline category, followed by one column per coefficient column.

    Raises
    ------
    ValueError
        If the treatment type cannot be inferred from the coefficient shape.
    """
    coef_shape = self.coefficients.shape

    # Continuous treatment: identity link.
    # Checked first because continuous fits also carry (k, 1) coefficients;
    # testing the shape first would send them through the logistic branch.
    if self.sigmasq is not None:
        return linear_pred.ravel()

    # Binary treatment: logistic link
    if len(coef_shape) == 2 and coef_shape[1] == 1:
        from scipy.special import expit
        return expit(linear_pred).ravel()

    # Multi-valued treatment: multinomial logistic (softmax) with an
    # implicit zero predictor for the baseline category.
    if len(coef_shape) == 2 and coef_shape[1] > 1:
        # Subtract the row-wise maximum (including the baseline's 0) before
        # exponentiating so large predictors cannot overflow to inf/NaN.
        shift = np.maximum(linear_pred.max(axis=1, keepdims=True), 0.0)
        exp_pred = np.exp(linear_pred - shift)
        exp_baseline = np.exp(-shift)
        denom = exp_baseline + exp_pred.sum(axis=1, keepdims=True)
        prob_baseline = exp_baseline / denom
        prob_others = exp_pred / denom
        return np.column_stack([prob_baseline, prob_others])

    raise ValueError(
        f"Cannot determine treatment type from coefficients shape {coef_shape}. "
        f"Expected (k, 1) for binary/continuous or (k, K-1) for multi-valued."
    )
|
|
804
|
+
|
|
805
|
+
|
|
806
|
+
def plot_deviance_residuals(self, ax=None, **kwargs):
    """
    Plot deviance residual diagnostics (binary treatment only).

    Generates a 2x2 panel of diagnostic plots:

    1. Residuals vs Fitted: Check for non-linearity and heteroscedasticity
    2. Q-Q Plot: Assess normality of residuals
    3. Scale-Location: Check homoscedasticity assumption
    4. Residuals vs Leverage: Identify influential observations

    Parameters
    ----------
    ax : array-like of matplotlib.axes.Axes, optional
        At least four axes to draw on (used in flattened order). If None,
        a new 2x2 figure is created. A single Axes is not enough and
        raises ValueError.
    **kwargs : dict
        Additional arguments passed to the scatter calls of the panels.
        ``alpha`` defaults to 0.5 and may be overridden here.

    Returns
    -------
    fig : matplotlib.figure.Figure
        The figure object.
    axes : ndarray of matplotlib.axes.Axes
        Flattened array of the axes that were drawn on.

    Raises
    ------
    ValueError
        If treatment is not binary, required data is missing, or fewer
        than four axes were supplied.
    ImportError
        If matplotlib or scipy is not installed.

    Notes
    -----
    The larger of the two observed treatment values is taken as the
    treated level. The x-axis of panel 4 is ``p * (1 - p)`` computed from
    the fitted values, not hat-matrix diagonals.
    """
    try:
        import matplotlib.pyplot as plt
        from scipy import stats
    except ImportError as err:
        raise ImportError(
            "matplotlib and scipy are required for plotting. "
            "Install with: pip install matplotlib scipy"
        ) from err

    if not hasattr(self, 'y') or self.y is None:
        raise ValueError("Deviance residuals plot requires y (treatment) data")

    y_binary = np.asarray(self.y).ravel()
    unique_y = np.unique(y_binary)
    if len(unique_y) != 2:
        raise ValueError(
            f"Deviance residuals plot only available for binary treatment. "
            f"Found {len(unique_y)} unique treatment values."
        )

    # 0/1 indicator of the upper treatment level, so the formula below is
    # correct even when the treatment is not literally coded 0/1.
    treated = (y_binary == unique_y[1]).astype(float)

    # Compute deviance residuals
    fitted_values = np.asarray(self.fitted_values).ravel()

    # Clip away from 0 and 1 so the logarithms stay finite.
    eps = 1e-10
    fitted_safe = np.clip(fitted_values, eps, 1 - eps)

    sign = np.where(treated == 1, 1, -1)
    deviance_resid = sign * np.sqrt(-2 * (
        treated * np.log(fitted_safe) +
        (1 - treated) * np.log(1 - fitted_safe)
    ))

    # Standardized residuals
    std_resid = deviance_resid / np.std(deviance_resid)

    # Resolve the four target axes
    if ax is None:
        fig, axes = plt.subplots(2, 2, figsize=(12, 10))
        axes = axes.ravel()
    else:
        axes = np.asarray(ax, dtype=object).ravel()
        if len(axes) < 4:
            raise ValueError("Need 4 axes for diagnostic plots. Pass ax=None to create new figure.")
        fig = axes[0].figure

    # Caller-supplied kwargs win over the default transparency.
    scatter_kw = {'alpha': 0.5, **kwargs}

    def _overlay_lowess(axis, x, y):
        """Draw a LOWESS trend line; silently skipped without statsmodels."""
        try:
            from statsmodels.nonparametric.smoothers_lowess import lowess
        except ImportError:
            return
        smoothed = lowess(y, x, frac=0.3)
        axis.plot(smoothed[:, 0], smoothed[:, 1], 'b-', linewidth=2, label='LOWESS')
        axis.legend()

    # Panel 1: Residuals vs Fitted
    axes[0].scatter(fitted_values, deviance_resid, **scatter_kw)
    axes[0].axhline(y=0, color='r', linestyle='--', linewidth=1)
    _overlay_lowess(axes[0], fitted_values, deviance_resid)
    axes[0].set_xlabel('Fitted values')
    axes[0].set_ylabel('Deviance Residuals')
    axes[0].set_title('Residuals vs Fitted')
    axes[0].grid(True, alpha=0.3)

    # Panel 2: Q-Q Plot
    stats.probplot(deviance_resid, dist="norm", plot=axes[1])
    axes[1].set_title('Normal Q-Q Plot')
    axes[1].grid(True, alpha=0.3)

    # Panel 3: Scale-Location
    sqrt_std_resid = np.sqrt(np.abs(std_resid))
    axes[2].scatter(fitted_values, sqrt_std_resid, **scatter_kw)
    _overlay_lowess(axes[2], fitted_values, sqrt_std_resid)
    axes[2].set_xlabel('Fitted values')
    axes[2].set_ylabel('√|Standardized Residuals|')
    axes[2].set_title('Scale-Location')
    axes[2].grid(True, alpha=0.3)

    # Panel 4: Residuals vs Leverage
    leverage = fitted_values * (1 - fitted_values)
    axes[3].scatter(leverage, std_resid, **scatter_kw)
    axes[3].axhline(y=0, color='r', linestyle='--', linewidth=1)

    # Mark high-influence points: |standardized residual| * leverage
    # above the 4/n cut-off.
    cook_threshold = 4 / len(y_binary)
    high_influence = np.abs(std_resid) * leverage > cook_threshold
    if np.any(high_influence):
        axes[3].scatter(leverage[high_influence], std_resid[high_influence],
                        color='red', s=100, alpha=0.7, label='High influence')
        axes[3].legend()

    axes[3].set_xlabel('Leverage')
    axes[3].set_ylabel('Standardized Residuals')
    axes[3].set_title('Residuals vs Leverage')
    axes[3].grid(True, alpha=0.3)

    # Lay out the figure that owns the axes, which need not be pyplot's
    # current figure when the caller supplied them.
    fig.tight_layout()
    return fig, axes
|
|
943
|
+
|
|
944
|
+
def plot(self, kind='deviance', **kwargs):
    """
    Dispatch to one of the diagnostic plots available for this fit.

    Parameters
    ----------
    kind : {'deviance'}, default='deviance'
        Which diagnostic to draw. 'deviance' (residual diagnostics) is
        the only kind currently recognised.
    **kwargs : dict
        Forwarded unchanged to the selected plotting method.

    Returns
    -------
    fig : matplotlib.figure.Figure
    axes : matplotlib.axes.Axes or array of Axes

    Raises
    ------
    ValueError
        If ``kind`` is not a recognised plot kind.
    """
    # Guard clause: reject anything that is not a known kind up front.
    if kind != 'deviance':
        raise ValueError(
            f"Unknown plot kind: '{kind}'. "
            f"Available options: 'deviance'"
        )
    return self.plot_deviance_residuals(**kwargs)
|
|
973
|
+
@staticmethod
def _symnum(pval: np.ndarray) -> List[str]:
    """
    Translate p-values into significance markers.

    Each value receives the marker of the first cut-off it is strictly
    below: '***' (< 0.001), '**' (< 0.01), '*' (< 0.05), '.' (< 0.1).
    Values below none of them (NaN included, since every comparison is
    False) receive ' '.
    """
    cutoffs = ((0.001, '***'), (0.01, '**'), (0.05, '*'), (0.1, '.'))
    return [
        next((marker for bound, marker in cutoffs if p < bound), ' ')
        for p in pval
    ]
|
|
989
|
+
|
|
990
|
+
def summary(self) -> 'CBPSSummary':
    """
    Compute and return a statistical summary of the CBPS fit.

    Returns
    -------
    CBPSSummary
        Summary object containing coefficient table with estimates,
        standard errors, z-values, p-values, and significance codes.

    Raises
    ------
    ValueError
        If the variance-covariance matrix was not computed (var is None),
        standard errors cannot be calculated.

    Notes
    -----
    Key implementation details:

    1. Standard errors are computed from the diagonal of the variance matrix
    2. z-values are computed as coefficient / standard error
    3. p-values are two-sided: p = 2 * (1 - Phi(abs(z))), evaluated with the
       normal survival function so very small p-values do not round to 0
    4. Row names differ for binary vs multi-valued treatment
    5. Coefficients are flattened column by column so that they line up with
       the level-by-level row names used for multi-valued treatments

    Examples
    --------
    >>> fit = CBPS('treat ~ age + educ', data=lalonde, att=1)
    >>> summ = fit.summary()
    >>> print(summ)           # Formatted coefficient table
    >>> summ.coef             # Coefficient estimates
    >>> summ.se               # Standard errors
    >>> summ.pvalues          # Two-sided p-values
    """
    if self.var is None:
        raise ValueError(
            "Variance-covariance matrix required for summary. "
            "Cannot compute standard errors."
        )

    std_err = np.sqrt(np.diag(self.var))

    # Column-major flattening: _format_multitreat_names() lists every
    # covariate of the first level, then the second, and so on. For a
    # (k, 1) coefficient matrix the order is unaffected.
    # NOTE(review): assumes the diagonal of self.var follows the same
    # level-by-level ordering -- confirm against the multi-valued estimator.
    coef = self.coefficients.ravel(order='F')
    z_value = coef / std_err

    # sf(|z|) == 1 - cdf(|z|) without the cancellation that turns
    # p-values for large |z| into exactly 0.
    p_value = 2 * scipy.stats.norm.sf(np.abs(z_value))
    coef_table = np.column_stack([coef, std_err, z_value, p_value])

    significance = self._symnum(p_value)

    if self.coefficients.shape[1] == 1:
        row_names = self.coef_names
    else:
        row_names = self._format_multitreat_names()

    # Compute J-test p-value for over-identified models
    # For binary CBPS with method='over':
    #   n_moment_conditions = 2k (k score + k balance)
    #   n_parameters = k
    #   df = n_moment_conditions - n_parameters (overidentification degrees of freedom)
    # Just-identified ('exact'): n_moments = k, df = 0
    k = self.coefficients.shape[0]
    n_moments = 2 * k if self.method == 'over' else k
    j_df = n_moments - k

    j_pval = None
    # Only attempt p-value computation if J is valid
    if self.J is not None and np.isfinite(self.J) and self.J >= 0:
        try:
            j_pval = j_test_pvalue(self.J, n_moments, k)
        except ValueError:
            # Deliberate best effort: no p-value is reported when the
            # helper rejects the inputs.
            j_pval = None

    return CBPSSummary(
        call=self.call_info,
        coef_table=coef_table,
        coef_names=row_names,
        significance=significance,
        J=self.J,
        j_pvalue=j_pval,
        j_df=j_df,
        deviance=self.deviance,
        sigmasq=self.sigmasq,
        y=self.y,
        fitted_values=self.fitted_values,
        weights=self.weights,
        converged=self.converged,
    )
|
|
1079
|
+
|
|
1080
|
+
def _format_multitreat_names(self) -> List[str]:
    """
    Build "<level>: <coefficient>" row labels for a multi-valued fit.

    Labels are produced level by level: every coefficient name for the
    first coefficient column, then every name for the second, and so on.
    The first ``n_levels`` entries of ``self.treat_names`` are used when at
    least that many are available; otherwise generic ``Level<i>`` labels
    are generated.
    """
    n_coef, n_levels = self.coefficients.shape

    labels = self.treat_names
    if labels is None or len(labels) < n_levels:
        labels = [f"Level{idx}" for idx in range(n_levels)]
    else:
        labels = labels[:n_levels]

    return [
        f"{label}: {self.coef_names[pos]}"
        for label in labels
        for pos in range(n_coef)
    ]
|
|
1095
|
+
|
|
1096
|
+
def __str__(self) -> str:
    """
    Return formatted string representation of the CBPS fit.

    The text contains the call, the coefficient array (3 significant
    digits), sigma-squared when set, residual deviance, J-statistic,
    log-likelihood (-0.5 * deviance) and a diagnostics block with the
    convergence flag and, when weights are present, their min/max/mean and
    effective sample size. A missing J-statistic is shown as ``N/A``.
    """
    digits = 3
    parts = [f"\nCall:\n {self.call_info}\n\n"]

    if self.coefficients.size > 0:
        coef_str = np.array2string(
            self.coefficients,
            precision=digits,
            suppress_small=True
        )
        parts.append("Coefficients:\n" + coef_str + "\n")
    else:
        parts.append("No coefficients\n\n")

    if self.sigmasq is not None:
        parts.append(f"\nSigma-Squared: {self.sigmasq}\n")

    # summary() treats J=None as a legal state; formatting None with a
    # float spec would raise TypeError, so show a placeholder instead.
    j_text = "N/A" if self.J is None else f"{self.J:.{digits}g}"

    parts.append(f"Residual Deviance:\t{self.deviance:.{digits}g}\n")
    parts.append(f"J-Statistic:\t\t{j_text}\n")
    parts.append(f"Log-Likelihood:\t{-0.5 * self.deviance:.{digits}g}\n")

    # Diagnostics block
    parts.append("\nDiagnostics:\n")
    parts.append(f" Converged: {'Yes' if self.converged else 'No'}\n")

    if self.weights is not None:
        w = self.weights
        parts.append(" Weight Summary:\n")
        parts.append(
            f" Min: {w.min():10.4f} Max: {w.max():10.4f} Mean: {w.mean():8.4f}\n"
        )
        # Effective sample size: (sum w)^2 / sum(w^2)
        ess = (w.sum() ** 2) / (w ** 2).sum()
        parts.append(f" Effective Sample Size: {ess:.1f}\n")

    return "".join(parts)
|
|
1131
|
+
|
|
1132
|
+
def __repr__(self) -> str:
    """
    Return concise representation for interactive display.

    Shows the number of observations (``len(self.y)``), the number of
    coefficient rows, the J-statistic to six decimals (``None`` when it is
    missing) and the convergence flag.
    """
    # A missing J must not break repr(); a float format spec on None
    # raises TypeError.
    j_text = "None" if self.J is None else f"{self.J:.6f}"
    return (f"CBPSResults(n={len(self.y)}, k={self.coefficients.shape[0]}, "
            f"J={j_text}, converged={self.converged})")
|
|
1136
|
+
|
|
1137
|
+
|
|
1138
|
+
class CBPSSummary:
    """
    Summary object from CBPS estimation.

    Contains the coefficient table with estimates, standard errors,
    z-values, p-values, and significance codes. This object is returned
    by the ``summary()`` method of ``CBPSResults``.

    Attributes
    ----------
    call : str
        String representation of the fitting call.
    coefficients : ndarray, shape (k, 4)
        Coefficient table with columns: Estimate, Std. Error, z value, Pr(>z).
    coef_names : list of str
        Names of coefficients (row labels).
    significance : list of str
        Significance codes for each coefficient ('***', '**', '*', '.', ' ').
    J : float
        Hansen J-statistic for over-identification test.
    j_pvalue : float or None
        Asymptotic p-value for J-test (None if just-identified).
    j_df : int
        Degrees of freedom for J-test chi-squared distribution.
    deviance : float
        Model deviance (-2 * log-likelihood).
    sigmasq : float or None
        Residual variance (continuous treatment only, None for binary/multi-valued).
    y : ndarray or None
        Treatment variable; used for the sample counts and deviance residuals.
    fitted_values : ndarray or None
        Fitted propensity scores; used for the deviance residuals.
    weights : ndarray or None
        Estimated weights; used for the weight diagnostics.
    converged : bool or None
        Whether the optimization converged.

    Examples
    --------
    >>> fit = CBPS('treat ~ age + educ', data=lalonde, att=1)
    >>> summ = fit.summary()
    >>> print(summ)           # Formatted table
    >>> summ.coef             # Coefficient estimates
    >>> summ.se               # Standard errors
    >>> summ.zvalues          # z-statistics
    >>> summ.pvalues          # Two-sided p-values
    """

    def __init__(
        self,
        call: str,
        coef_table: np.ndarray,
        coef_names: List[str],
        significance: List[str],
        J: float,
        deviance: float,
        sigmasq: Optional[float] = None,
        y: Optional[np.ndarray] = None,
        fitted_values: Optional[np.ndarray] = None,
        weights: Optional[np.ndarray] = None,
        converged: Optional[bool] = None,
        j_pvalue: Optional[float] = None,
        j_df: Optional[int] = None,
    ):
        """
        Initialize summary object.

        Parameters
        ----------
        call : str
            Call information string.
        coef_table : ndarray
            Coefficient table (k × 4 matrix).
        coef_names : list
            Coefficient name list.
        significance : list
            Significance symbols list.
        J : float
            J-statistic.
        deviance : float
            Model deviance.
        sigmasq : float, optional
            Sigma squared (continuous treatment only, default None).
        y : ndarray, optional
            Treatment variable (for computing deviance residuals).
        fitted_values : ndarray, optional
            Fitted propensity scores (for computing deviance residuals).
        weights : ndarray, optional
            Estimated weights for diagnostics output.
        converged : bool, optional
            Whether the optimization converged.
        j_pvalue : float, optional
            p-value of the J-test, shown next to the J-statistic.
        j_df : int, optional
            Degrees of freedom of the J-test; 0 marks a just-identified model.
        """
        self.call = call
        self.coefficients = coef_table
        self.coef_names = coef_names
        self.significance = significance
        self.J = J
        self.j_pvalue = j_pvalue
        self.j_df = j_df
        self.deviance = deviance
        self.sigmasq = sigmasq
        self.y = y
        self.fitted_values = fitted_values
        self.weights = weights
        self.converged = converged

    def __str__(self) -> str:
        """Return formatted summary table with professional ASCII layout."""
        width = 60
        lines = []
        lines.append("=" * width)
        lines.append("CBPS Estimation Summary")
        lines.append("=" * width)

        # Call info
        lines.append(f"Call: {self.call}")

        # Treatment levels are computed once and shared by the sample-size
        # line and the deviance-residual block.
        y_flat = None
        levels = None
        if self.y is not None:
            y_flat = np.asarray(self.y).ravel()
            levels = np.unique(y_flat)

        # Sample info
        if self.y is not None:
            n = len(self.y)
            if len(levels) == 2:
                # The larger level counts as "treated".
                n_treated = int(np.sum(y_flat == levels[1]))
                n_control = int(np.sum(y_flat == levels[0]))
                lines.append(
                    f"N: {n} (Treated: {n_treated}, Control: {n_control})"
                )
            else:
                lines.append(f"N: {n}")

        # Convergence
        if self.converged is not None:
            lines.append(f"Converged: {'Yes' if self.converged else 'No'}")

        # Deviance residuals for binary treatment
        if (self.y is not None and self.fitted_values is not None
                and len(levels) == 2):
            fitted = np.asarray(self.fitted_values).ravel()
            # 0/1 indicator of the upper level, consistent with the
            # "Treated" count above, so treatments not literally coded
            # 0/1 do not produce NaN residuals.
            treated = (y_flat == levels[1]).astype(float)
            # Clip away from 0 and 1 so the logarithms stay finite.
            eps = 1e-10
            fitted_safe = np.clip(fitted, eps, 1 - eps)
            sign = np.where(treated == 1, 1, -1)
            deviance_resid = sign * np.sqrt(-2 * (
                treated * np.log(fitted_safe) +
                (1 - treated) * np.log(1 - fitted_safe)
            ))
            percentiles = np.percentile(
                deviance_resid, [0, 25, 50, 75, 100]
            )
            lines.append("-" * width)
            lines.append("Deviance Residuals:")
            lines.append(
                f" Min 1Q Median 3Q Max"
            )
            lines.append(
                f"{percentiles[0]:7.4f} {percentiles[1]:7.4f} "
                f"{percentiles[2]:7.4f} {percentiles[3]:7.4f} "
                f"{percentiles[4]:7.4f}"
            )

        # Coefficients
        lines.append("-" * width)
        lines.append("Coefficients:")
        # Header
        lines.append(
            f"{'':20s} {'Estimate':>10s} {'Std. Error':>10s} "
            f"{'z value':>8s} {'Pr(>|z|)':>10s}"
        )
        for i, name in enumerate(self.coef_names):
            row = self.coefficients[i]
            sig = self.significance[i]
            # Truncate long names
            display_name = name[:19] if len(name) > 19 else name
            lines.append(
                f"{display_name:20s} {row[0]:10.4f} {row[1]:10.4f} "
                f"{row[2]:8.3f} {row[3]:10.3e} {sig}"
            )
        lines.append("---")
        lines.append(
            "Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1"
        )

        # Sigma-squared (continuous treatment only)
        if self.sigmasq is not None:
            lines.append(f"\nSigma-Squared: {self.sigmasq}")

        # Diagnostics
        lines.append("-" * width)
        lines.append("Diagnostics:")
        # Format J-statistic line based on validity and identification status
        if self.J is None or not np.isfinite(self.J) or self.J < 0:
            lines.append(" J-statistic: N/A")
        elif self.j_df is not None and self.j_df == 0:
            lines.append(
                f" J-statistic: {self.J:.4f} "
                f"(just-identified, no overid test)"
            )
        elif (self.j_pvalue is not None
              and np.isfinite(self.j_pvalue)
              and self.j_df is not None
              and self.j_df > 0):
            lines.append(
                f" J-statistic: {self.J:.4f} "
                f"(df={self.j_df}, p={self.j_pvalue:.4f})"
            )
        elif self.j_df is None:
            lines.append(
                f" J-statistic: {self.J:.4f} (df not available)"
            )
        else:
            lines.append(f" J-statistic: {self.J:.4f}")
        lines.append(f" Log-Likelihood: {-0.5 * self.deviance:.4f}")

        if self.weights is not None:
            w = self.weights
            # Effective sample size: (sum w)^2 / sum(w^2)
            ess = (w.sum() ** 2) / (w ** 2).sum()
            n_total = len(w)
            lines.append(
                f" Effective Sample Size: {ess:.1f} / {n_total} "
                f"({100 * ess / n_total:.1f}%)"
            )
            lines.append(
                f" Weights: min={w.min():.4f}, "
                f"max={w.max():.4f}, mean={w.mean():.4f}"
            )

        lines.append("=" * width)
        return "\n".join(lines)

    def __repr__(self) -> str:
        """Return concise representation (``J=None`` when J is missing)."""
        # __str__ already tolerates a missing J; keep repr() usable too
        # instead of raising TypeError on the float format spec.
        j_text = "None" if self.J is None else f"{self.J:.6f}"
        return f"CBPSSummary(k={len(self.coef_names)}, J={j_text})"

    @property
    def coef(self) -> np.ndarray:
        """
        Coefficient estimates (convenience property).

        Returns
        -------
        ndarray
            Coefficient vector, equivalent to ``self.coefficients[:, 0]``.

        Examples
        --------
        >>> summ = fit.summary()
        >>> summ.coef  # Convenient access
        array([...])
        >>> summ.coefficients[:, 0]  # Original access (still supported)
        array([...])
        """
        return self.coefficients[:, 0]

    @property
    def se(self) -> np.ndarray:
        """
        Standard errors of coefficient estimates (convenience property).

        Returns
        -------
        ndarray
            Standard error vector, equivalent to ``self.coefficients[:, 1]``.

        Notes
        -----
        Aligns with statsmodels API: ``fit.bse`` (standard error of coefficients).

        Examples
        --------
        >>> summ.se  # Convenient access
        array([...])
        """
        return self.coefficients[:, 1]

    @property
    def zvalues(self) -> np.ndarray:
        """
        Z-statistics for coefficient estimates (convenience property).

        Returns
        -------
        ndarray
            z-statistic vector, equivalent to ``self.coefficients[:, 2]``.

        Notes
        -----
        Aligns with statsmodels API: ``fit.tvalues`` (t-statistic, z for large samples).

        Examples
        --------
        >>> summ.zvalues  # Convenient access
        array([...])
        """
        return self.coefficients[:, 2]

    @property
    def pvalues(self) -> np.ndarray:
        """
        Two-sided p-values for coefficient estimates (convenience property).

        Returns
        -------
        ndarray
            p-value vector, equivalent to ``self.coefficients[:, 3]``.

        Notes
        -----
        Two-sided test: p = 2 * (1 - Phi(abs(z)))

        Aligns with statsmodels API: ``fit.pvalues``.

        Examples
        --------
        >>> summ.pvalues  # Convenient access
        array([...])
        """
        return self.coefficients[:, 3]
|