cbps 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. cbps/__init__.py +3462 -0
  2. cbps/constants.py +46 -0
  3. cbps/core/__init__.py +93 -0
  4. cbps/core/cbps_binary.py +1943 -0
  5. cbps/core/cbps_continuous.py +945 -0
  6. cbps/core/cbps_multitreat.py +1123 -0
  7. cbps/core/cbps_optimal.py +507 -0
  8. cbps/core/results.py +1447 -0
  9. cbps/data/Blackwell.csv +571 -0
  10. cbps/data/LaLonde.csv +3213 -0
  11. cbps/data/npcbps_continuous_sim.csv +501 -0
  12. cbps/data/nsw.csv +723 -0
  13. cbps/data/nsw_dw.csv +446 -0
  14. cbps/data/political_ads_urban_niebler.csv +16266 -0
  15. cbps/data/psid_controls.csv +2491 -0
  16. cbps/data/psid_controls2.csv +254 -0
  17. cbps/data/psid_controls3.csv +129 -0
  18. cbps/data/simulation_dgp1_seed12345.csv +201 -0
  19. cbps/data/simulation_dgp2_seed12345.csv +201 -0
  20. cbps/data/simulation_dgp3_seed12345.csv +201 -0
  21. cbps/data/simulation_dgp4_seed12345.csv +201 -0
  22. cbps/datasets/__init__.py +78 -0
  23. cbps/datasets/blackwell.py +112 -0
  24. cbps/datasets/continuous.py +223 -0
  25. cbps/datasets/lalonde.py +272 -0
  26. cbps/datasets/npcbps_sim.py +101 -0
  27. cbps/diagnostics/__init__.py +101 -0
  28. cbps/diagnostics/balance.py +760 -0
  29. cbps/diagnostics/balance_cbmsm_addon.py +162 -0
  30. cbps/diagnostics/continuous_diagnostics.py +259 -0
  31. cbps/diagnostics/normality.py +173 -0
  32. cbps/diagnostics/ocbps_conditions.py +197 -0
  33. cbps/diagnostics/overlap.py +198 -0
  34. cbps/diagnostics/plots.py +1193 -0
  35. cbps/diagnostics/weights_diag.py +205 -0
  36. cbps/highdim/__init__.py +84 -0
  37. cbps/highdim/gmm_loss.py +340 -0
  38. cbps/highdim/hdcbps.py +1078 -0
  39. cbps/highdim/lasso_utils.py +498 -0
  40. cbps/highdim/weight_funcs.py +298 -0
  41. cbps/inference/__init__.py +42 -0
  42. cbps/inference/asyvar.py +621 -0
  43. cbps/inference/vcov_outcome.py +217 -0
  44. cbps/iv/__init__.py +48 -0
  45. cbps/iv/cbiv.py +2603 -0
  46. cbps/logging_config.py +45 -0
  47. cbps/msm/__init__.py +45 -0
  48. cbps/msm/cbmsm.py +1871 -0
  49. cbps/msm/rank_diagnostics.py +112 -0
  50. cbps/nonparametric/__init__.py +58 -0
  51. cbps/nonparametric/cholesky_whitening.py +232 -0
  52. cbps/nonparametric/empirical_likelihood.py +339 -0
  53. cbps/nonparametric/npcbps.py +1036 -0
  54. cbps/nonparametric/taylor_approx.py +207 -0
  55. cbps/py.typed +0 -0
  56. cbps/sklearn/__init__.py +42 -0
  57. cbps/sklearn/estimator.py +378 -0
  58. cbps/utils/__init__.py +82 -0
  59. cbps/utils/formula.py +415 -0
  60. cbps/utils/helpers.py +378 -0
  61. cbps/utils/numerics.py +438 -0
  62. cbps/utils/r_compat.py +109 -0
  63. cbps/utils/validation.py +224 -0
  64. cbps/utils/variance_transform.py +483 -0
  65. cbps/utils/weights.py +586 -0
  66. cbps-0.2.0.dist-info/METADATA +1090 -0
  67. cbps-0.2.0.dist-info/RECORD +70 -0
  68. cbps-0.2.0.dist-info/WHEEL +5 -0
  69. cbps-0.2.0.dist-info/licenses/LICENSE +661 -0
  70. cbps-0.2.0.dist-info/top_level.txt +1 -0
cbps/core/results.py ADDED
@@ -0,0 +1,1447 @@
1
+ """
2
+ Result Classes for Covariate Balancing Propensity Score Estimation
3
+
4
+ ==================================================================
5
+
6
+ This module implements the primary result containers for CBPS estimators,
7
+ providing a unified interface for accessing estimation results, conducting
8
+ statistical inference, and performing diagnostic assessments.
9
+
10
+ The module contains two main classes:
11
+
12
+ - :class:`CBPSResults`: Main result object containing all fitted model components
13
+ - :class:`CBPSSummary`: Statistical summary with coefficient table and diagnostics
14
+
15
+ These classes implement a comprehensive statistical modeling interface with
16
+ methods for inference, prediction, and diagnostic evaluation, maintaining
17
+ compatibility with established statistical software conventions while following
18
+ Python best practices.
19
+
20
+ Mathematical Framework
21
+ ----------------------
22
+
23
+ The CBPS estimator solves the generalized method of moments (GMM) optimization
24
+ problem:
25
+
26
+ min_β ḡ(β)' Σ^(-1) ḡ(β)
27
+
28
+ where ḡ(β) is the sample average of moment conditions combining:
29
+
30
+ 1. Score function: ∂ℓ(β)/∂β for treatment prediction
31
+ 2. Balance conditions: (T_i - e(X_i,β))X_i for covariate balance
32
+
33
+ The resulting weights are:
34
+
35
+ w_i = T_i/e(X_i,β) - (1-T_i)/(1-e(X_i,β))
36
+
37
+ which satisfy the moment conditions E[w_i X_i] = 0 when correctly specified.
38
+
39
+ References
40
+ ----------
41
+ Imai, K. and Ratkovic, M. (2014). Covariate balancing propensity score.
42
+ Journal of the Royal Statistical Society, Series B 76(1), 243-263.
43
+ https://doi.org/10.1111/rssb.12027
44
+ """
45
+
46
+ from typing import Optional, List, Union
47
+
48
+ import numpy as np
49
+ import pandas as pd
50
+ import scipy.stats
51
+ import warnings
52
+
53
+
54
def j_test_pvalue(J, n_moment_conditions, n_parameters):
    """Compute the asymptotic p-value for Hansen's J-test of overidentification.

    Under H0 (model correctly specified):
        J ~ chi2(df), where df = n_moment_conditions - n_parameters

    Parameters
    ----------
    J : float
        Hansen's J-statistic value. Must be non-negative (quadratic form).
    n_moment_conditions : int
        Number of moment conditions in the GMM system.
    n_parameters : int
        Number of estimated parameters.

    Returns
    -------
    float or None
        Upper-tail probability of a chi-squared distribution with ``df``
        degrees of freedom evaluated at ``J``.
        Returns None when ``df <= 0`` (no over-identifying restrictions,
        so there is nothing to test).

    Raises
    ------
    ValueError
        If J is NaN or negative (indicates an upstream computation error).
        This check runs before the degrees-of-freedom check, so an invalid
        J is reported even for a just-identified system.

    References
    ----------
    Imai, K. & Ratkovic, M. (2014), JRSSB. Section 3.
    Hansen, L.P. (1982), Econometrica.
    """
    # Validate the J-statistic before anything else.
    if np.isnan(J):
        raise ValueError(
            "J-statistic is NaN. This indicates an error in the GMM "
            "objective computation. Check for numerical issues in estimation."
        )
    if J < 0:
        raise ValueError(
            f"J-statistic must be non-negative (it is a quadratic form), "
            f"got J={J:.6g}. This indicates an error in the GMM objective "
            f"computation."
        )

    df = n_moment_conditions - n_parameters
    if df <= 0:
        return None  # Just-identified: no overidentification test

    # Use the survival function rather than ``1 - cdf``: for large J the cdf
    # rounds to exactly 1.0 and the subtraction would return 0.0, losing the
    # (tiny but non-zero) tail probability.
    return scipy.stats.chi2.sf(J, df)
103
+
104
+
105
+ class CBPSResults:
106
+ """
107
+ Result object from CBPS estimation.
108
+
109
+ This class encapsulates all outputs from the CBPS fitting procedure,
110
+ providing a unified interface for accessing coefficients, weights,
111
+ fitted propensity scores, and diagnostic statistics.
112
+
113
+ Attributes
114
+ ----------
115
+ coefficients : ndarray, shape (k, 1) or (k, n_treats-1)
116
+ Estimated propensity score model coefficients.
117
+ For binary treatment: (k, 1) matrix.
118
+ For multi-valued treatment: (k, n_treats-1) matrix.
119
+ fitted_values : ndarray, shape (n,)
120
+ Fitted propensity scores for each observation.
121
+ weights : ndarray, shape (n,)
122
+ Optimal inverse probability weights for causal effect estimation.
123
+ linear_predictor : ndarray, shape (n,)
124
+ Linear predictor X @ beta before link function transformation.
125
+ y : ndarray, shape (n,)
126
+ Treatment assignment vector.
127
+ x : ndarray, shape (n, k)
128
+ Covariate matrix including intercept.
129
+ J : float
130
+ Hansen J-statistic for the GMM over-identification test.
131
+ mle_J : float
132
+ J-statistic evaluated at MLE estimates (baseline comparison).
133
+ deviance : float
134
+ Model deviance (-2 * log-likelihood).
135
+ converged : bool
136
+ Whether the optimization algorithm converged successfully.
137
+ var : ndarray, shape (k, k)
138
+ Variance-covariance matrix of coefficients (sandwich estimator).
139
+ coef_names : list of str
140
+ Names of coefficients extracted from the model formula.
141
+ call_info : str
142
+ String representation of the function call.
143
+ formula : str or None
144
+ The model formula used for fitting (if formula interface was used).
145
+ att : int or None
146
+ Target estimand: 0 for ATE, 1 for ATT.
147
+ method : str or None
148
+ Estimation method: 'over' (over-identified) or 'exact' (just-identified).
149
+ standardize : bool or None
150
+ Whether weights are standardized to sum to sample size.
151
+ two_step : bool or None
152
+ Whether two-step GMM estimator was used.
153
+ sigmasq : float or None
154
+ Residual variance estimate (continuous treatment only).
155
+ Ttilde : ndarray or None
156
+ Standardized treatment (zero mean, unit variance) for continuous treatment.
157
+ Used by vcov_outcome for variance estimation.
158
+ Xtilde : ndarray or None
159
+ Cholesky-whitened covariates for continuous treatment.
160
+ Used by vcov_outcome for variance estimation.
161
+ beta_tilde : ndarray or None
162
+ Coefficients in whitened space for continuous treatment.
163
+ sigmasq_tilde : float or None
164
+ Variance in whitened space for continuous treatment.
165
+ treat_names : list of str or None
166
+ Treatment level names for multi-valued treatment.
167
+ Example: ['Control', 'Low', 'High'] for 3-valued treatment.
168
+ na_action : dict or None
169
+ Missing value handling information containing:
170
+ - 'method': handling method ('omit', 'fail', 'ignore')
171
+ - 'n_dropped': number of dropped observations (only for method='omit')
172
+
173
+ Examples
174
+ --------
175
+ >>> fit = CBPS('treat ~ age + educ', data=lalonde, att=1)
176
+ >>> summ = fit.summary() # Compute summary statistics
177
+ >>> print(summ) # Print full coefficient table
178
+ >>> vcov_mat = fit.vcov() # Get variance-covariance matrix
179
+ >>> print(fit) # Concise output
180
+ """
181
+
182
def __init__(
    self,
    # Core estimation results
    coefficients: np.ndarray,
    fitted_values: np.ndarray,
    weights: np.ndarray,
    linear_predictor: np.ndarray,
    y: np.ndarray,
    x: np.ndarray,
    J: float,
    mle_J: float,
    deviance: float,
    converged: bool,
    var: np.ndarray,
    nulldeviance: Optional[float] = None,

    # Metadata
    coef_names: Optional[List[str]] = None,
    call_info: Optional[str] = None,
    formula: Optional[str] = None,
    data: Optional[pd.DataFrame] = None,
    terms: Optional[str] = None,
    model: Optional[np.ndarray] = None,
    xlevels: Optional[dict] = None,

    # Input parameters
    att: Optional[int] = None,
    method: Optional[str] = None,
    standardize: Optional[bool] = None,
    two_step: Optional[bool] = None,

    # Continuous treatment specific
    sigmasq: Optional[float] = None,
    Ttilde: Optional[np.ndarray] = None,
    Xtilde: Optional[np.ndarray] = None,
    beta_tilde: Optional[np.ndarray] = None,
    sigmasq_tilde: Optional[float] = None,
    stabilizers: Optional[np.ndarray] = None,

    # Multi-valued treatment specific
    treat_names: Optional[List[str]] = None,

    # Missing data handling
    na_action: Optional[dict] = None
):
    """
    Initialize CBPS result object.

    Every argument is stored unchanged on the instance under the same
    name. The exceptions are ``call_info`` (also exposed as ``call``,
    defaulting to ``"CBPS()"``) and ``coef_names`` (defaulting to
    generated names when missing or empty).

    Parameters
    ----------
    coefficients : ndarray
        Coefficient matrix, shape (k, 1) for binary or (k, n_treats-1) for multi-valued.
    fitted_values : ndarray
        Fitted propensity scores, shape (n,).
    weights : ndarray
        Optimal IPW weights, shape (n,).
    linear_predictor : ndarray
        Linear predictor X @ beta, shape (n,).
    y : ndarray
        Treatment vector (original), shape (n,).
    x : ndarray
        Covariate matrix (with intercept), shape (n, k).
    J : float
        Hansen J-statistic (over-identification test).
    mle_J : float
        MLE baseline J-statistic.
    deviance : float
        Negative 2 times log-likelihood.
    converged : bool
        Optimization convergence status.
    var : ndarray
        Coefficient variance-covariance matrix, shape (k, k).
    nulldeviance : float, optional
        Null model deviance for pseudo R-squared calculation.
    coef_names : list, optional
        Coefficient names from formula. When None or empty, names are
        generated by ``_default_coef_names()``.
    call_info : str, optional
        Call information string.
    formula : str, optional
        Model formula string.

    Raises
    ------
    ValueError
        If ``coefficients`` is not a 2D array.

    Notes
    -----
    All parameters are typically passed from internal fitting routines
    and should not be constructed manually by users.

    The coefficients must be a 2D matrix:
    - Binary treatment: (k, 1)
    - 3-valued treatment: (k, 2)
    - 4-valued treatment: (k, 3)
    """
    # Validate the coefficient shape first: the default-name generation
    # below indexes ``coefficients.shape[0]`` and would otherwise fail with
    # an unrelated IndexError for a 0-d input.
    if coefficients.ndim != 2:
        raise ValueError(
            f"coefficients must be 2D array, got shape {coefficients.shape}. "
            f"Expected (k, 1) for binary or (k, n_treats-1) for multi-valued."
        )

    # Core estimation results
    self.coefficients = coefficients
    self.fitted_values = fitted_values
    self.weights = weights
    self.linear_predictor = linear_predictor
    self.y = y
    self.x = x
    self.J = J
    self.mle_J = mle_J
    self.deviance = deviance
    self.nulldeviance = nulldeviance
    self.converged = converged
    self.var = var

    # Metadata
    self.call_info = call_info or "CBPS()"
    self.call = call_info or "CBPS()"
    # Explicit None/length test instead of ``coef_names or ...`` so that
    # array-like names (whose truth value is ambiguous) are accepted.
    if coef_names is not None and len(coef_names) > 0:
        self.coef_names = coef_names
    else:
        self.coef_names = self._default_coef_names()
    self.formula = formula
    self.data = data
    self.terms = terms
    self.model = model
    self.xlevels = xlevels

    # Input parameters
    self.att = att
    self.method = method
    self.standardize = standardize
    self.two_step = two_step

    # Continuous treatment specific
    self.sigmasq = sigmasq
    self.Ttilde = Ttilde
    self.Xtilde = Xtilde
    self.beta_tilde = beta_tilde
    self.sigmasq_tilde = sigmasq_tilde
    self.stabilizers = stabilizers

    # Multi-valued treatment specific
    self.treat_names = treat_names

    # Missing data handling
    self.na_action = na_action
323
+
324
def _default_coef_names(self) -> List[str]:
    """Build placeholder coefficient names from the coefficient row count.

    Returns an empty list when there are no rows; otherwise ``"Intercept"``
    followed by ``"X1"``, ``"X2"``, ... for the remaining rows.
    """
    n_rows = self.coefficients.shape[0]
    names: List[str] = []
    if n_rows != 0:
        names.append("Intercept")
        names.extend(f"X{idx}" for idx in range(1, n_rows))
    return names
330
+
331
def __getstate__(self):
    """Support pickle serialization.

    Returns a shallow copy of the instance ``__dict__``. The ``terms`` and
    ``data`` entries are each trial-pickled; an entry that cannot be
    pickled (e.g., patsy DesignInfo) is replaced by None in the returned
    state. The live instance itself is not modified.
    """
    import pickle

    state = self.__dict__.copy()

    # patsy DesignInfo does not support pickle
    # (see https://github.com/pydata/patsy/issues/26).
    # PicklingError is included because it is the standard way pickle
    # reports an unpicklable object; without it such an entry would abort
    # serialization of the whole result instead of being dropped.
    unpicklable_errors = (
        NotImplementedError,
        TypeError,
        AttributeError,
        pickle.PicklingError,
    )
    for key in ('terms', 'data'):
        candidate = state.get(key)
        if candidate is None:
            continue
        try:
            pickle.dumps(candidate)
        except unpicklable_errors:
            state[key] = None
    return state
350
+
351
def __setstate__(self, state):
    """Restore a pickled instance by merging ``state`` into its attributes."""
    attrs = self.__dict__
    attrs.update(state)
354
+
355
def vcov(self) -> np.ndarray:
    """
    Return the stored variance-covariance matrix of the coefficients.

    Returns
    -------
    ndarray
        The ``var`` attribute, returned as-is (not copied or recomputed).

    Raises
    ------
    ValueError
        If ``var`` is None.

    Warns
    -----
    UserWarning
        If the condition number of ``var`` exceeds 1e10, or if computing
        the condition number raises ``numpy.linalg.LinAlgError``.

    Notes
    -----
    The condition-number check is diagnostic only; the matrix is returned
    regardless of its outcome.
    """
    cov = self.var
    if cov is None:
        raise ValueError(
            "Variance-covariance matrix not computed. "
            "This may indicate a fitting error."
        )

    # Diagnose near-collinearity via the condition number.
    try:
        kappa = np.linalg.cond(cov)
    except np.linalg.LinAlgError:
        warnings.warn(
            "Failed to compute condition number of variance-covariance matrix. "
            "Matrix may be singular or near-singular.",
            UserWarning,
            stacklevel=2
        )
    else:
        if kappa > 1e10:
            message = (
                f"Variance-covariance matrix has high condition number ({kappa:.2e}). "
                f"This suggests near-collinearity among covariates. "
                f"Standard errors may be unreliable. "
                f"Consider:\n"
                f" 1. Removing highly correlated covariates\n"
                f" 2. Using regularization (e.g., hdCBPS)\n"
                f" 3. Checking for perfect collinearity with np.linalg.matrix_rank(X)"
            )
            warnings.warn(message, UserWarning, stacklevel=2)

    return cov
416
+
417
@property
def residuals(self) -> np.ndarray:
    """
    Model residuals (observed minus fitted values).

    Returns
    -------
    ndarray
        Residual vector or matrix depending on which branch applies:

        - ``Ttilde`` is set: ``Ttilde - linear_predictor`` (flattened)
        - ``fitted_values`` is 1D or has a single column:
          ``y - fitted_values`` (flattened)
        - otherwise: one-hot encoded ``y`` minus ``fitted_values``, where
          ``y`` must convert to integers in ``[0, n_columns)``

    Raises
    ------
    NotImplementedError
        In the multi-column case, when ``y`` cannot be converted to
        integers or its values fall outside ``[0, n_columns)``.
    """
    # Continuous treatment
    if self.Ttilde is not None:
        return self.Ttilde - self.linear_predictor.ravel()

    # Binary treatment
    fitted = self.fitted_values
    if fitted.ndim == 1 or (fitted.ndim == 2 and fitted.shape[1] == 1):
        return self.y - fitted.ravel()

    # Multi-valued treatment
    unsupported = "Residuals not supported for this multi-valued treatment format"
    n_samples = len(self.y)
    n_classes = fitted.shape[1]

    # Only the integer conversion is expected to fail (e.g. string labels);
    # keep the try narrow and chain the cause so the reason is not lost.
    try:
        y_int = self.y.astype(int)
    except (TypeError, ValueError) as exc:
        raise NotImplementedError(unsupported) from exc

    if not (y_int.min() >= 0 and y_int.max() < n_classes):
        raise NotImplementedError(unsupported)

    y_onehot = np.zeros((n_samples, n_classes))
    y_onehot[np.arange(n_samples), y_int] = 1
    return y_onehot - fitted
455
+
456
@property
def pseudo_r2(self) -> Optional[float]:
    """
    Pseudo R-squared computed from the model and null deviances.

    Returns
    -------
    float or None
        ``1 - deviance / nulldeviance``.
        None when ``nulldeviance`` is None or equal to zero.

    Notes
    -----
    A value of 0 means the deviance equals the null deviance (no
    improvement); values closer to 1 mean a smaller deviance relative to
    the null deviance.

    Examples
    --------
    >>> fit = CBPS('treat ~ age + educ', data=data)
    >>> print(f"Pseudo R²: {fit.pseudo_r2:.4f}")
    """
    null_dev = self.nulldeviance
    if null_dev is not None and null_dev != 0:
        return 1.0 - self.deviance / null_dev
    return None
486
+
487
def balance(self, **kwargs):
    """
    Compute covariate balance statistics for this fit.

    Thin wrapper around the package-level ``balance()`` function: it
    packs ``weights``, ``x``, ``y``, ``fitted_values`` and ``coefficients``
    into a dict and forwards that dict together with ``**kwargs``.

    Parameters
    ----------
    **kwargs
        Passed through unchanged to ``cbps.balance()``. See that
        function's documentation for the accepted options.

    Returns
    -------
    object
        Whatever ``cbps.balance()`` returns for the packed dict.

    Examples
    --------
    >>> fit = CBPS('treat ~ age + educ', data=df, att=1)
    >>> bal = fit.balance()
    """
    # Imported lazily, inside the method, as in the original code.
    from cbps import balance as balance_func

    field_names = ('weights', 'x', 'y', 'fitted_values', 'coefficients')
    payload = {name: getattr(self, name) for name in field_names}
    return balance_func(payload, **kwargs)
541
+
542
@property
def coef(self) -> np.ndarray:
    """
    Coefficients as a 1D array (convenience accessor).

    Returns
    -------
    ndarray
        - ``coefficients`` itself when it is already 1D
        - ``coefficients[:, 0]`` when it has exactly one column
        - ``coefficients.ravel()`` (all entries flattened) otherwise

    Notes
    -----
    The full coefficient matrix remains available via ``fit.coefficients``.

    Examples
    --------
    >>> fit = CBPS('treat ~ age + educ', data=df, att=1)
    >>> fit.coef  # Convenient 1D access
    array([0.123, 0.456, -0.789])
    >>> fit.coefficients  # Full 2D matrix
    array([[0.123],
           [0.456],
           [-0.789]])
    """
    matrix = self.coefficients
    if matrix.ndim == 1:
        return matrix
    if matrix.shape[1] == 1:
        return matrix[:, 0]
    return matrix.ravel()
581
+
582
@property
def fitted(self) -> np.ndarray:
    """
    Short alias for the ``fitted_values`` attribute.

    Returns
    -------
    ndarray
        The same object stored in ``fitted_values``.

    Examples
    --------
    >>> fit = CBPS('treat ~ age + educ', data=df)
    >>> fit.fitted is fit.fitted_values
    True
    """
    values = self.fitted_values
    return values
606
+
607
@property
def J_stat(self) -> float:
    """
    Alias for the ``J`` attribute (Hansen's J-statistic).

    Returns
    -------
    float
        The value stored in ``J``.

    Examples
    --------
    >>> fit = CBPS('treat ~ age + educ', data=df)
    >>> assert fit.J == fit.J_stat
    """
    statistic = self.J
    return statistic
632
+
633
@property
def sigma_squared(self) -> Optional[float]:
    """
    Alias for the ``sigmasq`` attribute.

    Returns
    -------
    float or None
        The value stored in ``sigmasq``, or None when the instance has no
        such attribute.

    Examples
    --------
    >>> fit_cont = CBPS('dose ~ age + educ', data=df)
    >>> sigma2 = fit_cont.sigma_squared
    """
    try:
        return self.sigmasq
    except AttributeError:
        return None
660
+
661
def predict(self, newdata: Optional[Union[pd.DataFrame, np.ndarray]] = None, type: str = 'response') -> np.ndarray:
    """
    Predict on the response or link scale, for new or training data.

    Parameters
    ----------
    newdata : DataFrame, ndarray, or None
        Data to predict for. When None, the stored training-data results
        are returned. Otherwise it is converted to a design matrix by
        ``_prepare_newdata``.
    type : {'response', 'link'}, default='response'
        - 'response': result of ``_apply_link_function`` on the linear
          predictor (or ``fitted_values`` when ``newdata`` is None)
        - 'link': linear predictor ``X_new @ coefficients`` (or
          ``linear_predictor`` when ``newdata`` is None)

    Returns
    -------
    ndarray
        Predicted values. For ``type='link'`` with new data, a
        single-column result is flattened to 1D.

    Raises
    ------
    ValueError
        If ``type`` is not 'response' or 'link'.

    Examples
    --------
    >>> fit = CBPS('treat ~ x1 + x2', data=train_df)
    >>> pred = fit.predict(test_df)
    >>> linear_pred = fit.predict(test_df, type='link')
    >>> fitted = fit.predict()  # Same object as fit.fitted_values
    """
    valid_types = {'response', 'link'}
    if type not in valid_types:
        raise ValueError(
            f"Invalid type: '{type}'. Must be one of {valid_types}."
        )

    want_link = type == 'link'

    # No new data: hand back what was stored at fit time.
    if newdata is None:
        return self.linear_predictor if want_link else self.fitted_values

    design = self._prepare_newdata(newdata)
    eta = design @ self.coefficients

    if not want_link:
        return self._apply_link_function(eta)

    # Link scale: collapse an (n, 1) column to a flat vector.
    if eta.ndim == 2 and eta.shape[1] == 1:
        return eta.ravel()
    return eta
742
+
743
def _prepare_newdata(self, newdata: Union[pd.DataFrame, np.ndarray]) -> np.ndarray:
    """Turn ``newdata`` into a design matrix usable for prediction.

    When both ``formula`` and ``terms`` are set, ``newdata`` must be a
    DataFrame and the matrix is built with ``patsy.dmatrix(self.terms, ...)``.
    Otherwise ``newdata`` is converted with ``np.asarray`` (a 1D input
    becomes a single row) and its column count is checked against
    ``self.x``.

    Raises
    ------
    TypeError
        Formula path with a non-DataFrame ``newdata``.
    ValueError
        Formula path when building the design matrix fails, or array path
        when the column count differs from the training matrix.
    """
    import pandas as pd

    formula_mode = self.formula is not None and self.terms is not None

    if not formula_mode:
        matrix = np.asarray(newdata)
        if matrix.ndim == 1:
            matrix = matrix.reshape(1, -1)

        n_expected = self.x.shape[1]
        if matrix.shape[1] != n_expected:
            raise ValueError(
                f"newdata has {matrix.shape[1]} columns, "
                f"but model was trained with {n_expected} columns. "
                f"Expected shape: (n_new, {n_expected})"
            )
        return matrix

    if not isinstance(newdata, pd.DataFrame):
        raise TypeError(
            f"When using formula interface, newdata must be a DataFrame. "
            f"Got {type(newdata).__name__}."
        )

    try:
        from patsy import dmatrix
        design_frame = dmatrix(self.terms, newdata, return_type='dataframe')
        matrix = design_frame.values
    except Exception as e:
        raise ValueError(
            f"Failed to build design matrix from newdata using formula '{self.formula}'. "
            f"Error: {str(e)}\n"
            f"Make sure newdata contains all variables used in the formula."
        ) from e

    return matrix
777
+
778
def _apply_link_function(self, linear_pred: np.ndarray) -> np.ndarray:
    """Apply inverse link function to convert linear predictor to response scale.

    Branch selection:

    - ``sigmasq`` is set (continuous treatment): identity, flattened
    - coefficients have one column (binary): logistic ``expit``, flattened
    - coefficients have several columns (multi-valued): multinomial
      logistic with an implicit zero-score baseline, returned as
      ``[baseline, others...]`` columns

    The continuous check comes first because a continuous model may also
    carry single-column coefficients and would otherwise be sent through
    the logistic link.

    Raises
    ------
    ValueError
        If none of the branches applies to the coefficient shape.
    """
    coef_shape = self.coefficients.shape

    # Continuous treatment: identity link
    if getattr(self, 'sigmasq', None) is not None:
        return linear_pred.ravel()

    # Binary treatment: logistic link
    if len(coef_shape) == 2 and coef_shape[1] == 1:
        from scipy.special import expit
        return expit(linear_pred).ravel()

    # Multi-valued treatment: multinomial logistic (softmax)
    if len(coef_shape) == 2 and coef_shape[1] > 1:
        # Shift by the row-wise maximum score (including the baseline's
        # implicit 0) so np.exp cannot overflow; the probabilities are
        # mathematically unchanged by the shift.
        shift = np.maximum(linear_pred.max(axis=1, keepdims=True), 0.0)
        exp_pred = np.exp(linear_pred - shift)
        exp_baseline = np.exp(-shift)
        denom = exp_baseline + exp_pred.sum(axis=1, keepdims=True)
        prob_baseline = exp_baseline / denom
        prob_others = exp_pred / denom
        return np.column_stack([prob_baseline, prob_others])

    raise ValueError(
        f"Cannot determine treatment type from coefficients shape {coef_shape}. "
        f"Expected (k, 1) for binary/continuous or (k, K-1) for multi-valued."
    )
804
+
805
+
806
+ def plot_deviance_residuals(self, ax=None, **kwargs):
807
+ """
808
+ Plot deviance residual diagnostics (binary treatment only).
809
+
810
+ Generates a 2x2 panel of diagnostic plots:
811
+
812
+ 1. Residuals vs Fitted: Check for non-linearity and heteroscedasticity
813
+ 2. Q-Q Plot: Assess normality of residuals
814
+ 3. Scale-Location: Check homoscedasticity assumption
815
+ 4. Residuals vs Leverage: Identify influential observations
816
+
817
+ Parameters
818
+ ----------
819
+ ax : matplotlib.axes.Axes, optional
820
+ Axes object for plotting. If None, creates a new figure.
821
+ **kwargs : dict
822
+ Additional arguments passed to matplotlib plotting functions.
823
+
824
+ Returns
825
+ -------
826
+ fig : matplotlib.figure.Figure
827
+ The figure object.
828
+ axes : ndarray of matplotlib.axes.Axes
829
+ Array of axes objects (2x2).
830
+
831
+ Raises
832
+ ------
833
+ ValueError
834
+ If treatment is not binary or required data is missing.
835
+ ImportError
836
+ If matplotlib is not installed.
837
+ """
838
+ try:
839
+ import matplotlib.pyplot as plt
840
+ from scipy import stats
841
+ except ImportError:
842
+ raise ImportError(
843
+ "matplotlib and scipy are required for plotting. "
844
+ "Install with: pip install matplotlib scipy"
845
+ )
846
+
847
+ if not hasattr(self, 'y') or self.y is None:
848
+ raise ValueError("Deviance residuals plot requires y (treatment) data")
849
+
850
+ y_binary = np.asarray(self.y).ravel()
851
+ unique_y = np.unique(y_binary)
852
+ if len(unique_y) != 2:
853
+ raise ValueError(
854
+ f"Deviance residuals plot only available for binary treatment. "
855
+ f"Found {len(unique_y)} unique treatment values."
856
+ )
857
+
858
+ # Compute deviance residuals
859
+ fitted_values = np.asarray(self.fitted_values).ravel()
860
+
861
+ eps = 1e-10
862
+ fitted_safe = np.clip(fitted_values, eps, 1 - eps)
863
+
864
+ sign = np.where(y_binary == 1, 1, -1)
865
+ deviance_resid = sign * np.sqrt(-2 * (
866
+ y_binary * np.log(fitted_safe) +
867
+ (1 - y_binary) * np.log(1 - fitted_safe)
868
+ ))
869
+
870
+ # Standardized residuals
871
+ std_resid = deviance_resid / np.std(deviance_resid)
872
+
873
+ # Create 2x2 subplot grid
874
+ if ax is None:
875
+ fig, axes = plt.subplots(2, 2, figsize=(12, 10))
876
+ axes = axes.ravel()
877
+ else:
878
+ fig = ax.figure
879
+ axes = [ax]
880
+ if len(axes) < 4:
881
+ raise ValueError("Need 4 axes for diagnostic plots. Pass ax=None to create new figure.")
882
+
883
+ # Panel 1: Residuals vs Fitted
884
+ axes[0].scatter(fitted_values, deviance_resid, alpha=0.5, **kwargs)
885
+ axes[0].axhline(y=0, color='r', linestyle='--', linewidth=1)
886
+
887
+ try:
888
+ from statsmodels.nonparametric.smoothers_lowess import lowess
889
+ smoothed = lowess(deviance_resid, fitted_values, frac=0.3)
890
+ axes[0].plot(smoothed[:, 0], smoothed[:, 1], 'b-', linewidth=2, label='LOWESS')
891
+ axes[0].legend()
892
+ except ImportError:
893
+ pass
894
+
895
+ axes[0].set_xlabel('Fitted values')
896
+ axes[0].set_ylabel('Deviance Residuals')
897
+ axes[0].set_title('Residuals vs Fitted')
898
+ axes[0].grid(True, alpha=0.3)
899
+
900
+ # Panel 2: Q-Q Plot
901
+ scipy.stats.probplot(deviance_resid, dist="norm", plot=axes[1])
902
+ axes[1].set_title('Normal Q-Q Plot')
903
+ axes[1].grid(True, alpha=0.3)
904
+
905
+ # Panel 3: Scale-Location
906
+ sqrt_std_resid = np.sqrt(np.abs(std_resid))
907
+ axes[2].scatter(fitted_values, sqrt_std_resid, alpha=0.5, **kwargs)
908
+
909
+ # Add LOWESS smoother
910
+ try:
911
+ from statsmodels.nonparametric.smoothers_lowess import lowess
912
+ smoothed = lowess(sqrt_std_resid, fitted_values, frac=0.3)
913
+ axes[2].plot(smoothed[:, 0], smoothed[:, 1], 'b-', linewidth=2, label='LOWESS')
914
+ axes[2].legend()
915
+ except ImportError:
916
+ pass
917
+
918
+ axes[2].set_xlabel('Fitted values')
919
+ axes[2].set_ylabel('√|Standardized Residuals|')
920
+ axes[2].set_title('Scale-Location')
921
+ axes[2].grid(True, alpha=0.3)
922
+
923
+ # Panel 4: Residuals vs Leverage
924
+ leverage = fitted_values * (1 - fitted_values)
925
+ axes[3].scatter(leverage, std_resid, alpha=0.5, **kwargs)
926
+ axes[3].axhline(y=0, color='r', linestyle='--', linewidth=1)
927
+
928
+ # Mark high-influence points
929
+ cook_threshold = 4 / len(y_binary)
930
+ high_influence = np.abs(std_resid) * leverage > cook_threshold
931
+ if np.any(high_influence):
932
+ axes[3].scatter(leverage[high_influence], std_resid[high_influence],
933
+ color='red', s=100, alpha=0.7, label='High influence')
934
+ axes[3].legend()
935
+
936
+ axes[3].set_xlabel('Leverage')
937
+ axes[3].set_ylabel('Standardized Residuals')
938
+ axes[3].set_title('Residuals vs Leverage')
939
+ axes[3].grid(True, alpha=0.3)
940
+
941
+ plt.tight_layout()
942
+ return fig, axes
943
+
944
+ def plot(self, kind='deviance', **kwargs):
945
+ """
946
+ Generate diagnostic plots for the CBPS fit.
947
+
948
+ Parameters
949
+ ----------
950
+ kind : {'deviance'}, default='deviance'
951
+ Type of diagnostic plot to generate.
952
+ Currently only 'deviance' (residual diagnostics) is supported.
953
+ **kwargs : dict
954
+ Additional arguments passed to the plotting function.
955
+
956
+ Returns
957
+ -------
958
+ fig : matplotlib.figure.Figure
959
+ axes : matplotlib.axes.Axes or array of Axes
960
+
961
+ Raises
962
+ ------
963
+ ValueError
964
+ If an unknown plot kind is specified.
965
+ """
966
+ if kind == 'deviance':
967
+ return self.plot_deviance_residuals(**kwargs)
968
+ else:
969
+ raise ValueError(
970
+ f"Unknown plot kind: '{kind}'. "
971
+ f"Available options: 'deviance'"
972
+ )
973
+ @staticmethod
974
+ def _symnum(pval: np.ndarray) -> List[str]:
975
+ """Convert p-values to significance symbols."""
976
+ symbols = []
977
+ for p in pval:
978
+ if p < 0.001:
979
+ symbols.append('***')
980
+ elif p < 0.01:
981
+ symbols.append('**')
982
+ elif p < 0.05:
983
+ symbols.append('*')
984
+ elif p < 0.1:
985
+ symbols.append('.')
986
+ else:
987
+ symbols.append(' ')
988
+ return symbols
989
+
990
def summary(self) -> 'CBPSSummary':
    """
    Compute and return a statistical summary of the CBPS fit.

    Returns
    -------
    CBPSSummary
        Summary object containing coefficient table with estimates,
        standard errors, z-values, p-values, and significance codes.

    Raises
    ------
    ValueError
        If the variance-covariance matrix was not computed (var is None),
        standard errors cannot be calculated.

    Notes
    -----
    Key implementation details:

    1. Standard errors are the square roots of the diagonal of ``self.var``.
    2. The coefficient matrix is flattened column by column (all
       covariates of the first treatment level, then the next level, ...)
       so that each estimate lines up with the label produced by
       ``_format_multitreat_names``. For a single-column matrix this is
       the same as a plain flatten.
    3. z-values are computed as coefficient / standard error.
    4. p-values are two-sided, ``2 * sf(abs(z))``. This equals
       ``2 * (1 - Phi(abs(z)))`` but does not round to exactly zero for
       large ``abs(z)``.
    5. Row names differ for binary vs multi-valued treatment.

    Examples
    --------
    >>> fit = CBPS('treat ~ age + educ', data=lalonde, att=1)
    >>> summ = fit.summary()
    >>> print(summ)          # Formatted coefficient table
    >>> summ.coef            # Coefficient estimates
    >>> summ.se              # Standard errors
    >>> summ.pvalues         # Two-sided p-values
    """
    if self.var is None:
        raise ValueError(
            "Variance-covariance matrix required for summary. "
            "Cannot compute standard errors."
        )

    std_err = np.sqrt(np.diag(self.var))
    # Column-major flatten: the row labels below are generated level by
    # level, so a row-major flatten would pair estimates with the wrong
    # labels whenever there is more than one coefficient column.
    # NOTE(review): assumes self.var is ordered the same way (stacked by
    # treatment level) -- confirm against the variance computation.
    coef = self.coefficients.ravel(order='F')
    z_value = coef / std_err
    # Survival function avoids the cancellation in 1 - cdf for large |z|.
    p_value = 2 * scipy.stats.norm.sf(np.abs(z_value))
    coef_table = np.column_stack([coef, std_err, z_value, p_value])

    significance = self._symnum(p_value)

    if self.coefficients.shape[1] == 1:
        row_names = self.coef_names
    else:
        row_names = self._format_multitreat_names()

    # Compute J-test p-value for over-identified models
    # For binary CBPS with method='over':
    #   n_moment_conditions = 2k (k score + k balance)
    #   n_parameters = k
    #   df = n_moment_conditions - n_parameters (overidentification degrees of freedom)
    k = self.coefficients.shape[0]
    if self.method == 'over':
        n_moments = 2 * k
    else:
        # Just-identified ('exact'): n_moments = k, df = 0
        n_moments = k

    j_pval = None
    j_df = n_moments - k
    # Only attempt p-value computation if J is valid
    if (self.J is not None and np.isfinite(self.J) and self.J >= 0):
        try:
            j_pval = j_test_pvalue(self.J, n_moments, k)
        except ValueError:
            # The helper rejected these inputs; report no p-value.
            j_pval = None

    return CBPSSummary(
        call=self.call_info,
        coef_table=coef_table,
        coef_names=row_names,
        significance=significance,
        J=self.J,
        j_pvalue=j_pval,
        j_df=j_df,
        deviance=self.deviance,
        sigmasq=self.sigmasq,
        y=self.y,
        fitted_values=self.fitted_values,
        weights=self.weights,
        converged=self.converged,
    )
1079
+
1080
+ def _format_multitreat_names(self) -> List[str]:
1081
+ """Format coefficient names for multi-valued treatment display."""
1082
+ row_names = []
1083
+ n_row, n_col = self.coefficients.shape
1084
+
1085
+ if self.treat_names is not None and len(self.treat_names) >= n_col:
1086
+ level_names = self.treat_names[:n_col]
1087
+ else:
1088
+ level_names = [f"Level{i}" for i in range(n_col)]
1089
+
1090
+ for i in range(n_col):
1091
+ for j in range(n_row):
1092
+ row_names.append(f"{level_names[i]}: {self.coef_names[j]}")
1093
+
1094
+ return row_names
1095
+
1096
+ def __str__(self) -> str:
1097
+ """Return formatted string representation of the CBPS fit."""
1098
+ digits = 3
1099
+ output = f"\nCall:\n {self.call_info}\n\n"
1100
+
1101
+ if self.coefficients.size > 0:
1102
+ output += "Coefficients:\n"
1103
+ coef_str = np.array2string(
1104
+ self.coefficients,
1105
+ precision=digits,
1106
+ suppress_small=True
1107
+ )
1108
+ output += coef_str + "\n"
1109
+ else:
1110
+ output += "No coefficients\n\n"
1111
+
1112
+ if self.sigmasq is not None:
1113
+ output += f"\nSigma-Squared: {self.sigmasq}\n"
1114
+
1115
+ output += f"Residual Deviance:\t{self.deviance:.{digits}g}\n"
1116
+ output += f"J-Statistic:\t\t{self.J:.{digits}g}\n"
1117
+ output += f"Log-Likelihood:\t{-0.5 * self.deviance:.{digits}g}\n"
1118
+
1119
+ # Diagnostics block
1120
+ output += f"\nDiagnostics:\n"
1121
+ output += f" Converged: {'Yes' if self.converged else 'No'}\n"
1122
+
1123
+ if self.weights is not None:
1124
+ w = self.weights
1125
+ output += f" Weight Summary:\n"
1126
+ output += f" Min: {w.min():10.4f} Max: {w.max():10.4f} Mean: {w.mean():8.4f}\n"
1127
+ ess = (w.sum() ** 2) / (w ** 2).sum()
1128
+ output += f" Effective Sample Size: {ess:.1f}\n"
1129
+
1130
+ return output
1131
+
1132
+ def __repr__(self) -> str:
1133
+ """Return concise representation for interactive display."""
1134
+ return (f"CBPSResults(n={len(self.y)}, k={self.coefficients.shape[0]}, "
1135
+ f"J={self.J:.6f}, converged={self.converged})")
1136
+
1137
+
1138
class CBPSSummary:
    """
    Summary object from CBPS estimation.

    Contains the coefficient table with estimates, standard errors,
    z-values, p-values, and significance codes. This object is returned
    by the ``summary()`` method of ``CBPSResults``.

    Attributes
    ----------
    call : str
        String representation of the fitting call.
    coefficients : ndarray, shape (k, 4)
        Coefficient table with columns: Estimate, Std. Error, z value, Pr(>z).
    coef_names : list of str
        Names of coefficients (row labels).
    significance : list of str
        Significance codes for each coefficient ('***', '**', '*', '.', ' ').
    J : float or None
        Hansen J-statistic for over-identification test.
    j_pvalue : float or None
        Asymptotic p-value for J-test (None if just-identified).
    j_df : int or None
        Degrees of freedom for J-test chi-squared distribution.
    deviance : float
        Model deviance (-2 * log-likelihood).
    sigmasq : float or None
        Residual variance (continuous treatment only, None for binary/multi-valued).
    y : ndarray or None
        Treatment variable, used for the sample-size line and the
        deviance residual quantiles.
    fitted_values : ndarray or None
        Fitted propensity scores, used for the deviance residual quantiles.
    weights : ndarray or None
        Estimated weights, used for the weight diagnostics.
    converged : bool or None
        Whether the optimization converged.

    Examples
    --------
    >>> fit = CBPS('treat ~ age + educ', data=lalonde, att=1)
    >>> summ = fit.summary()
    >>> print(summ)          # Formatted table
    >>> summ.coef            # Coefficient estimates
    >>> summ.se              # Standard errors
    >>> summ.zvalues         # z-statistics
    >>> summ.pvalues         # Two-sided p-values
    """

    def __init__(
        self,
        call: str,
        coef_table: np.ndarray,
        coef_names: List[str],
        significance: List[str],
        J: float,
        deviance: float,
        sigmasq: Optional[float] = None,
        y: Optional[np.ndarray] = None,
        fitted_values: Optional[np.ndarray] = None,
        weights: Optional[np.ndarray] = None,
        converged: Optional[bool] = None,
        j_pvalue: Optional[float] = None,
        j_df: Optional[int] = None,
    ):
        """
        Initialize summary object.

        Parameters
        ----------
        call : str
            Call information string.
        coef_table : ndarray
            Coefficient table (k x 4 matrix).
        coef_names : list
            Coefficient name list.
        significance : list
            Significance symbols list.
        J : float
            J-statistic.
        deviance : float
            Model deviance.
        sigmasq : float, optional
            Sigma squared (continuous treatment only, default None).
        y : ndarray, optional
            Treatment variable (for computing deviance residuals).
        fitted_values : ndarray, optional
            Fitted propensity scores (for computing deviance residuals).
        weights : ndarray, optional
            Estimated weights for diagnostics output.
        converged : bool, optional
            Whether the optimization converged.
        j_pvalue : float, optional
            p-value of the J-test, shown next to the J-statistic.
        j_df : int, optional
            Degrees of freedom of the J-test; 0 marks a just-identified fit.
        """
        self.call = call
        self.coefficients = coef_table
        self.coef_names = coef_names
        self.significance = significance
        self.J = J
        self.j_pvalue = j_pvalue
        self.j_df = j_df
        self.deviance = deviance
        self.sigmasq = sigmasq
        self.y = y
        self.fitted_values = fitted_values
        self.weights = weights
        self.converged = converged

    def __str__(self) -> str:
        """
        Return the formatted summary table.

        Sections, in order: header, call, sample size (with treated and
        control counts when the treatment has exactly two levels),
        convergence, deviance residual quantiles (two-level treatment
        only), coefficient table, sigma-squared when set, and diagnostics
        (J-statistic, log-likelihood, weight summary).
        """
        width = 60
        lines = []
        lines.append("=" * width)
        lines.append("CBPS Estimation Summary")
        lines.append("=" * width)

        # Call info
        lines.append(f"Call: {self.call}")

        # Sample info
        if self.y is not None:
            n = len(self.y)
            unique_y = np.unique(self.y)
            if len(unique_y) == 2:
                # The larger of the two levels is reported as "treated".
                n_treated = int(np.sum(self.y == unique_y[1]))
                n_control = int(np.sum(self.y == unique_y[0]))
                lines.append(
                    f"N: {n} (Treated: {n_treated}, Control: {n_control})"
                )
            else:
                lines.append(f"N: {n}")

        # Convergence
        if self.converged is not None:
            lines.append(f"Converged: {'Yes' if self.converged else 'No'}")

        # Deviance residuals for binary treatment
        if self.y is not None and self.fitted_values is not None:
            unique_y = np.unique(self.y)
            if len(unique_y) == 2:
                fitted = np.asarray(self.fitted_values).ravel()
                # Recode the treatment to 0/1 using the same convention
                # as the sample line above (larger level = treated).
                # Plugging other codings straight into the Bernoulli
                # log-likelihood can yield NaN residuals.
                is_treated = np.asarray(self.y).ravel() == unique_y[1]
                y_binary = is_treated.astype(float)
                # Keep probabilities away from 0 and 1 so the logs are finite.
                eps = 1e-10
                fitted_safe = np.clip(fitted, eps, 1 - eps)
                sign = np.where(is_treated, 1, -1)
                deviance_resid = sign * np.sqrt(-2 * (
                    y_binary * np.log(fitted_safe) +
                    (1 - y_binary) * np.log(1 - fitted_safe)
                ))
                percentiles = np.percentile(
                    deviance_resid, [0, 25, 50, 75, 100]
                )
                lines.append("-" * width)
                lines.append("Deviance Residuals:")
                lines.append(
                    " Min 1Q Median 3Q Max"
                )
                lines.append(
                    f"{percentiles[0]:7.4f} {percentiles[1]:7.4f} "
                    f"{percentiles[2]:7.4f} {percentiles[3]:7.4f} "
                    f"{percentiles[4]:7.4f}"
                )

        # Coefficients
        lines.append("-" * width)
        lines.append("Coefficients:")
        # Header
        lines.append(
            f"{'':20s} {'Estimate':>10s} {'Std. Error':>10s} "
            f"{'z value':>8s} {'Pr(>|z|)':>10s}"
        )
        for i, name in enumerate(self.coef_names):
            row = self.coefficients[i]
            sig = self.significance[i]
            # Truncate long names so the columns stay aligned.
            display_name = name[:19]
            lines.append(
                f"{display_name:20s} {row[0]:10.4f} {row[1]:10.4f} "
                f"{row[2]:8.3f} {row[3]:10.3e} {sig}"
            )
        lines.append("---")
        lines.append(
            "Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1"
        )

        # Sigma-squared (continuous treatment only)
        if self.sigmasq is not None:
            lines.append(f"\nSigma-Squared: {self.sigmasq}")

        # Diagnostics
        lines.append("-" * width)
        lines.append("Diagnostics:")
        # Format J-statistic line based on validity and identification status
        if self.J is None or not np.isfinite(self.J) or self.J < 0:
            lines.append(" J-statistic: N/A")
        elif self.j_df is not None and self.j_df == 0:
            lines.append(
                f" J-statistic: {self.J:.4f} "
                f"(just-identified, no overid test)"
            )
        elif (self.j_pvalue is not None
                and np.isfinite(self.j_pvalue)
                and self.j_df is not None
                and self.j_df > 0):
            lines.append(
                f" J-statistic: {self.J:.4f} "
                f"(df={self.j_df}, p={self.j_pvalue:.4f})"
            )
        elif self.j_df is None:
            lines.append(
                f" J-statistic: {self.J:.4f} (df not available)"
            )
        else:
            lines.append(f" J-statistic: {self.J:.4f}")
        lines.append(f" Log-Likelihood: {-0.5 * self.deviance:.4f}")

        if self.weights is not None:
            w = self.weights
            # Kish effective sample size.
            ess = (w.sum() ** 2) / (w ** 2).sum()
            n_total = len(w)
            lines.append(
                f" Effective Sample Size: {ess:.1f} / {n_total} "
                f"({100 * ess / n_total:.1f}%)"
            )
            lines.append(
                f" Weights: min={w.min():.4f}, "
                f"max={w.max():.4f}, mean={w.mean():.4f}"
            )

        lines.append("=" * width)
        return "\n".join(lines)

    def __repr__(self) -> str:
        """
        Return a concise representation.

        A J-statistic of None is shown as ``None`` instead of raising,
        matching ``__str__``, which also accepts a missing J.
        """
        j_text = "None" if self.J is None else f"{self.J:.6f}"
        return f"CBPSSummary(k={len(self.coef_names)}, J={j_text})"

    @property
    def coef(self) -> np.ndarray:
        """
        Coefficient estimates (convenience property).

        Returns
        -------
        ndarray
            Coefficient vector, equivalent to ``self.coefficients[:, 0]``.

        Examples
        --------
        >>> summ = fit.summary()
        >>> summ.coef                    # Convenient access
        array([...])
        >>> summ.coefficients[:, 0]      # Original access (still supported)
        array([...])
        """
        return self.coefficients[:, 0]

    @property
    def se(self) -> np.ndarray:
        """
        Standard errors of coefficient estimates (convenience property).

        Returns
        -------
        ndarray
            Standard error vector, equivalent to ``self.coefficients[:, 1]``.

        Notes
        -----
        Aligns with statsmodels API: ``fit.bse`` (standard error of coefficients).

        Examples
        --------
        >>> summ.se                      # Convenient access
        array([...])
        """
        return self.coefficients[:, 1]

    @property
    def zvalues(self) -> np.ndarray:
        """
        Z-statistics for coefficient estimates (convenience property).

        Returns
        -------
        ndarray
            z-statistic vector, equivalent to ``self.coefficients[:, 2]``.

        Notes
        -----
        Aligns with statsmodels API: ``fit.tvalues`` (t-statistic, z for large samples).

        Examples
        --------
        >>> summ.zvalues                 # Convenient access
        array([...])
        """
        return self.coefficients[:, 2]

    @property
    def pvalues(self) -> np.ndarray:
        """
        Two-sided p-values for coefficient estimates (convenience property).

        Returns
        -------
        ndarray
            p-value vector, equivalent to ``self.coefficients[:, 3]``.

        Notes
        -----
        Two-sided test: p = 2 * (1 - Phi(abs(z)))

        Aligns with statsmodels API: ``fit.pvalues``.

        Examples
        --------
        >>> summ.pvalues                 # Convenient access
        array([...])
        """
        return self.coefficients[:, 3]