cbps 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. cbps/__init__.py +3462 -0
  2. cbps/constants.py +46 -0
  3. cbps/core/__init__.py +93 -0
  4. cbps/core/cbps_binary.py +1943 -0
  5. cbps/core/cbps_continuous.py +945 -0
  6. cbps/core/cbps_multitreat.py +1123 -0
  7. cbps/core/cbps_optimal.py +507 -0
  8. cbps/core/results.py +1447 -0
  9. cbps/data/Blackwell.csv +571 -0
  10. cbps/data/LaLonde.csv +3213 -0
  11. cbps/data/npcbps_continuous_sim.csv +501 -0
  12. cbps/data/nsw.csv +723 -0
  13. cbps/data/nsw_dw.csv +446 -0
  14. cbps/data/political_ads_urban_niebler.csv +16266 -0
  15. cbps/data/psid_controls.csv +2491 -0
  16. cbps/data/psid_controls2.csv +254 -0
  17. cbps/data/psid_controls3.csv +129 -0
  18. cbps/data/simulation_dgp1_seed12345.csv +201 -0
  19. cbps/data/simulation_dgp2_seed12345.csv +201 -0
  20. cbps/data/simulation_dgp3_seed12345.csv +201 -0
  21. cbps/data/simulation_dgp4_seed12345.csv +201 -0
  22. cbps/datasets/__init__.py +78 -0
  23. cbps/datasets/blackwell.py +112 -0
  24. cbps/datasets/continuous.py +223 -0
  25. cbps/datasets/lalonde.py +272 -0
  26. cbps/datasets/npcbps_sim.py +101 -0
  27. cbps/diagnostics/__init__.py +101 -0
  28. cbps/diagnostics/balance.py +760 -0
  29. cbps/diagnostics/balance_cbmsm_addon.py +162 -0
  30. cbps/diagnostics/continuous_diagnostics.py +259 -0
  31. cbps/diagnostics/normality.py +173 -0
  32. cbps/diagnostics/ocbps_conditions.py +197 -0
  33. cbps/diagnostics/overlap.py +198 -0
  34. cbps/diagnostics/plots.py +1193 -0
  35. cbps/diagnostics/weights_diag.py +205 -0
  36. cbps/highdim/__init__.py +84 -0
  37. cbps/highdim/gmm_loss.py +340 -0
  38. cbps/highdim/hdcbps.py +1078 -0
  39. cbps/highdim/lasso_utils.py +498 -0
  40. cbps/highdim/weight_funcs.py +298 -0
  41. cbps/inference/__init__.py +42 -0
  42. cbps/inference/asyvar.py +621 -0
  43. cbps/inference/vcov_outcome.py +217 -0
  44. cbps/iv/__init__.py +48 -0
  45. cbps/iv/cbiv.py +2603 -0
  46. cbps/logging_config.py +45 -0
  47. cbps/msm/__init__.py +45 -0
  48. cbps/msm/cbmsm.py +1871 -0
  49. cbps/msm/rank_diagnostics.py +112 -0
  50. cbps/nonparametric/__init__.py +58 -0
  51. cbps/nonparametric/cholesky_whitening.py +232 -0
  52. cbps/nonparametric/empirical_likelihood.py +339 -0
  53. cbps/nonparametric/npcbps.py +1036 -0
  54. cbps/nonparametric/taylor_approx.py +207 -0
  55. cbps/py.typed +0 -0
  56. cbps/sklearn/__init__.py +42 -0
  57. cbps/sklearn/estimator.py +378 -0
  58. cbps/utils/__init__.py +82 -0
  59. cbps/utils/formula.py +415 -0
  60. cbps/utils/helpers.py +378 -0
  61. cbps/utils/numerics.py +438 -0
  62. cbps/utils/r_compat.py +109 -0
  63. cbps/utils/validation.py +224 -0
  64. cbps/utils/variance_transform.py +483 -0
  65. cbps/utils/weights.py +586 -0
  66. cbps-0.2.0.dist-info/METADATA +1090 -0
  67. cbps-0.2.0.dist-info/RECORD +70 -0
  68. cbps-0.2.0.dist-info/WHEEL +5 -0
  69. cbps-0.2.0.dist-info/licenses/LICENSE +661 -0
  70. cbps-0.2.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,112 @@
1
+ """Rank selection diagnostics for CBMSM covariate matrices.
2
+
3
+ WARNING: Automatic rank selection methods (energy ratio, information criteria)
4
+ go beyond the specification in Imai & Ratkovic (2015). These tools are provided for
5
+ sensitivity analysis only. The default fixed threshold (1e-4) should be used
6
+ for published analyses unless justified.
7
+
8
+ References
9
+ ----------
10
+ Imai, K. & Ratkovic, M. (2015). Robust estimation of inverse probability
11
+ weights for marginal structural models. JASA, 110(511), 1013-1023.
12
+ """
13
+
14
+ import numpy as np
15
+ from typing import Any, Dict, List, Optional
16
+
17
+
18
def diagnose_rank_selection(
    X_mat: np.ndarray,
    thresholds: Optional[List[float]] = None,
) -> Dict[str, Any]:
    """Compare the retained rank of a matrix under different SVD thresholds.

    Helps users assess sensitivity of CBMSM results to rank choice.
    This is a diagnostic tool only; it does NOT change the default behavior
    of the CBMSM estimator.

    Parameters
    ----------
    X_mat : np.ndarray, shape (n, k)
        Covariate matrix (mean-centered recommended). Must be 2-D.
    thresholds : list of float, optional
        Absolute SVD thresholds to compare. A singular value is retained
        when it is strictly greater than the threshold.
        Default: [1e-6, 1e-5, 1e-4, 1e-3, 1e-2].

    Returns
    -------
    dict with keys:
        - 'singular_values': np.ndarray, all singular values (descending)
        - 'total_columns': int, original number of columns k
        - 'ranks_by_threshold': dict mapping threshold -> retained rank,
          with keys in ascending threshold order
        - 'energy_by_rank': np.ndarray, cumulative variance explained
          at each rank (cumsum(s**2) / sum(s**2))
        - 'recommended_action': str, guidance for the user

    Raises
    ------
    ValueError
        If ``X_mat`` is not a 2-D array.

    Examples
    --------
    >>> import numpy as np
    >>> from cbps.msm.rank_diagnostics import diagnose_rank_selection
    >>> rng = np.random.default_rng(42)
    >>> X = rng.standard_normal((100, 5))
    >>> result = diagnose_rank_selection(X)
    >>> result['total_columns']
    5
    """
    # Threshold used for the textual recommendation below.
    default_threshold = 1e-4

    if thresholds is None:
        thresholds = [1e-6, 1e-5, 1e-4, 1e-3, 1e-2]
    # Sort once so both the empty and non-empty paths report the
    # thresholds in the same (ascending) order.
    ordered_thresholds = sorted(thresholds)

    X = np.asarray(X_mat, dtype=np.float64)
    if X.ndim != 2:
        raise ValueError(
            f"X_mat must be a 2-D array of shape (n, k); got {X.ndim}-D input."
        )
    n, k = X.shape

    # Nothing to decompose: report rank 0 for every threshold.
    if k == 0 or n == 0:
        return {
            "singular_values": np.array([], dtype=float),
            "total_columns": k,
            "ranks_by_threshold": {t: 0 for t in ordered_thresholds},
            "energy_by_rank": np.array([], dtype=float),
            "recommended_action": "No covariates provided.",
        }

    # Only the singular values are needed, so skip computing U and Vt.
    s = np.linalg.svd(X, compute_uv=False)

    # Rank for a threshold = number of singular values strictly above it.
    ranks_by_threshold = {
        t: int(np.count_nonzero(s > t)) for t in ordered_thresholds
    }

    # Cumulative energy (share of squared singular values) at each rank.
    s_sq = s ** 2
    total_energy = s_sq.sum()
    if total_energy > 0:
        energy_by_rank = np.cumsum(s_sq) / total_energy
    else:
        # All-zero matrix: avoid 0/0 and report zero energy everywhere.
        energy_by_rank = np.zeros_like(s_sq)

    # Build the recommendation from the rank at the default threshold.
    default_rank = int(np.count_nonzero(s > default_threshold))
    if default_rank == k:
        recommended_action = (
            "All singular values exceed 1e-4. The matrix appears full rank; "
            "no dimension reduction occurs with the default threshold."
        )
    elif default_rank == 0:
        recommended_action = (
            "No singular values exceed 1e-4. Consider using a smaller threshold "
            "or checking for degenerate covariates."
        )
    else:
        # default_rank >= 1 here, so the index is always valid.
        energy_at_default = energy_by_rank[default_rank - 1]
        recommended_action = (
            f"Default threshold (1e-4) retains {default_rank}/{k} components "
            f"explaining {energy_at_default:.4f} of total variance. "
            f"Verify that CBMSM estimates are stable across nearby thresholds."
        )

    return {
        "singular_values": s,
        "total_columns": k,
        "ranks_by_threshold": ranks_by_threshold,
        "energy_by_rank": energy_by_rank,
        "recommended_action": recommended_action,
    }
@@ -0,0 +1,58 @@
1
+ """
2
+ Nonparametric CBPS Module.
3
+
4
+ This subpackage implements the nonparametric covariate balancing generalized
5
+ propensity score (npCBGPS) estimator from Section 3.3 of Fong, Hazlett, and
6
+ Imai (2018). The function is named :func:`npCBPS` for API consistency with
7
+ the parametric version.
8
+
9
+ Unlike parametric CBPS, this approach does not require specifying a functional
10
+ form for the propensity score. Instead, it directly estimates inverse
11
+ probability weights by maximizing the empirical likelihood subject to
12
+ covariate balance constraints.
13
+
14
+ Main API
15
+ --------
16
+ :func:`npCBPS`
17
+ Estimate nonparametric covariate balancing weights from a formula
18
+ and DataFrame.
19
+ :class:`NPCBPSResults`
20
+ Container for estimation results including weights and diagnostics.
21
+
22
+ Submodules
23
+ ----------
24
+ :mod:`taylor_approx`
25
+ Modified logarithm with Taylor approximation for numerical stability.
26
+ :mod:`cholesky_whitening`
27
+ Covariate whitening via Cholesky decomposition.
28
+ :mod:`empirical_likelihood`
29
+ Dual optimization routines for empirical likelihood.
30
+
31
+ When to Use npCBPS
32
+ ------------------
33
+ - When you are uncertain about the correct propensity score model specification.
34
+ - When you prefer a nonparametric approach that directly targets balance.
35
+ - When computational cost is acceptable (npCBPS is slower than parametric CBPS).
36
+
37
+ References
38
+ ----------
39
+ Fong, C., Hazlett, C., and Imai, K. (2018). Covariate balancing propensity
40
+ score for a continuous treatment: Application to the efficacy of political
41
+ advertisements. The Annals of Applied Statistics, 12(1), 156-177.
42
+ https://doi.org/10.1214/17-AOAS1101
43
+ """
44
+
45
+ from .npcbps import npCBPS, NPCBPSResults
46
+ from .taylor_approx import llog, llogp
47
+ from .cholesky_whitening import cholesky_whitening
48
+ from .empirical_likelihood import get_w, log_post
49
+
50
+ __all__ = [
51
+ 'npCBPS',
52
+ 'NPCBPSResults',
53
+ 'llog',
54
+ 'llogp',
55
+ 'cholesky_whitening',
56
+ 'get_w',
57
+ 'log_post'
58
+ ]
@@ -0,0 +1,232 @@
1
+ """
2
+ Cholesky Whitening Transform for Nonparametric CBPS.
3
+
4
+ This module implements covariate whitening via Cholesky decomposition,
5
+ transforming covariates to have zero mean, unit variance, and zero
6
+ correlation. This preprocessing step is essential for the empirical
7
+ likelihood formulation in npCBPS.
8
+
9
+ Mathematical Background
10
+ -----------------------
11
+ The whitening transform orthogonalizes covariates as described in
12
+ Section 3.1 of Fong, Hazlett, and Imai (2018):
13
+
14
+ .. math::
15
+
16
+ X_i^* = S_X^{-1/2}(X_i - \\bar{X})
17
+
18
+ where :math:`\\bar{X}` is the sample mean and :math:`S_X` is the sample
19
+ covariance matrix. The Cholesky decomposition provides a numerically
20
+ stable way to obtain a whitening matrix that serves as :math:`S_X^{-1/2}`.
21
+
22
+ After whitening, :math:`\\text{Cov}(X^*) = I_K` (identity matrix), which
23
+ simplifies the covariate balancing constraints in the empirical likelihood
24
+ optimization.
25
+
26
+ References
27
+ ----------
28
+ Fong, C., Hazlett, C., and Imai, K. (2018). Covariate balancing propensity
29
+ score for a continuous treatment: Application to the efficacy of political
30
+ advertisements. The Annals of Applied Statistics, 12(1), 156-177.
31
+ https://doi.org/10.1214/17-AOAS1101
32
+
33
+ See Section 3.1 for the notation and Section 3.3.1 for the nonparametric
34
+ formulation.
35
+ """
36
+
37
+ import numpy as np
38
+ import scipy.linalg
39
+
40
+
41
def cholesky_whitening(X: np.ndarray, verify: bool = True) -> np.ndarray:
    """
    Transform covariates to have identity covariance matrix.

    Applies a two-step whitening procedure using Cholesky decomposition:

    1. **Decorrelation**: ``X' = X @ inv(chol(S_X))`` where ``S_X`` is the
       sample covariance matrix (``ddof=1``) and ``chol`` is the upper
       triangular Cholesky factor.
    2. **Standardization**: Center to zero mean and scale to unit variance.

    The result has sample covariance equal to the identity matrix (up to
    floating-point error).

    Parameters
    ----------
    X : np.ndarray of shape (n, k)
        Covariate matrix with n observations and k variables. Any
        array-like input is converted to a float64 array.
    verify : bool, default=True
        If True, verify that the output covariance equals the identity
        matrix and the column means are zero within numerical tolerance.
        Raises AssertionError on failure.

    Returns
    -------
    np.ndarray of shape (n, k)
        Whitened covariate matrix satisfying:

        - Column means are zero
        - Column standard deviations are one
        - Covariance matrix equals identity

    Raises
    ------
    ValueError
        If ``X`` is not 2-D or has fewer than two observations.
    AssertionError
        If ``verify=True`` and the whitening verification fails.
    numpy.linalg.LinAlgError
        If the covariance matrix is not positive definite.

    Notes
    -----
    **Algorithm details:**

    The covariance is factorized as ``S_X = U.T @ U`` with ``U`` upper
    triangular via ``scipy.linalg.cholesky(..., lower=False)``. The product
    ``X @ inv(U)`` is obtained with a triangular solve rather than by
    forming ``inv(U)`` explicitly.

    **Verification criteria (absolute tolerance 1e-10):**

    - Diagonal of the output covariance equals 1
    - Off-diagonal elements equal 0
    - Column means equal 0

    Examples
    --------
    >>> import numpy as np
    >>> np.random.seed(42)
    >>> X = np.random.randn(100, 3)
    >>> X_white = cholesky_whitening(X)
    >>> cov = np.cov(X_white.T, ddof=1)
    >>> np.allclose(cov, np.eye(3), atol=1e-10)
    True
    >>> np.allclose(X_white.mean(axis=0), 0, atol=1e-10)
    True
    """
    X = np.asarray(X, dtype=np.float64)
    if X.ndim != 2:
        raise ValueError(
            f"X must be a 2-D array of shape (n, k); got {X.ndim}-D input."
        )
    if X.shape[0] < 2:
        # The ddof=1 covariance is undefined for fewer than two rows.
        raise ValueError(
            "cholesky_whitening requires at least 2 observations; "
            f"got {X.shape[0]}."
        )

    # Step 1: decorrelate with the Cholesky factor of the sample covariance.
    # np.cov returns a 0-d array for a single variable; force (k, k).
    cov_X = np.atleast_2d(np.cov(X.T, ddof=1))
    chol_upper = scipy.linalg.cholesky(cov_X, lower=False)

    # Y = X @ inv(U)  <=>  U.T @ Y.T = X.T, solved as a triangular system.
    X_decorrelated = scipy.linalg.solve_triangular(
        chol_upper, X.T, trans='T', lower=False
    ).T

    # Step 2: center and scale each column so the mean is zero and the
    # sample variance is one.
    col_mean = X_decorrelated.mean(axis=0)
    col_std = X_decorrelated.std(axis=0, ddof=1)
    X_white = np.ascontiguousarray((X_decorrelated - col_mean) / col_std)

    if verify:
        tol = 1e-10
        cov_white = np.atleast_2d(np.cov(X_white.T, ddof=1))

        # Comparisons are written as `not (... <= tol)` so that NaN values
        # are treated as failures rather than silently passing.
        diagonal = np.diag(cov_white)
        if not np.all(np.abs(diagonal - 1.0) <= tol):
            raise AssertionError(
                f"Whitening failed: cov(X_white) diagonal not close to 1\n"
                f"Diagonal values: {diagonal}\n"
                f"Expected: [1, 1, ..., 1]"
            )

        # Zero out the diagonal so only true off-diagonal entries are tested.
        off_diagonal_max = np.max(np.abs(cov_white - np.diag(diagonal)))
        if not off_diagonal_max <= tol:
            raise AssertionError(
                f"Whitening failed: cov(X_white) off-diagonal elements too large\n"
                f"Maximum off-diagonal absolute value: {off_diagonal_max}\n"
                f"Expected: approximately 0 (tolerance 1e-10)"
            )

        mean_abs_max = np.max(np.abs(X_white.mean(axis=0)))
        if not mean_abs_max <= tol:
            raise AssertionError(
                f"Whitening failed: column means of X_white not close to 0\n"
                f"Maximum absolute column mean: {mean_abs_max}\n"
                f"Expected: approximately 0 (tolerance 1e-10)"
            )

    return X_white
162
+
163
+
164
def verify_whitening(X: np.ndarray, X_white: np.ndarray, atol: float = 1e-10) -> dict:
    """
    Compute diagnostic metrics for whitening quality.

    Reports how closely a whitened matrix matches the target properties
    (zero column means, unit column standard deviations, identity sample
    covariance). All comparisons use a purely absolute tolerance, so the
    boolean flags agree with ``max_cov_deviation <= atol``.

    Parameters
    ----------
    X : np.ndarray of shape (n, k)
        Original covariate matrix. Unused (kept for API consistency); it is
        never read, so ``None`` is accepted.
    X_white : np.ndarray of shape (n, k)
        Whitened covariate matrix to verify.
    atol : float, default=1e-10
        Absolute tolerance for numerical comparisons.

    Returns
    -------
    dict
        Verification metrics with keys:

        - **cov_is_identity** : bool
            True if covariance matrix equals identity within tolerance.
        - **mean_is_zero** : bool
            True if all column means are zero within tolerance.
        - **std_is_one** : bool
            True if all column standard deviations are one within tolerance.
        - **max_cov_deviation** : float
            Maximum absolute deviation of covariance from identity matrix.
        - **condition_number** : float
            Condition number of the whitened matrix, as returned by
            ``np.linalg.cond``.

    Raises
    ------
    ValueError
        If ``X_white`` is not a 2-D array.

    Examples
    --------
    >>> import numpy as np
    >>> np.random.seed(42)
    >>> X = np.random.randn(100, 3)
    >>> X_white = cholesky_whitening(X)
    >>> metrics = verify_whitening(X, X_white)
    >>> metrics['cov_is_identity']
    True
    >>> metrics['max_cov_deviation'] < 1e-10
    True
    """
    X_white = np.asarray(X_white, dtype=np.float64)
    if X_white.ndim != 2:
        raise ValueError(
            f"X_white must be a 2-D array of shape (n, k); got {X_white.ndim}-D input."
        )

    # Dimension comes from the matrix being verified, not from the unused X.
    k = X_white.shape[1]
    identity = np.eye(k)

    # np.cov returns a 0-d array for a single column; force (k, k).
    cov_white = np.atleast_2d(np.cov(X_white.T, ddof=1))
    mean_white = X_white.mean(axis=0)
    std_white = X_white.std(axis=0, ddof=1)

    # rtol=0 keeps the comparison purely absolute. With np.allclose's default
    # rtol=1e-5, entries compared against 1 would pass with errors up to
    # about 1e-5 regardless of atol.
    cov_is_identity = bool(np.allclose(cov_white, identity, rtol=0.0, atol=atol))
    mean_is_zero = bool(np.allclose(mean_white, 0.0, rtol=0.0, atol=atol))
    std_is_one = bool(np.allclose(std_white, 1.0, rtol=0.0, atol=atol))
    max_cov_deviation = float(np.max(np.abs(cov_white - identity)))

    condition_number = float(np.linalg.cond(X_white))

    return {
        'cov_is_identity': cov_is_identity,
        'mean_is_zero': mean_is_zero,
        'std_is_one': std_is_one,
        'max_cov_deviation': max_cov_deviation,
        'condition_number': condition_number,
    }