cbps 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. cbps/__init__.py +3462 -0
  2. cbps/constants.py +46 -0
  3. cbps/core/__init__.py +93 -0
  4. cbps/core/cbps_binary.py +1943 -0
  5. cbps/core/cbps_continuous.py +945 -0
  6. cbps/core/cbps_multitreat.py +1123 -0
  7. cbps/core/cbps_optimal.py +507 -0
  8. cbps/core/results.py +1447 -0
  9. cbps/data/Blackwell.csv +571 -0
  10. cbps/data/LaLonde.csv +3213 -0
  11. cbps/data/npcbps_continuous_sim.csv +501 -0
  12. cbps/data/nsw.csv +723 -0
  13. cbps/data/nsw_dw.csv +446 -0
  14. cbps/data/political_ads_urban_niebler.csv +16266 -0
  15. cbps/data/psid_controls.csv +2491 -0
  16. cbps/data/psid_controls2.csv +254 -0
  17. cbps/data/psid_controls3.csv +129 -0
  18. cbps/data/simulation_dgp1_seed12345.csv +201 -0
  19. cbps/data/simulation_dgp2_seed12345.csv +201 -0
  20. cbps/data/simulation_dgp3_seed12345.csv +201 -0
  21. cbps/data/simulation_dgp4_seed12345.csv +201 -0
  22. cbps/datasets/__init__.py +78 -0
  23. cbps/datasets/blackwell.py +112 -0
  24. cbps/datasets/continuous.py +223 -0
  25. cbps/datasets/lalonde.py +272 -0
  26. cbps/datasets/npcbps_sim.py +101 -0
  27. cbps/diagnostics/__init__.py +101 -0
  28. cbps/diagnostics/balance.py +760 -0
  29. cbps/diagnostics/balance_cbmsm_addon.py +162 -0
  30. cbps/diagnostics/continuous_diagnostics.py +259 -0
  31. cbps/diagnostics/normality.py +173 -0
  32. cbps/diagnostics/ocbps_conditions.py +197 -0
  33. cbps/diagnostics/overlap.py +198 -0
  34. cbps/diagnostics/plots.py +1193 -0
  35. cbps/diagnostics/weights_diag.py +205 -0
  36. cbps/highdim/__init__.py +84 -0
  37. cbps/highdim/gmm_loss.py +340 -0
  38. cbps/highdim/hdcbps.py +1078 -0
  39. cbps/highdim/lasso_utils.py +498 -0
  40. cbps/highdim/weight_funcs.py +298 -0
  41. cbps/inference/__init__.py +42 -0
  42. cbps/inference/asyvar.py +621 -0
  43. cbps/inference/vcov_outcome.py +217 -0
  44. cbps/iv/__init__.py +48 -0
  45. cbps/iv/cbiv.py +2603 -0
  46. cbps/logging_config.py +45 -0
  47. cbps/msm/__init__.py +45 -0
  48. cbps/msm/cbmsm.py +1871 -0
  49. cbps/msm/rank_diagnostics.py +112 -0
  50. cbps/nonparametric/__init__.py +58 -0
  51. cbps/nonparametric/cholesky_whitening.py +232 -0
  52. cbps/nonparametric/empirical_likelihood.py +339 -0
  53. cbps/nonparametric/npcbps.py +1036 -0
  54. cbps/nonparametric/taylor_approx.py +207 -0
  55. cbps/py.typed +0 -0
  56. cbps/sklearn/__init__.py +42 -0
  57. cbps/sklearn/estimator.py +378 -0
  58. cbps/utils/__init__.py +82 -0
  59. cbps/utils/formula.py +415 -0
  60. cbps/utils/helpers.py +378 -0
  61. cbps/utils/numerics.py +438 -0
  62. cbps/utils/r_compat.py +109 -0
  63. cbps/utils/validation.py +224 -0
  64. cbps/utils/variance_transform.py +483 -0
  65. cbps/utils/weights.py +586 -0
  66. cbps-0.2.0.dist-info/METADATA +1090 -0
  67. cbps-0.2.0.dist-info/RECORD +70 -0
  68. cbps-0.2.0.dist-info/WHEEL +5 -0
  69. cbps-0.2.0.dist-info/licenses/LICENSE +661 -0
  70. cbps-0.2.0.dist-info/top_level.txt +1 -0
cbps/utils/__init__.py ADDED
@@ -0,0 +1,82 @@
1
+ """
2
+ CBPS Utility Functions
3
+
4
+ This module provides shared components for formula parsing, weight computation,
5
+ and data preprocessing used across the CBPS package.
6
+
7
+ Submodules
8
+ ----------
9
+ formula
10
+ Wilkinson-Rogers formula parsing using patsy with extensions for
11
+ treatment models and dual formula specifications.
12
+
13
+ weights
14
+ Inverse probability weight computation for ATE, ATT, and continuous
15
+ treatment estimands with group-wise standardization.
16
+
17
+ helpers
18
+ Data validation, missing value handling, and treatment encoding utilities.
19
+
20
+ numerics
21
+ Numerical linear algebra utilities including pseudoinverse computation.
22
+
23
+ validation
24
+ Centralized input validation with informative error messages.
25
+
26
+ Exported Functions
27
+ ------------------
28
+ **Formula Parsing**:
29
+
30
+ - ``parse_formula`` - Parse treatment ~ covariates formulas
31
+ - ``parse_dual_formulas`` - Parse baseline and difference formulas
32
+ - ``parse_arrays`` - Construct design matrix from arrays
33
+
34
+ **Weight Computation**:
35
+
36
+ - ``WeightNormalizer`` - Unified weight normalization class
37
+ - ``compute_ate_weights`` - ATE inverse probability weights
38
+ - ``compute_att_weights`` - ATT inverse probability weights
39
+ - ``compute_continuous_weights`` - Continuous treatment weights
40
+ - ``standardize_weights`` - Group-normalized weights
41
+
42
+
43
+ **Data Utilities**:
44
+
45
+ - ``validate_arrays`` - Validate array dimensions and types
46
+ - ``handle_missing`` - Remove observations with missing values
47
+ - ``encode_treatment_factor`` - Convert categorical treatment to numeric
48
+ - ``normalize_sample_weights`` - Normalize weights to sum to n
49
+ """
50
+
51
+ from cbps.utils.formula import parse_arrays, parse_dual_formulas, parse_formula
52
+ from cbps.utils.helpers import (
53
+ encode_treatment_factor,
54
+ handle_missing,
55
+ normalize_sample_weights,
56
+ validate_arrays,
57
+ )
58
+ from cbps.utils.weights import (
59
+ WeightNormalizer,
60
+ compute_ate_weights,
61
+ compute_att_weights,
62
+ compute_continuous_weights,
63
+ standardize_weights,
64
+ )
65
+
66
# Public API of ``cbps.utils``. Every name listed here is imported at the top
# of this module, so ``from cbps.utils import *`` exposes exactly this set.
__all__ = [
    # -- formula / array front ends (cbps.utils.formula) --
    "parse_formula",
    "parse_dual_formulas",
    "parse_arrays",
    # -- weight construction and normalization (cbps.utils.weights) --
    "WeightNormalizer",
    "compute_ate_weights",
    "compute_att_weights",
    "compute_continuous_weights",
    "standardize_weights",
    # -- data preparation helpers (cbps.utils.helpers) --
    "normalize_sample_weights",
    "validate_arrays",
    "handle_missing",
    "encode_treatment_factor",
]
cbps/utils/formula.py ADDED
@@ -0,0 +1,415 @@
1
+ """
2
+ Formula Parsing Utilities
3
+
4
+ This module provides formula parsing functionality using patsy, with
5
+ extensions tailored for CBPS treatment models.
6
+
7
+ Supported formula interfaces:
8
+
9
+ - **Standard formulas**: Parse ``treatment ~ covariates`` specifications
10
+ into treatment vectors and design matrices
11
+ - **Dual formulas**: Parse separate baseline and difference formulas
12
+ for optimal CBPS estimation
13
+ - **Array interface**: Direct matrix input for programmatic use
14
+
15
+ The formula parser supports standard patsy syntax including:
16
+
17
+ - Categorical variables via ``C(variable)`` or ``factor(variable)``
18
+ - Interactions via ``:`` operator
19
+ - Polynomial terms via ``I()`` for as-is expressions
20
+ - Automatic intercept handling
21
+
22
+ Functions
23
+ ---------
24
+ parse_formula
25
+ Parse treatment ~ covariates formula to arrays.
26
+ parse_dual_formulas
27
+ Parse baseline and difference formulas for optimal CBPS.
28
+ parse_arrays
29
+ Construct design matrix from array inputs.
30
+
31
+ References
32
+ ----------
33
+ Imai, K. and Ratkovic, M. (2014). Covariate balancing propensity score.
34
+ Journal of the Royal Statistical Society, Series B 76(1), 243-263.
35
+ """
36
+
37
+ from typing import Optional, Tuple, Union
38
+ import re
39
+
40
+ import numpy as np
41
+ import pandas as pd
42
+ from patsy import dmatrices, dmatrix
43
+
44
+
45
def _convert_r_formula_to_patsy(formula: str) -> str:
    """
    Convert R-style categorical syntax to patsy-compatible format.

    Rewrites ``factor(var)`` and ``as.factor(var)`` calls to patsy's
    ``C(var)`` notation for categorical variable specification. Only the
    opening ``factor(`` / ``as.factor(`` token is replaced; the argument
    list and closing parenthesis are left untouched.

    Parameters
    ----------
    formula : str
        Formula string potentially containing ``factor()`` or
        ``as.factor()`` syntax.

    Returns
    -------
    str
        Formula string with those calls rewritten to ``C()``.

    Notes
    -----
    The match is anchored on a word boundary, so identifiers that merely
    end in ``factor`` (for example ``my_factor(x)`` or ``refactor(x)``)
    are not rewritten. Whitespace between the name and the opening
    parenthesis is consumed by the replacement.

    Examples
    --------
    >>> _convert_r_formula_to_patsy('treat ~ x1 + factor(country)')
    'treat ~ x1 + C(country)'
    >>> _convert_r_formula_to_patsy('treat ~ factor(year) + x1')
    'treat ~ C(year) + x1'
    >>> _convert_r_formula_to_patsy('treat ~ as.factor(year) + x1')
    'treat ~ C(year) + x1'
    """
    # The optional ``as.`` prefix must be consumed together with ``factor(``;
    # replacing only ``factor(`` would leave ``as.C(``, which is not valid
    # Python (``as`` is a keyword) and therefore not a valid patsy expression.
    return re.sub(r'\b(?:as\.)?factor\s*\(', 'C(', formula)
73
+
74
+
75
def _formula_column_order(formula: str, col_names: list) -> list:
    """Return positions of ``col_names`` arranged to follow ``formula``.

    The right-hand side of ``formula`` is split on ``+`` and the resulting
    terms are used to build the following ordering:

    1. the ``Intercept`` column, if present;
    2. columns whose name is exactly equal to a non-``I()`` term, in the
       order the terms appear in the formula;
    3. ``I(...)`` columns, matched to ``I(...)`` terms by the first
       identifier inside the parentheses, in formula order;
    4. every remaining column, in its existing relative order.

    The ``+`` split is purely textual, so a term that itself contains ``+``
    (for example ``I(a + b)``) or a trailing ``- 1`` is not recognised by
    steps 2-3; its columns are simply placed by step 4.
    """
    rhs = formula.split('~')[1].strip()
    terms = [t.strip() for t in rhs.split('+')]
    simple_terms = [t for t in terms if not t.startswith('I(')]
    asis_terms = [t for t in terms if t.startswith('I(')]

    order = []
    used = set()  # mirrors ``order`` for O(1) membership checks

    def take(position):
        order.append(position)
        used.add(position)

    # 1. Intercept first.
    if 'Intercept' in col_names:
        take(col_names.index('Intercept'))

    # 2. Plain terms, matched by exact column name.
    for term in simple_terms:
        for i, col in enumerate(col_names):
            if i not in used and col == term and '[T.' not in col:
                take(i)
                break

    # 3. I() terms: ``I(re75==0)`` has to match a column such as
    #    ``I(re75 == 0)[T.True]``, so compare only the leading identifier.
    for term in asis_terms:
        term_match = re.search(r'I\((\w+)', term)
        if term_match is None:
            continue
        wanted = term_match.group(1)
        for i, col in enumerate(col_names):
            if i in used or not col.startswith('I('):
                continue
            col_match = re.search(r'I\((\w+)', col)
            if col_match and col_match.group(1) == wanted:
                take(i)
                break

    # 4. Whatever was not claimed above keeps its existing relative order.
    order.extend(i for i in range(len(col_names)) if i not in used)
    return order


def parse_formula(
    formula: str,
    data: pd.DataFrame,
    return_type: str = 'dataframe',
    preserve_categorical: bool = False
) -> Tuple[Union[np.ndarray, pd.Series], np.ndarray]:
    """
    Parse a Wilkinson-Rogers formula into treatment vector and design matrix.

    Parameters
    ----------
    formula : str
        Formula specification in the form ``"treatment ~ covariates"``.
        Supports patsy syntax including:

        - Main effects: ``age + educ``
        - Interactions: ``age:educ``
        - Categorical: ``C(region)`` or ``factor(region)``
        - As-is expressions: ``I(re75==0)``
        - Remove intercept: ``-1``

    data : pd.DataFrame
        DataFrame containing all variables referenced in the formula.
        It is never modified; a copy is made when the treatment column has
        to be temporarily recoded.
    return_type : str, default='dataframe'
        Currently unused; arrays are always returned as numpy arrays.
    preserve_categorical : bool, default=False
        If True and the treatment column has a pandas Categorical dtype,
        return the treatment as the original Categorical Series instead of
        a float array. Used internally for multi-level treatment models.

    Returns
    -------
    y : np.ndarray or pd.Series
        Treatment vector, shape (n,).
        Returns float64 array unless preserve_categorical=True and
        the original treatment is Categorical, in which case the
        Categorical Series restricted to the rows kept in ``X`` is returned.
    X : np.ndarray
        Design matrix, shape (n, k), dtype=float64.
        Includes intercept column by default (first column).

    Notes
    -----
    **Design matrix structure**:

    - Patsy adds an intercept column by default (suppress with ``-1``)
    - Columns produced by patsy are reordered as: intercept first, then
      columns whose name exactly equals a ``+``-separated right-hand-side
      term (in formula order), then ``I()`` columns (in formula order),
      then all remaining columns in the order patsy produced them

    **Missing values**: when the design matrix has fewer rows than
    ``data`` (patsy dropped incomplete rows), a preserved Categorical
    treatment is subset to the same rows so that ``y`` and ``X`` stay
    aligned. Missing categories themselves are encoded as code ``-1``
    during parsing and therefore do not cause a row to be dropped.

    **Post-processing**: The caller is typically responsible for
    zero-variance column filtering if needed::

        std = X.std(axis=0, ddof=1)
        X = X[:, std > 0]

    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.DataFrame({
    ...     'treat': [0, 1, 0, 1],
    ...     'age': [25, 30, 35, 40],
    ...     'educ': [12, 16, 14, 18]
    ... })
    >>> y, X = parse_formula("treat ~ age + educ", df)
    >>> y.shape
    (4,)
    >>> X.shape  # (n, k) with intercept
    (4, 3)

    References
    ----------
    Imai, K. and Ratkovic, M. (2014). Covariate balancing propensity score.
    Journal of the Royal Statistical Society, Series B 76(1), 243-263.
    """
    # Step 0a: Convert factor() syntax to patsy C() notation.
    formula = _convert_r_formula_to_patsy(formula)
    has_tilde = '~' in formula

    # Step 0b: Detect a Categorical treatment that must be preserved.
    original_treat = None
    data_temp = data
    if has_tilde:
        treat_var_name = formula.split('~')[0].strip()
        is_categorical_treat = (
            preserve_categorical and
            treat_var_name in data.columns and
            isinstance(data[treat_var_name].dtype, pd.CategoricalDtype)
        )
        if is_categorical_treat:
            original_treat = data[treat_var_name].copy()
            # Work on a copy with a positional index so that the index of
            # the parsed design matrix identifies the retained rows even
            # when the caller's index is non-unique or non-integer.
            data_temp = data.copy()
            data_temp.index = pd.RangeIndex(len(data_temp))
            # Feed patsy numeric codes so it does not one-hot encode the
            # treatment. ``to_numpy`` avoids index alignment on assignment.
            data_temp[treat_var_name] = (
                original_treat.cat.codes.astype(np.float64).to_numpy()
            )

    # Step 1: Parse formula using patsy.
    y_df, X_df = dmatrices(formula, data_temp, return_type='dataframe')

    # Step 2: Reorder columns to follow the formula (see Notes).
    if has_tilde:
        order = _formula_column_order(formula, X_df.columns.tolist())
        X_reordered = X_df.iloc[:, order]
    else:
        X_reordered = X_df

    # Step 3: Convert X to a float64 numpy array (enforce double precision).
    X = X_reordered.values.astype(np.float64)

    # Step 4: Process treatment variable.
    if original_treat is not None:
        # Return the original Categorical Series (preserve factor semantics).
        y = original_treat
        if len(X_df) != len(original_treat):
            # Rows were dropped while building the design matrix; keep only
            # the matching treatment rows so y and X have equal length.
            y = original_treat.iloc[X_df.index.to_numpy()]
    else:
        y = y_df.values.ravel().astype(np.float64)

    return y, X
258
+
259
+
260
def _rhs_design_matrix(formula: str, data: pd.DataFrame) -> np.ndarray:
    """Build a float64 design matrix from a right-hand-side formula and
    drop every column whose sample standard deviation is not positive."""
    # Accept factor(var) the same way parse_formula() does.
    formula = _convert_r_formula_to_patsy(formula)
    matrix = np.asarray(
        dmatrix(formula, data, return_type='matrix'), dtype=np.float64
    )
    # Filter zero-variance columns (threshold > 0); this also removes a
    # constant intercept column.
    column_std = matrix.std(axis=0, ddof=1)
    return matrix[:, column_std > 0]


def parse_dual_formulas(
    baseline_formula: Optional[str],
    diff_formula: Optional[str],
    data: pd.DataFrame
) -> Tuple[Optional[np.ndarray], Optional[np.ndarray]]:
    """
    Parse baseline and difference formulas for optimal CBPS.

    Optimal CBPS uses separate covariate specifications for the baseline
    outcome model E[Y(0)|X] and the treatment effect heterogeneity
    E[Y(1)-Y(0)|X]. This function parses both formulas and returns
    their design matrices.

    Parameters
    ----------
    baseline_formula : str or None
        Right-hand-side formula for baseline covariates, e.g., ``"~ age + educ"``.
        ``factor(var)`` is accepted and treated as ``C(var)``.
    diff_formula : str or None
        Right-hand-side formula for treatment effect covariates, e.g., ``"~ age"``.
        ``factor(var)`` is accepted and treated as ``C(var)``.
    data : pd.DataFrame
        DataFrame containing all referenced variables.

    Returns
    -------
    baselineX : np.ndarray or None
        Design matrix for baseline formula, shape (n, k1), dtype=float64.
        Zero-variance columns are automatically removed.
    diffX : np.ndarray or None
        Design matrix for difference formula, shape (n, k2), dtype=float64.
        Zero-variance columns are automatically removed.

    Raises
    ------
    ValueError
        If exactly one formula is None (both must be specified together
        or both must be None).

    Notes
    -----
    Unlike ``parse_formula()``, this function:

    - Takes right-hand-side only formulas (no treatment variable)
    - Automatically filters zero-variance columns (sample standard
      deviation, ``ddof=1``, must be strictly positive)
    - Returns None for both outputs if both inputs are None

    The two formulas are parsed independently of each other.

    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.DataFrame({
    ...     'age': [25, 30, 35, 40],
    ...     'educ': [12, 16, 14, 18]
    ... })
    >>> baselineX, diffX = parse_dual_formulas("~ age + educ", "~ age", df)
    >>> baselineX.shape[1] >= diffX.shape[1]
    True

    References
    ----------
    Fan, J., Imai, K., Liu, H., Ning, Y., and Yang, X. (2021). Improving
    covariate balancing propensity score: A doubly robust and efficient
    approach. Working paper.
    """
    # Step 1: XOR check - both must be specified or both None.
    if (baseline_formula is None) != (diff_formula is None):
        raise ValueError(
            "Either baseline_formula or diff_formula not specified. "
            "Both must be specified to use CBPSOptimal. Otherwise, leave both None."
        )

    # Step 2: After the XOR check the two are either both None or both set,
    # so testing one of them is sufficient.
    if baseline_formula is None:
        return None, None

    # Steps 3-4: Parse each formula and drop its zero-variance columns.
    baselineX = _rhs_design_matrix(baseline_formula, data)
    diffX = _rhs_design_matrix(diff_formula, data)

    return baselineX, diffX
350
+
351
+
352
def parse_arrays(
    treatment: Union[np.ndarray, pd.Series],
    covariates: Union[np.ndarray, pd.DataFrame],
    add_intercept: bool = True
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Construct treatment vector and design matrix from array inputs.

    Provides a programmatic interface for CBPS when data is already
    available as arrays rather than in a DataFrame with formula specification.

    Parameters
    ----------
    treatment : array-like
        Treatment variable, shape (n,) or (n, 1). It is flattened to 1-D.
    covariates : array-like
        Covariate matrix, shape (n, k) or (n,) for single covariate.
    add_intercept : bool, default=True
        If True, prepend a column of ones to the covariate matrix.

    Returns
    -------
    y : np.ndarray
        Treatment vector, shape (n,), dtype=float64.
    X : np.ndarray
        Design matrix, shape (n, k) or (n, k+1) with intercept.
        dtype=float64.

    Raises
    ------
    ValueError
        If ``covariates`` has more than two dimensions, or if the number
        of covariate rows differs from the number of treatment values.
        The check is applied regardless of ``add_intercept`` so that
        mismatched inputs are never returned silently.

    Notes
    -----
    This function produces output compatible with ``parse_formula()``,
    enabling consistent downstream processing regardless of input method.

    Examples
    --------
    >>> import numpy as np
    >>> treatment = np.array([0, 1, 0, 1])
    >>> covariates = np.array([[25, 12], [30, 16], [35, 14], [40, 18]])
    >>> y, X = parse_arrays(treatment, covariates, add_intercept=True)
    >>> X.shape
    (4, 3)
    >>> np.allclose(X[:, 0], 1.0)  # First column is intercept
    True

    References
    ----------
    Imai, K. and Ratkovic, M. (2014). Covariate balancing propensity score.
    Journal of the Royal Statistical Society, Series B 76(1), 243-263.
    """
    # Convert to float64 numpy arrays; the treatment is flattened to 1-D.
    y = np.asarray(treatment, dtype=np.float64).ravel()
    X = np.asarray(covariates, dtype=np.float64)

    # Ensure X is 2-dimensional: a single covariate becomes one column.
    if X.ndim < 2:
        X = X.reshape(-1, 1)
    elif X.ndim > 2:
        raise ValueError(
            f"covariates must be 1- or 2-dimensional, got {X.ndim} dimensions."
        )

    # Without this check a mismatch would only surface (as an opaque numpy
    # stacking error) when add_intercept=True and pass silently otherwise.
    if X.shape[0] != y.shape[0]:
        raise ValueError(
            f"treatment has {y.shape[0]} observations but covariates "
            f"has {X.shape[0]} rows."
        )

    # Add intercept column if requested.
    if add_intercept:
        intercept = np.ones((y.shape[0], 1), dtype=np.float64)
        X = np.column_stack([intercept, X])

    return y, X
415
+