cbps 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. cbps/__init__.py +3462 -0
  2. cbps/constants.py +46 -0
  3. cbps/core/__init__.py +93 -0
  4. cbps/core/cbps_binary.py +1943 -0
  5. cbps/core/cbps_continuous.py +945 -0
  6. cbps/core/cbps_multitreat.py +1123 -0
  7. cbps/core/cbps_optimal.py +507 -0
  8. cbps/core/results.py +1447 -0
  9. cbps/data/Blackwell.csv +571 -0
  10. cbps/data/LaLonde.csv +3213 -0
  11. cbps/data/npcbps_continuous_sim.csv +501 -0
  12. cbps/data/nsw.csv +723 -0
  13. cbps/data/nsw_dw.csv +446 -0
  14. cbps/data/political_ads_urban_niebler.csv +16266 -0
  15. cbps/data/psid_controls.csv +2491 -0
  16. cbps/data/psid_controls2.csv +254 -0
  17. cbps/data/psid_controls3.csv +129 -0
  18. cbps/data/simulation_dgp1_seed12345.csv +201 -0
  19. cbps/data/simulation_dgp2_seed12345.csv +201 -0
  20. cbps/data/simulation_dgp3_seed12345.csv +201 -0
  21. cbps/data/simulation_dgp4_seed12345.csv +201 -0
  22. cbps/datasets/__init__.py +78 -0
  23. cbps/datasets/blackwell.py +112 -0
  24. cbps/datasets/continuous.py +223 -0
  25. cbps/datasets/lalonde.py +272 -0
  26. cbps/datasets/npcbps_sim.py +101 -0
  27. cbps/diagnostics/__init__.py +101 -0
  28. cbps/diagnostics/balance.py +760 -0
  29. cbps/diagnostics/balance_cbmsm_addon.py +162 -0
  30. cbps/diagnostics/continuous_diagnostics.py +259 -0
  31. cbps/diagnostics/normality.py +173 -0
  32. cbps/diagnostics/ocbps_conditions.py +197 -0
  33. cbps/diagnostics/overlap.py +198 -0
  34. cbps/diagnostics/plots.py +1193 -0
  35. cbps/diagnostics/weights_diag.py +205 -0
  36. cbps/highdim/__init__.py +84 -0
  37. cbps/highdim/gmm_loss.py +340 -0
  38. cbps/highdim/hdcbps.py +1078 -0
  39. cbps/highdim/lasso_utils.py +498 -0
  40. cbps/highdim/weight_funcs.py +298 -0
  41. cbps/inference/__init__.py +42 -0
  42. cbps/inference/asyvar.py +621 -0
  43. cbps/inference/vcov_outcome.py +217 -0
  44. cbps/iv/__init__.py +48 -0
  45. cbps/iv/cbiv.py +2603 -0
  46. cbps/logging_config.py +45 -0
  47. cbps/msm/__init__.py +45 -0
  48. cbps/msm/cbmsm.py +1871 -0
  49. cbps/msm/rank_diagnostics.py +112 -0
  50. cbps/nonparametric/__init__.py +58 -0
  51. cbps/nonparametric/cholesky_whitening.py +232 -0
  52. cbps/nonparametric/empirical_likelihood.py +339 -0
  53. cbps/nonparametric/npcbps.py +1036 -0
  54. cbps/nonparametric/taylor_approx.py +207 -0
  55. cbps/py.typed +0 -0
  56. cbps/sklearn/__init__.py +42 -0
  57. cbps/sklearn/estimator.py +378 -0
  58. cbps/utils/__init__.py +82 -0
  59. cbps/utils/formula.py +415 -0
  60. cbps/utils/helpers.py +378 -0
  61. cbps/utils/numerics.py +438 -0
  62. cbps/utils/r_compat.py +109 -0
  63. cbps/utils/validation.py +224 -0
  64. cbps/utils/variance_transform.py +483 -0
  65. cbps/utils/weights.py +586 -0
  66. cbps-0.2.0.dist-info/METADATA +1090 -0
  67. cbps-0.2.0.dist-info/RECORD +70 -0
  68. cbps-0.2.0.dist-info/WHEEL +5 -0
  69. cbps-0.2.0.dist-info/licenses/LICENSE +661 -0
  70. cbps-0.2.0.dist-info/top_level.txt +1 -0
cbps/utils/helpers.py ADDED
@@ -0,0 +1,378 @@
1
+ """
2
+ Data Preprocessing and Validation Utilities
3
+
4
+ This module provides helper functions for data preprocessing tasks
5
+ commonly needed before CBPS estimation, including sample weight
6
+ normalization, input validation, missing value handling, and
7
+ treatment variable encoding.
8
+
9
+ Functions
10
+ ---------
11
+ normalize_sample_weights
12
+ Normalize sampling weights to sum to the sample size.
13
+ validate_arrays
14
+ Validate treatment and covariate array dimensions and types.
15
+ handle_missing
16
+ Remove observations with missing values.
17
+ encode_treatment_factor
18
+ Convert categorical treatment to numeric encoding.
19
+
20
+ References
21
+ ----------
22
+ Imai, K. and Ratkovic, M. (2014). Covariate balancing propensity score.
23
+ Journal of the Royal Statistical Society, Series B 76(1), 243-263.
24
+ """
25
+
26
+ import warnings
27
+ from typing import Optional, Tuple, Union
28
+
29
+ import numpy as np
30
+ import pandas as pd
31
+
32
+
33
def normalize_sample_weights(
    sample_weights: Optional[np.ndarray],
    n: int
) -> np.ndarray:
    """
    Normalize sampling weights to sum to the sample size.

    Applies the transformation ``sw = sw / mean(sw)``. Because the mean is
    taken over the supplied weights, the result sums to ``n`` only when
    exactly ``n`` weights are supplied; this is validated up front.

    Parameters
    ----------
    sample_weights : np.ndarray or None
        Original sampling weights, shape (n,).
        If None, returns uniform weights (all ones).
    n : int
        Number of observations (target sum).

    Returns
    -------
    np.ndarray
        Normalized weights satisfying ``sum(weights) = n``, dtype=float64.

    Raises
    ------
    ValueError
        If the number of weights differs from ``n``, any weight is NaN or
        infinite, all weights are zero, any weights are negative, or
        normalization produces unexpected results.

    Warns
    -----
    UserWarning
        If any weights are exactly zero (valid but noteworthy).

    Notes
    -----
    Zero weights are permitted for trimmed or survey designs, but a
    warning is issued since those observations are effectively excluded.

    Examples
    --------
    >>> import numpy as np
    >>> sw = np.array([0.5, 1.0, 1.5, 2.0])
    >>> sw_norm = normalize_sample_weights(sw, n=4)
    >>> bool(np.isclose(sw_norm.sum(), 4.0))
    True

    References
    ----------
    Imai, K. and Ratkovic, M. (2014). Covariate balancing propensity score.
    Journal of the Royal Statistical Society, Series B 76(1), 243-263.
    """
    # Step 1: No weights supplied -> every observation counts equally.
    if sample_weights is None:
        return np.ones(n, dtype=np.float64)

    # Step 2: Convert to float64 array
    sw = np.asarray(sample_weights, dtype=np.float64)

    # Step 3: A wrong number of weights is a caller error. Report it as such
    # instead of letting it surface later as an "internal error" (or, for
    # large n, slip through the relative tolerance of the final check).
    if sw.size != n:
        raise ValueError(
            f"sample_weights has {sw.size} elements but n={n} "
            f"observations were expected."
        )

    # Step 4: NaN/inf would propagate through the mean and poison every
    # normalized weight, so reject them explicitly.
    non_finite = ~np.isfinite(sw)
    if non_finite.any():
        raise ValueError(
            f"sample_weights must be finite. "
            f"Found {int(non_finite.sum())} NaN or infinite values."
        )

    # Step 5: At least one weight must be strictly positive, otherwise the
    # mean is zero (or negative) and normalization is meaningless.
    if not np.any(sw > 0):  # All weights <= 0
        if np.all(sw == 0):
            raise ValueError(
                f"sample_weights cannot be all zeros. "
                f"Received weights with sum={sw.sum():.1f} for n={n} observations."
            )
        raise ValueError(
            f"sample_weights must contain at least one positive value. "
            f"Received weights with sum={sw.sum():.6f} (all non-positive)."
        )

    # Step 6: Negative weights are not allowed
    negative = sw < 0
    if negative.any():
        raise ValueError(
            f"sample_weights must be non-negative (>= 0). "
            f"Found {negative.sum()} negative weights."
        )

    # Step 7: Zero weights are valid (survey designs, trimmed weights) but
    # worth flagging because those rows drop out of the analysis.
    zero = sw == 0
    if zero.any():
        n_zeros = zero.sum()
        warnings.warn(
            f"sample_weights contains {n_zeros} zero values. "
            f"These observations will be effectively excluded from the analysis. "
            f"This is valid for trimmed weights or survey designs, but verify this is intentional.",
            UserWarning,
            stacklevel=3
        )

    # Step 8: Normalize by dividing by mean
    sw = sw / sw.mean()

    # Step 9: Sanity check on the result. np.isclose also applies its default
    # relative tolerance, which absorbs floating-point rounding for large n.
    sum_sw = sw.sum()
    if not np.isclose(sum_sw, n, atol=1e-10):
        raise ValueError(
            f"Internal error in weight normalization: "
            f"normalized weights sum to {sum_sw:.10f} instead of {n}. "
            f"Difference: {abs(sum_sw - n):.2e}. This should not happen."
        )

    return sw
137
+
138
+
139
def validate_arrays(
    treat: np.ndarray,
    X: np.ndarray,
    check_rank: bool = True
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Validate and standardize treatment and covariate arrays.

    Performs type conversion, dimension checking, missing value detection,
    and optional rank verification for the design matrix.

    Parameters
    ----------
    treat : np.ndarray
        Treatment vector, shape (n,). Any array-like convertible to
        float64 is accepted.
    X : np.ndarray
        Covariate matrix, shape (n, k). Any array-like convertible to
        float64 is accepted.
    check_rank : bool, default=True
        If True, verify X is 2-dimensional and has full column rank.

    Returns
    -------
    treat : np.ndarray
        Validated treatment vector, dtype=float64.
    X : np.ndarray
        Validated covariate matrix, dtype=float64.

    Raises
    ------
    ValueError
        If an input is a scalar, dimensions mismatch, arrays contain NaN,
        or (when check_rank=True) X is not 2-dimensional or is
        rank-deficient.

    Notes
    -----
    Full column rank is required for the GMM optimization to have a
    unique solution. Rank deficiency typically indicates collinear
    covariates that should be removed.

    Examples
    --------
    >>> import numpy as np
    >>> treat = np.array([1, 0, 1, 0])
    >>> X = np.array([[1, 25], [1, 30], [1, 35], [1, 40]])
    >>> treat_v, X_v = validate_arrays(treat, X)
    >>> treat_v.dtype
    dtype('float64')

    References
    ----------
    Imai, K. and Ratkovic, M. (2014). Covariate balancing propensity score.
    Journal of the Royal Statistical Society, Series B 76(1), 243-263.
    """
    # Type conversion first (enforce float64). Doing this before touching
    # ``.shape`` lets plain lists and other array-likes through instead of
    # failing with AttributeError.
    treat = np.asarray(treat, dtype=np.float64)
    X = np.asarray(X, dtype=np.float64)

    # Scalars have no first axis to compare.
    if treat.ndim == 0 or X.ndim == 0:
        raise ValueError("treat and X must be array-like, not scalars")

    # Dimension check
    n_treat, n_rows = treat.shape[0], X.shape[0]
    if n_treat != n_rows:
        raise ValueError(
            f"treat length {n_treat} != X rows {n_rows}"
        )

    # Missing value check
    if np.isnan(treat).any() or np.isnan(X).any():
        raise ValueError(
            "Arrays contain NaN values. Use handle_missing() first."
        )

    # Full rank check
    if check_rank:
        # The column count is only defined for a 2-D design matrix; fail
        # with a clear message rather than an IndexError on X.shape[1].
        if X.ndim != 2:
            raise ValueError(
                f"X must be 2-dimensional (n, k) for the rank check; "
                f"got ndim={X.ndim}"
            )
        rank = np.linalg.matrix_rank(X)
        if rank < X.shape[1]:
            raise ValueError(
                f"X is not full rank: rank={rank} < ncol={X.shape[1]}"
            )

    return treat, X
217
+
218
+
219
def handle_missing(
    data: pd.DataFrame,
    relevant_cols: Optional[list] = None
) -> Tuple[pd.DataFrame, int]:
    """
    Remove observations with missing values.

    Performs listwise deletion of rows containing NA/NaN in the
    specified columns, with a warning indicating how many rows
    were removed. The input DataFrame is not modified.

    Parameters
    ----------
    data : pd.DataFrame
        Input DataFrame.
    relevant_cols : list of str, optional
        Columns to check for missing values.
        If None, checks all columns.

    Returns
    -------
    data_clean : pd.DataFrame
        DataFrame with missing-value rows removed.
        Original index is preserved.
    n_dropped : int
        Number of rows removed.

    Raises
    ------
    KeyError
        Raised by ``DataFrame.dropna`` when ``relevant_cols`` names a
        column that is not present in ``data``.

    Warns
    -----
    UserWarning
        If any rows were dropped, indicates the count. The warning is
        attributed to the caller of this function.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> df = pd.DataFrame({
    ...     'treat': [1, 0, np.nan, 1],
    ...     'age': [25, 30, 35, np.nan]
    ... })
    >>> df_clean, n_drop = handle_missing(df)
    >>> len(df_clean)
    2
    >>> n_drop
    2

    References
    ----------
    Imai, K. and Ratkovic, M. (2014). Covariate balancing propensity score.
    Journal of the Royal Statistical Society, Series B 76(1), 243-263.
    """
    if relevant_cols is None:
        relevant_cols = data.columns.tolist()

    # Drop rows containing NA (returns a new frame; index is preserved)
    data_clean = data.dropna(subset=relevant_cols, inplace=False)
    n_dropped = len(data) - len(data_clean)

    # Warning message (matches na.omit behavior). stacklevel=2 makes the
    # warning point at the calling code instead of this helper.
    if n_dropped > 0:
        warnings.warn(
            f"Removed {n_dropped} observations with missing values",
            UserWarning,
            stacklevel=2
        )

    return data_clean, n_dropped
285
+
286
+
287
def encode_treatment_factor(
    treat: Union[pd.Series, np.ndarray],
    att: int,
    verbose: int = 1
) -> Tuple[np.ndarray, list, np.ndarray]:
    """
    Encode categorical treatment variable to binary 0/1.

    Converts a two-level categorical treatment to numeric encoding,
    with the encoding direction controlled by the ATT parameter.

    Parameters
    ----------
    treat : pd.Series, pd.Categorical or np.ndarray
        Categorical treatment variable with exactly 2 observed levels
        and no missing values.
    att : int
        Target estimand controlling encoding:

        - 0: ATE - second level becomes 1
        - 1: ATT - second level as treated group
        - 2: ATT - first level as treated group (inverts encoding)

    verbose : int, default=1
        If > 0, print information about which level is treated.

    Returns
    -------
    treat_numeric : np.ndarray
        Binary treatment vector (0/1), dtype=float64.
    levels : list
        The two factor levels, in the order used for encoding.
    treat_orig : np.ndarray
        Original treatment values (for reference).

    Raises
    ------
    ValueError
        If ``treat`` contains missing values or does not have exactly
        two observed levels.

    Notes
    -----
    Level ordering:

    - For categorical input (a ``pd.Categorical`` or a Series with
      categorical dtype) the category order is used, after dropping
      categories that never occur in the data.
    - Otherwise levels are sorted, so ``['control', 'treatment']`` becomes
      ``[0, 1]`` with 'treatment' = 1.
    - ATT=2 inverts this, making the first level = 1.

    Examples
    --------
    >>> import pandas as pd
    >>> treat = pd.Categorical(['control', 'treatment', 'control', 'treatment'])
    >>> treat_num, levels, _ = encode_treatment_factor(treat, att=1, verbose=0)
    >>> [float(x) for x in treat_num]
    [0.0, 1.0, 0.0, 1.0]
    >>> levels
    ['control', 'treatment']

    References
    ----------
    Imai, K. and Ratkovic, M. (2014). Covariate balancing propensity score.
    Journal of the Royal Statistical Society, Series B 76(1), 243-263.
    """
    # Save original values
    treat_orig = np.asarray(treat).copy()

    # Missing values would otherwise be silently encoded as 0 (control).
    if pd.isna(treat_orig).any():
        raise ValueError(
            "treat contains missing values. Use handle_missing() first."
        )

    # Extract levels. Categorical inputs carry their own ordering; unused
    # categories are dropped so that levels[1] is a value that actually
    # occurs in the data.
    if isinstance(treat, pd.Categorical):
        levels = treat.remove_unused_categories().categories.tolist()
    elif isinstance(treat, pd.Series):
        if isinstance(treat.dtype, pd.CategoricalDtype):
            levels = treat.cat.remove_unused_categories().cat.categories.tolist()
        else:
            levels = sorted(treat.unique())
    else:
        levels = sorted(np.unique(treat_orig))

    # A binary encoding is only meaningful for exactly two levels; with
    # more, everything except levels[1] would silently collapse to 0.
    if len(levels) != 2:
        raise ValueError(
            f"treat must have exactly 2 levels, found {len(levels)}: {levels}"
        )

    # Default encoding: second level becomes 1, first level becomes 0
    treat_numeric = (treat_orig == levels[1]).astype(np.float64)

    # ATT=2: Invert treatment assignment
    if att == 2:
        treat_numeric = 1.0 - treat_numeric

    # Print ATT information (controlled by verbose parameter)
    if verbose > 0:
        if att == 1:
            print(
                f"Finding ATT with T={levels[1]} as the treatment. "
                f"Set ATT=2 to find ATT with T={levels[0]} as the treatment"
            )
        elif att == 2:
            print(
                f"Finding ATT with T={levels[0]} as the treatment. "
                f"Set ATT=1 to find ATT with T={levels[1]} as the treatment"
            )
        # ATT=0: No message (ATE scenario)

    return treat_numeric, levels, treat_orig
378
+