cbps 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cbps/__init__.py +3462 -0
- cbps/constants.py +46 -0
- cbps/core/__init__.py +93 -0
- cbps/core/cbps_binary.py +1943 -0
- cbps/core/cbps_continuous.py +945 -0
- cbps/core/cbps_multitreat.py +1123 -0
- cbps/core/cbps_optimal.py +507 -0
- cbps/core/results.py +1447 -0
- cbps/data/Blackwell.csv +571 -0
- cbps/data/LaLonde.csv +3213 -0
- cbps/data/npcbps_continuous_sim.csv +501 -0
- cbps/data/nsw.csv +723 -0
- cbps/data/nsw_dw.csv +446 -0
- cbps/data/political_ads_urban_niebler.csv +16266 -0
- cbps/data/psid_controls.csv +2491 -0
- cbps/data/psid_controls2.csv +254 -0
- cbps/data/psid_controls3.csv +129 -0
- cbps/data/simulation_dgp1_seed12345.csv +201 -0
- cbps/data/simulation_dgp2_seed12345.csv +201 -0
- cbps/data/simulation_dgp3_seed12345.csv +201 -0
- cbps/data/simulation_dgp4_seed12345.csv +201 -0
- cbps/datasets/__init__.py +78 -0
- cbps/datasets/blackwell.py +112 -0
- cbps/datasets/continuous.py +223 -0
- cbps/datasets/lalonde.py +272 -0
- cbps/datasets/npcbps_sim.py +101 -0
- cbps/diagnostics/__init__.py +101 -0
- cbps/diagnostics/balance.py +760 -0
- cbps/diagnostics/balance_cbmsm_addon.py +162 -0
- cbps/diagnostics/continuous_diagnostics.py +259 -0
- cbps/diagnostics/normality.py +173 -0
- cbps/diagnostics/ocbps_conditions.py +197 -0
- cbps/diagnostics/overlap.py +198 -0
- cbps/diagnostics/plots.py +1193 -0
- cbps/diagnostics/weights_diag.py +205 -0
- cbps/highdim/__init__.py +84 -0
- cbps/highdim/gmm_loss.py +340 -0
- cbps/highdim/hdcbps.py +1078 -0
- cbps/highdim/lasso_utils.py +498 -0
- cbps/highdim/weight_funcs.py +298 -0
- cbps/inference/__init__.py +42 -0
- cbps/inference/asyvar.py +621 -0
- cbps/inference/vcov_outcome.py +217 -0
- cbps/iv/__init__.py +48 -0
- cbps/iv/cbiv.py +2603 -0
- cbps/logging_config.py +45 -0
- cbps/msm/__init__.py +45 -0
- cbps/msm/cbmsm.py +1871 -0
- cbps/msm/rank_diagnostics.py +112 -0
- cbps/nonparametric/__init__.py +58 -0
- cbps/nonparametric/cholesky_whitening.py +232 -0
- cbps/nonparametric/empirical_likelihood.py +339 -0
- cbps/nonparametric/npcbps.py +1036 -0
- cbps/nonparametric/taylor_approx.py +207 -0
- cbps/py.typed +0 -0
- cbps/sklearn/__init__.py +42 -0
- cbps/sklearn/estimator.py +378 -0
- cbps/utils/__init__.py +82 -0
- cbps/utils/formula.py +415 -0
- cbps/utils/helpers.py +378 -0
- cbps/utils/numerics.py +438 -0
- cbps/utils/r_compat.py +109 -0
- cbps/utils/validation.py +224 -0
- cbps/utils/variance_transform.py +483 -0
- cbps/utils/weights.py +586 -0
- cbps-0.2.0.dist-info/METADATA +1090 -0
- cbps-0.2.0.dist-info/RECORD +70 -0
- cbps-0.2.0.dist-info/WHEEL +5 -0
- cbps-0.2.0.dist-info/licenses/LICENSE +661 -0
- cbps-0.2.0.dist-info/top_level.txt +1 -0
cbps/utils/__init__.py
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
"""
|
|
2
|
+
CBPS Utility Functions
|
|
3
|
+
|
|
4
|
+
This module provides shared components for formula parsing, weight computation,
|
|
5
|
+
and data preprocessing used across the CBPS package.
|
|
6
|
+
|
|
7
|
+
Submodules
|
|
8
|
+
----------
|
|
9
|
+
formula
|
|
10
|
+
Wilkinson-Rogers formula parsing using patsy with extensions for
|
|
11
|
+
treatment models and dual formula specifications.
|
|
12
|
+
|
|
13
|
+
weights
|
|
14
|
+
Inverse probability weight computation for ATE, ATT, and continuous
|
|
15
|
+
treatment estimands with group-wise standardization.
|
|
16
|
+
|
|
17
|
+
helpers
|
|
18
|
+
Data validation, missing value handling, and treatment encoding utilities.
|
|
19
|
+
|
|
20
|
+
numerics
|
|
21
|
+
Numerical linear algebra utilities including pseudoinverse computation.
|
|
22
|
+
|
|
23
|
+
validation
|
|
24
|
+
Centralized input validation with informative error messages.
|
|
25
|
+
|
|
26
|
+
Exported Functions
|
|
27
|
+
------------------
|
|
28
|
+
**Formula Parsing**:
|
|
29
|
+
|
|
30
|
+
- ``parse_formula`` - Parse treatment ~ covariates formulas
|
|
31
|
+
- ``parse_dual_formulas`` - Parse baseline and difference formulas
|
|
32
|
+
- ``parse_arrays`` - Construct design matrix from arrays
|
|
33
|
+
|
|
34
|
+
**Weight Computation**:
|
|
35
|
+
|
|
36
|
+
- ``WeightNormalizer`` - Unified weight normalization class
|
|
37
|
+
- ``compute_ate_weights`` - ATE inverse probability weights
|
|
38
|
+
- ``compute_att_weights`` - ATT inverse probability weights
|
|
39
|
+
- ``compute_continuous_weights`` - Continuous treatment weights
|
|
40
|
+
- ``standardize_weights`` - Group-normalized weights
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
**Data Utilities**:
|
|
44
|
+
|
|
45
|
+
- ``validate_arrays`` - Validate array dimensions and types
|
|
46
|
+
- ``handle_missing`` - Remove observations with missing values
|
|
47
|
+
- ``encode_treatment_factor`` - Convert categorical treatment to numeric
|
|
48
|
+
- ``normalize_sample_weights`` - Normalize weights to sum to n
|
|
49
|
+
"""
|
|
50
|
+
|
|
51
|
+
from cbps.utils.formula import parse_arrays, parse_dual_formulas, parse_formula
|
|
52
|
+
from cbps.utils.helpers import (
|
|
53
|
+
encode_treatment_factor,
|
|
54
|
+
handle_missing,
|
|
55
|
+
normalize_sample_weights,
|
|
56
|
+
validate_arrays,
|
|
57
|
+
)
|
|
58
|
+
from cbps.utils.weights import (
|
|
59
|
+
WeightNormalizer,
|
|
60
|
+
compute_ate_weights,
|
|
61
|
+
compute_att_weights,
|
|
62
|
+
compute_continuous_weights,
|
|
63
|
+
standardize_weights,
|
|
64
|
+
)
|
|
65
|
+
|
|
66
|
+
# Public API of ``cbps.utils``: the names re-exported above from the
# ``formula``, ``weights`` and ``helpers`` submodules. ``from cbps.utils
# import *`` exposes exactly these names.
__all__ = [
    # Formula parsing functions (from cbps.utils.formula)
    "parse_formula",
    "parse_dual_formulas",
    "parse_arrays",
    # Weight computation functions (from cbps.utils.weights)
    "WeightNormalizer",
    "compute_ate_weights",
    "compute_att_weights",
    "compute_continuous_weights",
    "standardize_weights",
    # Utility functions (from cbps.utils.helpers)
    "normalize_sample_weights",
    "validate_arrays",
    "handle_missing",
    "encode_treatment_factor",
]
|
cbps/utils/formula.py
ADDED
|
@@ -0,0 +1,415 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Formula Parsing Utilities
|
|
3
|
+
|
|
4
|
+
This module provides formula parsing functionality using patsy, with
|
|
5
|
+
extensions tailored for CBPS treatment models.
|
|
6
|
+
|
|
7
|
+
Supported formula interfaces:
|
|
8
|
+
|
|
9
|
+
- **Standard formulas**: Parse ``treatment ~ covariates`` specifications
|
|
10
|
+
into treatment vectors and design matrices
|
|
11
|
+
- **Dual formulas**: Parse separate baseline and difference formulas
|
|
12
|
+
for optimal CBPS estimation
|
|
13
|
+
- **Array interface**: Direct matrix input for programmatic use
|
|
14
|
+
|
|
15
|
+
The formula parser supports standard patsy syntax including:
|
|
16
|
+
|
|
17
|
+
- Categorical variables via ``C(variable)`` or ``factor(variable)``
|
|
18
|
+
- Interactions via ``:`` operator
|
|
19
|
+
- Polynomial terms via ``I()`` for as-is expressions
|
|
20
|
+
- Automatic intercept handling
|
|
21
|
+
|
|
22
|
+
Functions
|
|
23
|
+
---------
|
|
24
|
+
parse_formula
|
|
25
|
+
Parse treatment ~ covariates formula to arrays.
|
|
26
|
+
parse_dual_formulas
|
|
27
|
+
Parse baseline and difference formulas for optimal CBPS.
|
|
28
|
+
parse_arrays
|
|
29
|
+
Construct design matrix from array inputs.
|
|
30
|
+
|
|
31
|
+
References
|
|
32
|
+
----------
|
|
33
|
+
Imai, K. and Ratkovic, M. (2014). Covariate balancing propensity score.
|
|
34
|
+
Journal of the Royal Statistical Society, Series B 76(1), 243-263.
|
|
35
|
+
"""
|
|
36
|
+
|
|
37
|
+
from typing import Optional, Tuple, Union
|
|
38
|
+
import re
|
|
39
|
+
|
|
40
|
+
import numpy as np
|
|
41
|
+
import pandas as pd
|
|
42
|
+
from patsy import dmatrices, dmatrix
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _convert_r_formula_to_patsy(formula: str) -> str:
|
|
46
|
+
"""
|
|
47
|
+
Convert alternative formula syntax to patsy-compatible format.
|
|
48
|
+
|
|
49
|
+
Transforms ``factor(var)`` notation to patsy's ``C(var)`` notation
|
|
50
|
+
for categorical variable specification.
|
|
51
|
+
|
|
52
|
+
Parameters
|
|
53
|
+
----------
|
|
54
|
+
formula : str
|
|
55
|
+
Formula string potentially containing factor() syntax.
|
|
56
|
+
|
|
57
|
+
Returns
|
|
58
|
+
-------
|
|
59
|
+
str
|
|
60
|
+
Patsy-compatible formula string with C() notation.
|
|
61
|
+
|
|
62
|
+
Examples
|
|
63
|
+
--------
|
|
64
|
+
>>> _convert_r_formula_to_patsy('treat ~ x1 + factor(country)')
|
|
65
|
+
'treat ~ x1 + C(country)'
|
|
66
|
+
>>> _convert_r_formula_to_patsy('treat ~ factor(year) + x1')
|
|
67
|
+
'treat ~ C(year) + x1'
|
|
68
|
+
"""
|
|
69
|
+
# Replace factor(var) with C(var)
|
|
70
|
+
# Use regex to match factor(...) and replace with C(...)
|
|
71
|
+
converted = re.sub(r'\bfactor\s*\(', 'C(', formula)
|
|
72
|
+
return converted
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def _formula_column_order(rhs, col_names):
    """
    Compute the column permutation applied to a patsy design matrix.

    The order is: the ``Intercept`` column (if present), then columns whose
    name equals a plain right-hand-side term, then ``I(...)`` columns matched
    to the formula's ``I(...)`` terms, then every remaining column in the
    order patsy produced it.

    Parameters
    ----------
    rhs : str
        Right-hand side of the formula (text after ``~``).
    col_names : list of str
        Column names of the design matrix returned by patsy.

    Returns
    -------
    list of int
        Positions into ``col_names``; every position appears exactly once.
    """
    # Terms are obtained by a plain split on '+', so a term that itself
    # contains '+' (e.g. ``I(a + b)``) is not recognised here and its column
    # simply falls through to the "remaining columns" step.
    terms = [t.strip() for t in rhs.split('+')]
    plain_terms = [t for t in terms if not t.startswith('I(')]
    asis_terms = [t for t in terms if t.startswith('I(')]

    order = []
    placed = set()  # mirror of ``order`` for O(1) membership tests

    def _place(idx):
        order.append(idx)
        placed.add(idx)

    # 1. Intercept first.
    if 'Intercept' in col_names:
        _place(col_names.index('Intercept'))

    # 2. Plain terms, in formula order, by exact column-name match.
    #    Names carrying a patsy level suffix ("[T.") are never matched here.
    for term in plain_terms:
        if '[T.' in term:
            continue
        for idx, col in enumerate(col_names):
            if idx not in placed and col == term:
                _place(idx)
                break

    # 3. I() terms, in formula order. Formula text and patsy column names can
    #    differ in spacing/suffix ("I(re75==0)" vs "I(re75 == 0)[T.True]"),
    #    so they are matched on the first identifier inside ``I(``.
    first_name = re.compile(r'I\((\w+)')
    asis_cols = []
    for idx, col in enumerate(col_names):
        if col.startswith('I('):
            col_match = first_name.search(col)
            if col_match:
                asis_cols.append((idx, col_match.group(1)))
    for term in asis_terms:
        term_match = first_name.search(term)
        if term_match is None:
            continue
        wanted = term_match.group(1)
        for idx, key in asis_cols:
            if idx not in placed and key == wanted:
                _place(idx)
                break

    # 4. Everything else keeps patsy's relative order.
    order.extend(idx for idx in range(len(col_names)) if idx not in placed)
    return order


def parse_formula(
    formula: str,
    data: pd.DataFrame,
    return_type: str = 'dataframe',
    preserve_categorical: bool = False
) -> Tuple[Union[np.ndarray, pd.Series], np.ndarray]:
    """
    Parse a Wilkinson-Rogers formula into treatment vector and design matrix.

    Parameters
    ----------
    formula : str
        Formula specification in the form ``"treatment ~ covariates"``.
        Supports patsy syntax including:

        - Main effects: ``age + educ``
        - Interactions: ``age:educ``
        - Categorical: ``C(region)`` or ``factor(region)``
        - As-is expressions: ``I(re75==0)``
        - Remove intercept: ``-1``

    data : pd.DataFrame
        DataFrame containing all variables referenced in the formula.
        It is not modified; a copy is made when the treatment column has to
        be recoded.
    return_type : str, default='dataframe'
        Currently unused; arrays are always returned as numpy arrays.
    preserve_categorical : bool, default=False
        If True and the treatment column has a pandas categorical dtype,
        the treatment is returned as a categorical Series with that dtype
        instead of a float array. Used internally for multi-level
        treatment models.

    Returns
    -------
    y : np.ndarray or pd.Series
        Treatment vector with one entry per row of ``X``.
        A float64 array, unless ``preserve_categorical=True`` and the
        treatment column is categorical, in which case a categorical
        Series (original dtype, index of the retained rows) is returned.
    X : np.ndarray
        Design matrix, shape (n, k), dtype=float64.
        Includes intercept column by default (first column).

    Notes
    -----
    **Design matrix structure**:

    - Patsy adds an intercept column by default (suppress with ``-1``)
    - Columns are reordered to: intercept, plain terms in formula order,
      ``I()`` terms in formula order, then all remaining columns in the
      order patsy produced them

    **Row alignment**: ``y`` is always built from the rows patsy actually
    returned, so ``len(y) == X.shape[0]`` also holds on the
    ``preserve_categorical`` path when patsy drops incomplete rows.

    **Post-processing**: The caller is typically responsible for
    zero-variance column filtering if needed::

        std = X.std(axis=0, ddof=1)
        X = X[:, std > 0]

    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.DataFrame({
    ...     'treat': [0, 1, 0, 1],
    ...     'age': [25, 30, 35, 40],
    ...     'educ': [12, 16, 14, 18]
    ... })
    >>> y, X = parse_formula("treat ~ age + educ", df)
    >>> y.shape
    (4,)
    >>> X.shape  # (n, k) with intercept
    (4, 3)

    References
    ----------
    Imai, K. and Ratkovic, M. (2014). Covariate balancing propensity score.
    Journal of the Royal Statistical Society, Series B 76(1), 243-263.
    """
    # Step 0a: Convert factor() syntax to patsy C() notation
    formula = _convert_r_formula_to_patsy(formula)

    # Step 0b: Split once into LHS (treatment) and RHS (covariates)
    lhs, tilde, rhs = formula.partition('~')
    has_tilde = bool(tilde)
    treat_var_name = lhs.strip()

    is_categorical_treat = bool(
        has_tilde
        and preserve_categorical
        and treat_var_name in data.columns
        and isinstance(data[treat_var_name].dtype, pd.CategoricalDtype)
    )

    if is_categorical_treat:
        # Feed patsy the integer category codes (as floats) so it treats the
        # treatment as a single numeric column instead of one-hot encoding it.
        treat_dtype = data[treat_var_name].dtype
        data_temp = data.copy()
        data_temp[treat_var_name] = (
            data[treat_var_name].cat.codes.astype(np.float64)
        )
    else:
        treat_dtype = None
        data_temp = data

    # Step 1: Parse formula using patsy
    y_df, X_df = dmatrices(formula, data_temp, return_type='dataframe')

    # Step 2: Reorder columns (intercept, plain terms, I() terms, the rest)
    if has_tilde:
        order = _formula_column_order(rhs.strip(), X_df.columns.tolist())
        X_df = X_df.iloc[:, order]

    # Step 3: Design matrix as float64 (enforce double precision)
    X = X_df.values.astype(np.float64)

    # Step 4: Process treatment variable
    if is_categorical_treat:
        # Rebuild the categorical from the codes patsy returned rather than
        # handing back the untouched input column: patsy may have dropped
        # rows with missing values, and y must stay row-aligned with X.
        codes = y_df.to_numpy().ravel().astype(np.intp)
        y = pd.Series(
            pd.Categorical.from_codes(codes, dtype=treat_dtype),
            index=y_df.index,
            name=treat_var_name,
        )
    else:
        y = y_df.values.ravel().astype(np.float64)

    return y, X
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
def parse_dual_formulas(
    baseline_formula: Optional[str],
    diff_formula: Optional[str],
    data: pd.DataFrame
) -> Tuple[Optional[np.ndarray], Optional[np.ndarray]]:
    """
    Build the two design matrices used by optimal CBPS.

    Optimal CBPS takes one covariate specification for the baseline
    outcome model E[Y(0)|X] and another for the treatment-effect
    heterogeneity E[Y(1)-Y(0)|X]. Each right-hand-side formula is turned
    into a float64 matrix with patsy, and columns whose sample standard
    deviation (``ddof=1``) is not strictly positive are discarded.

    Parameters
    ----------
    baseline_formula : str or None
        Right-hand-side formula for baseline covariates, e.g., ``"~ age + educ"``.
    diff_formula : str or None
        Right-hand-side formula for treatment effect covariates, e.g., ``"~ age"``.
    data : pd.DataFrame
        DataFrame containing all referenced variables.

    Returns
    -------
    baselineX : np.ndarray or None
        Filtered design matrix for ``baseline_formula``, shape (n, k1).
    diffX : np.ndarray or None
        Filtered design matrix for ``diff_formula``, shape (n, k2).

    Both are ``None`` when both formulas are ``None``.

    Raises
    ------
    ValueError
        If exactly one of the two formulas is ``None``.

    Notes
    -----
    - The formulas carry no treatment variable (right-hand side only).
    - Constant columns have zero standard deviation, so they are removed
      by the variance filter.
    - The baseline formula is parsed before the difference formula.

    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.DataFrame({
    ...     'age': [25, 30, 35, 40],
    ...     'educ': [12, 16, 14, 18]
    ... })
    >>> baselineX, diffX = parse_dual_formulas("~ age + educ", "~ age", df)
    >>> baselineX.shape[1] >= diffX.shape[1]
    True

    References
    ----------
    Fan, J., Imai, K., Liu, H., Ning, Y., and Yang, X. (2021). Improving
    covariate balancing propensity score: A doubly robust and efficient
    approach. Working paper.
    """
    baseline_missing = baseline_formula is None
    diff_missing = diff_formula is None

    # Neither formula given: nothing to parse.
    if baseline_missing and diff_missing:
        return None, None

    # Exactly one given: the pair is unusable.
    if baseline_missing or diff_missing:
        raise ValueError(
            "Either baseline_formula or diff_formula not specified. "
            "Both must be specified to use CBPSOptimal. Otherwise, leave both None."
        )

    def _design_without_constants(rhs_formula):
        # Parse one RHS formula and keep only columns with positive spread.
        design = np.asarray(
            dmatrix(rhs_formula, data, return_type='matrix'),
            dtype=np.float64,
        )
        keep = design.std(axis=0, ddof=1) > 0
        return design[:, keep]

    baselineX = _design_without_constants(baseline_formula)
    diffX = _design_without_constants(diff_formula)
    return baselineX, diffX
|
|
350
|
+
|
|
351
|
+
|
|
352
|
+
def parse_arrays(
    treatment: Union[np.ndarray, pd.Series],
    covariates: Union[np.ndarray, pd.DataFrame],
    add_intercept: bool = True
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Turn raw treatment/covariate arrays into a ``(y, X)`` pair.

    This is the array-based counterpart of ``parse_formula()`` for callers
    that already hold their data as arrays: the treatment is flattened to
    a float64 vector and the covariates become a 2-D float64 matrix,
    optionally with a leading column of ones.

    Parameters
    ----------
    treatment : array-like
        Treatment variable, shape (n,) or (n, 1).
    covariates : array-like
        Covariate matrix, shape (n, k) or (n,) for single covariate.
    add_intercept : bool, default=True
        If True, prepend a column of ones to the covariate matrix.

    Returns
    -------
    y : np.ndarray
        Treatment vector, shape (n,), dtype=float64.
    X : np.ndarray
        Design matrix, dtype=float64, shape (n, k) without intercept or
        (n, k+1) with intercept.

    Examples
    --------
    >>> import numpy as np
    >>> treatment = np.array([0, 1, 0, 1])
    >>> covariates = np.array([[25, 12], [30, 16], [35, 14], [40, 18]])
    >>> y, X = parse_arrays(treatment, covariates, add_intercept=True)
    >>> X.shape
    (4, 3)
    >>> np.allclose(X[:, 0], 1.0)  # First column is intercept
    True

    References
    ----------
    Imai, K. and Ratkovic, M. (2014). Covariate balancing propensity score.
    Journal of the Royal Statistical Society, Series B 76(1), 243-263.
    """
    y = np.asarray(treatment, dtype=np.float64).ravel()
    X = np.asarray(covariates, dtype=np.float64)

    # A single covariate given as a flat vector becomes one column.
    if X.ndim == 1:
        X = X[:, np.newaxis]

    if not add_intercept:
        return y, X

    # The intercept column is sized from the treatment vector.
    ones = np.ones((len(y), 1), dtype=np.float64)
    return y, np.column_stack([ones, X])
|
|
415
|
+
|