pyleebounds 0.1.0__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pyleebounds-0.1.0 → pyleebounds-0.2.0}/PKG-INFO +10 -12
- {pyleebounds-0.1.0 → pyleebounds-0.2.0}/README.md +9 -9
- pyleebounds-0.2.0/pyleebounds/lee_bounds.py +329 -0
- {pyleebounds-0.1.0 → pyleebounds-0.2.0}/pyleebounds.egg-info/PKG-INFO +10 -12
- {pyleebounds-0.1.0 → pyleebounds-0.2.0}/pyleebounds.egg-info/requires.txt +0 -2
- {pyleebounds-0.1.0 → pyleebounds-0.2.0}/setup.py +1 -1
- pyleebounds-0.1.0/pyleebounds/lee_bounds.py +0 -291
- {pyleebounds-0.1.0 → pyleebounds-0.2.0}/LICENSE +0 -0
- {pyleebounds-0.1.0 → pyleebounds-0.2.0}/pyleebounds/__init__.py +0 -0
- {pyleebounds-0.1.0 → pyleebounds-0.2.0}/pyleebounds.egg-info/SOURCES.txt +0 -0
- {pyleebounds-0.1.0 → pyleebounds-0.2.0}/pyleebounds.egg-info/dependency_links.txt +0 -0
- {pyleebounds-0.1.0 → pyleebounds-0.2.0}/pyleebounds.egg-info/not-zip-safe +0 -0
- {pyleebounds-0.1.0 → pyleebounds-0.2.0}/pyleebounds.egg-info/top_level.txt +0 -0
- {pyleebounds-0.1.0 → pyleebounds-0.2.0}/setup.cfg +0 -0
{pyleebounds-0.1.0 → pyleebounds-0.2.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pyleebounds
-Version: 0.1.0
+Version: 0.2.0
 Summary: Python package for Lee 2009 treatment effect bounds under sample selection
 Home-page: https://github.com/vyasenov/pyleebounds
 Author: Vasco Yasenov
@@ -19,8 +19,6 @@ Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: numpy>=1.20.0
 Requires-Dist: pandas>=1.3.0
-Requires-Dist: matplotlib>=3.3.0
-Requires-Dist: seaborn>=0.11.0
 Requires-Dist: scipy>=1.7.0
 Provides-Extra: dev
 Requires-Dist: pytest>=6.0; extra == "dev"
@@ -39,9 +37,10 @@ Dynamic: requires-dist
 Dynamic: requires-python
 Dynamic: summary
 
-
 # pyleebounds
 
+[badge image]
+
 A Python package for estimating treatment effect bounds under sample selection, based on the method of Lee (2009). This approach is especially useful when selection into the observed sample (e.g., post-treatment employment) differs by treatment status and may introduce bias in outcome comparisons.
 
 ## Installation
@@ -57,7 +56,7 @@ pip install pyleebounds
 * Sharp nonparametric bounds on treatment effects under endogenous sample selection
 * Automatically handles non-random attrition or truncation (e.g. only observing outcomes for employed individuals)
 * Bootstrap confidence intervals
-* Seamless integration with
+* Seamless integration with `pandas`
 
 ## Quick Start
 
@@ -93,19 +92,18 @@ df = pd.DataFrame({
 })
 
 # Initialize and fit Lee bounds estimator
-
+# Use fewer bootstrap replications for faster execution in this example
+lb = LeeBounds(n_bootstrap=20, ci_level=0.95)
 results = lb.fit(df, outcome='Y', treatment='D', selection='S')
 
-# View summary
-print(
-
-# Plot estimated bounds
-results.plot()
+# View comprehensive summary
+print(lb.summary())
 ```
 
+
 ## Examples
 
-You can find detailed usage examples in the
+You can find detailed usage examples in the `examples/` directory.
 
 ## Background
 
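The dropped matplotlib and seaborn requirements match the removal of the plotting code path in this release (the old README's `results.plot()` call and the old module's matplotlib import, both deleted below). As a quick sanity check after upgrading, the standard library can list the installed distribution's requirements; this is a sketch, not part of the package:

```python
from importlib.metadata import requires

# Prints the Requires-Dist entries of the installed pyleebounds distribution.
# For 0.2.0 the list should no longer include matplotlib or seaborn.
print(requires("pyleebounds"))
```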
{pyleebounds-0.1.0 → pyleebounds-0.2.0}/README.md

@@ -1,6 +1,7 @@
-
 # pyleebounds
 
+[badge image]
+
 A Python package for estimating treatment effect bounds under sample selection, based on the method of Lee (2009). This approach is especially useful when selection into the observed sample (e.g., post-treatment employment) differs by treatment status and may introduce bias in outcome comparisons.
 
 ## Installation
@@ -16,7 +17,7 @@ pip install pyleebounds
 * Sharp nonparametric bounds on treatment effects under endogenous sample selection
 * Automatically handles non-random attrition or truncation (e.g. only observing outcomes for employed individuals)
 * Bootstrap confidence intervals
-* Seamless integration with
+* Seamless integration with `pandas`
 
 ## Quick Start
 
@@ -52,19 +53,18 @@ df = pd.DataFrame({
 })
 
 # Initialize and fit Lee bounds estimator
-
+# Use fewer bootstrap replications for faster execution in this example
+lb = LeeBounds(n_bootstrap=20, ci_level=0.95)
 results = lb.fit(df, outcome='Y', treatment='D', selection='S')
 
-# View summary
-print(
-
-# Plot estimated bounds
-results.plot()
+# View comprehensive summary
+print(lb.summary())
 ```
 
+
 ## Examples
 
-You can find detailed usage examples in the
+You can find detailed usage examples in the `examples/` directory.
 
 ## Background
 
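Putting the updated Quick Start together as a runnable script, here is a minimal sketch. It assumes `LeeBounds` is importable from the package root, and the data-generating step is made up for illustration (the README's actual `df` construction is not shown in this diff):

```python
import numpy as np
import pandas as pd
from pyleebounds import LeeBounds  # assumed top-level export

# Made-up data: treatment raises selection (e.g., employment) and outcomes
rng = np.random.default_rng(42)
n = 500
D = rng.integers(0, 2, size=n)                                # treatment indicator
S = (rng.random(n) < np.where(D == 1, 0.8, 0.6)).astype(int)  # selection indicator
Y = 1.0 + 0.5 * D + rng.normal(0, 1, size=n)                  # outcome, meaningful when S == 1

df = pd.DataFrame({'Y': Y, 'D': D, 'S': S})

# Fewer bootstrap replications for a faster run, as in the README example
lb = LeeBounds(n_bootstrap=20, ci_level=0.95)
lb.fit(df, outcome='Y', treatment='D', selection='S')
print(lb.summary())
```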
pyleebounds-0.2.0/pyleebounds/lee_bounds.py (new file)

@@ -0,0 +1,329 @@
+"""
+Implementation of Lee (2009) treatment effect bounds under sample selection.
+"""
+
+import numpy as np
+import pandas as pd
+from typing import Tuple, Dict, Any
+
+class LeeBounds:
+    """
+    Lee (2009) treatment effect bounds estimator.
+
+    Implements the method from Lee (2009) for estimating sharp bounds on treatment
+    effects when selection into the post-treatment sample is endogenous.
+
+    Parameters
+    ----------
+    None
+
+    References
+    ----------
+    Lee, D. S. (2009). Training, wages, and sample selection: Estimating sharp
+    bounds on treatment effects. The Review of Economic Studies, 76(3), 1071-1102.
+    """
+
+    def __init__(self, n_bootstrap: int = 100, ci_level: float = 0.95):
+        """
+        Initialize Lee bounds estimator.
+
+        Parameters
+        ----------
+        n_bootstrap : int, default=100
+            Number of bootstrap samples for confidence intervals
+        ci_level : float, default=0.95
+            Confidence level for bootstrap confidence intervals
+        """
+        self.n_bootstrap = n_bootstrap
+        self.ci_level = ci_level
+
+    def _compute_bounds(self, Y: np.ndarray, D: np.ndarray, S: np.ndarray,
+                        p1: float, p0: float) -> Tuple[float, float]:
+        """
+        Compute Lee treatment effect bounds using trimming approach.
+
+        Parameters
+        ----------
+        Y : np.ndarray; Outcome values
+        D : np.ndarray; Treatment indicators
+        S : np.ndarray; Selection indicators
+        p1 : float; Selection rate in treated group
+        p0 : float; Selection rate in control group
+
+        Returns
+        -------
+        Tuple[float, float]
+            (lower_bound, upper_bound)
+        """
+        # Control group mean (among selected)
+        control_mean = np.mean(Y[(D == 0) & (S == 1)])
+
+        # Treated group (among selected)
+        treated_selected = Y[(D == 1) & (S == 1)]
+
+        # For Lee bounds, we trim the treated group to match control selection rate
+        # Lower bound: trim from top (keep lowest outcomes)
+        # Upper bound: trim from bottom (keep highest outcomes)
+        if p1 > p0:
+            # Trim proportion
+            trim_prop = (p1 - p0) / p1
+            n_trim = int(len(treated_selected) * trim_prop)
+
+            if n_trim >= len(treated_selected):
+                # Handle case where we'd trim everything
+                raise ValueError("Trim proportion too large - would remove all observations")
+
+            sorted_treated = np.sort(treated_selected)
+            # Lower bound: keep bottom (1 - trim_prop) of observations (trimmed from top)
+            lower_trimmed_mean = np.mean(sorted_treated[:-n_trim])
+            # Upper bound: keep top (1 - trim_prop) of observations
+            upper_trimmed_mean = np.mean(sorted_treated[n_trim:])
+        else:
+            lower_trimmed_mean = upper_trimmed_mean = np.mean(treated_selected)
+
+        lower_bound = lower_trimmed_mean - control_mean
+        upper_bound = upper_trimmed_mean - control_mean
+
+        return lower_bound, upper_bound
+
+    def _validate_data(self, Y: np.ndarray, D: np.ndarray, S: np.ndarray) -> None:
+        """
+        Validate that variables have correct types and values.
+
+        Parameters
+        ----------
+        Y : np.ndarray
+            Outcome variable
+        D : np.ndarray
+            Treatment variable
+        S : np.ndarray
+            Selection variable
+
+        Raises
+        ------
+        ValueError
+            If validation fails
+        """
+        # Check that D is binary with values 0 and 1
+        unique_d = np.unique(D)
+        if not np.array_equal(unique_d, np.array([0, 1])):
+            raise ValueError(f"Treatment variable D must be binary (0, 1). Found values: {unique_d}")
+
+        # Check that S is binary with values 0 and 1
+        unique_s = np.unique(S)
+        if not np.array_equal(unique_s, np.array([0, 1])):
+            raise ValueError(f"Selection variable S must be binary (0, 1). Found values: {unique_s}")
+
+        # Check that Y is continuous (not all integers)
+        if len(np.unique(Y)) < len(Y) * 0.1:  # If less than 10% unique values, likely discrete
+            raise ValueError("Outcome variable Y should be continuous. Consider if this is appropriate.")
+
+        # Check for reasonable sample sizes
+        if len(Y) < 10:
+            raise ValueError("Sample size too small. Need at least 10 observations.")
+
+        # Check that we have both treatment groups
+        if np.sum(D == 0) == 0:
+            raise ValueError("No control observations (D=0) found.")
+        if np.sum(D == 1) == 0:
+            raise ValueError("No treated observations (D=1) found.")
+
+        # Check that we have selected observations in both groups
+        if np.sum((D == 0) & (S == 1)) == 0:
+            raise ValueError("No selected control observations (D=0, S=1) found.")
+        if np.sum((D == 1) & (S == 1)) == 0:
+            raise ValueError("No selected treated observations (D=1, S=1) found.")
+
+    def _bootstrap(self, data: pd.DataFrame, outcome: str, treatment: str,
+                   selection: str) -> Dict[str, Any]:
+        """
+        Compute bootstrap confidence intervals for the bounds.
+
+        Parameters
+        ----------
+        data : pd.DataFrame; Input data
+        outcome : str; Outcome variable name
+        treatment : str; Treatment variable name
+        selection : str; Selection variable name
+
+        Returns
+        -------
+        Dict[str, Any]
+            Dictionary containing bootstrap results
+        """
+        lower_bounds = []
+        upper_bounds = []
+
+        for _ in range(self.n_bootstrap):
+            # Bootstrap sample
+            boot_idx = np.random.choice(len(data), size=len(data), replace=True)
+            boot_data = data.iloc[boot_idx].reset_index(drop=True)
+
+            try:
+                # Extract data from bootstrap sample
+                Y = boot_data[outcome].values
+                D = boot_data[treatment].values
+                S = boot_data[selection].values
+
+                # Remove missing values
+                valid_mask = ~(np.isnan(Y) | np.isnan(D) | np.isnan(S))
+                Y, D, S = Y[valid_mask], D[valid_mask], S[valid_mask]
+
+                # Validate data types and values
+                self._validate_data(Y, D, S)
+
+                # Calculate selection rates
+                p1 = np.mean(S[D == 1])
+                p0 = np.mean(S[D == 0])
+
+                if p1 <= p0:
+                    continue  # Skip this bootstrap sample
+
+                # Calculate bounds directly
+                lower_bound, upper_bound = self._compute_bounds(Y, D, S, p1, p0)
+                lower_bounds.append(lower_bound)
+                upper_bounds.append(upper_bound)
+
+            except Exception:
+                # Skip if bootstrap sample fails
+                continue
+
+        # Calculate confidence intervals and standard errors
+        if len(lower_bounds) > 0:
+            alpha = 1 - self.ci_level
+            lower_ci = np.percentile(lower_bounds, [alpha/2*100, (1-alpha/2)*100])
+            upper_ci = np.percentile(upper_bounds, [alpha/2*100, (1-alpha/2)*100])
+
+            # Calculate bootstrap standard errors
+            lower_bound_se = np.std(lower_bounds, ddof=1)
+            upper_bound_se = np.std(upper_bounds, ddof=1)
+        else:
+            # If no successful bootstrap samples, use point estimates
+            lower_ci = upper_ci = np.array([np.nan, np.nan])
+            lower_bound_se = upper_bound_se = np.nan
+
+        return {
+            'lower_bound_ci': lower_ci,
+            'upper_bound_ci': upper_ci,
+            'lower_bound_se': lower_bound_se,
+            'upper_bound_se': upper_bound_se,
+            'lower_bounds': lower_bounds,
+            'upper_bounds': upper_bounds,
+            'ci_level': self.ci_level
+        }
+
+    def fit(self, data: pd.DataFrame, outcome: str, treatment: str,
+            selection: str) -> 'LeeBounds':
+        """
+        Fit Lee bounds estimator to the data.
+
+        Parameters
+        ----------
+        data : pd.DataFrame; Input data containing outcome, treatment, and selection variables
+        outcome : str; Name of the outcome variable column
+        treatment : str; Name of the treatment indicator column (0=control, 1=treated)
+        selection : str; Name of the selection indicator column (0=missing, 1=observed)
+
+        Returns
+        -------
+        LeeBounds
+            Self with fitted results
+        """
+        # Validate inputs
+        required_cols = [outcome, treatment, selection]
+        missing_cols = [col for col in required_cols if col not in data.columns]
+        if missing_cols:
+            raise ValueError(f"Missing columns: {missing_cols}")
+
+        # Extract data
+        Y = data[outcome].values
+        D = data[treatment].values
+        S = data[selection].values
+
+        # Remove missing values
+        valid_mask = ~(np.isnan(Y) | np.isnan(D) | np.isnan(S))
+        Y, D, S = Y[valid_mask], D[valid_mask], S[valid_mask]
+
+        # Validate data types and values
+        self._validate_data(Y, D, S)
+
+        # Calculate selection rates
+        p1 = np.mean(S[D == 1])  # Selection rate in treated group
+        p0 = np.mean(S[D == 0])  # Selection rate in control group
+
+        if p1 <= p0:
+            raise ValueError("Selection rate in treated group must be greater than control group")
+
+        # Calculate bounds
+        lower_bound, upper_bound = self._compute_bounds(Y, D, S, p1, p0)
+
+        # Store results directly in self
+        self.lower_bound = lower_bound
+        self.upper_bound = upper_bound
+        self.p1 = p1
+        self.p0 = p0
+        self.trim_proportion = p1 - p0
+        self.n_treated = np.sum(D == 1)
+        self.n_control = np.sum(D == 0)
+        self.n_treated_selected = np.sum((D == 1) & (S == 1))
+        self.n_control_selected = np.sum((D == 0) & (S == 1))
+
+        # Compute bootstrap confidence intervals and standard errors
+        bootstrap_results = self._bootstrap(data, outcome, treatment, selection)
+        self.lower_bound_ci = bootstrap_results['lower_bound_ci']
+        self.upper_bound_ci = bootstrap_results['upper_bound_ci']
+        self.lower_bound_se = bootstrap_results['lower_bound_se']
+        self.upper_bound_se = bootstrap_results['upper_bound_se']
+        self.lower_bounds_bootstrap = bootstrap_results['lower_bounds']
+        self.upper_bounds_bootstrap = bootstrap_results['upper_bounds']
+
+        return self
+
+    def summary(self) -> str:
+        """
+        Return a summary of the results.
+
+        Returns
+        -------
+        str
+            Formatted summary string
+        """
+        if not hasattr(self, 'lower_bound'):
+            return "No results available. Please fit the model first."
+
+        # Format confidence intervals and standard errors
+        if not np.isnan(self.lower_bound_ci[0]):
+            lower_ci_str = f"[{self.lower_bound_ci[0]:.4f}, {self.lower_bound_ci[1]:.4f}]"
+            upper_ci_str = f"[{self.upper_bound_ci[0]:.4f}, {self.upper_bound_ci[1]:.4f}]"
+            lower_se_str = f"{self.lower_bound_se:.4f}"
+            upper_se_str = f"{self.upper_bound_se:.4f}"
+        else:
+            lower_ci_str = "Not computed"
+            upper_ci_str = "Not computed"
+            lower_se_str = "Not computed"
+            upper_se_str = "Not computed"
+
+        summary = f"""
+Lee (2009) Treatment Effect Bounds
+=================================
+
+Treatment Effect Bounds:
+- Lower bound: {self.lower_bound:.4f}
+- Upper bound: {self.upper_bound:.4f}
+- Bound width: {self.upper_bound - self.lower_bound:.4f}
+
+Bootstrap Confidence Intervals ({int(self.ci_level*100)}%):
+- Lower bound CI: {lower_ci_str}
+- Upper bound CI: {upper_ci_str}
+
+Bootstrap Standard Errors:
+- Lower bound SE: {lower_se_str}
+- Upper bound SE: {upper_se_str}
+
+Sample Information:
+- Treated observations: {self.n_treated} (selected: {self.n_treated_selected})
+- Control observations: {self.n_control} (selected: {self.n_control_selected})
+- Selection rates: p₁ = {self.p1:.3f}, p₀ = {self.p0:.3f}
+- Trim proportion: {self.trim_proportion:.3f}
+"""
+        return summary
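To make the trimming rule in `_compute_bounds` concrete: if 80% of treated units but only 60% of control units are selected, the selected treated sample carries an excess share of (0.8 - 0.6) / 0.8 = 25%, and the bounds come from dropping that share from the top or bottom of the treated outcome distribution before comparing means. A standalone numeric sketch of the same arithmetic, with made-up values:

```python
import numpy as np

# Hypothetical selection rates and selected treated outcomes
p1, p0 = 0.8, 0.6
trim_prop = (p1 - p0) / p1                        # 0.25: share of treated outcomes to trim
treated_selected = np.array([0.1, 0.4, 0.9, 1.3, 1.8, 2.2, 2.9, 3.5])
n_trim = int(len(treated_selected) * trim_prop)   # 2 of 8 observations

s = np.sort(treated_selected)
lower_trimmed_mean = s[:-n_trim].mean()           # drop the 2 largest values
upper_trimmed_mean = s[n_trim:].mean()            # drop the 2 smallest values

control_mean = 1.0                                # hypothetical mean of selected controls
print(lower_trimmed_mean - control_mean)          # lower bound, about 0.117
print(upper_trimmed_mean - control_mean)          # upper bound, 1.1
```

Dropping the highest treated outcomes gives the most pessimistic comparison against the control mean, and dropping the lowest gives the most optimistic one; the interval between the two is the Lee bound.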
{pyleebounds-0.1.0 → pyleebounds-0.2.0}/pyleebounds.egg-info/PKG-INFO

The egg-info copy of PKG-INFO carries the same changes as the PKG-INFO diff above: the version bump to 0.2.0, the removal of the matplotlib and seaborn requirements, and the updated README text.
pyleebounds-0.1.0/pyleebounds/lee_bounds.py (removed)

@@ -1,291 +0,0 @@
-"""
-Implementation of Lee (2009) treatment effect bounds under sample selection.
-"""
-
-import numpy as np
-import pandas as pd
-from typing import Tuple, Dict, Any
-import matplotlib.pyplot as plt
-
-
-class LeeBounds:
-    """
-    Lee (2009) treatment effect bounds estimator.
-
-    Implements the method from Lee (2009) for estimating sharp bounds on treatment
-    effects when selection into the post-treatment sample is endogenous.
-
-    Parameters
-    ----------
-    None
-
-    References
-    ----------
-    Lee, D. S. (2009). Training, wages, and sample selection: Estimating sharp
-    bounds on treatment effects. The Review of Economic Studies, 76(3), 1071-1102.
-    """
-
-    def __init__(self, trim_method: str = 'quantile'):
-        self.trim_method = trim_method
-        self.fitted = False
-        self.results = None
-
-    def fit(self, data: pd.DataFrame, outcome: str, treatment: str,
-            selection: str) -> 'LeeBoundsResults':
-        """
-        Fit Lee bounds estimator to the data.
-
-        Parameters
-        ----------
-        data : pd.DataFrame
-            Input data containing outcome, treatment, and selection variables
-        outcome : str
-            Name of the outcome variable column
-        treatment : str
-            Name of the treatment indicator column (0=control, 1=treated)
-        selection : str
-            Name of the selection indicator column (0=missing, 1=observed)
-
-        Returns
-        -------
-        LeeBoundsResults
-            Results object containing bounds and summary statistics
-        """
-        # Validate inputs
-        required_cols = [outcome, treatment, selection]
-        missing_cols = [col for col in required_cols if col not in data.columns]
-        if missing_cols:
-            raise ValueError(f"Missing columns: {missing_cols}")
-
-        # Extract data
-        Y = data[outcome].values
-        D = data[treatment].values
-        S = data[selection].values
-
-        # Remove missing values
-        valid_mask = ~(np.isnan(Y) | np.isnan(D) | np.isnan(S))
-        Y, D, S = Y[valid_mask], D[valid_mask], S[valid_mask]
-
-        # Validate data types and values
-        self._validate_variables(Y, D, S)
-
-        # Calculate selection rates
-        p1 = np.mean(S[D == 1])  # Selection rate in treated group
-        p0 = np.mean(S[D == 0])  # Selection rate in control group
-
-        if p1 <= p0:
-            raise ValueError("Selection rate in treated group must be greater than control group")
-
-        # Calculate bounds
-        lower_bound, upper_bound = self._compute_bounds(Y, D, S, p1, p0)
-
-        # Store results
-        self.results = LeeBoundsResults(
-            lower_bound=lower_bound,
-            upper_bound=upper_bound,
-            p1=p1,
-            p0=p0,
-            trim_proportion=p1 - p0,
-            n_treated=np.sum(D == 1),
-            n_control=np.sum(D == 0),
-            n_treated_selected=np.sum((D == 1) & (S == 1)),
-            n_control_selected=np.sum((D == 0) & (S == 1))
-        )
-
-        self.fitted = True
-        return self.results
-
-    def _compute_bounds(self, Y: np.ndarray, D: np.ndarray, S: np.ndarray,
-                        p1: float, p0: float) -> Tuple[float, float]:
-        """
-        Compute Lee bounds using trimming approach.
-
-        Parameters
-        ----------
-        Y : np.ndarray
-            Outcome values
-        D : np.ndarray
-            Treatment indicators
-        S : np.ndarray
-            Selection indicators
-        p1 : float
-            Selection rate in treated group
-        p0 : float
-            Selection rate in control group
-
-        Returns
-        -------
-        Tuple[float, float]
-            (lower_bound, upper_bound)
-        """
-        # Control group mean (among selected)
-        control_mean = np.mean(Y[(D == 0) & (S == 1)])
-
-        # Treated group (among selected)
-        treated_selected = Y[(D == 1) & (S == 1)]
-
-        # Trim proportion
-        trim_prop = p1 - p0
-
-        # For Lee bounds, we trim the treated group to match control selection rate
-        # Lower bound: trim from top (keep lowest outcomes)
-        # Upper bound: trim from bottom (keep highest outcomes)
-        n_trim = int(len(treated_selected) * trim_prop)
-
-        if n_trim > 0:
-            sorted_treated = np.sort(treated_selected)
-            # Lower bound: keep bottom (1 - trim_prop) of observations
-            lower_trimmed_mean = np.mean(sorted_treated[:-n_trim])
-            # Upper bound: keep top (1 - trim_prop) of observations
-            upper_trimmed_mean = np.mean(sorted_treated[n_trim:])
-        else:
-            lower_trimmed_mean = upper_trimmed_mean = np.mean(treated_selected)
-
-        lower_bound = lower_trimmed_mean - control_mean
-        upper_bound = upper_trimmed_mean - control_mean
-
-        return lower_bound, upper_bound
-
-    def _validate_variables(self, Y: np.ndarray, D: np.ndarray, S: np.ndarray) -> None:
-        """
-        Validate that variables have correct types and values.
-
-        Parameters
-        ----------
-        Y : np.ndarray
-            Outcome variable
-        D : np.ndarray
-            Treatment variable
-        S : np.ndarray
-            Selection variable
-
-        Raises
-        ------
-        ValueError
-            If validation fails
-        """
-        # Check that D is binary with values 0 and 1
-        unique_d = np.unique(D)
-        if not np.array_equal(unique_d, np.array([0, 1])):
-            raise ValueError(f"Treatment variable D must be binary (0, 1). Found values: {unique_d}")
-
-        # Check that S is binary with values 0 and 1
-        unique_s = np.unique(S)
-        if not np.array_equal(unique_s, np.array([0, 1])):
-            raise ValueError(f"Selection variable S must be binary (0, 1). Found values: {unique_s}")
-
-        # Check that Y is continuous (not all integers)
-        if len(np.unique(Y)) < len(Y) * 0.1:  # If less than 10% unique values, likely discrete
-            raise ValueError("Outcome variable Y should be continuous. Consider if this is appropriate.")
-
-        # Check for reasonable sample sizes
-        if len(Y) < 10:
-            raise ValueError("Sample size too small. Need at least 10 observations.")
-
-        # Check that we have both treatment groups
-        if np.sum(D == 0) == 0:
-            raise ValueError("No control observations (D=0) found.")
-        if np.sum(D == 1) == 0:
-            raise ValueError("No treated observations (D=1) found.")
-
-        # Check that we have selected observations in both groups
-        if np.sum((D == 0) & (S == 1)) == 0:
-            raise ValueError("No selected control observations (D=0, S=1) found.")
-        if np.sum((D == 1) & (S == 1)) == 0:
-            raise ValueError("No selected treated observations (D=1, S=1) found.")
-
-    def bootstrap(self, data: pd.DataFrame, outcome: str, treatment: str,
-                  selection: str, n_bootstrap: int = 500,
-                  ci_level: float = 0.95) -> Dict[str, Any]:
-        """
-        Compute bootstrap confidence intervals for the bounds.
-
-        Parameters
-        ----------
-        data : pd.DataFrame
-            Input data
-        outcome : str
-            Outcome variable name
-        treatment : str
-            Treatment variable name
-        selection : str
-            Selection variable name
-        n_bootstrap : int
-            Number of bootstrap samples
-        ci_level : float
-            Confidence level (e.g., 0.95 for 95% CI)
-
-        Returns
-        -------
-        Dict[str, Any]
-            Dictionary containing bootstrap results
-        """
-        lower_bounds = []
-        upper_bounds = []
-
-        for _ in range(n_bootstrap):
-            # Bootstrap sample
-            boot_idx = np.random.choice(len(data), size=len(data), replace=True)
-            boot_data = data.iloc[boot_idx].reset_index(drop=True)
-
-            try:
-                # Fit Lee bounds on bootstrap sample
-                lb = LeeBounds()
-                results = lb.fit(boot_data, outcome, treatment, selection)
-                lower_bounds.append(results.lower_bound)
-                upper_bounds.append(results.upper_bound)
-            except:
-                # Skip if bootstrap sample fails
-                continue
-
-        # Calculate confidence intervals
-        alpha = 1 - ci_level
-        lower_ci = np.percentile(lower_bounds, [alpha/2*100, (1-alpha/2)*100])
-        upper_ci = np.percentile(upper_bounds, [alpha/2*100, (1-alpha/2)*100])
-
-        return {
-            'lower_bound_ci': lower_ci,
-            'upper_bound_ci': upper_ci,
-            'lower_bounds': lower_bounds,
-            'upper_bounds': upper_bounds,
-            'ci_level': ci_level
-        }
-
-
-class LeeBoundsResults:
-    """
-    Results from Lee bounds estimation.
-    """
-
-    def __init__(self, lower_bound: float, upper_bound: float, p1: float,
-                 p0: float, trim_proportion: float, n_treated: int, n_control: int,
-                 n_treated_selected: int, n_control_selected: int):
-        self.lower_bound = lower_bound
-        self.upper_bound = upper_bound
-        self.p1 = p1
-        self.p0 = p0
-        self.trim_proportion = trim_proportion
-        self.n_treated = n_treated
-        self.n_control = n_control
-        self.n_treated_selected = n_treated_selected
-        self.n_control_selected = n_control_selected
-
-    def summary(self) -> str:
-        """
-        Return a summary of the results.
-
-        Returns
-        -------
-        str
-            Formatted summary string
-        """
-        summary = f"""
-Lee (2009) Treatment Effect Bounds
-=================================
-
-Treatment Effect Bounds:
-- Lower bound: {self.lower_bound:.4f}
-- Upper bound: {self.upper_bound:.4f}
-- Bound width: {self.upper_bound - self.lower_bound:.4f}
-"""
-        return summary
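Taken together, the two versions of lee_bounds.py amount to an API redesign: the separate `LeeBoundsResults` class and public `bootstrap()` method are gone, bootstrap settings move into the constructor, `fit()` now returns the estimator itself with the bootstrap already run, and plotting (with its matplotlib and seaborn dependencies) is dropped. A hedged before/after sketch, reusing the `df` from the Quick Start example above:

```python
from pyleebounds import LeeBounds  # assumed top-level export, as in the Quick Start

# 0.1.0 (removed): fit() returned a LeeBoundsResults object, and confidence
# intervals required a separate bootstrap() call that re-fit the estimator:
#   lb = LeeBounds()  # only a trim_method parameter
#   results = lb.fit(df, outcome='Y', treatment='D', selection='S')
#   cis = lb.bootstrap(df, 'Y', 'D', 'S', n_bootstrap=500, ci_level=0.95)
#   print(results.summary())  # bounds only, no CIs or SEs

# 0.2.0: bootstrap configuration lives on the estimator, fit() returns self,
# and summary() reports bounds, percentile CIs, and bootstrap SEs in one place.
lb = LeeBounds(n_bootstrap=100, ci_level=0.95)
lb.fit(df, outcome='Y', treatment='D', selection='S')
print(lb.summary())
```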