pyleebounds 0.1.0__tar.gz → 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pyleebounds
3
- Version: 0.1.0
3
+ Version: 0.2.0
4
4
  Summary: Python package for Lee 2009 treatment effect bounds under sample selection
5
5
  Home-page: https://github.com/vyasenov/pyleebounds
6
6
  Author: Vasco Yasenov
@@ -19,8 +19,6 @@ Description-Content-Type: text/markdown
19
19
  License-File: LICENSE
20
20
  Requires-Dist: numpy>=1.20.0
21
21
  Requires-Dist: pandas>=1.3.0
22
- Requires-Dist: matplotlib>=3.3.0
23
- Requires-Dist: seaborn>=0.11.0
24
22
  Requires-Dist: scipy>=1.7.0
25
23
  Provides-Extra: dev
26
24
  Requires-Dist: pytest>=6.0; extra == "dev"
@@ -39,9 +37,10 @@ Dynamic: requires-dist
39
37
  Dynamic: requires-python
40
38
  Dynamic: summary
41
39
 
42
-
43
40
  # pyleebounds
44
41
 
42
+ ![](https://img.shields.io/badge/license-MIT-green)
43
+
45
44
  A Python package for estimating treatment effect bounds under sample selection, based on the method of Lee (2009). This approach is especially useful when selection into the observed sample (e.g., post-treatment employment) differs by treatment status and may introduce bias in outcome comparisons.
46
45
 
47
46
  ## Installation
@@ -57,7 +56,7 @@ pip install pyleebounds
57
56
  * Sharp nonparametric bounds on treatment effects under endogenous sample selection
58
57
  * Automatically handles non-random attrition or truncation (e.g. only observing outcomes for employed individuals)
59
58
  * Bootstrap confidence intervals
60
- * Seamless integration with Pandas
59
+ * Seamless integration with `pandas`
61
60
 
62
61
  ## Quick Start
63
62
 
@@ -93,19 +92,18 @@ df = pd.DataFrame({
93
92
  })
94
93
 
95
94
  # Initialize and fit Lee bounds estimator
96
- lb = LeeBounds()
95
+ # Use fewer bootstrap replications for faster execution in this example
96
+ lb = LeeBounds(n_bootstrap=20, ci_level=0.95)
97
97
  results = lb.fit(df, outcome='Y', treatment='D', selection='S')
98
98
 
99
- # View summary
100
- print(results.summary())
101
-
102
- # Plot estimated bounds
103
- results.plot()
99
+ # View comprehensive summary
100
+ print(lb.summary())
104
101
  ```
105
102
 
103
+
106
104
  ## Examples
107
105
 
108
- You can find detailed usage examples in the `examples/` directory.
106
+ You can find detailed usage examples in the `examples/` directory.
109
107
 
110
108
  ## Background
111
109
 
@@ -1,6 +1,7 @@
1
-
2
1
  # pyleebounds
3
2
 
3
+ ![](https://img.shields.io/badge/license-MIT-green)
4
+
4
5
  A Python package for estimating treatment effect bounds under sample selection, based on the method of Lee (2009). This approach is especially useful when selection into the observed sample (e.g., post-treatment employment) differs by treatment status and may introduce bias in outcome comparisons.
5
6
 
6
7
  ## Installation
@@ -16,7 +17,7 @@ pip install pyleebounds
16
17
  * Sharp nonparametric bounds on treatment effects under endogenous sample selection
17
18
  * Automatically handles non-random attrition or truncation (e.g. only observing outcomes for employed individuals)
18
19
  * Bootstrap confidence intervals
19
- * Seamless integration with Pandas
20
+ * Seamless integration with `pandas`
20
21
 
21
22
  ## Quick Start
22
23
 
@@ -52,19 +53,18 @@ df = pd.DataFrame({
52
53
  })
53
54
 
54
55
  # Initialize and fit Lee bounds estimator
55
- lb = LeeBounds()
56
+ # Use fewer bootstrap replications for faster execution in this example
57
+ lb = LeeBounds(n_bootstrap=20, ci_level=0.95)
56
58
  results = lb.fit(df, outcome='Y', treatment='D', selection='S')
57
59
 
58
- # View summary
59
- print(results.summary())
60
-
61
- # Plot estimated bounds
62
- results.plot()
60
+ # View comprehensive summary
61
+ print(lb.summary())
63
62
  ```
64
63
 
64
+
65
65
  ## Examples
66
66
 
67
- You can find detailed usage examples in the `examples/` directory.
67
+ You can find detailed usage examples in the `examples/` directory.
68
68
 
69
69
  ## Background
70
70
 
@@ -0,0 +1,329 @@
1
+ """
2
+ Implementation of Lee (2009) treatment effect bounds under sample selection.
3
+ """
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+ from typing import Tuple, Dict, Any
8
+
9
class LeeBounds:
    """
    Lee (2009) treatment effect bounds estimator.

    Implements the method from Lee (2009) for estimating sharp bounds on treatment
    effects when selection into the post-treatment sample is endogenous. The
    group with the higher selection rate (here: the treated group) is trimmed
    from the top or from the bottom of its outcome distribution so that both
    groups are comparable; the two trimmed means give the lower and upper bound.

    Parameters
    ----------
    n_bootstrap : int, default=100
        Number of bootstrap samples for confidence intervals
    ci_level : float, default=0.95
        Confidence level for bootstrap confidence intervals

    Attributes
    ----------
    The following are set by ``fit``: ``lower_bound``, ``upper_bound``, ``p1``,
    ``p0``, ``trim_proportion``, ``n_treated``, ``n_control``,
    ``n_treated_selected``, ``n_control_selected``, ``lower_bound_ci``,
    ``upper_bound_ci``, ``lower_bound_se``, ``upper_bound_se``,
    ``lower_bounds_bootstrap`` and ``upper_bounds_bootstrap``.

    References
    ----------
    Lee, D. S. (2009). Training, wages, and sample selection: Estimating sharp
    bounds on treatment effects. The Review of Economic Studies, 76(3), 1071-1102.
    """

    def __init__(self, n_bootstrap: int = 100, ci_level: float = 0.95):
        """
        Initialize Lee bounds estimator.

        Parameters
        ----------
        n_bootstrap : int, default=100
            Number of bootstrap samples for confidence intervals
        ci_level : float, default=0.95
            Confidence level for bootstrap confidence intervals
        """
        self.n_bootstrap = n_bootstrap
        self.ci_level = ci_level

    @staticmethod
    def _drop_missing(Y: np.ndarray, D: np.ndarray,
                      S: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """
        Remove rows that cannot be used by the estimator.

        A row is unusable when its treatment or selection indicator is missing,
        or when it is marked as selected (S == 1) but has no outcome. Rows with
        S == 0 are kept even if Y is missing: an unobserved outcome is exactly
        what S == 0 means, and dropping those rows would distort the selection
        rates.

        Returns
        -------
        Tuple[np.ndarray, np.ndarray, np.ndarray]
            The filtered (Y, D, S) arrays
        """
        unusable = np.isnan(D) | np.isnan(S) | (np.isnan(Y) & (S == 1))
        keep = ~unusable
        return Y[keep], D[keep], S[keep]

    def _compute_bounds(self, Y: np.ndarray, D: np.ndarray, S: np.ndarray,
                        p1: float, p0: float) -> Tuple[float, float]:
        """
        Compute Lee treatment effect bounds using trimming approach.

        Parameters
        ----------
        Y : np.ndarray; Outcome values
        D : np.ndarray; Treatment indicators
        S : np.ndarray; Selection indicators
        p1 : float; Selection rate in treated group
        p0 : float; Selection rate in control group

        Returns
        -------
        Tuple[float, float]
            (lower_bound, upper_bound)

        Raises
        ------
        ValueError
            If trimming would remove every selected treated observation
        """
        selected = S == 1

        # Control group mean (among selected)
        control_mean = np.mean(Y[(D == 0) & selected])

        # Treated group (among selected)
        treated_selected = Y[(D == 1) & selected]
        n_selected = len(treated_selected)

        # For Lee bounds, we trim the treated group to match control selection rate
        # Lower bound: trim from top (keep lowest outcomes)
        # Upper bound: trim from bottom (keep highest outcomes)
        if p1 > p0:
            trim_prop = (p1 - p0) / p1
            # The small tolerance stops a product such as 0.9999999999999998
            # from being truncated to 0 instead of 1.
            n_trim = int(np.floor(n_selected * trim_prop + 1e-9))
            n_keep = n_selected - n_trim

            if n_keep <= 0:
                # Handle case where we'd trim everything
                raise ValueError("Trim proportion too large - would remove all observations")

            sorted_treated = np.sort(treated_selected)
            # Slice by the number kept rather than with [:-n_trim]:
            # [:-0] is an empty slice and would turn the lower bound into NaN.
            lower_trimmed_mean = np.mean(sorted_treated[:n_keep])
            upper_trimmed_mean = np.mean(sorted_treated[n_trim:])
        else:
            lower_trimmed_mean = upper_trimmed_mean = np.mean(treated_selected)

        lower_bound = lower_trimmed_mean - control_mean
        upper_bound = upper_trimmed_mean - control_mean

        return lower_bound, upper_bound

    def _validate_data(self, Y: np.ndarray, D: np.ndarray, S: np.ndarray) -> None:
        """
        Validate that variables have correct types and values.

        Parameters
        ----------
        Y : np.ndarray
            Outcome variable
        D : np.ndarray
            Treatment variable
        S : np.ndarray
            Selection variable

        Raises
        ------
        ValueError
            If validation fails
        """
        # Check that D is binary with values 0 and 1. Passing this check already
        # guarantees that both treatment groups are present.
        unique_d = np.unique(D)
        if not np.array_equal(unique_d, np.array([0, 1])):
            raise ValueError(f"Treatment variable D must be binary (0, 1). Found values: {unique_d}")

        # Check that S is binary with values 0 and 1
        unique_s = np.unique(S)
        if not np.array_equal(unique_s, np.array([0, 1])):
            raise ValueError(f"Selection variable S must be binary (0, 1). Found values: {unique_s}")

        # Heuristic continuity check: fewer than 10% unique values suggests a
        # discrete outcome. Only observed outcomes enter the estimator, so
        # placeholders stored for unselected rows are ignored here.
        observed = Y[S == 1]
        if len(np.unique(observed)) < len(observed) * 0.1:
            raise ValueError("Outcome variable Y should be continuous. Consider if this is appropriate.")

        # Check for reasonable sample sizes
        if len(Y) < 10:
            raise ValueError("Sample size too small. Need at least 10 observations.")

        # Check that we have selected observations in both groups
        if not np.any((D == 0) & (S == 1)):
            raise ValueError("No selected control observations (D=0, S=1) found.")
        if not np.any((D == 1) & (S == 1)):
            raise ValueError("No selected treated observations (D=1, S=1) found.")

    def _bootstrap(self, data: pd.DataFrame, outcome: str, treatment: str,
                   selection: str) -> Dict[str, Any]:
        """
        Compute bootstrap confidence intervals for the bounds.

        Rows of ``data`` are resampled with replacement ``self.n_bootstrap``
        times using NumPy's global random state. Replicates that fail
        validation, or in which the treated selection rate does not exceed the
        control rate, are skipped.

        Parameters
        ----------
        data : pd.DataFrame; Input data
        outcome : str; Outcome variable name
        treatment : str; Treatment variable name
        selection : str; Selection variable name

        Returns
        -------
        Dict[str, Any]
            Dictionary with percentile intervals (``lower_bound_ci``,
            ``upper_bound_ci``), standard errors (``lower_bound_se``,
            ``upper_bound_se``), the replicate estimates (``lower_bounds``,
            ``upper_bounds``) and ``ci_level``. Intervals are NaN when no
            replicate succeeded; standard errors are NaN with fewer than two
            successful replicates.
        """
        # Pull the columns out once; indexing arrays is equivalent to, and much
        # cheaper than, rebuilding a DataFrame for every replicate.
        Y_all = data[outcome].values
        D_all = data[treatment].values
        S_all = data[selection].values
        n_rows = len(data)

        lower_bounds = []
        upper_bounds = []

        for _ in range(self.n_bootstrap):
            boot_idx = np.random.choice(n_rows, size=n_rows, replace=True)

            try:
                Y, D, S = self._drop_missing(Y_all[boot_idx], D_all[boot_idx], S_all[boot_idx])
                self._validate_data(Y, D, S)

                p1 = np.mean(S[D == 1])
                p0 = np.mean(S[D == 0])
                if p1 <= p0:
                    continue  # Skip this bootstrap sample

                lower_bound, upper_bound = self._compute_bounds(Y, D, S, p1, p0)
            except ValueError:
                # Degenerate resample (validation or trimming failed): skip it.
                # Anything else is a genuine error and must surface.
                continue

            lower_bounds.append(lower_bound)
            upper_bounds.append(upper_bound)

        if lower_bounds:
            alpha = 1 - self.ci_level
            lower_ci = np.percentile(lower_bounds, [alpha/2*100, (1-alpha/2)*100])
            upper_ci = np.percentile(upper_bounds, [alpha/2*100, (1-alpha/2)*100])
        else:
            lower_ci = np.array([np.nan, np.nan])
            upper_ci = np.array([np.nan, np.nan])

        if len(lower_bounds) > 1:
            lower_bound_se = np.std(lower_bounds, ddof=1)
            upper_bound_se = np.std(upper_bounds, ddof=1)
        else:
            # A sample standard deviation needs at least two replicates.
            lower_bound_se = upper_bound_se = np.nan

        return {
            'lower_bound_ci': lower_ci,
            'upper_bound_ci': upper_ci,
            'lower_bound_se': lower_bound_se,
            'upper_bound_se': upper_bound_se,
            'lower_bounds': lower_bounds,
            'upper_bounds': upper_bounds,
            'ci_level': self.ci_level
        }

    def fit(self, data: pd.DataFrame, outcome: str, treatment: str,
            selection: str) -> 'LeeBounds':
        """
        Fit Lee bounds estimator to the data.

        Parameters
        ----------
        data : pd.DataFrame; Input data containing outcome, treatment, and selection variables
        outcome : str; Name of the outcome variable column
        treatment : str; Name of the treatment indicator column (0=control, 1=treated)
        selection : str; Name of the selection indicator column (0=missing, 1=observed)

        Returns
        -------
        LeeBounds
            Self with fitted results

        Raises
        ------
        ValueError
            If a column is missing, validation fails, or the selection rate of
            the treated group does not exceed that of the control group
        """
        required_cols = [outcome, treatment, selection]
        missing_cols = [col for col in required_cols if col not in data.columns]
        if missing_cols:
            raise ValueError(f"Missing columns: {missing_cols}")

        Y, D, S = self._drop_missing(
            data[outcome].values, data[treatment].values, data[selection].values
        )
        self._validate_data(Y, D, S)

        p1 = np.mean(S[D == 1])  # Selection rate in treated group
        p0 = np.mean(S[D == 0])  # Selection rate in control group

        if p1 <= p0:
            raise ValueError("Selection rate in treated group must be greater than control group")

        lower_bound, upper_bound = self._compute_bounds(Y, D, S, p1, p0)

        self.lower_bound = lower_bound
        self.upper_bound = upper_bound
        self.p1 = p1
        self.p0 = p0
        # Share of selected treated observations that is trimmed; this is the
        # quantity _compute_bounds applies, not the raw difference p1 - p0.
        self.trim_proportion = (p1 - p0) / p1
        self.n_treated = int(np.sum(D == 1))
        self.n_control = int(np.sum(D == 0))
        self.n_treated_selected = int(np.sum((D == 1) & (S == 1)))
        self.n_control_selected = int(np.sum((D == 0) & (S == 1)))

        bootstrap_results = self._bootstrap(data, outcome, treatment, selection)
        self.lower_bound_ci = bootstrap_results['lower_bound_ci']
        self.upper_bound_ci = bootstrap_results['upper_bound_ci']
        self.lower_bound_se = bootstrap_results['lower_bound_se']
        self.upper_bound_se = bootstrap_results['upper_bound_se']
        self.lower_bounds_bootstrap = bootstrap_results['lower_bounds']
        self.upper_bounds_bootstrap = bootstrap_results['upper_bounds']

        return self

    def summary(self) -> str:
        """
        Return a summary of the results.

        Returns
        -------
        str
            Formatted summary string, or a short notice if ``fit`` has not
            been called yet
        """
        if not hasattr(self, 'lower_bound'):
            return "No results available. Please fit the model first."

        if not np.isnan(self.lower_bound_ci[0]):
            lower_ci_str = f"[{self.lower_bound_ci[0]:.4f}, {self.lower_bound_ci[1]:.4f}]"
            upper_ci_str = f"[{self.upper_bound_ci[0]:.4f}, {self.upper_bound_ci[1]:.4f}]"
        else:
            lower_ci_str = "Not computed"
            upper_ci_str = "Not computed"

        # Standard errors can be unavailable even when intervals exist
        # (a single successful replicate), so they are checked separately.
        if not np.isnan(self.lower_bound_se):
            lower_se_str = f"{self.lower_bound_se:.4f}"
            upper_se_str = f"{self.upper_bound_se:.4f}"
        else:
            lower_se_str = "Not computed"
            upper_se_str = "Not computed"

        # ":g" rounds instead of truncating: int(0.57 * 100) would print 56.
        lines = [
            "",
            "Lee (2009) Treatment Effect Bounds",
            "=================================",
            "",
            "Treatment Effect Bounds:",
            f"- Lower bound: {self.lower_bound:.4f}",
            f"- Upper bound: {self.upper_bound:.4f}",
            f"- Bound width: {self.upper_bound - self.lower_bound:.4f}",
            "",
            f"Bootstrap Confidence Intervals ({self.ci_level * 100:g}%):",
            f"- Lower bound CI: {lower_ci_str}",
            f"- Upper bound CI: {upper_ci_str}",
            "",
            "Bootstrap Standard Errors:",
            f"- Lower bound SE: {lower_se_str}",
            f"- Upper bound SE: {upper_se_str}",
            "",
            "Sample Information:",
            f"- Treated observations: {self.n_treated} (selected: {self.n_treated_selected})",
            f"- Control observations: {self.n_control} (selected: {self.n_control_selected})",
            f"- Selection rates: p₁ = {self.p1:.3f}, p₀ = {self.p0:.3f}",
            f"- Trim proportion: {self.trim_proportion:.3f}",
            "",
        ]
        return "\n".join(lines)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pyleebounds
3
- Version: 0.1.0
3
+ Version: 0.2.0
4
4
  Summary: Python package for Lee 2009 treatment effect bounds under sample selection
5
5
  Home-page: https://github.com/vyasenov/pyleebounds
6
6
  Author: Vasco Yasenov
@@ -19,8 +19,6 @@ Description-Content-Type: text/markdown
19
19
  License-File: LICENSE
20
20
  Requires-Dist: numpy>=1.20.0
21
21
  Requires-Dist: pandas>=1.3.0
22
- Requires-Dist: matplotlib>=3.3.0
23
- Requires-Dist: seaborn>=0.11.0
24
22
  Requires-Dist: scipy>=1.7.0
25
23
  Provides-Extra: dev
26
24
  Requires-Dist: pytest>=6.0; extra == "dev"
@@ -39,9 +37,10 @@ Dynamic: requires-dist
39
37
  Dynamic: requires-python
40
38
  Dynamic: summary
41
39
 
42
-
43
40
  # pyleebounds
44
41
 
42
+ ![](https://img.shields.io/badge/license-MIT-green)
43
+
45
44
  A Python package for estimating treatment effect bounds under sample selection, based on the method of Lee (2009). This approach is especially useful when selection into the observed sample (e.g., post-treatment employment) differs by treatment status and may introduce bias in outcome comparisons.
46
45
 
47
46
  ## Installation
@@ -57,7 +56,7 @@ pip install pyleebounds
57
56
  * Sharp nonparametric bounds on treatment effects under endogenous sample selection
58
57
  * Automatically handles non-random attrition or truncation (e.g. only observing outcomes for employed individuals)
59
58
  * Bootstrap confidence intervals
60
- * Seamless integration with Pandas
59
+ * Seamless integration with `pandas`
61
60
 
62
61
  ## Quick Start
63
62
 
@@ -93,19 +92,18 @@ df = pd.DataFrame({
93
92
  })
94
93
 
95
94
  # Initialize and fit Lee bounds estimator
96
- lb = LeeBounds()
95
+ # Use fewer bootstrap replications for faster execution in this example
96
+ lb = LeeBounds(n_bootstrap=20, ci_level=0.95)
97
97
  results = lb.fit(df, outcome='Y', treatment='D', selection='S')
98
98
 
99
- # View summary
100
- print(results.summary())
101
-
102
- # Plot estimated bounds
103
- results.plot()
99
+ # View comprehensive summary
100
+ print(lb.summary())
104
101
  ```
105
102
 
103
+
106
104
  ## Examples
107
105
 
108
- You can find detailed usage examples in the `examples/` directory.
106
+ You can find detailed usage examples in the `examples/` directory.
109
107
 
110
108
  ## Background
111
109
 
@@ -1,7 +1,5 @@
1
1
  numpy>=1.20.0
2
2
  pandas>=1.3.0
3
- matplotlib>=3.3.0
4
- seaborn>=0.11.0
5
3
  scipy>=1.7.0
6
4
 
7
5
  [dev]
@@ -30,7 +30,7 @@ def read_requirements():
30
30
 
31
31
  setup(
32
32
  name="pyleebounds",
33
- version="0.1.0",
33
+ version="0.2.0",
34
34
  author="Vasco Yasenov",
35
35
  author_email="",
36
36
  description="Python package for Lee 2009 treatment effect bounds under sample selection",
@@ -1,291 +0,0 @@
1
- """
2
- Implementation of Lee (2009) treatment effect bounds under sample selection.
3
- """
4
-
5
- import numpy as np
6
- import pandas as pd
7
- from typing import Tuple, Dict, Any
8
- import matplotlib.pyplot as plt
9
-
10
-
11
class LeeBounds:
    """
    Lee (2009) treatment effect bounds estimator.

    Implements the method from Lee (2009) for estimating sharp bounds on treatment
    effects when selection into the post-treatment sample is endogenous.

    Parameters
    ----------
    trim_method : str, default='quantile'
        Stored on the instance; the trimming code in this class does not
        branch on it.

    References
    ----------
    Lee, D. S. (2009). Training, wages, and sample selection: Estimating sharp
    bounds on treatment effects. The Review of Economic Studies, 76(3), 1071-1102.
    """

    def __init__(self, trim_method: str = 'quantile'):
        self.trim_method = trim_method
        self.fitted = False
        self.results = None

    def fit(self, data: pd.DataFrame, outcome: str, treatment: str,
            selection: str) -> 'LeeBoundsResults':
        """
        Fit Lee bounds estimator to the data.

        Parameters
        ----------
        data : pd.DataFrame
            Input data containing outcome, treatment, and selection variables
        outcome : str
            Name of the outcome variable column
        treatment : str
            Name of the treatment indicator column (0=control, 1=treated)
        selection : str
            Name of the selection indicator column (0=missing, 1=observed)

        Returns
        -------
        LeeBoundsResults
            Results object containing bounds and summary statistics

        Raises
        ------
        ValueError
            If a column is missing, validation fails, or the selection rate of
            the treated group does not exceed that of the control group
        """
        # Validate inputs
        required_cols = [outcome, treatment, selection]
        missing_cols = [col for col in required_cols if col not in data.columns]
        if missing_cols:
            raise ValueError(f"Missing columns: {missing_cols}")

        # Extract data
        Y = data[outcome].values
        D = data[treatment].values
        S = data[selection].values

        # Remove missing values
        valid_mask = ~(np.isnan(Y) | np.isnan(D) | np.isnan(S))
        Y, D, S = Y[valid_mask], D[valid_mask], S[valid_mask]

        # Validate data types and values
        self._validate_variables(Y, D, S)

        # Calculate selection rates
        p1 = np.mean(S[D == 1])  # Selection rate in treated group
        p0 = np.mean(S[D == 0])  # Selection rate in control group

        if p1 <= p0:
            raise ValueError("Selection rate in treated group must be greater than control group")

        # Calculate bounds
        lower_bound, upper_bound = self._compute_bounds(Y, D, S, p1, p0)

        # Store results. trim_proportion is the share of selected treated
        # observations that _compute_bounds trims, not the raw p1 - p0.
        self.results = LeeBoundsResults(
            lower_bound=lower_bound,
            upper_bound=upper_bound,
            p1=p1,
            p0=p0,
            trim_proportion=(p1 - p0) / p1,
            n_treated=np.sum(D == 1),
            n_control=np.sum(D == 0),
            n_treated_selected=np.sum((D == 1) & (S == 1)),
            n_control_selected=np.sum((D == 0) & (S == 1))
        )

        self.fitted = True
        return self.results

    def _compute_bounds(self, Y: np.ndarray, D: np.ndarray, S: np.ndarray,
                        p1: float, p0: float) -> Tuple[float, float]:
        """
        Compute Lee bounds using trimming approach.

        Parameters
        ----------
        Y : np.ndarray
            Outcome values
        D : np.ndarray
            Treatment indicators
        S : np.ndarray
            Selection indicators
        p1 : float
            Selection rate in treated group
        p0 : float
            Selection rate in control group

        Returns
        -------
        Tuple[float, float]
            (lower_bound, upper_bound)

        Raises
        ------
        ValueError
            If trimming would remove every selected treated observation
        """
        # Control group mean (among selected)
        control_mean = np.mean(Y[(D == 0) & (S == 1)])

        # Treated group (among selected)
        treated_selected = Y[(D == 1) & (S == 1)]

        # Trim proportion, expressed as a share of the *selected* treated
        # observations. Using p1 - p0 directly would trim too little, because
        # that difference is a share of all treated units.
        trim_prop = (p1 - p0) / p1

        # For Lee bounds, we trim the treated group to match control selection rate
        # Lower bound: trim from top (keep lowest outcomes)
        # Upper bound: trim from bottom (keep highest outcomes)
        n_trim = int(len(treated_selected) * trim_prop)

        if n_trim > 0:
            if n_trim >= len(treated_selected):
                raise ValueError("Trim proportion too large - would remove all observations")
            sorted_treated = np.sort(treated_selected)
            # Lower bound: keep bottom (1 - trim_prop) of observations
            lower_trimmed_mean = np.mean(sorted_treated[:-n_trim])
            # Upper bound: keep top (1 - trim_prop) of observations
            upper_trimmed_mean = np.mean(sorted_treated[n_trim:])
        else:
            lower_trimmed_mean = upper_trimmed_mean = np.mean(treated_selected)

        lower_bound = lower_trimmed_mean - control_mean
        upper_bound = upper_trimmed_mean - control_mean

        return lower_bound, upper_bound

    def _validate_variables(self, Y: np.ndarray, D: np.ndarray, S: np.ndarray) -> None:
        """
        Validate that variables have correct types and values.

        Parameters
        ----------
        Y : np.ndarray
            Outcome variable
        D : np.ndarray
            Treatment variable
        S : np.ndarray
            Selection variable

        Raises
        ------
        ValueError
            If validation fails
        """
        # Check that D is binary with values 0 and 1
        unique_d = np.unique(D)
        if not np.array_equal(unique_d, np.array([0, 1])):
            raise ValueError(f"Treatment variable D must be binary (0, 1). Found values: {unique_d}")

        # Check that S is binary with values 0 and 1
        unique_s = np.unique(S)
        if not np.array_equal(unique_s, np.array([0, 1])):
            raise ValueError(f"Selection variable S must be binary (0, 1). Found values: {unique_s}")

        # Heuristic continuity check: fewer than 10% unique values suggests a discrete outcome
        if len(np.unique(Y)) < len(Y) * 0.1:
            raise ValueError("Outcome variable Y should be continuous. Consider if this is appropriate.")

        # Check for reasonable sample sizes
        if len(Y) < 10:
            raise ValueError("Sample size too small. Need at least 10 observations.")

        # Check that we have both treatment groups
        if np.sum(D == 0) == 0:
            raise ValueError("No control observations (D=0) found.")
        if np.sum(D == 1) == 0:
            raise ValueError("No treated observations (D=1) found.")

        # Check that we have selected observations in both groups
        if np.sum((D == 0) & (S == 1)) == 0:
            raise ValueError("No selected control observations (D=0, S=1) found.")
        if np.sum((D == 1) & (S == 1)) == 0:
            raise ValueError("No selected treated observations (D=1, S=1) found.")

    def bootstrap(self, data: pd.DataFrame, outcome: str, treatment: str,
                  selection: str, n_bootstrap: int = 500,
                  ci_level: float = 0.95) -> Dict[str, Any]:
        """
        Compute bootstrap confidence intervals for the bounds.

        Rows are resampled with replacement using NumPy's global random state.
        Replicates on which ``fit`` raises ``ValueError`` are skipped.

        Parameters
        ----------
        data : pd.DataFrame
            Input data
        outcome : str
            Outcome variable name
        treatment : str
            Treatment variable name
        selection : str
            Selection variable name
        n_bootstrap : int
            Number of bootstrap samples
        ci_level : float
            Confidence level (e.g., 0.95 for 95% CI)

        Returns
        -------
        Dict[str, Any]
            Dictionary containing bootstrap results; the intervals are NaN
            when no replicate could be fitted
        """
        lower_bounds = []
        upper_bounds = []

        for _ in range(n_bootstrap):
            # Bootstrap sample
            boot_idx = np.random.choice(len(data), size=len(data), replace=True)
            boot_data = data.iloc[boot_idx].reset_index(drop=True)

            try:
                # Fit on a fresh estimator, configured like this one, so the
                # replicate does not overwrite self.results.
                results = LeeBounds(self.trim_method).fit(boot_data, outcome, treatment, selection)
            except ValueError:
                # Degenerate resample; a bare except here would also swallow
                # KeyboardInterrupt and real programming errors.
                continue

            lower_bounds.append(results.lower_bound)
            upper_bounds.append(results.upper_bound)

        # Calculate confidence intervals
        if lower_bounds:
            alpha = 1 - ci_level
            lower_ci = np.percentile(lower_bounds, [alpha/2*100, (1-alpha/2)*100])
            upper_ci = np.percentile(upper_bounds, [alpha/2*100, (1-alpha/2)*100])
        else:
            # np.percentile raises on an empty sequence
            lower_ci = np.array([np.nan, np.nan])
            upper_ci = np.array([np.nan, np.nan])

        return {
            'lower_bound_ci': lower_ci,
            'upper_bound_ci': upper_ci,
            'lower_bounds': lower_bounds,
            'upper_bounds': upper_bounds,
            'ci_level': ci_level
        }
253
-
254
-
255
class LeeBoundsResults:
    """
    Container for the output of a Lee bounds estimation.

    Holds the two bounds, the group selection rates, the trim proportion and
    the group sizes exactly as they are passed in.
    """

    def __init__(self, lower_bound: float, upper_bound: float, p1: float,
                 p0: float, trim_proportion: float, n_treated: int, n_control: int,
                 n_treated_selected: int, n_control_selected: int):
        # Estimated bounds
        self.lower_bound, self.upper_bound = lower_bound, upper_bound
        # Selection rates and the trim proportion derived from them
        self.p1, self.p0 = p1, p0
        self.trim_proportion = trim_proportion
        # Group sizes, overall and among selected observations
        self.n_treated, self.n_control = n_treated, n_control
        self.n_treated_selected = n_treated_selected
        self.n_control_selected = n_control_selected

    def summary(self) -> str:
        """
        Build a short text report of the estimated bounds.

        Returns
        -------
        str
            Report with the lower bound, the upper bound and their difference,
            each formatted to four decimals
        """
        report_lines = [
            "",
            "Lee (2009) Treatment Effect Bounds",
            "=================================",
            "",
            "Treatment Effect Bounds:",
            f"- Lower bound: {self.lower_bound:.4f}",
            f"- Upper bound: {self.upper_bound:.4f}",
            f"- Bound width: {self.upper_bound - self.lower_bound:.4f}",
            "",
        ]
        return "\n".join(report_lines)
File without changes
File without changes