pyleebounds 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Vasil (Vasco) Yasenov
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,196 @@
1
+ Metadata-Version: 2.4
2
+ Name: pyleebounds
3
+ Version: 0.1.0
4
+ Summary: Python package for Lee 2009 treatment effect bounds under sample selection
5
+ Home-page: https://github.com/vyasenov/pyleebounds
6
+ Author: Vasco Yasenov
7
+ Author-email:
8
+ Classifier: Development Status :: 3 - Alpha
9
+ Classifier: Intended Audience :: Science/Research
10
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.8
14
+ Classifier: Programming Language :: Python :: 3.9
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Requires-Python: >=3.8
18
+ Description-Content-Type: text/markdown
19
+ License-File: LICENSE
20
+ Requires-Dist: numpy>=1.20.0
21
+ Requires-Dist: pandas>=1.3.0
22
+ Requires-Dist: matplotlib>=3.3.0
23
+ Requires-Dist: seaborn>=0.11.0
24
+ Requires-Dist: scipy>=1.7.0
25
+ Provides-Extra: dev
26
+ Requires-Dist: pytest>=6.0; extra == "dev"
27
+ Requires-Dist: pytest-cov>=2.0; extra == "dev"
28
+ Requires-Dist: black>=21.0; extra == "dev"
29
+ Requires-Dist: flake8>=3.8; extra == "dev"
30
+ Requires-Dist: mypy>=0.800; extra == "dev"
31
+ Dynamic: author
32
+ Dynamic: classifier
33
+ Dynamic: description
34
+ Dynamic: description-content-type
35
+ Dynamic: home-page
36
+ Dynamic: license-file
37
+ Dynamic: provides-extra
38
+ Dynamic: requires-dist
39
+ Dynamic: requires-python
40
+ Dynamic: summary
41
+
42
+
43
+ # pyleebounds
44
+
45
+ A Python package for estimating treatment effect bounds under sample selection, based on the method of Lee (2009). This approach is especially useful when selection into the observed sample (e.g., post-treatment employment) differs by treatment status and may introduce bias in outcome comparisons.
46
+
47
+ ## Installation
48
+
49
+ You can install the package using pip:
50
+
51
+ ```bash
52
+ pip install pyleebounds
53
+ ```
54
+
55
+ ## Features
56
+
57
+ * Sharp nonparametric bounds on treatment effects under endogenous sample selection
58
+ * Automatically handles non-random attrition or truncation (e.g. only observing outcomes for employed individuals)
59
+ * Bootstrap confidence intervals
60
+ * Seamless integration with Pandas
61
+
62
+ ## Quick Start
63
+
64
+ ```python
65
+ import pandas as pd
66
+ import numpy as np
67
+ from pyleebounds import LeeBounds
68
+
69
+ # Generate synthetic data
70
+ np.random.seed(1988)
71
+ n = 1000
72
+
73
+ # Treatment assignment (random)
74
+ D = np.random.binomial(1, 0.5, n)
75
+
76
+ # Potential outcomes (e.g., wages)
77
+ Y0 = np.random.normal(50, 10, n) # Control potential outcome
78
+ treatment_effect = np.random.normal(5, 3, n) # Treatment effect
79
+ Y1 = Y0 + treatment_effect # Treated potential outcome
80
+ Y = D * Y1 + (1 - D) * Y0 # Actual outcome
81
+
82
+ # Selection mechanism (e.g., employment)
83
+ # Higher wages and treatment increase employment probability
84
+ employment_prob = 0.3 + 0.4 * (Y > 50) + 0.2 * D
85
+ employment_prob = np.clip(employment_prob, 0, 1)
86
+ S = np.random.binomial(1, employment_prob, n)
87
+
88
+ # Create DataFrame
89
+ df = pd.DataFrame({
90
+ 'Y': Y, # outcome variable
91
+ 'D': D, # treatment indicator (1 = treated, 0 = control)
92
+ 'S': S # selection indicator (1 = observed, 0 = missing/selected out)
93
+ })
94
+
95
+ # Initialize and fit Lee bounds estimator
96
+ lb = LeeBounds()
97
+ results = lb.fit(df, outcome='Y', treatment='D', selection='S')
98
+
99
+ # View summary
100
+ print(results.summary())
101
+
102
+ # Plot estimated bounds
103
+ results.plot()
104
+ ```
105
+
106
+ ## Examples
107
+
108
+ You can find detailed usage examples in the `examples/` directory.
109
+
110
+ ## Background
111
+
112
+ ### Why Treatment Bounds?
113
+
114
+ In many applied settings, outcomes are observed only for a selected subset of the population—e.g., wages are observed only for employed individuals. If treatment affects selection (e.g., job training increases employment), naïvely comparing outcomes may confound treatment effects with selection effects.
115
+
116
+ Lee (2009) offers a way to partially identify treatment effects by trimming the treated group's distribution to match the control group’s selection rate under plausible assumptions.
117
+
118
+ ---
119
+
120
+ ### Notation
121
+
122
+ Let's establish the following notation:
123
+
124
+ * $Y$: observed *continuous* outcome
125
+ * $D \in \{0,1\}$: treatment indicator (1 = treated)
126
+ * $S \in \{0,1\}$: selection indicator (1 = observed)
127
+ * $Y(0), Y(1)$: potential outcomes under control/treatment
128
+ * $S(0), S(1)$: potential selection statuses
129
+ * $p_1 = \Pr(S=1 \mid D=1)$, $p_0 = \Pr(S=1 \mid D=0)$: selection rates
130
+
131
+ For each unit we observe $\left(D, S, Y\times S \right)$.
132
+
133
+ ---
134
+
135
+ ### Assumptions
136
+
137
+ 1. Monotonicity: Treatment weakly increases the probability of being observed $$S(1)\geq S(0).$$
138
+ 2. Exogeneity: Treatment is randomly assigned or unconfounded $$\left(Y(0),Y(1),S(0),S(1)\right) \perp D.$$
139
+
140
+ ---
141
+
142
+ ### Main Result
143
+
144
+ To adjust for differential selection, Lee (2009) suggested trimming the treated group’s outcome distribution among those with $S=1$. We then compute bounds on the average treatment effect (ATE) for the always-observed units (those who would be selected regardless of treatment status, i.e. $S(0)=S(1)=1$) as:
145
+
146
+ $$
147
+ ATE \in \left[ \underline{\Delta}, \overline{\Delta} \right],
148
+ $$
149
+
150
+ where:
151
+
152
+ $$
153
+ \underline{\Delta} = \mathbb{E}[Y \mid Y\leq q^{\frac{p_0}{p_1}}, D=1, S=1] - \mathbb{E}[Y \mid D=0, S=1]
154
+ $$
155
+
156
+ $$
157
+ \overline{\Delta} = \mathbb{E}[Y \mid Y\geq q^{1-\frac{p_0}{p_1}}, D=1, S=1] - \mathbb{E}[Y \mid D=0, S=1]
158
+ $$
159
+
160
+ Here $q^{u}$ represents the $u$th quantile of $Y|D=1,S=1$. These form sharp bounds under the stated assumptions.
161
+
162
+ These bounds can be tightened in presence of additional covariates $X$, but this package does not offer that functionality. See also Semenova (2020).
163
+
164
+ ---
165
+
166
+ ### Confidence Intervals
167
+
168
+ Since the Lee bounds involve non-differentiable operations (quantile trimming), variance formulas are complex. Instead, this package provides bootstrap confidence intervals computed as follows:
169
+
170
+ 1. Resample units with replacement, stratified by treatment group.
171
+ 2. Compute Lee bounds for each bootstrap sample.
172
+ 3. Construct percentile intervals using the empirical bootstrap distribution.
173
+
174
+ ## References
175
+
176
+ * Lee, D. S. (2009). *Training, wages, and sample selection: Estimating sharp bounds on treatment effects*. *The Review of Economic Studies*, 76(3), 1071–1102.
177
+ * Semenova, V. (2020). Generalized Lee bounds. arXiv preprint arXiv:2008.12720.
178
+ * Tauchmann, H. (2014). Lee (2009) treatment-effect bounds for nonrandom sample selection. The Stata Journal, 14(4), 884-894.
179
+
180
+ ## License
181
+
182
+ This project is licensed under the MIT License – see the [LICENSE](LICENSE) file for details.
183
+
184
+ ## Citation
185
+
186
+ To cite this package in publications, use the following BibTeX entry:
187
+
188
+ ```bibtex
189
+ @misc{yasenov2025pyleebounds,
190
+ author = {Vasco Yasenov},
191
+ title = {pyleebounds: Python Tools for Estimating Treatment Effect Bounds under Sample Selection},
192
+ year = {2025},
193
+ howpublished = {\url{https://github.com/vyasenov/pyleebounds}},
194
+ note = {Version 0.1.0}
195
+ }
196
+ ```
@@ -0,0 +1,155 @@
1
+
2
+ # pyleebounds
3
+
4
+ A Python package for estimating treatment effect bounds under sample selection, based on the method of Lee (2009). This approach is especially useful when selection into the observed sample (e.g., post-treatment employment) differs by treatment status and may introduce bias in outcome comparisons.
5
+
6
+ ## Installation
7
+
8
+ You can install the package using pip:
9
+
10
+ ```bash
11
+ pip install pyleebounds
12
+ ```
13
+
14
+ ## Features
15
+
16
+ * Sharp nonparametric bounds on treatment effects under endogenous sample selection
17
+ * Automatically handles non-random attrition or truncation (e.g. only observing outcomes for employed individuals)
18
+ * Bootstrap confidence intervals
19
+ * Seamless integration with Pandas
20
+
21
+ ## Quick Start
22
+
23
+ ```python
24
+ import pandas as pd
25
+ import numpy as np
26
+ from pyleebounds import LeeBounds
27
+
28
+ # Generate synthetic data
29
+ np.random.seed(1988)
30
+ n = 1000
31
+
32
+ # Treatment assignment (random)
33
+ D = np.random.binomial(1, 0.5, n)
34
+
35
+ # Potential outcomes (e.g., wages)
36
+ Y0 = np.random.normal(50, 10, n) # Control potential outcome
37
+ treatment_effect = np.random.normal(5, 3, n) # Treatment effect
38
+ Y1 = Y0 + treatment_effect # Treated potential outcome
39
+ Y = D * Y1 + (1 - D) * Y0 # Actual outcome
40
+
41
+ # Selection mechanism (e.g., employment)
42
+ # Higher wages and treatment increase employment probability
43
+ employment_prob = 0.3 + 0.4 * (Y > 50) + 0.2 * D
44
+ employment_prob = np.clip(employment_prob, 0, 1)
45
+ S = np.random.binomial(1, employment_prob, n)
46
+
47
+ # Create DataFrame
48
+ df = pd.DataFrame({
49
+ 'Y': Y, # outcome variable
50
+ 'D': D, # treatment indicator (1 = treated, 0 = control)
51
+ 'S': S # selection indicator (1 = observed, 0 = missing/selected out)
52
+ })
53
+
54
+ # Initialize and fit Lee bounds estimator
55
+ lb = LeeBounds()
56
+ results = lb.fit(df, outcome='Y', treatment='D', selection='S')
57
+
58
+ # View summary
59
+ print(results.summary())
60
+
61
+ # Plot estimated bounds
62
+ results.plot()
63
+ ```
64
+
65
+ ## Examples
66
+
67
+ You can find detailed usage examples in the `examples/` directory.
68
+
69
+ ## Background
70
+
71
+ ### Why Treatment Bounds?
72
+
73
+ In many applied settings, outcomes are observed only for a selected subset of the population—e.g., wages are observed only for employed individuals. If treatment affects selection (e.g., job training increases employment), naïvely comparing outcomes may confound treatment effects with selection effects.
74
+
75
+ Lee (2009) offers a way to partially identify treatment effects by trimming the treated group's distribution to match the control group’s selection rate under plausible assumptions.
76
+
77
+ ---
78
+
79
+ ### Notation
80
+
81
+ Let's establish the following notation:
82
+
83
+ * $Y$: observed *continuous* outcome
84
+ * $D \in \{0,1\}$: treatment indicator (1 = treated)
85
+ * $S \in \{0,1\}$: selection indicator (1 = observed)
86
+ * $Y(0), Y(1)$: potential outcomes under control/treatment
87
+ * $S(0), S(1)$: potential selection statuses
88
+ * $p_1 = \Pr(S=1 \mid D=1)$, $p_0 = \Pr(S=1 \mid D=0)$: selection rates
89
+
90
+ For each unit we observe $\left(D, S, Y\times S \right)$.
91
+
92
+ ---
93
+
94
+ ### Assumptions
95
+
96
+ 1. Monotonicity: Treatment weakly increases the probability of being observed $$S(1)\geq S(0).$$
97
+ 2. Exogeneity: Treatment is randomly assigned or unconfounded $$\left(Y(0),Y(1),S(0),S(1)\right) \perp D.$$
98
+
99
+ ---
100
+
101
+ ### Main Result
102
+
103
+ To adjust for differential selection, Lee (2009) suggested trimming the treated group’s outcome distribution among those with $S=1$. We then compute bounds on the average treatment effect (ATE) for the always-observed units (those who would be selected regardless of treatment status, i.e. $S(0)=S(1)=1$) as:
104
+
105
+ $$
106
+ ATE \in \left[ \underline{\Delta}, \overline{\Delta} \right],
107
+ $$
108
+
109
+ where:
110
+
111
+ $$
112
+ \underline{\Delta} = \mathbb{E}[Y \mid Y\leq q^{\frac{p_0}{p_1}}, D=1, S=1] - \mathbb{E}[Y \mid D=0, S=1]
113
+ $$
114
+
115
+ $$
116
+ \overline{\Delta} = \mathbb{E}[Y \mid Y\geq q^{1-\frac{p_0}{p_1}}, D=1, S=1] - \mathbb{E}[Y \mid D=0, S=1]
117
+ $$
118
+
119
+ Here $q^{u}$ represents the $u$th quantile of $Y|D=1,S=1$. These form sharp bounds under the stated assumptions.
120
+
121
+ These bounds can be tightened in presence of additional covariates $X$, but this package does not offer that functionality. See also Semenova (2020).
122
+
123
+ ---
124
+
125
+ ### Confidence Intervals
126
+
127
+ Since the Lee bounds involve non-differentiable operations (quantile trimming), variance formulas are complex. Instead, this package provides bootstrap confidence intervals computed as follows:
128
+
129
+ 1. Resample units with replacement, stratified by treatment group.
130
+ 2. Compute Lee bounds for each bootstrap sample.
131
+ 3. Construct percentile intervals using the empirical bootstrap distribution.
132
+
133
+ ## References
134
+
135
+ * Lee, D. S. (2009). *Training, wages, and sample selection: Estimating sharp bounds on treatment effects*. *The Review of Economic Studies*, 76(3), 1071–1102.
136
+ * Semenova, V. (2020). Generalized Lee bounds. arXiv preprint arXiv:2008.12720.
137
+ * Tauchmann, H. (2014). Lee (2009) treatment-effect bounds for nonrandom sample selection. The Stata Journal, 14(4), 884-894.
138
+
139
+ ## License
140
+
141
+ This project is licensed under the MIT License – see the [LICENSE](LICENSE) file for details.
142
+
143
+ ## Citation
144
+
145
+ To cite this package in publications, use the following BibTeX entry:
146
+
147
+ ```bibtex
148
+ @misc{yasenov2025pyleebounds,
149
+ author = {Vasco Yasenov},
150
+ title = {pyleebounds: Python Tools for Estimating Treatment Effect Bounds under Sample Selection},
151
+ year = {2025},
152
+ howpublished = {\url{https://github.com/vyasenov/pyleebounds}},
153
+ note = {Version 0.1.0}
154
+ }
155
+ ```
@@ -0,0 +1,14 @@
1
+ """
2
+ pyleebounds: Python package for Lee 2009 treatment effect bounds under sample selection.
3
+
4
+ This package implements the method from Lee (2009) for estimating sharp bounds
5
+ on treatment effects when selection into the post-treatment sample is endogenous.
6
+ """
7
+
8
# Package metadata exposed as module attributes.
__version__ = "0.1.0"
__author__ = "Vasco Yasenov"
__email__ = ""

# LeeBoundsResults is what LeeBounds.fit returns; it is re-exported so callers
# can reference it (e.g. in type annotations) without importing the submodule.
from .lee_bounds import LeeBounds, LeeBoundsResults

__all__ = ["LeeBounds", "LeeBoundsResults"]
@@ -0,0 +1,291 @@
1
+ """
2
+ Implementation of Lee (2009) treatment effect bounds under sample selection.
3
+ """
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+ from typing import Tuple, Dict, Any
8
+ import matplotlib.pyplot as plt
9
+
10
+
11
class LeeBounds:
    """
    Lee (2009) treatment effect bounds estimator.

    Implements the trimming procedure of Lee (2009) for estimating sharp bounds
    on treatment effects when selection into the post-treatment sample is
    endogenous. The selected treated outcomes are trimmed from the top (lower
    bound) or from the bottom (upper bound) by the share ``(p1 - p0) / p1`` and
    compared with the mean of the selected control outcomes.

    Only the case where the treated group has the strictly higher selection
    rate (``p1 > p0``) is supported; otherwise ``fit`` raises ``ValueError``.

    Parameters
    ----------
    trim_method : str, default 'quantile'
        Label of the trimming rule. The value is stored on the instance;
        rank-based (quantile) trimming is the only rule implemented.

    Attributes
    ----------
    fitted : bool
        True once ``fit`` has completed successfully.
    results : LeeBoundsResults or None
        Results of the most recent successful ``fit`` call.

    References
    ----------
    Lee, D. S. (2009). Training, wages, and sample selection: Estimating sharp
    bounds on treatment effects. The Review of Economic Studies, 76(3), 1071-1102.
    """

    def __init__(self, trim_method: str = 'quantile'):
        self.trim_method = trim_method
        self.fitted = False
        self.results = None

    def fit(self, data: pd.DataFrame, outcome: str, treatment: str,
            selection: str) -> 'LeeBoundsResults':
        """
        Fit Lee bounds estimator to the data.

        Parameters
        ----------
        data : pd.DataFrame
            Input data containing outcome, treatment, and selection variables
        outcome : str
            Name of the outcome variable column. Values may be missing for
            rows that are not selected (selection == 0).
        treatment : str
            Name of the treatment indicator column (0=control, 1=treated)
        selection : str
            Name of the selection indicator column (0=missing, 1=observed)

        Returns
        -------
        LeeBoundsResults
            Results object containing bounds and summary statistics

        Raises
        ------
        ValueError
            If a column is missing, the variables fail validation, or the
            treated selection rate does not exceed the control selection rate.
        """
        Y, D, S = self._prepare_arrays(data, outcome, treatment, selection)
        lower_bound, upper_bound, p1, p0 = self._estimate(Y, D, S)

        self.results = LeeBoundsResults(
            lower_bound=lower_bound,
            upper_bound=upper_bound,
            p1=p1,
            p0=p0,
            # Share of *selected* treated outcomes that gets trimmed.
            trim_proportion=(p1 - p0) / p1,
            n_treated=int(np.sum(D == 1)),
            n_control=int(np.sum(D == 0)),
            n_treated_selected=int(np.sum((D == 1) & (S == 1))),
            n_control_selected=int(np.sum((D == 0) & (S == 1)))
        )

        self.fitted = True
        return self.results

    def _prepare_arrays(self, data: pd.DataFrame, outcome: str, treatment: str,
                        selection: str) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """
        Extract the three variables as float arrays and drop unusable rows.

        A row is dropped when its treatment or selection indicator is missing,
        or when it is selected (S == 1) but its outcome is missing. Rows with
        S == 0 are kept even if the outcome is missing, because they are needed
        to compute the selection rates.

        Returns
        -------
        Tuple[np.ndarray, np.ndarray, np.ndarray]
            (Y, D, S) restricted to the usable rows

        Raises
        ------
        ValueError
            If any of the named columns is absent from ``data``
        """
        required_cols = [outcome, treatment, selection]
        missing_cols = [col for col in required_cols if col not in data.columns]
        if missing_cols:
            raise ValueError(f"Missing columns: {missing_cols}")

        # na_value makes pandas' nullable dtypes (pd.NA) convert cleanly to NaN.
        Y, D, S = (
            data[col].to_numpy(dtype=float, na_value=np.nan)
            for col in required_cols
        )

        unusable = np.isnan(D) | np.isnan(S) | (np.isnan(Y) & (S == 1))
        keep = ~unusable
        return Y[keep], D[keep], S[keep]

    def _estimate(self, Y: np.ndarray, D: np.ndarray,
                  S: np.ndarray) -> Tuple[float, float, float, float]:
        """
        Validate the arrays and compute the bounds and selection rates.

        Returns
        -------
        Tuple[float, float, float, float]
            (lower_bound, upper_bound, p1, p0)

        Raises
        ------
        ValueError
            If validation fails or if p1 <= p0
        """
        self._validate_variables(Y, D, S)

        p1 = float(np.mean(S[D == 1]))  # Selection rate in treated group
        p0 = float(np.mean(S[D == 0]))  # Selection rate in control group

        if p1 <= p0:
            raise ValueError("Selection rate in treated group must be greater than control group")

        lower_bound, upper_bound = self._compute_bounds(Y, D, S, p1, p0)
        return lower_bound, upper_bound, p1, p0

    def _compute_bounds(self, Y: np.ndarray, D: np.ndarray, S: np.ndarray,
                        p1: float, p0: float) -> Tuple[float, float]:
        """
        Compute Lee bounds using trimming approach.

        Parameters
        ----------
        Y : np.ndarray
            Outcome values
        D : np.ndarray
            Treatment indicators
        S : np.ndarray
            Selection indicators
        p1 : float
            Selection rate in treated group
        p0 : float
            Selection rate in control group

        Returns
        -------
        Tuple[float, float]
            (lower_bound, upper_bound)

        Raises
        ------
        ValueError
            If p1 is not positive
        """
        if p1 <= 0:
            raise ValueError("Selection rate in treated group must be positive.")

        # Control group mean (among selected)
        control_mean = np.mean(Y[(D == 0) & (S == 1)])

        # Treated group (among selected), ordered so trimming is a slice
        sorted_treated = np.sort(Y[(D == 1) & (S == 1)])
        n_selected = len(sorted_treated)

        # The excess selection p1 - p0 is a share of *all* treated units; as a
        # share of the selected treated units it is (p1 - p0) / p1.
        trim_prop = (p1 - p0) / p1

        # The small epsilon stops floating-point error from flooring an exact
        # count (e.g. 39.999999) one observation too low.
        n_trim = int(np.floor(n_selected * trim_prop + 1e-9))
        # Always leave at least one observation to average over.
        n_trim = min(n_trim, n_selected - 1)

        if n_trim > 0:
            # Lower bound: trim from top (keep lowest outcomes)
            lower_trimmed_mean = np.mean(sorted_treated[:n_selected - n_trim])
            # Upper bound: trim from bottom (keep highest outcomes)
            upper_trimmed_mean = np.mean(sorted_treated[n_trim:])
        else:
            lower_trimmed_mean = upper_trimmed_mean = np.mean(sorted_treated)

        lower_bound = float(lower_trimmed_mean - control_mean)
        upper_bound = float(upper_trimmed_mean - control_mean)

        return lower_bound, upper_bound

    def _validate_variables(self, Y: np.ndarray, D: np.ndarray, S: np.ndarray) -> None:
        """
        Validate that variables have correct types and values.

        Parameters
        ----------
        Y : np.ndarray
            Outcome variable
        D : np.ndarray
            Treatment variable
        S : np.ndarray
            Selection variable

        Raises
        ------
        ValueError
            If validation fails
        """
        # Check that D is binary with values 0 and 1. This also guarantees
        # that both treatment groups are present.
        unique_d = np.unique(D)
        if not np.array_equal(unique_d, np.array([0, 1])):
            raise ValueError(f"Treatment variable D must be binary (0, 1). Found values: {unique_d}")

        # Check that S is binary with values 0 and 1
        unique_s = np.unique(S)
        if not np.array_equal(unique_s, np.array([0, 1])):
            raise ValueError(f"Selection variable S must be binary (0, 1). Found values: {unique_s}")

        # Check that Y is continuous. Only observed outcomes are inspected:
        # outcomes of unselected units may be missing or placeholders.
        y_observed = Y[S == 1]
        if len(np.unique(y_observed)) < len(y_observed) * 0.1:  # If less than 10% unique values, likely discrete
            raise ValueError("Outcome variable Y should be continuous. Consider if this is appropriate.")

        # Check for reasonable sample sizes
        if len(Y) < 10:
            raise ValueError("Sample size too small. Need at least 10 observations.")

        # Check that we have selected observations in both groups
        if np.sum((D == 0) & (S == 1)) == 0:
            raise ValueError("No selected control observations (D=0, S=1) found.")
        if np.sum((D == 1) & (S == 1)) == 0:
            raise ValueError("No selected treated observations (D=1, S=1) found.")

    def bootstrap(self, data: pd.DataFrame, outcome: str, treatment: str,
                  selection: str, n_bootstrap: int = 500,
                  ci_level: float = 0.95,
                  random_state: Any = None) -> Dict[str, Any]:
        """
        Compute bootstrap confidence intervals for the bounds.

        Units are resampled with replacement within each treatment group
        (stratified bootstrap), the bounds are recomputed on every replicate,
        and percentile intervals are taken over the successful replicates.
        Replicates on which estimation raises ``ValueError`` (for example
        because p1 <= p0 in that resample) are skipped and counted.

        Parameters
        ----------
        data : pd.DataFrame
            Input data
        outcome : str
            Outcome variable name
        treatment : str
            Treatment variable name
        selection : str
            Selection variable name
        n_bootstrap : int
            Number of bootstrap samples
        ci_level : float
            Confidence level (e.g., 0.95 for 95% CI)
        random_state : int or None, default None
            Seed for a dedicated ``np.random.RandomState``. With None the
            global NumPy random state is used, so ``np.random.seed`` applies.

        Returns
        -------
        Dict[str, Any]
            Dictionary with keys 'lower_bound_ci', 'upper_bound_ci',
            'lower_bounds', 'upper_bounds', 'ci_level' and 'n_failed'

        Raises
        ------
        ValueError
            If the arguments or the data are invalid, or if no replicate
            produced valid bounds.
        """
        if not 0 <= ci_level <= 1:
            raise ValueError("ci_level must be between 0 and 1.")
        if n_bootstrap < 1:
            raise ValueError("n_bootstrap must be a positive integer.")

        # Clean and validate once so malformed data fails loudly up front
        # instead of silently failing on every replicate.
        Y, D, S = self._prepare_arrays(data, outcome, treatment, selection)
        self._validate_variables(Y, D, S)

        rng = np.random if random_state is None else np.random.RandomState(random_state)
        strata = [np.flatnonzero(D == group) for group in (0, 1)]

        lower_bounds = []
        upper_bounds = []
        n_failed = 0

        for _ in range(n_bootstrap):
            boot_idx = np.concatenate(
                [rng.choice(rows, size=len(rows), replace=True) for rows in strata]
            )
            try:
                lower, upper, _p1, _p0 = self._estimate(Y[boot_idx], D[boot_idx], S[boot_idx])
            except ValueError:
                # Skip replicates on which the bounds are not defined
                n_failed += 1
                continue
            lower_bounds.append(lower)
            upper_bounds.append(upper)

        if not lower_bounds:
            raise ValueError("No bootstrap replicate produced valid bounds.")

        # Calculate confidence intervals
        alpha = 1 - ci_level
        percentiles = [alpha / 2 * 100, (1 - alpha / 2) * 100]
        lower_ci = np.percentile(lower_bounds, percentiles)
        upper_ci = np.percentile(upper_bounds, percentiles)

        return {
            'lower_bound_ci': lower_ci,
            'upper_bound_ci': upper_ci,
            'lower_bounds': lower_bounds,
            'upper_bounds': upper_bounds,
            'ci_level': ci_level,
            'n_failed': n_failed
        }
253
+
254
+
255
class LeeBoundsResults:
    """
    Results from Lee bounds estimation.

    Parameters
    ----------
    lower_bound : float
        Lower bound on the treatment effect
    upper_bound : float
        Upper bound on the treatment effect
    p1 : float
        Selection rate in the treated group
    p0 : float
        Selection rate in the control group
    trim_proportion : float
        Trimming proportion reported by the estimator
    n_treated : int
        Number of treated observations
    n_control : int
        Number of control observations
    n_treated_selected : int
        Number of treated observations with an observed outcome
    n_control_selected : int
        Number of control observations with an observed outcome
    """

    def __init__(self, lower_bound: float, upper_bound: float, p1: float,
                 p0: float, trim_proportion: float, n_treated: int, n_control: int,
                 n_treated_selected: int, n_control_selected: int):
        self.lower_bound = lower_bound
        self.upper_bound = upper_bound
        self.p1 = p1
        self.p0 = p0
        self.trim_proportion = trim_proportion
        self.n_treated = n_treated
        self.n_control = n_control
        self.n_treated_selected = n_treated_selected
        self.n_control_selected = n_control_selected

    def __repr__(self) -> str:
        """Return a compact representation showing the two bounds."""
        return (
            f"{type(self).__name__}(lower_bound={self.lower_bound:.4f}, "
            f"upper_bound={self.upper_bound:.4f})"
        )

    def summary(self) -> str:
        """
        Return a summary of the results.

        Returns
        -------
        str
            Formatted summary string with the bounds, the selection rates,
            the trimming proportion and the group sizes
        """
        lines = [
            "",
            "Lee (2009) Treatment Effect Bounds",
            "=================================",
            "",
            "Treatment Effect Bounds:",
            f"- Lower bound: {self.lower_bound:.4f}",
            f"- Upper bound: {self.upper_bound:.4f}",
            f"- Bound width: {self.upper_bound - self.lower_bound:.4f}",
            "",
            "Selection:",
            f"- Selection rate, treated (p1): {self.p1:.4f}",
            f"- Selection rate, control (p0): {self.p0:.4f}",
            f"- Trimming proportion: {self.trim_proportion:.4f}",
            "",
            "Sample Sizes:",
            f"- Treated: {self.n_treated} ({self.n_treated_selected} selected)",
            f"- Control: {self.n_control} ({self.n_control_selected} selected)",
            "",
        ]
        return "\n".join(lines)

    def plot(self, ax=None):
        """
        Draw the estimated bounds as a horizontal interval.

        Parameters
        ----------
        ax : matplotlib.axes.Axes, optional
            Axes to draw on. A new figure is created when omitted.

        Returns
        -------
        matplotlib.axes.Axes
            The axes containing the plot
        """
        # Imported here so matplotlib is only loaded when a plot is requested.
        import matplotlib.pyplot as plt

        if ax is None:
            _, ax = plt.subplots(figsize=(6, 2.5))

        ax.hlines(0, self.lower_bound, self.upper_bound, linewidth=3)
        ax.plot([self.lower_bound, self.upper_bound], [0, 0], "o")
        # Reference line at zero effect
        ax.axvline(0, linestyle="--", linewidth=1, color="grey")
        ax.set_yticks([])
        ax.set_xlabel("Treatment effect")
        ax.set_title("Lee (2009) Treatment Effect Bounds")
        return ax
@@ -0,0 +1,196 @@
1
+ Metadata-Version: 2.4
2
+ Name: pyleebounds
3
+ Version: 0.1.0
4
+ Summary: Python package for Lee 2009 treatment effect bounds under sample selection
5
+ Home-page: https://github.com/vyasenov/pyleebounds
6
+ Author: Vasco Yasenov
7
+ Author-email:
8
+ Classifier: Development Status :: 3 - Alpha
9
+ Classifier: Intended Audience :: Science/Research
10
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.8
14
+ Classifier: Programming Language :: Python :: 3.9
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Requires-Python: >=3.8
18
+ Description-Content-Type: text/markdown
19
+ License-File: LICENSE
20
+ Requires-Dist: numpy>=1.20.0
21
+ Requires-Dist: pandas>=1.3.0
22
+ Requires-Dist: matplotlib>=3.3.0
23
+ Requires-Dist: seaborn>=0.11.0
24
+ Requires-Dist: scipy>=1.7.0
25
+ Provides-Extra: dev
26
+ Requires-Dist: pytest>=6.0; extra == "dev"
27
+ Requires-Dist: pytest-cov>=2.0; extra == "dev"
28
+ Requires-Dist: black>=21.0; extra == "dev"
29
+ Requires-Dist: flake8>=3.8; extra == "dev"
30
+ Requires-Dist: mypy>=0.800; extra == "dev"
31
+ Dynamic: author
32
+ Dynamic: classifier
33
+ Dynamic: description
34
+ Dynamic: description-content-type
35
+ Dynamic: home-page
36
+ Dynamic: license-file
37
+ Dynamic: provides-extra
38
+ Dynamic: requires-dist
39
+ Dynamic: requires-python
40
+ Dynamic: summary
41
+
42
+
43
+ # pyleebounds
44
+
45
+ A Python package for estimating treatment effect bounds under sample selection, based on the method of Lee (2009). This approach is especially useful when selection into the observed sample (e.g., post-treatment employment) differs by treatment status and may introduce bias in outcome comparisons.
46
+
47
+ ## Installation
48
+
49
+ You can install the package using pip:
50
+
51
+ ```bash
52
+ pip install pyleebounds
53
+ ```
54
+
55
+ ## Features
56
+
57
+ * Sharp nonparametric bounds on treatment effects under endogenous sample selection
58
+ * Automatically handles non-random attrition or truncation (e.g. only observing outcomes for employed individuals)
59
+ * Bootstrap confidence intervals
60
+ * Seamless integration with Pandas
61
+
62
+ ## Quick Start
63
+
64
+ ```python
65
+ import pandas as pd
66
+ import numpy as np
67
+ from pyleebounds import LeeBounds
68
+
69
+ # Generate synthetic data
70
+ np.random.seed(1988)
71
+ n = 1000
72
+
73
+ # Treatment assignment (random)
74
+ D = np.random.binomial(1, 0.5, n)
75
+
76
+ # Potential outcomes (e.g., wages)
77
+ Y0 = np.random.normal(50, 10, n) # Control potential outcome
78
+ treatment_effect = np.random.normal(5, 3, n) # Treatment effect
79
+ Y1 = Y0 + treatment_effect # Treated potential outcome
80
+ Y = D * Y1 + (1 - D) * Y0 # Actual outcome
81
+
82
+ # Selection mechanism (e.g., employment)
83
+ # Higher wages and treatment increase employment probability
84
+ employment_prob = 0.3 + 0.4 * (Y > 50) + 0.2 * D
85
+ employment_prob = np.clip(employment_prob, 0, 1)
86
+ S = np.random.binomial(1, employment_prob, n)
87
+
88
+ # Create DataFrame
89
+ df = pd.DataFrame({
90
+ 'Y': Y, # outcome variable
91
+ 'D': D, # treatment indicator (1 = treated, 0 = control)
92
+ 'S': S # selection indicator (1 = observed, 0 = missing/selected out)
93
+ })
94
+
95
+ # Initialize and fit Lee bounds estimator
96
+ lb = LeeBounds()
97
+ results = lb.fit(df, outcome='Y', treatment='D', selection='S')
98
+
99
+ # View summary
100
+ print(results.summary())
101
+
102
+ # Plot estimated bounds
103
+ results.plot()
104
+ ```
105
+
106
+ ## Examples
107
+
108
+ You can find detailed usage examples in the `examples/` directory.
109
+
110
+ ## Background
111
+
112
+ ### Why Treatment Bounds?
113
+
114
+ In many applied settings, outcomes are observed only for a selected subset of the population—e.g., wages are observed only for employed individuals. If treatment affects selection (e.g., job training increases employment), naïvely comparing outcomes may confound treatment effects with selection effects.
115
+
116
+ Lee (2009) offers a way to partially identify treatment effects by trimming the treated group's distribution to match the control group’s selection rate under plausible assumptions.
117
+
118
+ ---
119
+
120
+ ### Notation
121
+
122
+ Let's establish the following notation:
123
+
124
+ * $Y$: observed *continuous* outcome
125
+ * $D \in \{0,1\}$: treatment indicator (1 = treated)
126
+ * $S \in \{0,1\}$: selection indicator (1 = observed)
127
+ * $Y(0), Y(1)$: potential outcomes under control/treatment
128
+ * $S(0), S(1)$: potential selection statuses
129
+ * $p_1 = \Pr(S=1 \mid D=1)$, $p_0 = \Pr(S=1 \mid D=0)$: selection rates
130
+
131
+ For each unit we observe $\left(D, S, Y\times S \right)$.
132
+
133
+ ---
134
+
135
+ ### Assumptions
136
+
137
+ 1. Monotonicity: Treatment weakly increases the probability of being observed $$S(1)\geq S(0).$$
138
+ 2. Exogeneity: Treatment is randomly assigned or unconfounded $$\left(Y(0),Y(1),S(0),S(1)\right) \perp D.$$
139
+
140
+ ---
141
+
142
+ ### Main Result
143
+
144
+ To adjust for differential selection, Lee (2009) suggested trimming the treated group’s outcome distribution among those with $S=1$. We then compute bounds on the average treatment effect (ATE) for the always-observed units (those who would be selected regardless of treatment status, i.e. $S(0)=S(1)=1$) as:
145
+
146
+ $$
147
+ ATE \in \left[ \underline{\Delta}, \overline{\Delta} \right],
148
+ $$
149
+
150
+ where:
151
+
152
+ $$
153
+ \underline{\Delta} = \mathbb{E}[Y \mid Y\leq q^{\frac{p_0}{p_1}}, D=1, S=1] - \mathbb{E}[Y \mid D=0, S=1]
154
+ $$
155
+
156
+ $$
157
+ \overline{\Delta} = \mathbb{E}[Y \mid Y\geq q^{1-\frac{p_0}{p_1}}, D=1, S=1] - \mathbb{E}[Y \mid D=0, S=1]
158
+ $$
159
+
160
+ Here $q^{u}$ represents the $u$th quantile of $Y|D=1,S=1$. These form sharp bounds under the stated assumptions.
161
+
162
+ These bounds can be tightened in presence of additional covariates $X$, but this package does not offer that functionality. See also Semenova (2020).
163
+
164
+ ---
165
+
166
+ ### Confidence Intervals
167
+
168
+ Since the Lee bounds involve non-differentiable operations (quantile trimming), variance formulas are complex. Instead, this package provides bootstrap confidence intervals computed as follows:
169
+
170
+ 1. Resample units with replacement, stratified by treatment group.
171
+ 2. Compute Lee bounds for each bootstrap sample.
172
+ 3. Construct percentile intervals using the empirical bootstrap distribution.
173
+
174
+ ## References
175
+
176
+ * Lee, D. S. (2009). *Training, wages, and sample selection: Estimating sharp bounds on treatment effects*. *The Review of Economic Studies*, 76(3), 1071–1102.
177
+ * Semenova, V. (2020). Generalized Lee bounds. arXiv preprint arXiv:2008.12720.
178
+ * Tauchmann, H. (2014). Lee (2009) treatment-effect bounds for nonrandom sample selection. The Stata Journal, 14(4), 884-894.
179
+
180
+ ## License
181
+
182
+ This project is licensed under the MIT License – see the [LICENSE](LICENSE) file for details.
183
+
184
+ ## Citation
185
+
186
+ To cite this package in publications, use the following BibTeX entry:
187
+
188
+ ```bibtex
189
+ @misc{yasenov2025pyleebounds,
190
+ author = {Vasco Yasenov},
191
+ title = {pyleebounds: Python Tools for Estimating Treatment Effect Bounds under Sample Selection},
192
+ year = {2025},
193
+ howpublished = {\url{https://github.com/vyasenov/pyleebounds}},
194
+ note = {Version 0.1.0}
195
+ }
196
+ ```
@@ -0,0 +1,11 @@
1
+ LICENSE
2
+ README.md
3
+ setup.py
4
+ pyleebounds/__init__.py
5
+ pyleebounds/lee_bounds.py
6
+ pyleebounds.egg-info/PKG-INFO
7
+ pyleebounds.egg-info/SOURCES.txt
8
+ pyleebounds.egg-info/dependency_links.txt
9
+ pyleebounds.egg-info/not-zip-safe
10
+ pyleebounds.egg-info/requires.txt
11
+ pyleebounds.egg-info/top_level.txt
@@ -0,0 +1,12 @@
1
+ numpy>=1.20.0
2
+ pandas>=1.3.0
3
+ matplotlib>=3.3.0
4
+ seaborn>=0.11.0
5
+ scipy>=1.7.0
6
+
7
+ [dev]
8
+ pytest>=6.0
9
+ pytest-cov>=2.0
10
+ black>=21.0
11
+ flake8>=3.8
12
+ mypy>=0.800
@@ -0,0 +1 @@
1
+ pyleebounds
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,65 @@
1
+ """
2
+ Setup script for pyleebounds package.
3
+ """
4
+
5
+ from setuptools import setup, find_packages
6
+ import os
7
+
8
def read_readme():
    """Return the text of README.md, or a one-line description if the file is absent.

    The file is looked up relative to the current working directory and
    decoded as UTF-8.
    """
    fallback = "Python package for Lee 2009 treatment effect bounds under sample selection"
    try:
        handle = open("README.md", "r", encoding="utf-8")
    except FileNotFoundError:
        return fallback
    with handle:
        return handle.read()
15
+
16
def read_requirements():
    """Return the install requirements listed in requirements.txt.

    The file is looked up relative to the current working directory. Blank
    lines, comment lines (including indented ones), trailing ``# ...``
    comments and pip option lines such as ``-r other.txt`` are ignored,
    because none of them is a valid ``install_requires`` entry. When the file
    does not exist a built-in default list is returned.
    """
    try:
        with open("requirements.txt", "r", encoding="utf-8") as fh:
            requirements = []
            for raw_line in fh:
                # Strip before testing so indented comments are recognised.
                line = raw_line.strip()
                if not line or line.startswith("#") or line.startswith("-"):
                    continue
                # pip treats " #" as the start of an inline comment.
                line = line.split(" #", 1)[0].strip()
                if line:
                    requirements.append(line)
            return requirements
    except FileNotFoundError:
        # Default requirements if file not found
        return [
            "numpy>=1.20.0",
            "pandas>=1.3.0",
            "matplotlib>=3.3.0",
            "seaborn>=0.11.0",
            "scipy>=1.7.0"
        ]
30
+
31
# Trove classifiers advertised on the package index.
_CLASSIFIERS = [
    "Development Status :: 3 - Alpha",
    "Intended Audience :: Science/Research",
    "Topic :: Scientific/Engineering :: Information Analysis",
    "License :: OSI Approved :: MIT License",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.8",
    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
]

# Development-only tooling, installed through the "dev" extra.
_DEV_REQUIREMENTS = [
    "pytest>=6.0",
    "pytest-cov>=2.0",
    "black>=21.0",
    "flake8>=3.8",
    "mypy>=0.800",
]

# Package metadata; the long description and runtime requirements are read
# from README.md and requirements.txt by the helpers defined above.
setup(
    name="pyleebounds",
    version="0.1.0",
    author="Vasco Yasenov",
    author_email="",
    description="Python package for Lee 2009 treatment effect bounds under sample selection",
    long_description=read_readme(),
    long_description_content_type="text/markdown",
    url="https://github.com/vyasenov/pyleebounds",
    packages=find_packages(),
    classifiers=_CLASSIFIERS,
    python_requires=">=3.8",
    install_requires=read_requirements(),
    extras_require={"dev": _DEV_REQUIREMENTS},
    include_package_data=True,
    zip_safe=False,
)