pyleebounds 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pyleebounds/lee_bounds.py CHANGED
@@ -5,8 +5,6 @@ Implementation of Lee (2009) treatment effect bounds under sample selection.
5
5
  import numpy as np
6
6
  import pandas as pd
7
7
  from typing import Tuple, Dict, Any
8
- import matplotlib.pyplot as plt
9
-
10
8
 
11
9
  class LeeBounds:
12
10
  """
@@ -25,93 +23,32 @@ class LeeBounds:
25
23
  bounds on treatment effects. The Review of Economic Studies, 76(3), 1071-1102.
26
24
  """
27
25
 
28
- def __init__(self, trim_method: str = 'quantile'):
29
- self.trim_method = trim_method
30
- self.fitted = False
31
- self.results = None
32
-
33
- def fit(self, data: pd.DataFrame, outcome: str, treatment: str,
34
- selection: str) -> 'LeeBoundsResults':
26
+ def __init__(self, n_bootstrap: int = 100, ci_level: float = 0.95):
35
27
  """
36
- Fit Lee bounds estimator to the data.
28
+ Initialize Lee bounds estimator.
37
29
 
38
30
  Parameters
39
31
  ----------
40
- data : pd.DataFrame
41
- Input data containing outcome, treatment, and selection variables
42
- outcome : str
43
- Name of the outcome variable column
44
- treatment : str
45
- Name of the treatment indicator column (0=control, 1=treated)
46
- selection : str
47
- Name of the selection indicator column (0=missing, 1=observed)
48
-
49
- Returns
50
- -------
51
- LeeBoundsResults
52
- Results object containing bounds and summary statistics
32
+ n_bootstrap : int, default=100
33
+ Number of bootstrap samples for confidence intervals
34
+ ci_level : float, default=0.95
35
+ Confidence level for bootstrap confidence intervals
53
36
  """
54
- # Validate inputs
55
- required_cols = [outcome, treatment, selection]
56
- missing_cols = [col for col in required_cols if col not in data.columns]
57
- if missing_cols:
58
- raise ValueError(f"Missing columns: {missing_cols}")
59
-
60
- # Extract data
61
- Y = data[outcome].values
62
- D = data[treatment].values
63
- S = data[selection].values
64
-
65
- # Remove missing values
66
- valid_mask = ~(np.isnan(Y) | np.isnan(D) | np.isnan(S))
67
- Y, D, S = Y[valid_mask], D[valid_mask], S[valid_mask]
68
-
69
- # Validate data types and values
70
- self._validate_variables(Y, D, S)
71
-
72
- # Calculate selection rates
73
- p1 = np.mean(S[D == 1]) # Selection rate in treated group
74
- p0 = np.mean(S[D == 0]) # Selection rate in control group
75
-
76
- if p1 <= p0:
77
- raise ValueError("Selection rate in treated group must be greater than control group")
78
-
79
- # Calculate bounds
80
- lower_bound, upper_bound = self._compute_bounds(Y, D, S, p1, p0)
81
-
82
- # Store results
83
- self.results = LeeBoundsResults(
84
- lower_bound=lower_bound,
85
- upper_bound=upper_bound,
86
- p1=p1,
87
- p0=p0,
88
- trim_proportion=p1 - p0,
89
- n_treated=np.sum(D == 1),
90
- n_control=np.sum(D == 0),
91
- n_treated_selected=np.sum((D == 1) & (S == 1)),
92
- n_control_selected=np.sum((D == 0) & (S == 1))
93
- )
94
-
95
- self.fitted = True
96
- return self.results
37
+ self.n_bootstrap = n_bootstrap
38
+ self.ci_level = ci_level
97
39
 
98
40
  def _compute_bounds(self, Y: np.ndarray, D: np.ndarray, S: np.ndarray,
99
41
  p1: float, p0: float) -> Tuple[float, float]:
100
42
  """
101
- Compute Lee bounds using trimming approach.
43
+ Compute Lee treatment effect bounds using trimming approach.
102
44
 
103
45
  Parameters
104
46
  ----------
105
- Y : np.ndarray
106
- Outcome values
107
- D : np.ndarray
108
- Treatment indicators
109
- S : np.ndarray
110
- Selection indicators
111
- p1 : float
112
- Selection rate in treated group
113
- p0 : float
114
- Selection rate in control group
47
+ Y : np.ndarray; Outcome values
48
+ D : np.ndarray; Treatment indicators
49
+ S : np.ndarray; Selection indicators
50
+ p1 : float; Selection rate in treated group
51
+ p0 : float; Selection rate in control group
115
52
 
116
53
  Returns
117
54
  -------
@@ -124,17 +61,20 @@ class LeeBounds:
124
61
  # Treated group (among selected)
125
62
  treated_selected = Y[(D == 1) & (S == 1)]
126
63
 
127
- # Trim proportion
128
- trim_prop = p1 - p0
129
-
130
64
  # For Lee bounds, we trim the treated group to match control selection rate
131
65
  # Lower bound: trim from top (keep lowest outcomes)
132
66
  # Upper bound: trim from bottom (keep highest outcomes)
133
- n_trim = int(len(treated_selected) * trim_prop)
134
-
135
- if n_trim > 0:
67
+ if p1 > p0:
68
+ # Trim proportion
69
+ trim_prop = (p1 - p0) / p1
70
+ n_trim = int(len(treated_selected) * trim_prop)
71
+
72
+ if n_trim >= len(treated_selected):
73
+ # Handle case where we'd trim everything
74
+ raise ValueError("Trim proportion too large - would remove all observations")
75
+
136
76
  sorted_treated = np.sort(treated_selected)
137
- # Lower bound: keep bottom (1 - trim_prop) of observations
77
+ # Lower bound: keep bottom (1 - trim_prop) of observations (trimmed from top)
138
78
  lower_trimmed_mean = np.mean(sorted_treated[:-n_trim])
139
79
  # Upper bound: keep top (1 - trim_prop) of observations
140
80
  upper_trimmed_mean = np.mean(sorted_treated[n_trim:])
@@ -146,7 +86,7 @@ class LeeBounds:
146
86
 
147
87
  return lower_bound, upper_bound
148
88
 
149
- def _validate_variables(self, Y: np.ndarray, D: np.ndarray, S: np.ndarray) -> None:
89
+ def _validate_data(self, Y: np.ndarray, D: np.ndarray, S: np.ndarray) -> None:
150
90
  """
151
91
  Validate that variables have correct types and values.
152
92
 
@@ -194,26 +134,17 @@ class LeeBounds:
194
134
  if np.sum((D == 1) & (S == 1)) == 0:
195
135
  raise ValueError("No selected treated observations (D=1, S=1) found.")
196
136
 
197
- def bootstrap(self, data: pd.DataFrame, outcome: str, treatment: str,
198
- selection: str, n_bootstrap: int = 500,
199
- ci_level: float = 0.95) -> Dict[str, Any]:
137
+ def _bootstrap(self, data: pd.DataFrame, outcome: str, treatment: str,
138
+ selection: str) -> Dict[str, Any]:
200
139
  """
201
140
  Compute bootstrap confidence intervals for the bounds.
202
141
 
203
142
  Parameters
204
143
  ----------
205
- data : pd.DataFrame
206
- Input data
207
- outcome : str
208
- Outcome variable name
209
- treatment : str
210
- Treatment variable name
211
- selection : str
212
- Selection variable name
213
- n_bootstrap : int
214
- Number of bootstrap samples
215
- ci_level : float
216
- Confidence level (e.g., 0.95 for 95% CI)
144
+ data : pd.DataFrame; Input data
145
+ outcome : str; Outcome variable name
146
+ treatment : str; Treatment variable name
147
+ selection : str; Selection variable name
217
148
 
218
149
  Returns
219
150
  -------
@@ -223,53 +154,131 @@ class LeeBounds:
223
154
  lower_bounds = []
224
155
  upper_bounds = []
225
156
 
226
- for _ in range(n_bootstrap):
157
+ for _ in range(self.n_bootstrap):
227
158
  # Bootstrap sample
228
159
  boot_idx = np.random.choice(len(data), size=len(data), replace=True)
229
160
  boot_data = data.iloc[boot_idx].reset_index(drop=True)
230
161
 
231
162
  try:
232
- # Fit Lee bounds on bootstrap sample
233
- lb = LeeBounds()
234
- results = lb.fit(boot_data, outcome, treatment, selection)
235
- lower_bounds.append(results.lower_bound)
236
- upper_bounds.append(results.upper_bound)
237
- except:
163
+ # Extract data from bootstrap sample
164
+ Y = boot_data[outcome].values
165
+ D = boot_data[treatment].values
166
+ S = boot_data[selection].values
167
+
168
+ # Remove missing values
169
+ valid_mask = ~(np.isnan(Y) | np.isnan(D) | np.isnan(S))
170
+ Y, D, S = Y[valid_mask], D[valid_mask], S[valid_mask]
171
+
172
+ # Validate data types and values
173
+ self._validate_data(Y, D, S)
174
+
175
+ # Calculate selection rates
176
+ p1 = np.mean(S[D == 1])
177
+ p0 = np.mean(S[D == 0])
178
+
179
+ if p1 <= p0:
180
+ continue # Skip this bootstrap sample
181
+
182
+ # Calculate bounds directly
183
+ lower_bound, upper_bound = self._compute_bounds(Y, D, S, p1, p0)
184
+ lower_bounds.append(lower_bound)
185
+ upper_bounds.append(upper_bound)
186
+
187
+ except Exception:
238
188
  # Skip if bootstrap sample fails
239
189
  continue
240
190
 
241
- # Calculate confidence intervals
242
- alpha = 1 - ci_level
243
- lower_ci = np.percentile(lower_bounds, [alpha/2*100, (1-alpha/2)*100])
244
- upper_ci = np.percentile(upper_bounds, [alpha/2*100, (1-alpha/2)*100])
191
+ # Calculate confidence intervals and standard errors
192
+ if len(lower_bounds) > 0:
193
+ alpha = 1 - self.ci_level
194
+ lower_ci = np.percentile(lower_bounds, [alpha/2*100, (1-alpha/2)*100])
195
+ upper_ci = np.percentile(upper_bounds, [alpha/2*100, (1-alpha/2)*100])
196
+
197
+ # Calculate bootstrap standard errors
198
+ lower_bound_se = np.std(lower_bounds, ddof=1)
199
+ upper_bound_se = np.std(upper_bounds, ddof=1)
200
+ else:
201
+ # If no successful bootstrap samples, report NaN for the CIs and standard errors
202
+ lower_ci = upper_ci = np.array([np.nan, np.nan])
203
+ lower_bound_se = upper_bound_se = np.nan
245
204
 
246
205
  return {
247
206
  'lower_bound_ci': lower_ci,
248
207
  'upper_bound_ci': upper_ci,
208
+ 'lower_bound_se': lower_bound_se,
209
+ 'upper_bound_se': upper_bound_se,
249
210
  'lower_bounds': lower_bounds,
250
211
  'upper_bounds': upper_bounds,
251
- 'ci_level': ci_level
252
- }
253
-
212
+ 'ci_level': self.ci_level
213
+ }
254
214
 
255
- class LeeBoundsResults:
256
- """
257
- Results from Lee bounds estimation.
258
- """
259
-
260
- def __init__(self, lower_bound: float, upper_bound: float, p1: float,
261
- p0: float, trim_proportion: float, n_treated: int, n_control: int,
262
- n_treated_selected: int, n_control_selected: int):
215
+ def fit(self, data: pd.DataFrame, outcome: str, treatment: str,
216
+ selection: str) -> 'LeeBounds':
217
+ """
218
+ Fit Lee bounds estimator to the data.
219
+
220
+ Parameters
221
+ ----------
222
+ data : pd.DataFrame; Input data containing outcome, treatment, and selection variables
223
+ outcome : str; Name of the outcome variable column
224
+ treatment : str; Name of the treatment indicator column (0=control, 1=treated)
225
+ selection : str; Name of the selection indicator column (0=missing, 1=observed)
226
+
227
+ Returns
228
+ -------
229
+ LeeBounds
230
+ Self with fitted results
231
+ """
232
+ # Validate inputs
233
+ required_cols = [outcome, treatment, selection]
234
+ missing_cols = [col for col in required_cols if col not in data.columns]
235
+ if missing_cols:
236
+ raise ValueError(f"Missing columns: {missing_cols}")
237
+
238
+ # Extract data
239
+ Y = data[outcome].values
240
+ D = data[treatment].values
241
+ S = data[selection].values
242
+
243
+ # Remove missing values
244
+ valid_mask = ~(np.isnan(Y) | np.isnan(D) | np.isnan(S))
245
+ Y, D, S = Y[valid_mask], D[valid_mask], S[valid_mask]
246
+
247
+ # Validate data types and values
248
+ self._validate_data(Y, D, S)
249
+
250
+ # Calculate selection rates
251
+ p1 = np.mean(S[D == 1]) # Selection rate in treated group
252
+ p0 = np.mean(S[D == 0]) # Selection rate in control group
253
+
254
+ if p1 <= p0:
255
+ raise ValueError("Selection rate in treated group must be greater than control group")
256
+
257
+ # Calculate bounds
258
+ lower_bound, upper_bound = self._compute_bounds(Y, D, S, p1, p0)
259
+
260
+ # Store results directly in self
263
261
  self.lower_bound = lower_bound
264
262
  self.upper_bound = upper_bound
265
263
  self.p1 = p1
266
264
  self.p0 = p0
267
- self.trim_proportion = trim_proportion
268
- self.n_treated = n_treated
269
- self.n_control = n_control
270
- self.n_treated_selected = n_treated_selected
271
- self.n_control_selected = n_control_selected
265
+ self.trim_proportion = p1 - p0
266
+ self.n_treated = np.sum(D == 1)
267
+ self.n_control = np.sum(D == 0)
268
+ self.n_treated_selected = np.sum((D == 1) & (S == 1))
269
+ self.n_control_selected = np.sum((D == 0) & (S == 1))
270
+
271
+ # Compute bootstrap confidence intervals and standard errors
272
+ bootstrap_results = self._bootstrap(data, outcome, treatment, selection)
273
+ self.lower_bound_ci = bootstrap_results['lower_bound_ci']
274
+ self.upper_bound_ci = bootstrap_results['upper_bound_ci']
275
+ self.lower_bound_se = bootstrap_results['lower_bound_se']
276
+ self.upper_bound_se = bootstrap_results['upper_bound_se']
277
+ self.lower_bounds_bootstrap = bootstrap_results['lower_bounds']
278
+ self.upper_bounds_bootstrap = bootstrap_results['upper_bounds']
272
279
 
280
+ return self
281
+
273
282
  def summary(self) -> str:
274
283
  """
275
284
  Return a summary of the results.
@@ -279,13 +288,42 @@ class LeeBoundsResults:
279
288
  str
280
289
  Formatted summary string
281
290
  """
291
+ if not hasattr(self, 'lower_bound'):
292
+ return "No results available. Please fit the model first."
293
+
294
+ # Format confidence intervals and standard errors
295
+ if not np.isnan(self.lower_bound_ci[0]):
296
+ lower_ci_str = f"[{self.lower_bound_ci[0]:.4f}, {self.lower_bound_ci[1]:.4f}]"
297
+ upper_ci_str = f"[{self.upper_bound_ci[0]:.4f}, {self.upper_bound_ci[1]:.4f}]"
298
+ lower_se_str = f"{self.lower_bound_se:.4f}"
299
+ upper_se_str = f"{self.upper_bound_se:.4f}"
300
+ else:
301
+ lower_ci_str = "Not computed"
302
+ upper_ci_str = "Not computed"
303
+ lower_se_str = "Not computed"
304
+ upper_se_str = "Not computed"
305
+
282
306
  summary = f"""
283
- Lee (2009) Treatment Effect Bounds
284
- =================================
307
+ Lee (2009) Treatment Effect Bounds
308
+ =================================
285
309
 
286
- Treatment Effect Bounds:
287
- - Lower bound: {self.lower_bound:.4f}
288
- - Upper bound: {self.upper_bound:.4f}
289
- - Bound width: {self.upper_bound - self.lower_bound:.4f}
290
- """
310
+ Treatment Effect Bounds:
311
+ - Lower bound: {self.lower_bound:.4f}
312
+ - Upper bound: {self.upper_bound:.4f}
313
+ - Bound width: {self.upper_bound - self.lower_bound:.4f}
314
+
315
+ Bootstrap Confidence Intervals ({int(self.ci_level*100)}%):
316
+ - Lower bound CI: {lower_ci_str}
317
+ - Upper bound CI: {upper_ci_str}
318
+
319
+ Bootstrap Standard Errors:
320
+ - Lower bound SE: {lower_se_str}
321
+ - Upper bound SE: {upper_se_str}
322
+
323
+ Sample Information:
324
+ - Treated observations: {self.n_treated} (selected: {self.n_treated_selected})
325
+ - Control observations: {self.n_control} (selected: {self.n_control_selected})
326
+ - Selection rates: p₁ = {self.p1:.3f}, p₀ = {self.p0:.3f}
327
+ - Trim proportion: {self.trim_proportion:.3f}
328
+ """
291
329
  return summary
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pyleebounds
3
- Version: 0.1.0
3
+ Version: 0.2.0
4
4
  Summary: Python package for Lee 2009 treatment effect bounds under sample selection
5
5
  Home-page: https://github.com/vyasenov/pyleebounds
6
6
  Author: Vasco Yasenov
@@ -39,9 +39,10 @@ Dynamic: requires-dist
39
39
  Dynamic: requires-python
40
40
  Dynamic: summary
41
41
 
42
-
43
42
  # pyleebounds
44
43
 
44
+ ![](https://img.shields.io/badge/license-MIT-green)
45
+
45
46
  A Python package for estimating treatment effect bounds under sample selection, based on the method of Lee (2009). This approach is especially useful when selection into the observed sample (e.g., post-treatment employment) differs by treatment status and may introduce bias in outcome comparisons.
46
47
 
47
48
  ## Installation
@@ -57,7 +58,7 @@ pip install pyleebounds
57
58
  * Sharp nonparametric bounds on treatment effects under endogenous sample selection
58
59
  * Automatically handles non-random attrition or truncation (e.g. only observing outcomes for employed individuals)
59
60
  * Bootstrap confidence intervals
60
- * Seamless integration with Pandas
61
+ * Seamless integration with `pandas`
61
62
 
62
63
  ## Quick Start
63
64
 
@@ -93,19 +94,18 @@ df = pd.DataFrame({
93
94
  })
94
95
 
95
96
  # Initialize and fit Lee bounds estimator
96
- lb = LeeBounds()
97
+ # Use fewer bootstrap replications for faster execution in this example
98
+ lb = LeeBounds(n_bootstrap=20, ci_level=0.95)
97
99
  results = lb.fit(df, outcome='Y', treatment='D', selection='S')
98
100
 
99
- # View summary
100
- print(results.summary())
101
-
102
- # Plot estimated bounds
103
- results.plot()
101
+ # View comprehensive summary
102
+ print(lb.summary())
104
103
  ```
105
104
 
105
+
106
106
  ## Examples
107
107
 
108
- You can find detailed usage examples in the `examples/` directory.
108
+ You can find detailed usage examples in the `examples/` directory.
109
109
 
110
110
  ## Background
111
111
 
@@ -0,0 +1,7 @@
1
+ pyleebounds/__init__.py,sha256=Njryiwe-TWjSjU14ZNAiFwBkPknIOyYejlK1AmrfTOg,386
2
+ pyleebounds/lee_bounds.py,sha256=vIvQWnui20wIE9VomIBLUzCOscqCyYuLfyySOuG5Wxc,12715
3
+ pyleebounds-0.2.0.dist-info/licenses/LICENSE,sha256=TSASXY1hDmb_hVj855U90g-GnBAwJBmQ1u2ptLTDkPI,1078
4
+ pyleebounds-0.2.0.dist-info/METADATA,sha256=tnCbsmjoNSGP_Y6Zpo9fbFMkZsg5QhTMoxvIl8BGtY0,6771
5
+ pyleebounds-0.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
6
+ pyleebounds-0.2.0.dist-info/top_level.txt,sha256=OAMZaMQmv_5Cmzyg19s1rvMNC5ll-60rskfOkmiS2nc,12
7
+ pyleebounds-0.2.0.dist-info/RECORD,,
@@ -1,7 +0,0 @@
1
- pyleebounds/__init__.py,sha256=Njryiwe-TWjSjU14ZNAiFwBkPknIOyYejlK1AmrfTOg,386
2
- pyleebounds/lee_bounds.py,sha256=HM10djxKW_8CyMDzK3affTETxL2uMrm8ENlxwCExpAA,10085
3
- pyleebounds-0.1.0.dist-info/licenses/LICENSE,sha256=TSASXY1hDmb_hVj855U90g-GnBAwJBmQ1u2ptLTDkPI,1078
4
- pyleebounds-0.1.0.dist-info/METADATA,sha256=Q-xSztClpeTFqIgXpY0WST0oplTPSDfF_rJ-QGELLrM,6647
5
- pyleebounds-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
6
- pyleebounds-0.1.0.dist-info/top_level.txt,sha256=OAMZaMQmv_5Cmzyg19s1rvMNC5ll-60rskfOkmiS2nc,12
7
- pyleebounds-0.1.0.dist-info/RECORD,,