PyPI - pyleebounds - Versions diffs - 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl - Mend

pyleebounds 0.1.0-py3-none-any.whl → 0.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

pyleebounds/lee_bounds.py CHANGED Viewed

@@ -5,8 +5,6 @@ Implementation of Lee (2009) treatment effect bounds under sample selection.
 import numpy as np
 import pandas as pd
 from typing import Tuple, Dict, Any
-import matplotlib.pyplot as plt
 class LeeBounds:
     """
@@ -25,93 +23,32 @@ class LeeBounds:
     bounds on treatment effects. The Review of Economic Studies, 76(3), 1071-1102.
     """
-    def __init__(self, trim_method: str = 'quantile'):
-        self.trim_method = trim_method
-        self.fitted = False
-        self.results = None
-    def fit(self, data: pd.DataFrame, outcome: str, treatment: str,
-            selection: str) -> 'LeeBoundsResults':
+    def __init__(self, n_bootstrap: int = 100, ci_level: float = 0.95):
         """
-        Fit Lee bounds estimator to the data.
+        Initialize Lee bounds estimator.
         Parameters
         ----------
-        data : pd.DataFrame
-            Input data containing outcome, treatment, and selection variables
-        outcome : str
-            Name of the outcome variable column
-        treatment : str
-            Name of the treatment indicator column (0=control, 1=treated)
-        selection : str
-            Name of the selection indicator column (0=missing, 1=observed)
-        Returns
-        -------
-        LeeBoundsResults
-            Results object containing bounds and summary statistics
+        n_bootstrap : int, default=100
+            Number of bootstrap samples for confidence intervals
+        ci_level : float, default=0.95
+            Confidence level for bootstrap confidence intervals
         """
-        # Validate inputs
-        required_cols = [outcome, treatment, selection]
-        missing_cols = [col for col in required_cols if col not in data.columns]
-        if missing_cols:
-            raise ValueError(f"Missing columns: {missing_cols}")
-        # Extract data
-        Y = data[outcome].values
-        D = data[treatment].values
-        S = data[selection].values
-        # Remove missing values
-        valid_mask = ~(np.isnan(Y) | np.isnan(D) | np.isnan(S))
-        Y, D, S = Y[valid_mask], D[valid_mask], S[valid_mask]
-        # Validate data types and values
-        self._validate_variables(Y, D, S)
-        # Calculate selection rates
-        p1 = np.mean(S[D == 1])  # Selection rate in treated group
-        p0 = np.mean(S[D == 0])  # Selection rate in control group
-        if p1 <= p0:
-            raise ValueError("Selection rate in treated group must be greater than control group")
-        # Calculate bounds
-        lower_bound, upper_bound = self._compute_bounds(Y, D, S, p1, p0)
-        # Store results
-        self.results = LeeBoundsResults(
-            lower_bound=lower_bound,
-            upper_bound=upper_bound,
-            p1=p1,
-            p0=p0,
-            trim_proportion=p1 - p0,
-            n_treated=np.sum(D == 1),
-            n_control=np.sum(D == 0),
-            n_treated_selected=np.sum((D == 1) & (S == 1)),
-            n_control_selected=np.sum((D == 0) & (S == 1))
-        )
-        self.fitted = True
-        return self.results
+        self.n_bootstrap = n_bootstrap
+        self.ci_level = ci_level
     def _compute_bounds(self, Y: np.ndarray, D: np.ndarray, S: np.ndarray,
                        p1: float, p0: float) -> Tuple[float, float]:
         """
-        Compute Lee bounds using trimming approach.
+        Compute Lee treatment effect bounds using trimming approach.
         Parameters
         ----------
-        Y : np.ndarray
-            Outcome values
-        D : np.ndarray
-            Treatment indicators
-        S : np.ndarray
-            Selection indicators
-        p1 : float
-            Selection rate in treated group
-        p0 : float
-            Selection rate in control group
+        Y : np.ndarray; Outcome values
+        D : np.ndarray; Treatment indicators
+        S : np.ndarray; Selection indicators
+        p1 : float;  Selection rate in treated group
+        p0 : float;  Selection rate in control group
         Returns
         -------
@@ -124,17 +61,20 @@ class LeeBounds:
         # Treated group (among selected)
         treated_selected = Y[(D == 1) & (S == 1)]
-        # Trim proportion
-        trim_prop = p1 - p0
         # For Lee bounds, we trim the treated group to match control selection rate
         # Lower bound: trim from top (keep lowest outcomes)
         # Upper bound: trim from bottom (keep highest outcomes)
-        n_trim = int(len(treated_selected) * trim_prop)
-        if n_trim > 0:
+        if p1 > p0:
+            # Trim proportion
+            trim_prop = (p1 - p0) / p1
+            n_trim = int(len(treated_selected) * trim_prop)
+            if n_trim >= len(treated_selected):
+                # Handle case where we'd trim everything
+                raise ValueError("Trim proportion too large - would remove all observations")
             sorted_treated = np.sort(treated_selected)
-            # Lower bound: keep bottom (1 - trim_prop) of observations
+            # Lower bound: keep bottom (1 - trim_prop) of observations (trimmed from top)
             lower_trimmed_mean = np.mean(sorted_treated[:-n_trim])
             # Upper bound: keep top (1 - trim_prop) of observations
             upper_trimmed_mean = np.mean(sorted_treated[n_trim:])
@@ -146,7 +86,7 @@ class LeeBounds:
         return lower_bound, upper_bound
-    def _validate_variables(self, Y: np.ndarray, D: np.ndarray, S: np.ndarray) -> None:
+    def _validate_data(self, Y: np.ndarray, D: np.ndarray, S: np.ndarray) -> None:
         """
         Validate that variables have correct types and values.
@@ -194,26 +134,17 @@ class LeeBounds:
         if np.sum((D == 1) & (S == 1)) == 0:
             raise ValueError("No selected treated observations (D=1, S=1) found.")
-    def bootstrap(self, data: pd.DataFrame, outcome: str, treatment: str,
-                  selection: str, n_bootstrap: int = 500,
-                  ci_level: float = 0.95) -> Dict[str, Any]:
+    def _bootstrap(self, data: pd.DataFrame, outcome: str, treatment: str,
+                  selection: str) -> Dict[str, Any]:
         """
         Compute bootstrap confidence intervals for the bounds.
         Parameters
         ----------
-        data : pd.DataFrame
-            Input data
-        outcome : str
-            Outcome variable name
-        treatment : str
-            Treatment variable name
-        selection : str
-            Selection variable name
-        n_bootstrap : int
-            Number of bootstrap samples
-        ci_level : float
-            Confidence level (e.g., 0.95 for 95% CI)
+        data : pd.DataFrame; Input data
+        outcome : str; Outcome variable name
+        treatment : str; Treatment variable name
+        selection : str; Selection variable name
         Returns
         -------
@@ -223,53 +154,131 @@ class LeeBounds:
         lower_bounds = []
         upper_bounds = []
-        for _ in range(n_bootstrap):
+        for _ in range(self.n_bootstrap):
             # Bootstrap sample
             boot_idx = np.random.choice(len(data), size=len(data), replace=True)
             boot_data = data.iloc[boot_idx].reset_index(drop=True)
             try:
-                # Fit Lee bounds on bootstrap sample
-                lb = LeeBounds()
-                results = lb.fit(boot_data, outcome, treatment, selection)
-                lower_bounds.append(results.lower_bound)
-                upper_bounds.append(results.upper_bound)
-            except:
+                # Extract data from bootstrap sample
+                Y = boot_data[outcome].values
+                D = boot_data[treatment].values
+                S = boot_data[selection].values
+                # Remove missing values
+                valid_mask = ~(np.isnan(Y) | np.isnan(D) | np.isnan(S))
+                Y, D, S = Y[valid_mask], D[valid_mask], S[valid_mask]
+                # Validate data types and values
+                self._validate_data(Y, D, S)
+                # Calculate selection rates
+                p1 = np.mean(S[D == 1])
+                p0 = np.mean(S[D == 0])
+                if p1 <= p0:
+                    continue  # Skip this bootstrap sample
+                # Calculate bounds directly
+                lower_bound, upper_bound = self._compute_bounds(Y, D, S, p1, p0)
+                lower_bounds.append(lower_bound)
+                upper_bounds.append(upper_bound)
+            except Exception:
                 # Skip if bootstrap sample fails
                 continue
-        # Calculate confidence intervals
-        alpha = 1 - ci_level
-        lower_ci = np.percentile(lower_bounds, [alpha/2*100, (1-alpha/2)*100])
-        upper_ci = np.percentile(upper_bounds, [alpha/2*100, (1-alpha/2)*100])
+        # Calculate confidence intervals and standard errors
+        if len(lower_bounds) > 0:
+            alpha = 1 - self.ci_level
+            lower_ci = np.percentile(lower_bounds, [alpha/2*100, (1-alpha/2)*100])
+            upper_ci = np.percentile(upper_bounds, [alpha/2*100, (1-alpha/2)*100])
+            # Calculate bootstrap standard errors
+            lower_bound_se = np.std(lower_bounds, ddof=1)
+            upper_bound_se = np.std(upper_bounds, ddof=1)
+        else:
+            # If no successful bootstrap samples, use point estimates
+            lower_ci = upper_ci = np.array([np.nan, np.nan])
+            lower_bound_se = upper_bound_se = np.nan
         return {
             'lower_bound_ci': lower_ci,
             'upper_bound_ci': upper_ci,
+            'lower_bound_se': lower_bound_se,
+            'upper_bound_se': upper_bound_se,
             'lower_bounds': lower_bounds,
             'upper_bounds': upper_bounds,
-            'ci_level': ci_level
-        }
+            'ci_level': self.ci_level
+        }
-class LeeBoundsResults:
-    """
-    Results from Lee bounds estimation.
-    """
-    def __init__(self, lower_bound: float, upper_bound: float, p1: float,
-                 p0: float, trim_proportion: float, n_treated: int, n_control: int,
-                 n_treated_selected: int, n_control_selected: int):
+    def fit(self, data: pd.DataFrame, outcome: str, treatment: str,
+            selection: str) -> 'LeeBounds':
+        """
+        Fit Lee bounds estimator to the data.
+        Parameters
+        ----------
+        data : pd.DataFrame; Input data containing outcome, treatment, and selection variables
+        outcome : str; Name of the outcome variable column
+        treatment : str; Name of the treatment indicator column (0=control, 1=treated)
+        selection : str; Name of the selection indicator column (0=missing, 1=observed)
+        Returns
+        -------
+        LeeBounds
+            Self with fitted results
+        """
+        # Validate inputs
+        required_cols = [outcome, treatment, selection]
+        missing_cols = [col for col in required_cols if col not in data.columns]
+        if missing_cols:
+            raise ValueError(f"Missing columns: {missing_cols}")
+        # Extract data
+        Y = data[outcome].values
+        D = data[treatment].values
+        S = data[selection].values
+        # Remove missing values
+        valid_mask = ~(np.isnan(Y) | np.isnan(D) | np.isnan(S))
+        Y, D, S = Y[valid_mask], D[valid_mask], S[valid_mask]
+        # Validate data types and values
+        self._validate_data(Y, D, S)
+        # Calculate selection rates
+        p1 = np.mean(S[D == 1])  # Selection rate in treated group
+        p0 = np.mean(S[D == 0])  # Selection rate in control group
+        if p1 <= p0:
+            raise ValueError("Selection rate in treated group must be greater than control group")
+        # Calculate bounds
+        lower_bound, upper_bound = self._compute_bounds(Y, D, S, p1, p0)
+        # Store results directly in self
         self.lower_bound = lower_bound
         self.upper_bound = upper_bound
         self.p1 = p1
         self.p0 = p0
-        self.trim_proportion = trim_proportion
-        self.n_treated = n_treated
-        self.n_control = n_control
-        self.n_treated_selected = n_treated_selected
-        self.n_control_selected = n_control_selected
+        self.trim_proportion = p1 - p0
+        self.n_treated = np.sum(D == 1)
+        self.n_control = np.sum(D == 0)
+        self.n_treated_selected = np.sum((D == 1) & (S == 1))
+        self.n_control_selected = np.sum((D == 0) & (S == 1))
+        # Compute bootstrap confidence intervals and standard errors
+        bootstrap_results = self._bootstrap(data, outcome, treatment, selection)
+        self.lower_bound_ci = bootstrap_results['lower_bound_ci']
+        self.upper_bound_ci = bootstrap_results['upper_bound_ci']
+        self.lower_bound_se = bootstrap_results['lower_bound_se']
+        self.upper_bound_se = bootstrap_results['upper_bound_se']
+        self.lower_bounds_bootstrap = bootstrap_results['lower_bounds']
+        self.upper_bounds_bootstrap = bootstrap_results['upper_bounds']
+        return self
     def summary(self) -> str:
         """
         Return a summary of the results.
@@ -279,13 +288,42 @@ class LeeBoundsResults:
         str
             Formatted summary string
         """
+        if not hasattr(self, 'lower_bound'):
+            return "No results available. Please fit the model first."
+        # Format confidence intervals and standard errors
+        if not np.isnan(self.lower_bound_ci[0]):
+            lower_ci_str = f"[{self.lower_bound_ci[0]:.4f}, {self.lower_bound_ci[1]:.4f}]"
+            upper_ci_str = f"[{self.upper_bound_ci[0]:.4f}, {self.upper_bound_ci[1]:.4f}]"
+            lower_se_str = f"{self.lower_bound_se:.4f}"
+            upper_se_str = f"{self.upper_bound_se:.4f}"
+        else:
+            lower_ci_str = "Not computed"
+            upper_ci_str = "Not computed"
+            lower_se_str = "Not computed"
+            upper_se_str = "Not computed"
         summary = f"""
-Lee (2009) Treatment Effect Bounds
-=================================
+        Lee (2009) Treatment Effect Bounds
+        =================================
-Treatment Effect Bounds:
-- Lower bound: {self.lower_bound:.4f}
-- Upper bound: {self.upper_bound:.4f}
-- Bound width: {self.upper_bound - self.lower_bound:.4f}
-"""
+        Treatment Effect Bounds:
+        - Lower bound: {self.lower_bound:.4f}
+        - Upper bound: {self.upper_bound:.4f}
+        - Bound width: {self.upper_bound - self.lower_bound:.4f}
+        Bootstrap Confidence Intervals ({int(self.ci_level*100)}%):
+        - Lower bound CI: {lower_ci_str}
+        - Upper bound CI: {upper_ci_str}
+        Bootstrap Standard Errors:
+        - Lower bound SE: {lower_se_str}
+        - Upper bound SE: {upper_se_str}
+        Sample Information:
+        - Treated observations: {self.n_treated} (selected: {self.n_treated_selected})
+        - Control observations: {self.n_control} (selected: {self.n_control_selected})
+        - Selection rates: p₁ = {self.p1:.3f}, p₀ = {self.p0:.3f}
+        - Trim proportion: {self.trim_proportion:.3f}
+        """
         return summary

{pyleebounds-0.1.0.dist-info → pyleebounds-0.2.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pyleebounds
-Version: 0.1.0
+Version: 0.2.0
 Summary: Python package for Lee 2009 treatment effect bounds under sample selection
 Home-page: https://github.com/vyasenov/pyleebounds
 Author: Vasco Yasenov
@@ -39,9 +39,10 @@ Dynamic: requires-dist
 Dynamic: requires-python
 Dynamic: summary
 # pyleebounds
+![](https://img.shields.io/badge/license-MIT-green)
 A Python package for estimating treatment effect bounds under sample selection, based on the method of Lee (2009). This approach is especially useful when selection into the observed sample (e.g., post-treatment employment) differs by treatment status and may introduce bias in outcome comparisons.
 ## Installation
@@ -57,7 +58,7 @@ pip install pyleebounds
 * Sharp nonparametric bounds on treatment effects under endogenous sample selection
 * Automatically handles non-random attrition or truncation (e.g. only observing outcomes for employed individuals)
 * Bootstrap confidence intervals
-* Seamless integration with Pandas
+* Seamless integration with `pandas`
 ## Quick Start
@@ -93,19 +94,18 @@ df = pd.DataFrame({
 })
 # Initialize and fit Lee bounds estimator
-lb = LeeBounds()
+# Use fewer bootstrap replications for faster execution in this example
+lb = LeeBounds(n_bootstrap=20, ci_level=0.95)
 results = lb.fit(df, outcome='Y', treatment='D', selection='S')
-# View summary
-print(results.summary())
-# Plot estimated bounds
-results.plot()
+# View comprehensive summary
+print(lb.summary())
 ```
 ## Examples
-You can find detailed usage examples in the  `examples/` directory.
+You can find detailed usage examples in the `examples/` directory.
 ## Background

pyleebounds-0.2.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,7 @@
+pyleebounds/__init__.py,sha256=Njryiwe-TWjSjU14ZNAiFwBkPknIOyYejlK1AmrfTOg,386
+pyleebounds/lee_bounds.py,sha256=vIvQWnui20wIE9VomIBLUzCOscqCyYuLfyySOuG5Wxc,12715
+pyleebounds-0.2.0.dist-info/licenses/LICENSE,sha256=TSASXY1hDmb_hVj855U90g-GnBAwJBmQ1u2ptLTDkPI,1078
+pyleebounds-0.2.0.dist-info/METADATA,sha256=tnCbsmjoNSGP_Y6Zpo9fbFMkZsg5QhTMoxvIl8BGtY0,6771
+pyleebounds-0.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+pyleebounds-0.2.0.dist-info/top_level.txt,sha256=OAMZaMQmv_5Cmzyg19s1rvMNC5ll-60rskfOkmiS2nc,12
+pyleebounds-0.2.0.dist-info/RECORD,,

pyleebounds-0.1.0.dist-info/RECORD DELETED Viewed

@@ -1,7 +0,0 @@
-pyleebounds/__init__.py,sha256=Njryiwe-TWjSjU14ZNAiFwBkPknIOyYejlK1AmrfTOg,386
-pyleebounds/lee_bounds.py,sha256=HM10djxKW_8CyMDzK3affTETxL2uMrm8ENlxwCExpAA,10085
-pyleebounds-0.1.0.dist-info/licenses/LICENSE,sha256=TSASXY1hDmb_hVj855U90g-GnBAwJBmQ1u2ptLTDkPI,1078
-pyleebounds-0.1.0.dist-info/METADATA,sha256=Q-xSztClpeTFqIgXpY0WST0oplTPSDfF_rJ-QGELLrM,6647
-pyleebounds-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-pyleebounds-0.1.0.dist-info/top_level.txt,sha256=OAMZaMQmv_5Cmzyg19s1rvMNC5ll-60rskfOkmiS2nc,12
-pyleebounds-0.1.0.dist-info/RECORD,,

{pyleebounds-0.1.0.dist-info → pyleebounds-0.2.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{pyleebounds-0.1.0.dist-info → pyleebounds-0.2.0.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{pyleebounds-0.1.0.dist-info → pyleebounds-0.2.0.dist-info}/top_level.txt RENAMED Viewed

File without changes

pyleebounds 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

pyleebounds 0.1.0-py3-none-any.whl → 0.2.0-py3-none-any.whl