panelbox-0.2.0-py3-none-any.whl → panelbox-0.4.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- panelbox/__init__.py +41 -0
- panelbox/__version__.py +13 -1
- panelbox/core/formula_parser.py +9 -2
- panelbox/core/panel_data.py +1 -1
- panelbox/datasets/__init__.py +39 -0
- panelbox/datasets/load.py +334 -0
- panelbox/gmm/difference_gmm.py +63 -15
- panelbox/gmm/estimator.py +46 -5
- panelbox/gmm/system_gmm.py +136 -21
- panelbox/models/static/__init__.py +4 -0
- panelbox/models/static/between.py +434 -0
- panelbox/models/static/first_difference.py +494 -0
- panelbox/models/static/fixed_effects.py +80 -11
- panelbox/models/static/pooled_ols.py +80 -11
- panelbox/models/static/random_effects.py +52 -10
- panelbox/standard_errors/__init__.py +119 -0
- panelbox/standard_errors/clustered.py +386 -0
- panelbox/standard_errors/comparison.py +528 -0
- panelbox/standard_errors/driscoll_kraay.py +386 -0
- panelbox/standard_errors/newey_west.py +324 -0
- panelbox/standard_errors/pcse.py +358 -0
- panelbox/standard_errors/robust.py +324 -0
- panelbox/standard_errors/utils.py +390 -0
- panelbox/validation/__init__.py +6 -0
- panelbox/validation/robustness/__init__.py +51 -0
- panelbox/validation/robustness/bootstrap.py +933 -0
- panelbox/validation/robustness/checks.py +143 -0
- panelbox/validation/robustness/cross_validation.py +538 -0
- panelbox/validation/robustness/influence.py +364 -0
- panelbox/validation/robustness/jackknife.py +457 -0
- panelbox/validation/robustness/outliers.py +529 -0
- panelbox/validation/robustness/sensitivity.py +809 -0
- {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/METADATA +32 -3
- {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/RECORD +38 -21
- {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/WHEEL +1 -1
- {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/entry_points.txt +0 -0
- {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/licenses/LICENSE +0 -0
- {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/top_level.txt +0 -0

panelbox/standard_errors/driscoll_kraay.py (new file)

@@ -0,0 +1,386 @@
"""
Driscoll-Kraay standard errors for panel data.

Driscoll-Kraay (1998) standard errors are robust to general forms of
spatial and temporal dependence when the number of time periods is large.
They are particularly useful for macro panel data with potential cross-
sectional correlation.
"""

from typing import Optional, Literal
import numpy as np
import pandas as pd
from dataclasses import dataclass

from .utils import compute_bread, sandwich_covariance


KernelType = Literal['bartlett', 'parzen', 'quadratic_spectral']


@dataclass
class DriscollKraayResult:
    """
    Result of Driscoll-Kraay covariance estimation.

    Attributes
    ----------
    cov_matrix : np.ndarray
        Driscoll-Kraay covariance matrix (k x k)
    std_errors : np.ndarray
        Driscoll-Kraay standard errors (k,)
    max_lags : int
        Maximum number of lags used
    kernel : str
        Kernel function used
    n_obs : int
        Number of observations
    n_params : int
        Number of parameters
    n_periods : int
        Number of time periods
    bandwidth : Optional[float]
        Bandwidth parameter (for some kernels)
    """
    cov_matrix: np.ndarray
    std_errors: np.ndarray
    max_lags: int
    kernel: str
    n_obs: int
    n_params: int
    n_periods: int
    bandwidth: Optional[float] = None


class DriscollKraayStandardErrors:
    """
    Driscoll-Kraay (1998) standard errors for panel data.

    Robust to general forms of spatial and temporal dependence.
    Particularly useful for macro panels with cross-sectional correlation.

    Parameters
    ----------
    X : np.ndarray
        Design matrix (n x k)
    resid : np.ndarray
        Residuals (n,)
    time_ids : np.ndarray
        Time period identifiers (n,)
    max_lags : int, optional
        Maximum number of lags. If None, uses floor(4(T/100)^(2/9))
    kernel : {'bartlett', 'parzen', 'quadratic_spectral'}, default='bartlett'
        Kernel function for weighting lags

    Attributes
    ----------
    X : np.ndarray
        Design matrix
    resid : np.ndarray
        Residuals
    time_ids : np.ndarray
        Time identifiers
    n_obs : int
        Number of observations
    n_params : int
        Number of parameters
    n_periods : int
        Number of time periods

    Examples
    --------
    >>> # Panel data with T=20 periods
    >>> dk = DriscollKraayStandardErrors(X, resid, time_ids)
    >>> result = dk.compute()
    >>> print(result.std_errors)

    >>> # Custom lags
    >>> dk = DriscollKraayStandardErrors(X, resid, time_ids, max_lags=5)
    >>> result = dk.compute()

    References
    ----------
    Driscoll, J. C., & Kraay, A. C. (1998). Consistent covariance matrix
    estimation with spatially dependent panel data. Review of Economics
    and Statistics, 80(4), 549-560.

    Hoechle, D. (2007). Robust standard errors for panel regressions with
    cross-sectional dependence. The Stata Journal, 7(3), 281-312.
    """

    def __init__(
        self,
        X: np.ndarray,
        resid: np.ndarray,
        time_ids: np.ndarray,
        max_lags: Optional[int] = None,
        kernel: KernelType = 'bartlett'
    ):
        self.X = X
        self.resid = resid
        self.time_ids = np.asarray(time_ids)
        self.kernel = kernel

        self.n_obs, self.n_params = X.shape

        # Validate dimensions
        if len(self.time_ids) != self.n_obs:
            raise ValueError(
                f"time_ids dimension mismatch: expected {self.n_obs}, "
                f"got {len(self.time_ids)}"
            )

        # Count time periods
        unique_periods = np.unique(self.time_ids)
        self.n_periods = len(unique_periods)

        # Set max_lags
        if max_lags is None:
            # Newey-West rule: floor(4(T/100)^(2/9))
            self.max_lags = int(np.floor(4 * (self.n_periods / 100) ** (2/9)))
        else:
            self.max_lags = max_lags

        # Ensure max_lags is reasonable
        if self.max_lags >= self.n_periods:
            self.max_lags = self.n_periods - 1

        # Cache
        self._bread = None
        self._time_sorted = None

    @property
    def bread(self) -> np.ndarray:
        """Compute and cache bread matrix."""
        if self._bread is None:
            self._bread = compute_bread(self.X)
        return self._bread

    def _sort_by_time(self):
        """Sort data by time periods."""
        if self._time_sorted is None:
            # Get unique time periods in order
            unique_times = np.unique(self.time_ids)

            # Create mapping
            time_map = {t: i for i, t in enumerate(unique_times)}

            # Sort indices by time
            time_indices = np.array([time_map[t] for t in self.time_ids])
            sort_idx = np.argsort(time_indices)

            self._time_sorted = {
                'X': self.X[sort_idx],
                'resid': self.resid[sort_idx],
                'time_ids': self.time_ids[sort_idx],
                'sort_idx': sort_idx,
                'unique_times': unique_times
            }

        return self._time_sorted

    def _kernel_weight(self, lag: int) -> float:
        """
        Compute kernel weight for given lag.

        Parameters
        ----------
        lag : int
            Lag number (0, 1, 2, ...)

        Returns
        -------
        weight : float
            Kernel weight
        """
        if lag > self.max_lags:
            return 0.0

        if self.kernel == 'bartlett':
            # Bartlett (triangular) kernel
            # w(l) = 1 - l/(max_lags + 1)
            return 1.0 - lag / (self.max_lags + 1)

        elif self.kernel == 'parzen':
            # Parzen kernel
            z = lag / (self.max_lags + 1)
            if z <= 0.5:
                return 1 - 6 * z**2 + 6 * z**3
            else:
                return 2 * (1 - z)**3

        elif self.kernel == 'quadratic_spectral':
            # Quadratic Spectral kernel
            if lag == 0:
                return 1.0
            z = 6 * np.pi * lag / (self.max_lags + 1) / 5
            return 3 / z**2 * (np.sin(z) / z - np.cos(z))

        else:
            raise ValueError(f"Unknown kernel: {self.kernel}")

    def _compute_gamma(self, lag: int) -> np.ndarray:
        """
        Compute autocovariance matrix for given lag.

        Γ_l = Σ_t X_t' ε̂_t ε̂_{t-l}' X_{t-l}

        Parameters
        ----------
        lag : int
            Lag number (0, 1, 2, ...)

        Returns
        -------
        gamma : np.ndarray
            Autocovariance matrix (k x k)
        """
        sorted_data = self._sort_by_time()
        unique_times = sorted_data['unique_times']
        k = self.n_params

        gamma = np.zeros((k, k))

        # For each time period t
        for t_idx in range(lag, self.n_periods):
            t = unique_times[t_idx]
            t_lag = unique_times[t_idx - lag]

            # Get observations for time t
            mask_t = sorted_data['time_ids'] == t
            X_t = sorted_data['X'][mask_t]
            resid_t = sorted_data['resid'][mask_t]

            # Get observations for time t-lag
            mask_t_lag = sorted_data['time_ids'] == t_lag
            X_t_lag = sorted_data['X'][mask_t_lag]
            resid_t_lag = sorted_data['resid'][mask_t_lag]

            # Compute cross-product
            # For each pair of observations
            for i in range(len(X_t)):
                for j in range(len(X_t_lag)):
                    gamma += np.outer(
                        X_t[i] * resid_t[i],
                        X_t_lag[j] * resid_t_lag[j]
                    )

        return gamma

    def compute(self) -> DriscollKraayResult:
        """
        Compute Driscoll-Kraay covariance matrix.

        Returns
        -------
        result : DriscollKraayResult
            Driscoll-Kraay covariance and standard errors

        Notes
        -----
        The Driscoll-Kraay estimator is:

            V_DK = (X'X)^{-1} S_DK (X'X)^{-1}

        where:
            S_DK = Γ_0 + Σ_{l=1}^L w_l (Γ_l + Γ_l')

        and Γ_l is the lag-l autocovariance matrix:
            Γ_l = Σ_t X_t' ε̂_t ε̂_{t-l}' X_{t-l}

        The kernel weights w_l ensure positive semi-definiteness.
        """
        k = self.n_params

        # Start with lag-0 autocovariance
        S = self._compute_gamma(0)

        # Add weighted autocovariances for lags 1, ..., max_lags
        for lag in range(1, self.max_lags + 1):
            weight = self._kernel_weight(lag)
            if weight > 0:
                gamma_l = self._compute_gamma(lag)
                # Add both Γ_l and Γ_l' (symmetrize)
                S += weight * (gamma_l + gamma_l.T)

        # Sandwich: V = Bread @ S @ Bread
        cov_matrix = sandwich_covariance(self.bread, S)
        std_errors = np.sqrt(np.diag(cov_matrix))

        return DriscollKraayResult(
            cov_matrix=cov_matrix,
            std_errors=std_errors,
            max_lags=self.max_lags,
            kernel=self.kernel,
            n_obs=self.n_obs,
            n_params=self.n_params,
            n_periods=self.n_periods
        )

    def diagnostic_summary(self) -> str:
        """
        Generate diagnostic summary.

        Returns
        -------
        summary : str
            Diagnostic information
        """
        lines = []
        lines.append("Driscoll-Kraay Standard Errors Diagnostics")
        lines.append("=" * 50)
        lines.append(f"Number of observations: {self.n_obs}")
        lines.append(f"Number of time periods: {self.n_periods}")
        lines.append(f"Avg obs per period: {self.n_obs / self.n_periods:.1f}")
        lines.append(f"Maximum lags: {self.max_lags}")
        lines.append(f"Kernel function: {self.kernel}")
        lines.append("")

        # Recommendations
        if self.n_periods < 20:
            lines.append("⚠ WARNING: Few time periods (<20)")
            lines.append("  Driscoll-Kraay SEs may not perform well with T < 20")
            lines.append("  Consider alternative methods")
        if self.max_lags > self.n_periods / 4:
            lines.append("⚠ WARNING: Large max_lags relative to T")
            lines.append(f"  max_lags = {self.max_lags}, T = {self.n_periods}")

        return "\n".join(lines)


def driscoll_kraay(
    X: np.ndarray,
    resid: np.ndarray,
    time_ids: np.ndarray,
    max_lags: Optional[int] = None,
    kernel: KernelType = 'bartlett'
) -> DriscollKraayResult:
    """
    Convenience function for Driscoll-Kraay standard errors.

    Parameters
    ----------
    X : np.ndarray
        Design matrix (n x k)
    resid : np.ndarray
        Residuals (n,)
    time_ids : np.ndarray
        Time period identifiers (n,)
    max_lags : int, optional
        Maximum number of lags
    kernel : {'bartlett', 'parzen', 'quadratic_spectral'}, default='bartlett'
        Kernel function

    Returns
    -------
    result : DriscollKraayResult
        Driscoll-Kraay covariance and standard errors

    Examples
    --------
    >>> from panelbox.standard_errors import driscoll_kraay
    >>> result = driscoll_kraay(X, resid, time_ids, max_lags=3)
    >>> print(result.std_errors)
    """
    dk = DriscollKraayStandardErrors(X, resid, time_ids, max_lags, kernel)
    return dk.compute()
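A minimal usage sketch for the new module (not part of the diff): it builds a small synthetic balanced panel with common time shocks, fits pooled OLS by hand with NumPy's lstsq, and passes the residuals to the driscoll_kraay convenience function added in this release. Only the driscoll_kraay call and the result fields come from the code above; the import path follows the module docstring, and the data-generating process and OLS step are illustrative assumptions.

# Illustrative sketch, not package code: Driscoll-Kraay SEs on pooled OLS residuals.
import numpy as np
from panelbox.standard_errors import driscoll_kraay

rng = np.random.default_rng(0)
n_units, n_periods = 50, 25
n = n_units * n_periods

# Synthetic panel: a common shock per period induces cross-sectional correlation.
time_ids = np.tile(np.arange(n_periods), n_units)
x = rng.normal(size=n)
time_shock = rng.normal(size=n_periods)[time_ids]
y = 1.0 + 0.5 * x + time_shock + rng.normal(size=n)

X = np.column_stack([np.ones(n), x])           # design matrix with intercept
beta = np.linalg.lstsq(X, y, rcond=None)[0]    # pooled OLS coefficients
resid = y - X @ beta

# max_lags defaults to floor(4(T/100)^(2/9)); kernel defaults to 'bartlett'.
result = driscoll_kraay(X, resid, time_ids)
print(result.std_errors)                       # one SE per column of X
print(result.max_lags, result.kernel, result.n_periods)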
panelbox/standard_errors/newey_west.py (new file)

@@ -0,0 +1,324 @@
"""
Newey-West HAC (Heteroskedasticity and Autocorrelation Consistent) standard errors.

Newey-West (1987) standard errors are robust to both heteroskedasticity and
autocorrelation. Useful for time-series and panel data with serial correlation.
"""

from typing import Optional, Literal
import numpy as np
from dataclasses import dataclass

from .utils import compute_bread, sandwich_covariance


KernelType = Literal['bartlett', 'parzen', 'quadratic_spectral']


@dataclass
class NeweyWestResult:
    """
    Result of Newey-West HAC covariance estimation.

    Attributes
    ----------
    cov_matrix : np.ndarray
        Newey-West covariance matrix (k x k)
    std_errors : np.ndarray
        Newey-West standard errors (k,)
    max_lags : int
        Maximum number of lags used
    kernel : str
        Kernel function used
    n_obs : int
        Number of observations
    n_params : int
        Number of parameters
    prewhitening : bool
        Whether prewhitening was applied
    """
    cov_matrix: np.ndarray
    std_errors: np.ndarray
    max_lags: int
    kernel: str
    n_obs: int
    n_params: int
    prewhitening: bool = False


class NeweyWestStandardErrors:
    """
    Newey-West (1987) HAC standard errors.

    Robust to heteroskedasticity and autocorrelation. Particularly useful
    for time-series data and panel data with serial correlation.

    Parameters
    ----------
    X : np.ndarray
        Design matrix (n x k)
    resid : np.ndarray
        Residuals (n,)
    max_lags : int, optional
        Maximum number of lags. If None, uses floor(4(T/100)^(2/9))
    kernel : {'bartlett', 'parzen', 'quadratic_spectral'}, default='bartlett'
        Kernel function for weighting lags
    prewhitening : bool, default=False
        Apply AR(1) prewhitening to reduce finite-sample bias

    Attributes
    ----------
    X : np.ndarray
        Design matrix
    resid : np.ndarray
        Residuals
    n_obs : int
        Number of observations
    n_params : int
        Number of parameters

    Examples
    --------
    >>> # Time-series with autocorrelation
    >>> nw = NeweyWestStandardErrors(X, resid, max_lags=4)
    >>> result = nw.compute()
    >>> print(result.std_errors)

    >>> # Auto-select lags
    >>> nw = NeweyWestStandardErrors(X, resid)
    >>> result = nw.compute()

    References
    ----------
    Newey, W. K., & West, K. D. (1987). A simple, positive semi-definite,
    heteroskedasticity and autocorrelation consistent covariance matrix.
    Econometrica, 55(3), 703-708.

    Andrews, D. W. K. (1991). Heteroskedasticity and autocorrelation consistent
    covariance matrix estimation. Econometrica, 59(3), 817-858.
    """

    def __init__(
        self,
        X: np.ndarray,
        resid: np.ndarray,
        max_lags: Optional[int] = None,
        kernel: KernelType = 'bartlett',
        prewhitening: bool = False
    ):
        self.X = X
        self.resid = resid
        self.kernel = kernel
        self.prewhitening = prewhitening

        self.n_obs, self.n_params = X.shape

        # Set max_lags
        if max_lags is None:
            # Newey-West rule: floor(4(T/100)^(2/9))
            self.max_lags = int(np.floor(4 * (self.n_obs / 100) ** (2/9)))
        else:
            self.max_lags = max_lags

        # Ensure max_lags is reasonable
        if self.max_lags >= self.n_obs:
            self.max_lags = self.n_obs - 1

        # Cache
        self._bread = None

    @property
    def bread(self) -> np.ndarray:
        """Compute and cache bread matrix."""
        if self._bread is None:
            self._bread = compute_bread(self.X)
        return self._bread

    def _kernel_weight(self, lag: int) -> float:
        """
        Compute kernel weight for given lag.

        Parameters
        ----------
        lag : int
            Lag number (0, 1, 2, ...)

        Returns
        -------
        weight : float
            Kernel weight
        """
        if lag > self.max_lags:
            return 0.0

        if self.kernel == 'bartlett':
            # Bartlett (triangular) kernel
            # w(l) = 1 - l/(max_lags + 1)
            return 1.0 - lag / (self.max_lags + 1)

        elif self.kernel == 'parzen':
            # Parzen kernel
            z = lag / (self.max_lags + 1)
            if z <= 0.5:
                return 1 - 6 * z**2 + 6 * z**3
            else:
                return 2 * (1 - z)**3

        elif self.kernel == 'quadratic_spectral':
            # Quadratic Spectral kernel
            if lag == 0:
                return 1.0
            z = 6 * np.pi * lag / (self.max_lags + 1) / 5
            return 3 / z**2 * (np.sin(z) / z - np.cos(z))

        else:
            raise ValueError(f"Unknown kernel: {self.kernel}")

    def _compute_gamma(self, lag: int) -> np.ndarray:
        """
        Compute lag-l autocovariance matrix.

        Γ_l = (1/n) Σ_{t=l+1}^n (X_t ε_t)(X_{t-l} ε_{t-l})'

        Parameters
        ----------
        lag : int
            Lag number (0, 1, 2, ...)

        Returns
        -------
        gamma : np.ndarray
            Autocovariance matrix (k x k)
        """
        k = self.n_params
        n = self.n_obs

        if lag == 0:
            # Γ_0 = (1/n) Σ X_t' ε_t² X_t
            # This is the heteroskedasticity component
            X_resid = self.X * self.resid[:, np.newaxis]
            gamma = (X_resid.T @ X_resid) / n
        else:
            # Γ_l = (1/n) Σ (X_t ε_t)(X_{t-l} ε_{t-l})'
            X_resid_t = self.X[lag:] * self.resid[lag:, np.newaxis]
            X_resid_t_lag = self.X[:-lag] * self.resid[:-lag, np.newaxis]
            gamma = (X_resid_t.T @ X_resid_t_lag) / n

        return gamma

    def compute(self) -> NeweyWestResult:
        """
        Compute Newey-West HAC covariance matrix.

        Returns
        -------
        result : NeweyWestResult
            Newey-West covariance and standard errors

        Notes
        -----
        The Newey-West estimator is:

            V_NW = (X'X)^{-1} Ω_NW (X'X)^{-1}

        where:
            Ω_NW = Γ_0 + Σ_{l=1}^L w_l (Γ_l + Γ_l')

        and Γ_l is the lag-l autocovariance matrix.

        The kernel weights w_l ensure positive semi-definiteness.
        """
        # Start with lag-0 autocovariance (heteroskedasticity)
        S = self._compute_gamma(0)

        # Add weighted autocovariances for lags 1, ..., max_lags
        for lag in range(1, self.max_lags + 1):
            weight = self._kernel_weight(lag)
            if weight > 0:
                gamma_l = self._compute_gamma(lag)
                # Add both Γ_l and Γ_l' (symmetrize)
                S += weight * (gamma_l + gamma_l.T)

        # Scale by n (since gamma is already divided by n)
        S *= self.n_obs

        # Sandwich: V = Bread @ S @ Bread
        cov_matrix = sandwich_covariance(self.bread, S)
        std_errors = np.sqrt(np.diag(cov_matrix))

        return NeweyWestResult(
            cov_matrix=cov_matrix,
            std_errors=std_errors,
            max_lags=self.max_lags,
            kernel=self.kernel,
            n_obs=self.n_obs,
            n_params=self.n_params,
            prewhitening=self.prewhitening
        )

    def diagnostic_summary(self) -> str:
        """
        Generate diagnostic summary.

        Returns
        -------
        summary : str
            Diagnostic information
        """
        lines = []
        lines.append("Newey-West HAC Standard Errors Diagnostics")
        lines.append("=" * 50)
        lines.append(f"Number of observations: {self.n_obs}")
        lines.append(f"Number of parameters: {self.n_params}")
        lines.append(f"Maximum lags: {self.max_lags}")
        lines.append(f"Kernel function: {self.kernel}")
        lines.append(f"Prewhitening: {self.prewhitening}")
        lines.append("")

        # Recommendations
        if self.n_obs < 50:
            lines.append("⚠ WARNING: Small sample size (<50)")
            lines.append("  Newey-West SEs may not perform well with few observations")
        if self.max_lags > self.n_obs / 3:
            lines.append("⚠ WARNING: Large max_lags relative to sample size")
            lines.append(f"  max_lags = {self.max_lags}, n = {self.n_obs}")

        return "\n".join(lines)


def newey_west(
    X: np.ndarray,
    resid: np.ndarray,
    max_lags: Optional[int] = None,
    kernel: KernelType = 'bartlett',
    prewhitening: bool = False
) -> NeweyWestResult:
    """
    Convenience function for Newey-West HAC standard errors.

    Parameters
    ----------
    X : np.ndarray
        Design matrix (n x k)
    resid : np.ndarray
        Residuals (n,)
    max_lags : int, optional
        Maximum number of lags
    kernel : {'bartlett', 'parzen', 'quadratic_spectral'}, default='bartlett'
        Kernel function
    prewhitening : bool, default=False
        Apply AR(1) prewhitening

    Returns
    -------
    result : NeweyWestResult
        Newey-West covariance and standard errors

    Examples
    --------
    >>> from panelbox.standard_errors import newey_west
    >>> result = newey_west(X, resid, max_lags=4)
    >>> print(result.std_errors)
    """
    nw = NeweyWestStandardErrors(X, resid, max_lags, kernel, prewhitening)
    return nw.compute()
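A matching usage sketch for this module (again, not part of the diff): a time-series regression with AR(1) errors, OLS residuals computed by hand, and Newey-West HAC standard errors via the newey_west convenience function. The import path follows the module docstring; the data-generating process, the AR coefficient of 0.6, and the lstsq fit are illustrative assumptions.

# Illustrative sketch, not package code: Newey-West HAC SEs on OLS residuals.
import numpy as np
from panelbox.standard_errors import newey_west

rng = np.random.default_rng(1)
n = 200
x = rng.normal(size=n)

# Serially correlated errors: e_t = 0.6 * e_{t-1} + u_t
e = np.zeros(n)
u = rng.normal(size=n)
for t in range(1, n):
    e[t] = 0.6 * e[t - 1] + u[t]
y = 2.0 - 1.0 * x + e

X = np.column_stack([np.ones(n), x])
beta = np.linalg.lstsq(X, y, rcond=None)[0]
resid = y - X @ beta

# With n = 200 the default rule floor(4 * (200/100)**(2/9)) selects 4 lags.
result = newey_west(X, resid)
print(result.max_lags)        # 4
print(result.std_errors)      # HAC SEs for intercept and slope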