panelbox-0.2.0-py3-none-any.whl → panelbox-0.4.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. panelbox/__init__.py +41 -0
  2. panelbox/__version__.py +13 -1
  3. panelbox/core/formula_parser.py +9 -2
  4. panelbox/core/panel_data.py +1 -1
  5. panelbox/datasets/__init__.py +39 -0
  6. panelbox/datasets/load.py +334 -0
  7. panelbox/gmm/difference_gmm.py +63 -15
  8. panelbox/gmm/estimator.py +46 -5
  9. panelbox/gmm/system_gmm.py +136 -21
  10. panelbox/models/static/__init__.py +4 -0
  11. panelbox/models/static/between.py +434 -0
  12. panelbox/models/static/first_difference.py +494 -0
  13. panelbox/models/static/fixed_effects.py +80 -11
  14. panelbox/models/static/pooled_ols.py +80 -11
  15. panelbox/models/static/random_effects.py +52 -10
  16. panelbox/standard_errors/__init__.py +119 -0
  17. panelbox/standard_errors/clustered.py +386 -0
  18. panelbox/standard_errors/comparison.py +528 -0
  19. panelbox/standard_errors/driscoll_kraay.py +386 -0
  20. panelbox/standard_errors/newey_west.py +324 -0
  21. panelbox/standard_errors/pcse.py +358 -0
  22. panelbox/standard_errors/robust.py +324 -0
  23. panelbox/standard_errors/utils.py +390 -0
  24. panelbox/validation/__init__.py +6 -0
  25. panelbox/validation/robustness/__init__.py +51 -0
  26. panelbox/validation/robustness/bootstrap.py +933 -0
  27. panelbox/validation/robustness/checks.py +143 -0
  28. panelbox/validation/robustness/cross_validation.py +538 -0
  29. panelbox/validation/robustness/influence.py +364 -0
  30. panelbox/validation/robustness/jackknife.py +457 -0
  31. panelbox/validation/robustness/outliers.py +529 -0
  32. panelbox/validation/robustness/sensitivity.py +809 -0
  33. {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/METADATA +32 -3
  34. {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/RECORD +38 -21
  35. {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/WHEEL +1 -1
  36. {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/entry_points.txt +0 -0
  37. {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/licenses/LICENSE +0 -0
  38. {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/top_level.txt +0 -0
panelbox/gmm/system_gmm.py

@@ -248,27 +248,51 @@ class SystemGMM(DifferenceGMM):
         # Step 4: Stack equations
         y_stacked = np.vstack([y_diff, y_level])
         X_stacked = np.vstack([X_diff, X_level])
-        Z_stacked = self._stack_instruments(Z_diff, Z_level)
+        Z_stacked_raw = self._stack_instruments(Z_diff, Z_level)
+
+        # Clean instrument matrix before estimation
+        # Remove observations and columns with NaNs
+        valid_mask = self._get_valid_mask_system(y_stacked, X_stacked, Z_stacked_raw)
+        y_stacked_clean = y_stacked[valid_mask]
+        X_stacked_clean = X_stacked[valid_mask]
+        Z_stacked_clean = Z_stacked_raw[valid_mask]
+
+        # Remove instrument columns with remaining NaNs
+        valid_instrument_cols = ~np.isnan(Z_stacked_clean).any(axis=0)
+        if not valid_instrument_cols.any():
+            raise ValueError("No valid instrument columns in System GMM. Check data quality.")
+        Z_stacked_clean = Z_stacked_clean[:, valid_instrument_cols]
+
+        # For tests later, keep track of the full stacked residuals
+        residuals_full = np.full_like(y_stacked, np.nan)

         # Repeat ids and times for stacked system
         ids_stacked = np.concatenate([ids, ids])
         times_stacked = np.concatenate([times, times])

-        # Step 5: Estimate GMM on stacked system
+        # Step 5: Estimate GMM on stacked system (using cleaned data)
         if self.gmm_type == 'one_step':
-            beta, W, residuals = self.estimator.one_step(y_stacked, X_stacked, Z_stacked)
-            vcov = self._compute_one_step_vcov(X_stacked, Z_stacked, residuals, W)
+            beta, W, residuals_clean = self.estimator.one_step(
+                y_stacked_clean, X_stacked_clean, Z_stacked_clean
+            )
+            vcov = self._compute_one_step_vcov(X_stacked_clean, Z_stacked_clean, residuals_clean, W)
             converged = True
         elif self.gmm_type == 'two_step':
-            beta, vcov, W, residuals = self.estimator.two_step(
-                y_stacked, X_stacked, Z_stacked, robust=self.robust
+            beta, vcov, W, residuals_clean = self.estimator.two_step(
+                y_stacked_clean, X_stacked_clean, Z_stacked_clean, robust=self.robust
             )
             converged = True
         else:  # iterative
             beta, vcov, W, converged = self.estimator.iterative(
-                y_stacked, X_stacked, Z_stacked
+                y_stacked_clean, X_stacked_clean, Z_stacked_clean
             )
-            residuals = y_stacked - X_stacked @ beta
+            residuals_clean = y_stacked_clean - X_stacked_clean @ beta
+
+        # Fill residuals in full array
+        if residuals_full.ndim > 1:
+            residuals_full[valid_mask] = residuals_clean.reshape(-1, 1)
+        else:
+            residuals_full[valid_mask] = residuals_clean.flatten()

         # Ensure beta is 1D for pandas Series
         beta = beta.flatten()
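
The cleaning step introduced above follows a standard NumPy masking pattern: build a boolean row mask from y, X, and Z, subset all three matrices with it, then drop instrument columns that still contain NaNs. A minimal standalone sketch of that pattern on toy data (names like valid_rows are illustrative, not panelbox API):

import numpy as np

# Toy stacked system: 6 observations, 2 regressors, 3 instruments
y = np.array([1.0, 2.0, np.nan, 4.0, 5.0, 6.0])
X = np.array([[1, 2], [2, 3], [3, 4], [np.nan, 5], [5, 6], [6, 7]], dtype=float)
Z = np.array([[1, np.nan, 1], [1, 2, np.nan], [1, 3, 1],
              [1, 4, 1], [1, 5, np.nan], [1, 6, 1]], dtype=float)

# Row mask: keep observations where y and all regressors are observed
valid_rows = ~np.isnan(y) & ~np.isnan(X).any(axis=1)
y_c, X_c, Z_c = y[valid_rows], X[valid_rows], Z[valid_rows]

# Column mask: drop instrument columns that still contain NaNs
valid_cols = ~np.isnan(Z_c).any(axis=0)
Z_c = Z_c[:, valid_cols]

print(y_c.shape, X_c.shape, Z_c.shape)  # (4,) (4, 2) (4, 1)

Keeping the full-length residuals_full array (NaN outside the mask) lets downstream tests index residuals by their original stacked positions, which is why the diff scatters the cleaned residuals back via residuals_full[valid_mask].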
@@ -285,19 +309,19 @@ class SystemGMM(DifferenceGMM):
         # Step 8: Compute specification tests
         n_params = len(beta)

-        # Hansen J-test on full system
+        # Hansen J-test on full system (use cleaned data)
         hansen = self.tester.hansen_j_test(
-            residuals, Z_stacked, W, n_params
+            residuals_clean, Z_stacked_clean, W, n_params
         )

         # Sargan test
         sargan = self.tester.sargan_test(
-            residuals, Z_stacked, n_params
+            residuals_clean, Z_stacked_clean, n_params
         )

         # AR tests (on difference residuals only)
         n_diff = len(y_diff)
-        residuals_diff_only = residuals[:n_diff]
+        residuals_diff_only = residuals_full[:n_diff]
         ids_diff_only = ids_stacked[:n_diff]  # Use stacked ids, first half

         valid_mask_diff = ~np.isnan(residuals_diff_only.flatten())
@@ -312,12 +336,18 @@ class SystemGMM(DifferenceGMM):
         )

         # Difference-in-Hansen test for level instruments
-        diff_hansen = self._compute_diff_hansen(
-            residuals, Z_diff, Z_level, W, n_params
-        )
+        # Note: Disabled when instrument columns are filtered due to dimension mismatches
+        # This is a known limitation when dealing with sparse instrument coverage
+        try:
+            diff_hansen = self._compute_diff_hansen(
+                residuals_full, Z_diff, Z_level, W, n_params
+            )
+        except (ValueError, np.linalg.LinAlgError):
+            # If dimensions don't match (due to column filtering), skip test
+            diff_hansen = None

         # Step 9: Create results object
-        valid_mask = ~np.isnan(residuals.flatten())
+        valid_mask_results = ~np.isnan(residuals_full.flatten())
         self.results = GMMResults(
             params=pd.Series(beta, index=var_names),
             std_errors=pd.Series(std_errors, index=var_names),
@@ -325,7 +355,7 @@ class SystemGMM(DifferenceGMM):
             pvalues=pd.Series(pvalues, index=var_names),
             nobs=int(np.sum(valid_mask)),
             n_groups=self.instrument_builder.n_groups,
-            n_instruments=Z_stacked.shape[1],
+            n_instruments=Z_stacked_clean.shape[1],
             n_params=n_params,
             hansen_j=hansen,
             sargan=sargan,
@@ -339,7 +369,7 @@ class SystemGMM(DifferenceGMM):
             windmeijer_corrected=self.robust and self.two_step,
             model_type='system',
             transformation='fd',
-            residuals=residuals
+            residuals=residuals_full
         )

         self.params = self.results.params
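
For context on what hansen_j_test consumes here: with residuals û, instruments Z (N×L), and weighting matrix W, the textbook Hansen J statistic is J = N · ḡ′Wḡ with ḡ = Z′û/N, compared against a χ² distribution with L − k degrees of freedom. A hedged sketch of that computation (hansen_j is an illustrative helper, not the panelbox API; the package's tester may use a different scaling convention):

import numpy as np
from scipy import stats

def hansen_j(resid: np.ndarray, Z: np.ndarray, W: np.ndarray, n_params: int):
    """Hansen J statistic for overidentifying restrictions (textbook form)."""
    n = len(resid)
    g_bar = Z.T @ resid / n          # average moment conditions, shape (L,)
    j_stat = n * g_bar @ W @ g_bar   # quadratic form in the weighting matrix
    df = Z.shape[1] - n_params       # number of overidentifying restrictions
    p_value = stats.chi2.sf(j_stat, df) if df > 0 else np.nan
    return j_stat, df, p_value

This also shows why the diff must pass the cleaned residuals and instruments together: Z and û enter only through Z′û, so their row dimensions have to agree.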
@@ -530,19 +560,104 @@ class SystemGMM(DifferenceGMM):
         """
         n_obs = Z_diff.n_obs

+        # Filter out invalid instrument columns (all NaN or insufficient coverage)
+        # For difference instruments
+        Z_diff_clean = self._filter_invalid_columns(Z_diff.Z, min_coverage=0.10)
+
+        # For level instruments
+        Z_level_clean = self._filter_invalid_columns(Z_level.Z, min_coverage=0.10)
+
         # Create block diagonal matrix
-        n_instruments_total = Z_diff.n_instruments + Z_level.n_instruments
+        n_instruments_total = Z_diff_clean.shape[1] + Z_level_clean.shape[1]

         Z_stacked = np.zeros((2 * n_obs, n_instruments_total))

         # Fill difference block
-        Z_stacked[:n_obs, :Z_diff.n_instruments] = Z_diff.Z
+        Z_stacked[:n_obs, :Z_diff_clean.shape[1]] = Z_diff_clean

         # Fill level block
-        Z_stacked[n_obs:, Z_diff.n_instruments:] = Z_level.Z
+        Z_stacked[n_obs:, Z_diff_clean.shape[1]:] = Z_level_clean

         return Z_stacked

+    def _filter_invalid_columns(self, Z: np.ndarray, min_coverage: float = 0.10) -> np.ndarray:
+        """
+        Filter out instrument columns with insufficient coverage.
+
+        Parameters
+        ----------
+        Z : np.ndarray
+            Instrument matrix
+        min_coverage : float
+            Minimum fraction of non-NaN values required (default: 0.10 = 10%)
+
+        Returns
+        -------
+        np.ndarray
+            Filtered instrument matrix with only valid columns
+        """
+        if Z.shape[1] == 0:
+            return Z
+
+        # Count non-NaN values per column
+        n_valid_per_col = (~np.isnan(Z)).sum(axis=0)
+        n_obs = Z.shape[0]
+
+        # Calculate coverage per column
+        coverage = n_valid_per_col / n_obs
+
+        # Keep columns with sufficient coverage
+        valid_cols = coverage >= min_coverage
+
+        # If no columns are valid, return at least one column (all zeros)
+        # This prevents dimension errors, though estimation may fail later
+        if not valid_cols.any():
+            import warnings
+            warnings.warn("No valid instrument columns found. System GMM may fail.")
+            return np.zeros((n_obs, 1))
+
+        return Z[:, valid_cols]
+
+    def _get_valid_mask_system(self,
+                               y: np.ndarray,
+                               X: np.ndarray,
+                               Z: np.ndarray,
+                               min_instruments: Optional[int] = None) -> np.ndarray:
+        """
+        Get mask of observations with sufficient valid data for System GMM.
+
+        Parameters
+        ----------
+        y : np.ndarray
+            Dependent variable
+        X : np.ndarray
+            Regressors
+        Z : np.ndarray
+            Instruments
+        min_instruments : int, optional
+            Minimum number of valid instruments required
+
+        Returns
+        -------
+        np.ndarray
+            Boolean mask of valid observations
+        """
+        y_valid = ~np.isnan(y).any(axis=1) if y.ndim > 1 else ~np.isnan(y)
+        X_valid = ~np.isnan(X).any(axis=1)
+
+        # For instruments, count how many are valid per observation
+        Z_notnan = ~np.isnan(Z)
+        n_valid_instruments = Z_notnan.sum(axis=1)
+
+        # Determine minimum required instruments
+        if min_instruments is None:
+            k = X.shape[1] if X.ndim > 1 else 1
+            min_instruments = k + 1
+
+        Z_valid = n_valid_instruments >= min_instruments
+
+        return y_valid & X_valid & Z_valid
+
     def _compute_diff_hansen(self,
                              residuals: np.ndarray,
                              Z_diff: InstrumentSet,
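
The coverage rule in _filter_invalid_columns is simply a per-column fraction of observed values compared against a threshold. A quick standalone check of that behavior on a toy matrix (not using panelbox):

import numpy as np

Z = np.array([[1.0, np.nan, np.nan],
              [2.0, np.nan, 3.0],
              [3.0, np.nan, np.nan],
              [4.0, np.nan, np.nan]])

coverage = (~np.isnan(Z)).mean(axis=0)  # fraction of non-NaN values per column
print(coverage)                         # [1.   0.   0.25]
print(Z[:, coverage >= 0.10].shape)     # (4, 2): the all-NaN column is dropped

Note the interaction between the two new helpers: _filter_invalid_columns prunes columns before stacking, while _get_valid_mask_system prunes rows afterwards, requiring at least k + 1 observed instruments per observation by default.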
panelbox/models/static/__init__.py

@@ -5,9 +5,13 @@ Static panel models.
 from panelbox.models.static.pooled_ols import PooledOLS
 from panelbox.models.static.fixed_effects import FixedEffects
 from panelbox.models.static.random_effects import RandomEffects
+from panelbox.models.static.between import BetweenEstimator
+from panelbox.models.static.first_difference import FirstDifferenceEstimator

 __all__ = [
     'PooledOLS',
     'FixedEffects',
     'RandomEffects',
+    'BetweenEstimator',
+    'FirstDifferenceEstimator',
 ]
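
With these exports in place, both new estimators are importable from the subpackage; the +41-line change to panelbox/__init__.py suggests (though this diff does not show it) that they are also re-exported at the package root:

from panelbox.models.static import BetweenEstimator, FirstDifferenceEstimator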
panelbox/models/static/between.py

@@ -0,0 +1,434 @@
+"""
+Between estimator for panel data.
+
+This module provides the Between estimator which regresses on group means,
+capturing variation between entities rather than within entities.
+"""
+
+from typing import Optional
+import numpy as np
+import pandas as pd
+
+from panelbox.core.base_model import PanelModel
+from panelbox.core.results import PanelResults
+from panelbox.utils.matrix_ops import (
+    compute_ols,
+    compute_vcov_nonrobust,
+    compute_panel_rsquared
+)
+from panelbox.standard_errors import (
+    robust_covariance,
+    cluster_by_entity,
+    twoway_cluster,
+    driscoll_kraay,
+    newey_west,
+    pcse
+)
+
+
+class BetweenEstimator(PanelModel):
+    """
+    Between estimator for panel data.
+
+    This estimator regresses on group (entity) means, capturing the variation
+    between entities rather than within entities. It answers: "Do entities with
+    higher average X also have higher average Y?"
+
+    The between transformation computes group means:
+        ȳ_i = β x̄_i + α + ū_i
+
+    where bars denote averages over time for each entity i.
+
+    This estimator is useful when:
+    - T (time periods) is small relative to N (entities)
+    - Focus is on cross-sectional (between-entity) variation
+    - Time-invariant characteristics are of interest
+
+    Contrast with Fixed Effects (within estimator):
+    - FE uses deviations from entity means (within variation)
+    - BE uses entity means themselves (between variation)
+
+    Parameters
+    ----------
+    formula : str
+        Model formula in R-style syntax (e.g., "y ~ x1 + x2")
+    data : pd.DataFrame
+        Panel data in long format
+    entity_col : str
+        Name of the column identifying entities
+    time_col : str
+        Name of the column identifying time periods
+    weights : np.ndarray, optional
+        Observation weights (applied to entity means)
+
+    Attributes
+    ----------
+    entity_means : pd.DataFrame, optional
+        Entity-level means (after fitting)
+
+    Examples
+    --------
+    >>> import panelbox as pb
+    >>> import pandas as pd
+    >>>
+    >>> # Load data
+    >>> data = pb.load_grunfeld()
+    >>>
+    >>> # Between estimator
+    >>> be = pb.BetweenEstimator("invest ~ value + capital", data, "firm", "year")
+    >>> results = be.fit(cov_type='robust')
+    >>> print(results.summary())
+    >>>
+    >>> # Compare with Fixed Effects (within)
+    >>> fe = pb.FixedEffects("invest ~ value + capital", data, "firm", "year")
+    >>> results_fe = fe.fit()
+    >>>
+    >>> # BE captures between variation, FE captures within variation
+    >>> print(f"Between R²: {results.rsquared:.4f}")
+    >>> print(f"Within R²: {results_fe.rsquared:.4f}")
+    >>>
+    >>> # Access entity means
+    >>> entity_means = be.entity_means
+    >>> print(entity_means.head())
+
+    Notes
+    -----
+    The Between estimator:
+    1. Computes entity-level means for all variables
+    2. Runs OLS on the N entity means (not NT observations)
+    3. Reports R² as the between R² (variation explained across entities)
+
+    Degrees of freedom:
+    - N observations (one per entity)
+    - k parameters (slopes + intercept)
+    - df_resid = N - k
+
+    Standard errors:
+    - All SE types are supported (robust, clustered, etc.)
+    - Applied to the N entity-level observations
+    - Clustering by time is possible if needed
+
+    References
+    ----------
+    .. [1] Wooldridge, J. M. (2010). Econometric Analysis of Cross Section
+           and Panel Data. MIT Press. Section 10.2.2.
+    .. [2] Baltagi, B. H. (2013). Econometric Analysis of Panel Data.
+           Wiley. Chapter 2.
+    """
+
+    def __init__(
+        self,
+        formula: str,
+        data: pd.DataFrame,
+        entity_col: str,
+        time_col: str,
+        weights: Optional[np.ndarray] = None
+    ):
+        super().__init__(formula, data, entity_col, time_col, weights)
+
+        # Entity means (computed after fitting)
+        self.entity_means: Optional[pd.DataFrame] = None
+
+    def fit(
+        self,
+        cov_type: str = 'nonrobust',
+        **cov_kwds
+    ) -> PanelResults:
+        """
+        Fit the Between estimator.
+
+        Parameters
+        ----------
+        cov_type : str, default='nonrobust'
+            Type of covariance estimator:
+            - 'nonrobust': Classical standard errors
+            - 'robust' or 'hc1': Heteroskedasticity-robust (HC1)
+            - 'hc0', 'hc2', 'hc3': Other HC variants
+            - 'clustered': Cluster-robust (by entity by default, or custom)
+            - 'twoway': Two-way clustered (entity and time at group level)
+            - 'driscoll_kraay': Driscoll-Kraay (spatial/temporal dependence)
+            - 'newey_west': Newey-West HAC
+            - 'pcse': Panel-Corrected Standard Errors
+        **cov_kwds
+            Additional arguments for covariance estimation:
+            - cluster_col: For custom clustering
+            - max_lags: For Driscoll-Kraay and Newey-West
+            - kernel: For HAC estimators ('bartlett', 'parzen', 'quadratic_spectral')
+
+        Returns
+        -------
+        PanelResults
+            Fitted model results
+
+        Examples
+        --------
+        >>> # Classical standard errors
+        >>> results = model.fit(cov_type='nonrobust')
+
+        >>> # Heteroskedasticity-robust
+        >>> results = model.fit(cov_type='robust')
+        >>> results = model.fit(cov_type='hc3')
+
+        >>> # Cluster-robust
+        >>> results = model.fit(cov_type='clustered')
+
+        >>> # Driscoll-Kraay
+        >>> results = model.fit(cov_type='driscoll_kraay', max_lags=3)
+        """
+        # Build design matrices from original data
+        y_orig, X_orig = self.formula_parser.build_design_matrices(
+            self.data.data,
+            return_type='array'
+        )
+
+        # Get variable names
+        var_names = self.formula_parser.get_variable_names(self.data.data)
+
+        # Get entity and time identifiers
+        entities = self.data.data[self.data.entity_col].values
+        times = self.data.data[self.data.time_col].values
+
+        # Compute entity means (between transformation)
+        unique_entities = np.unique(entities)
+        n_entities = len(unique_entities)
+        k = X_orig.shape[1]
+
+        # Initialize arrays for entity means
+        y_between = np.zeros(n_entities)
+        X_between = np.zeros((n_entities, k))
+
+        # Compute means for each entity
+        for i, entity in enumerate(unique_entities):
+            mask = entities == entity
+            y_between[i] = y_orig[mask].mean()
+            X_between[i] = X_orig[mask].mean(axis=0)
+
+        # Store entity means for user access
+        entity_means_dict = {'entity': unique_entities}
+
+        # Add dependent variable mean
+        dep_var_name = self.formula_parser.dependent
+        entity_means_dict[dep_var_name] = y_between
+
+        # Add independent variable means (excluding intercept)
+        for j, var_name in enumerate(var_names):
+            if var_name != 'Intercept':
+                # Find the corresponding column in X_orig
+                # var_names includes 'Intercept' if present, so adjust index
+                if 'Intercept' in var_names:
+                    X_col_idx = j
+                else:
+                    X_col_idx = j
+                entity_means_dict[var_name] = X_between[:, X_col_idx]
+
+        self.entity_means = pd.DataFrame(entity_means_dict)
+
+        # Estimate coefficients on entity means (OLS)
+        beta, resid, fitted = compute_ols(y_between, X_between, self.weights)
+
+        # Degrees of freedom
+        n = n_entities  # Number of entity-level observations
+        df_model = k - 1 if 'Intercept' in var_names else k  # Slopes only
+        df_resid = n - k
+
+        # Ensure df_resid is positive
+        if df_resid <= 0:
+            raise ValueError(
+                f"Insufficient degrees of freedom: df_resid = {df_resid}. "
+                f"n_entities={n}, k={k}. Need more entities than parameters."
+            )
+
+        # Compute covariance matrix
+        cov_type_lower = cov_type.lower()
+
+        if cov_type_lower == 'nonrobust':
+            vcov = compute_vcov_nonrobust(X_between, resid, df_resid)
+
+        elif cov_type_lower in ['robust', 'hc0', 'hc1', 'hc2', 'hc3']:
+            # Map 'robust' to 'hc1'
+            method = 'HC1' if cov_type_lower == 'robust' else cov_type_lower.upper()
+            result = robust_covariance(X_between, resid, method=method)
+            vcov = result.cov_matrix
+
+        elif cov_type_lower == 'clustered':
+            # For between estimator, clustering is less common but supported
+            # Default: cluster by entity (though each entity appears once)
+            # Could cluster by another grouping variable if specified
+            cluster_col = cov_kwds.get('cluster_col', None)
+            if cluster_col is None:
+                # Each entity is its own cluster - equivalent to robust
+                result = robust_covariance(X_between, resid, method='HC1')
+            else:
+                # Use custom clustering variable from entity_means
+                if cluster_col not in self.entity_means.columns:
+                    raise ValueError(f"cluster_col '{cluster_col}' not found in entity means")
+                cluster_ids = self.entity_means[cluster_col].values
+                result = cluster_by_entity(X_between, resid, cluster_ids, df_correction=True)
+            vcov = result.cov_matrix
+
+        elif cov_type_lower == 'twoway':
+            # Two-way clustering at entity level
+            # This is unusual for between estimator but technically possible
+            # Would need entity-level time groupings
+            cluster_col1 = cov_kwds.get('cluster_col1', 'entity')
+            cluster_col2 = cov_kwds.get('cluster_col2', None)
+
+            if cluster_col2 is None:
+                raise ValueError("twoway clustering requires cluster_col2 in cov_kwds")
+
+            cluster_ids1 = self.entity_means[cluster_col1].values if cluster_col1 in self.entity_means.columns else unique_entities
+            cluster_ids2 = self.entity_means[cluster_col2].values
+
+            result = twoway_cluster(X_between, resid, cluster_ids1, cluster_ids2, df_correction=True)
+            vcov = result.cov_matrix
+
+        elif cov_type_lower == 'driscoll_kraay':
+            # Driscoll-Kraay at entity level
+            # Use entity index as "time" dimension
+            max_lags = cov_kwds.get('max_lags', None)
+            kernel = cov_kwds.get('kernel', 'bartlett')
+            result = driscoll_kraay(X_between, resid, unique_entities, max_lags=max_lags, kernel=kernel)
+            vcov = result.cov_matrix
+
+        elif cov_type_lower == 'newey_west':
+            # Newey-West HAC
+            max_lags = cov_kwds.get('max_lags', None)
+            kernel = cov_kwds.get('kernel', 'bartlett')
+            result = newey_west(X_between, resid, max_lags=max_lags, kernel=kernel)
+            vcov = result.cov_matrix
+
+        elif cov_type_lower == 'pcse':
+            # Panel-Corrected Standard Errors
+            # For between estimator, each entity appears once
+            # PCSE is less meaningful but technically computable
+            result = pcse(X_between, resid, unique_entities, unique_entities)
+            vcov = result.cov_matrix
+
+        else:
+            raise ValueError(
+                f"cov_type must be one of: 'nonrobust', 'robust', 'hc0', 'hc1', 'hc2', 'hc3', "
+                f"'clustered', 'twoway', 'driscoll_kraay', 'newey_west', 'pcse', got '{cov_type}'"
+            )
+
+        # Standard errors
+        std_errors = np.sqrt(np.diag(vcov))
+
+        # Compute R-squared measures
+        # For between estimator:
+        # - rsquared = between R² (primary measure)
+        # - within R² = 0 by construction (no within variation used)
+        # - overall R² computed from fitted values mapped back to all observations
+
+        # Between R² (on entity means)
+        tss_between = np.sum((y_between - y_between.mean()) ** 2)
+        ess_between = np.sum(resid ** 2)
+        rsquared_between = 1 - ess_between / tss_between if tss_between > 0 else 0.0
+
+        # Map fitted values back to original observations for overall R²
+        fitted_all = np.zeros(len(y_orig))
+        for i, entity in enumerate(unique_entities):
+            mask = entities == entity
+            fitted_all[mask] = fitted[i]
+
+        resid_all = y_orig - fitted_all
+
+        # Overall R² (on all NT observations)
+        tss_overall = np.sum((y_orig - y_orig.mean()) ** 2)
+        ess_overall = np.sum(resid_all ** 2)
+        rsquared_overall = 1 - ess_overall / tss_overall if tss_overall > 0 else 0.0
+
+        # Within R² is not meaningful for between estimator
+        # (would require comparing within variation, which BE ignores)
+        rsquared_within = 0.0
+
+        # Adjusted R-squared (based on between R²)
+        rsquared_adj = 1 - (1 - rsquared_between) * (n - 1) / df_resid
+
+        # Create Series/DataFrame with variable names
+        params = pd.Series(beta.ravel(), index=var_names)
+        std_errors_series = pd.Series(std_errors, index=var_names)
+        cov_params = pd.DataFrame(vcov, index=var_names, columns=var_names)
+
+        # Model information
+        model_info = {
+            'model_type': 'Between Estimator',
+            'formula': self.formula,
+            'cov_type': cov_type,
+            'cov_kwds': cov_kwds,
+            'entity_effects': False,
+            'time_effects': False,
+        }
+
+        # Data information
+        data_info = {
+            'nobs': n,  # Number of entity-level observations
+            'n_entities': self.data.n_entities,
+            'n_periods': self.data.n_periods,
+            'df_model': df_model,
+            'df_resid': df_resid,
+            'entity_index': unique_entities,
+            'time_index': None,  # Not applicable for between estimator
+        }
+
+        # R-squared dictionary
+        rsquared_dict = {
+            'rsquared': rsquared_between,  # For BE, R² = between R²
+            'rsquared_adj': rsquared_adj,
+            'rsquared_within': rsquared_within,
+            'rsquared_between': rsquared_between,
+            'rsquared_overall': rsquared_overall
+        }
+
+        # Create results object
+        results = PanelResults(
+            params=params,
+            std_errors=std_errors_series,
+            cov_params=cov_params,
+            resid=resid_all,  # Residuals for all observations
+            fittedvalues=fitted_all,  # Fitted values for all observations
+            model_info=model_info,
+            data_info=data_info,
+            rsquared_dict=rsquared_dict,
+            model=self
+        )
+
+        # Store results and update state
+        self._results = results
+        self._fitted = True
+
+        return results
+
+    def _estimate_coefficients(self) -> np.ndarray:
+        """
+        Estimate coefficients (implementation of abstract method).
+
+        Returns
+        -------
+        np.ndarray
+            Estimated coefficients
+        """
+        # Build design matrices
+        y, X = self.formula_parser.build_design_matrices(
+            self.data.data,
+            return_type='array'
+        )
+
+        # Get entity identifiers
+        entities = self.data.data[self.data.entity_col].values
+
+        # Compute entity means
+        unique_entities = np.unique(entities)
+        n_entities = len(unique_entities)
+        k = X.shape[1]
+
+        y_between = np.zeros(n_entities)
+        X_between = np.zeros((n_entities, k))
+
+        for i, entity in enumerate(unique_entities):
+            mask = entities == entity
+            y_between[i] = y[mask].mean()
+            X_between[i] = X[mask].mean(axis=0)
+
+        # OLS on entity means
+        beta, _, _ = compute_ols(y_between, X_between, self.weights)
+        return beta
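
The estimator's core is small enough to sanity-check by hand: average each variable within entity, then run OLS on the N entity means. A hedged, self-contained replication of that logic using only pandas and NumPy (synthetic data, no panelbox imports):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
n_entities, n_periods = 20, 5
df = pd.DataFrame({
    "firm": np.repeat(np.arange(n_entities), n_periods),
    "x": rng.normal(size=n_entities * n_periods),
})
df["y"] = 2.0 + 1.5 * df["x"] + rng.normal(size=len(df))

# Between transformation: one row of means per entity
means = df.groupby("firm")[["y", "x"]].mean()

# OLS on entity means with an intercept
X = np.column_stack([np.ones(len(means)), means["x"].to_numpy()])
beta, *_ = np.linalg.lstsq(X, means["y"].to_numpy(), rcond=None)
print(beta)  # approximately [2.0, 1.5]

Note the design choice visible in fit(): coefficients and standard errors come from the N entity means, but resid and fittedvalues are broadcast back to all NT observations, so PanelResults stays shape-compatible with the other static estimators.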