PyPI - cbps - Versions diffs - 0.2.0__py3-none-any.whl - Mend

cbps 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (70) hide show

cbps/__init__.py +3462 -0
cbps/constants.py +46 -0
cbps/core/__init__.py +93 -0
cbps/core/cbps_binary.py +1943 -0
cbps/core/cbps_continuous.py +945 -0
cbps/core/cbps_multitreat.py +1123 -0
cbps/core/cbps_optimal.py +507 -0
cbps/core/results.py +1447 -0
cbps/data/Blackwell.csv +571 -0
cbps/data/LaLonde.csv +3213 -0
cbps/data/npcbps_continuous_sim.csv +501 -0
cbps/data/nsw.csv +723 -0
cbps/data/nsw_dw.csv +446 -0
cbps/data/political_ads_urban_niebler.csv +16266 -0
cbps/data/psid_controls.csv +2491 -0
cbps/data/psid_controls2.csv +254 -0
cbps/data/psid_controls3.csv +129 -0
cbps/data/simulation_dgp1_seed12345.csv +201 -0
cbps/data/simulation_dgp2_seed12345.csv +201 -0
cbps/data/simulation_dgp3_seed12345.csv +201 -0
cbps/data/simulation_dgp4_seed12345.csv +201 -0
cbps/datasets/__init__.py +78 -0
cbps/datasets/blackwell.py +112 -0
cbps/datasets/continuous.py +223 -0
cbps/datasets/lalonde.py +272 -0
cbps/datasets/npcbps_sim.py +101 -0
cbps/diagnostics/__init__.py +101 -0
cbps/diagnostics/balance.py +760 -0
cbps/diagnostics/balance_cbmsm_addon.py +162 -0
cbps/diagnostics/continuous_diagnostics.py +259 -0
cbps/diagnostics/normality.py +173 -0
cbps/diagnostics/ocbps_conditions.py +197 -0
cbps/diagnostics/overlap.py +198 -0
cbps/diagnostics/plots.py +1193 -0
cbps/diagnostics/weights_diag.py +205 -0
cbps/highdim/__init__.py +84 -0
cbps/highdim/gmm_loss.py +340 -0
cbps/highdim/hdcbps.py +1078 -0
cbps/highdim/lasso_utils.py +498 -0
cbps/highdim/weight_funcs.py +298 -0
cbps/inference/__init__.py +42 -0
cbps/inference/asyvar.py +621 -0
cbps/inference/vcov_outcome.py +217 -0
cbps/iv/__init__.py +48 -0
cbps/iv/cbiv.py +2603 -0
cbps/logging_config.py +45 -0
cbps/msm/__init__.py +45 -0
cbps/msm/cbmsm.py +1871 -0
cbps/msm/rank_diagnostics.py +112 -0
cbps/nonparametric/__init__.py +58 -0
cbps/nonparametric/cholesky_whitening.py +232 -0
cbps/nonparametric/empirical_likelihood.py +339 -0
cbps/nonparametric/npcbps.py +1036 -0
cbps/nonparametric/taylor_approx.py +207 -0
cbps/py.typed +0 -0
cbps/sklearn/__init__.py +42 -0
cbps/sklearn/estimator.py +378 -0
cbps/utils/__init__.py +82 -0
cbps/utils/formula.py +415 -0
cbps/utils/helpers.py +378 -0
cbps/utils/numerics.py +438 -0
cbps/utils/r_compat.py +109 -0
cbps/utils/validation.py +224 -0
cbps/utils/variance_transform.py +483 -0
cbps/utils/weights.py +586 -0
cbps-0.2.0.dist-info/METADATA +1090 -0
cbps-0.2.0.dist-info/RECORD +70 -0
cbps-0.2.0.dist-info/WHEEL +5 -0
cbps-0.2.0.dist-info/licenses/LICENSE +661 -0
cbps-0.2.0.dist-info/top_level.txt +1 -0

cbps/core/cbps_optimal.py ADDED Viewed

@@ -0,0 +1,507 @@
+"""
+Optimal Covariate Balancing Propensity Score (oCBPS)
+====================================================
+This module implements optimal CBPS (oCBPS) that extends the standard
+CBPS by incorporating dual balancing conditions for improved efficiency
+and robustness through the framework of Fan et al. (2022).
+The implementation achieves double robustness and semiparametric efficiency
+by separating the covariate balancing conditions for baseline outcome models
+and treatment effect heterogeneity models.
+Key Innovations
+---------------
+1. **Dual Balancing Conditions** (Fan 2022 Eq. 3.2-3.3):
+   - g1_baseline: Balance covariates h1 related to E(Y(0)|X)
+   - g2_diff: Balance covariates h2 related to E(Y(1)-Y(0)|X)
+2. **Double Robustness** (Theorem 3.1):
+   Consistent if either the propensity score model or outcome model is correct.
+3. **Semiparametric Efficiency** (Corollary 3.2):
+   Achieves Hahn 1998 efficiency bound when both models are correct and m=q.
+Implementation Notes
+--------------------
+- Only supports att=0 (ATE estimation)
+- No sample_weights parameter (oCBPS does not support sampling weights)
+- Dual initialization optimization for robust convergence
+References
+----------
+.. [1] Fan, Jianqing, Kosuke Imai, Inbeom Lee, Han Liu, Yang Ning,
+       and Xiaolin Yang. 2022.
+       "Optimal Covariate Balancing Conditions in Propensity Score
+       Estimation."
+       Journal of Business & Economic Statistics, 41(1), 97-110.
+       https://doi.org/10.1080/07350015.2021.2002159
+       https://imai.fas.harvard.edu/research/CBPStheory.html
+.. [2] Imai, Kosuke and Marc Ratkovic. 2014.
+       "Covariate Balancing Propensity Score."
+       Journal of the Royal Statistical Society, Series B.
+       DOI:10.1111/rssb.12027
+Examples
+--------
+>>> from cbps import CBPS
+>>> from cbps.datasets import load_lalonde
+>>>
+>>> # Load LaLonde data
+>>> lalonde = load_lalonde()
+>>>
+>>> # oCBPS estimation with dual formula specification
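+>>> # Note: oCBPS requires m1 + m2 + 1 >= k, where m1 and m2 are the numbers
+>>> # of baseline and diff covariates and k is the number of propensity
+>>> # score coefficients (including the intercept)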
+>>> fit = CBPS(
+...     formula='treat ~ age + educ + re75 + re74',
+...     data=lalonde,
+...     baseline_formula='~age + educ + re75 + re74',
+...     diff_formula='~I(re75==0)',
+...     att=0  # oCBPS only supports ATE
+... )
+>>>
+>>> # View results
+>>> print(fit.summary())
+>>> print(f"J-statistic: {fit.J:.6f}")
+"""
+from typing import Any, Dict, Optional
+import warnings
+import numpy as np
+import scipy.linalg
+import scipy.special
+import scipy.optimize
+import statsmodels.api as sm
+# Import generalized inverse function from cbps_binary
+from .cbps_binary import _r_ginv
+# Constants
+# Propensity scores in this module are clipped to [PROBS_MIN, 1 - PROBS_MIN]
+# before they are inverted into weights, so that the weights stay finite.
+PROBS_MIN = 1e-6  # Probability clipping threshold for numerical stability
+def _gmm_func1(
+    beta_curr: np.ndarray,
+    X: np.ndarray,
+    treat: np.ndarray,
+    baseline_X: np.ndarray,
+    diff_X: np.ndarray,
+    invV: Optional[np.ndarray] = None,
+    option: Optional[str] = None
+) -> Dict[str, Any]:
+    """
+    GMM objective function for optimal CBPS with dual balancing conditions.
+    Parameters
+    ----------
+    beta_curr : np.ndarray
+        Current propensity score coefficients (k-dimensional vector).
+    X : np.ndarray
+        Covariate matrix (n x k, including intercept column).
+    treat : np.ndarray
+        Binary treatment vector (0/1 encoded).
+    baseline_X : np.ndarray
+        Design matrix from baseline formula (n x m1).
+    diff_X : np.ndarray
+        Design matrix from diff formula (n x m2).
+    invV : np.ndarray, optional
+        Precomputed inverse of V matrix (for two-step GMM).
+    option : str, optional
+        None for dual balancing (oCBPS standard), "CBPS" for single balancing
+        (used in pre-optimization).
+    Returns
+    -------
+    dict
+        Dictionary with keys:
+        - 'loss': GMM loss value (quadratic form)
+        - 'invV': Generalized inverse of the V matrix
+    Notes
+    -----
+    The dual balancing conditions (Fan et al. 2022, Eq. 3.2-3.3):
+    - g1_baseline: Balance covariates related to E(Y(0)|X)
+    - g2_diff: Balance covariates related to E(Y(1)-Y(0)|X)
+    When option="CBPS", uses standard single balance condition for pre-optimization.
+    """
+    # Step 1: Sample size
+    n = X.shape[0]
+    # Step 2: Compute propensity scores
+    theta_curr = X @ beta_curr
+    probs_curr = scipy.special.expit(theta_curr)
+    # Sequential clipping (upper bound then lower bound)
+    probs_curr = np.minimum(1 - PROBS_MIN, probs_curr)
+    probs_curr = np.maximum(PROBS_MIN, probs_curr)
+    # Step 3: Compute ATE weights
+    w_curr = treat / probs_curr - (1 - treat) / (1 - probs_curr)
+    # Step 4: Construct moment conditions based on option
+    if option is None:
+        # Dual balancing conditions (oCBPS standard)
+        # Construct X1new: intercept + baseline covariates
+        X1new = np.column_stack([X[:, 0], baseline_X])
+        # g1_baseline balance condition
+        w_curr_del1 = (1/n) * (X1new.T @ w_curr)
+        # g2_diff weights
+        w_curr3 = treat / probs_curr - 1
+        # g2_diff balance condition
+        w_curr_del3 = (1/n) * (diff_X.T @ w_curr3)
+        # Concatenate dual balance conditions
+        gbar = np.concatenate([w_curr_del1, w_curr_del3])
+    elif option == "CBPS":
+        # Single balance condition (for pre-optimization)
+        # Standard ATE balance using full X matrix
+        w_curr_del = (1/n) * (X.T @ w_curr)
+        gbar = w_curr_del
+    else:
+        raise ValueError(f"Unknown option: {option}")
+    # Step 5: Compute covariance matrix V and its inverse
+    if invV is None:
+        # Reconstruct X1new (not defined when option="CBPS")
+        X1new = np.column_stack([X[:, 0], baseline_X])
+        # Block 1: V11
+        factor_1 = ((1 - probs_curr) * probs_curr)**(-0.5)
+        X_1 = X1new * factor_1[:, None]
+        # Block 2: V22
+        factor_2 = (1/probs_curr - 1)**0.5
+        X_2 = diff_X * factor_2[:, None]
+        # Block 3: V12
+        X_1_1 = X1new * (probs_curr**(-0.5))[:, None]
+        # Block 4: V21
+        X_1_2 = diff_X * (probs_curr**(-0.5))[:, None]
+        # Assemble V matrix
+        V11 = (1/n) * (X_1.T @ X_1)
+        V12 = (1/n) * (X_1_1.T @ X_1_2)
+        V21 = (1/n) * (X_1_2.T @ X_1_1)
+        V22 = (1/n) * (X_2.T @ X_2)
+        V = np.block([[V11, V12],
+                      [V21, V22]])
+        # Generalized inverse
+        invV_g = _r_ginv(V)
+    else:
+        invV_g = invV
+    # Step 6: Compute GMM loss (quadratic form)
+    loss = float(gbar.T @ invV_g @ gbar)
+    return {'loss': loss, 'invV': invV_g}
+def _gmm_loss1(beta: np.ndarray, *args, **kwargs) -> float:
+    """
+    GMM loss function wrapper for scipy.optimize.
+    Parameters
+    ----------
+    beta : np.ndarray
+        Propensity score coefficients.
+    *args, **kwargs
+        Arguments passed to _gmm_func1.
+    Returns
+    -------
+    float
+        GMM loss value.
+    """
+    return _gmm_func1(beta, *args, **kwargs)['loss']
+def cbps_optimal_2treat(
+    treat: np.ndarray,
+    X: np.ndarray,
+    baseline_X: np.ndarray,
+    diff_X: np.ndarray,
+    iterations: int = 1000,
+    att: int = 0,
+    standardize: bool = True
+) -> Dict[str, Any]:
+    """
+    Optimal CBPS for binary treatments with double robustness and efficiency.
+    Implements the optimal covariate balancing conditions from Fan et al.
+    (2022), achieving double robustness and semiparametric efficiency by
+    separating balance conditions for baseline outcome and treatment effect
+    heterogeneity models.
+    Parameters
+    ----------
+    treat : np.ndarray
+        Binary treatment vector (0/1 encoded, n-dimensional).
+    X : np.ndarray
+        Covariate matrix including intercept (n x k).
+    baseline_X : np.ndarray
+        Design matrix from baseline formula (h1 covariates, n x m1).
+        Zero-variance columns should be filtered before calling.
+    diff_X : np.ndarray
+        Design matrix from diff formula (h2 covariates, n x m2).
+        Zero-variance columns should be filtered before calling.
+    iterations : int, default 1000
+        Maximum BFGS iterations.
+    att : int, default 0
+        Estimand target. Only att=0 (ATE) is supported for oCBPS.
+    standardize : bool, default True
+        Whether to standardize weights.
+    Returns
+    -------
+    dict
+        Dictionary containing:
+        - coefficients: Coefficient matrix (k x 1)
+        - fitted_values: Propensity scores (n-dimensional)
+        - linear_predictor: Linear predictor X @ beta (n-dimensional)
+        - deviance: Negative 2 times log-likelihood
+        - weights: Optimal weights (n-dimensional)
+        - y: Treatment vector (n-dimensional)
+        - x: Covariate matrix (n x k)
+        - converged: Convergence flag (bool)
+        - J: J-statistic (float)
+        - var: Variance-covariance matrix (k x k)
+        - mle_J: MLE baseline J-statistic (float)
+    Raises
+    ------
+    ValueError
+        If baseline and diff model dimensions are incompatible.
+    Notes
+    -----
+    **Key Features:**
+    - Only supports att=0 (ATE estimation)
+    - No sample_weights parameter (oCBPS does not support sampling weights)
+    - Dual initialization optimization for robust convergence
+    **Dual Balancing Conditions** (Fan 2022 Eq. 3.2-3.3):
+    - g1_baseline: (T/π - (1-T)/(1-π)) h1(X) = 0, balances E(Y(0)|X)
+    - g2_diff: (T/π - 1) h2(X) = 0, balances E(Y(1)-Y(0)|X)
+    **Double Robustness** (Theorem 3.1):
+    Consistent if either the propensity score model or outcome model is correct.
+    **Semiparametric Efficiency** (Corollary 3.2):
+    Achieves Hahn 1998 efficiency bound when both models are correct and m=q.
+    References
+    ----------
+    .. [1] Fan et al. (2022). Optimal Covariate Balancing Conditions in
+           Propensity Score Estimation. Journal of Business & Economic
+           Statistics, 41(1), 97-110. https://doi.org/10.1080/07350015.2021.2002159
+    Examples
+    --------
+    >>> # See module-level documentation for complete examples
+    """
+    # Initialize constants
+    n = X.shape[0]
+    # Determine identification status
+    m1 = baseline_X.shape[1]
+    m2 = diff_X.shape[1]
+    k = X.shape[1]
+    if m1 + m2 + 1 > k:
+        bal_only = 3  # Over-identified: m1 + m2 + 1 > q
+        xcov = None
+    elif m1 + m2 + 1 == k:
+        bal_only = 1  # Exactly identified: m1 + m2 + 1 = q
+        xcov = np.eye(m1 + m2 + 1)
+    else:
+        raise ValueError("Invalid baseline and diff models.")
+    # Dual initialization: GLM and CBPS pre-optimization paths
+    # GLM initial values
+    glm_model = sm.GLM(treat, X, family=sm.families.Binomial())
+    glm_result = glm_model.fit()
+    glm_beta_curr = glm_result.params.copy()
+    glm_beta_curr[np.isnan(glm_beta_curr)] = 0
+    # CBPS pre-optimization initial values
+    # Precompute simplified inverse matrix for pre-optimization
+    invV2 = scipy.linalg.pinv(X.T @ X)
+    # CBPS pre-optimization (single balance condition)
+    def gmm_loss_for_preopt(beta):
+        return _gmm_func1(beta, X, treat, baseline_X, diff_X,
+                          invV=invV2, option="CBPS")['loss']
+    cbps_preopt = scipy.optimize.minimize(
+        gmm_loss_for_preopt,
+        glm_beta_curr,
+        method='BFGS'
+    )
+    cbps_beta_curr = cbps_preopt.x
+    # GMM optimization branching
+    gmm_init = glm_beta_curr
+    if bal_only == 1:
+        # Exactly identified
+        opt_bal = scipy.optimize.minimize(
+            lambda b: _gmm_func1(b, X, treat, baseline_X, diff_X, invV=xcov)['loss'],
+            gmm_init,
+            method='BFGS'
+        )
+        opt1 = opt_bal
+    elif bal_only == 3:
+        # Over-identified
+        # GMM loss function (recompute invV each iteration)
+        def gmm_loss_std(beta):
+            return _gmm_func1(beta, X, treat, baseline_X, diff_X)['loss']
+        # GLM path optimization
+        gmm_glm_init = scipy.optimize.minimize(
+            gmm_loss_std,
+            glm_beta_curr,
+            method='BFGS',
+            options={'maxiter': iterations}
+        )
+        # CBPS pre-optimization path
+        gmm_cbps_init = scipy.optimize.minimize(
+            gmm_loss_std,
+            cbps_beta_curr,
+            method='BFGS',
+            options={'maxiter': iterations}
+        )
+        # Select best initialization
+        if gmm_glm_init.fun < gmm_cbps_init.fun:
+            opt1 = gmm_glm_init
+        else:
+            opt1 = gmm_cbps_init
+    # Compute probabilities and weights
+    # Optimal coefficients
+    beta_opt = opt1.x
+    # Optimal propensity scores
+    theta_opt = X @ beta_opt
+    probs_opt = scipy.special.expit(theta_opt)
+    probs_opt = np.minimum(1 - PROBS_MIN, probs_opt)
+    probs_opt = np.maximum(PROBS_MIN, probs_opt)
+    # ATE weights (simplified form)
+    w_opt = np.abs((probs_opt - 1 + treat)**(-1))
+    # Weight standardization
+    if standardize:
+        norm1 = np.sum((treat == 1) / probs_opt)
+        norm2 = np.sum((treat == 0) / (1 - probs_opt))
+    else:
+        norm1 = norm2 = 1.0
+    w_opt = ((treat == 1) / probs_opt / norm1 +
+             (treat == 0) / (1 - probs_opt) / norm2)
+    # Compute variance-covariance matrix
+    # Construct X1new (required for vcov computation)
+    X1new = np.column_stack([X[:, 0], baseline_X])
+    # G matrix construction
+    factor_1 = np.sqrt(np.abs(treat - probs_opt) / (probs_opt * (1 - probs_opt)))
+    XG_1 = -X * factor_1[:, None]
+    XG_12 = -X1new * factor_1[:, None]
+    XW_1 = X1new * ((probs_opt - 1 + treat)**(-1))[:, None]
+    factor_2 = np.sqrt(treat * (1 - probs_opt) / probs_opt)
+    XG_2 = -X * factor_2[:, None]
+    XG_22 = -diff_X * factor_2[:, None]
+    XW_2 = diff_X * ((treat / probs_opt - 1))[:, None]
+    # W1 matrix
+    W1 = np.vstack([XW_1.T, XW_2.T])
+    # G matrix
+    G = np.column_stack([
+        (XG_1.T @ XG_12) / n,
+        (XG_2.T @ XG_22) / n
+    ])
+    # Omega outer product
+    Omega = (W1 @ W1.T) / n
+    # Sandwich variance formula
+    gmm_result = _gmm_func1(beta_opt, X, treat, baseline_X, diff_X, invV=None)
+    W = gmm_result['invV']
+    GWG_inv = _r_ginv(G @ W @ G.T)
+    vcov = GWG_inv @ G @ W @ Omega @ W.T @ G.T @ GWG_inv
+    # Construct return object
+    # J-statistic
+    J_opt = _gmm_func1(beta_opt, X, treat, baseline_X, diff_X, invV=None)['loss']
+    # Deviance
+    deviance = -2 * np.sum(
+        treat * np.log(probs_opt) + (1 - treat) * np.log(1 - probs_opt)
+    )
+    # MLE baseline J-statistic
+    glm1_coef = glm_beta_curr
+    mle_J = _gmm_func1(glm1_coef, X, treat, baseline_X, diff_X)['loss']
+    # Build output dictionary
+    output = {
+        'coefficients': beta_opt.reshape(-1, 1),  # k x 1 matrix
+        'fitted_values': probs_opt,
+        'linear_predictor': theta_opt,
+        'deviance': deviance,
+        'weights': w_opt,
+        'y': treat,
+        'x': X,
+        'converged': opt1.success,
+        'J': J_opt,
+        'var': vcov,
+        'mle_J': mle_J
+    }
+    # ========== oCBPS Condition Verification (P1-18/21) ==========
+    # Verify observable necessary conditions for oCBPS validity.
+    try:
+        from cbps.diagnostics.ocbps_conditions import verify_ocbps_conditions
+        conditions = verify_ocbps_conditions(output, X, treat)
+        output['ocbps_conditions'] = conditions
+        if not conditions['all_conditions_met']:
+            warn_msgs = conditions.get('warnings', [])
+            if warn_msgs:
+                warnings.warn(
+                    "oCBPS efficiency conditions not fully met: "
+                    + "; ".join(warn_msgs),
+                    UserWarning
+                )
+    except (ImportError, Exception) as e:
+        # Diagnostics should never block estimation
+        output['ocbps_conditions'] = {
+            'error': str(e),
+            'all_conditions_met': None
+        }
+    return output