PyPI - cbps - Versions diffs - 0.2.0__py3-none-any.whl - Mend

cbps 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (70) hide show

cbps/__init__.py +3462 -0
cbps/constants.py +46 -0
cbps/core/__init__.py +93 -0
cbps/core/cbps_binary.py +1943 -0
cbps/core/cbps_continuous.py +945 -0
cbps/core/cbps_multitreat.py +1123 -0
cbps/core/cbps_optimal.py +507 -0
cbps/core/results.py +1447 -0
cbps/data/Blackwell.csv +571 -0
cbps/data/LaLonde.csv +3213 -0
cbps/data/npcbps_continuous_sim.csv +501 -0
cbps/data/nsw.csv +723 -0
cbps/data/nsw_dw.csv +446 -0
cbps/data/political_ads_urban_niebler.csv +16266 -0
cbps/data/psid_controls.csv +2491 -0
cbps/data/psid_controls2.csv +254 -0
cbps/data/psid_controls3.csv +129 -0
cbps/data/simulation_dgp1_seed12345.csv +201 -0
cbps/data/simulation_dgp2_seed12345.csv +201 -0
cbps/data/simulation_dgp3_seed12345.csv +201 -0
cbps/data/simulation_dgp4_seed12345.csv +201 -0
cbps/datasets/__init__.py +78 -0
cbps/datasets/blackwell.py +112 -0
cbps/datasets/continuous.py +223 -0
cbps/datasets/lalonde.py +272 -0
cbps/datasets/npcbps_sim.py +101 -0
cbps/diagnostics/__init__.py +101 -0
cbps/diagnostics/balance.py +760 -0
cbps/diagnostics/balance_cbmsm_addon.py +162 -0
cbps/diagnostics/continuous_diagnostics.py +259 -0
cbps/diagnostics/normality.py +173 -0
cbps/diagnostics/ocbps_conditions.py +197 -0
cbps/diagnostics/overlap.py +198 -0
cbps/diagnostics/plots.py +1193 -0
cbps/diagnostics/weights_diag.py +205 -0
cbps/highdim/__init__.py +84 -0
cbps/highdim/gmm_loss.py +340 -0
cbps/highdim/hdcbps.py +1078 -0
cbps/highdim/lasso_utils.py +498 -0
cbps/highdim/weight_funcs.py +298 -0
cbps/inference/__init__.py +42 -0
cbps/inference/asyvar.py +621 -0
cbps/inference/vcov_outcome.py +217 -0
cbps/iv/__init__.py +48 -0
cbps/iv/cbiv.py +2603 -0
cbps/logging_config.py +45 -0
cbps/msm/__init__.py +45 -0
cbps/msm/cbmsm.py +1871 -0
cbps/msm/rank_diagnostics.py +112 -0
cbps/nonparametric/__init__.py +58 -0
cbps/nonparametric/cholesky_whitening.py +232 -0
cbps/nonparametric/empirical_likelihood.py +339 -0
cbps/nonparametric/npcbps.py +1036 -0
cbps/nonparametric/taylor_approx.py +207 -0
cbps/py.typed +0 -0
cbps/sklearn/__init__.py +42 -0
cbps/sklearn/estimator.py +378 -0
cbps/utils/__init__.py +82 -0
cbps/utils/formula.py +415 -0
cbps/utils/helpers.py +378 -0
cbps/utils/numerics.py +438 -0
cbps/utils/r_compat.py +109 -0
cbps/utils/validation.py +224 -0
cbps/utils/variance_transform.py +483 -0
cbps/utils/weights.py +586 -0
cbps-0.2.0.dist-info/METADATA +1090 -0
cbps-0.2.0.dist-info/RECORD +70 -0
cbps-0.2.0.dist-info/WHEEL +5 -0
cbps-0.2.0.dist-info/licenses/LICENSE +661 -0
cbps-0.2.0.dist-info/top_level.txt +1 -0

cbps/msm/rank_diagnostics.py ADDED Viewed

@@ -0,0 +1,112 @@
+"""Rank selection diagnostics for CBMSM covariate matrices.
+WARNING: Automatic rank selection methods (energy ratio, information criteria)
+go beyond Imai & Ratkovic (2015) specification. These tools are provided for
+sensitivity analysis only. The default fixed threshold (1e-4) should be used
+for published analyses unless justified.
+References
+----------
+Imai, K. & Ratkovic, M. (2015). Robust estimation of inverse probability
+weights for marginal structural models. JASA, 110(511), 1013-1023.
+"""
+import numpy as np
+from typing import Any, Dict, List, Optional
def diagnose_rank_selection(
    X_mat: np.ndarray,
    thresholds: Optional[List[float]] = None,
) -> Dict[str, Any]:
    """Compare the retained rank of a matrix under different SVD thresholds.

    Helps users assess sensitivity of CBMSM results to rank choice.
    This function only computes and returns diagnostics; it does not modify
    its input or any estimator state.

    Parameters
    ----------
    X_mat : np.ndarray, shape (n, k)
        Covariate matrix (mean-centered recommended). Must be 2-D and, when
        non-empty, contain only finite values.
    thresholds : list of float, optional
        SVD thresholds to compare. Default: [1e-6, 1e-5, 1e-4, 1e-3, 1e-2].

    Returns
    -------
    dict with keys:
        - 'singular_values': np.ndarray, all singular values (descending)
        - 'total_columns': int, original number of columns k
        - 'ranks_by_threshold': dict mapping threshold -> retained rank,
          with keys in ascending threshold order
        - 'energy_by_rank': np.ndarray, cumulative variance explained
          at each rank (cumsum(s**2) / sum(s**2)); all zeros when every
          singular value is zero
        - 'recommended_action': str, guidance for the user

    Raises
    ------
    ValueError
        If ``X_mat`` is not 2-D, or is non-empty and contains NaN or
        infinite entries.

    Examples
    --------
    >>> import numpy as np
    >>> from cbps.msm.rank_diagnostics import diagnose_rank_selection
    >>> rng = np.random.default_rng(42)
    >>> X = rng.standard_normal((100, 5))
    >>> result = diagnose_rank_selection(X)
    >>> result['total_columns']
    5
    """
    if thresholds is None:
        thresholds = [1e-6, 1e-5, 1e-4, 1e-3, 1e-2]
    # Sort once so that every return path reports thresholds in the same order.
    ordered_thresholds = sorted(thresholds)

    X = np.asarray(X_mat, dtype=np.float64)
    if X.ndim != 2:
        raise ValueError(
            f"X_mat must be a 2-D array of shape (n, k); got {X.ndim}-D input "
            f"with shape {X.shape}."
        )
    n, k = X.shape

    # Degenerate input: nothing to decompose.
    if k == 0 or n == 0:
        return {
            "singular_values": np.array([], dtype=float),
            "total_columns": k,
            "ranks_by_threshold": {t: 0 for t in ordered_thresholds},
            "energy_by_rank": np.array([], dtype=float),
            "recommended_action": "No covariates provided.",
        }

    # Non-finite entries make the SVD fail or return NaN singular values,
    # which would silently be reported as "rank 0" below.
    if not np.isfinite(X).all():
        raise ValueError(
            "X_mat contains NaN or infinite values; singular values are undefined."
        )

    # Only the singular values are needed, so skip forming U and Vt.
    s = np.linalg.svd(X, compute_uv=False)

    # Ranks by threshold (number of singular values exceeding each threshold)
    ranks_by_threshold = {
        t: int(np.count_nonzero(s > t)) for t in ordered_thresholds
    }

    # Cumulative energy (variance explained)
    s_sq = s ** 2
    total_energy = s_sq.sum()
    if total_energy > 0:
        energy_by_rank = np.cumsum(s_sq) / total_energy
    else:
        energy_by_rank = np.zeros_like(s_sq)

    # Generate recommendation relative to the fixed default threshold 1e-4
    default_rank = int(np.count_nonzero(s > 1e-4))
    if default_rank == k:
        recommended_action = (
            "All singular values exceed 1e-4. The matrix appears full rank; "
            "no dimension reduction occurs with the default threshold."
        )
    elif default_rank == 0:
        recommended_action = (
            "No singular values exceed 1e-4. Consider using a smaller threshold "
            "or checking for degenerate covariates."
        )
    else:
        # default_rank >= 1 here, so the index below is always valid.
        energy_at_default = energy_by_rank[default_rank - 1]
        recommended_action = (
            f"Default threshold (1e-4) retains {default_rank}/{k} components "
            f"explaining {energy_at_default:.4f} of total variance. "
            f"Verify that CBMSM estimates are stable across nearby thresholds."
        )

    return {
        "singular_values": s,
        "total_columns": k,
        "ranks_by_threshold": ranks_by_threshold,
        "energy_by_rank": energy_by_rank,
        "recommended_action": recommended_action,
    }

cbps/nonparametric/__init__.py ADDED Viewed

@@ -0,0 +1,58 @@
+"""
+Nonparametric CBPS Module.
+This subpackage implements the nonparametric covariate balancing generalized
+propensity score (npCBGPS) estimator from Section 3.3 of Fong, Hazlett, and
+Imai (2018). The function is named :func:`npCBPS` for API consistency with
+the parametric version.
+Unlike parametric CBPS, this approach does not require specifying a functional
+form for the propensity score. Instead, it directly estimates inverse
+probability weights by maximizing the empirical likelihood subject to
+covariate balance constraints.
+Main API
+--------
+:func:`npCBPS`
+    Estimate nonparametric covariate balancing weights from a formula
+    and DataFrame.
+:class:`NPCBPSResults`
+    Container for estimation results including weights and diagnostics.
+Submodules
+----------
+:mod:`taylor_approx`
+    Modified logarithm with Taylor approximation for numerical stability.
+:mod:`cholesky_whitening`
+    Covariate whitening via Cholesky decomposition.
+:mod:`empirical_likelihood`
+    Dual optimization routines for empirical likelihood.
+When to Use npCBPS
+------------------
+- When you are uncertain about the correct propensity score model specification.
+- When you prefer a nonparametric approach that directly targets balance.
+- When computational cost is acceptable (npCBPS is slower than parametric CBPS).
+References
+----------
+Fong, C., Hazlett, C., and Imai, K. (2018). Covariate balancing propensity
+score for a continuous treatment: Application to the efficacy of political
+advertisements. The Annals of Applied Statistics, 12(1), 156-177.
+https://doi.org/10.1214/17-AOAS1101
+"""
+from .npcbps import npCBPS, NPCBPSResults
+from .taylor_approx import llog, llogp
+from .cholesky_whitening import cholesky_whitening
+from .empirical_likelihood import get_w, log_post
# Public API of the nonparametric subpackage, grouped by originating module.
__all__ = [
    # npcbps: estimator entry point and its results container
    "npCBPS",
    "NPCBPSResults",
    # taylor_approx: modified-logarithm helpers
    "llog",
    "llogp",
    # cholesky_whitening: covariate whitening
    "cholesky_whitening",
    # empirical_likelihood: dual optimization routines
    "get_w",
    "log_post",
]

cbps/nonparametric/cholesky_whitening.py ADDED Viewed

@@ -0,0 +1,232 @@
+"""
+Cholesky Whitening Transform for Nonparametric CBPS.
+This module implements covariate whitening via Cholesky decomposition,
+transforming covariates to have zero mean, unit variance, and zero
+correlation. This preprocessing step is essential for the empirical
+likelihood formulation in npCBPS.
+Mathematical Background
+-----------------------
+The whitening transform orthogonalizes covariates as described in
+Section 3.1 of Fong, Hazlett, and Imai (2018):
+.. math::
+    X_i^* = S_X^{-1/2}(X_i - \\bar{X})
+where :math:`\\bar{X}` is the sample mean and :math:`S_X` is the sample
+covariance matrix. The Cholesky decomposition provides a numerically
+stable way to compute :math:`S_X^{-1/2}`.
+After whitening, :math:`\\text{Cov}(X^*) = I_K` (identity matrix), which
+simplifies the covariate balancing constraints in the empirical likelihood
+optimization.
+References
+----------
+Fong, C., Hazlett, C., and Imai, K. (2018). Covariate balancing propensity
+score for a continuous treatment: Application to the efficacy of political
+advertisements. The Annals of Applied Statistics, 12(1), 156-177.
+https://doi.org/10.1214/17-AOAS1101
+See Section 3.1 for the notation and Section 3.3.1 for the nonparametric
+formulation.
+"""
+import numpy as np
+import scipy.linalg
def cholesky_whitening(X: np.ndarray, verify: bool = True) -> np.ndarray:
    """
    Transform covariates to have zero mean and identity covariance matrix.

    Applies a two-step whitening procedure using Cholesky decomposition:

    1. **Decorrelation**: :math:`X' = X \\cdot \\text{inv}(\\text{chol}(S_X))`
       where :math:`S_X` is the sample covariance matrix.
    2. **Standardization**: Center to zero mean and scale to unit variance.

    The result satisfies :math:`\\text{Cov}(X^*) = I_K`.

    Parameters
    ----------
    X : np.ndarray of shape (n, k)
        Covariate matrix with n observations and k variables. Must be 2-D
        with at least two rows.
    verify : bool, default=True
        If True, verify that the output has zero column means and identity
        covariance within an absolute tolerance of 1e-10. Raises
        AssertionError on failure.

    Returns
    -------
    np.ndarray of shape (n, k)
        Whitened covariate matrix satisfying:

        - Column means are zero
        - Column standard deviations are one
        - Covariance matrix equals identity

    Raises
    ------
    ValueError
        If ``X`` is not 2-D or has fewer than two observations.
    AssertionError
        If ``verify=True`` and the whitening verification fails.
    numpy.linalg.LinAlgError
        If the covariance matrix is not positive definite.

    Notes
    -----
    **Algorithm details:**

    This implementation uses the upper triangular Cholesky factor
    :math:`S_X = U^T U` via ``scipy.linalg.cholesky(..., lower=False)``.
    The product :math:`X' = X \\cdot U^{-1}` is obtained with a triangular
    solve rather than by forming :math:`U^{-1}` explicitly, followed by
    standardization with the sample (``ddof=1``) standard deviation.

    **Verification criteria (absolute tolerance 1e-10):**

    - Diagonal of :math:`\\text{Cov}(X^*)` equals 1
    - Off-diagonal elements equal 0
    - Column means equal 0

    References
    ----------
    Fong, C., Hazlett, C., and Imai, K. (2018). Section 3.1 describes the
    whitening notation :math:`X_i^* = S_X^{-1/2}(X_i - \\bar{X})`.

    Examples
    --------
    >>> import numpy as np
    >>> np.random.seed(42)
    >>> X = np.random.randn(100, 3)
    >>> X_white = cholesky_whitening(X)
    >>> cov = np.cov(X_white.T, ddof=1)
    >>> np.allclose(cov, np.eye(3), atol=1e-10)
    True
    >>> np.allclose(X_white.mean(axis=0), 0, atol=1e-10)
    True
    """
    X = np.asarray(X, dtype=np.float64)
    if X.ndim != 2:
        raise ValueError(
            f"X must be a 2-D array of shape (n, k); got {X.ndim}-D input "
            f"with shape {X.shape}."
        )
    n, k = X.shape
    if n < 2:
        raise ValueError(
            f"X must contain at least 2 observations to estimate a covariance; got n={n}."
        )

    # Step 1: Cholesky whitening.
    # Unbiased covariance estimate; np.cov squeezes the k == 1 case to a
    # 0-dim array, so force a (k, k) matrix.
    cov_X = np.atleast_2d(np.cov(X, rowvar=False, ddof=1))
    # Upper triangular factor U with cov_X = U^T U.
    chol_upper = scipy.linalg.cholesky(cov_X, lower=False)
    # X @ inv(U) is the transpose of the solution Y of U^T Y = X^T; a
    # triangular solve avoids explicitly inverting U.
    decorrelated = scipy.linalg.solve_triangular(chol_upper.T, X.T, lower=True).T

    # Step 2: Full standardization (center, then scale by the ddof=1 std).
    centered = decorrelated - decorrelated.mean(axis=0)
    X_white = centered / decorrelated.std(axis=0, ddof=1)

    # Whitening verification (optional, enabled by default)
    if verify:
        tol = 1e-10
        cov_white = np.atleast_2d(np.cov(X_white, rowvar=False, ddof=1))

        # Diagonal must equal 1. rtol=0 keeps the check at the documented
        # absolute tolerance (np.allclose defaults to rtol=1e-5).
        diagonal = np.diag(cov_white)
        if not np.allclose(diagonal, 1.0, rtol=0.0, atol=tol):
            raise AssertionError(
                f"Whitening failed: cov(X_white) diagonal not close to 1\n"
                f"Diagonal values: {diagonal}\n"
                f"Expected: [1, 1, ..., 1]"
            )

        # Off-diagonal entries must equal 0. Subtracting the diagonal itself
        # zeroes it out, so only true off-diagonal entries are measured.
        off_diagonal_max = np.max(np.abs(cov_white - np.diag(diagonal)))
        if off_diagonal_max > tol:
            raise AssertionError(
                f"Whitening failed: cov(X_white) off-diagonal elements too large\n"
                f"Maximum off-diagonal absolute value: {off_diagonal_max}\n"
                f"Expected: approximately 0 (tolerance 1e-10)"
            )

        # Column means must equal 0.
        max_abs_mean = np.max(np.abs(X_white.mean(axis=0)))
        if max_abs_mean > tol:
            raise AssertionError(
                f"Whitening failed: column means not close to 0\n"
                f"Maximum absolute column mean: {max_abs_mean}\n"
                f"Expected: approximately 0 (tolerance 1e-10)"
            )

    return X_white
def verify_whitening(X: np.ndarray, X_white: np.ndarray, atol: float = 1e-10) -> dict:
    """
    Compute diagnostic metrics for whitening quality.

    All metrics are derived from ``X_white`` alone. Comparisons use a purely
    absolute tolerance (``rtol=0``), so ``atol`` is the exact bound applied
    to every check.

    Parameters
    ----------
    X : np.ndarray of shape (n, k)
        Original covariate matrix. Unused; kept for API consistency, so
        ``None`` is accepted.
    X_white : np.ndarray of shape (n, k)
        Whitened covariate matrix to verify.
    atol : float, default=1e-10
        Absolute tolerance for numerical comparisons.

    Returns
    -------
    dict
        Verification metrics with keys:

        - **cov_is_identity** : bool
            True if covariance matrix equals identity within tolerance.
        - **mean_is_zero** : bool
            True if all column means are zero within tolerance.
        - **std_is_one** : bool
            True if all column standard deviations are one within tolerance.
        - **max_cov_deviation** : float
            Maximum absolute deviation of covariance from identity matrix.
        - **condition_number** : float
            Condition number of the whitened matrix, as returned by
            ``np.linalg.cond``.

    Raises
    ------
    ValueError
        If ``X_white`` is not 2-D.

    Examples
    --------
    >>> import numpy as np
    >>> np.random.seed(42)
    >>> X = np.random.randn(100, 3)
    >>> X_white = cholesky_whitening(X)
    >>> metrics = verify_whitening(X, X_white)
    >>> metrics['cov_is_identity']
    True
    >>> metrics['max_cov_deviation'] < 1e-10
    True
    """
    X_white = np.asarray(X_white, dtype=np.float64)
    if X_white.ndim != 2:
        raise ValueError(
            f"X_white must be a 2-D array of shape (n, k); got {X_white.ndim}-D "
            f"input with shape {X_white.shape}."
        )
    # Take k from the matrix being verified; X is documented as unused.
    k = X_white.shape[1]
    identity = np.eye(k)

    # np.cov squeezes the k == 1 case to a 0-dim array; force (k, k).
    cov_white = np.atleast_2d(np.cov(X_white, rowvar=False, ddof=1))
    mean_white = X_white.mean(axis=0)
    std_white = X_white.std(axis=0, ddof=1)

    # rtol=0: np.allclose defaults to rtol=1e-5, which would otherwise swamp
    # atol for entries whose target value is 1.
    cov_is_identity = bool(np.allclose(cov_white, identity, rtol=0.0, atol=atol))
    mean_is_zero = bool(np.allclose(mean_white, 0.0, rtol=0.0, atol=atol))
    std_is_one = bool(np.allclose(std_white, 1.0, rtol=0.0, atol=atol))

    # Plain Python floats keep comparisons such as ``value < 1e-10`` printing
    # as ``True`` rather than a NumPy scalar repr.
    max_cov_deviation = float(np.max(np.abs(cov_white - identity)))
    condition_number = float(np.linalg.cond(X_white))

    return {
        'cov_is_identity': cov_is_identity,
        'mean_is_zero': mean_is_zero,
        'std_is_one': std_is_one,
        'max_cov_deviation': max_cov_deviation,
        'condition_number': condition_number
    }