PyPI - cbps - Versions diffs - 0.2.0__py3-none-any.whl - Mend

cbps 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (70) hide show

cbps/__init__.py +3462 -0
cbps/constants.py +46 -0
cbps/core/__init__.py +93 -0
cbps/core/cbps_binary.py +1943 -0
cbps/core/cbps_continuous.py +945 -0
cbps/core/cbps_multitreat.py +1123 -0
cbps/core/cbps_optimal.py +507 -0
cbps/core/results.py +1447 -0
cbps/data/Blackwell.csv +571 -0
cbps/data/LaLonde.csv +3213 -0
cbps/data/npcbps_continuous_sim.csv +501 -0
cbps/data/nsw.csv +723 -0
cbps/data/nsw_dw.csv +446 -0
cbps/data/political_ads_urban_niebler.csv +16266 -0
cbps/data/psid_controls.csv +2491 -0
cbps/data/psid_controls2.csv +254 -0
cbps/data/psid_controls3.csv +129 -0
cbps/data/simulation_dgp1_seed12345.csv +201 -0
cbps/data/simulation_dgp2_seed12345.csv +201 -0
cbps/data/simulation_dgp3_seed12345.csv +201 -0
cbps/data/simulation_dgp4_seed12345.csv +201 -0
cbps/datasets/__init__.py +78 -0
cbps/datasets/blackwell.py +112 -0
cbps/datasets/continuous.py +223 -0
cbps/datasets/lalonde.py +272 -0
cbps/datasets/npcbps_sim.py +101 -0
cbps/diagnostics/__init__.py +101 -0
cbps/diagnostics/balance.py +760 -0
cbps/diagnostics/balance_cbmsm_addon.py +162 -0
cbps/diagnostics/continuous_diagnostics.py +259 -0
cbps/diagnostics/normality.py +173 -0
cbps/diagnostics/ocbps_conditions.py +197 -0
cbps/diagnostics/overlap.py +198 -0
cbps/diagnostics/plots.py +1193 -0
cbps/diagnostics/weights_diag.py +205 -0
cbps/highdim/__init__.py +84 -0
cbps/highdim/gmm_loss.py +340 -0
cbps/highdim/hdcbps.py +1078 -0
cbps/highdim/lasso_utils.py +498 -0
cbps/highdim/weight_funcs.py +298 -0
cbps/inference/__init__.py +42 -0
cbps/inference/asyvar.py +621 -0
cbps/inference/vcov_outcome.py +217 -0
cbps/iv/__init__.py +48 -0
cbps/iv/cbiv.py +2603 -0
cbps/logging_config.py +45 -0
cbps/msm/__init__.py +45 -0
cbps/msm/cbmsm.py +1871 -0
cbps/msm/rank_diagnostics.py +112 -0
cbps/nonparametric/__init__.py +58 -0
cbps/nonparametric/cholesky_whitening.py +232 -0
cbps/nonparametric/empirical_likelihood.py +339 -0
cbps/nonparametric/npcbps.py +1036 -0
cbps/nonparametric/taylor_approx.py +207 -0
cbps/py.typed +0 -0
cbps/sklearn/__init__.py +42 -0
cbps/sklearn/estimator.py +378 -0
cbps/utils/__init__.py +82 -0
cbps/utils/formula.py +415 -0
cbps/utils/helpers.py +378 -0
cbps/utils/numerics.py +438 -0
cbps/utils/r_compat.py +109 -0
cbps/utils/validation.py +224 -0
cbps/utils/variance_transform.py +483 -0
cbps/utils/weights.py +586 -0
cbps-0.2.0.dist-info/METADATA +1090 -0
cbps-0.2.0.dist-info/RECORD +70 -0
cbps-0.2.0.dist-info/WHEEL +5 -0
cbps-0.2.0.dist-info/licenses/LICENSE +661 -0
cbps-0.2.0.dist-info/top_level.txt +1 -0

cbps/utils/numerics.py ADDED Viewed

@@ -0,0 +1,438 @@
+"""
+Numerical Linear Algebra Utilities
+This module provides numerically stable implementations of matrix operations
+commonly used in CBPS estimation, including pseudoinverses, matrix rank
+computation, and symmetry utilities.
+The pseudoinverse functions implement tolerance-based singular value truncation
+to handle rank-deficient matrices that arise in high-dimensional settings or
+with collinear covariates.
+Functions
+---------
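+r_ginv_with_diagnostics
+    Moore-Penrose pseudoinverse plus condition-number and rank diagnostics.
+r_ginv_like
+    Moore-Penrose pseudoinverse with configurable tolerance.
+r_ginv_rcond
+    Relative singular-value cutoff (rcond) equivalent to the r_ginv_like rule.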
+pinv_match_r
+    Pseudoinverse using NumPy with matched tolerance.
+pinv_symmetric_psd
+    Specialized pseudoinverse for symmetric positive semi-definite matrices.
+numeric_rank
+    Effective numerical rank via singular value decomposition.
+symmetrize
+    Force matrix symmetry by averaging with transpose.
+max_asymmetry
+    Measure of matrix asymmetry (largest absolute entry of A - A.T).
+is_symmetric
+    Check if matrix is symmetric within tolerance.
+References
+----------
+Golub, G. H. and Van Loan, C. F. (2013). Matrix Computations (4th ed.).
+Johns Hopkins University Press.
+"""
+import warnings
+import numpy as np
+import scipy.linalg as la
+from typing import Optional, Tuple, Dict
def r_ginv_with_diagnostics(
    X: np.ndarray,
    tol: Optional[float] = None,
    warn_threshold: float = 1e12,
) -> Tuple[np.ndarray, Dict]:
    """Compute Moore-Penrose pseudoinverse with condition number diagnostics.

    Matches R MASS::ginv() tolerance (sqrt(eps) * max(singular values)) but
    adds condition number monitoring that R lacks. Does NOT apply
    regularization (no theoretical support in Imai & Ratkovic 2014/2015).

    Parameters
    ----------
    X : np.ndarray
        Matrix to pseudoinvert. It is converted to a float array first.
    tol : float or None
        SVD truncation tolerance. None = sqrt(eps) * max(singular values),
        matching the default used in the internal ``_r_ginv`` helper.
        Negative values are treated as 0 when selecting singular values.
    warn_threshold : float, default=1e12
        Condition number threshold for issuing a warning.

    Returns
    -------
    X_pinv : np.ndarray
        Pseudoinverse of X, shape ``(X.shape[1], X.shape[0])``.
    diagnostics : dict
        Contains:

        - ``condition_number`` (float): ratio of largest to smallest retained
          singular value. It is ``inf`` when no singular value is retained,
          when a matrix with several singular values retains only one of
          them, or when the largest singular value is below machine epsilon.
          A matrix that has exactly one singular value (1x1, single column
          or single row) and retains it reports ``1.0``.
        - ``effective_rank`` (int): number of singular values above tolerance.
        - ``tolerance`` (float): the tolerance actually used.

    Warns
    -----
    UserWarning
        When condition number exceeds *warn_threshold*.
    """
    X = np.asarray(X, dtype=float)

    # Reduced SVD: len(s) == min(X.shape), singular values sorted descending.
    U, s, Vt = la.svd(X, full_matrices=False, lapack_driver='gesdd')

    # Tolerance: matches R MASS::ginv default - sqrt(eps) * max(s)
    if tol is None:
        eps = np.finfo(float).eps
        tol_value = float(np.sqrt(eps) * (s[0] if len(s) > 0 else 0.0))
    else:
        tol_value = tol

    # Singular values that survive truncation.
    positive = s > max(tol_value, 0.0)
    effective_rank = int(np.sum(positive))

    # Matrices whose largest singular value is below machine epsilon are
    # treated as numerically zero when building the pseudoinverse below.
    negligible = len(s) == 0 or s[0] < np.finfo(float).eps

    # Condition number: ratio of max to min retained singular value.
    if effective_rank == 0:
        condition_number = float('inf')
    elif effective_rank == 1:
        # With a single singular value in total the matrix is full rank and
        # the ratio is exactly 1; reporting inf here would raise a spurious
        # "ill-conditioned" warning for every 1x1 or single-column input.
        # A lone survivor among several singular values still signals
        # rank deficiency and keeps the inf diagnostic.
        if len(s) == 1 and not negligible:
            condition_number = 1.0
        else:
            condition_number = float('inf')
    else:
        s_retained = s[positive]
        condition_number = float(s_retained[0] / s_retained[-1])

    # Pseudoinverse with the same branch logic as _r_ginv in cbps_binary.
    if negligible or not np.any(positive):
        X_pinv = np.zeros((X.shape[1], X.shape[0]))
    elif np.all(positive):
        X_pinv = (Vt.T / s) @ U.T
    else:
        V_pos = Vt[positive].T
        s_pos = s[positive]
        U_pos = U[:, positive]
        X_pinv = (V_pos / s_pos) @ U_pos.T

    diagnostics = {
        'condition_number': condition_number,
        'effective_rank': effective_rank,
        'tolerance': tol_value,
    }

    # Emit warning for ill-conditioned matrices
    if condition_number > warn_threshold:
        warnings.warn(
            f"Matrix is ill-conditioned: condition number = {condition_number:.2e} "
            f"(threshold = {warn_threshold:.2e}). "
            f"Effective rank = {effective_rank}/{min(X.shape)}. "
            f"Results may be numerically unreliable. "
            f"Consider checking for collinear covariates.",
            UserWarning,
            stacklevel=2,
        )
    return X_pinv, diagnostics
def r_ginv_like(X: np.ndarray, tol: Optional[float] = None) -> np.ndarray:
    """
    Compute Moore-Penrose pseudoinverse with tolerance-based truncation.

    Uses SVD decomposition with a threshold rule for singular value truncation:
    singular values at or below the tolerance are set to zero in the inversion.
    Real and complex inputs are both supported.

    Parameters
    ----------
    X : np.ndarray
        Input matrix to pseudo-invert, shape (m, n).
    tol : float, optional
        Absolute tolerance for singular value truncation.
        If None, uses: max(m, n) * max(singular_values) * machine_epsilon,
        where machine_epsilon is taken from the precision of ``X`` for
        real and complex floating dtypes and from float64 otherwise.

    Returns
    -------
    np.ndarray
        Pseudoinverse of X, shape (n, m).

    Notes
    -----
    The tolerance rule follows the standard numerical convention:
        tol = max(dim(X)) * sigma_max * eps
    where sigma_max is the largest singular value and eps is machine epsilon.

    With X = U diag(s) V^H the pseudoinverse is V diag(1/s) U^H. The
    conjugate transposes matter only for complex input; for real input they
    reduce to ordinary transposes.

    Examples
    --------
    >>> import numpy as np
    >>> X = np.array([[1, 2], [3, 4], [5, 6]])
    >>> X_pinv = r_ginv_like(X)
    >>> # Verify pseudoinverse property: X @ X_pinv @ X ≈ X
    >>> assert np.allclose(X @ X_pinv @ X, X, atol=1e-10)
    """
    X = np.asarray(X)

    # Compute SVD once to control tolerance exactly and avoid SciPy defaults
    U, s, Vt = la.svd(X, full_matrices=False, lapack_driver='gesdd')

    if tol is None:
        # np.inexact covers real and complex floats, so complex64 input gets
        # single-precision eps instead of an overly tight float64 one.
        eps_dtype = X.dtype if np.issubdtype(X.dtype, np.inexact) else np.float64
        tol = max(X.shape) * s.max(initial=0.0) * np.finfo(eps_dtype).eps

    # Invert with truncation; 1/0 for discarded values is masked by np.where,
    # so the floating-point warnings it would raise are silenced.
    with np.errstate(divide='ignore', invalid='ignore'):
        s_inv = np.where(s > tol, 1.0 / s, 0.0)

    # Vt holds V^H, so V = Vt^H; likewise U^H is the conjugate transpose of U.
    return (Vt.conj().T * s_inv) @ U.conj().T
def r_ginv_rcond(X: np.ndarray) -> float:
    """
    Return the relative singular-value cutoff used for pseudoinversion.

    Expresses the absolute truncation rule as a relative cutoff that can be
    passed to NumPy/SciPy ``pinv`` functions.

    Parameters
    ----------
    X : np.ndarray
        Matrix whose shape and dtype determine the cutoff.

    Returns
    -------
    float
        ``max(dim(X)) * machine_epsilon``. Machine epsilon is taken from the
        dtype of ``X`` when that is a real floating type, and from float64
        otherwise.

    Notes
    -----
    Absolute and relative tolerances are related by:
        absolute_tol = rcond * sigma_max
        rcond = max(m, n) * eps
    with sigma_max the largest singular value.

    Examples
    --------
    >>> import numpy as np
    >>> X = np.random.randn(100, 10)
    >>> rcond = r_ginv_rcond(X)
    >>> assert rcond > 0
    """
    arr = np.asarray(X)

    # Choose the dtype whose precision defines machine epsilon.
    if np.issubdtype(arr.dtype, np.floating):
        precision_dtype = arr.dtype
    else:
        precision_dtype = np.float64

    machine_eps = np.finfo(precision_dtype).eps
    largest_dim = max(arr.shape)
    return largest_dim * machine_eps
def pinv_match_r(X: np.ndarray) -> np.ndarray:
    """
    Pseudoinvert a matrix with ``numpy.linalg.pinv`` and the module's cutoff.

    Thin convenience wrapper: the relative cutoff is obtained from
    ``r_ginv_rcond`` and handed to ``numpy.linalg.pinv`` as ``rcond``.

    Parameters
    ----------
    X : np.ndarray
        Input matrix to pseudo-invert.

    Returns
    -------
    np.ndarray
        Pseudoinverse of X.

    See Also
    --------
    r_ginv_like : Direct SVD-based implementation with custom tolerance.
    r_ginv_rcond : Computes the rcond value used here.

    Examples
    --------
    >>> import numpy as np
    >>> X = np.array([[1, 2], [3, 4]])
    >>> X_pinv = pinv_match_r(X)
    >>> assert np.allclose(X @ X_pinv @ X, X, atol=1e-10)
    """
    matrix = np.asarray(X)
    cutoff = r_ginv_rcond(X)
    return np.linalg.pinv(matrix, rcond=cutoff)
def pinv_symmetric_psd(X: np.ndarray, tol: Optional[float] = None) -> np.ndarray:
    """
    Compute pseudoinverse for symmetric positive semi-definite matrices.

    Uses eigenvalue decomposition instead of SVD, exploiting symmetry.
    Eigenvalues at or below the tolerance (including zero or negative ones
    arising from numerical noise) contribute nothing to the result.

    Parameters
    ----------
    X : np.ndarray
        Symmetric input matrix to pseudo-invert, shape (n, n).
    tol : float, optional
        Absolute tolerance for eigenvalue truncation.
        If None, uses: n * max(eigenvalues, 0) * machine_epsilon.

    Returns
    -------
    np.ndarray
        Symmetric pseudoinverse of X, shape (n, n).

    Notes
    -----
    For a symmetric matrix X = Q Λ Q^T, the pseudoinverse is:
        X^+ = Q Λ^+ Q^T
    where Λ^+ has diagonal entries 1/λ_i for λ_i > tol, and 0 otherwise.
    The input matrix is symmetrized as 0.5*(X + X^T) before decomposition
    to handle minor floating-point asymmetries.

    Exactly singular input (eigenvalues equal to zero) is handled without
    emitting floating-point warnings, consistent with ``r_ginv_like``.

    Examples
    --------
    >>> import numpy as np
    >>> # Create a symmetric positive definite matrix
    >>> A = np.array([[4, 2], [2, 3]])
    >>> A_pinv = pinv_symmetric_psd(A)
    >>> assert np.allclose(A @ A_pinv @ A, A, atol=1e-10)
    >>> assert np.allclose(A_pinv, A_pinv.T, atol=1e-14)  # Result is symmetric
    """
    X = np.asarray(X)

    # Symmetrize defensively to counter FP drift
    X = 0.5 * (X + X.T)

    # Eigen-decomposition for symmetric matrices
    w, Q = la.eigh(X)

    if tol is None:
        eps = np.finfo(X.dtype if np.issubdtype(X.dtype, np.floating) else np.float64).eps
        # initial=0.0 keeps the tolerance non-negative even when every
        # eigenvalue is negative.
        tol = max(X.shape) * float(np.max(w, initial=0.0)) * eps

    # np.where evaluates 1/w for every eigenvalue before masking, so zero
    # eigenvalues would otherwise raise a divide-by-zero RuntimeWarning even
    # though their reciprocal is discarded.
    with np.errstate(divide='ignore', invalid='ignore'):
        w_inv = np.where(w > tol, 1.0 / w, 0.0)

    return (Q * w_inv) @ Q.T
def numeric_rank(X: np.ndarray, tol: Optional[float] = None) -> int:
    """
    Count the singular values of a matrix that exceed a tolerance.

    Only the singular values are computed (no singular vectors); the rank is
    the number of them strictly greater than the tolerance.

    Parameters
    ----------
    X : np.ndarray
        Input matrix, shape (m, n).
    tol : float, optional
        Absolute tolerance for singular value truncation.
        If None, uses: max(m, n) * max(singular_values) * machine_epsilon.

    Returns
    -------
    int
        Number of singular values exceeding the tolerance.

    Notes
    -----
    The default tolerance scales with both the matrix dimensions and the
    largest singular value, the same rule ``r_ginv_like`` applies.

    Examples
    --------
    >>> import numpy as np
    >>> # Full rank matrix
    >>> X = np.array([[1, 2], [3, 4]])
    >>> assert numeric_rank(X) == 2
    >>>
    >>> # Rank deficient matrix
    >>> Y = np.array([[1, 2], [2, 4]])  # Second row is 2x first row
    >>> assert numeric_rank(Y) == 1
    """
    matrix = np.asarray(X)
    sigma = la.svd(matrix, compute_uv=False, lapack_driver='gesdd')

    cutoff = tol
    if cutoff is None:
        # Machine epsilon of the input precision (float64 for non-float input).
        if np.issubdtype(matrix.dtype, np.floating):
            machine_eps = np.finfo(matrix.dtype).eps
        else:
            machine_eps = np.finfo(np.float64).eps
        cutoff = max(matrix.shape) * sigma.max(initial=0.0) * machine_eps

    above_cutoff = sigma > cutoff
    return int(np.sum(above_cutoff))
def symmetrize(A: np.ndarray) -> np.ndarray:
    """
    Return the symmetric part of a square matrix.

    The result is the average of the matrix and its transpose,
    0.5 * (A + A^T).

    Parameters
    ----------
    A : np.ndarray
        Square matrix, shape (n, n).

    Returns
    -------
    np.ndarray
        Symmetric matrix, shape (n, n).

    Examples
    --------
    >>> import numpy as np
    >>> A = np.array([[1, 2], [3, 4]])
    >>> A_sym = symmetrize(A)
    >>> assert np.allclose(A_sym, A_sym.T)
    >>> assert np.allclose(A_sym, [[1, 2.5], [2.5, 4]])
    """
    matrix = np.asarray(A)
    mirrored = matrix.T
    return 0.5 * (matrix + mirrored)
def max_asymmetry(A: np.ndarray) -> float:
    """
    Measure how far a matrix is from being symmetric.

    Computes the largest absolute entry of (A - A^T); the value is 0 for a
    perfectly symmetric matrix.

    Parameters
    ----------
    A : np.ndarray
        Square matrix, shape (n, n).

    Returns
    -------
    float
        Maximum absolute difference: max|A_ij - A_ji|.

    Examples
    --------
    >>> import numpy as np
    >>> A = np.array([[1, 2], [2.001, 4]])
    >>> asym = max_asymmetry(A)
    >>> assert np.isclose(asym, 0.001)
    """
    matrix = np.asarray(A)
    # Entry-wise gap between each element and its mirror across the diagonal.
    gap = np.abs(matrix - matrix.T)
    return float(gap.max())
def is_symmetric(A: np.ndarray, atol: float = 1e-12) -> bool:
    """
    Report whether a matrix is symmetric up to an absolute tolerance.

    Parameters
    ----------
    A : np.ndarray
        Square matrix to check, shape (n, n).
    atol : float, default=1e-12
        Largest asymmetry (as returned by ``max_asymmetry``) still accepted
        as symmetric.

    Returns
    -------
    bool
        True if max|A_ij - A_ji| <= atol.

    Examples
    --------
    >>> import numpy as np
    >>> A = np.array([[1, 2], [2, 4]])
    >>> assert is_symmetric(A)
    >>>
    >>> B = np.array([[1, 2], [3, 4]])
    >>> assert not is_symmetric(B)
    """
    deviation = max_asymmetry(A)
    return deviation <= atol

cbps/utils/r_compat.py ADDED Viewed

@@ -0,0 +1,109 @@
+"""
+rpy2 and pandas 2.x Compatibility Utilities
+This module provides compatibility patches for using rpy2 with pandas 2.x,
+where the deprecated ``DataFrame.iteritems()`` and ``Series.iteritems()``
+methods were removed. The rpy2 pandas2ri converter relies on these methods,
+causing AttributeError in pandas 2.x environments.
+This module is primarily used for cross-validation testing and is not
+required for normal CBPS functionality.
+Usage
+-----
+Call ``ensure_rpy2_compatibility()`` before importing rpy2::
+    from cbps.utils.r_compat import ensure_rpy2_compatibility
+    ensure_rpy2_compatibility()
+    import rpy2.robjects as ro
+    from rpy2.robjects import pandas2ri
+    pandas2ri.activate()
+Notes
+-----
+The compatibility patch maps ``iteritems()`` to ``items()``, which is the
+pandas 2.x replacement. This patch is idempotent and safe to call multiple
+times.
+"""
+import pandas as pd
def ensure_rpy2_compatibility():
    """
    Restore ``iteritems()`` on pandas classes for rpy2's converter.

    For ``DataFrame`` and ``Series`` in turn, if the class has no
    ``iteritems`` attribute, one is added as an alias of the class's
    ``items`` method. Classes that already provide ``iteritems`` are left
    untouched.

    Notes
    -----
    - Idempotent: safe to call multiple times
    - No effect when ``iteritems()`` already exists (e.g. pandas 1.x)
    - Applied at the class level to DataFrame and Series

    Examples
    --------
    >>> from cbps.utils.r_compat import ensure_rpy2_compatibility
    >>> ensure_rpy2_compatibility()
    >>> # rpy2 can now be safely imported
    """
    for pandas_cls in (pd.DataFrame, pd.Series):
        if hasattr(pandas_cls, 'iteritems'):
            # Already present: nothing to patch.
            continue
        pandas_cls.iteritems = pandas_cls.items
def check_rpy2_available():
    """
    Check rpy2 availability and apply compatibility patches.

    Applies the pandas ``iteritems`` compatibility patch, imports rpy2,
    activates the pandas converter and loads the R ``CBPS`` package. Any
    failure along the way is treated as "not available": the function never
    raises, and the reason is logged at DEBUG level on this module's logger.

    This function is primarily used for internal testing and validation.

    Returns
    -------
    available : bool
        True if rpy2 and required packages are available.
    components : tuple of (robjects, pandas2ri, cbps_package) or None
        If available, returns the imported rpy2 components.
        If not available, returns None.

    Examples
    --------
    >>> from cbps.utils.r_compat import check_rpy2_available
    >>> available, components = check_rpy2_available()
    >>> if available:
    ...     ro, pandas2ri, cbps_pkg = components
    """
    # Function-scope import keeps the module's top-level imports unchanged.
    import logging

    try:
        # Apply compatibility patches first
        ensure_rpy2_compatibility()

        # Attempt to import rpy2
        import rpy2.robjects as ro
        from rpy2.robjects import pandas2ri
        from rpy2.robjects.packages import importr

        # Activate pandas conversion
        pandas2ri.activate()

        # Attempt to import CBPS package
        cbps_r = importr('CBPS')
    except Exception as exc:
        # Deliberate best-effort probe: a missing rpy2 (ImportError) and any
        # other initialization failure both mean "unavailable". ImportError
        # is an Exception subclass, so a single handler covers both. The
        # cause is recorded rather than silently dropped to aid debugging.
        logging.getLogger(__name__).debug(
            "rpy2 / R CBPS backend unavailable: %r", exc
        )
        return False, None

    return True, (ro, pandas2ri, cbps_r)