PyPI - skxperiments - Versions diffs - 0.1.0.dev0__py3-none-any.whl - Mend

skxperiments 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36)

skxperiments/__init__.py +5 -0
skxperiments/core/__init__.py +42 -0
skxperiments/core/assignment.py +589 -0
skxperiments/core/base.py +512 -0
skxperiments/core/exceptions.py +145 -0
skxperiments/core/potential_outcomes.py +168 -0
skxperiments/core/results.py +624 -0
skxperiments/design/__init__.py +22 -0
skxperiments/design/balance.py +182 -0
skxperiments/design/blocked_crd.py +157 -0
skxperiments/design/crd.py +162 -0
skxperiments/design/factorial.py +174 -0
skxperiments/design/power.py +233 -0
skxperiments/design/rerandomized_crd.py +319 -0
skxperiments/diagnostics/__init__.py +21 -0
skxperiments/diagnostics/aa_test.py +277 -0
skxperiments/diagnostics/balance_report.py +224 -0
skxperiments/diagnostics/srm.py +327 -0
skxperiments/estimators/__init__.py +23 -0
skxperiments/estimators/blocked_difference_in_means.py +197 -0
skxperiments/estimators/cuped.py +280 -0
skxperiments/estimators/difference_in_means.py +161 -0
skxperiments/estimators/factorial_estimator.py +213 -0
skxperiments/estimators/lin_estimator.py +298 -0
skxperiments/inference/__init__.py +17 -0
skxperiments/inference/bootstrap.py +450 -0
skxperiments/inference/multiple.py +365 -0
skxperiments/inference/neyman.py +386 -0
skxperiments/inference/randomization_test.py +319 -0
skxperiments/pipeline.py +366 -0
skxperiments/reporting/__init__.py +30 -0
skxperiments/reporting/plots.py +411 -0
skxperiments/reporting/summary.py +185 -0
skxperiments-0.1.0.dev0.dist-info/METADATA +272 -0
skxperiments-0.1.0.dev0.dist-info/RECORD +36 -0
skxperiments-0.1.0.dev0.dist-info/WHEEL +4 -0

skxperiments/diagnostics/srm.py ADDED Viewed

@@ -0,0 +1,327 @@
+"""Sample Ratio Mismatch (SRM) diagnostic.
+A Sample Ratio Mismatch occurs when the observed allocation of units to
+treatment arms differs from the intended allocation by more than chance
+would explain. It is a high-priority alarm for an *implementation* bug
+(asymmetric logging, bot filtering, a broken assignment service), not a
+scientific hypothesis test — hence the conventional decision threshold of
+0.001 rather than 0.05 (Kohavi et al.).
+``SRMTest`` compares the observed arm (or cell) counts to the counts
+expected under the design's intended allocation using Pearson's
+chi-squared goodness-of-fit test, and flags the experiment when the
+p-value falls below the threshold.
+References
+----------
+Kohavi, R., Tang, D., & Xu, Y. (2020). Trustworthy Online Controlled
+    Experiments. Cambridge University Press (Sample Ratio Mismatch).
+"""
+from dataclasses import asdict, dataclass
+from scipy.stats import chisquare
+from skxperiments.core.assignment import (
+    BlockedAssignment,
+    CRDAssignment,
+    FactorialAssignment,
+)
+from skxperiments.core.base import DiagnosticsReport
+from skxperiments.core.exceptions import InvalidDesignError
@dataclass(frozen=True)
class SRMResult:
    """Immutable outcome of a Sample Ratio Mismatch (SRM) test.

    Attributes
    ----------
    statistic : float
        Pearson chi-squared statistic.
    p_value : float
        Chi-squared goodness-of-fit p-value.
    dof : int
        Degrees of freedom (number of groups minus one).
    observed : dict
        Mapping from group label to observed count. Groups are
        ``"control"``/``"treated"`` for two-arm designs and the integer
        cell index for factorial designs.
    expected : dict
        Mapping from group label to the count expected under the
        intended allocation.
    threshold : float
        Decision threshold the p-value was compared against.
    flagged : bool
        True if ``p_value < threshold`` — an SRM is suspected.
    """

    statistic: float
    p_value: float
    dof: int
    observed: dict
    expected: dict
    threshold: float
    flagged: bool

    def summary(self) -> "SRMResult":
        """Print a formatted summary table and return self.

        Returns
        -------
        SRMResult
            Returns self for method chaining (mirrors ``Results.summary``).
        """
        status = "FLAGGED — possible SRM" if self.flagged else "OK"
        header = [
            "SRM Test",
            "--------",
            f"chi-square     {self.statistic:.4f}",
            f"dof            {self.dof}",
            f"p-value        {self.p_value:.6f}",
            f"threshold      {self.threshold}",
            f"status         {status}",
            "group          observed / expected",
        ]
        # One row per group, in the insertion order of ``observed``.
        per_group = [
            f"  {group}: {count} / {self.expected[group]:.1f}"
            for group, count in self.observed.items()
        ]
        print("\n".join(header + per_group))
        return self

    def to_dict(self) -> dict:
        """Return the result as a plain dictionary keyed by field name."""
        return asdict(self)

    def to_diagnostics_report(self) -> "DiagnosticsReport":
        """Convert to a ``DiagnosticsReport`` for pipeline aggregation.

        Returns
        -------
        DiagnosticsReport
            A report carrying a single flag when an SRM is suspected, and
            no flags otherwise.
        """
        report = DiagnosticsReport()
        if not self.flagged:
            return report

        # Expected counts are fractional; round them so the flag message
        # stays readable.
        expected_rounded = {
            group: round(count, 1) for group, count in self.expected.items()
        }
        report.flags.append(
            f"Sample Ratio Mismatch (p={self.p_value:.2e} < "
            f"{self.threshold}): observed {self.observed} vs expected "
            f"{expected_rounded}."
        )
        return report
+class SRMTest:
+    """Sample Ratio Mismatch test via Pearson's chi-squared.
+    Compares observed arm/cell counts to the counts expected under the
+    design's intended allocation. Supports two-arm designs
+    (``CRDAssignment``, including rerandomized, and ``BlockedAssignment``)
+    and factorial designs (``FactorialAssignment``).
+    Parameters
+    ----------
+    threshold : float, optional
+        Decision threshold for the p-value, by default 0.001. An
+        experiment is flagged when the chi-squared p-value is below it.
+        Must be in (0, 1).
+    expected : float, dict, or None, optional
+        Intended allocation. By default None, in which case it is inferred
+        from the design:
+        - two-arm designs: the design's ``p`` (treatment proportion);
+        - factorial designs: a uniform allocation across the ``2**K``
+          cells.
+        When the design has no intended proportion (e.g., ``CRD`` built
+        with ``n_treated`` rather than ``p``, or ``design_`` is None),
+        ``expected`` must be provided explicitly. For two-arm designs it
+        may be a float (the treated proportion in (0, 1)); for any design
+        it may be a dict mapping each group label to a positive expected
+        proportion (normalized internally).
+    Notes
+    -----
+    SRM is a check on the *observed* data, which in a pipeline may have
+    been filtered or joined after randomization. Run directly on a fresh
+    ``Assignment`` from ``randomize()`` it will not flag, because the
+    library's designs fix the per-arm counts exactly.
+    """
+    def __init__(
+        self,
+        threshold: float = 0.001,
+        expected: float | dict | None = None,
+    ) -> None:
+        if not isinstance(threshold, (int, float)) or isinstance(
+            threshold, bool
+        ):
+            raise InvalidDesignError(
+                f"threshold must be a float in (0, 1), got "
+                f"{type(threshold).__name__}."
+            )
+        if not (0.0 < threshold < 1.0):
+            raise InvalidDesignError(
+                f"threshold must be in (0, 1), got {threshold}."
+            )
+        if expected is not None and not isinstance(
+            expected, (int, float, dict)
+        ):
+            raise InvalidDesignError(
+                f"expected must be None, a float, or a dict, got "
+                f"{type(expected).__name__}."
+            )
+        if isinstance(expected, bool):
+            raise InvalidDesignError("expected must not be a bool.")
+        self.threshold = threshold
+        self.expected = expected
+    def run(
+        self,
+        assignment: CRDAssignment | BlockedAssignment | FactorialAssignment,
+    ) -> SRMResult:
+        """Run the SRM test on an assignment.
+        Parameters
+        ----------
+        assignment : CRDAssignment, BlockedAssignment, or FactorialAssignment
+            The assignment whose realized allocation is being checked.
+        Returns
+        -------
+        SRMResult
+        Raises
+        ------
+        InvalidDesignError
+            If the assignment type is unsupported, if the expected
+            allocation cannot be inferred and was not provided, or if
+            ``expected`` is malformed.
+        """
+        observed, proportions = self._observed_and_proportions(assignment)
+        total = sum(observed.values())
+        if total <= 0:
+            raise InvalidDesignError(
+                "SRMTest requires at least one observed unit."
+            )
+        labels = list(observed.keys())
+        f_obs = [observed[label] for label in labels]
+        expected_counts = {
+            label: proportions[label] * total for label in labels
+        }
+        f_exp = [expected_counts[label] for label in labels]
+        statistic, p_value = chisquare(f_obs, f_exp)
+        return SRMResult(
+            statistic=float(statistic),
+            p_value=float(p_value),
+            dof=len(labels) - 1,
+            observed=observed,
+            expected=expected_counts,
+            threshold=self.threshold,
+            flagged=bool(p_value < self.threshold),
+        )
+    def _observed_and_proportions(
+        self,
+        assignment: CRDAssignment | BlockedAssignment | FactorialAssignment,
+    ) -> tuple[dict, dict]:
+        """Return observed counts and expected proportions per group."""
+        if isinstance(assignment, FactorialAssignment):
+            observed = {
+                int(cell): int(count)
+                for cell, count in sorted(assignment.cell_sizes_.items())
+            }
+            if self.expected is None:
+                k = len(observed)
+                proportions = {cell: 1.0 / k for cell in observed}
+            else:
+                proportions = self._proportions_from_dict(
+                    self.expected, list(observed)
+                )
+            return observed, proportions
+        if isinstance(assignment, (CRDAssignment, BlockedAssignment)):
+            observed = {
+                "control": int(assignment.n_control_),
+                "treated": int(assignment.n_treated_),
+            }
+            if isinstance(self.expected, dict):
+                proportions = self._proportions_from_dict(
+                    self.expected, ["control", "treated"]
+                )
+            elif self.expected is not None:
+                p_treated = self._validate_proportion(self.expected)
+                proportions = {
+                    "control": 1.0 - p_treated,
+                    "treated": p_treated,
+                }
+            else:
+                design = assignment.design_
+                p = getattr(design, "p", None) if design is not None else None
+                if p is None:
+                    raise InvalidDesignError(
+                        "SRMTest cannot infer the expected allocation: the "
+                        "design has no intended proportion `p` (e.g., CRD "
+                        "built with n_treated, or design_ is None). Pass "
+                        "expected=<treated proportion> or a dict of expected "
+                        "proportions."
+                    )
+                p_treated = float(p)
+                proportions = {
+                    "control": 1.0 - p_treated,
+                    "treated": p_treated,
+                }
+            return observed, proportions
+        raise InvalidDesignError(
+            f"SRMTest supports CRDAssignment, BlockedAssignment, and "
+            f"FactorialAssignment; received {type(assignment).__name__}."
+        )
+    @staticmethod
+    def _validate_proportion(value: float) -> float:
+        """Validate a scalar treated proportion in (0, 1)."""
+        if isinstance(value, bool) or not isinstance(value, (int, float)):
+            raise InvalidDesignError(
+                f"expected proportion must be a float in (0, 1), got "
+                f"{type(value).__name__}."
+            )
+        if not (0.0 < value < 1.0):
+            raise InvalidDesignError(
+                f"expected proportion must be in (0, 1), got {value}."
+            )
+        return float(value)
+    @staticmethod
+    def _proportions_from_dict(expected: dict, labels: list) -> dict:
+        """Normalize a dict of expected proportions over ``labels``."""
+        if not isinstance(expected, dict):
+            raise InvalidDesignError(
+                "expected must be a dict mapping each group to a positive "
+                "proportion."
+            )
+        if set(expected.keys()) != set(labels):
+            raise InvalidDesignError(
+                f"expected keys {sorted(map(str, expected.keys()))} must "
+                f"match the assignment groups {sorted(map(str, labels))}."
+            )
+        values = list(expected.values())
+        if any(
+            isinstance(v, bool) or not isinstance(v, (int, float)) or v <= 0
+            for v in values
+        ):
+            raise InvalidDesignError(
+                "expected proportions must be positive numbers."
+            )
+        total = float(sum(values))
+        return {label: float(expected[label]) / total for label in labels}

skxperiments/estimators/__init__.py ADDED Viewed

@@ -0,0 +1,23 @@
+"""Causal estimators.
+Estimators consume Assignment objects produced by designs and return
+Results objects. Inference (SE, CI, p-value) is the responsibility of
+inference classes (Phase 4); estimators here compute point estimates
+only.
+"""
+from skxperiments.estimators.blocked_difference_in_means import (
+    BlockedDifferenceInMeans,
+)
+from skxperiments.estimators.cuped import CUPED
+from skxperiments.estimators.difference_in_means import DifferenceInMeans
+from skxperiments.estimators.factorial_estimator import FactorialEstimator
+from skxperiments.estimators.lin_estimator import LinEstimator
+__all__ = [
+    "BlockedDifferenceInMeans",
+    "CUPED",
+    "DifferenceInMeans",
+    "FactorialEstimator",
+    "LinEstimator",
+]

skxperiments/estimators/blocked_difference_in_means.py ADDED Viewed

@@ -0,0 +1,197 @@
+"""Blocked difference-in-means estimator for blocked randomized designs.
+Computes the SATE estimate as a size-weighted average of within-block
+difference-in-means estimates, the canonical estimator under blocked
+CRD (Imbens & Rubin 2015, Chapter 9).
+"""
+import pandas as pd
+from skxperiments.core.assignment import BlockedAssignment
+from skxperiments.core.base import BaseEstimator
+from skxperiments.core.exceptions import InvalidDesignError
+from skxperiments.core.results import Results
class BlockedDifferenceInMeans(BaseEstimator):
    """Size-weighted ATE estimator for BlockedAssignment.

    Estimates the SATE as a weighted average of within-block
    difference-in-means estimates, weighted by block size:

        ATE_hat = sum_b (n_b / N) * (mean(Y_treated_b) - mean(Y_control_b))

    This is the canonical estimator under blocked CRD (Imbens & Rubin
    2015, Chapter 9). It is unbiased for SATE without any assumption
    on within-block variance, and remains numerically stable even
    with very small blocks (n_b = 2 each).

    This estimator computes the point estimate only. Standard errors,
    confidence intervals, and p-values are produced by inference
    classes (Phase 4) such as ``RandomizationTest`` or ``NeymanCI``.
    The ``Results`` object returned by ``estimate()`` therefore has
    ``se``, ``ci``, and ``p_value`` set to ``None``.

    Parameters
    ----------
    outcome_col : str
        Name of the outcome column in ``assignment.data_``.

    Attributes
    ----------
    assignment_ : BlockedAssignment
        The fitted assignment, stored for downstream use.
    ate_ : float
        Size-weighted point estimate of the ATE.
    block_ates_ : dict
        Mapping from block label to within-block ATE estimate.

    Notes
    -----
    Accepts only ``BlockedAssignment``. ``CRDAssignment`` and
    ``FactorialAssignment`` are rejected via ``DesignEstimatorMismatch``:
    use ``DifferenceInMeans`` or ``FactorialEstimator`` respectively.

    Every block must have at least one treated unit and one control
    unit; otherwise the within-block ATE is undefined and ``fit``
    raises ``InvalidDesignError`` identifying the offending block.

    Examples
    --------
    >>> from skxperiments.design.blocked_crd import BlockedCRD
    >>> from skxperiments.estimators.blocked_difference_in_means import (
    ...     BlockedDifferenceInMeans,
    ... )
    >>> # df has a "block" column, an outcome "y", and other covariates
    >>> design = BlockedCRD(block_col="block", p=0.5, seed=42)
    >>> assignment = design.randomize(df)  # doctest: +SKIP
    >>> estimator = BlockedDifferenceInMeans(outcome_col="y")
    >>> results = estimator.fit(assignment).estimate()  # doctest: +SKIP
    >>> results.ate  # doctest: +SKIP
    """

    # TODO(v2): add a parameter
    # ``weighting: Literal["size", "precision"] = "size"`` once there is
    # concrete demand. Precision weighting reduces asymptotic variance
    # under within-block homoscedasticity, but it is unstable with small
    # blocks and requires a parallel reformulation of NeymanCI.

    def __init__(self, outcome_col: str) -> None:
        self.outcome_col = outcome_col

    def fit(
        self, assignment: BlockedAssignment
    ) -> "BlockedDifferenceInMeans":
        """Fit the estimator on a BlockedAssignment.

        Parameters
        ----------
        assignment : BlockedAssignment
            Assignment produced by ``BlockedCRD``.

        Returns
        -------
        BlockedDifferenceInMeans
            Returns self.

        Raises
        ------
        DesignEstimatorMismatch
            If ``assignment`` is not a ``BlockedAssignment``.
        InvalidDesignError
            If ``outcome_col`` is missing, non-numeric, or contains
            NaN; or if any block has zero treated or zero control
            units.
        """
        self._validate_assignment_type(assignment, BlockedAssignment)
        data = assignment.data_

        if self.outcome_col not in data.columns:
            raise InvalidDesignError(
                f"Outcome column '{self.outcome_col}' not found in "
                f"assignment.data_. Available columns: "
                f"{list(data.columns)}."
            )
        outcome = data[self.outcome_col]
        if not pd.api.types.is_numeric_dtype(outcome):
            raise InvalidDesignError(
                f"Outcome column '{self.outcome_col}' must be numeric. "
                f"dtype found: {outcome.dtype}."
            )
        if outcome.isna().any():
            raise InvalidDesignError(
                f"Outcome column '{self.outcome_col}' contains NaN "
                f"values. Impute or drop NaN before fitting."
            )

        # Single pass over the blocks: build each block's masks once,
        # check that both arms are present, then take the difference in
        # means. No state is written to ``self`` until every block has
        # passed, so a failing block leaves the estimator untouched.
        block_ates: dict = {}
        for block_val in assignment.block_sizes_:
            in_block = data[assignment.block_col_] == block_val
            block_treatment = data.loc[in_block, assignment.treatment_col_]
            is_treated = block_treatment == 1
            is_control = block_treatment == 0
            n_t = int(is_treated.sum())
            n_c = int(is_control.sum())
            if n_t == 0 or n_c == 0:
                raise InvalidDesignError(
                    f"Block '{block_val}' has {n_t} treated and {n_c} "
                    f"control units; BlockedDifferenceInMeans requires "
                    f"at least 1 of each."
                )
            block_outcome = data.loc[in_block, self.outcome_col]
            treated_mean = float(block_outcome[is_treated].mean())
            control_mean = float(block_outcome[is_control].mean())
            block_ates[block_val] = treated_mean - control_mean

        # Size-weighted aggregate, accumulated in block order.
        n_total = assignment.n_units_
        ate_total = 0.0
        for block_val, n_b in assignment.block_sizes_.items():
            ate_total += (n_b / n_total) * block_ates[block_val]

        self.assignment_: BlockedAssignment = assignment
        self.block_ates_: dict = block_ates
        self.ate_: float = float(ate_total)
        return self

    def estimate(self) -> Results:
        """Return a Results object with the point estimate and metadata.

        Returns
        -------
        Results
            Results with ``ate``, ``n_obs``, ``n_treated``, ``n_control``,
            ``estimator_name``, and ``design_name`` populated.
            ``se``, ``ci``, ``p_value`` are ``None`` — inference is
            Phase 4.

        Raises
        ------
        NotFittedError
            If ``fit`` has not been called.
        """
        self._check_is_fitted()
        design = self.assignment_.design_
        # An assignment may carry no design; report its name only if set.
        design_name: str | None = (
            type(design).__name__ if design is not None else None
        )
        return Results(
            ate=self.ate_,
            n_obs=self.assignment_.n_units_,
            n_treated=self.assignment_.n_treated_,
            n_control=self.assignment_.n_control_,
            estimator_name=type(self).__name__,
            design_name=design_name,
        )