skxperiments 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. skxperiments/__init__.py +5 -0
  2. skxperiments/core/__init__.py +42 -0
  3. skxperiments/core/assignment.py +589 -0
  4. skxperiments/core/base.py +512 -0
  5. skxperiments/core/exceptions.py +145 -0
  6. skxperiments/core/potential_outcomes.py +168 -0
  7. skxperiments/core/results.py +624 -0
  8. skxperiments/design/__init__.py +22 -0
  9. skxperiments/design/balance.py +182 -0
  10. skxperiments/design/blocked_crd.py +157 -0
  11. skxperiments/design/crd.py +162 -0
  12. skxperiments/design/factorial.py +174 -0
  13. skxperiments/design/power.py +233 -0
  14. skxperiments/design/rerandomized_crd.py +319 -0
  15. skxperiments/diagnostics/__init__.py +21 -0
  16. skxperiments/diagnostics/aa_test.py +277 -0
  17. skxperiments/diagnostics/balance_report.py +224 -0
  18. skxperiments/diagnostics/srm.py +327 -0
  19. skxperiments/estimators/__init__.py +23 -0
  20. skxperiments/estimators/blocked_difference_in_means.py +197 -0
  21. skxperiments/estimators/cuped.py +280 -0
  22. skxperiments/estimators/difference_in_means.py +161 -0
  23. skxperiments/estimators/factorial_estimator.py +213 -0
  24. skxperiments/estimators/lin_estimator.py +298 -0
  25. skxperiments/inference/__init__.py +17 -0
  26. skxperiments/inference/bootstrap.py +450 -0
  27. skxperiments/inference/multiple.py +365 -0
  28. skxperiments/inference/neyman.py +386 -0
  29. skxperiments/inference/randomization_test.py +319 -0
  30. skxperiments/pipeline.py +366 -0
  31. skxperiments/reporting/__init__.py +30 -0
  32. skxperiments/reporting/plots.py +411 -0
  33. skxperiments/reporting/summary.py +185 -0
  34. skxperiments-0.1.0.dev0.dist-info/METADATA +272 -0
  35. skxperiments-0.1.0.dev0.dist-info/RECORD +36 -0
  36. skxperiments-0.1.0.dev0.dist-info/WHEEL +4 -0
@@ -0,0 +1,327 @@
1
+ """Sample Ratio Mismatch (SRM) diagnostic.
2
+
3
+ A Sample Ratio Mismatch occurs when the observed allocation of units to
4
+ treatment arms differs from the intended allocation by more than chance
5
+ would explain. It is a high-priority alarm for an *implementation* bug
6
+ (asymmetric logging, bot filtering, a broken assignment service), not a
7
+ scientific hypothesis test — hence the conventional decision threshold of
8
+ 0.001 rather than 0.05 (Kohavi et al.).
9
+
10
+ ``SRMTest`` compares the observed arm (or cell) counts to the counts
11
+ expected under the design's intended allocation using Pearson's
12
+ chi-squared goodness-of-fit test, and flags the experiment when the
13
+ p-value falls below the threshold.
14
+
15
+ References
16
+ ----------
17
+ Kohavi, R., Tang, D., & Xu, Y. (2020). Trustworthy Online Controlled
18
+ Experiments. Cambridge University Press (Sample Ratio Mismatch).
19
+ """
20
+
21
+ from dataclasses import asdict, dataclass
22
+
23
+ from scipy.stats import chisquare
24
+
25
+ from skxperiments.core.assignment import (
26
+ BlockedAssignment,
27
+ CRDAssignment,
28
+ FactorialAssignment,
29
+ )
30
+ from skxperiments.core.base import DiagnosticsReport
31
+ from skxperiments.core.exceptions import InvalidDesignError
32
+
33
+
@dataclass(frozen=True)
class SRMResult:
    """Immutable outcome of a Sample Ratio Mismatch check.

    Attributes
    ----------
    statistic : float
        Pearson chi-squared statistic.
    p_value : float
        p-value of the chi-squared goodness-of-fit test.
    dof : int
        Degrees of freedom (number of groups minus one).
    observed : dict
        Observed count for each group label. Labels are
        ``"control"``/``"treated"`` for two-arm designs and the integer
        cell index for factorial designs.
    expected : dict
        Expected count for each group label under the intended
        allocation. Must contain every key present in ``observed``.
    threshold : float
        Decision threshold the p-value was compared against.
    flagged : bool
        True when ``p_value < threshold``, i.e. an SRM is suspected.
    """

    statistic: float
    p_value: float
    dof: int
    observed: dict
    expected: dict
    threshold: float
    flagged: bool

    def summary(self) -> "SRMResult":
        """Print a plain-text summary of the test and return ``self``.

        Returns
        -------
        SRMResult
            The same instance, so calls can be chained.

        Raises
        ------
        KeyError
            If a group in ``observed`` has no entry in ``expected``.
        """
        if self.flagged:
            status = "FLAGGED — possible SRM"
        else:
            status = "OK"

        header = [
            "SRM Test",
            "--------",
            f"chi-square {self.statistic:.4f}",
            f"dof {self.dof}",
            f"p-value {self.p_value:.6f}",
            f"threshold {self.threshold}",
            f"status {status}",
            "group observed / expected",
        ]
        # One row per group, in the insertion order of ``observed``.
        rows = [
            f" {label}: {count} / {self.expected[label]:.1f}"
            for label, count in self.observed.items()
        ]
        # Everything is formatted before printing, so a formatting error
        # never leaves a half-written table on stdout.
        print("\n".join(header + rows))
        return self

    def to_dict(self) -> dict:
        """Return every field of the result as a plain dictionary."""
        return asdict(self)

    def to_diagnostics_report(self) -> DiagnosticsReport:
        """Wrap the result in a ``DiagnosticsReport``.

        Returns
        -------
        DiagnosticsReport
            A freshly constructed report. One message is appended to its
            ``flags`` when the result is flagged; nothing is appended
            otherwise.
        """
        report = DiagnosticsReport()
        if not self.flagged:
            return report

        # Expected counts are fractional; one decimal keeps the message short.
        rounded = {
            label: round(value, 1) for label, value in self.expected.items()
        }
        message = (
            f"Sample Ratio Mismatch (p={self.p_value:.2e} < "
            f"{self.threshold}): observed {self.observed} vs expected "
            f"{rounded}."
        )
        report.flags.append(message)
        return report
116
+
117
+
class SRMTest:
    """Sample Ratio Mismatch test via Pearson's chi-squared.

    Compares observed arm/cell counts to the counts expected under the
    design's intended allocation. Supports two-arm designs
    (``CRDAssignment``, including rerandomized, and ``BlockedAssignment``)
    and factorial designs (``FactorialAssignment``).

    Parameters
    ----------
    threshold : float, optional
        Decision threshold for the p-value, by default 0.001. An
        experiment is flagged when the chi-squared p-value is below it.
        Must be in (0, 1).
    expected : float, dict, or None, optional
        Intended allocation. By default None, in which case it is inferred
        from the design:

        - two-arm designs: the design's ``p`` (treatment proportion),
          which must lie in (0, 1);
        - factorial designs: a uniform allocation across the cells
          listed in ``cell_sizes_`` (``2**K`` when every cell is
          present).

        When the design has no intended proportion (e.g., ``CRD`` built
        with ``n_treated`` rather than ``p``, or ``design_`` is None),
        ``expected`` must be provided explicitly. For two-arm designs it
        may be a float (the treated proportion in (0, 1)); for any design
        it may be a dict mapping each group label to a positive, finite
        expected proportion (normalized internally).

    Notes
    -----
    SRM is a check on the *observed* data, which in a pipeline may have
    been filtered or joined after randomization. Run directly on a fresh
    ``Assignment`` from ``randomize()`` it will not flag, because the
    library's designs fix the per-arm counts exactly.
    """

    def __init__(
        self,
        threshold: float = 0.001,
        expected: float | dict | None = None,
    ) -> None:
        # bool is a subclass of int, so it has to be excluded explicitly.
        if not isinstance(threshold, (int, float)) or isinstance(
            threshold, bool
        ):
            raise InvalidDesignError(
                f"threshold must be a float in (0, 1), got "
                f"{type(threshold).__name__}."
            )
        # The chained comparison is False for NaN, so NaN is rejected too.
        if not (0.0 < threshold < 1.0):
            raise InvalidDesignError(
                f"threshold must be in (0, 1), got {threshold}."
            )

        if expected is not None and not isinstance(
            expected, (int, float, dict)
        ):
            raise InvalidDesignError(
                f"expected must be None, a float, or a dict, got "
                f"{type(expected).__name__}."
            )
        if isinstance(expected, bool):
            raise InvalidDesignError("expected must not be a bool.")

        # The value/range checks on ``expected`` are deferred to ``run``,
        # because they depend on the kind of assignment being tested.
        self.threshold = threshold
        self.expected = expected

    def run(
        self,
        assignment: CRDAssignment | BlockedAssignment | FactorialAssignment,
    ) -> SRMResult:
        """Run the SRM test on an assignment.

        Parameters
        ----------
        assignment : CRDAssignment, BlockedAssignment, or FactorialAssignment
            The assignment whose realized allocation is being checked.

        Returns
        -------
        SRMResult
            Statistic, p-value, degrees of freedom, the observed and
            expected counts per group, and the flag decision.

        Raises
        ------
        InvalidDesignError
            If the assignment type is unsupported, if the expected
            allocation cannot be inferred and was not provided, if
            ``expected`` is malformed, or if no units were observed.
        """
        observed, proportions = self._observed_and_proportions(assignment)

        total = sum(observed.values())
        if total <= 0:
            raise InvalidDesignError(
                "SRMTest requires at least one observed unit."
            )

        labels = list(observed.keys())
        f_obs = [observed[label] for label in labels]
        # Expected counts are the normalized proportions scaled by the
        # observed total, so f_obs and f_exp describe the same sample size.
        expected_counts = {
            label: proportions[label] * total for label in labels
        }
        f_exp = [expected_counts[label] for label in labels]

        statistic, p_value = chisquare(f_obs, f_exp)

        return SRMResult(
            statistic=float(statistic),
            p_value=float(p_value),
            dof=len(labels) - 1,
            observed=observed,
            expected=expected_counts,
            threshold=self.threshold,
            flagged=bool(p_value < self.threshold),
        )

    def _observed_and_proportions(
        self,
        assignment: CRDAssignment | BlockedAssignment | FactorialAssignment,
    ) -> tuple[dict, dict]:
        """Return observed counts and expected proportions per group.

        Both dicts share the same keys: integer cell indices for factorial
        assignments, ``"control"``/``"treated"`` for two-arm assignments.
        """
        if isinstance(assignment, FactorialAssignment):
            observed = {
                int(cell): int(count)
                for cell, count in sorted(assignment.cell_sizes_.items())
            }
            if self.expected is None:
                # Uniform over the cells reported by the assignment.
                # NOTE(review): if ``cell_sizes_`` can omit empty cells,
                # this is uniform over fewer than 2**K cells and a
                # vanished cell would go unnoticed; confirm against
                # FactorialAssignment.
                k = len(observed)
                proportions = {cell: 1.0 / k for cell in observed}
            else:
                # A scalar ``expected`` is rejected here: only a dict can
                # describe a multi-cell allocation.
                proportions = self._proportions_from_dict(
                    self.expected, list(observed)
                )
            return observed, proportions

        if isinstance(assignment, (CRDAssignment, BlockedAssignment)):
            observed = {
                "control": int(assignment.n_control_),
                "treated": int(assignment.n_treated_),
            }
            if isinstance(self.expected, dict):
                proportions = self._proportions_from_dict(
                    self.expected, ["control", "treated"]
                )
            elif self.expected is not None:
                p_treated = self._validate_proportion(self.expected)
                proportions = {
                    "control": 1.0 - p_treated,
                    "treated": p_treated,
                }
            else:
                design = assignment.design_
                p = getattr(design, "p", None) if design is not None else None
                if p is None:
                    raise InvalidDesignError(
                        "SRMTest cannot infer the expected allocation: the "
                        "design has no intended proportion `p` (e.g., CRD "
                        "built with n_treated, or design_ is None). Pass "
                        "expected=<treated proportion> or a dict of expected "
                        "proportions."
                    )
                # Apply the same (0, 1) check as for a user-supplied
                # scalar: a degenerate or NaN ``p`` would otherwise give
                # chisquare a zero/NaN expected count.
                p_treated = self._validate_proportion(float(p))
                proportions = {
                    "control": 1.0 - p_treated,
                    "treated": p_treated,
                }
            return observed, proportions

        raise InvalidDesignError(
            f"SRMTest supports CRDAssignment, BlockedAssignment, and "
            f"FactorialAssignment; received {type(assignment).__name__}."
        )

    @staticmethod
    def _validate_proportion(value: float) -> float:
        """Validate a scalar treated proportion in (0, 1).

        Returns the value as a ``float``; raises ``InvalidDesignError``
        for bools, non-numeric values, NaN, and values outside (0, 1).
        """
        if isinstance(value, bool) or not isinstance(value, (int, float)):
            raise InvalidDesignError(
                f"expected proportion must be a float in (0, 1), got "
                f"{type(value).__name__}."
            )
        # False for NaN as well as for out-of-range values.
        if not (0.0 < value < 1.0):
            raise InvalidDesignError(
                f"expected proportion must be in (0, 1), got {value}."
            )
        return float(value)

    @staticmethod
    def _proportions_from_dict(expected: dict, labels: list) -> dict:
        """Normalize a dict of expected proportions over ``labels``.

        Parameters
        ----------
        expected : dict
            Mapping from group label to a positive, finite weight. Its
            keys must be exactly ``labels``.
        labels : list
            Group labels of the assignment being tested.

        Returns
        -------
        dict
            Mapping from each label (in ``labels`` order) to its weight
            divided by the sum of all weights.

        Raises
        ------
        InvalidDesignError
            If ``expected`` is not a dict, its keys differ from
            ``labels``, or any weight is a bool, non-numeric,
            non-positive, NaN, or infinite.
        """
        if not isinstance(expected, dict):
            raise InvalidDesignError(
                "expected must be a dict mapping each group to a positive "
                "proportion."
            )
        if set(expected.keys()) != set(labels):
            raise InvalidDesignError(
                f"expected keys {sorted(map(str, expected.keys()))} must "
                f"match the assignment groups {sorted(map(str, labels))}."
            )
        values = list(expected.values())
        # ``0 < v < inf`` is False for NaN, so one chained comparison
        # rejects non-positive, NaN, and infinite weights. Letting NaN or
        # inf through would normalize to NaN proportions and make the
        # test silently report "not flagged".
        inf = float("inf")
        if any(
            isinstance(v, bool)
            or not isinstance(v, (int, float))
            or not (0 < v < inf)
            for v in values
        ):
            raise InvalidDesignError(
                "expected proportions must be positive numbers."
            )
        total = float(sum(values))
        return {label: float(expected[label]) / total for label in labels}
@@ -0,0 +1,23 @@
1
+ """Causal estimators.
2
+
3
+ Estimators consume Assignment objects produced by designs and return
4
+ Results objects. Inference (SE, CI, p-value) is the responsibility of
5
+ inference classes (Phase 4); estimators here compute point estimates
6
+ only.
7
+ """
8
+
9
+ from skxperiments.estimators.blocked_difference_in_means import (
10
+ BlockedDifferenceInMeans,
11
+ )
12
+ from skxperiments.estimators.cuped import CUPED
13
+ from skxperiments.estimators.difference_in_means import DifferenceInMeans
14
+ from skxperiments.estimators.factorial_estimator import FactorialEstimator
15
+ from skxperiments.estimators.lin_estimator import LinEstimator
16
+
17
+ __all__ = [
18
+ "BlockedDifferenceInMeans",
19
+ "CUPED",
20
+ "DifferenceInMeans",
21
+ "FactorialEstimator",
22
+ "LinEstimator",
23
+ ]
@@ -0,0 +1,197 @@
1
+ """Blocked difference-in-means estimator for blocked randomized designs.
2
+
3
+ Computes the SATE estimate as a size-weighted average of within-block
4
+ difference-in-means estimates, the canonical estimator under blocked
5
+ CRD (Imbens & Rubin 2015, Chapter 9).
6
+ """
7
+
8
+ import pandas as pd
9
+
10
+ from skxperiments.core.assignment import BlockedAssignment
11
+ from skxperiments.core.base import BaseEstimator
12
+ from skxperiments.core.exceptions import InvalidDesignError
13
+ from skxperiments.core.results import Results
14
+
15
+
class BlockedDifferenceInMeans(BaseEstimator):
    """Size-weighted ATE estimator for BlockedAssignment.

    Estimates the SATE as a weighted average of within-block
    difference-in-means estimates, weighted by block size:

        ATE_hat = sum_b (n_b / N) * (mean(Y_treated_b) - mean(Y_control_b))

    This is the canonical estimator under blocked CRD (Imbens & Rubin
    2015, Chapter 9). It is unbiased for SATE without any assumption
    on within-block variance, and remains numerically stable even
    with very small blocks (n_b = 2 each).

    This estimator computes the point estimate only. Standard errors,
    confidence intervals, and p-values are produced by inference
    classes (Phase 4) such as ``RandomizationTest`` or ``NeymanCI``.
    The ``Results`` object returned by ``estimate()`` is built without
    ``se``, ``ci``, or ``p_value``.

    TODO v2: add a ``weighting: Literal["size", "precision"] = "size"``
    parameter once there is concrete demand. Precision weighting reduces
    asymptotic variance under within-block homoskedasticity, but it is
    unstable with small blocks and requires a parallel reformulation of
    ``NeymanCI``.

    Parameters
    ----------
    outcome_col : str
        Name of the outcome column in ``assignment.data_``.

    Attributes
    ----------
    assignment_ : BlockedAssignment
        The fitted assignment, stored for downstream use.
    ate_ : float
        Size-weighted point estimate of the ATE.
    block_ates_ : dict
        Mapping from block label to within-block ATE estimate.

    Notes
    -----
    Accepts only ``BlockedAssignment``. ``CRDAssignment`` and
    ``FactorialAssignment`` are rejected via ``DesignEstimatorMismatch``:
    use ``DifferenceInMeans`` or ``FactorialEstimator`` respectively.

    Every block must have at least one treated unit and one control
    unit; otherwise the within-block ATE is undefined and ``fit``
    raises ``InvalidDesignError`` identifying the offending block.

    Examples
    --------
    >>> from skxperiments.design.blocked_crd import BlockedCRD
    >>> from skxperiments.estimators.blocked_difference_in_means import (
    ...     BlockedDifferenceInMeans,
    ... )
    >>> # df has a "block" column, an outcome "y", and other covariates
    >>> design = BlockedCRD(block_col="block", p=0.5, seed=42)
    >>> assignment = design.randomize(df)  # doctest: +SKIP
    >>> estimator = BlockedDifferenceInMeans(outcome_col="y")
    >>> results = estimator.fit(assignment).estimate()  # doctest: +SKIP
    >>> results.ate  # doctest: +SKIP
    """

    def __init__(self, outcome_col: str) -> None:
        self.outcome_col = outcome_col

    def fit(
        self, assignment: BlockedAssignment
    ) -> "BlockedDifferenceInMeans":
        """Compute the per-block and size-weighted ATE estimates.

        Parameters
        ----------
        assignment : BlockedAssignment
            Assignment produced by ``BlockedCRD``.

        Returns
        -------
        BlockedDifferenceInMeans
            Returns self, with ``assignment_``, ``block_ates_`` and
            ``ate_`` set.

        Raises
        ------
        DesignEstimatorMismatch
            If ``assignment`` is not a ``BlockedAssignment`` (raised by
            the inherited ``_validate_assignment_type`` check).
        InvalidDesignError
            If ``outcome_col`` is missing, non-numeric, or contains
            NaN; or if any block has zero treated or zero control
            units.
        """
        self._validate_assignment_type(assignment, BlockedAssignment)

        frame = assignment.data_

        # --- outcome column checks: present, numeric, complete ----------
        if self.outcome_col not in frame.columns:
            raise InvalidDesignError(
                f"Outcome column '{self.outcome_col}' not found in "
                f"assignment.data_. Available columns: "
                f"{list(frame.columns)}."
            )

        outcome = frame[self.outcome_col]

        if not pd.api.types.is_numeric_dtype(outcome):
            raise InvalidDesignError(
                f"Outcome column '{self.outcome_col}' must be numeric. "
                f"dtype found: {outcome.dtype}."
            )

        if outcome.isna().any():
            raise InvalidDesignError(
                f"Outcome column '{self.outcome_col}' contains NaN "
                f"values. Impute or drop NaN before fitting."
            )

        block_col = assignment.block_col_
        treatment_col = assignment.treatment_col_

        # --- phase 1: every block needs both arms -----------------------
        # Checked for all blocks before any estimate is computed, so the
        # failure names the offending block and no state is written.
        for label in assignment.block_sizes_:
            arms = frame.loc[frame[block_col] == label, treatment_col]
            n_t = int((arms == 1).sum())
            n_c = int((arms == 0).sum())
            if 0 in (n_t, n_c):
                raise InvalidDesignError(
                    f"Block '{label}' has {n_t} treated and {n_c} "
                    f"control units; BlockedDifferenceInMeans requires "
                    f"at least 1 of each."
                )

        # --- phase 2: within-block effects and their weighted sum -------
        n_total = assignment.n_units_
        per_block: dict = {}
        weighted_sum = 0.0

        for label, size in assignment.block_sizes_.items():
            in_block = frame[block_col] == label
            y = frame.loc[in_block, self.outcome_col]
            arm = frame.loc[in_block, treatment_col]

            mean_treated = float(y[arm == 1].mean())
            mean_control = float(y[arm == 0].mean())

            effect = mean_treated - mean_control
            per_block[label] = effect
            # Accumulated sequentially so the float result does not
            # depend on the summation algorithm of a builtin.
            weighted_sum += (size / n_total) * effect

        # Fitted state is assigned only after everything succeeded.
        self.assignment_: BlockedAssignment = assignment
        self.block_ates_: dict = per_block
        self.ate_: float = float(weighted_sum)

        return self

    def estimate(self) -> Results:
        """Package the fitted point estimate into a ``Results`` object.

        Returns
        -------
        Results
            Built with ``ate``, ``n_obs``, ``n_treated``, ``n_control``,
            ``estimator_name``, and ``design_name``. ``se``, ``ci`` and
            ``p_value`` are not passed; inference is Phase 4.

        Raises
        ------
        NotFittedError
            If ``fit`` has not been called (raised by the inherited
            ``_check_is_fitted``).
        """
        self._check_is_fitted()

        fitted = self.assignment_
        design = fitted.design_
        # ``design_`` may be None, in which case no design name is
        # reported.
        design_name: str | None = (
            None if design is None else type(design).__name__
        )

        return Results(
            ate=self.ate_,
            n_obs=fitted.n_units_,
            n_treated=fitted.n_treated_,
            n_control=fitted.n_control_,
            estimator_name=type(self).__name__,
            design_name=design_name,
        )