skxperiments 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. skxperiments/__init__.py +5 -0
  2. skxperiments/core/__init__.py +42 -0
  3. skxperiments/core/assignment.py +589 -0
  4. skxperiments/core/base.py +512 -0
  5. skxperiments/core/exceptions.py +145 -0
  6. skxperiments/core/potential_outcomes.py +168 -0
  7. skxperiments/core/results.py +624 -0
  8. skxperiments/design/__init__.py +22 -0
  9. skxperiments/design/balance.py +182 -0
  10. skxperiments/design/blocked_crd.py +157 -0
  11. skxperiments/design/crd.py +162 -0
  12. skxperiments/design/factorial.py +174 -0
  13. skxperiments/design/power.py +233 -0
  14. skxperiments/design/rerandomized_crd.py +319 -0
  15. skxperiments/diagnostics/__init__.py +21 -0
  16. skxperiments/diagnostics/aa_test.py +277 -0
  17. skxperiments/diagnostics/balance_report.py +224 -0
  18. skxperiments/diagnostics/srm.py +327 -0
  19. skxperiments/estimators/__init__.py +23 -0
  20. skxperiments/estimators/blocked_difference_in_means.py +197 -0
  21. skxperiments/estimators/cuped.py +280 -0
  22. skxperiments/estimators/difference_in_means.py +161 -0
  23. skxperiments/estimators/factorial_estimator.py +213 -0
  24. skxperiments/estimators/lin_estimator.py +298 -0
  25. skxperiments/inference/__init__.py +17 -0
  26. skxperiments/inference/bootstrap.py +450 -0
  27. skxperiments/inference/multiple.py +365 -0
  28. skxperiments/inference/neyman.py +386 -0
  29. skxperiments/inference/randomization_test.py +319 -0
  30. skxperiments/pipeline.py +366 -0
  31. skxperiments/reporting/__init__.py +30 -0
  32. skxperiments/reporting/plots.py +411 -0
  33. skxperiments/reporting/summary.py +185 -0
  34. skxperiments-0.1.0.dev0.dist-info/METADATA +272 -0
  35. skxperiments-0.1.0.dev0.dist-info/RECORD +36 -0
  36. skxperiments-0.1.0.dev0.dist-info/WHEEL +4 -0
@@ -0,0 +1,280 @@
1
+ """CUPED (Controlled-experiment Using Pre-Experiment Data) estimator.
2
+
3
+ Reduces variance of the ATE estimate by adjusting the outcome with a
4
+ pre-experiment covariate, typically the same metric measured in a
5
+ period before the experiment.
6
+
7
+ Reference: Deng, A., Xu, Y., Kohavi, R., & Walker, T. (2013).
8
+ Improving the sensitivity of online controlled experiments by
9
+ utilizing pre-experiment data. WSDM 2013.
10
+ """
11
+
12
+ import numpy as np
13
+ import pandas as pd
14
+
15
+ from skxperiments.core.assignment import CRDAssignment
16
+ from skxperiments.core.base import BaseEstimator
17
+ from skxperiments.core.exceptions import InvalidDesignError
18
+ from skxperiments.core.results import Results
19
+
20
+
21
class CUPED(BaseEstimator):
    """CUPED estimator: variance reduction via pre-experiment covariate.

    Estimates the ATE by adjusting the outcome with a pre-experiment
    covariate, typically the same metric measured before the
    experiment started:

        theta = Cov(Y, X_pre) / Var(X_pre)
        ATE = (mean(Y_T) - mean(Y_C)) - theta * (mean(X_pre_T) - mean(X_pre_C))

    Under randomization, the second term has expectation zero, so the
    estimator remains unbiased for the ATE. Variance is reduced by a
    factor of approximately ``1 - rho**2``, where ``rho`` is the
    Pearson correlation between Y and X_pre.

    With a single pre-experiment covariate and **no interaction
    term**, CUPED is asymptotically equivalent to OLS of Y on
    [1, T, X_pre]. This differs from ``LinEstimator``, which adds
    the interaction T * X_centered and is asymptotically optimal
    among linear adjustments.

    Fixed architectural decision: CUPED v1 accepts only
    ``CRDAssignment``. ``BlockedAssignment`` is rejected with a
    suggestion to use ``BlockedDifferenceInMeans`` (CUPED with
    blocking is planned for v2). ``FactorialAssignment`` is rejected
    with ``DesignEstimatorMismatch``.

    Fixed architectural decision: ``pre_experiment_col`` is a parameter
    of ``__init__``, not ``fit``, consistent with sklearn and the
    other estimators in the library.

    Fixed architectural decision: CUPED has no ``inference_mode``
    parameter. Phase 4 will use a CUPED-specific SE formula
    without a mode flag.

    Parameters
    ----------
    outcome_col : str
        Name of the outcome column in ``assignment.data_``.
    pre_experiment_col : str
        Name of the pre-experiment covariate column in
        ``assignment.data_``. Must differ from ``outcome_col`` and
        must not be constant.

    Attributes
    ----------
    assignment_ : CRDAssignment
        The fitted assignment.
    ate_ : float
        CUPED-adjusted point estimate of the ATE.
    theta_ : float
        Adjustment coefficient ``Cov(Y, X_pre) / Var(X_pre)``.
        Interpretable as the slope of OLS regression of Y on X_pre.
    correlation_ : float
        Pearson correlation between Y and X_pre. The expected
        variance reduction relative to ``DifferenceInMeans`` is
        ``1 - correlation_**2``.

    Notes
    -----
    Covariance and variance are computed with ``ddof=1`` (sample
    convention), consistent with ``pandas.Series.cov`` and ``.var``
    defaults. ``np.cov(y, x, ddof=1)[0, 1]`` returns the off-diagonal
    sample covariance; ``np.corrcoef(y, x)[0, 1]`` returns the
    Pearson correlation.

    A constant covariate is detected by comparing its minimum and
    maximum rather than by testing the floating-point sample variance
    against exactly zero: rounding in the mean can leave a tiny
    non-zero variance for a constant column, which would otherwise
    produce a meaningless ``theta``.

    Standard errors, confidence intervals, and p-values are not
    computed here. The ``Results`` object returned by ``estimate()``
    has ``se``, ``ci``, ``p_value`` set to ``None``. Phase 4
    inference classes will compute them.

    Examples
    --------
    >>> from skxperiments.design.crd import CRD
    >>> from skxperiments.estimators.cuped import CUPED
    >>> design = CRD(p=0.5, seed=42)
    >>> assignment = design.randomize(df)  # doctest: +SKIP
    >>> estimator = CUPED(
    ...     outcome_col="y", pre_experiment_col="y_pre"
    ... )
    >>> result = estimator.fit(assignment).estimate()  # doctest: +SKIP
    >>> result.ate  # doctest: +SKIP
    >>> result.extra["theta"]  # doctest: +SKIP
    """

    def __init__(
        self,
        outcome_col: str,
        pre_experiment_col: str,
    ) -> None:
        """Store the column names after validating them.

        Raises
        ------
        InvalidDesignError
            If either name is not a non-empty string, or if both
            names are equal.
        """
        # Validate types and non-emptiness.
        if not isinstance(outcome_col, str) or not outcome_col:
            raise InvalidDesignError(
                f"outcome_col must be a non-empty string, but received "
                f"{outcome_col!r}."
            )
        if not isinstance(pre_experiment_col, str) or not pre_experiment_col:
            raise InvalidDesignError(
                f"pre_experiment_col must be a non-empty string, but "
                f"received {pre_experiment_col!r}."
            )

        # Validate distinctness.
        if outcome_col == pre_experiment_col:
            raise InvalidDesignError(
                f"outcome_col and pre_experiment_col must differ. Using "
                f"the same column ({outcome_col!r}) for both does not "
                f"make sense: theta would be 1, and the adjusted "
                f"outcome would be identically zero."
            )

        self.outcome_col = outcome_col
        self.pre_experiment_col = pre_experiment_col

    @staticmethod
    def _validated_column(
        data: pd.DataFrame, col: str, label: str
    ) -> pd.Series:
        """Return ``data[col]`` once it is known to exist, be numeric, and have no NaN.

        ``label`` is the human-readable prefix used in error messages
        (``"Outcome column"`` or ``"Pre-experiment column"``).
        """
        if col not in data.columns:
            raise InvalidDesignError(
                f"{label} '{col}' not found in assignment.data_. "
                f"Available columns: {list(data.columns)}."
            )

        series = data[col]

        if not pd.api.types.is_numeric_dtype(series):
            raise InvalidDesignError(
                f"{label} '{col}' must be numeric. dtype found: "
                f"{series.dtype}."
            )

        if series.isna().any():
            raise InvalidDesignError(
                f"{label} '{col}' contains NaN values. Impute or drop NaN "
                f"before fitting."
            )

        return series

    def fit(self, assignment: CRDAssignment) -> "CUPED":
        """Fit the CUPED estimator on a CRDAssignment.

        Parameters
        ----------
        assignment : CRDAssignment
            The assignment to fit on.

        Returns
        -------
        CUPED
            Returns self.

        Raises
        ------
        DesignEstimatorMismatch
            If ``assignment`` is not a ``CRDAssignment``. For
            ``BlockedAssignment``, suggests
            ``BlockedDifferenceInMeans``.
        InvalidDesignError
            If ``outcome_col`` or ``pre_experiment_col`` is missing,
            non-numeric, contains NaN; or if ``pre_experiment_col``
            is constant (zero variance) or has fewer than two rows.
        """
        self._validate_assignment_type(assignment, CRDAssignment)

        data = assignment.data_

        # Outcome is validated first, then the covariate, so the first
        # problem reported is always about the outcome column.
        y_series = self._validated_column(
            data, self.outcome_col, "Outcome column"
        )
        x_series = self._validated_column(
            data, self.pre_experiment_col, "Pre-experiment column"
        )

        y = y_series.values.astype(float)
        x_pre = x_series.values.astype(float)

        # Validate non-zero variance. An exact ``var == 0`` test is
        # unreliable for floats (a constant column of e.g. 0.1 can give
        # a variance around 1e-34), so compare the extremes instead.
        # Fewer than two rows leaves the ddof=1 variance undefined.
        if x_pre.size < 2 or x_pre.min() == x_pre.max():
            raise InvalidDesignError(
                f"pre_experiment_col '{self.pre_experiment_col}' has "
                f"zero variance; theta is undefined. CUPED requires a "
                f"non-constant pre-experiment covariate."
            )

        # theta = Cov(Y, X_pre) / Var(X_pre), over all units, ddof=1.
        var_x = float(np.var(x_pre, ddof=1))
        cov_yx = float(np.cov(y, x_pre, ddof=1)[0, 1])
        theta = cov_yx / var_x

        # Pearson correlation, for diagnostic reporting of expected
        # variance reduction (= 1 - correlation**2).
        correlation = float(np.corrcoef(y, x_pre)[0, 1])

        # ATE on adjusted outcome:
        #   tau = (mean(Y_T) - mean(Y_C)) - theta * (mean(X_T) - mean(X_C))
        # NOTE(review): the ids are used to index the raw value arrays,
        # so they are assumed to be positional row indices (or boolean
        # masks) for ``data_`` -- confirm against CRDAssignment.
        treated_idx = assignment.treated_ids()
        control_idx = assignment.control_ids()

        dim_y = float(y[treated_idx].mean() - y[control_idx].mean())
        dim_x = float(x_pre[treated_idx].mean() - x_pre[control_idx].mean())

        self.assignment_: CRDAssignment = assignment
        self.theta_: float = theta
        self.correlation_: float = correlation
        self.ate_: float = dim_y - theta * dim_x

        return self

    def estimate(self) -> Results:
        """Return a Results object with the point estimate and metadata.

        Returns
        -------
        Results
            Results with ``ate``, ``n_obs``, ``n_treated``, ``n_control``,
            ``estimator_name``, ``design_name`` populated, and
            ``extra={"theta": ..., "correlation": ...}`` propagated.
            ``se``, ``ci``, ``p_value`` are ``None`` — inference is
            Phase 4.

        Raises
        ------
        NotFittedError
            If ``fit`` has not been called.
        """
        self._check_is_fitted()

        # The design is optional on the assignment; report its class
        # name only when one is attached.
        design_name: str | None
        if self.assignment_.design_ is not None:
            design_name = type(self.assignment_.design_).__name__
        else:
            design_name = None

        return Results(
            ate=self.ate_,
            n_obs=self.assignment_.n_units_,
            n_treated=self.assignment_.n_treated_,
            n_control=self.assignment_.n_control_,
            estimator_name=type(self).__name__,
            design_name=design_name,
            extra={
                "theta": self.theta_,
                "correlation": self.correlation_,
            },
        )
@@ -0,0 +1,161 @@
1
+ """Difference-in-means estimator for completely randomized designs.
2
+
3
+ Computes the simple ATE estimate for a CRDAssignment as the difference
4
+ between the treated-group mean and the control-group mean of the
5
+ outcome variable.
6
+ """
7
+
8
+ import pandas as pd
9
+
10
+ from skxperiments.core.assignment import CRDAssignment
11
+ from skxperiments.core.base import BaseEstimator
12
+ from skxperiments.core.exceptions import InvalidDesignError
13
+ from skxperiments.core.results import Results
14
+
15
+
16
class DifferenceInMeans(BaseEstimator):
    """Point estimator of the ATE for completely randomized designs.

    The estimate is the gap between the average outcome of treated
    units and the average outcome of control units:

        ATE_hat = mean(Y[treated]) - mean(Y[control])

    Only the point estimate is produced here. Standard errors,
    confidence intervals, and p-values come from the Phase 4 inference
    classes (for example ``RandomizationTest`` or ``NeymanCI``), so the
    ``Results`` returned by ``estimate()`` carries ``se``, ``ci``, and
    ``p_value`` as ``None``.

    Parameters
    ----------
    outcome_col : str
        Name of the outcome column in ``assignment.data_``.

    Attributes
    ----------
    assignment_ : CRDAssignment
        The assignment passed to ``fit``, kept for downstream use.
    ate_ : float
        Point estimate of the ATE.

    Notes
    -----
    Any ``CRDAssignment`` is accepted, whether it came from ``CRD`` or
    from ``ReRandomizedCRD``; the point estimate does not depend on the
    rerandomization criterion. Valid inference under rerandomization
    is the job of a Phase 4 inference class that consumes
    ``rerandomization_metadata``; this estimator ignores that metadata.

    ``BlockedAssignment`` and ``FactorialAssignment`` are rejected
    via ``DesignEstimatorMismatch``: use ``BlockedDifferenceInMeans``
    or ``FactorialEstimator`` respectively.

    Examples
    --------
    >>> from skxperiments.design.crd import CRD
    >>> from skxperiments.estimators.difference_in_means import (
    ...     DifferenceInMeans,
    ... )
    >>> # df has columns "x" and "y" (the outcome)
    >>> design = CRD(p=0.5, seed=42)
    >>> assignment = design.randomize(df)  # doctest: +SKIP
    >>> estimator = DifferenceInMeans(outcome_col="y")
    >>> results = estimator.fit(assignment).estimate()  # doctest: +SKIP
    >>> results.ate  # doctest: +SKIP
    """

    def __init__(self, outcome_col: str) -> None:
        # Stored as-is; the column is checked against the data in fit().
        self.outcome_col = outcome_col

    def fit(self, assignment: CRDAssignment) -> "DifferenceInMeans":
        """Compute the difference in means on a CRDAssignment.

        Parameters
        ----------
        assignment : CRDAssignment
            Assignment produced by ``CRD`` or ``ReRandomizedCRD``.

        Returns
        -------
        DifferenceInMeans
            Returns self.

        Raises
        ------
        DesignEstimatorMismatch
            If ``assignment`` is not a ``CRDAssignment`` (in particular,
            ``BlockedAssignment`` and ``FactorialAssignment`` are
            rejected).
        InvalidDesignError
            If ``outcome_col`` is missing from ``assignment.data_``,
            is not numeric, or contains NaN values.
        """
        self._validate_assignment_type(assignment, CRDAssignment)

        frame = assignment.data_
        col = self.outcome_col

        # Guard clauses: existence, then dtype, then missing values.
        if col not in frame.columns:
            raise InvalidDesignError(
                f"Outcome column '{col}' not found in "
                f"assignment.data_. Available columns: "
                f"{list(frame.columns)}."
            )

        outcome = frame[col]

        if not pd.api.types.is_numeric_dtype(outcome):
            raise InvalidDesignError(
                f"Outcome column '{col}' must be numeric. "
                f"dtype found: {outcome.dtype}."
            )

        if outcome.isna().any():
            raise InvalidDesignError(
                f"Outcome column '{col}' contains NaN "
                f"values. Impute or drop NaN before fitting."
            )

        values = outcome.values
        # The ids index the raw value array directly.
        treated = assignment.treated_ids()
        control = assignment.control_ids()

        self.assignment_: CRDAssignment = assignment
        treated_mean = values[treated].mean()
        control_mean = values[control].mean()
        self.ate_: float = float(treated_mean - control_mean)

        return self

    def estimate(self) -> Results:
        """Package the fitted point estimate into a Results object.

        Returns
        -------
        Results
            Results with ``ate``, ``n_obs``, ``n_treated``, ``n_control``,
            ``estimator_name``, and ``design_name`` populated.
            ``se``, ``ci``, ``p_value`` are ``None`` — inference is
            Phase 4.

        Raises
        ------
        NotFittedError
            If ``fit`` has not been called.
        """
        self._check_is_fitted()

        fitted = self.assignment_
        design = fitted.design_
        # Report the design's class name only when a design is attached.
        design_name: str | None = (
            type(design).__name__ if design is not None else None
        )

        return Results(
            ate=self.ate_,
            n_obs=fitted.n_units_,
            n_treated=fitted.n_treated_,
            n_control=fitted.n_control_,
            estimator_name=type(self).__name__,
            design_name=design_name,
        )
@@ -0,0 +1,213 @@
1
+ """Factorial estimator for 2^K factorial designs.
2
+
3
+ Computes all 2^K - 1 orthogonal contrasts (main effects and
4
+ interactions of all orders) from a FactorialAssignment. Each effect
5
+ is the standard factorial contrast: for a non-empty subset S of
6
+ factors, the effect is the sum over cells of the cell-mean outcome,
7
+ signed by prod_{j in S} (2 * x_j - 1), divided by 2^(K-1).
8
+
9
+ Reference: Box, Hunter & Hunter (2005), Statistics for Experimenters.
10
+ """
11
+
12
+ import itertools
13
+
14
+ import pandas as pd
15
+
16
+ from skxperiments.core.assignment import FactorialAssignment
17
+ from skxperiments.core.base import BaseEstimator
18
+ from skxperiments.core.exceptions import InvalidDesignError
19
+ from skxperiments.core.results import Results
20
+
21
+
22
class FactorialEstimator(BaseEstimator):
    """Estimates main effects and all interactions in a 2^K design.

    For a FactorialAssignment with K factors, returns 2^K - 1 effects:
    K main effects, C(K,2) two-way interactions, ..., and one K-way
    interaction. Each effect is computed via the standard factorial
    contrast: the sum over cells of the cell-mean outcome, signed by
    ``prod_{j in S} (2 * x_j - 1)``, divided by ``2 ** (K - 1)``.

    Effect keys are tuples of factor names in **alphabetical order**,
    independent of the order in which factors were passed to
    ``FactorialDesign``. This guarantees reproducibility: the same
    DataFrame and design parameters produce the same key set, in the
    same order, regardless of factor ordering.

    Subset enumeration uses ``itertools.combinations`` over the sorted
    factor names, iterating r from 1 to K. For K=3 with factors
    ``["A", "B", "C"]``, the order is::

        ("A",), ("B",), ("C",),
        ("A", "B"), ("A", "C"), ("B", "C"),
        ("A", "B", "C")

    This estimator computes the point estimate only. Standard errors,
    confidence intervals, and p-values are produced by inference
    classes (Phase 4). The ``Results`` returned by ``estimate()`` has
    ``se``, ``ci``, ``p_value`` set to ``None``.

    Parameters
    ----------
    outcome_col : str
        Name of the outcome column in ``assignment.data_``.

    Attributes
    ----------
    assignment_ : FactorialAssignment
        The fitted assignment.
    effects_ : dict[tuple[str, ...], float]
        Mapping from effect-name tuple to point estimate. Has
        ``2 ** K - 1`` entries.

    Notes
    -----
    Accepts only ``FactorialAssignment``. ``CRDAssignment`` and
    ``BlockedAssignment`` are rejected via ``DesignEstimatorMismatch``;
    use ``DifferenceInMeans`` or ``BlockedDifferenceInMeans``
    respectively.

    Cell-level outcome means are computed via ``groupby`` on the
    synthetic ``"_cell"`` column of ``assignment.data_``, in a single
    pass over the data. The integer cell index is decoded bitwise:
    bit ``j`` is the level of the ``j``-th factor in
    ``assignment.factor_cols``.

    Examples
    --------
    >>> from skxperiments.design.factorial import FactorialDesign
    >>> from skxperiments.estimators.factorial_estimator import (
    ...     FactorialEstimator,
    ... )
    >>> design = FactorialDesign(
    ...     factors=["A", "B"], n_per_cell=100, seed=42
    ... )
    >>> assignment = design.randomize(df)  # doctest: +SKIP
    >>> result = (
    ...     FactorialEstimator(outcome_col="y")
    ...     .fit(assignment)
    ...     .estimate()
    ... )
    >>> result.effects[("A",)]  # doctest: +SKIP
    """

    def __init__(self, outcome_col: str) -> None:
        # Stored as-is; the column is checked against the data in fit().
        self.outcome_col = outcome_col

    def fit(
        self, assignment: FactorialAssignment
    ) -> "FactorialEstimator":
        """Fit the estimator on a FactorialAssignment.

        Parameters
        ----------
        assignment : FactorialAssignment
            Assignment produced by ``FactorialDesign``.

        Returns
        -------
        FactorialEstimator
            Returns self.

        Raises
        ------
        DesignEstimatorMismatch
            If ``assignment`` is not a ``FactorialAssignment``.
        InvalidDesignError
            If ``outcome_col`` is missing, non-numeric, or contains
            NaN; if any cell has zero units; or if any of the
            ``2 ** K`` cells has no rows in ``assignment.data_``.
        """
        self._validate_assignment_type(assignment, FactorialAssignment)

        data = assignment.data_

        if self.outcome_col not in data.columns:
            raise InvalidDesignError(
                f"Outcome column '{self.outcome_col}' not found in "
                f"assignment.data_. Available columns: "
                f"{list(data.columns)}."
            )

        if not pd.api.types.is_numeric_dtype(data[self.outcome_col]):
            raise InvalidDesignError(
                f"Outcome column '{self.outcome_col}' must be numeric. "
                f"dtype found: {data[self.outcome_col].dtype}."
            )

        if data[self.outcome_col].isna().any():
            raise InvalidDesignError(
                f"Outcome column '{self.outcome_col}' contains NaN "
                f"values. Impute or drop NaN before fitting."
            )

        # Defensive: every cell must have at least 1 unit.
        for cell_idx, size in assignment.cell_sizes_.items():
            if size == 0:
                raise InvalidDesignError(
                    f"Cell {cell_idx} has 0 units; FactorialEstimator "
                    f"requires at least 1 unit per cell."
                )

        # Cell-level outcome means in a single pass over the data.
        cell_means_series = data.groupby("_cell")[self.outcome_col].mean()
        cell_means: dict[int, float] = {
            int(idx): float(val) for idx, val in cell_means_series.items()
        }

        factor_cols = list(assignment.factor_cols)  # original design order
        factor_cols_sorted = sorted(factor_cols)  # alphabetical for keys
        n_factors = len(factor_cols)

        # groupby only yields cells that actually occur in the data. A
        # cell missing here would be silently dropped from every
        # contrast while the sum is still divided by 2^(K-1), so fail
        # loudly instead of returning wrong effects.
        missing_cells = sorted(set(range(2 ** n_factors)) - set(cell_means))
        if missing_cells:
            raise InvalidDesignError(
                f"Cells {missing_cells} have no rows in assignment.data_; "
                f"FactorialEstimator requires at least 1 unit per cell."
            )

        # Position of each factor in the original design order, which is
        # the bit of the cell index that encodes it. setdefault keeps
        # the first occurrence, matching list.index semantics.
        bit_of: dict[str, int] = {}
        for position, name in enumerate(factor_cols):
            bit_of.setdefault(name, position)

        scale = 2 ** (n_factors - 1)
        effects: dict[tuple[str, ...], float] = {}

        # Compute all 2^K - 1 effects via factorial contrasts.
        for order in range(1, n_factors + 1):
            for subset in itertools.combinations(factor_cols_sorted, order):
                bits = [bit_of[name] for name in subset]
                contrast = 0.0
                for cell_idx, cell_mean in cell_means.items():
                    # Sign is the product of +/-1 codings (level 1 ->
                    # +1, level 0 -> -1) of the factors in the subset.
                    sign = 1
                    for j in bits:
                        sign *= 2 * ((cell_idx >> j) & 1) - 1
                    contrast += sign * cell_mean
                effects[subset] = contrast / scale

        self.assignment_: FactorialAssignment = assignment
        self.effects_: dict[tuple[str, ...], float] = effects

        return self

    def estimate(self) -> Results:
        """Return a Results object with all 2^K - 1 effects.

        Returns
        -------
        Results
            Multi-effect ``Results`` with ``effects`` populated,
            ``ate=None``, ``n_treated=None``, ``n_control=None``,
            ``estimator_name`` and ``design_name`` auto-populated.
            ``se``, ``ci``, ``p_value`` are ``None`` — inference is
            Phase 4.

        Raises
        ------
        NotFittedError
            If ``fit`` has not been called.
        """
        self._check_is_fitted()

        # The design is optional on the assignment; report its class
        # name only when one is attached.
        design_name: str | None
        if self.assignment_.design_ is not None:
            design_name = type(self.assignment_.design_).__name__
        else:
            design_name = None

        return Results(
            ate=None,
            effects=self.effects_,
            n_obs=self.assignment_.n_units_,
            n_treated=None,
            n_control=None,
            estimator_name=type(self).__name__,
            design_name=design_name,
        )