skxperiments-0.1.0.dev0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. skxperiments/__init__.py +5 -0
  2. skxperiments/core/__init__.py +42 -0
  3. skxperiments/core/assignment.py +589 -0
  4. skxperiments/core/base.py +512 -0
  5. skxperiments/core/exceptions.py +145 -0
  6. skxperiments/core/potential_outcomes.py +168 -0
  7. skxperiments/core/results.py +624 -0
  8. skxperiments/design/__init__.py +22 -0
  9. skxperiments/design/balance.py +182 -0
  10. skxperiments/design/blocked_crd.py +157 -0
  11. skxperiments/design/crd.py +162 -0
  12. skxperiments/design/factorial.py +174 -0
  13. skxperiments/design/power.py +233 -0
  14. skxperiments/design/rerandomized_crd.py +319 -0
  15. skxperiments/diagnostics/__init__.py +21 -0
  16. skxperiments/diagnostics/aa_test.py +277 -0
  17. skxperiments/diagnostics/balance_report.py +224 -0
  18. skxperiments/diagnostics/srm.py +327 -0
  19. skxperiments/estimators/__init__.py +23 -0
  20. skxperiments/estimators/blocked_difference_in_means.py +197 -0
  21. skxperiments/estimators/cuped.py +280 -0
  22. skxperiments/estimators/difference_in_means.py +161 -0
  23. skxperiments/estimators/factorial_estimator.py +213 -0
  24. skxperiments/estimators/lin_estimator.py +298 -0
  25. skxperiments/inference/__init__.py +17 -0
  26. skxperiments/inference/bootstrap.py +450 -0
  27. skxperiments/inference/multiple.py +365 -0
  28. skxperiments/inference/neyman.py +386 -0
  29. skxperiments/inference/randomization_test.py +319 -0
  30. skxperiments/pipeline.py +366 -0
  31. skxperiments/reporting/__init__.py +30 -0
  32. skxperiments/reporting/plots.py +411 -0
  33. skxperiments/reporting/summary.py +185 -0
  34. skxperiments-0.1.0.dev0.dist-info/METADATA +272 -0
  35. skxperiments-0.1.0.dev0.dist-info/RECORD +36 -0
  36. skxperiments-0.1.0.dev0.dist-info/WHEEL +4 -0
@@ -0,0 +1,298 @@
1
+ """Lin (2013) covariate-adjusted ATE estimator.
2
+
3
+ Computes the ATE via OLS of Y on (1, T, X_centered, T * X_centered),
4
+ where X_centered = X - mean(X). The coefficient of T is the ATE
5
+ estimate. Lin's adjustment reduces variance compared to plain
6
+ difference-in-means when covariates predict the outcome, while
7
+ remaining consistent for the ATE under any covariate distribution.
8
+
9
+ Reference: Lin, W. (2013). Agnostic notes on regression adjustments
10
+ to experimental data: Reexamining Freedman's critique. Annals of
11
+ Applied Statistics, 7(1), 295-318.
12
+ """
13
+
14
+ import numpy as np
15
+ import pandas as pd
16
+
17
+ from skxperiments.core.assignment import (
18
+ BlockedAssignment,
19
+ CRDAssignment,
20
+ )
21
+ from skxperiments.core.base import BaseEstimator
22
+ from skxperiments.core.exceptions import InvalidDesignError
23
+ from skxperiments.core.results import Results
24
+
25
+
26
class LinEstimator(BaseEstimator):
    """Lin (2013) covariate-adjusted ATE estimator.

    Estimates the ATE via OLS of Y on a design matrix that includes
    the treatment indicator, mean-centered covariates, and their
    treatment-covariate interactions:

        ATE_hat = coef_T in OLS of Y on [1, T, X_centered, T * X_centered]

    where X_centered = X - mean(X). The coefficient of T is the ATE.

    Lin's adjustment reduces the variance of the ATE estimate compared
    to plain difference-in-means whenever covariates predict the
    outcome, and remains consistent for the ATE without distributional
    assumptions on covariates.

    Fixed architectural decision (item 18): ``inference_mode`` is
    documentational metadata only at this stage. ``LinEstimator``
    computes the point estimate only — it does not compute HC2,
    Neyman, or any standard error. Inference is the responsibility
    of Phase 4 classes, which read ``Results.extra["inference_mode"]``
    to decide which variance formula to apply.

    Parameters
    ----------
    outcome_col : str
        Name of the outcome column in ``assignment.data_``.
    covariates : list of str
        Names of covariate columns. Must be a non-empty list of
        strings. For unadjusted ATE, use ``DifferenceInMeans``.
    inference_mode : {"finite_population", "superpopulation"}, optional
        Documentational flag propagated to ``Results.extra``. The
        Phase 4 inference class reads this to pick a variance
        formula. Default ``"finite_population"``.

    Attributes
    ----------
    assignment_ : CRDAssignment or BlockedAssignment
        The fitted assignment.
    ate_ : float
        Lin-adjusted point estimate of the ATE.
    coefficients_ : np.ndarray
        Full vector of OLS coefficients with shape ``(2 + 2*K,)``,
        where K is the number of covariates. Layout:

        - ``coefficients_[0]`` — intercept
        - ``coefficients_[1]`` — ATE (coefficient of T)
        - ``coefficients_[2 : 2 + K]`` — centered-covariate coefficients
        - ``coefficients_[2 + K : 2 + 2*K]`` — interaction coefficients
          for T times each centered covariate

    inference_mode_ : str
        Copy of ``inference_mode`` set during ``fit``. Follows the
        sklearn convention of trailing-underscore attributes for
        learned state, even when the value is merely mirrored from
        ``__init__`` without transformation.

    Notes
    -----
    Accepts ``CRDAssignment`` or ``BlockedAssignment``.
    ``FactorialAssignment`` is rejected via ``DesignEstimatorMismatch``.

    With ``BlockedAssignment``, ``LinEstimator`` treats the data as a
    single sample and does **not** use block structure. Users who
    want to exploit block structure should use
    ``BlockedDifferenceInMeans``.

    Constant covariates (zero variance) make the design matrix
    singular and are rejected at ``fit``. Any other rank deficiency
    of the design matrix (collinear or duplicated covariates, a
    treatment arm with no units, a covariate that is constant within
    one arm, or fewer units than regression parameters) is rejected
    as well, because the coefficient of T is then not identified.

    Examples
    --------
    >>> from skxperiments.design.crd import CRD
    >>> from skxperiments.estimators.lin_estimator import LinEstimator
    >>> design = CRD(p=0.5, seed=42)
    >>> assignment = design.randomize(df)  # doctest: +SKIP
    >>> estimator = LinEstimator(
    ...     outcome_col="y", covariates=["x1", "x2"]
    ... )
    >>> result = estimator.fit(assignment).estimate()  # doctest: +SKIP
    >>> result.ate  # doctest: +SKIP
    """

    _VALID_INFERENCE_MODES = ("finite_population", "superpopulation")

    def __init__(
        self,
        outcome_col: str,
        covariates: list[str],
        inference_mode: str = "finite_population",
    ) -> None:
        """Validate and store the estimator configuration.

        Raises
        ------
        InvalidDesignError
            If ``covariates`` is not a non-empty list of strings, or
            if ``inference_mode`` is not one of
            ``_VALID_INFERENCE_MODES``.
        """
        # Validate covariates
        if not isinstance(covariates, list):
            raise InvalidDesignError(
                f"covariates must be a list of strings, but received "
                f"{type(covariates).__name__}."
            )
        if not covariates:
            raise InvalidDesignError(
                "LinEstimator requires at least one covariate; use "
                "DifferenceInMeans for unadjusted ATE."
            )
        for c in covariates:
            if not isinstance(c, str):
                raise InvalidDesignError(
                    f"covariates must be a list of strings; found "
                    f"element of type {type(c).__name__}: {c!r}."
                )

        # Validate inference_mode
        if inference_mode not in self._VALID_INFERENCE_MODES:
            raise InvalidDesignError(
                f"inference_mode must be one of "
                f"{self._VALID_INFERENCE_MODES}, but received "
                f"{inference_mode!r}."
            )

        self.outcome_col = outcome_col
        self.covariates = covariates
        self.inference_mode = inference_mode

    def fit(
        self, assignment: CRDAssignment | BlockedAssignment
    ) -> "LinEstimator":
        """Fit the Lin estimator on a CRDAssignment or BlockedAssignment.

        Parameters
        ----------
        assignment : CRDAssignment or BlockedAssignment
            The assignment to fit on.

        Returns
        -------
        LinEstimator
            Returns self.

        Raises
        ------
        DesignEstimatorMismatch
            If ``assignment`` is not a ``CRDAssignment`` or
            ``BlockedAssignment``.
        InvalidDesignError
            If ``outcome_col`` is missing, non-numeric, or has NaN;
            if any covariate is missing, non-numeric, has NaN, or is
            constant (zero variance); or if the resulting design
            matrix is rank-deficient, so the ATE coefficient is not
            identified.
        """
        self._validate_assignment_type(
            assignment, (CRDAssignment, BlockedAssignment)
        )

        data = assignment.data_

        # Validate outcome
        if self.outcome_col not in data.columns:
            raise InvalidDesignError(
                f"Outcome column '{self.outcome_col}' not found in "
                f"assignment.data_. Available columns: "
                f"{list(data.columns)}."
            )

        if not pd.api.types.is_numeric_dtype(data[self.outcome_col]):
            raise InvalidDesignError(
                f"Outcome column '{self.outcome_col}' must be numeric. "
                f"dtype found: {data[self.outcome_col].dtype}."
            )

        if data[self.outcome_col].isna().any():
            raise InvalidDesignError(
                f"Outcome column '{self.outcome_col}' contains NaN "
                f"values. Impute or drop NaN before fitting."
            )

        # Validate covariates
        missing = [c for c in self.covariates if c not in data.columns]
        if missing:
            raise InvalidDesignError(
                f"Covariates not found in assignment.data_: {missing}. "
                f"Available columns: {list(data.columns)}."
            )

        non_numeric = [
            c for c in self.covariates
            if not pd.api.types.is_numeric_dtype(data[c])
        ]
        if non_numeric:
            raise InvalidDesignError(
                f"Covariates must be numeric: {non_numeric} are not."
            )

        with_nan = [c for c in self.covariates if data[c].isna().any()]
        if with_nan:
            raise InvalidDesignError(
                f"Covariates contain NaN values: {with_nan}. "
                f"Impute or drop NaN before fitting."
            )

        # Compare min and max instead of testing ``var == 0``: the
        # floating-point variance of a constant column is not always
        # exactly zero (a column of 0.1 values yields ~1.9e-34), which
        # would let a constant covariate slip through the check.
        constants = [
            c for c in self.covariates
            if data[c].min() == data[c].max()
        ]
        if constants:
            raise InvalidDesignError(
                f"Covariates are constant (zero variance): "
                f"{constants}. A constant covariate makes the design "
                f"matrix singular; remove or replace them."
            )

        # Build the Lin design matrix.
        X = data[self.covariates].values.astype(float)
        X_centered = X - X.mean(axis=0)

        T = (
            data[assignment.treatment_col_]
            .values.astype(float)
            .reshape(-1, 1)
        )
        y = data[self.outcome_col].values.astype(float)

        n = len(y)
        intercept = np.ones((n, 1))
        interaction = T * X_centered  # broadcasting: (n,1)*(n,K) -> (n,K)
        design_matrix = np.hstack(
            [intercept, T, X_centered, interaction]
        )

        # OLS via lstsq — more stable than inverting X.T @ X.
        coefficients, _, rank, _ = np.linalg.lstsq(
            design_matrix, y, rcond=None
        )

        # lstsq does not fail on a rank-deficient matrix: it returns
        # the minimum-norm solution, whose T coefficient is arbitrary.
        # Reject such designs instead of reporting a meaningless ATE.
        n_params = design_matrix.shape[1]
        if rank < n_params:
            raise InvalidDesignError(
                f"Design matrix is rank-deficient (rank {int(rank)} < "
                f"{n_params} columns); the ATE coefficient is not "
                f"identified. Check for collinear or duplicated "
                f"covariates, a treatment arm with no units, "
                f"covariates that are constant within an arm, or "
                f"fewer units than regression parameters."
            )

        # Layout: [intercept, T, X_centered (K), T*X_centered (K)].
        # coefficients[1] is the ATE.
        self.assignment_: CRDAssignment | BlockedAssignment = assignment
        self.ate_: float = float(coefficients[1])
        self.coefficients_: np.ndarray = coefficients
        self.inference_mode_: str = self.inference_mode

        return self

    def estimate(self) -> Results:
        """Return a Results object with the point estimate and metadata.

        Returns
        -------
        Results
            Results with ``ate``, ``n_obs``, ``n_treated``, ``n_control``,
            ``estimator_name``, ``design_name`` populated, and
            ``extra={"inference_mode": <value>}`` propagated.
            ``se``, ``ci``, ``p_value`` are ``None`` — inference is
            Phase 4.

        Raises
        ------
        NotFittedError
            If ``fit`` has not been called.
        """
        self._check_is_fitted()

        # The assignment may carry no design object; report None then.
        design = self.assignment_.design_
        design_name: str | None = (
            type(design).__name__ if design is not None else None
        )

        return Results(
            ate=self.ate_,
            n_obs=self.assignment_.n_units_,
            n_treated=self.assignment_.n_treated_,
            n_control=self.assignment_.n_control_,
            estimator_name=type(self).__name__,
            design_name=design_name,
            extra={"inference_mode": self.inference_mode_},
        )
@@ -0,0 +1,17 @@
1
"""Inference module for hypothesis testing and confidence intervals.

Phase 4: randomization-based inference, multiple testing correction,
Neyman variance, bootstrap.

Re-exports ``BootstrapCI``, ``MultipleTestingCorrection``, ``NeymanCI``
and ``RandomizationTest`` from their submodules so they can be imported
directly from ``skxperiments.inference``.
"""

from skxperiments.inference.bootstrap import BootstrapCI
from skxperiments.inference.multiple import MultipleTestingCorrection
from skxperiments.inference.neyman import NeymanCI
from skxperiments.inference.randomization_test import RandomizationTest

# Names exported by ``from skxperiments.inference import *``.
__all__ = [
    "BootstrapCI",
    "MultipleTestingCorrection",
    "NeymanCI",
    "RandomizationTest",
]