skxperiments 0.1.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- skxperiments/__init__.py +5 -0
- skxperiments/core/__init__.py +42 -0
- skxperiments/core/assignment.py +589 -0
- skxperiments/core/base.py +512 -0
- skxperiments/core/exceptions.py +145 -0
- skxperiments/core/potential_outcomes.py +168 -0
- skxperiments/core/results.py +624 -0
- skxperiments/design/__init__.py +22 -0
- skxperiments/design/balance.py +182 -0
- skxperiments/design/blocked_crd.py +157 -0
- skxperiments/design/crd.py +162 -0
- skxperiments/design/factorial.py +174 -0
- skxperiments/design/power.py +233 -0
- skxperiments/design/rerandomized_crd.py +319 -0
- skxperiments/diagnostics/__init__.py +21 -0
- skxperiments/diagnostics/aa_test.py +277 -0
- skxperiments/diagnostics/balance_report.py +224 -0
- skxperiments/diagnostics/srm.py +327 -0
- skxperiments/estimators/__init__.py +23 -0
- skxperiments/estimators/blocked_difference_in_means.py +197 -0
- skxperiments/estimators/cuped.py +280 -0
- skxperiments/estimators/difference_in_means.py +161 -0
- skxperiments/estimators/factorial_estimator.py +213 -0
- skxperiments/estimators/lin_estimator.py +298 -0
- skxperiments/inference/__init__.py +17 -0
- skxperiments/inference/bootstrap.py +450 -0
- skxperiments/inference/multiple.py +365 -0
- skxperiments/inference/neyman.py +386 -0
- skxperiments/inference/randomization_test.py +319 -0
- skxperiments/pipeline.py +366 -0
- skxperiments/reporting/__init__.py +30 -0
- skxperiments/reporting/plots.py +411 -0
- skxperiments/reporting/summary.py +185 -0
- skxperiments-0.1.0.dev0.dist-info/METADATA +272 -0
- skxperiments-0.1.0.dev0.dist-info/RECORD +36 -0
- skxperiments-0.1.0.dev0.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,298 @@
|
|
|
1
|
+
"""Lin (2013) covariate-adjusted ATE estimator.
|
|
2
|
+
|
|
3
|
+
Computes the ATE via OLS of Y on (1, T, X_centered, T * X_centered),
|
|
4
|
+
where X_centered = X - mean(X). The coefficient of T is the ATE
|
|
5
|
+
estimate. Lin's adjustment reduces variance compared to plain
|
|
6
|
+
difference-in-means when covariates predict the outcome, while
|
|
7
|
+
remaining consistent for the ATE under any covariate distribution.
|
|
8
|
+
|
|
9
|
+
Reference: Lin, W. (2013). Agnostic notes on regression adjustments
|
|
10
|
+
to experimental data: Reexamining Freedman's critique. Annals of
|
|
11
|
+
Applied Statistics, 7(1), 295-318.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
import numpy as np
|
|
15
|
+
import pandas as pd
|
|
16
|
+
|
|
17
|
+
from skxperiments.core.assignment import (
|
|
18
|
+
BlockedAssignment,
|
|
19
|
+
CRDAssignment,
|
|
20
|
+
)
|
|
21
|
+
from skxperiments.core.base import BaseEstimator
|
|
22
|
+
from skxperiments.core.exceptions import InvalidDesignError
|
|
23
|
+
from skxperiments.core.results import Results
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class LinEstimator(BaseEstimator):
    """Lin (2013) covariate-adjusted ATE estimator.

    Estimates the ATE via OLS of Y on a design matrix that includes
    the treatment indicator, mean-centered covariates, and their
    treatment-covariate interactions:

        ATE_hat = coef_T in OLS of Y on [1, T, X_centered, T * X_centered]

    where X_centered = X - mean(X). The coefficient of T is the ATE.

    Lin's adjustment reduces the variance of the ATE estimate compared
    to plain difference-in-means whenever covariates predict the
    outcome, and remains consistent for the ATE without distributional
    assumptions on covariates.

    Fixed architectural decision (item 18): ``inference_mode`` is
    documentational metadata only at this stage. ``LinEstimator``
    computes the point estimate only — it does not compute HC2,
    Neyman, or any standard error. Inference is the responsibility
    of Phase 4 classes, which read ``Results.extra["inference_mode"]``
    to decide which variance formula to apply.

    Parameters
    ----------
    outcome_col : str
        Name of the outcome column in ``assignment.data_``.
    covariates : list of str
        Names of covariate columns. Must be a non-empty list of
        strings. For unadjusted ATE, use ``DifferenceInMeans``.
    inference_mode : {"finite_population", "superpopulation"}, optional
        Documentational flag propagated to ``Results.extra``. The
        Phase 4 inference class reads this to pick a variance
        formula. Default ``"finite_population"``.

    Attributes
    ----------
    assignment_ : CRDAssignment or BlockedAssignment
        The fitted assignment.
    ate_ : float
        Lin-adjusted point estimate of the ATE.
    coefficients_ : np.ndarray
        Full vector of OLS coefficients with shape ``(2 + 2*K,)``,
        where K is the number of covariates. Layout:

        - ``coefficients_[0]`` — intercept
        - ``coefficients_[1]`` — ATE (coefficient of T)
        - ``coefficients_[2 : 2 + K]`` — centered-covariate coefficients
        - ``coefficients_[2 + K : 2 + 2*K]`` — interaction coefficients
          for T times each centered covariate

    inference_mode_ : str
        Copy of ``inference_mode`` set during ``fit``. Follows the
        sklearn convention of trailing-underscore attributes for
        learned state, even when the value is merely mirrored from
        ``__init__`` without transformation.

    Notes
    -----
    Accepts ``CRDAssignment`` or ``BlockedAssignment``.
    ``FactorialAssignment`` is rejected via ``DesignEstimatorMismatch``.

    With ``BlockedAssignment``, ``LinEstimator`` treats the data as a
    single sample and does **not** use block structure. Users who
    want to exploit block structure should use
    ``BlockedDifferenceInMeans``.

    Constant covariates (a single distinct value) make the design
    matrix singular and are rejected at ``fit``. Constancy is detected
    by comparing each column's minimum and maximum after conversion to
    float, which is exact, rather than by testing a computed variance
    against zero.

    Examples
    --------
    >>> from skxperiments.design.crd import CRD
    >>> from skxperiments.estimators.lin_estimator import LinEstimator
    >>> design = CRD(p=0.5, seed=42)
    >>> assignment = design.randomize(df)  # doctest: +SKIP
    >>> estimator = LinEstimator(
    ...     outcome_col="y", covariates=["x1", "x2"]
    ... )
    >>> result = estimator.fit(assignment).estimate()  # doctest: +SKIP
    >>> result.ate  # doctest: +SKIP
    """

    # Accepted values for ``inference_mode``; checked in ``__init__``.
    _VALID_INFERENCE_MODES = ("finite_population", "superpopulation")

    def __init__(
        self,
        outcome_col: str,
        covariates: list[str],
        inference_mode: str = "finite_population",
    ) -> None:
        """Validate and store the estimator configuration.

        Raises
        ------
        InvalidDesignError
            If ``covariates`` is not a list, is empty, or contains a
            non-string element; or if ``inference_mode`` is not one of
            ``_VALID_INFERENCE_MODES``.
        """
        # Validate covariates: must be a non-empty list of strings.
        if not isinstance(covariates, list):
            raise InvalidDesignError(
                f"covariates must be a list of strings, but received "
                f"{type(covariates).__name__}."
            )
        if not covariates:
            raise InvalidDesignError(
                "LinEstimator requires at least one covariate; use "
                "DifferenceInMeans for unadjusted ATE."
            )
        for c in covariates:
            if not isinstance(c, str):
                raise InvalidDesignError(
                    f"covariates must be a list of strings; found "
                    f"element of type {type(c).__name__}: {c!r}."
                )

        # Validate inference_mode against the allowed set.
        if inference_mode not in self._VALID_INFERENCE_MODES:
            raise InvalidDesignError(
                f"inference_mode must be one of "
                f"{self._VALID_INFERENCE_MODES}, but received "
                f"{inference_mode!r}."
            )

        # Parameters are stored unmodified; fitted state is set in ``fit``.
        self.outcome_col = outcome_col
        self.covariates = covariates
        self.inference_mode = inference_mode

    def fit(
        self, assignment: CRDAssignment | BlockedAssignment
    ) -> "LinEstimator":
        """Fit the Lin estimator on a CRDAssignment or BlockedAssignment.

        Parameters
        ----------
        assignment : CRDAssignment or BlockedAssignment
            The assignment to fit on.

        Returns
        -------
        LinEstimator
            Returns self.

        Raises
        ------
        DesignEstimatorMismatch
            If ``assignment`` is not a ``CRDAssignment`` or
            ``BlockedAssignment``.
        InvalidDesignError
            If ``outcome_col`` is missing, non-numeric, or has NaN;
            if any covariate is missing, non-numeric, has NaN, or is
            constant (a single distinct value); or if
            ``assignment.data_`` has no rows.
        """
        self._validate_assignment_type(
            assignment, (CRDAssignment, BlockedAssignment)
        )

        data = assignment.data_

        # Validate outcome
        if self.outcome_col not in data.columns:
            raise InvalidDesignError(
                f"Outcome column '{self.outcome_col}' not found in "
                f"assignment.data_. Available columns: "
                f"{list(data.columns)}."
            )

        if not pd.api.types.is_numeric_dtype(data[self.outcome_col]):
            raise InvalidDesignError(
                f"Outcome column '{self.outcome_col}' must be numeric. "
                f"dtype found: {data[self.outcome_col].dtype}."
            )

        if data[self.outcome_col].isna().any():
            raise InvalidDesignError(
                f"Outcome column '{self.outcome_col}' contains NaN "
                f"values. Impute or drop NaN before fitting."
            )

        # Validate covariates
        missing = [c for c in self.covariates if c not in data.columns]
        if missing:
            raise InvalidDesignError(
                f"Covariates not found in assignment.data_: {missing}. "
                f"Available columns: {list(data.columns)}."
            )

        non_numeric = [
            c for c in self.covariates
            if not pd.api.types.is_numeric_dtype(data[c])
        ]
        if non_numeric:
            raise InvalidDesignError(
                f"Covariates must be numeric: {non_numeric} are not."
            )

        with_nan = [c for c in self.covariates if data[c].isna().any()]
        if with_nan:
            raise InvalidDesignError(
                f"Covariates contain NaN values: {with_nan}. "
                f"Impute or drop NaN before fitting."
            )

        # Covariate matrix exactly as it enters the regression.
        X = data[self.covariates].values.astype(float)

        # With zero rows there is nothing to average or regress on.
        if X.shape[0] == 0:
            raise InvalidDesignError(
                "assignment.data_ has no rows; LinEstimator needs at "
                "least one unit to fit."
            )

        # A column is constant iff its minimum equals its maximum. This is
        # exact, unlike testing a computed variance against 0: rounding in
        # the mean can leave a tiny positive variance for a column of
        # identical floats (e.g. 0.1 repeated three times), which would let
        # a constant covariate through.
        is_constant = X.min(axis=0) == X.max(axis=0)
        constants = [
            c for c, flag in zip(self.covariates, is_constant) if flag
        ]
        if constants:
            raise InvalidDesignError(
                f"Covariates are constant (zero variance): "
                f"{constants}. A constant covariate makes the design "
                f"matrix singular; remove or replace them."
            )

        # Build the Lin design matrix.
        X_centered = X - X.mean(axis=0)

        # NOTE(review): the treatment column is cast to float without
        # further checks — presumably coded 0/1 by the assignment class;
        # confirm against CRDAssignment / BlockedAssignment.
        T = (
            data[assignment.treatment_col_]
            .values.astype(float)
            .reshape(-1, 1)
        )
        y = data[self.outcome_col].values.astype(float)

        n = len(y)
        intercept = np.ones((n, 1))
        interaction = T * X_centered  # broadcasting: (n,1)*(n,K) -> (n,K)
        design_matrix = np.hstack(
            [intercept, T, X_centered, interaction]
        )

        # OLS via lstsq — more stable than inverting X.T @ X.
        # NOTE: only constant covariates are screened above; other sources
        # of rank deficiency are not checked here.
        coefficients, *_ = np.linalg.lstsq(
            design_matrix, y, rcond=None
        )

        # Layout: [intercept, T, X_centered (K), T*X_centered (K)].
        # coefficients[1] is the ATE.
        self.assignment_: CRDAssignment | BlockedAssignment = assignment
        self.ate_: float = float(coefficients[1])
        self.coefficients_: np.ndarray = coefficients
        self.inference_mode_: str = self.inference_mode

        return self

    def estimate(self) -> Results:
        """Return a Results object with the point estimate and metadata.

        Returns
        -------
        Results
            Results with ``ate``, ``n_obs``, ``n_treated``, ``n_control``,
            ``estimator_name``, ``design_name`` populated, and
            ``extra={"inference_mode": <value>}`` propagated.
            ``se``, ``ci``, ``p_value`` are ``None`` — inference is
            Phase 4.

        Raises
        ------
        NotFittedError
            If ``fit`` has not been called.
        """
        self._check_is_fitted()

        # The assignment may carry no design object; report None then.
        design = self.assignment_.design_
        design_name: str | None = (
            type(design).__name__ if design is not None else None
        )

        return Results(
            ate=self.ate_,
            n_obs=self.assignment_.n_units_,
            n_treated=self.assignment_.n_treated_,
            n_control=self.assignment_.n_control_,
            estimator_name=type(self).__name__,
            design_name=design_name,
            extra={"inference_mode": self.inference_mode_},
        )
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"""Inference module for hypothesis testing and confidence intervals.
|
|
2
|
+
|
|
3
|
+
Phase 4: randomization-based inference, multiple testing correction,
|
|
4
|
+
Neyman variance, bootstrap.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from skxperiments.inference.bootstrap import BootstrapCI
|
|
8
|
+
from skxperiments.inference.multiple import MultipleTestingCorrection
|
|
9
|
+
from skxperiments.inference.neyman import NeymanCI
|
|
10
|
+
from skxperiments.inference.randomization_test import RandomizationTest
|
|
11
|
+
|
|
12
|
+
__all__ = [
|
|
13
|
+
"BootstrapCI",
|
|
14
|
+
"MultipleTestingCorrection",
|
|
15
|
+
"NeymanCI",
|
|
16
|
+
"RandomizationTest",
|
|
17
|
+
]
|