skxperiments 0.1.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- skxperiments/__init__.py +5 -0
- skxperiments/core/__init__.py +42 -0
- skxperiments/core/assignment.py +589 -0
- skxperiments/core/base.py +512 -0
- skxperiments/core/exceptions.py +145 -0
- skxperiments/core/potential_outcomes.py +168 -0
- skxperiments/core/results.py +624 -0
- skxperiments/design/__init__.py +22 -0
- skxperiments/design/balance.py +182 -0
- skxperiments/design/blocked_crd.py +157 -0
- skxperiments/design/crd.py +162 -0
- skxperiments/design/factorial.py +174 -0
- skxperiments/design/power.py +233 -0
- skxperiments/design/rerandomized_crd.py +319 -0
- skxperiments/diagnostics/__init__.py +21 -0
- skxperiments/diagnostics/aa_test.py +277 -0
- skxperiments/diagnostics/balance_report.py +224 -0
- skxperiments/diagnostics/srm.py +327 -0
- skxperiments/estimators/__init__.py +23 -0
- skxperiments/estimators/blocked_difference_in_means.py +197 -0
- skxperiments/estimators/cuped.py +280 -0
- skxperiments/estimators/difference_in_means.py +161 -0
- skxperiments/estimators/factorial_estimator.py +213 -0
- skxperiments/estimators/lin_estimator.py +298 -0
- skxperiments/inference/__init__.py +17 -0
- skxperiments/inference/bootstrap.py +450 -0
- skxperiments/inference/multiple.py +365 -0
- skxperiments/inference/neyman.py +386 -0
- skxperiments/inference/randomization_test.py +319 -0
- skxperiments/pipeline.py +366 -0
- skxperiments/reporting/__init__.py +30 -0
- skxperiments/reporting/plots.py +411 -0
- skxperiments/reporting/summary.py +185 -0
- skxperiments-0.1.0.dev0.dist-info/METADATA +272 -0
- skxperiments-0.1.0.dev0.dist-info/RECORD +36 -0
- skxperiments-0.1.0.dev0.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
"""Covariate balance diagnostics for experimental designs.
|
|
2
|
+
|
|
3
|
+
Provides check_balance, a standalone function that computes the
|
|
4
|
+
standardized mean difference (SMD) between treatment and control
|
|
5
|
+
groups for each covariate in an Assignment.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import numpy as np
|
|
9
|
+
import pandas as pd
|
|
10
|
+
|
|
11
|
+
from skxperiments.core.assignment import BaseAssignment
|
|
12
|
+
from skxperiments.core.exceptions import (
|
|
13
|
+
InsufficientDataError,
|
|
14
|
+
InvalidDesignError,
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def check_balance(
|
|
19
|
+
assignment: BaseAssignment,
|
|
20
|
+
covariates: list[str] | None = None,
|
|
21
|
+
) -> pd.DataFrame:
|
|
22
|
+
"""Compute covariate balance between treatment and control groups.
|
|
23
|
+
|
|
24
|
+
For each covariate, returns the mean in each group, the pooled
|
|
25
|
+
standard deviation, and the standardized mean difference (SMD).
|
|
26
|
+
|
|
27
|
+
Parameters
|
|
28
|
+
----------
|
|
29
|
+
assignment : BaseAssignment
|
|
30
|
+
Assignment object produced by a design. Must expose ``data_``
|
|
31
|
+
(DataFrame with treatment column attached) and ``treatment_col_``.
|
|
32
|
+
covariates : list of str or None, optional
|
|
33
|
+
Names of covariates to check. If None, all numeric columns in
|
|
34
|
+
``assignment.data_`` except the treatment column are used, in
|
|
35
|
+
the order they appear in the DataFrame. Boolean columns count
|
|
36
|
+
as numeric. By default None.
|
|
37
|
+
|
|
38
|
+
Returns
|
|
39
|
+
-------
|
|
40
|
+
pd.DataFrame
|
|
41
|
+
DataFrame with one row per covariate and columns:
|
|
42
|
+
``covariate``, ``mean_treated``, ``mean_control``,
|
|
43
|
+
``std_pooled``, ``smd``. The index is a default RangeIndex.
|
|
44
|
+
|
|
45
|
+
Raises
|
|
46
|
+
------
|
|
47
|
+
InvalidDesignError
|
|
48
|
+
If a name in ``covariates`` is not a column of ``assignment.data_``,
|
|
49
|
+
or if any selected covariate contains NaN values.
|
|
50
|
+
InsufficientDataError
|
|
51
|
+
If ``covariates`` is None and no numeric columns are available
|
|
52
|
+
after excluding the treatment column.
|
|
53
|
+
|
|
54
|
+
Notes
|
|
55
|
+
-----
|
|
56
|
+
The pooled standard deviation follows the convention common in
|
|
57
|
+
the SMD literature for randomized experiments (Austin 2009;
|
|
58
|
+
Stuart 2010):
|
|
59
|
+
|
|
60
|
+
std_pooled = sqrt((var_treated + var_control) / 2)
|
|
61
|
+
|
|
62
|
+
where each variance is computed with ``ddof=1``. When
|
|
63
|
+
``std_pooled == 0`` (no within-group variation), the SMD is NaN
|
|
64
|
+
rather than raising an exception.
|
|
65
|
+
|
|
66
|
+
The function does not modify ``assignment.data_``.
|
|
67
|
+
|
|
68
|
+
References
|
|
69
|
+
----------
|
|
70
|
+
Austin, P. C. (2009). Balance diagnostics for comparing the
|
|
71
|
+
distribution of baseline covariates between treatment groups in
|
|
72
|
+
propensity-score matched samples. Statistics in Medicine.
|
|
73
|
+
|
|
74
|
+
Stuart, E. A. (2010). Matching methods for causal inference:
|
|
75
|
+
A review and a look forward. Statistical Science.
|
|
76
|
+
|
|
77
|
+
Examples
|
|
78
|
+
--------
|
|
79
|
+
>>> import numpy as np
|
|
80
|
+
>>> import pandas as pd
|
|
81
|
+
>>> from skxperiments.core.assignment import CRDAssignment
|
|
82
|
+
>>> rng = np.random.default_rng(42)
|
|
83
|
+
>>> df = pd.DataFrame({
|
|
84
|
+
... "x1": rng.normal(size=100),
|
|
85
|
+
... "x2": rng.normal(size=100),
|
|
86
|
+
... "treatment": rng.integers(0, 2, size=100),
|
|
87
|
+
... })
|
|
88
|
+
>>> assignment = CRDAssignment(
|
|
89
|
+
... data=df, treatment_col="treatment", design=None, seed=42
|
|
90
|
+
... )
|
|
91
|
+
>>> result = check_balance(assignment)
|
|
92
|
+
>>> set(result.columns) == {
|
|
93
|
+
... "covariate", "mean_treated", "mean_control",
|
|
94
|
+
... "std_pooled", "smd",
|
|
95
|
+
... }
|
|
96
|
+
True
|
|
97
|
+
"""
|
|
98
|
+
data = assignment.data_
|
|
99
|
+
treatment_col = assignment.treatment_col_
|
|
100
|
+
|
|
101
|
+
# Resolve covariate list
|
|
102
|
+
if covariates is None:
|
|
103
|
+
numeric_cols = [
|
|
104
|
+
col
|
|
105
|
+
for col in data.columns
|
|
106
|
+
if col != treatment_col and pd.api.types.is_numeric_dtype(data[col])
|
|
107
|
+
]
|
|
108
|
+
if len(numeric_cols) == 0:
|
|
109
|
+
raise InsufficientDataError(
|
|
110
|
+
context=(
|
|
111
|
+
"check_balance with covariates=None "
|
|
112
|
+
"(no numeric columns available after excluding "
|
|
113
|
+
f"treatment column '{treatment_col}')"
|
|
114
|
+
),
|
|
115
|
+
minimum=1,
|
|
116
|
+
received=0,
|
|
117
|
+
)
|
|
118
|
+
selected = numeric_cols
|
|
119
|
+
else:
|
|
120
|
+
missing = [c for c in covariates if c not in data.columns]
|
|
121
|
+
if missing:
|
|
122
|
+
raise InvalidDesignError(
|
|
123
|
+
f"Covariates not found in assignment.data_: {missing}. "
|
|
124
|
+
f"Available columns: {list(data.columns)}."
|
|
125
|
+
)
|
|
126
|
+
selected = list(covariates)
|
|
127
|
+
|
|
128
|
+
# Validate no NaN in any selected covariate
|
|
129
|
+
cols_with_nan = [c for c in selected if data[c].isna().any()]
|
|
130
|
+
if cols_with_nan:
|
|
131
|
+
raise InvalidDesignError(
|
|
132
|
+
f"Covariates contain NaN values: {cols_with_nan}. "
|
|
133
|
+
f"check_balance requires complete data; impute or drop NaN "
|
|
134
|
+
f"before calling."
|
|
135
|
+
)
|
|
136
|
+
|
|
137
|
+
# Compute group masks
|
|
138
|
+
treatment_values = data[treatment_col].values
|
|
139
|
+
treated_mask = treatment_values == 1
|
|
140
|
+
control_mask = treatment_values == 0
|
|
141
|
+
|
|
142
|
+
# Compute statistics per covariate
|
|
143
|
+
rows: list[dict[str, float | str]] = []
|
|
144
|
+
for cov in selected:
|
|
145
|
+
values = data[cov].astype(float).values
|
|
146
|
+
treated_vals = values[treated_mask]
|
|
147
|
+
control_vals = values[control_mask]
|
|
148
|
+
|
|
149
|
+
mean_t = float(np.mean(treated_vals))
|
|
150
|
+
mean_c = float(np.mean(control_vals))
|
|
151
|
+
|
|
152
|
+
var_t = float(np.var(treated_vals, ddof=1))
|
|
153
|
+
var_c = float(np.var(control_vals, ddof=1))
|
|
154
|
+
|
|
155
|
+
std_pooled = float(np.sqrt((var_t + var_c) / 2.0))
|
|
156
|
+
|
|
157
|
+
if std_pooled == 0.0:
|
|
158
|
+
smd: float = float("nan")
|
|
159
|
+
else:
|
|
160
|
+
smd = (mean_t - mean_c) / std_pooled
|
|
161
|
+
|
|
162
|
+
rows.append(
|
|
163
|
+
{
|
|
164
|
+
"covariate": cov,
|
|
165
|
+
"mean_treated": mean_t,
|
|
166
|
+
"mean_control": mean_c,
|
|
167
|
+
"std_pooled": std_pooled,
|
|
168
|
+
"smd": smd,
|
|
169
|
+
}
|
|
170
|
+
)
|
|
171
|
+
|
|
172
|
+
result = pd.DataFrame(
|
|
173
|
+
rows,
|
|
174
|
+
columns=[
|
|
175
|
+
"covariate",
|
|
176
|
+
"mean_treated",
|
|
177
|
+
"mean_control",
|
|
178
|
+
"std_pooled",
|
|
179
|
+
"smd",
|
|
180
|
+
],
|
|
181
|
+
)
|
|
182
|
+
return result.reset_index(drop=True)
|
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
"""Blocked Completely Randomized Design.
|
|
2
|
+
|
|
3
|
+
Randomizes treatment independently within each block, preserving the
|
|
4
|
+
treatment proportion within every block. Useful when there are
|
|
5
|
+
pre-experiment covariates that define meaningful subgroups (e.g.,
|
|
6
|
+
geography, device type) where balance must be guaranteed.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import numpy as np
|
|
10
|
+
import pandas as pd
|
|
11
|
+
|
|
12
|
+
from skxperiments.core.assignment import BlockedAssignment
|
|
13
|
+
from skxperiments.core.base import BaseDesign
|
|
14
|
+
from skxperiments.core.exceptions import (
|
|
15
|
+
InsufficientDataError,
|
|
16
|
+
InvalidDesignError,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class BlockedCRD(BaseDesign):
    """Blocked Completely Randomized Design.

    Treatment is randomized independently within each block defined by
    ``block_col``. The treatment proportion ``p`` is applied uniformly
    to all blocks. Within each block, the number of treated units is
    ``round(p * n_block)``.

    Parameters
    ----------
    block_col : str
        Name of the column in the DataFrame that defines blocks.
    p : float
        Treatment proportion in (0, 1), applied uniformly across blocks.
        Python and NumPy real scalars are accepted. Although the
        signature defaults to None, a value is required.
    seed : int or None, optional
        Random seed for reproducibility, by default None.
    treatment_col : str, optional
        Name of the treatment column to be added to the output, by
        default ``"treatment"``.

    Raises
    ------
    InvalidDesignError
        If ``p`` is None, is not a real number, or lies outside (0, 1).

    Notes
    -----
    ``round`` is Python's built-in, which rounds exact halves to the
    nearest even integer (e.g. ``p=0.5`` with a block of 5 units gives
    2 treated units, not 3).

    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.DataFrame({
    ...     "x": range(8),
    ...     "region": ["A", "A", "A", "A", "B", "B", "B", "B"],
    ... })
    >>> design = BlockedCRD(block_col="region", p=0.5, seed=42)
    >>> assignment = design.randomize(df)
    >>> assignment.block_sizes_
    {'A': 4, 'B': 4}
    """

    def __init__(
        self,
        block_col: str,
        p: float | None = None,
        seed: int | None = None,
        treatment_col: str = "treatment",
    ) -> None:
        if p is None:
            raise InvalidDesignError(
                "BlockedCRD requires a treatment proportion p; "
                "received p=None."
            )
        # Reject non-numeric values up front so the range comparison below
        # cannot fail with a raw TypeError (e.g. p="0.5").
        if not isinstance(p, (int, float, np.integer, np.floating)):
            raise InvalidDesignError(
                f"Treatment proportion p must be a real number in (0, 1), "
                f"but received {p!r}."
            )
        if not (0.0 < p < 1.0):
            raise InvalidDesignError(
                f"Treatment proportion p must be in (0, 1), but received {p}."
            )

        self.block_col = block_col
        self.p = p
        self.seed = seed
        self.treatment_col = treatment_col

    def randomize(self, df: pd.DataFrame) -> BlockedAssignment:
        """Perform blocked randomization and return a BlockedAssignment.

        Parameters
        ----------
        df : pd.DataFrame
            DataFrame containing the experimental units. Must contain
            ``block_col`` (with no missing labels) and must not contain
            ``treatment_col``.

        Returns
        -------
        BlockedAssignment
            Assignment with treatment column added. Original DataFrame
            is not modified.

        Raises
        ------
        InvalidDesignError
            If ``block_col`` is missing from ``df``, if ``block_col``
            contains missing values, if ``treatment_col`` already exists
            in ``df``, or if rounding results in 0 or n treated units in
            any block.
        InsufficientDataError
            If ``df`` is empty or any block has fewer than 2 units.
        """
        if self.block_col not in df.columns:
            raise InvalidDesignError(
                f"Block column '{self.block_col}' not found in DataFrame. "
                f"Available columns: {list(df.columns)}."
            )
        if self.treatment_col in df.columns:
            raise InvalidDesignError(
                f"Treatment column '{self.treatment_col}' already exists "
                f"in DataFrame. Drop or rename it before calling randomize()."
            )

        n_total = len(df)
        if n_total == 0:
            # With no units there are no blocks to randomize.
            raise InsufficientDataError(
                context="BlockedCRD randomization",
                minimum=2,
                received=n_total,
            )

        # A missing label never compares equal to itself, so the equality
        # mask below would see such a block as empty and report a
        # misleading size-0 error; fail early with an accurate message.
        if df[self.block_col].isna().any():
            raise InvalidDesignError(
                f"Block column '{self.block_col}' contains missing values. "
                f"Every unit must belong to a block; impute or drop rows "
                f"with missing block labels before calling randomize()."
            )

        df_out = df.copy()
        treatment = np.zeros(n_total, dtype=int)

        rng = np.random.default_rng(self.seed)

        # Loop-invariant: fetch the block labels once instead of per block.
        block_values = df_out[self.block_col].values

        # Iterate blocks in stable order (sorted by label) for
        # reproducibility independent of pandas grouping internals.
        block_labels = sorted(df_out[self.block_col].unique(), key=str)

        block_sizes: dict = {}

        for label in block_labels:
            # Positional (iloc-style) indices of the units in this block.
            block_iloc = np.where(block_values == label)[0]
            n_block = len(block_iloc)
            block_sizes[label] = n_block

            if n_block < 2:
                raise InsufficientDataError(
                    context=(
                        f"BlockedCRD randomization for block '{label}'"
                    ),
                    minimum=2,
                    received=n_block,
                )

            n_treated_block = int(round(self.p * n_block))

            if n_treated_block == 0 or n_treated_block == n_block:
                raise InvalidDesignError(
                    f"Block '{label}' has size {n_block}; with p={self.p}, "
                    f"rounding yields {n_treated_block} treated units. "
                    f"Each block must have at least 1 treated and 1 control "
                    f"unit. Increase block size or adjust p."
                )

            chosen = rng.choice(block_iloc, size=n_treated_block, replace=False)
            treatment[chosen] = 1

        df_out[self.treatment_col] = treatment

        return BlockedAssignment(
            data=df_out,
            treatment_col=self.treatment_col,
            design=self,
            block_col=self.block_col,
            block_sizes=block_sizes,
            seed=self.seed,
        )
|
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
"""Completely Randomized Design (CRD).
|
|
2
|
+
|
|
3
|
+
Assigns units to treatment uniformly at random, with either a fixed
|
|
4
|
+
absolute count of treated units or a fixed treatment proportion.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
import pandas as pd
|
|
9
|
+
|
|
10
|
+
from skxperiments.core.assignment import CRDAssignment
|
|
11
|
+
from skxperiments.core.base import BaseDesign
|
|
12
|
+
from skxperiments.core.exceptions import (
|
|
13
|
+
InsufficientDataError,
|
|
14
|
+
InvalidDesignError,
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class CRD(BaseDesign):
    """Completely Randomized Design.

    Treatment is assigned uniformly at random to a fixed number of
    units. The user provides exactly one of ``n_treated`` (absolute
    count) or ``p`` (proportion); rounding for ``p`` uses
    ``round(p * n)``.

    Parameters
    ----------
    n_treated : int or None, optional
        Absolute number of units to assign to treatment. Mutually
        exclusive with ``p``. Python and NumPy integers are accepted;
        booleans are rejected. By default None.
    p : float or None, optional
        Treatment proportion in (0, 1). Mutually exclusive with
        ``n_treated``. Python and NumPy real scalars are accepted.
        By default None.
    seed : int or None, optional
        Random seed for reproducibility, by default None.
    treatment_col : str, optional
        Name of the treatment column to be added to the output, by
        default ``"treatment"``.

    Raises
    ------
    InvalidDesignError
        If neither or both of ``n_treated`` and ``p`` are given, if
        ``n_treated`` is not a positive integer, or if ``p`` is not a
        real number in (0, 1).

    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.DataFrame({"x": range(100)})
    >>> design = CRD(p=0.5, seed=42)
    >>> assignment = design.randomize(df)
    >>> assignment.n_treated_
    50
    """

    def __init__(
        self,
        n_treated: int | None = None,
        p: float | None = None,
        seed: int | None = None,
        treatment_col: str = "treatment",
    ) -> None:
        # Mutual exclusivity: exactly one of n_treated or p.
        if n_treated is None and p is None:
            raise InvalidDesignError(
                "CRD requires exactly one of n_treated or p; both "
                "are None."
            )
        if n_treated is not None and p is not None:
            raise InvalidDesignError(
                "CRD requires exactly one of n_treated or p; both "
                "were provided."
            )

        if n_treated is not None:
            # bool is a subclass of int; without the explicit exclusion
            # n_treated=True would silently be treated as 1.
            is_count = isinstance(n_treated, (int, np.integer)) and not isinstance(
                n_treated, bool
            )
            if not is_count or n_treated <= 0:
                raise InvalidDesignError(
                    f"n_treated must be a positive integer, but received "
                    f"{n_treated!r}."
                )

        if p is not None:
            # np.floating covers NumPy scalars such as np.float32 that do
            # not subclass Python float, mirroring the np.integer support
            # for n_treated.
            is_real = isinstance(p, (int, float, np.integer, np.floating))
            if not is_real or not (0.0 < p < 1.0):
                raise InvalidDesignError(
                    f"p must be in (0, 1), but received {p!r}."
                )

        self.n_treated = n_treated
        self.p = p
        self.seed = seed
        self.treatment_col = treatment_col

    def randomize(self, df: pd.DataFrame) -> CRDAssignment:
        """Perform complete randomization and return a CRDAssignment.

        Parameters
        ----------
        df : pd.DataFrame
            DataFrame with experimental units. Must not contain
            ``treatment_col``.

        Returns
        -------
        CRDAssignment
            Assignment with the treatment column added. The original
            DataFrame is not modified.

        Raises
        ------
        InvalidDesignError
            If ``treatment_col`` already exists in ``df``, or if the
            resolved number of treated is 0 or N (no treatment
            contrast possible).
        InsufficientDataError
            If ``len(df) < n_treated`` (when ``n_treated`` was given),
            or if ``len(df) < 2`` (no contrast possible).
        """
        n_total = len(df)

        if n_total < 2:
            raise InsufficientDataError(
                context="CRD randomization",
                minimum=2,
                received=n_total,
            )

        if self.treatment_col in df.columns:
            raise InvalidDesignError(
                f"Treatment column '{self.treatment_col}' already "
                f"exists in DataFrame. Drop or rename it before "
                f"calling randomize()."
            )

        # Resolve n_treated.
        if self.n_treated is None:
            # Built-in round() sends exact halves to the nearest even
            # integer.
            n_treated_resolved = int(round(self.p * n_total))
        else:
            if n_total < self.n_treated:
                raise InsufficientDataError(
                    context="CRD randomization",
                    minimum=self.n_treated,
                    received=n_total,
                )
            n_treated_resolved = self.n_treated

        # Both arms must be non-empty for a treatment contrast to exist.
        if not (0 < n_treated_resolved < n_total):
            raise InvalidDesignError(
                f"Resolved n_treated={n_treated_resolved} for N={n_total}; "
                f"must be strictly between 0 and N. Adjust n_treated "
                f"or p."
            )

        # Defensive copy. Build treatment vector.
        df_out = df.copy()
        rng = np.random.default_rng(self.seed)

        # Draw the positional indices of the treated units without
        # replacement; every other unit stays at 0 (control).
        treatment = np.zeros(n_total, dtype=int)
        chosen = rng.choice(n_total, size=n_treated_resolved, replace=False)
        treatment[chosen] = 1

        df_out[self.treatment_col] = treatment

        return CRDAssignment(
            data=df_out,
            treatment_col=self.treatment_col,
            design=self,
            seed=self.seed,
        )
|
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
"""2^K Factorial Design.
|
|
2
|
+
|
|
3
|
+
Randomly assigns units to one of 2^K cells defined by the values of
|
|
4
|
+
K binary factors, with all cells of equal size.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
import pandas as pd
|
|
9
|
+
|
|
10
|
+
from skxperiments.core.assignment import FactorialAssignment
|
|
11
|
+
from skxperiments.core.base import BaseDesign
|
|
12
|
+
from skxperiments.core.exceptions import (
|
|
13
|
+
InsufficientDataError,
|
|
14
|
+
InvalidDesignError,
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class FactorialDesign(BaseDesign):
    """2^K Factorial Design with equal cell sizes.

    Randomly assigns units to one of 2^K cells defined by K binary
    factors. Each cell contains exactly ``n_per_cell`` units, so the
    DataFrame must have ``n_per_cell * 2^K`` rows.

    Parameters
    ----------
    factors : list of str
        Names of the K factors. These will be added as columns to the
        Assignment's data, alongside the synthetic ``"_cell"`` column.
        Must be non-empty, contain no duplicates, and must not include
        the reserved name ``"_cell"``.
    n_per_cell : int
        Number of units per cell. All cells have equal size in this
        version. Must be >= 1. Python and NumPy integers are accepted;
        booleans are rejected.
    seed : int or None, optional
        Random seed for reproducibility, by default None.

    Raises
    ------
    InvalidDesignError
        If ``factors`` is not a non-empty list, contains duplicates or
        the reserved name ``"_cell"``, or if ``n_per_cell`` is not a
        positive integer.

    Notes
    -----
    Cell encoding convention (little-endian):

        cell_index = sum(factor_value * 2**i
                         for i, factor_value in enumerate(factors))

    For K=2 with factors ``["A", "B"]``:
        A=0, B=0  ->  cell 0
        A=1, B=0  ->  cell 1
        A=0, B=1  ->  cell 2
        A=1, B=1  ->  cell 3

    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.DataFrame({"x": range(8)})
    >>> design = FactorialDesign(factors=["A", "B"], n_per_cell=2, seed=42)
    >>> assignment = design.randomize(df)
    >>> assignment.n_cells_
    4
    >>> assignment.cell_sizes_
    {0: 2, 1: 2, 2: 2, 3: 2}
    """

    def __init__(
        self,
        factors: list[str],
        n_per_cell: int,
        seed: int | None = None,
    ) -> None:
        if not isinstance(factors, list) or len(factors) == 0:
            raise InvalidDesignError(
                "FactorialDesign requires a non-empty list of factors."
            )

        # Single pass: a name seen twice is a duplicate.
        seen: set = set()
        duplicates: set = set()
        for name in factors:
            if name in seen:
                duplicates.add(name)
            else:
                seen.add(name)
        if duplicates:
            raise InvalidDesignError(
                f"FactorialDesign factors must be unique. "
                f"Duplicates found: {sorted(duplicates)}."
            )

        # randomize() writes the cell index to "_cell" and then writes one
        # column per factor; a factor with that name would overwrite the
        # cell index.
        if "_cell" in seen:
            raise InvalidDesignError(
                "FactorialDesign reserves the column name '_cell' for the "
                "cell index; it cannot be used as a factor name."
            )

        # bool is a subclass of int, so exclude it explicitly; np.integer
        # admits NumPy integer scalars.
        is_count = isinstance(n_per_cell, (int, np.integer)) and not isinstance(
            n_per_cell, bool
        )
        if not is_count or n_per_cell < 1:
            raise InvalidDesignError(
                f"n_per_cell must be a positive integer, "
                f"received {n_per_cell!r}."
            )

        self.factors = factors
        self.n_per_cell = n_per_cell
        self.seed = seed

    def randomize(self, df: pd.DataFrame) -> FactorialAssignment:
        """Perform factorial randomization.

        Parameters
        ----------
        df : pd.DataFrame
            DataFrame with experimental units. Must have exactly
            ``n_per_cell * 2^K`` rows and must not contain any column
            named in ``factors`` or named ``"_cell"``.

        Returns
        -------
        FactorialAssignment
            Assignment with factor columns and ``"_cell"`` added.

        Raises
        ------
        InvalidDesignError
            If column-name collisions exist or if any cell has size 0.
        InsufficientDataError
            If ``len(df) != n_per_cell * 2^K`` (raised both when there
            are too few and when there are too many rows).
        """
        k = len(self.factors)
        n_cells = 2**k
        per_cell = int(self.n_per_cell)
        n_required = per_cell * n_cells
        n_units = len(df)

        if n_units != n_required:
            raise InsufficientDataError(
                context=(
                    f"FactorialDesign with K={k} factors and "
                    f"n_per_cell={self.n_per_cell} requires exactly "
                    f"{n_required} units"
                ),
                minimum=n_required,
                received=n_units,
            )

        # Detect column-name collisions
        forbidden = set(self.factors) | {"_cell"}
        collisions = sorted(forbidden & set(df.columns))
        if collisions:
            raise InvalidDesignError(
                f"DataFrame already contains columns reserved for "
                f"FactorialDesign output: {collisions}. Drop or rename "
                f"them before calling randomize()."
            )

        df_out = df.copy()
        rng = np.random.default_rng(self.seed)

        # Shuffle iloc positions and assign sequentially to cells: the
        # j-th shuffled position receives cell j // n_per_cell.
        shuffled = rng.permutation(n_required)
        cell_assignment = np.empty(n_required, dtype=int)
        cell_assignment[shuffled] = np.repeat(np.arange(n_cells), per_cell)

        df_out["_cell"] = cell_assignment

        # Decode cell index back into binary factor values
        # using little-endian convention (factor i is bit i).
        for i, factor_name in enumerate(self.factors):
            df_out[factor_name] = (cell_assignment >> i) & 1

        # Build cell_sizes and validate non-empty cells (defensive;
        # by construction every cell has n_per_cell >= 1).
        counts = np.bincount(cell_assignment, minlength=n_cells).tolist()
        cell_sizes: dict = {}
        for cell_idx, size in enumerate(counts):
            if size == 0:
                raise InvalidDesignError(
                    f"Cell {cell_idx} has zero units after randomization. "
                    f"This should not happen with n_per_cell={self.n_per_cell}; "
                    f"please report as a bug."
                )
            cell_sizes[cell_idx] = size

        return FactorialAssignment(
            data=df_out,
            design=self,
            factor_cols=self.factors,
            cell_sizes=cell_sizes,
            seed=self.seed,
        )
|