skxperiments 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. skxperiments/__init__.py +5 -0
  2. skxperiments/core/__init__.py +42 -0
  3. skxperiments/core/assignment.py +589 -0
  4. skxperiments/core/base.py +512 -0
  5. skxperiments/core/exceptions.py +145 -0
  6. skxperiments/core/potential_outcomes.py +168 -0
  7. skxperiments/core/results.py +624 -0
  8. skxperiments/design/__init__.py +22 -0
  9. skxperiments/design/balance.py +182 -0
  10. skxperiments/design/blocked_crd.py +157 -0
  11. skxperiments/design/crd.py +162 -0
  12. skxperiments/design/factorial.py +174 -0
  13. skxperiments/design/power.py +233 -0
  14. skxperiments/design/rerandomized_crd.py +319 -0
  15. skxperiments/diagnostics/__init__.py +21 -0
  16. skxperiments/diagnostics/aa_test.py +277 -0
  17. skxperiments/diagnostics/balance_report.py +224 -0
  18. skxperiments/diagnostics/srm.py +327 -0
  19. skxperiments/estimators/__init__.py +23 -0
  20. skxperiments/estimators/blocked_difference_in_means.py +197 -0
  21. skxperiments/estimators/cuped.py +280 -0
  22. skxperiments/estimators/difference_in_means.py +161 -0
  23. skxperiments/estimators/factorial_estimator.py +213 -0
  24. skxperiments/estimators/lin_estimator.py +298 -0
  25. skxperiments/inference/__init__.py +17 -0
  26. skxperiments/inference/bootstrap.py +450 -0
  27. skxperiments/inference/multiple.py +365 -0
  28. skxperiments/inference/neyman.py +386 -0
  29. skxperiments/inference/randomization_test.py +319 -0
  30. skxperiments/pipeline.py +366 -0
  31. skxperiments/reporting/__init__.py +30 -0
  32. skxperiments/reporting/plots.py +411 -0
  33. skxperiments/reporting/summary.py +185 -0
  34. skxperiments-0.1.0.dev0.dist-info/METADATA +272 -0
  35. skxperiments-0.1.0.dev0.dist-info/RECORD +36 -0
  36. skxperiments-0.1.0.dev0.dist-info/WHEEL +4 -0
@@ -0,0 +1,182 @@
1
+ """Covariate balance diagnostics for experimental designs.
2
+
3
+ Provides check_balance, a standalone function that computes the
4
+ standardized mean difference (SMD) between treatment and control
5
+ groups for each covariate in an Assignment.
6
+ """
7
+
8
+ import numpy as np
9
+ import pandas as pd
10
+
11
+ from skxperiments.core.assignment import BaseAssignment
12
+ from skxperiments.core.exceptions import (
13
+ InsufficientDataError,
14
+ InvalidDesignError,
15
+ )
16
+
17
+
18
def check_balance(
    assignment: BaseAssignment,
    covariates: list[str] | None = None,
) -> pd.DataFrame:
    """Compute covariate balance between treatment and control groups.

    For each covariate, returns the mean in each group, the pooled
    standard deviation, and the standardized mean difference (SMD).

    Parameters
    ----------
    assignment : BaseAssignment
        Assignment object produced by a design. Must expose ``data_``
        (DataFrame with treatment column attached) and ``treatment_col_``.
    covariates : list of str or None, optional
        Names of covariates to check. If None, all numeric columns in
        ``assignment.data_`` except the treatment column are used, in
        the order they appear in the DataFrame. Boolean columns count
        as numeric. By default None.

    Returns
    -------
    pd.DataFrame
        DataFrame with one row per covariate and columns:
        ``covariate``, ``mean_treated``, ``mean_control``,
        ``std_pooled``, ``smd``. The index is a default RangeIndex.

    Raises
    ------
    InvalidDesignError
        If a name in ``covariates`` is not a column of ``assignment.data_``,
        if any selected covariate contains NaN values, or if a selected
        covariate cannot be converted to float (e.g. a column of
        non-numeric strings).
    InsufficientDataError
        If ``covariates`` is None and no numeric columns are available
        after excluding the treatment column.

    Notes
    -----
    The pooled standard deviation follows the convention common in
    the SMD literature for randomized experiments (Austin 2009;
    Stuart 2010):

        std_pooled = sqrt((var_treated + var_control) / 2)

    where each variance is computed with ``ddof=1``. When
    ``std_pooled == 0`` (no within-group variation), the SMD is NaN
    rather than raising an exception.

    Units are split on the treatment column: rows equal to 1 are
    treated, rows equal to 0 are control. Rows with any other
    treatment value are ignored.

    The function does not modify ``assignment.data_``.

    References
    ----------
    Austin, P. C. (2009). Balance diagnostics for comparing the
    distribution of baseline covariates between treatment groups in
    propensity-score matched samples. Statistics in Medicine.

    Stuart, E. A. (2010). Matching methods for causal inference:
    A review and a look forward. Statistical Science.

    Examples
    --------
    >>> import numpy as np
    >>> import pandas as pd
    >>> from skxperiments.core.assignment import CRDAssignment
    >>> rng = np.random.default_rng(42)
    >>> df = pd.DataFrame({
    ...     "x1": rng.normal(size=100),
    ...     "x2": rng.normal(size=100),
    ...     "treatment": rng.integers(0, 2, size=100),
    ... })
    >>> assignment = CRDAssignment(
    ...     data=df, treatment_col="treatment", design=None, seed=42
    ... )
    >>> result = check_balance(assignment)
    >>> set(result.columns) == {
    ...     "covariate", "mean_treated", "mean_control",
    ...     "std_pooled", "smd",
    ... }
    True
    """
    data = assignment.data_
    treatment_col = assignment.treatment_col_

    # Resolve covariate list
    if covariates is None:
        selected = [
            col
            for col in data.columns
            if col != treatment_col and pd.api.types.is_numeric_dtype(data[col])
        ]
        if not selected:
            raise InsufficientDataError(
                context=(
                    "check_balance with covariates=None "
                    "(no numeric columns available after excluding "
                    f"treatment column '{treatment_col}')"
                ),
                minimum=1,
                received=0,
            )
    else:
        missing = [c for c in covariates if c not in data.columns]
        if missing:
            raise InvalidDesignError(
                f"Covariates not found in assignment.data_: {missing}. "
                f"Available columns: {list(data.columns)}."
            )
        # Copy so the caller's list is never aliased.
        selected = list(covariates)

    # Validate no NaN in any selected covariate
    cols_with_nan = [c for c in selected if data[c].isna().any()]
    if cols_with_nan:
        raise InvalidDesignError(
            f"Covariates contain NaN values: {cols_with_nan}. "
            f"check_balance requires complete data; impute or drop NaN "
            f"before calling."
        )

    # Compute group masks
    treatment_values = data[treatment_col].values
    treated_mask = treatment_values == 1
    control_mask = treatment_values == 0

    # Compute statistics per covariate
    rows: list[dict[str, float | str]] = []
    for cov in selected:
        # An explicitly requested covariate may be non-numeric; report
        # that as a design error naming the column instead of letting
        # the conversion failure escape as a bare ValueError/TypeError.
        try:
            values = data[cov].astype(float).values
        except (TypeError, ValueError) as err:
            raise InvalidDesignError(
                f"Covariate '{cov}' cannot be converted to float; "
                f"check_balance requires numeric covariates."
            ) from err

        treated_vals = values[treated_mask]
        control_vals = values[control_mask]

        mean_t = float(np.mean(treated_vals))
        mean_c = float(np.mean(control_vals))

        var_t = float(np.var(treated_vals, ddof=1))
        var_c = float(np.var(control_vals, ddof=1))

        std_pooled = float(np.sqrt((var_t + var_c) / 2.0))

        # Zero pooled spread makes the SMD undefined; report NaN.
        if std_pooled == 0.0:
            smd: float = float("nan")
        else:
            smd = (mean_t - mean_c) / std_pooled

        rows.append(
            {
                "covariate": cov,
                "mean_treated": mean_t,
                "mean_control": mean_c,
                "std_pooled": std_pooled,
                "smd": smd,
            }
        )

    result = pd.DataFrame(
        rows,
        columns=[
            "covariate",
            "mean_treated",
            "mean_control",
            "std_pooled",
            "smd",
        ],
    )
    # reset_index guarantees a RangeIndex even when ``rows`` is empty.
    return result.reset_index(drop=True)
@@ -0,0 +1,157 @@
1
+ """Blocked Completely Randomized Design.
2
+
3
+ Randomizes treatment independently within each block, preserving the
4
+ treatment proportion within every block. Useful when there are
5
+ pre-experiment covariates that define meaningful subgroups (e.g.,
6
+ geography, device type) where balance must be guaranteed.
7
+ """
8
+
9
+ import numpy as np
10
+ import pandas as pd
11
+
12
+ from skxperiments.core.assignment import BlockedAssignment
13
+ from skxperiments.core.base import BaseDesign
14
+ from skxperiments.core.exceptions import (
15
+ InsufficientDataError,
16
+ InvalidDesignError,
17
+ )
18
+
19
+
20
class BlockedCRD(BaseDesign):
    """Blocked Completely Randomized Design.

    Treatment is randomized independently within each block defined by
    ``block_col``. The treatment proportion ``p`` is applied uniformly
    to all blocks. Within each block, the number of treated units is
    ``round(p * n_block)``.

    Parameters
    ----------
    block_col : str
        Name of the column in the DataFrame that defines blocks.
    p : float
        Treatment proportion in (0, 1), applied uniformly across blocks.
    seed : int or None, optional
        Random seed for reproducibility, by default None.
    treatment_col : str, optional
        Name of the treatment column to be added to the output, by
        default ``"treatment"``.

    Raises
    ------
    InvalidDesignError
        If ``p`` is None, cannot be compared against the (0, 1) bounds
        (e.g. a string), or lies outside the open interval (0, 1).

    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.DataFrame({
    ...     "x": range(8),
    ...     "region": ["A", "A", "A", "A", "B", "B", "B", "B"],
    ... })
    >>> design = BlockedCRD(block_col="region", p=0.5, seed=42)
    >>> assignment = design.randomize(df)
    >>> assignment.block_sizes_
    {'A': 4, 'B': 4}
    """

    def __init__(
        self,
        block_col: str,
        p: float | None = None,
        seed: int | None = None,
        treatment_col: str = "treatment",
    ) -> None:
        if p is None:
            raise InvalidDesignError(
                "BlockedCRD requires a treatment proportion p; "
                "received p=None."
            )

        # A non-numeric p (e.g. "0.5") makes the comparison itself raise
        # TypeError; treat it as an invalid proportion so callers get
        # the same InvalidDesignError as for an out-of-range value.
        try:
            p_in_range = bool(0.0 < p < 1.0)
        except TypeError:
            p_in_range = False
        if not p_in_range:
            raise InvalidDesignError(
                f"Treatment proportion p must be in (0, 1), but received {p}."
            )

        self.block_col = block_col
        self.p = p
        self.seed = seed
        self.treatment_col = treatment_col

    def randomize(self, df: pd.DataFrame) -> BlockedAssignment:
        """Perform blocked randomization and return a BlockedAssignment.

        Parameters
        ----------
        df : pd.DataFrame
            DataFrame containing the experimental units. Must contain
            ``block_col`` and must not contain ``treatment_col``.

        Returns
        -------
        BlockedAssignment
            Assignment with treatment column added. Original DataFrame
            is not modified.

        Raises
        ------
        InvalidDesignError
            If ``block_col`` is missing from ``df``, if ``treatment_col``
            already exists in ``df``, or if rounding results in 0 or n
            treated units in any block.
        InsufficientDataError
            If any block has fewer than 2 units.
        """
        if self.block_col not in df.columns:
            raise InvalidDesignError(
                f"Block column '{self.block_col}' not found in DataFrame. "
                f"Available columns: {list(df.columns)}."
            )
        if self.treatment_col in df.columns:
            raise InvalidDesignError(
                f"Treatment column '{self.treatment_col}' already exists "
                f"in DataFrame. Drop or rename it before calling randomize()."
            )

        # Work on a copy so the caller's DataFrame is left untouched.
        df_out = df.copy()
        treatment = np.zeros(len(df_out), dtype=int)

        # One generator for all blocks: draws are consumed in block order.
        rng = np.random.default_rng(self.seed)

        # The block column does not change inside the loop, so extract
        # its values once instead of once per block.
        block_values = df_out[self.block_col].values

        # Iterate blocks in stable order (sorted by label) for
        # reproducibility independent of pandas grouping internals.
        block_labels = sorted(df_out[self.block_col].unique(), key=lambda x: str(x))

        block_sizes: dict = {}

        for label in block_labels:
            # Positional (iloc-style) indices of the rows in this block.
            block_iloc = np.where(block_values == label)[0]
            n_block = len(block_iloc)
            block_sizes[label] = n_block

            if n_block < 2:
                raise InsufficientDataError(
                    context=(
                        f"BlockedCRD randomization for block '{label}'"
                    ),
                    minimum=2,
                    received=n_block,
                )

            n_treated_block = int(round(self.p * n_block))

            # Every block needs both arms, otherwise no within-block
            # contrast exists.
            if n_treated_block == 0 or n_treated_block == n_block:
                raise InvalidDesignError(
                    f"Block '{label}' has size {n_block}; with p={self.p}, "
                    f"rounding yields {n_treated_block} treated units. "
                    f"Each block must have at least 1 treated and 1 control "
                    f"unit. Increase block size or adjust p."
                )

            chosen = rng.choice(block_iloc, size=n_treated_block, replace=False)
            treatment[chosen] = 1

        df_out[self.treatment_col] = treatment

        return BlockedAssignment(
            data=df_out,
            treatment_col=self.treatment_col,
            design=self,
            block_col=self.block_col,
            block_sizes=block_sizes,
            seed=self.seed,
        )
@@ -0,0 +1,162 @@
1
+ """Completely Randomized Design (CRD).
2
+
3
+ Assigns units to treatment uniformly at random, with either a fixed
4
+ absolute count of treated units or a fixed treatment proportion.
5
+ """
6
+
7
+ import numpy as np
8
+ import pandas as pd
9
+
10
+ from skxperiments.core.assignment import CRDAssignment
11
+ from skxperiments.core.base import BaseDesign
12
+ from skxperiments.core.exceptions import (
13
+ InsufficientDataError,
14
+ InvalidDesignError,
15
+ )
16
+
17
+
18
class CRD(BaseDesign):
    """Completely Randomized Design.

    A fixed number of units is drawn uniformly at random and assigned
    to treatment. The size of the treated group is given either as an
    absolute count (``n_treated``) or as a proportion (``p``), never
    both; a proportion is turned into a count with ``round(p * n)``.

    Parameters
    ----------
    n_treated : int or None, optional
        Absolute number of units to assign to treatment. Mutually
        exclusive with ``p``. By default None.
    p : float or None, optional
        Treatment proportion in (0, 1). Mutually exclusive with
        ``n_treated``. By default None.
    seed : int or None, optional
        Random seed for reproducibility, by default None.
    treatment_col : str, optional
        Name of the treatment column to be added to the output, by
        default ``"treatment"``.

    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.DataFrame({"x": range(100)})
    >>> design = CRD(p=0.5, seed=42)
    >>> assignment = design.randomize(df)
    >>> assignment.n_treated_
    50
    """

    def __init__(
        self,
        n_treated: int | None = None,
        p: float | None = None,
        seed: int | None = None,
        treatment_col: str = "treatment",
    ) -> None:
        count_given = n_treated is not None
        proportion_given = p is not None

        # Exactly one of the two sizing arguments must be supplied.
        if not count_given and not proportion_given:
            raise InvalidDesignError(
                "CRD requires exactly one of n_treated or p; both are None."
            )
        if count_given and proportion_given:
            raise InvalidDesignError(
                "CRD requires exactly one of n_treated or p; both "
                "were provided."
            )

        if count_given:
            count_ok = isinstance(n_treated, (int, np.integer)) and n_treated > 0
            if not count_ok:
                raise InvalidDesignError(
                    f"n_treated must be a positive integer, but received "
                    f"{n_treated!r}."
                )
        else:
            proportion_ok = isinstance(p, (int, float)) and (0.0 < p < 1.0)
            if not proportion_ok:
                raise InvalidDesignError(
                    f"p must be in (0, 1), but received {p!r}."
                )

        self.n_treated = n_treated
        self.p = p
        self.seed = seed
        self.treatment_col = treatment_col

    def randomize(self, df: pd.DataFrame) -> CRDAssignment:
        """Perform complete randomization and return a CRDAssignment.

        Parameters
        ----------
        df : pd.DataFrame
            DataFrame with experimental units. Must not contain
            ``treatment_col``.

        Returns
        -------
        CRDAssignment
            Assignment with the treatment column added. The original
            DataFrame is not modified.

        Raises
        ------
        InvalidDesignError
            If ``treatment_col`` already exists in ``df``, or if the
            resolved number of treated is 0 or N (no treatment
            contrast possible).
        InsufficientDataError
            If ``len(df) < n_treated`` (when ``n_treated`` was given),
            or if ``len(df) < 2`` (no contrast possible).
        """
        n_units = len(df)

        # A contrast needs at least one treated and one control unit.
        if n_units < 2:
            raise InsufficientDataError(
                context="CRD randomization",
                minimum=2,
                received=n_units,
            )

        if self.treatment_col in df.columns:
            raise InvalidDesignError(
                f"Treatment column '{self.treatment_col}' already "
                f"exists in DataFrame. Drop or rename it before "
                f"calling randomize()."
            )

        # Turn the configured count/proportion into a concrete count.
        if self.n_treated is None:
            k = int(round(self.p * n_units))
        else:
            if n_units < self.n_treated:
                raise InsufficientDataError(
                    context="CRD randomization",
                    minimum=self.n_treated,
                    received=n_units,
                )
            k = self.n_treated

        if not (0 < k < n_units):
            raise InvalidDesignError(
                f"Resolved n_treated={k} for N={n_units}; "
                f"must be strictly between 0 and N. Adjust n_treated "
                f"or p."
            )

        # Draw the treated positions, then mark them in a 0/1 vector.
        rng = np.random.default_rng(self.seed)
        treated_positions = rng.choice(n_units, size=k, replace=False)
        indicator = np.zeros(n_units, dtype=int)
        indicator[treated_positions] = 1

        # Attach the indicator to a copy so the input stays untouched.
        randomized = df.copy()
        randomized[self.treatment_col] = indicator

        return CRDAssignment(
            data=randomized,
            treatment_col=self.treatment_col,
            design=self,
            seed=self.seed,
        )
@@ -0,0 +1,174 @@
1
+ """2^K Factorial Design.
2
+
3
+ Randomly assigns units to one of 2^K cells defined by the values of
4
+ K binary factors, with all cells of equal size.
5
+ """
6
+
7
+ import numpy as np
8
+ import pandas as pd
9
+
10
+ from skxperiments.core.assignment import FactorialAssignment
11
+ from skxperiments.core.base import BaseDesign
12
+ from skxperiments.core.exceptions import (
13
+ InsufficientDataError,
14
+ InvalidDesignError,
15
+ )
16
+
17
+
18
class FactorialDesign(BaseDesign):
    """2^K Factorial Design with equal cell sizes.

    Randomly assigns units to one of 2^K cells defined by K binary
    factors. Each cell contains exactly ``n_per_cell`` units, so the
    DataFrame must have ``n_per_cell * 2^K`` rows.

    Parameters
    ----------
    factors : list of str
        Names of the K factors. These will be added as columns to the
        Assignment's data, alongside the synthetic ``"_cell"`` column.
        Must be non-empty and contain no duplicates.
    n_per_cell : int
        Number of units per cell. All cells have equal size in this
        version. Must be >= 1. NumPy integer scalars are accepted as
        well as built-in ``int``.
    seed : int or None, optional
        Random seed for reproducibility, by default None.

    Raises
    ------
    InvalidDesignError
        If ``factors`` is not a non-empty list, contains duplicates,
        or if ``n_per_cell`` is not a positive integer.

    Notes
    -----
    Cell encoding convention (little-endian):

        cell_index = sum(factor_value * 2**i
                         for i, factor_value in enumerate(factors))

    For K=2 with factors ``["A", "B"]``:
        A=0, B=0 -> cell 0
        A=1, B=0 -> cell 1
        A=0, B=1 -> cell 2
        A=1, B=1 -> cell 3

    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.DataFrame({"x": range(8)})
    >>> design = FactorialDesign(factors=["A", "B"], n_per_cell=2, seed=42)
    >>> assignment = design.randomize(df)
    >>> assignment.n_cells_
    4
    >>> assignment.cell_sizes_
    {0: 2, 1: 2, 2: 2, 3: 2}
    """

    def __init__(
        self,
        factors: list[str],
        n_per_cell: int,
        seed: int | None = None,
    ) -> None:
        if not isinstance(factors, list) or len(factors) == 0:
            raise InvalidDesignError(
                "FactorialDesign requires a non-empty list of factors."
            )

        if len(set(factors)) != len(factors):
            duplicates = {f for f in factors if factors.count(f) > 1}
            raise InvalidDesignError(
                f"FactorialDesign factors must be unique. "
                f"Duplicates found: {sorted(duplicates)}."
            )

        # Accept NumPy integer scalars too (e.g. a value read from an
        # array), matching how CRD validates its ``n_treated`` count.
        if not isinstance(n_per_cell, (int, np.integer)) or n_per_cell < 1:
            raise InvalidDesignError(
                f"n_per_cell must be a positive integer, "
                f"received {n_per_cell!r}."
            )

        self.factors = factors
        self.n_per_cell = n_per_cell
        self.seed = seed

    def randomize(self, df: pd.DataFrame) -> FactorialAssignment:
        """Perform factorial randomization.

        Parameters
        ----------
        df : pd.DataFrame
            DataFrame with experimental units. Must have exactly
            ``n_per_cell * 2^K`` rows and must not contain any column
            named in ``factors`` or named ``"_cell"``.

        Returns
        -------
        FactorialAssignment
            Assignment with factor columns and ``"_cell"`` added.

        Raises
        ------
        InvalidDesignError
            If column-name collisions exist or if any cell has size 0.
        InsufficientDataError
            If ``len(df) != n_per_cell * 2^K``.
        """
        k = len(self.factors)
        n_cells = 2**k
        n_required = self.n_per_cell * n_cells

        # The design is exact: too many rows is rejected just like too few.
        if len(df) != n_required:
            raise InsufficientDataError(
                context=(
                    f"FactorialDesign with K={k} factors and "
                    f"n_per_cell={self.n_per_cell} requires exactly "
                    f"{n_required} units"
                ),
                minimum=n_required,
                received=len(df),
            )

        # Detect column-name collisions
        forbidden = set(self.factors) | {"_cell"}
        collisions = sorted(forbidden & set(df.columns))
        if collisions:
            raise InvalidDesignError(
                f"DataFrame already contains columns reserved for "
                f"FactorialDesign output: {collisions}. Drop or rename "
                f"them before calling randomize()."
            )

        df_out = df.copy()
        rng = np.random.default_rng(self.seed)

        # Shuffle iloc positions and hand them out to cells in
        # consecutive runs of n_per_cell: the j-th shuffled position
        # goes to cell j // n_per_cell.
        shuffled = rng.permutation(n_required)
        cell_assignment = np.empty(n_required, dtype=int)
        cell_assignment[shuffled] = np.repeat(np.arange(n_cells), self.n_per_cell)

        df_out["_cell"] = cell_assignment

        # Decode cell index back into binary factor values
        # using little-endian convention.
        for i, factor_name in enumerate(self.factors):
            df_out[factor_name] = (cell_assignment >> i) & 1

        # Build cell_sizes and validate non-empty cells (defensive;
        # by construction every cell has n_per_cell >= 1).
        counts = np.bincount(cell_assignment, minlength=n_cells)
        cell_sizes: dict = {}
        for cell_idx in range(n_cells):
            size = int(counts[cell_idx])
            if size == 0:
                raise InvalidDesignError(
                    f"Cell {cell_idx} has zero units after randomization. "
                    f"This should not happen with n_per_cell={self.n_per_cell}; "
                    f"please report as a bug."
                )
            cell_sizes[cell_idx] = size

        return FactorialAssignment(
            data=df_out,
            design=self,
            factor_cols=self.factors,
            cell_sizes=cell_sizes,
            seed=self.seed,
        )