pysofra 0.1.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. pysofra/__init__.py +82 -0
  2. pysofra/core/__init__.py +14 -0
  3. pysofra/core/compose.py +167 -0
  4. pysofra/core/format.py +155 -0
  5. pysofra/core/frames.py +69 -0
  6. pysofra/core/schema.py +128 -0
  7. pysofra/core/table.py +924 -0
  8. pysofra/io/__init__.py +1 -0
  9. pysofra/models/__init__.py +6 -0
  10. pysofra/models/extract.py +249 -0
  11. pysofra/models/pool.py +119 -0
  12. pysofra/models/regression.py +507 -0
  13. pysofra/models/survival.py +395 -0
  14. pysofra/models/uvregression.py +438 -0
  15. pysofra/notebook/__init__.py +6 -0
  16. pysofra/plot/__init__.py +23 -0
  17. pysofra/plot/_backend.py +32 -0
  18. pysofra/plot/forest.py +159 -0
  19. pysofra/plot/inline.py +171 -0
  20. pysofra/plot/km.py +249 -0
  21. pysofra/render/__init__.py +28 -0
  22. pysofra/render/_zip_determinism.py +57 -0
  23. pysofra/render/base.py +22 -0
  24. pysofra/render/docx.py +286 -0
  25. pysofra/render/html.py +442 -0
  26. pysofra/render/image.py +130 -0
  27. pysofra/render/latex.py +253 -0
  28. pysofra/render/markdown.py +128 -0
  29. pysofra/render/pptx.py +340 -0
  30. pysofra/render/xlsx.py +226 -0
  31. pysofra/summary/__init__.py +6 -0
  32. pysofra/summary/calibrate.py +214 -0
  33. pysofra/summary/design.py +246 -0
  34. pysofra/summary/effect_size.py +187 -0
  35. pysofra/summary/extras.py +745 -0
  36. pysofra/summary/smd.py +133 -0
  37. pysofra/summary/stats.py +135 -0
  38. pysofra/summary/tbl_cross.py +339 -0
  39. pysofra/summary/tbl_one.py +1220 -0
  40. pysofra/summary/tbl_summary.py +51 -0
  41. pysofra/summary/tests.py +370 -0
  42. pysofra/summary/typing.py +129 -0
  43. pysofra/summary/weights.py +161 -0
  44. pysofra/themes/__init__.py +5 -0
  45. pysofra/themes/registry.py +272 -0
  46. pysofra-0.1.0a1.dist-info/METADATA +301 -0
  47. pysofra-0.1.0a1.dist-info/RECORD +50 -0
  48. pysofra-0.1.0a1.dist-info/WHEEL +4 -0
  49. pysofra-0.1.0a1.dist-info/licenses/LICENSE +674 -0
  50. pysofra-0.1.0a1.dist-info/licenses/NOTICE +18 -0
@@ -0,0 +1,214 @@
1
+ """Survey-weight calibration — post-stratification and raking.
2
+
3
+ Both algorithms scale the design weights so that the weighted marginal
4
+ totals over selected variables match supplied population targets.
5
+
6
+ * :func:`post_stratify` solves the case where the calibration variables
7
+ partition the sample into a single cross-classification (e.g. age × sex
8
+ cells). The new weight for each row is the design weight multiplied
9
+ by the cell-level ratio of population total to weighted sample total.
10
+
11
+ * :func:`rake` (a.k.a. iterative proportional fitting) handles the more
12
+ common case where targets are given for several variables marginally
13
+ but not for their joint cross-classification. Weights are scaled one
14
+ variable at a time, repeatedly, until convergence.
15
+
16
+ Both functions return a new pandas Series of calibrated weights with the
17
+ same index as the input. Use the result as the ``weights`` column of a
18
+ :class:`SurveyDesign`.
19
+ """
20
+
21
+ from __future__ import annotations
22
+
23
+ from collections.abc import Mapping
24
+
25
+ import numpy as np
26
+ import pandas as pd
27
+
28
+
29
def post_stratify(
    data: pd.DataFrame,
    base_weights: pd.Series | str,
    *,
    strata_cols: list[str] | tuple[str, ...],
    targets: Mapping[tuple[object, ...], float] | pd.Series,
) -> pd.Series:
    """Post-stratification calibration over a complete cross-classification.

    Each row's weight is multiplied by ``target / weighted_total`` of the
    stratum it falls in, where ``weighted_total`` is the sum of the base
    weights over that stratum.

    Parameters
    ----------
    data
        Source dataframe.
    base_weights
        Either the column name of design weights in ``data`` or a Series
        aligned to ``data.index``. Values that cannot be parsed as numbers
        are coerced to ``NaN``.
    strata_cols
        One or more columns whose Cartesian product defines the
        post-strata.
    targets
        Population totals for each stratum. Accepts either:

        * a ``dict``-like keyed by tuples whose length equals
          ``len(strata_cols)`` (e.g. ``{('M', '<50'): 1200, ...}``), or
        * a ``pandas.Series`` indexed by those tuples
          (a ``MultiIndex.Series``).

        With a single strata column, scalar keys are also accepted and are
        wrapped into 1-tuples internally.

    Returns
    -------
    pandas.Series
        Calibrated weights, aligned to ``data.index``. Strata whose
        weighted sample total is not positive receive a scale factor of
        ``0.0``.

    Raises
    ------
    ValueError
        When ``strata_cols`` is empty.
    KeyError
        When a stratum present in the data is missing from ``targets``.
    """
    # Non-numeric weights become NaN rather than raising.
    if isinstance(base_weights, str):
        bw = pd.to_numeric(data[base_weights], errors="coerce").astype(float)
    else:
        bw = pd.to_numeric(base_weights, errors="coerce").astype(float)

    strata_cols = list(strata_cols)
    if not strata_cols:
        raise ValueError("post_stratify requires at least one strata column.")
    # One stratum key per row: a tuple of the column values when several
    # columns are given, the bare value when there is only one.
    key = data[strata_cols].apply(
        lambda row: tuple(row.tolist()) if len(strata_cols) > 1 else row.iloc[0],
        axis=1,
    )
    # Weighted sample total of the base weights within each stratum.
    weighted_totals = bw.groupby(key).sum()

    targets_dict = (
        targets.to_dict()
        if isinstance(targets, pd.Series)
        else dict(targets)
    )
    # Allow scalar keys when there's only one strata column.
    # NOTE(review): the nesting of the next three statements was
    # reconstructed from a flattened source; confirm against the original.
    if len(strata_cols) == 1:
        targets_dict = {(k if isinstance(k, tuple) else (k,)): v
                        for k, v in targets_dict.items()}
        # Bring the row keys and the totals index to the same 1-tuple form
        # so every lookup below uses tuple keys.
        key = key.map(lambda x: x if isinstance(x, tuple) else (x,))
        weighted_totals.index = [
            (i if isinstance(i, tuple) else (i,))
            for i in weighted_totals.index
        ]

    # Every stratum observed in the data must have a population target.
    missing = [
        k for k in weighted_totals.index
        if (k if isinstance(k, tuple) else (k,)) not in targets_dict
    ]
    if missing:
        # Only the first five offending strata are shown.
        raise KeyError(f"post_stratify: targets missing strata {missing[:5]}...")

    # Per-stratum multiplier; a non-positive weighted total maps to 0.0
    # instead of dividing by zero.
    scale_map = {
        stratum: float(targets_dict[stratum if isinstance(stratum, tuple) else (stratum,)])
        / float(total)
        if total > 0 else 0.0
        for stratum, total in weighted_totals.items()
    }
    # Rows whose key has no entry in scale_map come out as NaN from the map.
    return bw * key.map(scale_map).astype(float)
109
+
110
+
111
def rake(
    data: pd.DataFrame,
    base_weights: pd.Series | str,
    *,
    margins: Mapping[str, Mapping[object, float]],
    max_iter: int = 50,
    tol: float = 1e-6,
) -> pd.Series:
    """Raking (iterative proportional fitting) over marginal targets.

    Parameters
    ----------
    data
        Source dataframe.
    base_weights
        Either the column name in ``data`` or an aligned Series. Values
        that cannot be parsed as numbers are coerced to ``NaN``.
    margins
        Mapping of *variable → {level: target_total}*. Within one sweep
        every level of every variable is rescaled in turn so that its
        weighted total equals the target; sweeps repeat until the weights
        stabilise.
    max_iter
        Maximum number of full sweeps over ``margins``.
    tol
        Convergence threshold on the largest relative change in any
        weight during a sweep. All weights of a level are multiplied by
        the same factor, so this equals the largest ``|factor - 1|``.

    Returns
    -------
    pandas.Series
        Calibrated weights aligned to ``data.index``. When ``margins`` is
        empty a copy of the (numeric) base weights is returned.

    Raises
    ------
    KeyError
        When a margin column is absent from ``data`` or a level present
        in the data has no target.
    """
    if isinstance(base_weights, str):
        w = pd.to_numeric(data[base_weights], errors="coerce").astype(float)
    else:
        w = pd.to_numeric(base_weights, errors="coerce").astype(float)
    w = w.copy()

    if not margins:
        return w

    # Validate columns / targets up front.
    for col, target_levels in margins.items():
        if col not in data.columns:
            raise KeyError(f"rake: column {col!r} not in data")
        present = set(data[col].dropna().unique())
        missing_targets = present - set(target_levels)
        if missing_targets:
            # Levels of mixed types (e.g. 1 and "a") are not mutually
            # orderable; fall back to ordering by repr so the caller still
            # gets the intended KeyError rather than a TypeError.
            try:
                shown = sorted(missing_targets)
            except TypeError:
                shown = sorted(missing_targets, key=repr)
            raise KeyError(
                f"rake: column {col!r} has levels with no target: {shown[:5]}"
            )

    # The row membership of each (column, level) cell never changes between
    # sweeps, so build the boolean masks once instead of once per sweep.
    cells = [
        (data[col] == lvl, target)
        for col, target_levels in margins.items()
        for lvl, target in target_levels.items()
    ]

    for _ in range(max_iter):
        max_rel = 0.0
        for mask, target in cells:
            current = w[mask]
            total = float(current.sum())
            if total <= 0:
                # Level absent from the data (or carrying no weight):
                # nothing to rescale.
                continue
            factor = float(target) / total
            w.loc[mask] = current * factor
            # Every weight in this level changed by the relative amount
            # |factor - 1|; track the worst one for convergence.
            max_rel = max(max_rel, abs(factor - 1.0))
        if max_rel < tol:
            break

    return w
181
+
182
+
183
def design_effect(weights: pd.Series) -> float:
    """Kish's design-effect estimate: ``DEFF ≈ n · Σw² / (Σw)²``.

    Intended as a quick post-calibration sanity check: a DEFF far above 1
    signals highly variable weights and therefore a small effective
    sample size.

    Only strictly positive weights enter the computation. Values that
    cannot be parsed as numbers are dropped silently; negative weights
    are dropped with a ``UserWarning`` stating how many were found.
    Returns ``nan`` when no positive weight remains.
    """
    numeric = pd.to_numeric(weights, errors="coerce").dropna()

    negative_count = int((numeric < 0).sum())
    if negative_count != 0:
        import warnings

        message = (
            f"design_effect: weights column contains {negative_count} negative "
            "value(s); rows with negative weight are excluded from the "
            "design-effect estimate."
        )
        warnings.warn(message, UserWarning, stacklevel=2)

    positive = numeric[numeric > 0]
    if positive.empty:
        return float("nan")

    sum_of_squares = (positive ** 2).sum()
    squared_sum = positive.sum() ** 2
    return float(len(positive) * sum_of_squares / squared_sum)
211
+
212
+
213
+ # silence unused-import lint
214
+ _ = np
@@ -0,0 +1,246 @@
1
+ """Survey design object for variance estimation under complex sampling.
2
+
3
+ The :class:`SurveyDesign` dataclass mirrors the headline fields of R's
4
+ ``survey::svydesign``:
5
+
6
+ * ``weights`` — column carrying the sampling weight for each row.
7
+ * ``strata`` — optional stratification variable. Within each stratum,
8
+ PSUs are assumed independent; variance is summed across strata.
9
+ * ``cluster`` — optional primary-sampling-unit (PSU) variable. When
10
+ given, the variance of any estimator is computed across cluster
11
+ totals rather than individual observations (Taylor linearization
12
+ for the mean).
13
+ * ``fpc`` — optional finite-population-correction column (population
14
+ size in each stratum, or per-cluster if no strata). Used to scale
15
+ the variance by ``(1 - n/N)`` per stratum.
16
+
17
+ This is a *first-order* implementation: it covers what the vast
18
+ majority of survey-weighted clinical / epidemiology pipelines need
19
+ (stratified single-stage and clustered single-stage designs with FPC).
20
+ Multi-stage designs and post-stratification calibration remain on the
21
+ roadmap.
22
+ """
23
+
24
+ from __future__ import annotations
25
+
26
+ from dataclasses import dataclass
27
+
28
+ import numpy as np
29
+ import pandas as pd
30
+
31
+
32
@dataclass(frozen=True)
class SurveyDesign:
    """Column-name bundle describing a survey-design structure.

    The instance stores only column *names*; :meth:`validate` checks them
    against a concrete dataframe.

    ``cluster`` accepts either a single column name (single-stage
    cluster sampling) or a tuple of names (multi-stage). For a tuple,
    :attr:`primary_cluster` exposes the first (outermost) name; full
    multi-stage Taylor linearisation is planned.

    ``replicate_weights`` names the replicate-weight columns used by the
    jackknife / bootstrap family of variance estimators, where the
    variance is ``scale * Σ (θ̂_r − θ̂)²``. There is no separate
    ``replicate_scale`` field: the scale follows from ``replicate_type``
    (see :func:`replicate_mean_var`, which uses ``1/R`` for
    ``"bootstrap"`` and ``(R − 1)/R`` otherwise, ``R`` being the number
    of replicates).
    """

    # Column holding the sampling weight of each row (required).
    weights: str
    # Optional stratification column.
    strata: str | None = None
    # Optional PSU column, or a tuple of columns from outermost to innermost.
    cluster: str | tuple[str, ...] | None = None
    # Optional finite-population-correction column.
    fpc: str | None = None
    # Optional names of the replicate-weight columns.
    replicate_weights: tuple[str, ...] | None = None
    replicate_type: str = "jk1"  # 'jk1' | 'jkn' | 'bootstrap'

    @property
    def primary_cluster(self) -> str | None:
        """Return the outermost cluster column name, or ``None`` if unset.

        An empty tuple is treated like ``None``.
        """
        if self.cluster is None:
            return None
        if isinstance(self.cluster, tuple):
            return self.cluster[0] if self.cluster else None
        return self.cluster

    def validate(self, data: pd.DataFrame) -> None:
        """Check that every configured column exists in ``data``.

        Raises
        ------
        KeyError
            When a weights, strata, fpc, cluster, or replicate-weight
            column is not in ``data.columns``.
        ValueError
            When ``replicate_type`` is not one of ``'jk1'``, ``'jkn'``,
            ``'bootstrap'``.
        """
        # Single-name fields: only the ones actually set are checked.
        for name, col in (("weights", self.weights),
                          ("strata", self.strata),
                          ("fpc", self.fpc)):
            if col is not None and col not in data.columns:
                raise KeyError(f"{name} column {col!r} not in data")
        if self.cluster is not None:
            # Normalise the str / tuple union to a tuple for one loop.
            cluster_cols = (
                self.cluster if isinstance(self.cluster, tuple)
                else (self.cluster,)
            )
            for c in cluster_cols:
                if c not in data.columns:
                    raise KeyError(f"cluster column {c!r} not in data")
        if self.replicate_weights is not None:
            missing = [c for c in self.replicate_weights if c not in data.columns]
            if missing:
                raise KeyError(f"replicate_weights columns not in data: {missing}")
            # NOTE(review): nesting reconstructed from a flattened source —
            # confirm whether this check belongs inside the
            # replicate_weights branch or at method level.
            if self.replicate_type not in ("jk1", "jkn", "bootstrap"):
                raise ValueError(
                    f"replicate_type must be 'jk1', 'jkn', or 'bootstrap'; "
                    f"got {self.replicate_type!r}."
                )
87
+
88
+
89
+ # ----------------------------------------------------------------------
90
+ # Variance estimators
91
+ # ----------------------------------------------------------------------
92
+
93
+
94
def _scaled_centred_ss(unit_totals: np.ndarray) -> float:
    """Return ``m/(m-1) · Σ_j (t_j − t̄)²`` over *unit_totals* (0.0 if m < 2)."""
    m = int(unit_totals.size)
    if m < 2:
        # A single sampling unit carries no information about variability.
        return 0.0
    deviations = unit_totals - float(unit_totals.mean())
    return float(np.sum(deviations ** 2)) * (m / (m - 1))


def design_mean_var(
    values: pd.Series,
    weights: pd.Series,
    *,
    strata: pd.Series | None = None,
    cluster: pd.Series | None = None,
    fpc: pd.Series | None = None,
) -> tuple[float, float, float]:
    """Estimate the survey-weighted mean and its design-based variance.

    Returns ``(mean, variance, total_weight)`` where ``total_weight`` is
    the sum of the weights of the rows actually used. Rows with a missing
    value, a missing weight, or a non-positive weight are dropped first.
    When nothing usable remains the result is ``(nan, nan, 0.0)``.

    The variance uses Taylor-series linearisation with residuals
    ``e_i = w_i (y_i − ŷ)``:

        Var(ŷ) = Σ_h (1 − f_h) · (n_h / (n_h − 1)) · Σ_{j in h} (t_hj − t̄_h)²
                 / (Σ w_i)²

    where ``t_hj`` is the residual total of sampling unit ``j`` in stratum
    ``h`` — a single observation without clusters, a PSU total with
    clusters — and ``t̄_h`` is the mean of those totals within the
    stratum. Centring within the stratum is what makes stratification
    remove between-stratum variation; without strata the whole sample is
    one stratum.

    Parameters
    ----------
    values, weights
        Aligned Series of observations and sampling weights.
    strata
        Optional stratum labels aligned to ``values``.
    cluster
        Optional PSU labels aligned to ``values``.
    fpc
        Optional per-row population size. Only applied when ``strata`` is
        given: the first non-missing value in each stratum is used as
        ``N_h`` and the contribution is scaled by ``1 − n_h / N_h``
        (``n_h`` being the number of retained rows in the stratum,
        capped so the factor never goes negative). A stratum whose FPC
        values are all missing gets no correction.
    """
    v = pd.to_numeric(values, errors="coerce").astype(float)
    w = pd.to_numeric(weights, errors="coerce").astype(float)
    mask = ~(v.isna() | w.isna()) & (w > 0)
    v = v[mask].reset_index(drop=True)
    w = w[mask].reset_index(drop=True)
    # Auxiliary series are filtered with the same mask and re-indexed so
    # positional lookups below line up with ``v`` / ``w``.
    if strata is not None:
        strata = strata[mask].reset_index(drop=True)
    if cluster is not None:
        cluster = cluster[mask].reset_index(drop=True)
    if fpc is not None:
        fpc = pd.to_numeric(fpc, errors="coerce")[mask].reset_index(drop=True)

    total_w = float(w.sum())
    if total_w <= 0 or v.size == 0:
        return float("nan"), float("nan"), 0.0

    mean = float((w * v).sum() / total_w)
    n = int(v.size)

    # Residuals for the mean estimator.
    e = w.to_numpy() * (v.to_numpy() - mean)

    if strata is None and cluster is None:
        # The residuals sum to zero by construction, so no explicit
        # centring is needed here.
        var_num = float(np.sum(e ** 2)) * (n / max(n - 1, 1))
    elif strata is None:
        # Single-stage clusters; sum residuals within each cluster, take
        # variance across cluster totals.
        var_num = _scaled_centred_ss(
            pd.Series(e).groupby(cluster).sum().to_numpy()
        )
    else:
        # Stratified, possibly with clusters within strata.
        var_num = 0.0
        strata_series = pd.Series(strata)
        for idx in strata_series.groupby(strata_series).indices.values():
            idx_arr = np.asarray(idx)
            e_h = e[idx_arr]
            if cluster is not None:
                c_h = cluster.iloc[idx_arr].to_numpy()
                unit_totals = pd.Series(e_h).groupby(c_h).sum().to_numpy()
            else:
                # Each observation is its own sampling unit.
                unit_totals = e_h
            contrib = _scaled_centred_ss(unit_totals)

            if fpc is not None and idx_arr.size > 0:
                known = fpc.iloc[idx_arr].dropna()
                # A missing FPC means "no correction"; letting NaN flow
                # into min()/max() would silently zero the stratum.
                if not known.empty:
                    fpc_h = float(known.iloc[0])
                    # FPC = 1 - n/N
                    f_h = min(1.0, idx_arr.size / max(fpc_h, 1.0))
                    contrib *= 1.0 - f_h

            var_num += contrib

    variance = var_num / (total_w ** 2)
    return mean, variance, total_w
192
+
193
+
194
+ # ----------------------------------------------------------------------
195
+ # Replicate-weight variance
196
+ # ----------------------------------------------------------------------
197
+
198
+
199
def replicate_mean_var(
    values: pd.Series,
    base_weights: pd.Series,
    replicate_weights: list[pd.Series] | tuple[pd.Series, ...],
    *,
    replicate_type: str = "jk1",
) -> tuple[float, float, float]:
    """Weighted mean and its replicate-weight variance.

    Returns ``(mean, variance, total_weight)``. The point estimate is the
    ``base_weights``-weighted mean; every replicate weight vector yields
    its own mean ``θ̂_r`` and the variance is

        Var(θ̂) = c · Σ_r (θ̂_r − θ̂)²

    with ``c = 1/R`` when ``replicate_type`` is ``"bootstrap"`` and
    ``c = (R − 1)/R`` for anything else (``"jk1"``, ``"jkn"``), ``R``
    being the number of replicate vectors supplied.

    Rows with a missing value, a missing base weight, or a non-positive
    base weight are dropped. Non-positive or missing replicate weights
    count as zero, and a replicate whose weights sum to zero adds nothing
    to the sum (it still counts towards ``R``). With no usable rows the
    result is ``(nan, nan, 0.0)``; with no replicates the variance is
    ``0.0``.
    """
    y = pd.to_numeric(values, errors="coerce").astype(float)
    full_w = pd.to_numeric(base_weights, errors="coerce").astype(float)
    mask = ~(y.isna() | full_w.isna()) & (full_w > 0)
    y = y[mask].reset_index(drop=True)
    full_w = full_w[mask].reset_index(drop=True)

    weight_sum = float(full_w.sum())
    if weight_sum <= 0 or y.empty:
        return float("nan"), float("nan"), 0.0
    theta_hat = float((y * full_w).sum() / weight_sum)

    n_replicates = len(replicate_weights)
    if n_replicates == 0:
        return theta_hat, 0.0, weight_sum

    accumulated = 0.0
    for replicate in replicate_weights:
        rep_w = pd.to_numeric(replicate, errors="coerce").astype(float)
        rep_w = rep_w[mask].reset_index(drop=True)
        # Anything that is not a strictly positive weight contributes zero.
        rep_w = rep_w.where(rep_w > 0, 0.0)
        rep_sum = float(rep_w.sum())
        if rep_sum > 0:
            theta_rep = float((y * rep_w).sum() / rep_sum)
            accumulated += (theta_rep - theta_hat) ** 2

    if replicate_type == "bootstrap":
        scale = 1.0 / n_replicates
    else:
        scale = (n_replicates - 1.0) / n_replicates
    return theta_hat, scale * accumulated, weight_sum
@@ -0,0 +1,187 @@
1
+ """Effect-size helpers — Cohen's d, Hedges' g, Cramér's V, eta-squared.
2
+
3
+ Companion functions to the inferential tests in
4
+ :mod:`pysofra.summary.tests`. Effect sizes describe the *magnitude* of
5
+ a difference / association independently of sample size, and are
6
+ frequently requested alongside the p-values in clinical reports.
7
+
8
+ All functions accept aligned :class:`pandas.Series` for ``values`` /
9
+ ``groups``; missing values are dropped pairwise. They return floats
10
+ (or ``None`` for degenerate input).
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import math
16
+
17
+ import numpy as np
18
+ import pandas as pd
19
+
20
+ # ----------------------------------------------------------------------
21
+ # Continuous
22
+ # ----------------------------------------------------------------------
23
+
24
def cohen_d(a: pd.Series | np.ndarray, b: pd.Series | np.ndarray) -> float | None:
    """Cohen's d using the pooled standard deviation.

    ``d = (μ₁ − μ₂) / s_pool``, where the pooled SD weights the two
    samples by their degrees of freedom. Non-numeric and missing entries
    are dropped from each sample independently.

    Returns ``None`` when either sample has fewer than two usable values.
    When both samples are constant (``s_pool == 0``) the result is ``0.0``
    for equal means and ``±inf`` otherwise, with the sign of ``μ₁ − μ₂``.
    """
    a_arr = pd.to_numeric(pd.Series(a), errors="coerce").dropna().to_numpy(dtype=float)
    b_arr = pd.to_numeric(pd.Series(b), errors="coerce").dropna().to_numpy(dtype=float)
    n_a, n_b = a_arr.size, b_arr.size
    if n_a < 2 or n_b < 2:
        return None
    diff = float(a_arr.mean()) - float(b_arr.mean())
    v_a = float(np.var(a_arr, ddof=1))
    v_b = float(np.var(b_arr, ddof=1))
    s_pool = math.sqrt(((n_a - 1) * v_a + (n_b - 1) * v_b) / (n_a + n_b - 2))
    if s_pool == 0:
        if diff == 0:
            return 0.0
        # Keep the direction of the difference: a smaller first sample
        # must give -inf, matching the sign d has in the regular case.
        return math.copysign(math.inf, diff)
    return diff / s_pool
41
+
42
+
43
def hedges_g(a: pd.Series | np.ndarray, b: pd.Series | np.ndarray) -> float | None:
    """Hedges' g — Cohen's d shrunk by the small-sample correction factor.

    ``g = d · J`` with ``J ≈ 1 − 3/(4(n_a+n_b) − 9)`` (Hedges 1981), where
    the sample sizes count only numeric, non-missing entries. A ``None``
    or infinite ``d`` from :func:`cohen_d` is passed through unchanged.
    """
    effect = cohen_d(a, b)
    if effect is None:
        return effect
    if math.isinf(effect):
        return effect

    # Usable sample sizes, counted the same way cohen_d cleans its input.
    usable_counts = [
        int(pd.to_numeric(pd.Series(sample), errors="coerce").dropna().size)
        for sample in (a, b)
    ]
    correction_denominator = 4 * (usable_counts[0] + usable_counts[1]) - 9
    if correction_denominator <= 0:  # pragma: no cover — unreachable given cohen_d's n>=2 guard
        return effect
    return effect * (1.0 - 3.0 / correction_denominator)
58
+
59
+
60
def eta_squared(values: pd.Series, groups: pd.Series) -> float | None:
    """One-way ANOVA effect size: ``SS_between / SS_total``.

    Lies in ``[0, 1]``; Cohen (1988) benchmarks are roughly 0.01 (small),
    0.06 (medium) and 0.14 (large). Rows where the value is non-numeric
    or either entry is missing are dropped. Returns ``None`` when no row
    remains and ``0.0`` when the values have no spread at all.
    """
    numeric_values = pd.to_numeric(values, errors="coerce")
    frame = pd.DataFrame({"v": numeric_values, "g": groups}).dropna()
    if frame.empty:
        return None

    grand_mean = float(frame["v"].mean())

    def _between_term(group_values):
        # Group size times squared distance of the group mean from the grand mean.
        return group_values.size * (group_values.mean() - grand_mean) ** 2

    per_group = frame.groupby("g")["v"].apply(_between_term)
    ss_between = float(per_group.sum())

    squared_deviations = (frame["v"] - grand_mean) ** 2
    ss_total = float(squared_deviations.sum())
    if ss_total <= 0:
        return 0.0
    return ss_between / ss_total
78
+
79
+
80
def omega_squared(values: pd.Series, groups: pd.Series) -> float | None:
    """Less-biased counterpart to ``eta_squared`` (Hays 1973).

    ``ω² = (SS_between − (k − 1)·MS_within) / (SS_total + MS_within)``,
    floored at ``0.0``. Returns ``None`` when no complete row remains,
    when there is a single group, or when there are no within-group
    degrees of freedom; returns ``0.0`` when the values have no spread.
    """
    numeric_values = pd.to_numeric(values, errors="coerce")
    frame = pd.DataFrame({"v": numeric_values, "g": groups}).dropna()
    if frame.empty:
        return None

    n_groups = int(frame["g"].nunique())
    n_obs = int(frame.shape[0])
    if n_groups <= 1 or n_obs - n_groups <= 0:
        return None

    grand_mean = float(frame["v"].mean())

    def _between_term(group_values):
        # Group size times squared distance of the group mean from the grand mean.
        return group_values.size * (group_values.mean() - grand_mean) ** 2

    ss_between = float(frame.groupby("g")["v"].apply(_between_term).sum())
    ss_total = float(((frame["v"] - grand_mean) ** 2).sum())
    if ss_total <= 0:
        return 0.0

    ms_within = (ss_total - ss_between) / (n_obs - n_groups)
    numerator = ss_between - (n_groups - 1) * ms_within
    estimate = numerator / (ss_total + ms_within)
    # The raw estimator can dip below zero for weak effects.
    return float(max(0.0, estimate))
100
+
101
+
102
+ # ----------------------------------------------------------------------
103
+ # Categorical
104
+ # ----------------------------------------------------------------------
105
+
106
def cramers_v(values: pd.Series, groups: pd.Series) -> float | None:
    """Cramér's V — chi-square effect size normalised to ``[0, 1]``.

    ``V = √(χ² / (N · (min(R, C) − 1)))`` computed from the uncorrected
    chi-square statistic of the ``values`` × ``groups`` contingency
    table. Rows with a missing entry are dropped. Returns ``None`` when
    nothing remains or the table has fewer than two rows or columns.
    """
    import warnings as _w

    from scipy import stats as sp_stats

    paired = pd.DataFrame({"v": values, "g": groups}).dropna()
    if paired.empty:
        return None

    table = pd.crosstab(paired["v"], paired["g"])
    n_rows, n_cols = table.shape
    if n_rows < 2 or n_cols < 2:
        return None

    # Silence numeric noise from the chi-square computation.
    with np.errstate(invalid="ignore", over="ignore", divide="ignore"), \
            _w.catch_warnings():
        _w.simplefilter("ignore", RuntimeWarning)
        chi2, _p, _dof, _expected = sp_stats.chi2_contingency(
            table.to_numpy(), correction=False
        )

    total = float(table.values.sum())
    if total <= 0:  # pragma: no cover — guarded by the shape >= 2x2 check above
        return None
    smaller_dim = min(table.shape) - 1
    if smaller_dim <= 0:  # pragma: no cover — shape >= 2x2 guarantees min_dim >= 1
        return None
    return float(math.sqrt(chi2 / (total * smaller_dim)))
131
+
132
+
133
def phi_coefficient(values: pd.Series, groups: pd.Series) -> float | None:
    """Phi — Cramér's V special case for 2×2 tables; ``φ = √(χ²/N)``.

    Uses the uncorrected chi-square statistic, so the result is
    non-negative. Rows with a missing entry are dropped. Returns ``None``
    when nothing remains or the contingency table is not exactly 2×2.
    """
    import warnings as _w

    from scipy import stats as sp_stats

    paired = pd.DataFrame({"v": values, "g": groups}).dropna()
    if paired.empty:
        return None

    table = pd.crosstab(paired["v"], paired["g"])
    if table.shape != (2, 2):
        return None

    # Silence numeric noise from the chi-square computation.
    with np.errstate(invalid="ignore", over="ignore", divide="ignore"), \
            _w.catch_warnings():
        _w.simplefilter("ignore", RuntimeWarning)
        chi2, _p, _dof, _expected = sp_stats.chi2_contingency(
            table.to_numpy(), correction=False
        )

    total = float(table.values.sum())
    if total <= 0:  # pragma: no cover — guarded by the (2,2) shape check above
        return None
    return float(math.sqrt(chi2 / total))
152
+
153
+
154
+ # ----------------------------------------------------------------------
155
+ # Auto dispatch (mirrors auto-test selection)
156
+ # ----------------------------------------------------------------------
157
+
158
+
159
def auto_effect_size(values: pd.Series, groups: pd.Series) -> tuple[str, float | None]:
    """Choose an effect size from the variable kind and the number of groups.

    Returns ``(name, value)`` so the caller can show the number and label
    it. ``values`` counts as continuous when ``pd.to_numeric`` accepts
    every entry. The dispatch is:

    * continuous, 2 groups — ``"Cohen's d"``
    * continuous, 3+ groups — ``"η²"``
    * not continuous, 2+ groups — ``"φ"`` for a 2×2 table, otherwise
      ``"Cramér's V"``
    * anything else — ``("—", None)``
    """
    levels = pd.Series(groups).dropna().unique()
    level_count = len(levels)

    # Continuous-looking?
    try:
        pd.to_numeric(values, errors="raise")
    except (ValueError, TypeError):
        is_continuous = False
    else:
        is_continuous = True

    if is_continuous:
        if level_count == 2:
            first_level, second_level = levels[0], levels[1]
            return "Cohen's d", cohen_d(
                values[groups == first_level],
                values[groups == second_level],
            )
        if level_count >= 3:
            return "η²", eta_squared(values, groups)
        return "—", None

    if level_count >= 2:
        table = pd.crosstab(values, groups)
        if table.shape == (2, 2):
            return "φ", phi_coefficient(values, groups)
        return "Cramér's V", cramers_v(values, groups)
    return "—", None