pysofra 0.1.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pysofra/__init__.py +82 -0
- pysofra/core/__init__.py +14 -0
- pysofra/core/compose.py +167 -0
- pysofra/core/format.py +155 -0
- pysofra/core/frames.py +69 -0
- pysofra/core/schema.py +128 -0
- pysofra/core/table.py +924 -0
- pysofra/io/__init__.py +1 -0
- pysofra/models/__init__.py +6 -0
- pysofra/models/extract.py +249 -0
- pysofra/models/pool.py +119 -0
- pysofra/models/regression.py +507 -0
- pysofra/models/survival.py +395 -0
- pysofra/models/uvregression.py +438 -0
- pysofra/notebook/__init__.py +6 -0
- pysofra/plot/__init__.py +23 -0
- pysofra/plot/_backend.py +32 -0
- pysofra/plot/forest.py +159 -0
- pysofra/plot/inline.py +171 -0
- pysofra/plot/km.py +249 -0
- pysofra/render/__init__.py +28 -0
- pysofra/render/_zip_determinism.py +57 -0
- pysofra/render/base.py +22 -0
- pysofra/render/docx.py +286 -0
- pysofra/render/html.py +442 -0
- pysofra/render/image.py +130 -0
- pysofra/render/latex.py +253 -0
- pysofra/render/markdown.py +128 -0
- pysofra/render/pptx.py +340 -0
- pysofra/render/xlsx.py +226 -0
- pysofra/summary/__init__.py +6 -0
- pysofra/summary/calibrate.py +214 -0
- pysofra/summary/design.py +246 -0
- pysofra/summary/effect_size.py +187 -0
- pysofra/summary/extras.py +745 -0
- pysofra/summary/smd.py +133 -0
- pysofra/summary/stats.py +135 -0
- pysofra/summary/tbl_cross.py +339 -0
- pysofra/summary/tbl_one.py +1220 -0
- pysofra/summary/tbl_summary.py +51 -0
- pysofra/summary/tests.py +370 -0
- pysofra/summary/typing.py +129 -0
- pysofra/summary/weights.py +161 -0
- pysofra/themes/__init__.py +5 -0
- pysofra/themes/registry.py +272 -0
- pysofra-0.1.0a1.dist-info/METADATA +301 -0
- pysofra-0.1.0a1.dist-info/RECORD +50 -0
- pysofra-0.1.0a1.dist-info/WHEEL +4 -0
- pysofra-0.1.0a1.dist-info/licenses/LICENSE +674 -0
- pysofra-0.1.0a1.dist-info/licenses/NOTICE +18 -0
|
@@ -0,0 +1,214 @@
|
|
|
1
|
+
"""Survey-weight calibration — post-stratification and raking.
|
|
2
|
+
|
|
3
|
+
Both algorithms scale the design weights so that the weighted marginal
|
|
4
|
+
totals over selected variables match supplied population targets.
|
|
5
|
+
|
|
6
|
+
* :func:`post_stratify` solves the case where the calibration variables
|
|
7
|
+
partition the sample into a single cross-classification (e.g. age × sex
|
|
8
|
+
cells). The new weight for each row is the design weight multiplied
|
|
9
|
+
by the cell-level ratio of population total to weighted sample total.
|
|
10
|
+
|
|
11
|
+
* :func:`rake` (a.k.a. iterative proportional fitting) handles the more
|
|
12
|
+
common case where targets are given for several variables marginally
|
|
13
|
+
but not for their joint cross-classification. Weights are scaled one
|
|
14
|
+
variable at a time, repeatedly, until convergence.
|
|
15
|
+
|
|
16
|
+
Both functions return a new pandas Series of calibrated weights with the
|
|
17
|
+
same index as the input. Use the result as the ``weights`` column of a
|
|
18
|
+
:class:`SurveyDesign`.
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
from collections.abc import Mapping
|
|
24
|
+
|
|
25
|
+
import numpy as np
|
|
26
|
+
import pandas as pd
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def post_stratify(
    data: pd.DataFrame,
    base_weights: pd.Series | str,
    *,
    strata_cols: list[str] | tuple[str, ...],
    targets: Mapping[tuple[object, ...], float] | pd.Series,
) -> pd.Series:
    """Post-stratification calibration over a complete cross-classification.

    Every row's design weight is multiplied by
    ``target_total / weighted_sample_total`` of the post-stratum the row
    belongs to, so that the calibrated weights sum to the target within
    each stratum.

    Parameters
    ----------
    data
        Source dataframe.
    base_weights
        Either the column name of design weights in ``data`` or a Series
        aligned to ``data.index``. Values are coerced to float;
        non-numeric entries become ``NaN``.
    strata_cols
        One or more columns whose observed combinations define the
        post-strata.
    targets
        Population totals for each stratum. Accepts either:

        * a ``dict``-like keyed by tuples whose length equals
          ``len(strata_cols)`` (e.g. ``{('M', '<50'): 1200, ...}``), or
        * a ``pandas.Series`` indexed by those tuples.

        With a single strata column, scalar keys (``{'M': 1200}``) are
        accepted as well.

    Returns
    -------
    pandas.Series
        Calibrated weights, aligned to ``data.index``. Rows whose stratum
        has a non-positive weighted total get weight ``0.0``; with a
        single strata column, rows whose stratum label is missing get
        ``NaN``.

    Raises
    ------
    ValueError
        When ``strata_cols`` is empty.
    KeyError
        When a stratum present in the data is missing from ``targets``.
    """

    def _as_key(label: object) -> tuple[object, ...]:
        # Strata are always looked up as tuples, even for a single column.
        return label if isinstance(label, tuple) else (label,)

    source = data[base_weights] if isinstance(base_weights, str) else base_weights
    bw = pd.to_numeric(source, errors="coerce").astype(float)

    strata_cols = list(strata_cols)
    if not strata_cols:
        raise ValueError("post_stratify requires at least one strata column.")
    single = len(strata_cols) == 1

    # Build one label per row column-wise; this avoids the per-row Series
    # construction that ``DataFrame.apply(axis=1)`` performs.
    if single:
        labels = data[strata_cols[0]].tolist()
    else:
        labels = list(zip(*(data[col].tolist() for col in strata_cols)))
    key = pd.Series(labels, index=data.index, dtype=object)

    # With a single column, groupby drops rows whose label is missing;
    # tuple labels are never considered missing.
    weighted_totals = bw.groupby(key).sum()
    totals = {_as_key(stratum): total for stratum, total in weighted_totals.items()}

    targets_dict = targets.to_dict() if isinstance(targets, pd.Series) else dict(targets)
    if single:
        # Allow scalar keys when there's only one strata column.
        targets_dict = {_as_key(k): v for k, v in targets_dict.items()}

    missing = [stratum for stratum in totals if stratum not in targets_dict]
    if missing:
        raise KeyError(f"post_stratify: targets missing strata {missing[:5]}...")

    scale_map = {
        stratum: (float(targets_dict[stratum]) / float(total) if total > 0 else 0.0)
        for stratum, total in totals.items()
    }
    # Plain dict lookup per row; labels that never formed a group map to NaN.
    factors = [scale_map.get(_as_key(label), float("nan")) for label in labels]
    return bw * pd.Series(factors, index=data.index, dtype=float)
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def rake(
    data: pd.DataFrame,
    base_weights: pd.Series | str,
    *,
    margins: Mapping[str, Mapping[object, float]],
    max_iter: int = 50,
    tol: float = 1e-6,
) -> pd.Series:
    """Raking (iterative proportional fitting) over marginal targets.

    Parameters
    ----------
    data
        Source dataframe.
    base_weights
        Either the column name in ``data`` or an aligned Series. Values
        are coerced to float; the input is never modified.
    margins
        Mapping of *variable → {level: target_total}*. Within one sweep
        the weights of every level of every variable are rescaled, in
        mapping order, so that the level's weighted total equals its
        target. Levels whose current weighted total is not positive are
        skipped. An empty mapping returns the base weights unchanged.
    max_iter
        Maximum number of full sweeps over ``margins``.
    tol
        Convergence threshold: iteration stops after the first sweep in
        which the largest relative change of any level's weighted total
        is below ``tol``.

    Returns
    -------
    pandas.Series
        Calibrated weights aligned to ``data.index``.

    Raises
    ------
    KeyError
        When a margin variable is not a column of ``data`` or a level
        observed in the data has no target.
    """
    source = data[base_weights] if isinstance(base_weights, str) else base_weights
    w = pd.to_numeric(source, errors="coerce").astype(float).copy()

    if not margins:
        return w

    # Validate columns / targets up front.
    for col, target_levels in margins.items():
        if col not in data.columns:
            raise KeyError(f"rake: column {col!r} not in data")
        present = set(data[col].dropna().unique())
        missing_targets = present - set(target_levels)
        if missing_targets:
            try:
                shown = sorted(missing_targets)
            except TypeError:
                # Mixed-type levels cannot be ordered; fall back to a stable
                # ordering so the intended KeyError is still raised.
                shown = sorted(missing_targets, key=repr)
            raise KeyError(
                f"rake: column {col!r} has levels with no target: {shown[:5]}"
            )

    # The row masks depend only on ``data``, so build them once instead of
    # once per sweep.
    cells = [
        (data[col] == lvl, target)
        for col, target_levels in margins.items()
        for lvl, target in target_levels.items()
    ]

    for _ in range(max_iter):
        max_rel = 0.0
        for mask, target in cells:
            old = w[mask]
            old_total = float(old.sum())
            if old_total <= 0:
                continue
            factor = float(target) / old_total
            w.loc[mask] = old * factor
            # Track the worst relative change for convergence.
            rel = abs((w[mask].sum() - old_total) / old_total)
            max_rel = max(max_rel, rel)
        if max_rel < tol:
            break

    return w
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def design_effect(weights: pd.Series) -> float:
    """Return Kish's design-effect estimate, ``DEFF ≈ n · Σw² / (Σw)²``.

    Intended as a quick quality check after calibration: a value well
    above 1 signals highly variable weights and therefore a low effective
    sample size.

    Weights are coerced to numeric and missing entries are dropped. Only
    strictly positive weights enter the computation (``n`` counts those
    rows). When negative weights are present a ``UserWarning`` reports
    how many there were. ``nan`` is returned when no positive weight
    remains.
    """
    numeric = pd.to_numeric(weights, errors="coerce").dropna()

    negative_count = int((numeric < 0).sum())
    if negative_count:
        import warnings

        warnings.warn(
            f"design_effect: weights column contains {negative_count} negative "
            "value(s); rows with negative weight are excluded from the "
            "design-effect estimate.",
            UserWarning,
            stacklevel=2,
        )

    positive = numeric[numeric > 0]
    if positive.empty:
        return float("nan")

    sum_of_squares = (positive ** 2).sum()
    squared_sum = positive.sum() ** 2
    return float(len(positive) * sum_of_squares / squared_sum)
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
# silence unused-import lint
|
|
214
|
+
_ = np
|
|
@@ -0,0 +1,246 @@
|
|
|
1
|
+
"""Survey design object for variance estimation under complex sampling.
|
|
2
|
+
|
|
3
|
+
The :class:`SurveyDesign` dataclass mirrors the headline fields of R's
|
|
4
|
+
``survey::svydesign``:
|
|
5
|
+
|
|
6
|
+
* ``weights`` — column carrying the sampling weight for each row.
|
|
7
|
+
* ``strata`` — optional stratification variable. Within each stratum,
|
|
8
|
+
PSUs are assumed independent; variance is summed across strata.
|
|
9
|
+
* ``cluster`` — optional primary-sampling-unit (PSU) variable. When
|
|
10
|
+
given, the variance of any estimator is computed across cluster
|
|
11
|
+
totals rather than individual observations (Taylor linearization
|
|
12
|
+
for the mean).
|
|
13
|
+
* ``fpc`` — optional finite-population-correction column (population
|
|
14
|
+
size in each stratum, or per-cluster if no strata). Used to scale
|
|
15
|
+
the variance by ``(1 - n/N)`` per stratum.
|
|
16
|
+
|
|
17
|
+
This is a *first-order* implementation: it covers what the vast
|
|
18
|
+
majority of survey-weighted clinical / epidemiology pipelines need
|
|
19
|
+
(stratified single-stage and clustered single-stage designs with FPC).
|
|
20
|
+
Multi-stage designs and post-stratification calibration remain on the
|
|
21
|
+
roadmap.
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
from __future__ import annotations
|
|
25
|
+
|
|
26
|
+
from dataclasses import dataclass
|
|
27
|
+
|
|
28
|
+
import numpy as np
|
|
29
|
+
import pandas as pd
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dataclass(frozen=True)
class SurveyDesign:
    """Immutable bundle of column names describing a survey design.

    The object stores names only; it holds no data. ``validate`` checks
    the names against a concrete dataframe.

    ``cluster`` may be a single column name or a tuple of names.
    ``primary_cluster`` exposes the first name of a tuple (or the single
    name itself).

    ``replicate_weights`` lists the replicate-weight columns and
    ``replicate_type`` names the replicate scheme; ``validate`` accepts
    ``'jk1'``, ``'jkn'`` and ``'bootstrap'``.
    """

    # Column holding the sampling weight of each row (required).
    weights: str
    # Optional stratification column.
    strata: str | None = None
    # Optional cluster column, or a tuple of cluster columns.
    cluster: str | tuple[str, ...] | None = None
    # Optional finite-population-correction column.
    fpc: str | None = None
    # Optional replicate-weight column names.
    replicate_weights: tuple[str, ...] | None = None
    replicate_type: str = "jk1"  # 'jk1' | 'jkn' | 'bootstrap'

    @property
    def primary_cluster(self) -> str | None:
        """First cluster column, or ``None`` when no cluster is configured."""
        cluster = self.cluster
        if isinstance(cluster, tuple):
            # An empty tuple is treated like "no cluster".
            return cluster[0] if cluster else None
        return cluster

    def validate(self, data: pd.DataFrame) -> None:
        """Check that every configured column exists in ``data``.

        Raises
        ------
        KeyError
            When a weights / strata / fpc / cluster / replicate-weight
            column is not present in ``data``.
        ValueError
            When ``replicate_type`` is not one of the supported names.
        """
        columns = data.columns

        single_columns = {
            "weights": self.weights,
            "strata": self.strata,
            "fpc": self.fpc,
        }
        for name, col in single_columns.items():
            if col is not None and col not in columns:
                raise KeyError(f"{name} column {col!r} not in data")

        if self.cluster is not None:
            if isinstance(self.cluster, tuple):
                cluster_cols = self.cluster
            else:
                cluster_cols = (self.cluster,)
            for c in cluster_cols:
                if c not in columns:
                    raise KeyError(f"cluster column {c!r} not in data")

        if self.replicate_weights is not None:
            missing = [c for c in self.replicate_weights if c not in columns]
            if missing:
                raise KeyError(f"replicate_weights columns not in data: {missing}")

        if self.replicate_type not in ("jk1", "jkn", "bootstrap"):
            raise ValueError(
                f"replicate_type must be 'jk1', 'jkn', or 'bootstrap'; "
                f"got {self.replicate_type!r}."
            )
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
# ----------------------------------------------------------------------
|
|
90
|
+
# Variance estimators
|
|
91
|
+
# ----------------------------------------------------------------------
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def _scaled_psu_ss(residuals: np.ndarray, psu: pd.Series | np.ndarray | None = None) -> float:
    """Return ``n/(n-1) · Σ_j (t_j − t̄)²`` over PSU totals ``t_j`` (0.0 for fewer than two PSUs)."""
    if psu is None:
        # No clustering: every observation is its own PSU.
        totals = np.asarray(residuals, dtype=float)
    else:
        totals = pd.Series(residuals).groupby(psu).sum().to_numpy()
    n_psu = int(totals.size)
    if n_psu < 2:
        return 0.0
    centre = float(totals.mean())
    return float(np.sum((totals - centre) ** 2)) * (n_psu / (n_psu - 1))


def design_mean_var(
    values: pd.Series,
    weights: pd.Series,
    *,
    strata: pd.Series | None = None,
    cluster: pd.Series | None = None,
    fpc: pd.Series | None = None,
) -> tuple[float, float, float]:
    """Estimate the survey-weighted mean and its design-based variance.

    Rows with a missing value, a missing weight or a non-positive weight
    are dropped before anything is computed.

    With linearised residuals ``e_i = w_i (y_i − ŷ)`` the variance is

        Var(ŷ) = Σ_h (1 − f_h) · n_h / (n_h − 1) · Σ_j (t_hj − t̄_h)²
                 / (Σ w_i)²

    where ``t_hj`` is the residual total of PSU ``j`` in stratum ``h``
    (each observation is its own PSU when ``cluster`` is not given) and
    ``t̄_h`` is the mean of those totals within the stratum. Strata (or an
    unstratified clustered sample) with fewer than two PSUs contribute
    zero.

    Parameters
    ----------
    values, weights
        Aligned Series; both are coerced to float.
    strata
        Optional stratum label per row.
    cluster
        Optional PSU label per row.
    fpc
        Optional population size per row; only consulted when ``strata``
        is given. The first value found in each stratum is used and the
        sampling fraction is ``min(1, rows_in_stratum / max(N, 1))``. A
        missing (NaN) population size applies no correction to that
        stratum.

    Returns
    -------
    tuple[float, float, float]
        ``(mean, variance, total_weight)`` where ``total_weight`` is the
        sum of the retained weights. ``(nan, nan, 0.0)`` is returned when
        no usable row remains.
    """
    v = pd.to_numeric(values, errors="coerce").astype(float)
    w = pd.to_numeric(weights, errors="coerce").astype(float)
    mask = ~(v.isna() | w.isna()) & (w > 0)
    v = v[mask]
    w = w[mask]
    if strata is not None:
        strata = strata[mask].reset_index(drop=True)
    if cluster is not None:
        cluster = cluster[mask].reset_index(drop=True)
    if fpc is not None:
        fpc = pd.to_numeric(fpc, errors="coerce")[mask].reset_index(drop=True)
    v = v.reset_index(drop=True)
    w = w.reset_index(drop=True)

    total_w = float(w.sum())
    if total_w <= 0 or v.size == 0:
        return float("nan"), float("nan"), 0.0

    mean = float((w * v).sum() / total_w)
    n = int(v.size)

    # Residuals for the mean estimator.
    e = w.to_numpy() * (v.to_numpy() - mean)

    if strata is None and cluster is None:
        # The residuals sum to zero over the whole sample, so no explicit
        # centring is needed here.
        var_num = float(np.sum(e ** 2)) * (n / max(n - 1, 1))
    elif strata is None:
        # Single-stage clusters: variance across cluster residual totals.
        var_num = _scaled_psu_ss(e, cluster)
    else:
        # Stratified, possibly with clusters within strata.
        var_num = 0.0
        strata_series = pd.Series(strata)
        for idx in strata_series.groupby(strata_series).indices.values():
            idx_arr = np.asarray(idx)
            e_h = e[idx_arr]
            psu_h = None if cluster is None else cluster.iloc[idx_arr].to_numpy()
            # Residuals do not sum to zero within a stratum, so the PSU
            # totals must be centred on the stratum mean in both cases.
            contrib = _scaled_psu_ss(e_h, psu_h)

            if fpc is not None and idx_arr.size > 0:
                fpc_h = float(fpc.iloc[idx_arr].iloc[0])
                # A missing population size must not wipe out the stratum's
                # variance; skip the correction instead.
                if not np.isnan(fpc_h):
                    # FPC = 1 - n/N
                    f_h = min(1.0, idx_arr.size / max(fpc_h, 1.0))
                    contrib *= 1.0 - f_h

            var_num += contrib

    variance = var_num / (total_w ** 2)
    return mean, variance, total_w
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
# ----------------------------------------------------------------------
|
|
195
|
+
# Replicate-weight variance
|
|
196
|
+
# ----------------------------------------------------------------------
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
def replicate_mean_var(
    values: pd.Series,
    base_weights: pd.Series,
    replicate_weights: list[pd.Series] | tuple[pd.Series, ...],
    *,
    replicate_type: str = "jk1",
) -> tuple[float, float, float]:
    """Estimate a weighted mean and its replicate-weight variance.

    The point estimate ``θ̂`` is the mean under ``base_weights``. Each
    replicate weight vector yields an estimate ``θ̂_r`` and the variance is

        Var(θ̂) = c · Σ_r (θ̂_r − θ̂)²

    with ``c = 1/R`` when ``replicate_type`` is ``"bootstrap"`` and
    ``c = (R − 1)/R`` for every other value, where ``R`` is the number of
    replicate vectors supplied.

    Rows with a missing value, a missing base weight or a non-positive
    base weight are excluded everywhere. Within a replicate, weights that
    are not strictly positive count as zero; a replicate whose weights
    sum to zero adds nothing to the sum of squared deviations.

    Returns ``(mean, variance, total_base_weight)``; ``(nan, nan, 0.0)``
    when no usable row remains and a variance of ``0.0`` when no
    replicate is given.
    """
    y = pd.to_numeric(values, errors="coerce").astype(float)
    base = pd.to_numeric(base_weights, errors="coerce").astype(float)
    keep = y.notna() & base.notna() & (base > 0)
    y = y[keep].reset_index(drop=True)
    base = base[keep].reset_index(drop=True)

    total_w = float(base.sum())
    if total_w <= 0 or y.empty:
        return float("nan"), float("nan"), 0.0
    theta_hat = float((y * base).sum() / total_w)

    n_replicates = len(replicate_weights)
    if n_replicates == 0:
        return theta_hat, 0.0, total_w

    sq_dev = 0.0
    for replicate in replicate_weights:
        rep = pd.to_numeric(replicate, errors="coerce").astype(float)
        rep = rep[keep].reset_index(drop=True)
        # Non-positive (and missing) replicate weights carry no mass.
        rep = rep.where(rep > 0, 0.0)
        rep_total = float(rep.sum())
        if rep_total <= 0:
            continue
        theta_r = float((y * rep).sum() / rep_total)
        sq_dev += (theta_r - theta_hat) ** 2

    if replicate_type == "bootstrap":
        scale = 1.0 / n_replicates
    else:
        scale = (n_replicates - 1.0) / n_replicates
    return theta_hat, scale * sq_dev, total_w
|
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
"""Effect-size helpers — Cohen's d, Hedges' g, Cramér's V, eta-squared.
|
|
2
|
+
|
|
3
|
+
Companion functions to the inferential tests in
|
|
4
|
+
:mod:`pysofra.summary.tests`. Effect sizes describe the *magnitude* of
|
|
5
|
+
a difference / association independently of sample size, and are
|
|
6
|
+
frequently requested alongside the p-values in clinical reports.
|
|
7
|
+
|
|
8
|
+
All functions accept aligned :class:`pandas.Series` for ``values`` /
|
|
9
|
+
``groups``; missing values are dropped pairwise. They return floats
|
|
10
|
+
(or ``None`` for degenerate input).
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import math
|
|
16
|
+
|
|
17
|
+
import numpy as np
|
|
18
|
+
import pandas as pd
|
|
19
|
+
|
|
20
|
+
# ----------------------------------------------------------------------
|
|
21
|
+
# Continuous
|
|
22
|
+
# ----------------------------------------------------------------------
|
|
23
|
+
|
|
24
|
+
def cohen_d(a: pd.Series | np.ndarray, b: pd.Series | np.ndarray) -> float | None:
    """Cohen's d using the pooled standard deviation.

    ``d = (μ₁ − μ₂) / s_pool``, where the pooled SD weights the two
    sample variances by their degrees of freedom. Inputs are coerced to
    numeric and missing / non-numeric entries are dropped independently
    for each sample.

    Returns
    -------
    float or None
        ``None`` when either sample has fewer than two usable values.
        When the pooled SD is zero the result is ``0.0`` for equal means
        and an infinity carrying the sign of ``μ₁ − μ₂`` otherwise.
    """
    a_arr = pd.to_numeric(pd.Series(a), errors="coerce").dropna().to_numpy(dtype=float)
    b_arr = pd.to_numeric(pd.Series(b), errors="coerce").dropna().to_numpy(dtype=float)
    n_a, n_b = a_arr.size, b_arr.size
    if n_a < 2 or n_b < 2:
        return None
    v_a = float(np.var(a_arr, ddof=1))
    v_b = float(np.var(b_arr, ddof=1))
    s_pool = math.sqrt(((n_a - 1) * v_a + (n_b - 1) * v_b) / (n_a + n_b - 2))
    diff = float(a_arr.mean()) - float(b_arr.mean())
    if s_pool == 0:
        if diff == 0:
            return 0.0
        # Keep the direction of the difference instead of always reporting +inf.
        return math.copysign(math.inf, diff)
    return diff / s_pool
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def hedges_g(a: pd.Series | np.ndarray, b: pd.Series | np.ndarray) -> float | None:
    """Hedges' g: Cohen's d multiplied by a small-sample correction.

    ``g = d · J`` with ``J = 1 − 3 / (4(n_a + n_b) − 9)``, where the
    sample sizes count the numeric, non-missing entries of each input.
    A ``None`` or infinite ``d`` is returned unchanged.
    """
    d = cohen_d(a, b)
    if d is None or math.isinf(d):
        return d

    # Count usable observations the same way cohen_d selects them.
    total_n = sum(
        int(pd.to_numeric(pd.Series(sample), errors="coerce").count())
        for sample in (a, b)
    )
    denom = 4 * total_n - 9
    if denom <= 0:  # pragma: no cover — unreachable given cohen_d's n>=2 guard
        return d
    correction = 1.0 - 3.0 / denom
    return d * correction
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def eta_squared(values: pd.Series, groups: pd.Series) -> float | None:
    """Eta-squared for a one-way layout: ``SS_between / SS_total``.

    ``values`` is coerced to numeric and rows with a missing value or a
    missing group label are dropped. The result lies in ``[0, 1]``;
    conventional benchmarks are ≈ 0.01 (small), ≈ 0.06 (medium) and
    ≈ 0.14 (large) (Cohen 1988).

    Returns ``None`` when no complete row remains and ``0.0`` when the
    values show no variation at all.
    """
    frame = pd.DataFrame(
        {"v": pd.to_numeric(values, errors="coerce"), "g": groups}
    ).dropna()
    if frame.empty:
        return None

    observed = frame["v"]
    grand = float(observed.mean())

    def _group_term(cell: pd.Series) -> float:
        # Group size times the squared distance of its mean from the grand mean.
        return cell.size * (cell.mean() - grand) ** 2

    per_group = frame.groupby("g")["v"].apply(_group_term)
    ss_between = float(per_group.sum())
    ss_total = float(((observed - grand) ** 2).sum())
    if ss_total <= 0:
        return 0.0
    return ss_between / ss_total
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def omega_squared(values: pd.Series, groups: pd.Series) -> float | None:
    """Omega-squared, the less-biased counterpart to ``eta_squared`` (Hays 1973).

    Computed as ``(SS_between − (k − 1)·MS_within) / (SS_total + MS_within)``
    with ``k`` groups and ``n`` complete rows, truncated at zero.

    Returns ``None`` when no complete row remains, when there are fewer
    than two groups or when ``n ≤ k``; returns ``0.0`` when the values
    show no variation.
    """
    frame = pd.DataFrame(
        {"v": pd.to_numeric(values, errors="coerce"), "g": groups}
    ).dropna()
    if frame.empty:
        return None

    n_groups = int(frame["g"].nunique())
    n_rows = int(frame.shape[0])
    if n_rows - n_groups <= 0 or n_groups <= 1:
        return None

    observed = frame["v"]
    grand = float(observed.mean())

    def _group_term(cell: pd.Series) -> float:
        # Group size times the squared distance of its mean from the grand mean.
        return cell.size * (cell.mean() - grand) ** 2

    ss_between = float(frame.groupby("g")["v"].apply(_group_term).sum())
    ss_total = float(((observed - grand) ** 2).sum())
    if ss_total <= 0:
        return 0.0

    ms_within = (ss_total - ss_between) / (n_rows - n_groups)
    numerator = ss_between - (n_groups - 1) * ms_within
    omega = numerator / (ss_total + ms_within)
    # Negative estimates are reported as zero.
    return float(max(0.0, omega))
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
# ----------------------------------------------------------------------
|
|
103
|
+
# Categorical
|
|
104
|
+
# ----------------------------------------------------------------------
|
|
105
|
+
|
|
106
|
+
def cramers_v(values: pd.Series, groups: pd.Series) -> float | None:
    """Cramér's V, the chi-square statistic rescaled to ``[0, 1]``.

    ``V = √(χ² / (N · (min(R, C) − 1)))`` for the ``R × C`` contingency
    table of ``values`` against ``groups``; χ² is computed without
    continuity correction. Rows with a missing entry in either input are
    dropped.

    Returns ``None`` when no complete row remains or when the table has
    fewer than two rows or two columns.
    """
    import warnings as _w

    from scipy import stats as sp_stats

    paired = pd.DataFrame({"v": values, "g": groups}).dropna()
    if paired.empty:
        return None

    table = pd.crosstab(paired["v"], paired["g"])
    n_rows, n_cols = table.shape
    if n_rows < 2 or n_cols < 2:
        return None

    # Silence floating-point runtime warnings raised while computing χ².
    with np.errstate(invalid="ignore", over="ignore", divide="ignore"), \
            _w.catch_warnings():
        _w.simplefilter("ignore", RuntimeWarning)
        result = sp_stats.chi2_contingency(table.to_numpy(), correction=False)
    chi2 = result[0]

    total = float(table.values.sum())
    if total <= 0:  # pragma: no cover — guarded by the shape >= 2x2 check above
        return None
    smaller_dim = min(table.shape) - 1
    if smaller_dim <= 0:  # pragma: no cover — shape >= 2x2 guarantees min_dim >= 1
        return None
    return float(math.sqrt(chi2 / (total * smaller_dim)))
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def phi_coefficient(values: pd.Series, groups: pd.Series) -> float | None:
    """Phi coefficient for a 2×2 table: ``φ = √(χ²/N)``.

    χ² is computed without continuity correction on the contingency
    table of ``values`` against ``groups`` after dropping rows with a
    missing entry. Returns ``None`` when no complete row remains or when
    the table is not exactly 2×2.
    """
    import warnings as _w

    from scipy import stats as sp_stats

    paired = pd.DataFrame({"v": values, "g": groups}).dropna()
    if paired.empty:
        return None

    table = pd.crosstab(paired["v"], paired["g"])
    if table.shape != (2, 2):
        return None

    # Silence floating-point runtime warnings raised while computing χ².
    with np.errstate(invalid="ignore", over="ignore", divide="ignore"), \
            _w.catch_warnings():
        _w.simplefilter("ignore", RuntimeWarning)
        result = sp_stats.chi2_contingency(table.to_numpy(), correction=False)
    chi2 = result[0]

    total = float(table.values.sum())
    if total <= 0:  # pragma: no cover — guarded by the (2,2) shape check above
        return None
    return float(math.sqrt(chi2 / total))
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
# ----------------------------------------------------------------------
|
|
155
|
+
# Auto dispatch (mirrors auto-test selection)
|
|
156
|
+
# ----------------------------------------------------------------------
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def auto_effect_size(values: pd.Series, groups: pd.Series) -> tuple[str, float | None]:
    """Choose an effect size from the variable kind and the number of groups.

    ``values`` counts as continuous when ``pd.to_numeric`` accepts it
    without error. The selection is:

    * continuous, two groups → Cohen's d between the two groups;
    * continuous, three or more groups → η²;
    * non-numeric, two or more groups → φ for a 2×2 table, otherwise
      Cramér's V;
    * anything else → ``("—", None)``.

    Returns ``(name, value)`` so callers can both display the metric and
    label it in a footnote.
    """
    levels = pd.Series(groups).dropna().unique()
    n_levels = len(levels)

    # Continuous-looking?
    try:
        pd.to_numeric(values, errors="raise")
    except (ValueError, TypeError):
        is_numeric = False
    else:
        is_numeric = True

    if is_numeric:
        if n_levels == 2:
            first, second = levels[0], levels[1]
            return "Cohen's d", cohen_d(
                values[groups == first],
                values[groups == second],
            )
        if n_levels >= 3:
            return "η²", eta_squared(values, groups)
        return "—", None

    if n_levels >= 2:
        table = pd.crosstab(values, groups)
        if table.shape == (2, 2):
            return "φ", phi_coefficient(values, groups)
        return "Cramér's V", cramers_v(values, groups)
    return "—", None
|