pysofra 0.1.0a6__tar.gz → 0.1.0a7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/CHANGELOG.md +36 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/PKG-INFO +3 -3
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/README.md +2 -2
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/pyproject.toml +1 -1
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/src/pysofra/__init__.py +1 -1
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/src/pysofra/core/table.py +17 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/src/pysofra/models/pool.py +42 -8
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/src/pysofra/models/survival.py +23 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/src/pysofra/summary/effect_size.py +23 -2
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/src/pysofra/summary/extras.py +7 -1
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/src/pysofra/summary/tbl_one.py +13 -5
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/src/pysofra/summary/tests.py +54 -7
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/tests/test_regressions.py +46 -3
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/.gitignore +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/LICENSE +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/NOTICE +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/src/pysofra/core/__init__.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/src/pysofra/core/compose.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/src/pysofra/core/format.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/src/pysofra/core/frames.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/src/pysofra/core/schema.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/src/pysofra/models/__init__.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/src/pysofra/models/extract.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/src/pysofra/models/regression.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/src/pysofra/models/uvregression.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/src/pysofra/plot/__init__.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/src/pysofra/plot/_backend.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/src/pysofra/plot/forest.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/src/pysofra/plot/inline.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/src/pysofra/plot/km.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/src/pysofra/render/__init__.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/src/pysofra/render/_zip_determinism.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/src/pysofra/render/base.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/src/pysofra/render/docx.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/src/pysofra/render/html.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/src/pysofra/render/image.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/src/pysofra/render/latex.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/src/pysofra/render/markdown.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/src/pysofra/render/pptx.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/src/pysofra/render/xlsx.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/src/pysofra/summary/__init__.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/src/pysofra/summary/calibrate.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/src/pysofra/summary/design.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/src/pysofra/summary/smd.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/src/pysofra/summary/stats.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/src/pysofra/summary/tbl_cross.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/src/pysofra/summary/tbl_summary.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/src/pysofra/summary/typing.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/src/pysofra/summary/weights.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/src/pysofra/themes/__init__.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/src/pysofra/themes/registry.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/tests/conftest.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/tests/fixtures/scipy_validation/README.md +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/tests/fixtures/scipy_validation/anova_oneway.json +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/tests/fixtures/scipy_validation/chi_square.json +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/tests/fixtures/scipy_validation/fisher_2x2.json +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/tests/fixtures/scipy_validation/kruskal_wallis.json +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/tests/fixtures/scipy_validation/student_t.json +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/tests/fixtures/scipy_validation/svyttest.json +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/tests/fixtures/scipy_validation/weighted_mean.json +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/tests/fixtures/scipy_validation/welch_t_test.json +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/tests/fixtures/scipy_validation/wilcoxon_rank_sum.json +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/tests/test_api_stability.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/tests/test_compose.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/tests/test_compose_edges.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/tests/test_conditional_formatting.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/tests/test_design_regression.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/tests/test_extract_edges.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/tests/test_extras_edges.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/tests/test_extras_edges_2.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/tests/test_format.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/tests/test_latex_pptx.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/tests/test_misc_fixes.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/tests/test_modifier_edges.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/tests/test_multi_model.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/tests/test_partial_modifiers.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/tests/test_partials.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/tests/test_plot_determinism.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/tests/test_plot_embedding.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/tests/test_plots.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/tests/test_polars.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/tests/test_pptx_overflow.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/tests/test_property_invariants.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/tests/test_rao_scott.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/tests/test_regression.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/tests/test_render_edges.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/tests/test_render_edges_2.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/tests/test_renderer_consistency.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/tests/test_rendering.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/tests/test_scipy_validation.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/tests/test_snapshot.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/tests/test_statistical_correctness.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/tests/test_stats.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/tests/test_summary_edges.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/tests/test_summary_edges_2.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/tests/test_survey_design.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/tests/test_survey_extensions.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/tests/test_survival.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/tests/test_table_edges.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/tests/test_tbl_one.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/tests/test_test_overrides.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/tests/test_uvregression_factors.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/tests/test_validation_fixes.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/tests/test_weights.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/tests/test_wishlist.py +0 -0
- {pysofra-0.1.0a6 → pysofra-0.1.0a7}/tests/test_xlsx.py +0 -0
|
@@ -5,6 +5,42 @@ All notable changes to PySofra will be documented in this file.
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
+
## [0.1.0a7] — 2026-05-26
|
|
9
|
+
|
|
10
|
+
### Fixed
|
|
11
|
+
- **`tbl_survival` validates `time` and `event` content**: negative
|
|
12
|
+
survival times raise `ValueError`; non-`0/1` event codes raise
|
|
13
|
+
`ValueError`. Previously these were passed silently to lifelines,
|
|
14
|
+
which would either clamp negative times to zero or treat any
|
|
15
|
+
nonzero event value as a death — producing a misleading curve
|
|
16
|
+
without complaint.
|
|
17
|
+
- **`add_global_p()` on weighted `tbl_one`** now uses
|
|
18
|
+
``statsmodels.GLM(..., var_weights=w)`` instead of
|
|
19
|
+
``freq_weights=w``. For non-integer sampling weights ``freq_weights``
|
|
20
|
+
scales ``df_resid`` by ``Σw`` (treating the weight as an integer
|
|
21
|
+
count of repeats), which inflates the effective sample size and
|
|
22
|
+
produces anti-conservative p-values. ``var_weights`` keeps
|
|
23
|
+
``df_resid = n − k`` — the appropriate SRS-weighted Wald-F
|
|
24
|
+
convention. For full design-based inference (with strata or
|
|
25
|
+
clusters) use ``ps.SurveyDesign`` end-to-end.
|
|
26
|
+
|
|
27
|
+
### Changed
|
|
28
|
+
- **`rao_scott_chisq` docstring** now honestly states a 10–15%
|
|
29
|
+
typical disagreement with R ``survey::svychisq`` on non-trivial
|
|
30
|
+
weighted designs (was: an overoptimistic "~5%"). The first-order
|
|
31
|
+
Kish-DEFF approximation is unchanged; for design-grade chi-square
|
|
32
|
+
inference call R directly.
|
|
33
|
+
- **Added published-reference citations** to public statistical
|
|
34
|
+
functions: Welch / Satterthwaite, Wilcoxon (Mann-Whitney 1947),
|
|
35
|
+
Kruskal-Wallis (1952), Fisher (1922), Pearson chi-square (1900),
|
|
36
|
+
Wilson score (1927), Rao-Scott (1981/1984), Kish (1965),
|
|
37
|
+
Benjamini-Hochberg (1995), Benjamini-Yekutieli (2001), Holm
|
|
38
|
+
(1979), Hommel (1988), Šidák (1967), Binder (1983) Taylor
|
|
39
|
+
linearisation.
|
|
40
|
+
- **`pool` and `cohen_d` docstrings** now have NumPy-style
|
|
41
|
+
``Parameters`` / ``Returns`` / ``References`` sections matching
|
|
42
|
+
the other public functions.
|
|
43
|
+
|
|
8
44
|
## [0.1.0a6] — 2026-05-26
|
|
9
45
|
|
|
10
46
|
### Fixed
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: pysofra
|
|
3
|
-
Version: 0.1.0a6
|
|
3
|
+
Version: 0.1.0a7
|
|
4
4
|
Summary: Statistical reporting and table preparation framework for Python — the missing reporting layer.
|
|
5
5
|
Project-URL: Homepage, https://github.com/jturner-uofl/pysofra
|
|
6
6
|
Project-URL: Documentation, https://github.com/jturner-uofl/pysofra
|
|
@@ -75,7 +75,7 @@ Description-Content-Type: text/markdown
|
|
|
75
75
|
[](https://github.com/jturner-uofl/pysofra/blob/main/LICENSE)
|
|
76
76
|
[](https://github.com/astral-sh/ruff)
|
|
77
77
|
[](http://mypy-lang.org/)
|
|
78
|
-
[](#status)
|
|
79
79
|
|
|
80
80
|
</div>
|
|
81
81
|
|
|
@@ -255,7 +255,7 @@ pip install "pysofra[dev]" # testing + linting (pytest, ruff, mypy, hypot
|
|
|
255
255
|
|
|
256
256
|
## Status
|
|
257
257
|
|
|
258
|
-
PySofra is in **alpha** (`0.1.0a6`). The public API surface is pinned
|
|
258
|
+
PySofra is in **alpha** (`0.1.0a7`). The public API surface is pinned
|
|
259
259
|
by an explicit
|
|
260
260
|
[API-stability test](https://github.com/jturner-uofl/pysofra/blob/main/tests/test_api_stability.py)
|
|
261
261
|
so that any unintended rename, removal, or signature change surfaces as
|
|
@@ -9,7 +9,7 @@
|
|
|
9
9
|
[](https://github.com/jturner-uofl/pysofra/blob/main/LICENSE)
|
|
10
10
|
[](https://github.com/astral-sh/ruff)
|
|
11
11
|
[](http://mypy-lang.org/)
|
|
12
|
-
[](#status)
|
|
13
13
|
|
|
14
14
|
</div>
|
|
15
15
|
|
|
@@ -189,7 +189,7 @@ pip install "pysofra[dev]" # testing + linting (pytest, ruff, mypy, hypot
|
|
|
189
189
|
|
|
190
190
|
## Status
|
|
191
191
|
|
|
192
|
-
PySofra is in **alpha** (`0.1.0a6`). The public API surface is pinned
|
|
192
|
+
PySofra is in **alpha** (`0.1.0a7`). The public API surface is pinned
|
|
193
193
|
by an explicit
|
|
194
194
|
[API-stability test](https://github.com/jturner-uofl/pysofra/blob/main/tests/test_api_stability.py)
|
|
195
195
|
so that any unintended rename, removal, or signature change surfaces as
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "pysofra"
|
|
7
|
-
version = "0.1.0a6"
|
|
7
|
+
version = "0.1.0a7"
|
|
8
8
|
description = "Statistical reporting and table preparation framework for Python — the missing reporting layer."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = { text = "GPL-3.0-or-later" }
|
|
@@ -262,6 +262,23 @@ class SofraTable:
|
|
|
262
262
|
``fdr_bh`` (Benjamini–Hochberg, default), ``fdr_by``,
|
|
263
263
|
``bonferroni``, ``holm``, ``hommel``, ``sidak``. Implicitly
|
|
264
264
|
enables p-values when not already on.
|
|
265
|
+
|
|
266
|
+
References
|
|
267
|
+
----------
|
|
268
|
+
Benjamini, Y., & Hochberg, Y. (1995). Controlling the false
|
|
269
|
+
discovery rate: a practical and powerful approach to multiple
|
|
270
|
+
testing. *J. R. Stat. Soc. B*, 57(1), 289–300. (``fdr_bh``)
|
|
271
|
+
Benjamini, Y., & Yekutieli, D. (2001). The control of the
|
|
272
|
+
false discovery rate in multiple testing under dependency.
|
|
273
|
+
*Ann. Stat.*, 29(4), 1165–1188. (``fdr_by``)
|
|
274
|
+
Holm, S. (1979). A simple sequentially rejective multiple test
|
|
275
|
+
procedure. *Scand. J. Stat.*, 6(2), 65–70. (``holm``)
|
|
276
|
+
Hommel, G. (1988). A stagewise rejective multiple test
|
|
277
|
+
procedure based on a modified Bonferroni test. *Biometrika*,
|
|
278
|
+
75(2), 383–386. (``hommel``)
|
|
279
|
+
Šidák, Z. (1967). Rectangular confidence regions for the
|
|
280
|
+
means of multivariate normal distributions. *J. Am. Stat.
|
|
281
|
+
Assoc.*, 62(318), 626–633. (``sidak``)
|
|
265
282
|
"""
|
|
266
283
|
return self._with_option(p_value=True, q_value=True, q_method=method)
|
|
267
284
|
|
|
@@ -45,14 +45,48 @@ from .extract import ModelSummary, extract
|
|
|
45
45
|
def pool(models: list[Any], *, conf_level: float = 0.95) -> ModelSummary:
|
|
46
46
|
"""Pool a list of fitted models via Rubin's rules.
|
|
47
47
|
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
48
|
+
Parameters
|
|
49
|
+
----------
|
|
50
|
+
models
|
|
51
|
+
A list of two or more fitted models, each fit on a separate
|
|
52
|
+
imputed dataset. Every model must be one of the families
|
|
53
|
+
recognised by :func:`pysofra.models.extract.extract` —
|
|
54
|
+
statsmodels (Logit, OLS, GLM, Poisson), lifelines
|
|
55
|
+
(CoxPHFitter, AFT family), or scikit-learn linear models.
|
|
56
|
+
All models in the list must share the same coefficient names.
|
|
57
|
+
conf_level
|
|
58
|
+
Confidence level for the pooled CIs, in the open interval
|
|
59
|
+
``(0, 1)``. Default 0.95.
|
|
60
|
+
|
|
61
|
+
Returns
|
|
62
|
+
-------
|
|
63
|
+
ModelSummary
|
|
64
|
+
A summary whose ``estimates``, ``ci_lo``, ``ci_hi`` and
|
|
65
|
+
``pvalues`` reflect Rubin's-rule pooling across the
|
|
66
|
+
imputed-dataset fits. Pass this directly into
|
|
67
|
+
:func:`pysofra.tbl_regression` to render a pooled regression
|
|
68
|
+
table.
|
|
69
|
+
|
|
70
|
+
Notes
|
|
71
|
+
-----
|
|
72
|
+
The pooled point estimate is the across-imputation mean of the
|
|
73
|
+
per-imputation estimates. The total variance ``T = Ū + (1 + 1/m)·B``
|
|
74
|
+
combines the average within-imputation variance ``Ū`` and the
|
|
75
|
+
between-imputation variance ``B`` (with the small-sample
|
|
76
|
+
correction ``1 + 1/m``). Confidence intervals use a *t*
|
|
77
|
+
distribution with Rubin's original degrees-of-freedom
|
|
78
|
+
``df = (m − 1)·(1 + Ū / ((1 + 1/m)·B))²``. The newer
|
|
79
|
+
Barnard–Rubin (1999) df refinement is not yet implemented; for
|
|
80
|
+
very small per-imputation df it slightly narrows the CI relative
|
|
81
|
+
to ``mice::pool``.
|
|
82
|
+
|
|
83
|
+
References
|
|
84
|
+
----------
|
|
85
|
+
Rubin, D. B. (1987). *Multiple Imputation for Nonresponse in
|
|
86
|
+
Surveys.* Wiley.
|
|
87
|
+
Barnard, J., & Rubin, D. B. (1999). Small-sample degrees of
|
|
88
|
+
freedom with multiple imputation. *Biometrika*, 86(4),
|
|
89
|
+
948–955.
|
|
56
90
|
"""
|
|
57
91
|
if not (0.0 < conf_level < 1.0):
|
|
58
92
|
raise ValueError(
|
|
@@ -86,6 +86,29 @@ def tbl_survival(
|
|
|
86
86
|
for col in (time, event):
|
|
87
87
|
if col not in data.columns:
|
|
88
88
|
raise KeyError(f"column {col!r} not in data")
|
|
89
|
+
|
|
90
|
+
# Validate time + event content. ``lifelines`` will silently treat
|
|
91
|
+
# negative survival times as zero and any nonzero event value as a
|
|
92
|
+
# death, so input mistakes (e.g. a "censor at last follow-up" column
|
|
93
|
+
# encoded as 0/1/9, or a follow-up time accidentally negated) can
|
|
94
|
+
# produce a misleading survival curve without complaint. Fail loud
|
|
95
|
+
# at the boundary instead.
|
|
96
|
+
time_num = pd.to_numeric(data[time], errors="coerce")
|
|
97
|
+
if (time_num < 0).any():
|
|
98
|
+
n_bad = int((time_num < 0).sum())
|
|
99
|
+
raise ValueError(
|
|
100
|
+
f"column {time!r} contains {n_bad} negative value(s); "
|
|
101
|
+
"survival times must be non-negative."
|
|
102
|
+
)
|
|
103
|
+
event_num = pd.to_numeric(data[event], errors="coerce").dropna()
|
|
104
|
+
bad_events = ~event_num.isin([0, 1])
|
|
105
|
+
if bool(bad_events.any()):
|
|
106
|
+
bad_vals = sorted(event_num[bad_events].unique().tolist())
|
|
107
|
+
raise ValueError(
|
|
108
|
+
f"column {event!r} must contain only 0/1 (or boolean) "
|
|
109
|
+
f"values; got unexpected values: {bad_vals!r}."
|
|
110
|
+
)
|
|
111
|
+
|
|
89
112
|
if by is not None and by not in data.columns:
|
|
90
113
|
raise KeyError(f"by column {by!r} not in data")
|
|
91
114
|
|
|
@@ -24,8 +24,29 @@ import pandas as pd
|
|
|
24
24
|
def cohen_d(a: pd.Series | np.ndarray, b: pd.Series | np.ndarray) -> float | None:
|
|
25
25
|
"""Cohen's d using the pooled standard deviation.
|
|
26
26
|
|
|
27
|
-
|
|
28
|
-
|
|
27
|
+
Parameters
|
|
28
|
+
----------
|
|
29
|
+
a, b
|
|
30
|
+
Two independent samples (``pandas.Series`` or 1-D ``numpy``
|
|
31
|
+
array). Non-numeric entries are coerced; ``NaN`` rows are
|
|
32
|
+
dropped per array. Each sample must contain at least two
|
|
33
|
+
finite values.
|
|
34
|
+
|
|
35
|
+
Returns
|
|
36
|
+
-------
|
|
37
|
+
float or None
|
|
38
|
+
``d = (μ_a − μ_b) / s_pool``, where the pooled SD weights the
|
|
39
|
+
two samples by their degrees of freedom:
|
|
40
|
+
``s_pool = sqrt(((n_a − 1)·s_a² + (n_b − 1)·s_b²) / (n_a + n_b − 2))``.
|
|
41
|
+
Returns ``None`` if either sample has fewer than 2 finite
|
|
42
|
+
observations. Returns ``0.0`` if the pooled SD is zero and
|
|
43
|
+
the two means are identical; ``inf`` if the pooled SD is zero
|
|
44
|
+
but the means differ (degenerate constant-sample case).
|
|
45
|
+
|
|
46
|
+
References
|
|
47
|
+
----------
|
|
48
|
+
Cohen, J. (1988). *Statistical Power Analysis for the Behavioral
|
|
49
|
+
Sciences* (2nd ed.). Lawrence Erlbaum.
|
|
29
50
|
"""
|
|
30
51
|
a_arr = pd.to_numeric(pd.Series(a), errors="coerce").dropna().to_numpy(dtype=float)
|
|
31
52
|
b_arr = pd.to_numeric(pd.Series(b), errors="coerce").dropna().to_numpy(dtype=float)
|
|
@@ -731,7 +731,13 @@ def add_ci(
|
|
|
731
731
|
|
|
732
732
|
|
|
733
733
|
def _wilson_ci(x: int, n: int, *, z: float) -> tuple[float, float]:
|
|
734
|
-
"""Wilson score CI for a proportion.
|
|
734
|
+
"""Wilson score CI for a proportion.
|
|
735
|
+
|
|
736
|
+
References
|
|
737
|
+
----------
|
|
738
|
+
Wilson, E. B. (1927). Probable inference, the law of succession,
|
|
739
|
+
and statistical inference. *J. Am. Stat. Assoc.*, 22(158), 209–212.
|
|
740
|
+
"""
|
|
735
741
|
if n == 0:
|
|
736
742
|
return float("nan"), float("nan")
|
|
737
743
|
p = x / n
|
|
@@ -1207,14 +1207,22 @@ def _fit_global_p(
|
|
|
1207
1207
|
try:
|
|
1208
1208
|
with _w.catch_warnings():
|
|
1209
1209
|
_w.simplefilter("ignore") # statsmodels convergence chatter
|
|
1210
|
-
# Honour weights by routing through GLM(Binomial)
|
|
1211
|
-
#
|
|
1212
|
-
#
|
|
1213
|
-
#
|
|
1210
|
+
# Honour weights by routing through GLM(Binomial). We use
|
|
1211
|
+
# ``var_weights`` rather than ``freq_weights``: ``freq_weights``
|
|
1212
|
+
# treats the weight as an integer *count of repeats* and so
|
|
1213
|
+
# scales ``df_resid`` by ``Σw`` — which dramatically inflates
|
|
1214
|
+
# the effective sample size for non-integer sampling weights
|
|
1215
|
+
# (a survey weight calibrated to a 200k population would push
|
|
1216
|
+
# df_resid to 200k instead of n). ``var_weights`` keeps
|
|
1217
|
+
# ``df_resid = n − k``, which is the appropriate convention
|
|
1218
|
+
# for sampling / IPW weights where the weight does not
|
|
1219
|
+
# represent a count. For full design-based inference (with
|
|
1220
|
+
# strata or clusters) use ``ps.SurveyDesign`` end-to-end;
|
|
1221
|
+
# the joint p test here is an SRS-weighted Wald-F.
|
|
1214
1222
|
if weights_col is not None:
|
|
1215
1223
|
w_arr = sub[weights_col].to_numpy(dtype=float)
|
|
1216
1224
|
fam = sm.families.Binomial()
|
|
1217
|
-
res = sm.GLM(y, X, family=fam, freq_weights=w_arr).fit(disp=False)
|
|
1225
|
+
res = sm.GLM(y, X, family=fam, var_weights=w_arr).fit(disp=False)
|
|
1218
1226
|
else:
|
|
1219
1227
|
res = sm.Logit(y, X).fit(disp=False, method="newton",
|
|
1220
1228
|
maxiter=100)
|
|
@@ -17,6 +17,29 @@ Two layers:
|
|
|
17
17
|
|
|
18
18
|
Returns a small :class:`TestResult` so callers can render both the p-value
|
|
19
19
|
and the test name for the footnote.
|
|
20
|
+
|
|
21
|
+
References
|
|
22
|
+
----------
|
|
23
|
+
Welch, B. L. (1947). The generalization of "Student's" problem when
|
|
24
|
+
several different population variances are involved. *Biometrika*,
|
|
25
|
+
34(1/2), 28–35. (Welch's t with Satterthwaite df.)
|
|
26
|
+
Satterthwaite, F. E. (1946). An approximate distribution of estimates
|
|
27
|
+
of variance components. *Biometrics Bulletin*, 2(6), 110–114.
|
|
28
|
+
Mann, H. B., & Whitney, D. R. (1947). On a test of whether one of
|
|
29
|
+
two random variables is stochastically larger than the other.
|
|
30
|
+
*Ann. Math. Statist.*, 18(1), 50–60. (Wilcoxon rank-sum.)
|
|
31
|
+
Kruskal, W. H., & Wallis, W. A. (1952). Use of ranks in one-criterion
|
|
32
|
+
variance analysis. *J. Am. Stat. Assoc.*, 47(260), 583–621.
|
|
33
|
+
Fisher, R. A. (1922). On the interpretation of χ² from contingency
|
|
34
|
+
tables, and the calculation of P. *J. Royal Statist. Soc.*,
|
|
35
|
+
85(1), 87–94.
|
|
36
|
+
Pearson, K. (1900). On the criterion that a given system of
|
|
37
|
+
deviations from the probable… *Phil. Mag.*, 50(302), 157–175.
|
|
38
|
+
(Pearson chi-square.)
|
|
39
|
+
Rao, J. N. K., & Scott, A. J. (1981). The analysis of categorical
|
|
40
|
+
data from complex sample surveys. *J. Am. Stat. Assoc.*, 76,
|
|
41
|
+
221–230. (Rao–Scott chi-square; see also Rao & Scott 1984.)
|
|
42
|
+
Kish, L. (1965). *Survey Sampling.* Wiley. (Kish design effect.)
|
|
20
43
|
"""
|
|
21
44
|
|
|
22
45
|
from __future__ import annotations
|
|
@@ -167,6 +190,15 @@ def svyttest(
|
|
|
167
190
|
full design and gave inflated t-statistics whenever clusters
|
|
168
191
|
straddled groups; the current formulation matches R to first
|
|
169
192
|
order.
|
|
193
|
+
|
|
194
|
+
References
|
|
195
|
+
----------
|
|
196
|
+
Lumley, T. (2010). *Complex Surveys: A Guide to Analysis Using R.*
|
|
197
|
+
Wiley. Chapter on two-sample tests for survey data.
|
|
198
|
+
Binder, D. A. (1983). On the variances of asymptotically normal
|
|
199
|
+
estimators from complex surveys. *Int. Statist. Rev.*, 51(3),
|
|
200
|
+
279–292. (Taylor linearisation of regression coefficients
|
|
201
|
+
under complex sampling.)
|
|
170
202
|
"""
|
|
171
203
|
df_ = pd.DataFrame({
|
|
172
204
|
"v": pd.to_numeric(values, errors="coerce"),
|
|
@@ -319,13 +351,28 @@ def rao_scott_chisq(
|
|
|
319
351
|
Notes
|
|
320
352
|
-----
|
|
321
353
|
This is a *first-order* Rao–Scott correction using the Kish design
|
|
322
|
-
effect (a single scalar derived from the weights).
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
354
|
+
effect (a single scalar derived from the weights). The fully-correct
|
|
355
|
+
Rao–Scott statistic uses the *generalised* design effect derived from
|
|
356
|
+
the eigenvalues of the full design covariance matrix
|
|
357
|
+
(Rao & Scott, 1981, 1984); R ``survey::svychisq`` implements that
|
|
358
|
+
version. On non-trivial weighted designs (stratified, clustered, or
|
|
359
|
+
even simple weighted with non-uniform weights), the Kish
|
|
360
|
+
approximation here typically disagrees with R ``svychisq`` by
|
|
361
|
+
**10–15%** in the statistic and a similar amount in the p-value.
|
|
362
|
+
The approximation is adequate for descriptive Table 1 contexts
|
|
363
|
+
where the χ² is a guide rather than a publication-grade test
|
|
364
|
+
statistic; for design-grade chi-square inference, call
|
|
365
|
+
``survey::svychisq`` in R directly.
|
|
366
|
+
|
|
367
|
+
References
|
|
368
|
+
----------
|
|
369
|
+
Rao, J. N. K., & Scott, A. J. (1981). The analysis of categorical
|
|
370
|
+
data from complex sample surveys. *J. Am. Stat. Assoc.*,
|
|
371
|
+
76(374), 221–230.
|
|
372
|
+
Rao, J. N. K., & Scott, A. J. (1984). On chi-squared tests for
|
|
373
|
+
multiway contingency tables with cell proportions estimated
|
|
374
|
+
from survey data. *Ann. Stat.*, 12(1), 46–60.
|
|
375
|
+
Kish, L. (1965). *Survey Sampling.* Wiley.
|
|
329
376
|
"""
|
|
330
377
|
df = pd.DataFrame({
|
|
331
378
|
"v": values,
|
|
@@ -1857,7 +1857,13 @@ class TestWeightedModifiers:
|
|
|
1857
1857
|
assert abs(diff_unw[1] - diff_wt[1]) > 1e-6 or \
|
|
1858
1858
|
abs(diff_unw[2] - diff_wt[2]) > 1e-6
|
|
1859
1859
|
|
|
1860
|
-
def
|
|
1860
|
+
def test_add_global_p_weighted_matches_glm_var_weights(self):
|
|
1861
|
+
# Reference uses ``var_weights=`` rather than ``freq_weights=``:
|
|
1862
|
+
# for non-integer sampling weights, ``freq_weights`` artificially
|
|
1863
|
+
# inflates df_resid by ``Σw`` (treating the weight as an integer
|
|
1864
|
+
# count of repeats), making the F-test anti-conservative. The
|
|
1865
|
+
# ``var_weights`` convention keeps df_resid = n - k, which is
|
|
1866
|
+
# the appropriate SRS-weighted Wald-F for sampling/IPW weights.
|
|
1861
1867
|
sm = pytest.importorskip("statsmodels.api")
|
|
1862
1868
|
df = self._df()
|
|
1863
1869
|
t = (
|
|
@@ -1865,12 +1871,12 @@ class TestWeightedModifiers:
|
|
|
1865
1871
|
weights="w", missing="never", types={"smoker": "dichotomous"})
|
|
1866
1872
|
.add_global_p()
|
|
1867
1873
|
)
|
|
1868
|
-
# Manual reference: fit GLM(Binomial) with freq_weights and
|
|
1874
|
+
# Manual reference: fit GLM(Binomial) with var_weights and
|
|
1869
1875
|
# f_test on the single age coefficient.
|
|
1870
1876
|
y = (df["arm"] == "B").astype(int).to_numpy()
|
|
1871
1877
|
X = sm.add_constant(df[["age"]])
|
|
1872
1878
|
ref = sm.GLM(y, X, family=sm.families.Binomial(),
|
|
1873
|
-
|
|
1879
|
+
var_weights=df["w"].to_numpy(dtype=float)).fit(disp=False)
|
|
1874
1880
|
expected_p = float(ref.f_test("age = 0").pvalue)
|
|
1875
1881
|
# Get the table's global p for "age"
|
|
1876
1882
|
row = next(r for r in t.rows if r.cells[0].text == "age")
|
|
@@ -1883,3 +1889,40 @@ class TestWeightedModifiers:
|
|
|
1883
1889
|
del gp_cell
|
|
1884
1890
|
assert last_p is not None
|
|
1885
1891
|
assert abs(float(last_p) - expected_p) < 1e-6, (last_p, expected_p)
|
|
1892
|
+
|
|
1893
|
+
|
|
1894
|
+
# ----------------------------------------------------------------------
|
|
1895
|
+
# tbl_survival validates time + event content. Previously negative
|
|
1896
|
+
# follow-up times and non-0/1 event codes were silently passed
|
|
1897
|
+
# through to lifelines (which treats nonzero as a death), producing
|
|
1898
|
+
# misleading survival curves without complaint.
|
|
1899
|
+
# ----------------------------------------------------------------------
|
|
1900
|
+
class TestSurvivalInputValidation:
|
|
1901
|
+
def test_negative_time_raises(self):
|
|
1902
|
+
pytest.importorskip("lifelines")
|
|
1903
|
+
df = pd.DataFrame({
|
|
1904
|
+
"t": [1.0, -2.0, 3.0, 4.0],
|
|
1905
|
+
"e": [0, 1, 1, 0],
|
|
1906
|
+
})
|
|
1907
|
+
with pytest.raises(ValueError, match=r"negative value"):
|
|
1908
|
+
ps.tbl_survival(df, time="t", event="e")
|
|
1909
|
+
|
|
1910
|
+
def test_non_binary_event_raises(self):
|
|
1911
|
+
pytest.importorskip("lifelines")
|
|
1912
|
+
df = pd.DataFrame({
|
|
1913
|
+
"t": [1.0, 2.0, 3.0, 4.0],
|
|
1914
|
+
"e": [0, 1, 9, 1], # 9 is not 0/1
|
|
1915
|
+
})
|
|
1916
|
+
with pytest.raises(ValueError, match=r"must contain only 0/1"):
|
|
1917
|
+
ps.tbl_survival(df, time="t", event="e")
|
|
1918
|
+
|
|
1919
|
+
def test_valid_inputs_pass(self):
|
|
1920
|
+
pytest.importorskip("lifelines")
|
|
1921
|
+
rng = np.random.default_rng(0)
|
|
1922
|
+
df = pd.DataFrame({
|
|
1923
|
+
"t": rng.exponential(10, 50),
|
|
1924
|
+
"e": rng.integers(0, 2, 50),
|
|
1925
|
+
})
|
|
1926
|
+
# Should not raise.
|
|
1927
|
+
t = ps.tbl_survival(df, time="t", event="e")
|
|
1928
|
+
assert len(t.rows) >= 1
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|