hea 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
hea/__init__.py ADDED
@@ -0,0 +1,29 @@
1
+ from .compare import AIC, BIC, anova
2
+ from .family import (
3
+ Binomial,
4
+ Family,
5
+ Gamma,
6
+ Gaussian,
7
+ InverseGaussian,
8
+ Poisson,
9
+ binomial,
10
+ gaussian,
11
+ inverse_gaussian,
12
+ poisson,
13
+ )
14
+ from .gam import gam
15
+ from .glm import glm
16
+ from .lm import lm
17
+ from .lme import lme
18
+ from .stats import (
19
+ aov,
20
+ chisq_test,
21
+ cor_test,
22
+ kruskal_test,
23
+ rank,
24
+ signed_rank,
25
+ t_test,
26
+ wilcox_test,
27
+ )
28
+ from .data import data, factor
29
+ from . import plot
hea/compare.py ADDED
@@ -0,0 +1,440 @@
1
+ """Model-comparison helpers: ``anova``, ``AIC``, ``BIC``.
2
+
3
+ Lives above ``lm`` and ``lme`` in the import graph so both can be compared
4
+ here without creating a cycle. ``lm.py`` and ``lme.py`` stay unaware of
5
+ each other.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import inspect
11
+
12
+ import polars as pl
13
+ from scipy.stats import chi2, f
14
+
15
+ from .glm import glm
16
+ from .lm import lm
17
+ from .lme import lme
18
+ from .utils import format_df, significance_code
19
+
20
+ __all__ = ["anova", "AIC", "BIC"]
21
+
22
+
23
+ def _caller_names(models, frame, fallback: str = "model") -> list[str]:
24
+ """Recover caller-bound variable names for ``models``, like R's
25
+ ``match.call``. Walks ``frame``'s locals + globals; falls back to
26
+ ``f"{fallback} {i}"`` when a model has no unique binding (e.g.
27
+ passed as an expression or aliased to multiple names).
28
+ """
29
+ if frame is None:
30
+ return [f"{fallback} {i}" for i in range(len(models))]
31
+ scope = {**frame.f_globals, **frame.f_locals}
32
+ by_id: dict[int, list[str]] = {}
33
+ for name, val in scope.items():
34
+ if name.startswith("_"):
35
+ continue
36
+ by_id.setdefault(id(val), []).append(name)
37
+ out = []
38
+ for i, m in enumerate(models):
39
+ names = by_id.get(id(m), [])
40
+ out.append(names[0] if len(names) == 1 else f"{fallback} {i}")
41
+ return out
42
+
43
+
44
def AIC(*models) -> None:
    """Print an AIC comparison table for one or more fitted models.

    Each model must expose ``.AIC`` and ``.npar``. Row labels are
    recovered from the caller's variable names (R-style); falls back
    to ``model i`` for unbound or aliased arguments.

    The table has one row per model, in argument order, with ``AIC``
    rounded to two decimals. Nothing is returned; the table is printed.
    """
    # ``inspect.currentframe()`` may return ``None`` on interpreters without
    # Python stack-frame support. ``getattr`` turns that into a ``None``
    # caller frame, which ``_caller_names`` answers with generic labels,
    # instead of raising ``AttributeError`` on ``.f_back``.
    caller = getattr(inspect.currentframe(), "f_back", None)
    names = _caller_names(models, caller)
    rows = pl.DataFrame({
        "": names,
        "df": [m.npar for m in models],
        "AIC": [round(m.AIC, 2) for m in models],
    })
    print(format_df(rows))
58
+
59
+
60
def BIC(*models) -> None:
    """Print a BIC comparison table for one or more fitted models.

    Each model must expose ``.BIC`` and ``.npar``. Row labels are
    recovered from the caller's variable names (R-style); falls back
    to ``model i`` for unbound or aliased arguments.

    The table has one row per model, in argument order, with ``BIC``
    rounded to two decimals. Nothing is returned; the table is printed.
    """
    # ``inspect.currentframe()`` may return ``None`` on interpreters without
    # Python stack-frame support. ``getattr`` turns that into a ``None``
    # caller frame, which ``_caller_names`` answers with generic labels,
    # instead of raising ``AttributeError`` on ``.f_back``.
    caller = getattr(inspect.currentframe(), "f_back", None)
    names = _caller_names(models, caller)
    rows = pl.DataFrame({
        "": names,
        "df": [m.npar for m in models],
        "BIC": [round(m.BIC, 2) for m in models],
    })
    print(format_df(rows))
74
+
75
+
76
def anova(*models, test: str | None = None):
    """Compare nested fits, or decompose a single fit by Type-I SS.

    - One ``lm`` → sequential (Type I) ANOVA table, splitting the model's
      total SS into incremental contributions per RHS term in formula
      order. Mirrors R's ``anova(m)`` for a single ``lm``.
    - Multiple ``lm`` fits → F-test ANOVA table (incremental for 3+).
    - Multiple ``glm`` fits → analysis-of-deviance table (incremental for
      3+); ``test=`` selects the test statistic (see below).
    - Multiple ``lme`` fits → likelihood-ratio test (lme4-style, incremental
      for 3+). REML fits are internally refit by ML before the LRT.

    Parameters
    ----------
    *models
        One ``lm``, or two or more fits that are all ``lme``, all ``glm``
        or all ``lm``.
    test : {"Chisq", "LRT", "F", "Rao", None}, optional
        Only meaningful for ``glm`` comparisons. ``None`` (default) auto-
        picks ``"Chisq"`` for scale-known families (Poisson, Binomial) and
        ``"F"`` for unknown-scale (Gaussian, Gamma, IG), matching R's
        ``anova.glm`` recommendation. ``"LRT"`` is an alias for ``"Chisq"``.
        ``"Rao"`` (score test) is not implemented yet. For ``lm`` the test
        is always F and any ``test=`` raises. For ``lme`` the test is always
        the Chisq LRT: ``"Chisq"``/``"LRT"`` (any case) are accepted as
        no-ops and anything else raises.

    Returns
    -------
    Whatever the type-specific helper returns. The table is always printed.

    Raises
    ------
    TypeError
        No models given; a single model that is not a plain ``lm``;
        ``test=`` passed for ``lm`` fits; or models of mixed types.
    ValueError
        ``test=`` other than ``'Chisq'``/``'LRT'`` passed for ``lme`` fits.

    For multi-model calls rows are sorted by parameter count (smaller
    model first), matching R's ``anova``. Row labels are recovered from
    the caller's variable names (R-style); falls back to ``model i`` for
    unbound or aliased arguments, preserving *input* order.
    """
    if not models:
        raise TypeError("anova(): need at least one model")
    if len(models) == 1:
        m = models[0]
        # Reject anything that is not an lm, and also a glm should it ever
        # become an lm subclass.
        if not isinstance(m, lm) or isinstance(m, glm):
            raise TypeError(
                "anova(m): single-model form supports lm only "
                f"(got {type(m).__name__})"
            )
        if test is not None:
            raise TypeError("anova(lm): test= is not accepted (always F)")
        return _anova_lm_single(m)

    # ``inspect.currentframe()`` may return ``None`` on interpreters without
    # Python stack-frame support. ``getattr`` turns that into a ``None``
    # caller frame so ``_caller_names`` falls back to generic labels instead
    # of this line raising ``AttributeError``.
    caller = getattr(inspect.currentframe(), "f_back", None)
    labels = _caller_names(models, caller)

    if all(isinstance(m, lme) for m in models):
        if test is not None and test.upper() not in ("CHISQ", "LRT"):
            raise ValueError(
                f"anova(lme): only test='Chisq'/'LRT' (the default LRT) "
                f"is supported, got {test!r}"
            )
        return _anova_lme(*models, labels=labels)
    # glm before lm: glm is not an lm subclass, but the isinstance order
    # would still matter if it ever became one. Keep the explicit branch.
    if all(isinstance(m, glm) for m in models):
        return _anova_glm(*models, labels=labels, test=test)
    if all(isinstance(m, lm) for m in models):
        if test is not None:
            raise TypeError("anova(lm): test= is not accepted (always F)")
        return _anova_lm(*models, labels=labels)
    raise TypeError("anova(): all models must be the same type (lm, glm, or lme)")
133
+
134
+
135
def _anova_lm(*models, labels: list[str]):
    """Print an F-test ANOVA table for nested ``lm`` fits.

    Fits are ranked from most to fewest residual degrees of freedom, and
    every row after the first is compared with the row before it. Each F
    statistic is divided by the residual mean square of the last-ranked
    fit, following R. A step that does not lower the residual df gets a
    null F and p-value.

    ``labels`` holds one display name per model in *input* order. The table
    is printed; nothing is returned.
    """
    # Most residual df first, i.e. smallest model first (matching R).
    ranking = sorted(
        range(len(models)),
        key=lambda idx: models[idx].df_residuals,
        reverse=True,
    )
    resid_df = [models[idx].df_residuals for idx in ranking]
    resid_ss = [models[idx].rss for idx in ranking]

    # The last-ranked (least-constrained) fit supplies the F denominator.
    full_df = resid_df[-1]
    mse_full = resid_ss[-1] / full_df

    # The first row has no predecessor, so its comparison cells are blank.
    delta_df: list[int | None] = [None]
    delta_ss: list[float | None] = [None]
    f_stats: list[float | None] = [None]
    p_values: list[float | None] = [None]
    stars: list[str] = [""]
    for step in range(1, len(ranking)):
        gained_df = resid_df[step - 1] - resid_df[step]
        dropped_ss = resid_ss[step - 1] - resid_ss[step]
        delta_df.append(gained_df)
        delta_ss.append(round(dropped_ss, 3))
        if gained_df > 0:
            f_value = (dropped_ss / gained_df) / mse_full
            p_value = float(f.sf(f_value, gained_df, full_df))
            f_stats.append(round(f_value, 3))
            p_values.append(float(f"{p_value:.4g}"))
            stars.append(significance_code([p_value])[0])
        else:
            # No test is possible without a positive df difference.
            f_stats.append(None)
            p_values.append(None)
            stars.append("")

    # The header lists the formulas in the order the caller passed them.
    header = "Analysis of Variance Table\n\n" + "".join(
        f"{labels[pos]}: {fit.formula}\n" for pos, fit in enumerate(models)
    )

    table = pl.DataFrame({
        "": [labels[idx] for idx in ranking],
        "Res.Df": resid_df,
        "RSS": [round(value, 3) for value in resid_ss],
        "Df": delta_df,
        "Sum of Sq": delta_ss,
        "F": f_stats,
        "Pr(>F)": p_values,
        " ": stars,
    })

    print(header)
    print(format_df(table))
    print("---")
    print("Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1")
185
+
186
+
187
def _anova_lm_single(m: lm):
    """Print the sequential (Type I) ANOVA table for one ``lm`` fit.

    Mirrors R's ``anova.lm(m)``. The model is refit with its RHS terms
    added one at a time in formula order, using the same data, weights and
    method as ``m``. Each term is credited with the drop in RSS it causes.
    Every F uses the full model's residual mean square as denominator, and
    the p-value is the upper tail of the F distribution.

    Raises ``TypeError`` when the model has no RHS terms. The table is
    printed; nothing is returned.
    """
    terms = m._expanded.terms
    if not terms:
        raise TypeError(
            "anova(m): single-model form needs at least one RHS term "
            "(got an intercept-only model)"
        )

    lhs = m.formula.split("~", 1)[0].strip()
    intercept_str = "1" if m._expanded.intercept else "0"

    # (rss, residual df) after fitting the intercept plus the first k terms.
    stages: list[tuple[float, int]] = []
    for k in range(len(terms)):
        rhs = " + ".join([intercept_str] + [t.label for t in terms[:k]])
        partial = lm(f"{lhs} ~ {rhs}", m.data,
                     weights=m.weights, method=m.method)
        stages.append((partial.rss, partial.df_residuals))
    # The final stage is the full model itself: take its numbers directly
    # rather than refitting, which also avoids floating-point drift.
    stages.append((m.rss, m.df_residuals))

    mse_full = m.rss / m.df_residuals

    df_col: list[int] = []
    sos_col: list[float] = []
    ms_col: list[float] = []
    f_col: list[float | None] = []
    p_col: list[float | None] = []
    sig_col: list[str] = []
    for (rss_before, df_before), (rss_after, df_after) in zip(stages, stages[1:]):
        d_df = df_before - df_after
        d_rss = rss_before - rss_after
        df_col.append(d_df)
        sos_col.append(round(d_rss, 4))
        if d_df > 0:
            mean_sq = d_rss / d_df
            f_value = mean_sq / mse_full
            p_value = float(f.sf(f_value, d_df, m.df_residuals))
            ms_col.append(round(mean_sq, 4))
            f_col.append(round(f_value, 4))
            p_col.append(float(f"{p_value:.4g}"))
            sig_col.append(significance_code([p_value])[0])
        else:
            # The term used no degrees of freedom: no mean square, no test.
            ms_col.append(float("nan"))
            f_col.append(None)
            p_col.append(None)
            sig_col.append("")

    # Closing "Residuals" row.
    df_col.append(m.df_residuals)
    sos_col.append(round(m.rss, 4))
    ms_col.append(round(mse_full, 4))
    f_col.append(None)
    p_col.append(None)
    sig_col.append("")

    header = "Analysis of Variance Table\n\n" + f"Response: {lhs}\n"

    table = pl.DataFrame({
        "": [t.label for t in terms] + ["Residuals"],
        "Df": df_col,
        "Sum Sq": sos_col,
        "Mean Sq": ms_col,
        "F value": f_col,
        "Pr(>F)": p_col,
        " ": sig_col,
    })

    print(header)
    print(format_df(table))
    print("---")
    print("Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1")
270
+
271
+
272
def _anova_glm(*models, labels: list[str], test: str | None = None):
    """``anova.glm``-style deviance table for nested ``glm`` fits.

    With ``test=None`` we auto-pick (matches R's recommendation):
      - scale-known families (Poisson, Binomial) → ``Chisq`` LRT on Δdev.
      - unknown-scale families (Gaussian, Gamma, IG) → ``F``.

    Override via ``test=`` (case-insensitive):
      - ``"Chisq"`` / ``"LRT"`` (alias) → ``Δdev / dispersion_full ~ χ²(Δdf)``.
        For scale-known families ``dispersion_full = 1`` so this is just Δdev,
        matching the auto-pick. For unknown-scale, the division is the
        asymptotic chi-square test (R's ``anova.glm`` does the same).
      - ``"F"`` → ``F = (Δdev / Δdf) / dispersion_full`` against ``F(Δdf,
        df_residual_full)``. Allowed for scale-known families too (R does)
        though the chi-square version is preferred.
      - ``"Rao"`` → score test, not implemented yet.

    Three-or-more models are walked incrementally (row k vs row k-1 after
    sorting by ``df_residuals`` descending, matching ``_anova_lm``).

    The ``Deviance`` column always holds the raw change in deviance. The F
    test adds a separate ``F`` column; the chi-square test adds no extra
    statistic column and only reports ``Pr(>Chi)``.

    Parameters
    ----------
    *models : fitted ``glm`` objects sharing family and link.
    labels : one display name per model, in input order.
    test : see above.

    Returns
    -------
    The table as a polars ``DataFrame`` (it is also printed).

    Raises
    ------
    ValueError
        The fits differ in family or link, or ``test`` is not recognised.
    NotImplementedError
        ``test="Rao"``.
    """
    fam0 = models[0].family
    if not all(type(m.family) is type(fam0) and
               m.family.link.name == fam0.link.name for m in models):
        raise ValueError("anova(): all glm fits must share family and link")

    if test is None:
        test = "Chisq" if fam0.scale_known else "F"
    else:
        t_norm = test.upper()
        if t_norm == "RAO":
            raise NotImplementedError(
                "anova(glm, test='Rao'): score test not implemented yet"
            )
        canonical = {"LRT": "Chisq", "CHISQ": "Chisq", "F": "F"}
        if t_norm not in canonical:
            raise ValueError(
                f"anova(glm): test must be 'Chisq', 'LRT', 'F', 'Rao', or None; "
                f"got {test!r}"
            )
        test = canonical[t_norm]

    # Sort ascending by npar (= descending by df_residuals), matching R.
    # NOTE(review): the sort key reads ``df_residuals`` while the values
    # below read ``df_residual``; both accesses are kept exactly as they
    # were. Confirm that glm exposes both names with the same meaning.
    order = sorted(range(len(models)), key=lambda i: models[i].df_residuals,
                   reverse=True)
    dfs = [models[i].df_residual for i in order]
    devs = [models[i].deviance for i in order]
    # The last-ranked (least-constrained) fit supplies dispersion and the
    # denominator df for every comparison.
    full = models[order[-1]]
    disp_full = float(full.dispersion)
    df_full = int(full.df_residual)

    # The first row has no predecessor, so its comparison cells are blank.
    df_col: list[int | None] = [None]
    dev_col: list[float | None] = [None]
    stat_col: list[float | None] = [None]
    p_col: list[float | None] = [None]
    sig_col: list[str] = [""]
    for k in range(1, len(order)):
        d_df = dfs[k - 1] - dfs[k]
        d_dev = devs[k - 1] - devs[k]
        df_col.append(d_df)
        dev_col.append(round(d_dev, 4))
        if d_df <= 0:
            # No test is possible without a positive df difference.
            stat_col.append(None)
            p_col.append(None)
            sig_col.append("")
            continue
        if test == "Chisq":
            # disp_full == 1 for scale-known families (Poisson/Binomial),
            # so this matches the canonical LRT there. For unknown-scale
            # it's the asymptotic χ² test on the rescaled deviance — same
            # formula R uses when `test="Chisq"` is passed for Gaussian/
            # Gamma/IG fits.
            stat = d_dev / disp_full
            p = float(chi2.sf(stat, d_df))
        else:
            stat = (d_dev / d_df) / disp_full
            p = float(f.sf(stat, d_df, df_full))
        stat_col.append(round(stat, 4))
        p_col.append(float(f"{p:.4g}"))
        sig_col.append(significance_code([p])[0])

    # The header lists the formulas in the order the caller passed them.
    docstring = "Analysis of Deviance Table\n\n" + "".join(
        f"{labels[i]}: {m.formula}\n" for i, m in enumerate(models)
    )

    columns = {
        "": [labels[i] for i in order],
        "Resid. Df": dfs,
        "Resid. Dev": [round(d, 4) for d in devs],
        "Df": df_col,
        "Deviance": dev_col,
    }
    if test == "F":
        columns["F"] = stat_col
        columns["Pr(>F)"] = p_col
    else:
        # Do NOT add the chi-square statistic under a "Deviance" key: that
        # would collide with the raw Δdev column above and overwrite it with
        # Δdev / dispersion (and with nulls on non-positive-Δdf rows).
        columns["Pr(>Chi)"] = p_col
    columns[" "] = sig_col
    df_ = pl.DataFrame(columns)

    print(docstring)
    print(format_df(df_))
    print("---")
    print("Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1")
    return df_
378
+
379
+
380
def _anova_lme(*models, labels: list[str]):
    """Likelihood-ratio test for nested ``lme`` fits (lme4-style).

    Any model fitted by REML is refit by ML from its formula and data, and
    a one-line notice is printed when that happens. Models are then ranked
    by ascending ``npar`` and each row after the first is compared with the
    row before it: ``Chisq`` is the drop in deviance and ``Df`` the increase
    in ``npar``. A step whose ``Df`` is not positive still reports ``Chisq``
    and ``Df`` but gets a null p-value and no significance code, the same
    convention ``_anova_lm`` and ``_anova_glm`` use.

    ``labels`` holds one display name per model in *input* order. The table
    is printed; nothing is returned.
    """
    # LRT requires ML; refit any REML inputs and tell the user we did so.
    refit = any(m.REML for m in models)
    models = tuple(
        (lme(m.formula, m.data, REML=False) if m.REML else m) for m in models
    )
    if refit:
        print("refitting model(s) with ML (instead of REML)")
    # Sort ascending by npar, preserving original indices for row labels.
    order = sorted(range(len(models)), key=lambda i: models[i].npar)

    npar_col: list[int] = []
    aic_col: list[float] = []
    bic_col: list[float] = []
    ll_col: list[float] = []
    dev_col: list[float] = []
    chi_col: list[float | None] = []
    dfc_col: list[int | None] = []
    p_col: list[float | None] = []
    sig_col: list[str] = []
    for k, idx in enumerate(order):
        m = models[idx]
        npar_col.append(m.npar)
        aic_col.append(round(m.AIC, 4))
        bic_col.append(round(m.BIC, 4))
        ll_col.append(round(m.loglike, 4))
        dev_col.append(round(m.deviance, 4))
        if k == 0:
            # The smallest model has nothing to be compared against.
            chi_col.append(None)
            dfc_col.append(None)
            p_col.append(None)
            sig_col.append("")
            continue
        prev = models[order[k - 1]]
        chisq = prev.deviance - m.deviance
        d_df = m.npar - prev.npar
        chi_col.append(round(chisq, 4))
        dfc_col.append(d_df)
        if d_df <= 0:
            # No chi-square reference distribution exists for a non-positive
            # df. Record "no test" rather than pushing NaN through
            # significance_code and into the p-value column.
            p_col.append(None)
            sig_col.append("")
            continue
        p = float(chi2.sf(chisq, d_df))
        p_col.append(float(f"{p:.4g}"))
        sig_col.append(significance_code([p])[0])

    # The header lists the formulas in the order the caller passed them.
    docstring = "Analysis of Variance Table (likelihood ratio test)\n\n" + "".join(
        f"{labels[i]}: {m.formula}\n" for i, m in enumerate(models)
    )

    df_ = pl.DataFrame({
        "": [labels[i] for i in order],
        "npar": npar_col,
        "AIC": aic_col,
        "BIC": bic_col,
        "logLik": ll_col,
        "deviance": dev_col,
        "Chisq": chi_col,
        "Df": dfc_col,
        "Pr(>Chisq)": p_col,
        " ": sig_col,
    })

    print(docstring)
    print(format_df(df_))
    print("---")
    print("Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1")