mergeron 2024.739097.4__py3-none-any.whl → 2024.739099.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mergeron might be problematic. Click here for more details.

@@ -1,518 +0,0 @@
"""
Functions to estimate confidence intervals for
(a.) a proportion or multiple proportions, and (b.) contrast between
two independent proportions or two series of independent proportions.

"""

from __future__ import annotations

from collections.abc import Sequence
from dataclasses import dataclass
from typing import Literal, TypeVar

import numpy as np
from numpy.typing import NBitBase, NDArray
from scipy.optimize import OptimizeResult, root  # type: ignore
from scipy.stats import beta, chi2, norm  # type: ignore

from .. import VERSION  # noqa: TID252

# Module version mirrors the top-level package constant.
__version__ = VERSION

# Type variable bounding the integer precision of array-valued count arguments.
TI = TypeVar("TI", bound=NBitBase)
25
-
26
- def propn_ci(
27
- _npos: NDArray[np.integer[TI]] | int = 4,
28
- _nobs: NDArray[np.integer[TI]] | int = 10,
29
- /,
30
- *,
31
- alpha: float = 0.05,
32
- method: Literal[
33
- "Agresti-Coull", "Clopper-Pearson", "Exact", "Wilson", "Score"
34
- ] = "Wilson",
35
- ) -> tuple[
36
- NDArray[np.float64] | float,
37
- NDArray[np.float64] | float,
38
- NDArray[np.float64] | float,
39
- NDArray[np.float64] | float,
40
- ]:
41
- """Returns point estimates and confidence interval for a proportion
42
-
43
- Methods "Clopper-Pearson" and "Exact" are synoymous [3]_. Similarly,
44
- "Wilson" and "Score" are synonyms here.
45
-
46
- Parameters
47
- ----------
48
- _npos
49
- Number of positives
50
-
51
- _nobs
52
- Number of observed values
53
-
54
- alpha
55
- Significance level
56
-
57
- method
58
- Method to use for estimating confidence interval
59
-
60
- Returns
61
- -------
62
- Raw and estimated proportions, and bounds of the confidence interval
63
-
64
-
65
- References
66
- ----------
67
-
68
- .. [3] Alan Agresti & Brent A. Coull (1998) Approximate is Better
69
- than “Exact” for Interval Estimation of Binomial Proportions,
70
- The American Statistician, 52:2, 119-126,
71
- https://doi.org/10.1080/00031305.1998.10480550
72
-
73
- """
74
-
75
- for _f in _npos, _nobs:
76
- if not isinstance(_f, int | np.integer):
77
- raise ValueError(
78
- f"Count, {_f!r} must have type that is a subtype of np.integer."
79
- )
80
-
81
- if not _nobs:
82
- return (np.nan, np.nan, np.nan, np.nan)
83
-
84
- _raw_phat: NDArray[np.float64] | float = _npos / _nobs
85
- _est_phat: NDArray[np.float64] | float
86
- _est_ci_l: NDArray[np.float64] | float
87
- _est_ci_u: NDArray[np.float64] | float
88
-
89
- match method:
90
- case "Clopper-Pearson" | "Exact":
91
- _est_ci_l, _est_ci_u = (
92
- beta.ppf(*_f)
93
- for _f in (
94
- (alpha / 2, _npos, _nobs - _npos + 1),
95
- (1 - alpha / 2, _npos + 1, _nobs - _npos),
96
- )
97
- )
98
- _est_phat = 1 / 2 * (_est_ci_l + _est_ci_u)
99
-
100
- case "Agresti-Coull":
101
- _zsc = norm.ppf(1 - alpha / 2)
102
- _zscsq = _zsc * _zsc
103
- _adjmt = 4 if alpha == 0.05 else _zscsq
104
- _est_phat = (_npos + _adjmt / 2) / (_nobs + _adjmt)
105
- _est_ci_l, _est_ci_u = (
106
- _est_phat + _g
107
- for _g in [
108
- _f * _zsc * np.sqrt(_est_phat * (1 - _est_phat) / (_nobs + _adjmt))
109
- for _f in (-1, 1)
110
- ]
111
- )
112
-
113
- case "Wilson" | "Score":
114
- _zsc = norm.ppf(1 - alpha / 2)
115
- _zscsq = _zsc * _zsc
116
- _est_phat = (_npos + _zscsq / 2) / (_nobs + _zscsq)
117
- _est_ci_l, _est_ci_u = (
118
- _est_phat
119
- + _f
120
- * _zsc
121
- * np.sqrt(_nobs * _raw_phat * (1 - _raw_phat) + _zscsq / 4)
122
- / (_nobs + _zscsq)
123
- for _f in (-1, 1)
124
- )
125
-
126
- case _:
127
- raise ValueError(f"Method, {f'"{method}"'} not yet implemented.")
128
-
129
- return _raw_phat, _est_phat, _est_ci_l, _est_ci_u
130
-
131
-
132
def propn_ci_multinomial(
    _counts: NDArray[np.integer[TI]],
    /,
    *,
    alpha: float = 0.05,
    method: Literal["goodman", "quesenberry-hurst"] = "goodman",
    alternative: Literal["default", "simplified"] = "default",
) -> NDArray[np.float64]:
    """Confidence intervals for multiple proportions.

    Parameters
    ----------
    _counts
        One-dimensional np.array of multinomial category counts.
        (The ``np.einsum("j->", ...)`` total below requires a 1-D input;
        the previous description of an "n x 2" array was incorrect.)
    alpha
        Significance level
    method
        Method used to compute confidence intervals
    alternative
        Method used to estimate standard errors, whether "default"
        or "simplified"

    Returns
    -------
    Array of confidence intervals, one (lower, upper) row per category

    Raises
    ------
    ValueError
        If `method` or `alternative` is not one of the supported values.

    """
    if method not in (_mli := ("goodman", "quesenberry-hurst")):
        raise ValueError(
            f'Invalid value "{method}" for "method". Must be one of {_mli}.'
        )

    _n = np.einsum("j->", _counts).astype(np.int64)
    _prob = _counts / _n
    # Quesenberry-Hurst: one chi-square critical value with k-1 dof;
    # Goodman: Bonferroni-adjusted chi-square(1) critical value.
    _chi2_cr = (
        chi2(len(_counts) - 1).ppf(1 - alpha)
        if method == "quesenberry-hurst"
        else chi2(1).ppf(1 - (alpha / len(_counts)))
    )

    if alternative == "default":
        # Quadratic (score-style) interval bounds
        _ci_len_half = np.sqrt(_chi2_cr * (_chi2_cr + 4 * _n * _prob * (1 - _prob)))
        return np.column_stack([
            (_chi2_cr + 2 * _counts + _f * _ci_len_half) / (2 * (_n + _chi2_cr))
            for _f in (-1, 1)
        ])

    elif alternative == "simplified":
        # Wald-style interval centered on the raw proportions
        _ci_len_half = np.sqrt(_chi2_cr * _prob * (1 - _prob) / _n)
        return np.column_stack([_prob + _f * _ci_len_half for _f in (-1, 1)])

    else:
        raise ValueError(
            f'Invalid value "{alternative}" for "alternative". '
            'Must be one of ("default", "simplified").'
        )
188
-
189
-
190
- def propn_diff_ci(
191
- _npos1: int = 4,
192
- _nobs1: int = 10,
193
- _npos2: int = 4,
194
- _nobs2: int = 10,
195
- /,
196
- *,
197
- alpha: float = 0.05,
198
- method: Literal["Agresti-Caffo", "Mee", "M-N", "Newcombe", "Score"] = "M-N",
199
- ) -> tuple[float, float, float, float]:
200
- R"""Confidence intervals for differences in binomial proportions.
201
-
202
- Methods available are Agresti-Caffo [4]_, Mee [5]_, Meitinen-Nurminen [5]_ [6]_
203
- and Newcombe (aka, Score method) [5]_. See also, source code for the
204
- R-language function BinomDiffCI, in the module StatsAndCIs [7]_.
205
-
206
- Parameters
207
- ----------
208
- _npos1, _npos2
209
- Counts of positive outcomes in the respective binomial distributions
210
- _nobs1, _nobs2
211
- Counts of all outcomes in the respective binomial distributions
212
- alpha
213
- Significance level
214
- method
215
- Method used to compute confidence intervals
216
-
217
- Returns
218
- -------
219
- Raw and expected values of estimated difference, with bounds of c.i.
220
-
221
- References
222
- ----------
223
-
224
- .. [4] Agresti, A., & Caffo, T. (2000). Simple and Effective
225
- Confidence Intervals for Proportions and Differences of Proportions
226
- Result from Adding Two Successes and Two Failures.
227
- The American Statistician, 54(4), 280--288. https://doi.org/10.2307/2685779
228
-
229
- .. [5] Newcombe, R.G. (1998). Two-sided confidence intervals for
230
- the single proportion: comparison of seven methods. Statist. Med., 17: 857-872.
231
- https://doi.org/10.1002/(SICI)1097-0258(19980430)17:8%3C857::AID-SIM777%3E3.0.CO;2-E
232
-
233
- .. [6] Miettinen, O. and Nurminen, M. (1985). Comparative analysis of two rates.
234
- Statist. Med., 4: 213-226. https://doi.org/10.1002/sim.4780040211; Appendix I
235
-
236
- .. [7] StatsAndCIs.r, function BinomDiffCI, method, "mn"
237
- https://github.com/cran/DescTools/blob/master/R/StatsAndCIs.r
238
- (R source code is distributed under the CC-BY license.)
239
-
240
- """
241
- for _f in _npos1, _nobs1, _npos1, _nobs2:
242
- if not isinstance(_f, int | np.integer):
243
- raise ValueError(
244
- f"Count, {_f!r} must be of int type or be a subtype of np.integer."
245
- )
246
-
247
- if not min(_nobs1, _nobs2):
248
- return (np.nan, np.nan, np.nan, np.nan)
249
-
250
- match method:
251
- case "Agresti-Caffo":
252
- _res = _propn_diff_ci_agresti_caffo(
253
- _npos1, _nobs1, _npos2, _nobs2, alpha=alpha
254
- )
255
-
256
- case "Newcombe" | "Score":
257
- _res = _propn_diff_ci_newcombe_score(
258
- _npos1, _nobs1, _npos2, _nobs2, alpha=alpha
259
- )
260
-
261
- case "M-N" | "Mee":
262
- _res = _propn_diff_ci_mn(
263
- _npos1, _nobs1, _npos2, _nobs2, alpha=alpha, method=method
264
- )
265
-
266
- case _:
267
- raise ValueError(f"Method, {f'"{method}"'} not implemented.")
268
-
269
- return _res
270
-
271
-
272
- def _propn_diff_ci_agresti_caffo(
273
- _npos1: int = 4,
274
- _nobs1: int = 10,
275
- _npos2: int = 4,
276
- _nobs2: int = 10,
277
- /,
278
- *,
279
- alpha: float = 0.05,
280
- ) -> tuple[float, float, float, float]:
281
- """
282
- Estimate Agresti-Caffo confidence intervals for differences of
283
- multiple proportions.
284
- """
285
-
286
- _diff_hat = _npos1 / _nobs1 - _npos2 / _nobs2
287
-
288
- _zsc = norm.ppf(1 - alpha / 2)
289
- _zscsq = _zsc * _zsc
290
-
291
- _adjmt_t = 2 if alpha == 0.05 else _zscsq / 2
292
- _npos1_ac, _npos2_ac = (_f + _adjmt_t / 2 for _f in (_npos1, _npos2))
293
- _nobs1_ac, _nobs2_ac = (_f + _adjmt_t for _f in (_nobs1, _nobs2))
294
-
295
- _p1_est = _npos1_ac / _nobs1_ac
296
- _p2_est = _npos2_ac / _nobs2_ac
297
- _diff_est = _p1_est - _p2_est
298
- _se_est = np.sqrt(
299
- _p1_est * (1 - _p1_est) / _nobs1_ac + _p2_est * (1 - _p2_est) / _nobs2_ac
300
- )
301
-
302
- _diff_cl_l, _diff_cl_u = (_diff_est + _s * _zsc * _se_est for _s in (-1, 1))
303
-
304
- return _diff_hat, _diff_est, max(-1.0, _diff_cl_l), min(1.0, _diff_cl_u)
305
-
306
-
307
def _propn_diff_ci_newcombe_score(
    _npos1: int = 4,
    _nobs1: int = 10,
    _npos2: int = 4,
    _nobs2: int = 10,
    /,
    *,
    alpha: float = 0.05,
) -> tuple[float, float, float, float]:
    """
    Newcombe's score-based ("hybrid") interval for a difference of two
    proportions; see Newcombe (1998) and Agresti & Caffo (2000).

    Combines the per-sample Wilson interval bounds into bounds for the
    difference; the reported point estimate is the interval midpoint.
    """
    # Wilson bounds for each sample taken individually
    _l1, _u1 = propn_ci(_npos1, _nobs1, alpha=alpha, method="Wilson")[-2:]
    _l2, _u2 = propn_ci(_npos2, _nobs2, alpha=alpha, method="Wilson")[-2:]

    _zsc = norm.ppf(1 - alpha / 2)
    _diff_hat = _npos1 / _nobs1 - _npos2 / _nobs2

    # Lower bound pairs sample 1's lower limit with sample 2's upper limit;
    # the upper bound uses the opposite pairing.
    _diff_cl_l = _diff_hat - _zsc * np.sqrt(
        _l1 * (1 - _l1) / _nobs1 + _u2 * (1 - _u2) / _nobs2
    )
    _diff_cl_u = _diff_hat + _zsc * np.sqrt(
        _u1 * (1 - _u1) / _nobs1 + _l2 * (1 - _l2) / _nobs2
    )

    return _diff_hat, (_diff_cl_l + _diff_cl_u) / 2, _diff_cl_l, _diff_cl_u
333
-
334
-
335
- def _propn_diff_ci_mn(
336
- _npos1: int = 4,
337
- _nobs1: int = 10,
338
- _npos2: int = 4,
339
- _nobs2: int = 10,
340
- /,
341
- *,
342
- alpha: float = 0.05,
343
- method: Literal["M-N", "Mee"] = "M-N",
344
- ) -> tuple[float, float, float, float]:
345
- """
346
- See Miettinen and Nurminen (1985; Newcombe (1998);
347
- and StasAndCIs.r -> BinomDiffCi -> "mn".
348
-
349
- """
350
- for _f in _npos1, _nobs1, _npos1, _nobs2:
351
- if not isinstance(_f, int | np.integer):
352
- raise ValueError(
353
- f"Count, {_f!r} must have type that is a subtype of np.integer."
354
- )
355
-
356
- _chi_sq_cr = chi2.ppf(1 - alpha, 1)
357
- _counts = (_npos1, _nobs1, _npos2, _nobs2)
358
-
359
- _diff_hat = _npos1 / _nobs1 - _npos2 / _nobs2
360
-
361
- _ci_est_start = np.array([(_diff_hat + _s) / 2 for _s in (-1, 1)])
362
- # Avoid potential corner cases
363
- _ci_est_offset = (1 - 1.055e-2, 1)
364
- if _diff_hat == 1.0:
365
- _ci_est_start += _ci_est_offset
366
- elif _diff_hat == -1.0:
367
- _ci_est_start -= _ci_est_offset[::-1]
368
-
369
- def _obj_fn(
370
- _dh: float, _counts: Sequence[int], _cr: float, _method: Literal["M-N", "Mee"]
371
- ) -> float:
372
- return _cr - _propn_diff_chisq_mn(_counts, _dh, method=_method)
373
-
374
- def _get_sol(_sol: OptimizeResult, /) -> float:
375
- return float(_sol.x[0] if _sol.x.shape else _sol.x)
376
-
377
- _diff_cl_l, _diff_cl_u = (
378
- _get_sol(root(_obj_fn, _dh0, args=(_counts, _chi_sq_cr, method)))
379
- for _dh0 in _ci_est_start
380
- )
381
-
382
- _ci_lo, _ci_hi = max(-1.0, _diff_cl_l), min(1.0, _diff_cl_u)
383
- return _diff_hat, (_ci_lo + _ci_hi) / 2, _ci_lo, _ci_hi
384
-
385
-
386
- def _propn_diff_chisq_mn(
387
- _counts: Sequence[int],
388
- _rd: float = 0.0,
389
- /,
390
- *,
391
- method: Literal["M-N", "Mee"] = "M-N",
392
- ) -> float:
393
- R"""Estimate the :math:`\chi^2` statistic for the Meittinen-Nurminen (1985),
394
- and Newcombe (1998) confidence intervals for a difference in binomial proportions.
395
-
396
- Parameters
397
- ----------
398
- _counts
399
- Numbers of positives and observations for (two) samples to be tested
400
-
401
- _rd
402
- Starting value
403
-
404
- method
405
- Specify Meitinen-Nurminen or Mee
406
-
407
- Returns
408
- -------
409
- Chi-square estimate
410
-
411
- """
412
- if _counts is None:
413
- _counts = [1] * 4
414
-
415
- _np1, _no1, _np2, _no2 = _counts
416
- _p1h, _p2h = _np1 / _no1, _np2 / _no2
417
- _diff = _p1h - _p2h - _rd
418
-
419
- if not _diff:
420
- return 0.0
421
-
422
- _np, _no = _np1 + _np2, _no1 + _no2
423
-
424
- _l3 = _no
425
- _l2 = (_no1 + 2 * _no2) * _rd - _no - _np
426
- _l1 = (_no2 * _rd - _no - 2 * _np2) * _rd + _np
427
- _l0 = _np2 * _rd * (1 - _rd)
428
- _l2_to_3l3 = _l2 / (3 * _l3)
429
-
430
- _q = _l2_to_3l3**3 - (_l1 * _l2_to_3l3 - _l0) / (2 * _l3)
431
- _p = np.sign(_q) * np.sqrt(_l2**2 - 3 * _l3 * _l1) / (3 * _l3)
432
- _a = (np.pi + np.arccos(_q / _p**3)) / 3
433
-
434
- _p2t: float = 2 * _p * np.cos(_a) - _l2_to_3l3
435
- _p1t: float = _p2t + _rd
436
-
437
- return _diff**2 / (
438
- (_p1t * (1 - _p1t) / _no1 + _p2t * (1 - _p2t) / _no2)
439
- * (_no / (_no - 1) if method == "M-N" else 1.0)
440
- )
441
-
442
-
443
def propn_diff_ci_multinomial(
    _counts: NDArray[np.integer[TI]], /, *, alpha: float = 0.05
) -> NDArray[np.float64]:
    """Estimate confidence intervals of pair-wise differences in multinomial proportions

    Differences in multinomial proportions sum to zero.

    Parameters
    ----------
    _counts
        Two dimensional np.array with exactly two columns: the observed
        values of the two multinomial distributions (in columns).
    alpha
        Significance level

    Returns
    -------
    Array of confidence intervals, one (lower, upper) row per category

    Raises
    ------
    ValueError
        If `_counts` is not a 2-D array with exactly two columns.

    """
    # The einsum/diff logic below is only well-formed for a (k, 2) input.
    # The original check rejected only ndim > 2, so 1-D input crashed with
    # an opaque einsum error and >2-column input returned malformed output;
    # enforce the documented shape explicitly.
    if _counts.ndim != 2 or _counts.shape[1] != 2:
        raise ValueError(
            "This implementation is only valid for estimating confidence intervals "
            "for differences in two (2) sets of multinomial proportions."
        )

    _prob = _counts / np.einsum("jk->k", _counts).astype(np.int64)
    # NOTE(review): the variance term divides by the per-cell counts rather
    # than the column totals — confirm the intended estimator.
    _var = np.einsum("jk->j", _prob * (1 - _prob) / _counts)[:, None]

    # Bonferroni-adjusted normal critical value over the k categories
    _d, _d_cr = np.diff(_prob, axis=1), norm.ppf(1 - (alpha / len(_counts)))
    return np.column_stack([_d + _f * _d_cr * np.sqrt(_var) for _f in (-1, 1)])
475
-
476
-
477
- @dataclass(slots=True, frozen=True)
478
- class MultinomialPropnsTest:
479
- estimate: np.float64
480
- dof: int
481
- critical_value: np.float64
482
- p_value: np.float64
483
-
484
-
485
- def propn_test_multinomial(
486
- _counts: NDArray[np.integer[TI]], /, *, alpha: float = 0.05
487
- ) -> MultinomialPropnsTest:
488
- """Chi-square test for homogeneity of differences in multinomial proportions.
489
-
490
- Differences in multinomial proportions sum to zero.
491
-
492
- Parameters
493
- ----------
494
- _counts
495
- Two dimensional array of observed values of multinomial distributions
496
- (in columns).
497
- alpha
498
- Significance level
499
-
500
- Returns
501
- -------
502
- Estimated statistic, degrees of freedom, critical value, p-value
503
-
504
- """
505
-
506
- _n = np.einsum("jk->", _counts).astype(np.int64)
507
- _n_k = np.einsum("jk->k", _counts).astype(np.int64)
508
- _prob = _counts / _n_k
509
-
510
- _p_bar = _n / np.einsum("jk->j", _n_k / _prob)
511
-
512
- _y_sq = _n * ((1 / np.einsum("j->", _p_bar)) - 1)
513
- _dof = np.array([_s - 1 for _s in _counts.shape]).prod()
514
- _chi_rv = chi2(_dof)
515
-
516
- return MultinomialPropnsTest(
517
- _y_sq, _dof, _chi_rv.ppf(1 - alpha), 1 - _chi_rv.cdf(_y_sq)
518
- )