mergeron_extra-2024.739099.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,23 @@
+ from __future__ import annotations
+
+ from pathlib import Path
+
+ import numpy as np
+
+ _PKG_NAME: str = Path(__file__).parent.stem
+
+ VERSION = "2024.739097.4"
+
+ __version__ = VERSION
+
+ DATA_DIR: Path = Path.home() / _PKG_NAME
+ """
+ Defines a subdirectory named for this package in the user's home path.
+
+ If the subdirectory doesn't exist, it is created on package invocation.
+ """
+ if not DATA_DIR.is_dir():
+     DATA_DIR.mkdir(parents=False)
+
+
+ np.set_printoptions(precision=18)
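Importing the package is enough to trigger both side effects above. A minimal check, assuming the distribution installs under the import name `mergeron_extra` (the import name is not shown explicitly in this diff):

    from pathlib import Path

    import numpy as np

    import mergeron_extra  # assumed import name; creates the data directory on first import

    assert (Path.home() / mergeron_extra._PKG_NAME).is_dir()
    assert np.get_printoptions()["precision"] == 18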
@@ -0,0 +1,515 @@
+ """
+ Functions to estimate confidence intervals for
+ (a.) a proportion or multiple proportions, and (b.) contrast between
+ two independent proportions or two series of independent proportions.
+
+ """
+
+ from __future__ import annotations
+
+ from collections.abc import Sequence
+ from dataclasses import dataclass
+ from typing import Literal, TypeVar
+
+ import numpy as np
+ from numpy.typing import NBitBase, NDArray
+ from scipy.optimize import OptimizeResult, root  # type: ignore
+ from scipy.stats import beta, chi2, norm  # type: ignore
+
+ from . import TI, VERSION, ArrayDouble, ArrayINT
+
+ __version__ = VERSION
+
+ TI = TypeVar("TI", bound=NBitBase)
+
+
+ def propn_ci(
+     _npos: ArrayINT[TI] | int = 4,
+     _nobs: ArrayINT[TI] | int = 10,
+     /,
+     *,
+     alpha: float = 0.05,
+     method: Literal[
+         "Agresti-Coull", "Clopper-Pearson", "Exact", "Wilson", "Score"
+     ] = "Wilson",
+ ) -> tuple[
+     ArrayDouble | float, ArrayDouble | float, ArrayDouble | float, ArrayDouble | float
+ ]:
+     """Returns point estimates and confidence interval for a proportion.
+
+     Methods "Clopper-Pearson" and "Exact" are synonymous [3]_. Similarly,
+     "Wilson" and "Score" are synonyms here.
+
+     Parameters
+     ----------
+     _npos
+         Number of positives
+
+     _nobs
+         Number of observed values
+
+     alpha
+         Significance level
+
+     method
+         Method to use for estimating the confidence interval
+
+     Returns
+     -------
+     Raw and estimated proportions, and bounds of the confidence interval
+
+
+     References
+     ----------
+
+     .. [3] Alan Agresti & Brent A. Coull (1998) Approximate is Better
+        than “Exact” for Interval Estimation of Binomial Proportions,
+        The American Statistician, 52:2, 119-126,
+        https://doi.org/10.1080/00031305.1998.10480550
+
+     """
+
+     for _f in _npos, _nobs:
+         if not isinstance(_f, int | np.integer):
+             raise ValueError(
+                 f"Count, {_f!r} must be of int type or be a subtype of np.integer."
+             )
+
+     if not _nobs:
+         return (np.nan, np.nan, np.nan, np.nan)
+
+     _raw_phat: ArrayDouble | float = _npos / _nobs
+     _est_phat: ArrayDouble | float
+     _est_ci_l: ArrayDouble | float
+     _est_ci_u: ArrayDouble | float
+
+     match method:
+         case "Clopper-Pearson" | "Exact":
+             _est_ci_l, _est_ci_u = (
+                 beta.ppf(*_f)
+                 for _f in (
+                     (alpha / 2, _npos, _nobs - _npos + 1),
+                     (1 - alpha / 2, _npos + 1, _nobs - _npos),
+                 )
+             )
+             _est_phat = 1 / 2 * (_est_ci_l + _est_ci_u)
+
+         case "Agresti-Coull":
+             _zsc = norm.ppf(1 - alpha / 2)
+             _zscsq = _zsc * _zsc
+             _adjmt = 4 if alpha == 0.05 else _zscsq
+             _est_phat = (_npos + _adjmt / 2) / (_nobs + _adjmt)
+             _est_ci_l, _est_ci_u = (
+                 _est_phat + _g
+                 for _g in [
+                     _f * _zsc * np.sqrt(_est_phat * (1 - _est_phat) / (_nobs + _adjmt))
+                     for _f in (-1, 1)
+                 ]
+             )
+
+         case "Wilson" | "Score":
+             _zsc = norm.ppf(1 - alpha / 2)
+             _zscsq = _zsc * _zsc
+             _est_phat = (_npos + _zscsq / 2) / (_nobs + _zscsq)
+             _est_ci_l, _est_ci_u = (
+                 _est_phat
+                 + _f
+                 * _zsc
+                 * np.sqrt(_nobs * _raw_phat * (1 - _raw_phat) + _zscsq / 4)
+                 / (_nobs + _zscsq)
+                 for _f in (-1, 1)
+             )
+
+         case _:
+             raise ValueError(f'Method "{method}" not yet implemented.')
+
+     return _raw_phat, _est_phat, _est_ci_l, _est_ci_u
+
+
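A usage sketch for `propn_ci`, assuming it has been imported from this module (the module path is not shown in the diff):

    raw, est, lo, hi = propn_ci(4, 10, alpha=0.05, method="Wilson")
    print(f"raw={raw:.3f} est={est:.3f} ci=({lo:.3f}, {hi:.3f})")
    # With these inputs, raw == 0.4 and the Wilson interval is roughly (0.168, 0.687).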
+ def propn_ci_multinomial(
+     _counts: NDArray[np.integer[TI]],
+     /,
+     *,
+     alpha: float = 0.05,
+     method: Literal["goodman", "quesenberry-hurst"] = "goodman",
+     alternative: Literal["default", "simplified"] = "default",
+ ) -> ArrayDouble:
+     """Confidence intervals for multiple proportions.
+
+     Parameters
+     ----------
+     _counts
+         One-dimensional array of multinomial category counts
+     alpha
+         Significance level
+     method
+         Method used to compute confidence intervals
+     alternative
+         Method used to estimate standard errors, whether "default"
+         or "simplified"
+
+     Returns
+     -------
+     Array of confidence intervals, one (lower, upper) row per category
+
+     """
+     if method not in (_mli := ("goodman", "quesenberry-hurst")):
+         raise ValueError(
+             f'Invalid value "{method}" for "method". Must be one of {_mli}.'
+         )
+
+     _n = np.einsum("j->", _counts).astype(np.int64)
+     _prob = _counts / _n
+     _chi2_cr = (
+         chi2(len(_counts) - 1).ppf(1 - alpha)
+         if method == "quesenberry-hurst"
+         else chi2(1).ppf(1 - (alpha / len(_counts)))
+     )
+
+     if alternative == "default":
+         _ci_len_half = np.sqrt(_chi2_cr * (_chi2_cr + 4 * _n * _prob * (1 - _prob)))
+         return np.column_stack([
+             (_chi2_cr + 2 * _counts + _f * _ci_len_half) / (2 * (_n + _chi2_cr))
+             for _f in (-1, 1)
+         ])
+
+     elif alternative == "simplified":
+         _ci_len_half = np.sqrt(_chi2_cr * _prob * (1 - _prob) / _n)
+         return np.column_stack([_prob + _f * _ci_len_half for _f in (-1, 1)])
+
+     else:
+         raise ValueError(
+             f'Invalid value "{alternative}" for "alternative". '
+             'Must be one of ("default", "simplified").'
+         )
+
+
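A sketch of calling `propn_ci_multinomial` for simultaneous category-wise bounds, assuming the function and `numpy` (imported as `np`) are in scope:

    counts = np.array([25, 40, 35])
    goodman_ci = propn_ci_multinomial(counts, alpha=0.05, method="goodman")
    # goodman_ci is a 3 x 2 array: one (lower, upper) row per category proportion.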
+ def propn_diff_ci(
+     _npos1: int = 4,
+     _nobs1: int = 10,
+     _npos2: int = 4,
+     _nobs2: int = 10,
+     /,
+     *,
+     alpha: float = 0.05,
+     method: Literal["Agresti-Caffo", "Mee", "M-N", "Newcombe", "Score"] = "M-N",
+ ) -> tuple[float, float, float, float]:
+     R"""Confidence intervals for differences in binomial proportions.
+
+     Methods available are Agresti-Caffo [4]_, Mee [5]_, Miettinen-Nurminen [5]_ [6]_,
+     and Newcombe (aka the Score method) [5]_. See also the source code for the
+     R-language function BinomDiffCI, in the module StatsAndCIs [7]_.
+
+     Parameters
+     ----------
+     _npos1, _npos2
+         Counts of positive outcomes in the respective binomial distributions
+     _nobs1, _nobs2
+         Counts of all outcomes in the respective binomial distributions
+     alpha
+         Significance level
+     method
+         Method used to compute confidence intervals
+
+     Returns
+     -------
+     Raw and expected values of the estimated difference, with bounds of the c.i.
+
+     References
+     ----------
+
+     .. [4] Agresti, A., & Caffo, B. (2000). Simple and Effective
+        Confidence Intervals for Proportions and Differences of Proportions
+        Result from Adding Two Successes and Two Failures.
+        The American Statistician, 54(4), 280--288. https://doi.org/10.2307/2685779
+
+     .. [5] Newcombe, R.G. (1998). Two-sided confidence intervals for
+        the single proportion: comparison of seven methods. Statist. Med., 17: 857-872.
+        https://doi.org/10.1002/(SICI)1097-0258(19980430)17:8%3C857::AID-SIM777%3E3.0.CO;2-E
+
+     .. [6] Miettinen, O. and Nurminen, M. (1985). Comparative analysis of two rates.
+        Statist. Med., 4: 213-226. https://doi.org/10.1002/sim.4780040211; Appendix I
+
+     .. [7] StatsAndCIs.r, function BinomDiffCI, method "mn"
+        https://github.com/cran/DescTools/blob/master/R/StatsAndCIs.r
+        (R source code is distributed under the CC-BY license.)
+
+     """
+     for _f in _npos1, _nobs1, _npos2, _nobs2:
+         if not isinstance(_f, int | np.integer):
+             raise ValueError(
+                 f"Count, {_f!r} must be of int type or be a subtype of np.integer."
+             )
+
+     if not min(_nobs1, _nobs2):
+         return (np.nan, np.nan, np.nan, np.nan)
+
+     match method:
+         case "Agresti-Caffo":
+             _res = _propn_diff_ci_agresti_caffo(
+                 _npos1, _nobs1, _npos2, _nobs2, alpha=alpha
+             )
+
+         case "Newcombe" | "Score":
+             _res = _propn_diff_ci_newcombe_score(
+                 _npos1, _nobs1, _npos2, _nobs2, alpha=alpha
+             )
+
+         case "M-N" | "Mee":
+             _res = _propn_diff_ci_mn(
+                 _npos1, _nobs1, _npos2, _nobs2, alpha=alpha, method=method
+             )
+
+         case _:
+             raise ValueError(f'Method "{method}" not implemented.')
+
+     return _res
+
+
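A usage sketch for `propn_diff_ci`, again assuming the function has been imported from this module:

    raw, est, lo, hi = propn_diff_ci(7, 25, 12, 30, alpha=0.05, method="M-N")
    # raw == 7/25 - 12/30 == -0.12; lo and hi bracket the difference and are clipped to [-1, 1].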
+ def _propn_diff_ci_agresti_caffo(
+     _npos1: int = 4,
+     _nobs1: int = 10,
+     _npos2: int = 4,
+     _nobs2: int = 10,
+     /,
+     *,
+     alpha: float = 0.05,
+ ) -> tuple[float, float, float, float]:
+     """
+     Estimate the Agresti-Caffo confidence interval for the difference
+     of two proportions.
+     """
+
+     _diff_hat = _npos1 / _nobs1 - _npos2 / _nobs2
+
+     _zsc = norm.ppf(1 - alpha / 2)
+     _zscsq = _zsc * _zsc
+
+     _adjmt_t = 2 if alpha == 0.05 else _zscsq / 2
+     _npos1_ac, _npos2_ac = (_f + _adjmt_t / 2 for _f in (_npos1, _npos2))
+     _nobs1_ac, _nobs2_ac = (_f + _adjmt_t for _f in (_nobs1, _nobs2))
+
+     _p1_est = _npos1_ac / _nobs1_ac
+     _p2_est = _npos2_ac / _nobs2_ac
+     _diff_est = _p1_est - _p2_est
+     _se_est = np.sqrt(
+         _p1_est * (1 - _p1_est) / _nobs1_ac + _p2_est * (1 - _p2_est) / _nobs2_ac
+     )
+
+     _diff_cl_l, _diff_cl_u = (_diff_est + _s * _zsc * _se_est for _s in (-1, 1))
+
+     return _diff_hat, _diff_est, max(-1.0, _diff_cl_l), min(1.0, _diff_cl_u)
+
+
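At alpha = 0.05 the adjustment above reduces to the familiar "add one success and one failure to each sample" rule, so the adjusted estimate for each sample is (x + 1)/(n + 2). A quick check of that reading, assuming the private helper is in scope:

    _, est, _, _ = _propn_diff_ci_agresti_caffo(4, 10, 7, 20, alpha=0.05)
    assert abs(est - ((4 + 1) / (10 + 2) - (7 + 1) / (20 + 2))) < 1e-12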
+ def _propn_diff_ci_newcombe_score(
+     _npos1: int = 4,
+     _nobs1: int = 10,
+     _npos2: int = 4,
+     _nobs2: int = 10,
+     /,
+     *,
+     alpha: float = 0.05,
+ ) -> tuple[float, float, float, float]:
+     """
+     See Newcombe (1998); Agresti & Caffo (2000).
+     """
+     _l1, _u1 = propn_ci(_npos1, _nobs1, alpha=alpha, method="Wilson")[-2:]
+     _l2, _u2 = propn_ci(_npos2, _nobs2, alpha=alpha, method="Wilson")[-2:]
+
+     _zsc = norm.ppf(1 - alpha / 2)
+     _diff_hat = _npos1 / _nobs1 - _npos2 / _nobs2
+
+     _diff_cl_l = _diff_hat - _zsc * np.sqrt(
+         _l1 * (1 - _l1) / _nobs1 + _u2 * (1 - _u2) / _nobs2
+     )
+     _diff_cl_u = _diff_hat + _zsc * np.sqrt(
+         _u1 * (1 - _u1) / _nobs1 + _l2 * (1 - _l2) / _nobs2
+     )
+
+     return _diff_hat, (_diff_cl_l + _diff_cl_u) / 2, _diff_cl_l, _diff_cl_u
+
+
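The hybrid interval above reuses the Wilson limits of each sample proportion; for instance, the lower limit can be recomputed directly from `propn_ci` output. A sketch assuming the module-level `np` and `norm` plus both functions are in scope:

    l1, u1 = propn_ci(4, 10, method="Wilson")[-2:]
    l2, u2 = propn_ci(7, 20, method="Wilson")[-2:]
    z = norm.ppf(1 - 0.05 / 2)
    lo_manual = (4 / 10 - 7 / 20) - z * np.sqrt(l1 * (1 - l1) / 10 + u2 * (1 - u2) / 20)
    assert abs(lo_manual - _propn_diff_ci_newcombe_score(4, 10, 7, 20)[2]) < 1e-12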
+ def _propn_diff_ci_mn(
+     _npos1: int = 4,
+     _nobs1: int = 10,
+     _npos2: int = 4,
+     _nobs2: int = 10,
+     /,
+     *,
+     alpha: float = 0.05,
+     method: Literal["M-N", "Mee"] = "M-N",
+ ) -> tuple[float, float, float, float]:
+     """
+     See Miettinen and Nurminen (1985); Newcombe (1998);
+     and StatsAndCIs.r -> BinomDiffCI -> "mn".
+
+     """
+     for _f in _npos1, _nobs1, _npos2, _nobs2:
+         if not isinstance(_f, int | np.integer):
+             raise ValueError(
+                 f"Count, {_f!r} must be of int type or be a subtype of np.integer."
+             )
+
+     _chi_sq_cr = chi2.ppf(1 - alpha, 1)
+     _counts = (_npos1, _nobs1, _npos2, _nobs2)
+
+     _diff_hat = _npos1 / _nobs1 - _npos2 / _nobs2
+
+     _ci_est_start = np.array([(_diff_hat + _s) / 2 for _s in (-1, 1)])
+     # Avoid potential corner cases
+     _ci_est_offset = (1 - 1.055e-2, 1)
+     if _diff_hat == 1.0:
+         _ci_est_start += _ci_est_offset
+     elif _diff_hat == -1.0:
+         _ci_est_start -= _ci_est_offset[::-1]
+
+     def _obj_fn(
+         _dh: float, _counts: Sequence[int], _cr: float, _method: Literal["M-N", "Mee"]
+     ) -> float:
+         return _cr - _propn_diff_chisq_mn(_counts, _dh, method=_method)
+
+     def _get_sol(_sol: OptimizeResult, /) -> float:
+         return float(_sol.x[0] if _sol.x.shape else _sol.x)
+
+     _diff_cl_l, _diff_cl_u = (
+         _get_sol(root(_obj_fn, _dh0, args=(_counts, _chi_sq_cr, method)))
+         for _dh0 in _ci_est_start
+     )
+
+     _ci_lo, _ci_hi = max(-1.0, _diff_cl_l), min(1.0, _diff_cl_u)
+     return _diff_hat, (_ci_lo + _ci_hi) / 2, _ci_lo, _ci_hi
+
+
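The limits returned above invert the score test: each confidence limit is the value of the difference at which the M-N chi-square statistic reaches the critical value. A sketch of that property, assuming the private helpers and `chi2` are in scope (the tolerance is deliberately loose to allow for root-finder precision):

    _, _, lo, hi = _propn_diff_ci_mn(7, 25, 12, 30, alpha=0.05, method="M-N")
    crit = chi2.ppf(1 - 0.05, 1)
    for limit in (lo, hi):
        assert abs(_propn_diff_chisq_mn((7, 25, 12, 30), limit, method="M-N") - crit) < 1e-4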
+ def _propn_diff_chisq_mn(
+     _counts: Sequence[int],
+     _rd: float = 0.0,
+     /,
+     *,
+     method: Literal["M-N", "Mee"] = "M-N",
+ ) -> float:
+     R"""Estimate the :math:`\chi^2` statistic for the Miettinen-Nurminen (1985)
+     and Newcombe (1998) confidence intervals for a difference in binomial proportions.
+
+     Parameters
+     ----------
+     _counts
+         Numbers of positives and observations for the (two) samples to be tested
+
+     _rd
+         Hypothesized difference in proportions at which the statistic is evaluated
+
+     method
+         Specify Miettinen-Nurminen or Mee
+
+     Returns
+     -------
+     Chi-square estimate
+
+     """
+     if _counts is None:
+         _counts = [1] * 4
+
+     _np1, _no1, _np2, _no2 = _counts
+     _p1h, _p2h = _np1 / _no1, _np2 / _no2
+     _diff = _p1h - _p2h - _rd
+
+     if not _diff:
+         return 0.0
+
+     _np, _no = _np1 + _np2, _no1 + _no2
+
+     _l3 = _no
+     _l2 = (_no1 + 2 * _no2) * _rd - _no - _np
+     _l1 = (_no2 * _rd - _no - 2 * _np2) * _rd + _np
+     _l0 = _np2 * _rd * (1 - _rd)
+     _l2_to_3l3 = _l2 / (3 * _l3)
+
+     _q = _l2_to_3l3**3 - (_l1 * _l2_to_3l3 - _l0) / (2 * _l3)
+     _p = np.sign(_q) * np.sqrt(_l2**2 - 3 * _l3 * _l1) / (3 * _l3)
+     _a = (np.pi + np.arccos(_q / _p**3)) / 3
+
+     _p2t: float = 2 * _p * np.cos(_a) - _l2_to_3l3
+     _p1t: float = _p2t + _rd
+
+     return _diff**2 / (
+         (_p1t * (1 - _p1t) / _no1 + _p2t * (1 - _p2t) / _no2)
+         * (_no / (_no - 1) if method == "M-N" else 1.0)
+     )
+
+
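Restated in the notation of Miettinen and Nurminen (1985), the value returned above is (with :math:`\tilde p_1, \tilde p_2` the MLEs of the two proportions constrained to :math:`\tilde p_1 - \tilde p_2 = \delta`, obtained from the cubic solved in closed form, and :math:`N = n_1 + n_2`):

    \chi^2(\delta)
      = \frac{(\hat p_1 - \hat p_2 - \delta)^2}
             {\left(\tilde p_1(1 - \tilde p_1)/n_1 + \tilde p_2(1 - \tilde p_2)/n_2\right)\,\lambda},
    \qquad
    \lambda = \begin{cases} N/(N - 1) & \text{method "M-N"} \\ 1 & \text{method "Mee"} \end{cases}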
+ def propn_diff_ci_multinomial(
+     _counts: NDArray[np.integer[TI]], /, *, alpha: float = 0.05
+ ) -> ArrayDouble:
+     """Estimate confidence intervals for pair-wise differences in multinomial proportions.
+
+     Differences in multinomial proportions sum to zero.
+
+     Parameters
+     ----------
+     _counts
+         Two-dimensional np.array of observed values of multinomial distributions
+         (in columns).
+     alpha
+         Significance level
+
+     Returns
+     -------
+     Array of confidence intervals
+
+     """
+
+     if len(_counts.shape) > 2:
+         raise ValueError(
+             "This implementation is only valid for estimating confidence intervals "
+             "for differences in two (2) sets of multinomial proportions."
+         )
+
+     _prob = _counts / np.einsum("jk->k", _counts).astype(np.int64)
+     _var = np.einsum("jk->j", _prob * (1 - _prob) / _counts)[:, None]
+
+     _d, _d_cr = np.diff(_prob, axis=1), norm.ppf(1 - (alpha / len(_counts)))
+     return np.column_stack([_d + _f * _d_cr * np.sqrt(_var) for _f in (-1, 1)])
+
+
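A usage sketch, assuming `propn_diff_ci_multinomial` and `numpy` (as `np`) are in scope:

    counts = np.array([[25, 30], [40, 35], [35, 35]])  # categories in rows, the two samples in columns
    diff_ci = propn_diff_ci_multinomial(counts, alpha=0.05)
    # diff_ci is a 3 x 2 array of (lower, upper) bounds for the per-category
    # difference in proportions (second column minus first).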
+ @dataclass(slots=True, frozen=True)
+ class MultinomialPropnsTest:
+     estimate: np.float64
+     dof: int
+     critical_value: np.float64
+     p_value: np.float64
+
+
+ def propn_test_multinomial(
+     _counts: NDArray[np.integer[TI]], /, *, alpha: float = 0.05
+ ) -> MultinomialPropnsTest:
+     """Chi-square test for homogeneity of differences in multinomial proportions.
+
+     Differences in multinomial proportions sum to zero.
+
+     Parameters
+     ----------
+     _counts
+         Two dimensional array of observed values of multinomial distributions
+         (in columns).
+     alpha
+         Significance level
+
+     Returns
+     -------
+     Estimated statistic, degrees of freedom, critical value, p-value
+
+     """
+
+     _n = np.einsum("jk->", _counts).astype(np.int64)
+     _n_k = np.einsum("jk->k", _counts).astype(np.int64)
+     _prob = _counts / _n_k
+
+     _p_bar = _n / np.einsum("jk->j", _n_k / _prob)
+
+     _y_sq = _n * ((1 / np.einsum("j->", _p_bar)) - 1)
+     _dof = np.array([_s - 1 for _s in _counts.shape]).prod()
+     _chi_rv = chi2(_dof)
+
+     return MultinomialPropnsTest(
+         _y_sq, _dof, _chi_rv.ppf(1 - alpha), 1 - _chi_rv.cdf(_y_sq)
+     )
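Finally, a usage sketch for `propn_test_multinomial`, assuming the function and `numpy` (as `np`) are in scope:

    counts = np.array([[25, 30], [40, 35], [35, 35]])
    res = propn_test_multinomial(counts, alpha=0.05)
    print(res.estimate, res.dof, res.critical_value, res.p_value)
    # res.dof == 2 here: (rows - 1) * (columns - 1).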