evalci 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
evalci-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Shreyas
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
evalci-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,159 @@
1
+ Metadata-Version: 2.4
2
+ Name: evalci
3
+ Version: 0.1.0
4
+ Summary: Statistical significance & confidence intervals for LLM evals
5
+ Author: Shreyas
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/Shreyaskc/evalci
8
+ Project-URL: Repository, https://github.com/Shreyaskc/evalci
9
+ Project-URL: Issues, https://github.com/Shreyaskc/evalci/issues
10
+ Keywords: llm,evaluation,statistics,confidence-interval,significance-testing,benchmarking
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Science/Research
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3 :: Only
15
+ Classifier: Topic :: Scientific/Engineering
16
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
17
+ Requires-Python: >=3.9
18
+ Description-Content-Type: text/markdown
19
+ License-File: LICENSE
20
+ Requires-Dist: numpy
21
+ Requires-Dist: scipy
22
+ Requires-Dist: pandas
23
+ Provides-Extra: test
24
+ Requires-Dist: pytest; extra == "test"
25
+ Requires-Dist: statsmodels; extra == "test"
26
+ Dynamic: license-file
27
+
28
+ # evalci
29
+
30
+ Statistically sound comparisons between LLMs on benchmarks: confidence
31
+ intervals on accuracy, paired significance tests, power analysis, clustered
32
+ standard errors for multi-sample decoding, and multiple-comparison correction
33
+ across many models/benchmarks — all validated against `statsmodels`/exact
34
+ enumeration fixtures.
35
+
36
+ ```python
37
+ >>> import evalci
38
+ >>> result = evalci.compare(model_a_scores, model_b_scores, method="permutation")
39
+ >>> evalci.report(result)
40
+ 'Δ=0.034, 95% CI [0.005, 0.063], paired permutation p=0.025*, n=1319' # exact numbers depend on your data
41
+ ```
42
+
43
+ ## Status
44
+
45
+ Core library (statistics, eval-shaped workflows, adapters, CLI) is implemented
46
+ and tested. Not yet released to PyPI; no arXiv paper or DOI yet.
47
+
48
+ ## Install
49
+
50
+ Not yet on PyPI. Install from source:
51
+
52
+ ```bash
53
+ git clone https://github.com/Shreyaskc/evalci.git
54
+ cd evalci
55
+ pip install -e ".[test]" # the [test] extra also installs pytest/statsmodels for running the test suite
56
+ ```
57
+
58
+ Requires Python ≥3.9. Runtime dependencies are numpy, scipy, and pandas only.
59
+
60
+ ## Usage
61
+
62
+ ### Confidence interval on a single model
63
+
64
+ ```python
65
+ import evalci
66
+
67
+ # binary (0/1) per-item correctness
68
+ evalci.ci(scores, method="wilson") # Wilson score interval
69
+ evalci.ci(scores, method="clopper-pearson") # exact interval
70
+
71
+ # continuous scores (e.g. a similarity metric)
72
+ evalci.ci(scores, method="bootstrap") # percentile/BCa bootstrap on the mean
73
+ ```
74
+
75
+ ### Comparing two models on the same items
76
+
77
+ ```python
78
+ result = evalci.compare(model_a_scores, model_b_scores, paired=True, method="permutation")
79
+ # result.delta, result.ci, result.p_value, result.n
80
+
81
+ evalci.compare(a, b, method="bootstrap") # null-shifted bootstrap hypothesis test
82
+ evalci.compare(a, b, method="mcnemar") # McNemar's test for paired binary outcomes
83
+ evalci.compare(a, b, paired=False, method="permutation") # independent samples
84
+ ```
85
+
86
+ ### Sample-size / power calculator
87
+
88
+ ```python
89
+ evalci.power(delta=0.03, power=0.8) # required n to detect a 3-point gap at 80% power
90
+ evalci.power(delta=0.03, n=1500) # achieved power at n=1500
91
+ evalci.power(delta=0.03, power=0.8, method="simulation", rho=0.3) # correlated-items simulation
92
+ ```
93
+
94
+ ### Many models × many benchmarks, with correction
95
+
96
+ ```python
97
+ import pandas as pd
98
+
99
+ # per-item schema: item_id, model, score, [subset], [sample_idx]
100
+ df = pd.DataFrame(...)
101
+ table = evalci.multi_compare(df, correction="holm")
102
+ print(evalci.report(table, format="markdown"))
103
+ ```
104
+
105
+ ### Clustered standard errors (repeated decoding, grouped questions)
106
+
107
+ ```python
108
+ # `clusters` groups multiple samples of the same underlying item
109
+ evalci.cluster_ci(scores, clusters)
110
+ ```
111
+
112
+ ### Loading results from eval harnesses
113
+
114
+ ```python
115
+ from evalci.adapters import load_lm_eval_harness, load_helm, load_csv
116
+
117
+ df_a = load_lm_eval_harness("results_a.json", model="model-a")
118
+ df_b = load_helm("per_instance_stats.json", model="model-b")
119
+ ```
120
+
121
+ ### CLI
122
+
123
+ ```bash
124
+ evalci compare results_a.json results_b.json --method permutation
125
+ evalci compare results_a.json results_b.json --format helm --method mcnemar
126
+ ```
127
+
128
+ Auto-detects lm-evaluation-harness / HELM / CSV format from the file
129
+ extension/content; pass `--format` to override, and `--model-a`/`--model-b` to
130
+ label the two runs explicitly.
131
+
132
+ ## What's validated, and how
133
+
134
+ Statistical correctness is the point of this library, so the test suite
135
+ cross-checks every routine against an independent reference rather than just
136
+ re-testing its own math:
137
+
138
+ - Wilson and Clopper-Pearson intervals against `statsmodels.stats.proportion.proportion_confint`
139
+ - McNemar's test (exact and asymptotic) against `statsmodels.stats.contingency_tables.mcnemar`
140
+ - Holm and Benjamini-Hochberg correction against `statsmodels.stats.multitest.multipletests`
141
+ - The paired permutation test against brute-force exact enumeration of all sign flips (small n)
142
+ - Bootstrap CIs via a coverage simulation (nominal 95% CIs should contain the true parameter ~95% of the time)
143
+
144
+ `statsmodels` is a test-only dependency (`pip install -e ".[test]"`), not a
145
+ runtime dependency.
146
+
147
+ ```bash
148
+ pytest tests/
149
+ ```
150
+
151
+ ## API surface
152
+
153
+ `evalci.ci`, `evalci.compare`, `evalci.power`, `evalci.multi_compare`,
154
+ `evalci.cluster_ci`, `evalci.report`, `evalci.adapters.{load_lm_eval_harness,
155
+ load_helm, load_csv}`.
156
+
157
+ ## License
158
+
159
+ MIT — see [LICENSE](LICENSE).
evalci-0.1.0/README.md ADDED
@@ -0,0 +1,132 @@
1
+ # evalci
2
+
3
+ Statistically sound comparisons between LLMs on benchmarks: confidence
4
+ intervals on accuracy, paired significance tests, power analysis, clustered
5
+ standard errors for multi-sample decoding, and multiple-comparison correction
6
+ across many models/benchmarks — all validated against `statsmodels`/exact
7
+ enumeration fixtures.
8
+
9
+ ```python
10
+ >>> import evalci
11
+ >>> result = evalci.compare(model_a_scores, model_b_scores, method="permutation")
12
+ >>> evalci.report(result)
13
+ 'Δ=0.034, 95% CI [0.005, 0.063], paired permutation p=0.025*, n=1319' # exact numbers depend on your data
14
+ ```
15
+
16
+ ## Status
17
+
18
+ Core library (statistics, eval-shaped workflows, adapters, CLI) is implemented
19
+ and tested. Not yet released to PyPI; no arXiv paper or DOI yet.
20
+
21
+ ## Install
22
+
23
+ Not yet on PyPI. Install from source:
24
+
25
+ ```bash
26
+ git clone https://github.com/Shreyaskc/evalci.git
27
+ cd evalci
28
+ pip install -e ".[test]" # the [test] extra also installs pytest/statsmodels for running the test suite
29
+ ```
30
+
31
+ Requires Python ≥3.9. Runtime dependencies are numpy, scipy, and pandas only.
32
+
33
+ ## Usage
34
+
35
+ ### Confidence interval on a single model
36
+
37
+ ```python
38
+ import evalci
39
+
40
+ # binary (0/1) per-item correctness
41
+ evalci.ci(scores, method="wilson") # Wilson score interval
42
+ evalci.ci(scores, method="clopper-pearson") # exact interval
43
+
44
+ # continuous scores (e.g. a similarity metric)
45
+ evalci.ci(scores, method="bootstrap") # percentile/BCa bootstrap on the mean
46
+ ```
47
+
48
+ ### Comparing two models on the same items
49
+
50
+ ```python
51
+ result = evalci.compare(model_a_scores, model_b_scores, paired=True, method="permutation")
52
+ # result.delta, result.ci, result.p_value, result.n
53
+
54
+ evalci.compare(a, b, method="bootstrap") # null-shifted bootstrap hypothesis test
55
+ evalci.compare(a, b, method="mcnemar") # McNemar's test for paired binary outcomes
56
+ evalci.compare(a, b, paired=False, method="permutation") # independent samples
57
+ ```
58
+
59
+ ### Sample-size / power calculator
60
+
61
+ ```python
62
+ evalci.power(delta=0.03, power=0.8) # required n to detect a 3-point gap at 80% power
63
+ evalci.power(delta=0.03, n=1500) # achieved power at n=1500
64
+ evalci.power(delta=0.03, power=0.8, method="simulation", rho=0.3) # correlated-items simulation
65
+ ```
66
+
67
+ ### Many models × many benchmarks, with correction
68
+
69
+ ```python
70
+ import pandas as pd
71
+
72
+ # per-item schema: item_id, model, score, [subset], [sample_idx]
73
+ df = pd.DataFrame(...)
74
+ table = evalci.multi_compare(df, correction="holm")
75
+ print(evalci.report(table, format="markdown"))
76
+ ```
77
+
78
+ ### Clustered standard errors (repeated decoding, grouped questions)
79
+
80
+ ```python
81
+ # `clusters` groups multiple samples of the same underlying item
82
+ evalci.cluster_ci(scores, clusters)
83
+ ```
84
+
85
+ ### Loading results from eval harnesses
86
+
87
+ ```python
88
+ from evalci.adapters import load_lm_eval_harness, load_helm, load_csv
89
+
90
+ df_a = load_lm_eval_harness("results_a.json", model="model-a")
91
+ df_b = load_helm("per_instance_stats.json", model="model-b")
92
+ ```
93
+
94
+ ### CLI
95
+
96
+ ```bash
97
+ evalci compare results_a.json results_b.json --method permutation
98
+ evalci compare results_a.json results_b.json --format helm --method mcnemar
99
+ ```
100
+
101
+ Auto-detects lm-evaluation-harness / HELM / CSV format from the file
102
+ extension/content; pass `--format` to override, and `--model-a`/`--model-b` to
103
+ label the two runs explicitly.
104
+
105
+ ## What's validated, and how
106
+
107
+ Statistical correctness is the point of this library, so the test suite
108
+ cross-checks every routine against an independent reference rather than just
109
+ re-testing its own math:
110
+
111
+ - Wilson and Clopper-Pearson intervals against `statsmodels.stats.proportion.proportion_confint`
112
+ - McNemar's test (exact and asymptotic) against `statsmodels.stats.contingency_tables.mcnemar`
113
+ - Holm and Benjamini-Hochberg correction against `statsmodels.stats.multitest.multipletests`
114
+ - The paired permutation test against brute-force exact enumeration of all sign flips (small n)
115
+ - Bootstrap CIs via a coverage simulation (nominal 95% CIs should contain the true parameter ~95% of the time)
116
+
117
+ `statsmodels` is a test-only dependency (`pip install -e ".[test]"`), not a
118
+ runtime dependency.
119
+
120
+ ```bash
121
+ pytest tests/
122
+ ```
123
+
124
+ ## API surface
125
+
126
+ `evalci.ci`, `evalci.compare`, `evalci.power`, `evalci.multi_compare`,
127
+ `evalci.cluster_ci`, `evalci.report`, `evalci.adapters.{load_lm_eval_harness,
128
+ load_helm, load_csv}`.
129
+
130
+ ## License
131
+
132
+ MIT — see [LICENSE](LICENSE).
@@ -0,0 +1,42 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "evalci"
7
+ version = "0.1.0"
8
+ description = "Statistical significance & confidence intervals for LLM evals"
9
+ readme = "README.md"
10
+ license = "MIT"
11
+ requires-python = ">=3.9"
12
+ authors = [
13
+ { name = "Shreyas" },
14
+ ]
15
+ keywords = ["llm", "evaluation", "statistics", "confidence-interval", "significance-testing", "benchmarking"]
16
+ classifiers = [
17
+ "Development Status :: 3 - Alpha",
18
+ "Intended Audience :: Science/Research",
19
+ "Programming Language :: Python :: 3",
20
+ "Programming Language :: Python :: 3 :: Only",
21
+ "Topic :: Scientific/Engineering",
22
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
23
+ ]
24
+ dependencies = [
25
+ "numpy",
26
+ "scipy",
27
+ "pandas",
28
+ ]
29
+
30
+ [project.urls]
31
+ Homepage = "https://github.com/Shreyaskc/evalci"
32
+ Repository = "https://github.com/Shreyaskc/evalci"
33
+ Issues = "https://github.com/Shreyaskc/evalci/issues"
34
+
35
+ [project.optional-dependencies]
36
+ test = ["pytest", "statsmodels"]
37
+
38
+ [project.scripts]
39
+ evalci = "evalci.cli:main"
40
+
41
+ [tool.setuptools.packages.find]
42
+ where = ["src"]
evalci-0.1.0/setup.cfg ADDED
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,25 @@
1
+ from .stats import (
2
+ CIResult,
3
+ CompareResult,
4
+ PowerResult,
5
+ ci,
6
+ cluster_ci,
7
+ compare,
8
+ multi_compare,
9
+ power,
10
+ )
11
+ from .report import report
12
+ from . import adapters
13
+
14
+ __all__ = [
15
+ "ci",
16
+ "compare",
17
+ "power",
18
+ "multi_compare",
19
+ "cluster_ci",
20
+ "report",
21
+ "CIResult",
22
+ "CompareResult",
23
+ "PowerResult",
24
+ "adapters",
25
+ ]
@@ -0,0 +1,41 @@
1
+ """Multiple-comparison correction: Holm and Benjamini-Hochberg."""
2
+ import numpy as np
3
+
4
+
5
def holm(pvalues):
    """Adjust p-values with Holm's (1979) step-down Bonferroni procedure.

    Walking the p-values from smallest to largest, the k-th one (k counted
    from 0) is multiplied by ``n - k``. A running maximum keeps the adjusted
    values non-decreasing along that walk, and every result is capped at 1.

    Returns:
        A float array of adjusted p-values in the same order as the input.
    """
    raw = np.asarray(pvalues, dtype=float)
    total = len(raw)
    adjusted = np.empty(total)
    ceiling = 0.0
    # Pair each index (ascending by p-value) with its multiplier n, n-1, ..., 1.
    for position, multiplier in zip(np.argsort(raw), range(total, 0, -1)):
        ceiling = max(ceiling, multiplier * raw[position])
        adjusted[position] = min(ceiling, 1.0)
    return adjusted
16
+
17
+
18
def benjamini_hochberg(pvalues):
    """Adjust p-values with the Benjamini-Hochberg (1995) FDR procedure.

    Walking the p-values from largest to smallest, the one with ascending
    rank ``r`` is scaled to ``p * n / r``. A running minimum keeps the
    adjusted values non-increasing along that walk, and the result is
    clipped to ``[0, 1]``.

    Returns:
        A float array of adjusted (q) values in the same order as the input.
    """
    raw = np.asarray(pvalues, dtype=float)
    total = len(raw)
    adjusted = np.empty(total)
    floor = 1.0
    largest_first = np.argsort(raw)[::-1]
    # Pair each index (descending by p-value) with its rank n, n-1, ..., 1.
    for position, rank in zip(largest_first, range(total, 0, -1)):
        floor = min(floor, raw[position] * total / rank)
        adjusted[position] = floor
    return np.clip(adjusted, 0, 1)
30
+
31
+
32
def correct_pvalues(pvalues, method="holm"):
    """Dispatch to a multiple-comparison correction by (case-insensitive) name.

    Accepted names: ``"holm"`` / ``"bonferroni-holm"``, ``"bh"`` /
    ``"benjamini-hochberg"`` / ``"fdr_bh"``, and ``"bonferroni"``.

    Returns:
        An array of adjusted p-values in input order.

    Raises:
        ValueError: if ``method`` is not one of the accepted names.
    """
    method = method.lower()
    if method == "bonferroni":
        # Plain Bonferroni: scale every p-value by the number of tests.
        raw = np.asarray(pvalues, dtype=float)
        return np.clip(raw * len(raw), 0, 1)
    if method in ("holm", "bonferroni-holm"):
        return holm(pvalues)
    if method in ("bh", "benjamini-hochberg", "fdr_bh"):
        return benjamini_hochberg(pvalues)
    raise ValueError(f"unknown correction method: {method!r}")
@@ -0,0 +1,56 @@
1
+ """Interval estimators: Wilson, Clopper-Pearson, and bootstrap (percentile/BCa)."""
2
+ import numpy as np
3
+ from scipy import stats as _scipy_stats
4
+
5
+
6
def wilson_interval(k, n, confidence=0.95):
    """Wilson score interval for a binomial proportion.

    Args:
        k: number of successes; must satisfy ``0 <= k <= n``.
        n: number of trials; must be positive.
        confidence: two-sided confidence level, strictly between 0 and 1.

    Returns:
        ``(low, high)`` as Python floats, clipped to ``[0, 1]``.

    Raises:
        ValueError: if ``n`` is not positive, ``k`` lies outside ``[0, n]``,
            or ``confidence`` is not strictly between 0 and 1.
    """
    if n <= 0:
        raise ValueError("n must be positive")
    # Without these checks an out-of-range k or confidence silently yields
    # NaN or meaningless bounds (negative value under the square root).
    if not 0 <= k <= n:
        raise ValueError("k must satisfy 0 <= k <= n")
    if not 0 < confidence < 1:
        raise ValueError("confidence must be strictly between 0 and 1")
    z = _scipy_stats.norm.ppf(0.5 + confidence / 2)
    phat = k / n
    z2 = z * z
    denom = 1 + z2 / n
    center = (phat + z2 / (2 * n)) / denom
    half = (z * np.sqrt(phat * (1 - phat) / n + z2 / (4 * n * n))) / denom
    return float(max(0.0, center - half)), float(min(1.0, center + half))
17
+
18
+
19
def clopper_pearson_interval(k, n, confidence=0.95):
    """Exact (Clopper-Pearson) interval for a binomial proportion.

    The bounds are quantiles of Beta distributions; the lower bound is fixed
    at 0 when ``k == 0`` and the upper bound at 1 when ``k == n``.

    Args:
        k: number of successes; must satisfy ``0 <= k <= n``.
        n: number of trials; must be positive.
        confidence: two-sided confidence level, strictly between 0 and 1.

    Returns:
        ``(low, high)`` as Python floats.

    Raises:
        ValueError: if ``n`` is not positive, ``k`` lies outside ``[0, n]``,
            or ``confidence`` is not strictly between 0 and 1.
    """
    if n <= 0:
        raise ValueError("n must be positive")
    # Out-of-range k would hand invalid shape parameters to the Beta quantile
    # function and return NaN instead of failing loudly.
    if not 0 <= k <= n:
        raise ValueError("k must satisfy 0 <= k <= n")
    if not 0 < confidence < 1:
        raise ValueError("confidence must be strictly between 0 and 1")
    alpha = 1 - confidence
    lo = 0.0 if k == 0 else _scipy_stats.beta.ppf(alpha / 2, k, n - k + 1)
    hi = 1.0 if k == n else _scipy_stats.beta.ppf(1 - alpha / 2, k + 1, n - k)
    return float(lo), float(hi)
27
+
28
+
29
def bootstrap_interval(
    data,
    statistic=np.mean,
    method="BCa",
    confidence=0.95,
    n_resamples=9999,
    random_state=None,
    vectorized=True,
):
    """Bootstrap CI for an arbitrary statistic via ``scipy.stats.bootstrap``.

    Args:
        data: sample values; converted to a float array.
        statistic: callable evaluated on the sample (and handed to SciPy).
        method: ``"BCa"``, ``"percentile"`` or ``"basic"`` (case-insensitive);
            any other value is forwarded to SciPy unchanged.
        confidence: confidence level passed as ``confidence_level``.
        n_resamples: number of bootstrap resamples.
        random_state: forwarded to SciPy to seed the resampling.
        vectorized: forwarded to SciPy's ``vectorized`` argument.

    Returns:
        ``(low, high)`` as Python floats. For a constant sample both bounds
        equal the statistic of the sample.

    Raises:
        ValueError: if ``data`` is empty.
    """
    data = np.asarray(data, dtype=float)
    if data.size == 0:
        # Previously surfaced as an opaque IndexError from data[0] below.
        raise ValueError("data must be non-empty")
    scipy_method = {"bca": "BCa", "percentile": "percentile", "basic": "basic"}.get(
        method.lower(), method
    )
    if np.all(data == data[0]):
        # degenerate (zero-variance) sample: BCa's acceleration is undefined
        point = float(statistic(data))
        return point, point
    res = _scipy_stats.bootstrap(
        (data,),
        statistic,
        confidence_level=confidence,
        method=scipy_method,
        n_resamples=n_resamples,
        random_state=random_state,
        vectorized=vectorized,
    )
    return float(res.confidence_interval.low), float(res.confidence_interval.high)
@@ -0,0 +1,65 @@
1
+ """Power / sample-size analysis: analytic (normal approximation) and simulation."""
2
+ import numpy as np
3
+ from scipy import stats as _scipy_stats
4
+
5
+
6
def analytic_n(delta, target_power=0.8, baseline=0.5, alpha=0.05):
    """Two-proportion normal-approximation sample size for a given effect size.

    Args:
        delta: accuracy gap to detect (non-zero; sign does not affect the result
            beyond choosing which side of ``baseline`` the second rate lies on).
        target_power: desired power, strictly between 0 and 1.
        baseline: accuracy of the first model, in ``[0, 1]``.
        alpha: two-sided significance level, strictly between 0 and 1.

    Returns:
        The required n, rounded up to an integer.

    Raises:
        ValueError: if ``delta`` is zero, if ``baseline`` or
            ``baseline + delta`` falls outside ``[0, 1]``, or if
            ``target_power`` / ``alpha`` are not strictly between 0 and 1.
    """
    if delta == 0:
        # Previously a bare ZeroDivisionError: no finite n detects a zero gap.
        raise ValueError("delta must be non-zero")
    if not 0 < target_power < 1:
        raise ValueError("target_power must be strictly between 0 and 1")
    if not 0 < alpha < 1:
        raise ValueError("alpha must be strictly between 0 and 1")
    p1, p2 = baseline, baseline + delta
    tol = 1e-12  # absorbs float round-off in baseline + delta (e.g. a sum that lands just above 1)
    if not (0 <= p1 <= 1 and -tol <= p2 <= 1 + tol):
        # An impossible rate makes the variance term negative and n meaningless.
        raise ValueError("baseline and baseline + delta must both lie in [0, 1]")
    p2 = min(max(p2, 0.0), 1.0)
    z_alpha = _scipy_stats.norm.ppf(1 - alpha / 2)
    z_beta = _scipy_stats.norm.ppf(target_power)
    var = p1 * (1 - p1) + p2 * (1 - p2)
    n = ((z_alpha + z_beta) ** 2) * var / (delta**2)
    return int(np.ceil(n))
14
+
15
+
16
def analytic_power(delta, n, baseline=0.5, alpha=0.05):
    """Achieved power of a two-proportion test for given n and effect size.

    Uses the same normal approximation and variance term as ``analytic_n``.

    Args:
        delta: accuracy gap between the two models.
        n: sample size; must be positive.
        baseline: accuracy of the first model, in ``[0, 1]``.
        alpha: two-sided significance level, strictly between 0 and 1.

    Returns:
        Power as a float in ``[0, 1]``. When both rates are exactly 0 or 1
        (zero variance) the result is 1.0 for a non-zero gap and 0.0 otherwise.

    Raises:
        ValueError: if ``n`` is not positive, ``alpha`` is not strictly
            between 0 and 1, or ``baseline`` / ``baseline + delta`` falls
            outside ``[0, 1]``.
    """
    if n <= 0:
        raise ValueError("n must be positive")
    if not 0 < alpha < 1:
        raise ValueError("alpha must be strictly between 0 and 1")
    p1, p2 = baseline, baseline + delta
    tol = 1e-12  # absorbs float round-off in baseline + delta
    if not (0 <= p1 <= 1 and -tol <= p2 <= 1 + tol):
        # An impossible rate makes the variance negative and the power NaN.
        raise ValueError("baseline and baseline + delta must both lie in [0, 1]")
    p2 = min(max(p2, 0.0), 1.0)
    z_alpha = _scipy_stats.norm.ppf(1 - alpha / 2)
    var = p1 * (1 - p1) + p2 * (1 - p2)
    if var == 0:
        # Deterministic outcomes: avoid dividing by zero. A real gap is always
        # seen; identical outcomes can never produce a difference.
        return 1.0 if delta != 0 else 0.0
    z_beta = np.sqrt(n * (delta**2) / var) - z_alpha
    return float(_scipy_stats.norm.cdf(z_beta))
23
+
24
+
25
def simulate_power(delta, n, baseline=0.5, alpha=0.05, rho=0.0, sims=5000, random_state=None):
    """Estimate power by Monte Carlo for a paired design with linked items.

    Each of ``sims`` simulated evaluations has ``n`` items. With probability
    ``rho`` an item is "shared": both models receive the same outcome, drawn
    at the average of the two accuracies. Otherwise the two outcomes are drawn
    independently at ``baseline`` and ``baseline + delta``. ``rho`` is thus a
    simplified correlation knob, not an exact bivariate-Bernoulli correlation.

    A two-sided z-test on the per-item differences is run for every
    simulation; the return value is the fraction rejected at level ``alpha``.
    """
    gen = np.random.default_rng(random_state)
    shape = (sims, n)
    rate_a = baseline
    rate_b = baseline + delta

    # The four draws stay in this order so a fixed seed reproduces results.
    is_shared = gen.random(shape) < rho
    common_outcome = gen.random(shape) < (rate_a + rate_b) / 2
    solo_a = gen.random(shape) < rate_a
    solo_b = gen.random(shape) < rate_b

    outcome_a = np.where(is_shared, common_outcome, solo_a).astype(float)
    outcome_b = np.where(is_shared, common_outcome, solo_b).astype(float)
    gap = outcome_a - outcome_b

    spread = gap.std(axis=1, ddof=1)
    # Constant differences give a zero SD; substitute a tiny value so the
    # division below stays finite.
    spread[spread == 0] = 1e-12
    z_scores = gap.mean(axis=1) / (spread / np.sqrt(n))
    p_values = 2 * (1 - _scipy_stats.norm.cdf(np.abs(z_scores)))
    rejected = p_values < alpha
    return float(np.mean(rejected))
47
+
48
+
49
def simulate_n(delta, target_power=0.8, baseline=0.5, alpha=0.05, rho=0.0, sims=3000, random_state=None):
    """Search for the n at which ``simulate_power`` reaches ``target_power``.

    The upper bound starts at 64 and doubles until the simulated power meets
    the target; if doubling pushes it past 1,000,000 that cap is returned.
    The bracket (initial lower bound 4) is then bisected and its upper end is
    returned.
    """

    def power_at(size):
        # Same seed for every probe, as in each call to simulate_power.
        return simulate_power(delta, size, baseline, alpha, rho, sims, random_state)

    ceiling = 1_000_000
    lower, upper = 4, 64

    # Phase 1: grow the bracket until its upper end is powerful enough.
    while power_at(upper) < target_power:
        lower, upper = upper, upper * 2
        if upper > ceiling:
            return ceiling

    # Phase 2: bisect down to adjacent integers.
    while upper - lower > 1:
        middle = (lower + upper) // 2
        if power_at(middle) < target_power:
            lower = middle
        else:
            upper = middle
    return upper
@@ -0,0 +1,111 @@
1
+ """Significance-test internals: paired/unpaired permutation, bootstrap, McNemar."""
2
+ import itertools
3
+
4
+ import numpy as np
5
+ from scipy import stats as _scipy_stats
6
+
7
+ EXACT_ENUMERATION_THRESHOLD = 12
8
+
9
+
10
def paired_bootstrap_ci(diffs, confidence=0.95, n_resamples=9999, rng=None):
    """Percentile bootstrap CI for the mean of paired differences.

    Args:
        diffs: per-item differences; any array-like, converted to a float array.
        confidence: two-sided confidence level.
        n_resamples: number of bootstrap resamples.
        rng: ``None``, an integer seed, or a ``numpy.random.Generator``
            (a Generator is used as-is).

    Returns:
        ``(low, high)`` percentile bounds as Python floats.

    Raises:
        ValueError: if ``diffs`` is empty.
    """
    # Coerce so plain lists work with the fancy indexing below.
    diffs = np.asarray(diffs, dtype=float)
    if diffs.size == 0:
        raise ValueError("diffs must be non-empty")
    rng = np.random.default_rng(rng)
    n = len(diffs)
    # Each row of idx is one resample (with replacement) of the n items.
    idx = rng.integers(0, n, size=(n_resamples, n))
    resampled_means = diffs[idx].mean(axis=1)
    alpha = 1 - confidence
    lo, hi = np.percentile(resampled_means, [100 * alpha / 2, 100 * (1 - alpha / 2)])
    return float(lo), float(hi)
18
+
19
+
20
def unpaired_bootstrap_ci(a, b, confidence=0.95, n_resamples=9999, rng=None):
    """Percentile bootstrap CI for ``mean(a) - mean(b)`` with independent samples.

    Each sample is resampled with replacement separately, at its own size.

    Args:
        a, b: the two samples; any array-likes, converted to float arrays.
        confidence: two-sided confidence level.
        n_resamples: number of bootstrap resamples.
        rng: ``None``, an integer seed, or a ``numpy.random.Generator``
            (a Generator is used as-is).

    Returns:
        ``(low, high)`` percentile bounds as Python floats.

    Raises:
        ValueError: if either sample is empty.
    """
    # Coerce so plain lists work with the fancy indexing below.
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    if a.size == 0 or b.size == 0:
        raise ValueError("a and b must both be non-empty")
    rng = np.random.default_rng(rng)
    na, nb = len(a), len(b)
    idx_a = rng.integers(0, na, size=(n_resamples, na))
    idx_b = rng.integers(0, nb, size=(n_resamples, nb))
    resampled_deltas = a[idx_a].mean(axis=1) - b[idx_b].mean(axis=1)
    alpha = 1 - confidence
    lo, hi = np.percentile(resampled_deltas, [100 * alpha / 2, 100 * (1 - alpha / 2)])
    return float(lo), float(hi)
29
+
30
+
31
def paired_permutation_p(diffs, n_resamples=9999, rng=None, exact_threshold=EXACT_ENUMERATION_THRESHOLD):
    """Two-sided sign-flip permutation test on paired differences.

    For ``n <= exact_threshold`` every one of the ``2**n`` sign patterns is
    enumerated and the p-value is the exact fraction whose absolute mean is at
    least the observed one. Otherwise ``n_resamples`` random sign patterns are
    drawn and the p-value is ``(count + 1) / (n_resamples + 1)``.

    Args:
        diffs: per-item differences; any array-like, converted to a float array.
        n_resamples: number of random sign patterns in the Monte Carlo branch.
        rng: ``None``, an integer seed, or a ``numpy.random.Generator``
            (a Generator is used as-is).
        exact_threshold: largest n handled by exact enumeration.

    Returns:
        The p-value as a Python float.

    Raises:
        ValueError: if ``diffs`` is empty.
    """
    diffs = np.asarray(diffs, dtype=float)
    n = len(diffs)
    if n == 0:
        # With no data the NaN comparison counts nothing and the exact branch
        # would report p = 0.0; refuse instead of returning that.
        raise ValueError("diffs must be non-empty")
    obs = diffs.mean()
    # The 1e-9 slack keeps float round-off from excluding patterns that tie
    # with the observed statistic.
    threshold = abs(obs) - 1e-9
    if n <= exact_threshold:
        signs = np.array(list(itertools.product([-1, 1], repeat=n)), dtype=float)
        resampled = (signs * diffs).mean(axis=1)
        count = np.sum(np.abs(resampled) >= threshold)
        return float(count / len(signs))
    rng = np.random.default_rng(rng)
    signs = rng.choice([-1.0, 1.0], size=(n_resamples, n))
    resampled = (signs * diffs).mean(axis=1)
    count = np.sum(np.abs(resampled) >= threshold)
    return float((count + 1) / (n_resamples + 1))
45
+
46
+
47
def unpaired_permutation_p(a, b, n_resamples=9999, rng=None):
    """Two-sided label-shuffle permutation test for two independent samples.

    The pooled values are shuffled ``n_resamples`` times; each shuffle is
    split back into groups of the original sizes and the difference of means
    is compared with the observed one. The p-value is
    ``(count + 1) / (n_resamples + 1)``.

    Args:
        a, b: the two samples; any array-likes, converted to float arrays.
        n_resamples: number of random shuffles.
        rng: ``None``, an integer seed, or a ``numpy.random.Generator``
            (a Generator is used as-is).

    Returns:
        The p-value as a Python float.

    Raises:
        ValueError: if either sample is empty.
    """
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    if a.size == 0 or b.size == 0:
        # An empty group has a NaN mean, which would yield a spuriously tiny p.
        raise ValueError("a and b must both be non-empty")
    rng = np.random.default_rng(rng)
    na = len(a)
    combined = np.concatenate([a, b])
    obs = a.mean() - b.mean()
    # broadcast_to gives a read-only view; copy so every row is its own
    # writable array before shuffling each row independently.
    tiled = np.broadcast_to(combined, (n_resamples, len(combined))).copy()
    perms = rng.permuted(tiled, axis=1)
    stat = perms[:, :na].mean(axis=1) - perms[:, na:].mean(axis=1)
    count = np.sum(np.abs(stat) >= abs(obs) - 1e-9)
    return float((count + 1) / (n_resamples + 1))
58
+
59
+
60
def paired_bootstrap_p(diffs, n_resamples=9999, rng=None):
    """Two-sided null-shifted bootstrap test on paired differences.

    The differences are centred on zero (the null of no mean difference) and
    resampled with replacement; the p-value is
    ``(count + 1) / (n_resamples + 1)``, where ``count`` is the number of
    resampled means at least as extreme as the observed mean.

    Args:
        diffs: per-item differences; any array-like, converted to a float array.
        n_resamples: number of bootstrap resamples.
        rng: ``None``, an integer seed, or a ``numpy.random.Generator``
            (a Generator is used as-is).

    Returns:
        The p-value as a Python float.

    Raises:
        ValueError: if ``diffs`` is empty.
    """
    # Coerce so plain lists work with the fancy indexing below.
    diffs = np.asarray(diffs, dtype=float)
    if diffs.size == 0:
        raise ValueError("diffs must be non-empty")
    rng = np.random.default_rng(rng)
    n = len(diffs)
    obs = diffs.mean()
    shifted = diffs - obs  # impose the null: mean difference of exactly zero
    idx = rng.integers(0, n, size=(n_resamples, n))
    resampled = shifted[idx].mean(axis=1)
    count = np.sum(np.abs(resampled) >= abs(obs) - 1e-9)
    return float((count + 1) / (n_resamples + 1))
70
+
71
+
72
def unpaired_bootstrap_p(a, b, n_resamples=9999, rng=None):
    """Two-sided null-shifted bootstrap test for two independent samples.

    Both samples are re-centred on the pooled mean (so they share a mean under
    the null) and resampled with replacement separately. The p-value is
    ``(count + 1) / (n_resamples + 1)``, where ``count`` is the number of
    resampled mean differences at least as extreme as the observed one.

    Args:
        a, b: the two samples; any array-likes, converted to float arrays.
        n_resamples: number of bootstrap resamples.
        rng: ``None``, an integer seed, or a ``numpy.random.Generator``
            (a Generator is used as-is).

    Returns:
        The p-value as a Python float.

    Raises:
        ValueError: if either sample is empty.
    """
    # Coerce so plain lists work with .mean() and the fancy indexing below.
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    if a.size == 0 or b.size == 0:
        raise ValueError("a and b must both be non-empty")
    rng = np.random.default_rng(rng)
    obs = a.mean() - b.mean()
    combined_mean = np.concatenate([a, b]).mean()
    a_shift = a - a.mean() + combined_mean
    b_shift = b - b.mean() + combined_mean
    na, nb = len(a), len(b)
    idx_a = rng.integers(0, na, size=(n_resamples, na))
    idx_b = rng.integers(0, nb, size=(n_resamples, nb))
    resampled_deltas = a_shift[idx_a].mean(axis=1) - b_shift[idx_b].mean(axis=1)
    count = np.sum(np.abs(resampled_deltas) >= abs(obs) - 1e-9)
    return float((count + 1) / (n_resamples + 1))
85
+
86
+
87
def mcnemar_test(a, b, correction=True, exact=None):
    """McNemar's test on paired binary (0/1 correctness) outcomes.

    Only discordant pairs matter: ``n01`` counts items where only ``b`` is
    correct and ``n10`` items where only ``a`` is correct.

    Args:
        a, b: paired 0/1 outcomes for the two models (array-likes).
        correction: apply the continuity correction in the chi-square branch.
        exact: use the exact two-sided binomial test; ``None`` selects it
            automatically when there are fewer than 25 discordant pairs.

    Returns:
        ``(p_value, n01, n10, statistic)``. ``statistic`` is ``None`` when the
        exact binomial test is used, and ``0.0`` (with ``p_value`` 1.0) when
        there are no discordant pairs.
    """
    a = np.asarray(a)
    b = np.asarray(b)
    n01 = int(np.sum((a == 0) & (b == 1)))
    n10 = int(np.sum((a == 1) & (b == 0)))
    n = n01 + n10
    if exact is None:
        exact = n < 25
    if n == 0:
        return 1.0, n01, n10, 0.0
    if exact:
        k = min(n01, n10)
        p = _scipy_stats.binomtest(k, n, p=0.5, alternative="two-sided").pvalue
        return float(p), n01, n10, None
    if correction:
        statistic = (abs(n01 - n10) - 1) ** 2 / n
    else:
        statistic = (n01 - n10) ** 2 / n
    # Use the survival function: 1 - cdf rounds to exactly 0.0 once the
    # p-value drops below double-precision resolution.
    p = float(_scipy_stats.chi2.sf(statistic, df=1))
    return p, n01, n10, float(statistic)
@@ -0,0 +1,7 @@
1
+ """Adapters that load external eval outputs into evalci's per-item DataFrame schema."""
2
+ from . import csv, helm, lm_eval_harness
3
+ from .csv import load as load_csv
4
+ from .helm import load as load_helm
5
+ from .lm_eval_harness import load as load_lm_eval_harness
6
+
7
+ __all__ = ["load_lm_eval_harness", "load_helm", "load_csv", "lm_eval_harness", "helm", "csv"]