evalci 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
evalci/__init__.py ADDED
@@ -0,0 +1,25 @@
1
+ from .stats import (
2
+ CIResult,
3
+ CompareResult,
4
+ PowerResult,
5
+ ci,
6
+ cluster_ci,
7
+ compare,
8
+ multi_compare,
9
+ power,
10
+ )
11
+ from .report import report
12
+ from . import adapters
13
+
14
+ __all__ = [
15
+ "ci",
16
+ "compare",
17
+ "power",
18
+ "multi_compare",
19
+ "cluster_ci",
20
+ "report",
21
+ "CIResult",
22
+ "CompareResult",
23
+ "PowerResult",
24
+ "adapters",
25
+ ]
evalci/_correction.py ADDED
@@ -0,0 +1,41 @@
1
+ """Multiple-comparison correction: Holm and Benjamini-Hochberg."""
2
+ import numpy as np
3
+
4
+
5
def holm(pvalues):
    """Return Holm (1979) step-down adjusted p-values, in the input order.

    The i-th smallest p-value (0-based) is multiplied by ``n - i``; a running
    maximum keeps the adjusted values monotone and each one is capped at 1.
    """
    pvals = np.asarray(pvalues, dtype=float)
    total = len(pvals)
    adjusted = np.empty(total)
    ceiling = 0.0
    # Walk from the smallest p-value upwards, pairing it with n, n-1, ..., 1.
    for multiplier, position in zip(range(total, 0, -1), np.argsort(pvals)):
        ceiling = max(ceiling, multiplier * pvals[position])
        adjusted[position] = min(ceiling, 1.0)
    return adjusted
16
+
17
+
18
def benjamini_hochberg(pvalues):
    """Return Benjamini-Hochberg (1995) FDR-adjusted q-values, in input order.

    P-values are visited from largest to smallest; each is scaled by
    ``n / rank`` and a running minimum enforces monotonicity.
    """
    pvals = np.asarray(pvalues, dtype=float)
    total = len(pvals)
    qvals = np.empty(total)
    floor = 1.0
    descending = np.argsort(pvals)[::-1]
    # The largest p-value has rank n, the next one n-1, and so on.
    for rank, position in zip(range(total, 0, -1), descending):
        floor = min(floor, pvals[position] * total / rank)
        qvals[position] = floor
    return np.clip(qvals, 0, 1)
30
+
31
+
32
def correct_pvalues(pvalues, method="holm"):
    """Dispatch to a multiple-comparison correction by (case-insensitive) name.

    Accepted names: "holm"/"bonferroni-holm", "bh"/"benjamini-hochberg"/"fdr_bh",
    and "bonferroni". Raises ValueError for any other name.
    """
    name = method.lower()
    if name == "bonferroni":
        arr = np.asarray(pvalues, dtype=float)
        return np.clip(arr * len(arr), 0, 1)
    if name in {"holm", "bonferroni-holm"}:
        return holm(pvalues)
    if name in {"bh", "benjamini-hochberg", "fdr_bh"}:
        return benjamini_hochberg(pvalues)
    raise ValueError(f"unknown correction method: {name!r}")
evalci/_intervals.py ADDED
@@ -0,0 +1,56 @@
1
+ """Interval estimators: Wilson, Clopper-Pearson, and bootstrap (percentile/BCa)."""
2
+ import numpy as np
3
+ from scipy import stats as _scipy_stats
4
+
5
+
6
def wilson_interval(k, n, confidence=0.95):
    """Wilson score interval for ``k`` successes out of ``n`` binomial trials.

    Returns ``(lower, upper)`` clipped to [0, 1]. Raises ValueError when ``n``
    is not positive.
    """
    if n <= 0:
        raise ValueError("n must be positive")
    crit = _scipy_stats.norm.ppf(0.5 + confidence / 2)
    crit_sq = crit * crit
    proportion = k / n
    scale = 1 + crit_sq / n
    midpoint = (proportion + crit_sq / (2 * n)) / scale
    spread = proportion * (1 - proportion) / n + crit_sq / (4 * n * n)
    margin = (crit * np.sqrt(spread)) / scale
    lower = max(0.0, midpoint - margin)
    upper = min(1.0, midpoint + margin)
    return lower, upper
17
+
18
+
19
def clopper_pearson_interval(k, n, confidence=0.95):
    """Exact (Clopper-Pearson) binomial interval from Beta quantiles.

    The lower bound is fixed at 0 when ``k == 0`` and the upper bound at 1 when
    ``k == n``. Raises ValueError when ``n`` is not positive.
    """
    if n <= 0:
        raise ValueError("n must be positive")
    tail = (1 - confidence) / 2
    if k == 0:
        lower = 0.0
    else:
        lower = _scipy_stats.beta.ppf(tail, k, n - k + 1)
    if k == n:
        upper = 1.0
    else:
        upper = _scipy_stats.beta.ppf(1 - tail, k + 1, n - k)
    return float(lower), float(upper)
27
+
28
+
29
def bootstrap_interval(
    data,
    statistic=np.mean,
    method="BCa",
    confidence=0.95,
    n_resamples=9999,
    random_state=None,
    vectorized=True,
):
    """Bootstrap CI for an arbitrary statistic via scipy.stats.bootstrap.

    Parameters:
        data: sample values; converted to a float array.
        statistic: callable forwarded to scipy (default ``np.mean``).
        method: "bca", "percentile" or "basic" (case-insensitive); any other
            value is passed through to scipy unchanged.
        confidence: confidence level forwarded as ``confidence_level``.
        n_resamples, random_state, vectorized: forwarded to scipy.

    Returns:
        ``(low, high)`` as Python floats.

    Raises:
        ValueError: if ``data`` is empty.
    """
    data = np.asarray(data, dtype=float)
    if data.size == 0:
        # Previously this fell through to data[0] and raised an opaque IndexError.
        raise ValueError("data must be non-empty")
    scipy_method = {"bca": "BCa", "percentile": "percentile", "basic": "basic"}.get(
        method.lower(), method
    )
    if np.all(data == data[0]):
        # degenerate (zero-variance) sample: BCa's acceleration is undefined
        point = float(statistic(data))
        return point, point
    res = _scipy_stats.bootstrap(
        (data,),
        statistic,
        confidence_level=confidence,
        method=scipy_method,
        n_resamples=n_resamples,
        random_state=random_state,
        vectorized=vectorized,
    )
    return float(res.confidence_interval.low), float(res.confidence_interval.high)
evalci/_power.py ADDED
@@ -0,0 +1,65 @@
1
+ """Power / sample-size analysis: analytic (normal approximation) and simulation."""
2
+ import numpy as np
3
+ from scipy import stats as _scipy_stats
4
+
5
+
6
def analytic_n(delta, target_power=0.8, baseline=0.5, alpha=0.05):
    """Two-proportion normal-approximation sample size for a given effect size.

    Parameters:
        delta: difference between the two success rates (must be non-zero).
        target_power: desired power of the two-sided test.
        baseline: success rate of the first model; the second is
            ``baseline + delta``.
        alpha: two-sided significance level.

    Returns:
        The sample size as an int (rounded up).

    Raises:
        ValueError: if ``delta`` is zero or either rate falls outside [0, 1].
    """
    if delta == 0:
        # Otherwise the division below raises ZeroDivisionError.
        raise ValueError("delta must be non-zero")
    p1, p2 = baseline, baseline + delta
    tol = 1e-12  # tolerate float round-off such as 0.7 + 0.3
    for rate in (p1, p2):
        if not (-tol <= rate <= 1 + tol):
            # An out-of-range rate makes p * (1 - p) negative and n meaningless.
            raise ValueError(
                f"baseline and baseline + delta must lie in [0, 1], got {p1!r} and {p2!r}"
            )
    z_alpha = _scipy_stats.norm.ppf(1 - alpha / 2)
    z_beta = _scipy_stats.norm.ppf(target_power)
    var = p1 * (1 - p1) + p2 * (1 - p2)
    n = ((z_alpha + z_beta) ** 2) * var / (delta**2)
    return int(np.ceil(n))
14
+
15
+
16
def analytic_power(delta, n, baseline=0.5, alpha=0.05):
    """Achieved power of a two-proportion normal-approximation test.

    Uses rates ``baseline`` and ``baseline + delta`` with ``n`` items and a
    two-sided critical value at level ``alpha``; returns a Python float.
    """
    rate_a = baseline
    rate_b = baseline + delta
    critical = _scipy_stats.norm.ppf(1 - alpha / 2)
    pooled = rate_a * (1 - rate_a) + rate_b * (1 - rate_b)
    shift = np.sqrt(n * (delta**2) / pooled)
    return float(_scipy_stats.norm.cdf(shift - critical))
23
+
24
+
25
def simulate_power(delta, n, baseline=0.5, alpha=0.05, rho=0.0, sims=5000, random_state=None):
    """Monte Carlo power for a paired design with correlated item-level outcomes.

    `rho` is the fraction of items whose correctness is shared between the two
    models (a common-difficulty link) rather than drawn independently; it is a
    simplified correlation knob, not an exact bivariate-Bernoulli correlation.

    Each of the ``sims`` simulated datasets is tested with a two-sided z-test on
    the per-item differences; the return value is the fraction with p < alpha.
    """
    rng = np.random.default_rng(random_state)
    shape = (sims, n)
    rate_a = baseline
    rate_b = baseline + delta

    def bernoulli(rate):
        return rng.random(shape) < rate

    # Draw order is fixed so results for a given seed stay reproducible:
    # link mask, shared outcome, then each model's independent outcome.
    linked = bernoulli(rho)
    common = bernoulli((rate_a + rate_b) / 2)
    own_a = bernoulli(rate_a)
    own_b = bernoulli(rate_b)

    outcome_a = np.where(linked, common, own_a).astype(float)
    outcome_b = np.where(linked, common, own_b).astype(float)
    gap = outcome_a - outcome_b
    spread = gap.std(axis=1, ddof=1)
    # Zero spread would divide by zero; substitute a tiny positive value.
    spread = np.where(spread == 0, 1e-12, spread)
    z_scores = gap.mean(axis=1) / (spread / np.sqrt(n))
    p_values = 2 * (1 - _scipy_stats.norm.cdf(np.abs(z_scores)))
    return float(np.mean(p_values < alpha))
47
+
48
+
49
def simulate_n(delta, target_power=0.8, baseline=0.5, alpha=0.05, rho=0.0, sims=3000, random_state=None):
    """Search for the n at which simulate_power first reaches target_power.

    The upper bracket starts at 64 and doubles until the target is met (giving
    up and returning 1,000,000 once it exceeds that cap); the bracket is then
    bisected down to adjacent integers and its upper end returned.
    """
    ceiling = 1_000_000

    def falls_short(size):
        return simulate_power(delta, size, baseline, alpha, rho, sims, random_state) < target_power

    lower, upper = 4, 64
    while falls_short(upper):
        lower, upper = upper, upper * 2
        if upper > ceiling:
            return ceiling
    while upper - lower > 1:
        middle = (lower + upper) // 2
        if falls_short(middle):
            lower = middle
        else:
            upper = middle
    return upper
@@ -0,0 +1,111 @@
1
+ """Significance-test internals: paired/unpaired permutation, bootstrap, McNemar."""
2
+ import itertools
3
+
4
+ import numpy as np
5
+ from scipy import stats as _scipy_stats
6
+
7
+ EXACT_ENUMERATION_THRESHOLD = 12
8
+
9
+
10
def paired_bootstrap_ci(diffs, confidence=0.95, n_resamples=9999, rng=None):
    """Percentile bootstrap CI for the mean of paired differences.

    Parameters:
        diffs: per-item differences; any array-like is accepted.
        confidence: confidence level of the interval.
        n_resamples: number of bootstrap resamples.
        rng: a numpy Generator; a fresh default one is created when None.

    Returns:
        ``(low, high)`` as Python floats.
    """
    # Coerce first: fancy indexing below fails with TypeError on a plain list.
    diffs = np.asarray(diffs, dtype=float)
    if rng is None:
        rng = np.random.default_rng()
    n = len(diffs)
    idx = rng.integers(0, n, size=(n_resamples, n))
    resampled_means = diffs[idx].mean(axis=1)
    alpha = 1 - confidence
    lo, hi = np.percentile(resampled_means, [100 * alpha / 2, 100 * (1 - alpha / 2)])
    return float(lo), float(hi)
18
+
19
+
20
def unpaired_bootstrap_ci(a, b, confidence=0.95, n_resamples=9999, rng=None):
    """Percentile bootstrap CI for ``mean(a) - mean(b)`` with independent samples.

    Each sample is resampled with replacement at its own size; returns
    ``(low, high)`` as Python floats.
    """
    generator = rng or np.random.default_rng()
    size_a = len(a)
    size_b = len(b)
    # Indices for a are drawn before those for b.
    picks_a = generator.integers(0, size_a, size=(n_resamples, size_a))
    picks_b = generator.integers(0, size_b, size=(n_resamples, size_b))
    gaps = a[picks_a].mean(axis=1) - b[picks_b].mean(axis=1)
    tail = 1 - confidence
    bounds = np.percentile(gaps, [100 * tail / 2, 100 * (1 - tail / 2)])
    return float(bounds[0]), float(bounds[1])
29
+
30
+
31
def paired_permutation_p(diffs, n_resamples=9999, rng=None, exact_threshold=EXACT_ENUMERATION_THRESHOLD):
    """Two-sided sign-flip permutation p-value for the mean paired difference.

    With at most ``exact_threshold`` items every sign pattern is enumerated and
    the exact proportion returned; otherwise ``n_resamples`` random patterns are
    drawn and the (count + 1) / (n_resamples + 1) estimate is returned.
    """
    generator = rng or np.random.default_rng()
    size = len(diffs)
    # Small slack so float round-off does not exclude ties with the observed value.
    threshold = abs(diffs.mean()) - 1e-9
    enumerate_all = size <= exact_threshold
    if enumerate_all:
        flips = np.array(list(itertools.product([-1, 1], repeat=size)), dtype=float)
    else:
        flips = generator.choice([-1.0, 1.0], size=(n_resamples, size))
    flipped_means = (flips * diffs).mean(axis=1)
    extreme = np.sum(np.abs(flipped_means) >= threshold)
    if enumerate_all:
        return float(extreme / len(flips))
    return float((extreme + 1) / (n_resamples + 1))
45
+
46
+
47
def unpaired_permutation_p(a, b, n_resamples=9999, rng=None):
    """Two-sided label-shuffle permutation p-value for ``mean(a) - mean(b)``.

    The pooled sample is independently shuffled ``n_resamples`` times and split
    at ``len(a)``; returns (count + 1) / (n_resamples + 1).
    """
    generator = rng or np.random.default_rng()
    split = len(a)
    pooled = np.concatenate([a, b])
    threshold = abs(a.mean() - b.mean()) - 1e-9
    # One row per resample; each row is permuted independently.
    stacked = np.tile(pooled, (n_resamples, 1))
    shuffled = generator.permuted(stacked, axis=1)
    gaps = shuffled[:, :split].mean(axis=1) - shuffled[:, split:].mean(axis=1)
    extreme = np.sum(np.abs(gaps) >= threshold)
    return float((extreme + 1) / (n_resamples + 1))
58
+
59
+
60
def paired_bootstrap_p(diffs, n_resamples=9999, rng=None):
    """Two-sided null-shifted bootstrap p-value for the mean paired difference.

    The differences are centred on zero, resampled with replacement, and the
    resampled means compared with the observed mean; returns
    (count + 1) / (n_resamples + 1).
    """
    generator = rng or np.random.default_rng()
    size = len(diffs)
    observed = diffs.mean()
    centred = diffs - observed
    picks = generator.integers(0, size, size=(n_resamples, size))
    null_means = centred[picks].mean(axis=1)
    extreme = np.sum(np.abs(null_means) >= abs(observed) - 1e-9)
    return float((extreme + 1) / (n_resamples + 1))
70
+
71
+
72
def unpaired_bootstrap_p(a, b, n_resamples=9999, rng=None):
    """Two-sided null-shifted bootstrap p-value for ``mean(a) - mean(b)``.

    Both samples are re-centred on the pooled mean, resampled independently at
    their own sizes, and the resampled mean differences compared with the
    observed one; returns (count + 1) / (n_resamples + 1).
    """
    generator = rng or np.random.default_rng()
    observed = a.mean() - b.mean()
    pooled_mean = np.concatenate([a, b]).mean()
    null_a = a - a.mean() + pooled_mean
    null_b = b - b.mean() + pooled_mean
    size_a = len(a)
    size_b = len(b)
    # Indices for a are drawn before those for b.
    picks_a = generator.integers(0, size_a, size=(n_resamples, size_a))
    picks_b = generator.integers(0, size_b, size=(n_resamples, size_b))
    gaps = null_a[picks_a].mean(axis=1) - null_b[picks_b].mean(axis=1)
    extreme = np.sum(np.abs(gaps) >= abs(observed) - 1e-9)
    return float((extreme + 1) / (n_resamples + 1))
85
+
86
+
87
def mcnemar_test(a, b, correction=True, exact=None):
    """McNemar's test on paired binary (0/1 correctness) outcomes.

    Parameters:
        a, b: equal-length 0/1 outcome sequences for the two models.
        correction: apply the continuity correction in the chi-square branch.
        exact: force the exact binomial test (True) or the chi-square
            approximation (False); by default the exact test is used when there
            are fewer than 25 discordant pairs.

    Returns:
        ``(p_value, n01, n10, statistic)`` where n01 = b-correct-only,
        n10 = a-correct-only, and statistic is None when the exact binomial
        test is used (0.0 when there are no discordant pairs).
    """
    a = np.asarray(a)
    b = np.asarray(b)
    n01 = int(np.sum((a == 0) & (b == 1)))
    n10 = int(np.sum((a == 1) & (b == 0)))
    n = n01 + n10
    if exact is None:
        exact = n < 25
    if n == 0:
        # No discordant pairs: nothing distinguishes the two models.
        return 1.0, n01, n10, 0.0
    if exact:
        k = min(n01, n10)
        p = _scipy_stats.binomtest(k, n, p=0.5, alternative="two-sided").pvalue
        return float(p), n01, n10, None
    if correction:
        statistic = (abs(n01 - n10) - 1) ** 2 / n
    else:
        statistic = (n01 - n10) ** 2 / n
    # Survival function instead of 1 - cdf: the subtraction underflows to
    # exactly 0.0 for large statistics, losing the tail probability.
    p = float(_scipy_stats.chi2.sf(statistic, df=1))
    return p, n01, n10, float(statistic)
@@ -0,0 +1,7 @@
1
+ """Adapters that load external eval outputs into evalci's per-item DataFrame schema."""
2
+ from . import csv, helm, lm_eval_harness
3
+ from .csv import load as load_csv
4
+ from .helm import load as load_helm
5
+ from .lm_eval_harness import load as load_lm_eval_harness
6
+
7
+ __all__ = ["load_lm_eval_harness", "load_helm", "load_csv", "lm_eval_harness", "helm", "csv"]
evalci/adapters/csv.py ADDED
@@ -0,0 +1,21 @@
1
+ """Load a plain per-item CSV (columns: item_id, score[, subset, sample_idx]) into the evalci schema."""
2
+ from pathlib import Path
3
+
4
+ import pandas as pd
5
+
6
+ from ..schema import from_records
7
+
8
+
9
def load(path, model=None):
    """Read a per-item CSV and convert it to the evalci per-item DataFrame.

    The file must contain ``item_id`` and ``score`` columns; ``subset`` and
    ``sample_idx`` are forwarded when present. ``model`` defaults to the file
    stem. Raises ValueError when a required column is missing.
    """
    csv_path = Path(path)
    label = model or csv_path.stem
    frame = pd.read_csv(csv_path)
    columns = frame.columns
    if not ("item_id" in columns and "score" in columns):
        raise ValueError(f"{csv_path}: CSV must have 'item_id' and 'score' columns, got {list(frame.columns)}")
    optional = {
        name: (frame[name] if name in columns else None)
        for name in ("subset", "sample_idx")
    }
    return from_records(
        frame["item_id"],
        label,
        frame["score"],
        subsets=optional["subset"],
        sample_idxs=optional["sample_idx"],
    )
@@ -0,0 +1,59 @@
1
+ """Load per-item results from a HELM `per_instance_stats.json` file into the evalci schema.
2
+
3
+ Expects the standard HELM per-instance-stats structure: a JSON list of records
4
+ each with an "instance_id" and a "stats" list of {"name": {"name": <metric>},
5
+ "mean": <value>, ...} entries. If your HELM run uses a different metric name,
6
+ pass `metric_key` explicitly.
7
+ """
8
+ import json
9
+ from pathlib import Path
10
+
11
+ from ..schema import from_records
12
+
13
+ DEFAULT_METRIC_KEYS = ("exact_match", "quasi_exact_match", "f1_score")
14
+
15
+
16
def _stat_value(stat):
    """Extract a scalar from one stat record.

    Prefers a non-null "mean"; otherwise falls back to "sum" when "count" is 1.
    Raises ValueError if neither is available.
    """
    mean = stat.get("mean")
    if mean is not None:
        return float(mean)
    single_observation = stat.get("count") == 1
    if single_observation and "sum" in stat:
        return float(stat["sum"])
    raise ValueError(f"could not extract a scalar value from stat record: {stat}")
22
+
23
+
24
def _extract_score(record, metric_key):
    """Pull a per-instance score out of a record's "stats" list.

    When ``metric_key`` is given only that metric name is accepted; otherwise
    the names in DEFAULT_METRIC_KEYS are tried in order. The first stat whose
    nested name matches is converted with _stat_value. Raises ValueError when
    no stat matches.
    """
    stats = record.get("stats", [])
    names = [entry.get("name", {}).get("name") for entry in stats]
    explicit = metric_key is not None
    candidates = (metric_key,) if explicit else DEFAULT_METRIC_KEYS
    for wanted in candidates:
        for entry, label in zip(stats, names):
            if label == wanted:
                return _stat_value(entry)
    if explicit:
        raise ValueError(f"metric_key={metric_key!r} not found; metrics present: {names}")
    raise ValueError(
        f"could not find a metric in stats (tried {DEFAULT_METRIC_KEYS}); "
        f"pass metric_key explicitly. metrics present: {names}"
    )
40
+
41
+
42
def load(path, model=None, subset=None, metric_key=None):
    """Parse a HELM per_instance_stats.json file into a per-item DataFrame.

    Parameters:
        path: location of the JSON file.
        model: label stored in the "model" column; defaults to the file stem.
        subset: value stored in the "subset" column for every record.
        metric_key: metric name forwarded to _extract_score.

    Raises:
        ValueError: if the JSON root is not a list or the list is empty.
    """
    path = Path(path)
    model = model or path.stem
    # JSON is UTF-8 by specification; the locale default can mis-decode it.
    records = json.loads(path.read_text(encoding="utf-8"))
    if not isinstance(records, list):
        raise ValueError(f"{path}: expected a JSON list of per-instance stat records")
    if not records:
        raise ValueError(f"{path}: no records found")

    item_ids, scores = [], []
    for r in records:
        # Fall back to a nested instance.id when there is no top-level instance_id.
        item_id = r.get("instance_id", r.get("instance", {}).get("id"))
        item_ids.append(item_id)
        scores.append(_extract_score(r, metric_key))

    subsets = [subset] * len(item_ids)
    return from_records(item_ids, model, scores, subsets=subsets)
@@ -0,0 +1,67 @@
1
+ """Load per-item results from lm-evaluation-harness output into the evalci schema.
2
+
3
+ Supports both the `--log_samples` JSONL format (`samples_<task>_*.jsonl`, one
4
+ JSON record per line) and the older single-JSON format with a top-level
5
+ "samples" field. If your harness version uses different metric key names, pass
6
+ `metric_key` explicitly.
7
+ """
8
+ import json
9
+ from pathlib import Path
10
+
11
+ from ..schema import from_records
12
+
13
+ DEFAULT_METRIC_KEYS = ("acc", "exact_match", "acc_norm", "score")
14
+
15
+
16
def _extract_score(record, metric_key):
    """Return the float score stored in one sample record.

    With an explicit ``metric_key`` that key must be present; otherwise the
    first key from DEFAULT_METRIC_KEYS found in the record is used. Raises
    ValueError when no usable key exists.
    """
    if metric_key is None:
        for key in DEFAULT_METRIC_KEYS:
            if key in record:
                return float(record[key])
        raise ValueError(
            f"could not find a metric in sample record (tried {DEFAULT_METRIC_KEYS}); "
            f"pass metric_key explicitly. keys present: {list(record.keys())}"
        )
    if metric_key in record:
        return float(record[metric_key])
    raise ValueError(f"metric_key={metric_key!r} not found in sample record: {list(record.keys())}")
28
+
29
+
30
def load(path, model=None, task=None, metric_key=None):
    """Parse an lm-evaluation-harness results file into a per-item DataFrame.

    Parameters:
        path: a ``.jsonl`` samples file (one JSON record per line) or a JSON
            file with a top-level "samples" field (dict of task -> records, or
            a flat list of records).
        model: label stored in the "model" column; defaults to the file stem.
        task: task name; for the dict form it also filters which tasks are kept.
        metric_key: metric name forwarded to _extract_score.

    Raises:
        ValueError: if the file has no usable "samples" content or no records
            survive the task filter.
    """
    import re  # function-scope import: only the filename heuristic needs it

    path = Path(path)
    model = model or path.stem
    records = []

    if path.suffix == ".jsonl":
        if task:
            task_name = task
        else:
            stem = path.stem
            if stem.startswith("samples_"):
                stem = stem[len("samples_"):]
            # Strip a trailing "_YYYY-MM-DD..." run stamp. The previous
            # split("_2") also truncated task names containing "_2"
            # (e.g. "arc_2shot"). NOTE(review): assumes the suffix the harness
            # appends starts with an ISO date — confirm for your version.
            task_name = re.sub(r"_\d{4}-\d{2}-\d{2}.*$", "", stem)
        for line in path.read_text(encoding="utf-8").splitlines():
            line = line.strip()
            if line:
                records.append((task_name, json.loads(line)))
    else:
        data = json.loads(path.read_text(encoding="utf-8"))
        # A non-dict root (e.g. a list) has no .get; report it as "no samples".
        samples = data.get("samples") if isinstance(data, dict) else None
        if samples is None:
            raise ValueError(f"{path}: no 'samples' field found; is this an lm-eval-harness output file?")
        if isinstance(samples, dict):
            for task_name, items in samples.items():
                if task is not None and task_name != task:
                    continue
                records.extend((task_name, r) for r in items)
        elif isinstance(samples, list):
            records.extend((task or "task", r) for r in samples)
        else:
            raise ValueError(f"{path}: unrecognized 'samples' structure: {type(samples)}")

    if not records:
        raise ValueError(f"{path}: no sample records found (task filter={task!r})")

    item_ids, subsets, sample_idxs, scores = [], [], [], []
    for task_name, r in records:
        # Identifier preference: doc_id, then idx, then doc_hash.
        item_ids.append(r.get("doc_id", r.get("idx", r.get("doc_hash"))))
        subsets.append(task_name)
        sample_idxs.append(r.get("repeat_idx", 0))
        scores.append(_extract_score(r, metric_key))

    return from_records(item_ids, model, scores, subsets=subsets, sample_idxs=sample_idxs)
evalci/cli.py ADDED
@@ -0,0 +1,96 @@
1
+ """`evalci compare results_a.json results_b.json` — CLI entry point."""
2
+ import argparse
3
+ import json
4
+ import sys
5
+ from pathlib import Path
6
+
7
+ from . import adapters
8
+ from .schema import to_paired_arrays
9
+ from .stats import compare
10
+ from .report import report
11
+
12
+
13
def _sniff_format(path):
    """Guess the adapter name ("csv", "lm-eval-harness" or "helm") for a file.

    ``.csv`` and ``.jsonl`` are decided by suffix alone. A ``.json`` file is
    parsed: undecodable content is reported as "csv", a dict with a "samples"
    key as "lm-eval-harness", and a list as "helm". Anything else raises
    ValueError.
    """
    candidate = Path(path)
    suffix = candidate.suffix
    by_suffix = {".csv": "csv", ".jsonl": "lm-eval-harness"}
    if suffix in by_suffix:
        return by_suffix[suffix]
    if suffix == ".json":
        try:
            payload = json.loads(candidate.read_text())
        except json.JSONDecodeError:
            return "csv"
        if isinstance(payload, dict) and "samples" in payload:
            return "lm-eval-harness"
        if isinstance(payload, list):
            return "helm"
    raise ValueError(f"could not auto-detect format for {candidate}; pass --format explicitly")
29
+
30
+
31
def _load(path, fmt, model, metric_key):
    """Load one results file with the adapter named by ``fmt``.

    A falsy ``fmt`` is resolved with _sniff_format. The CSV adapter receives no
    ``metric_key``. Raises ValueError for an unrecognised format name.
    """
    resolved = fmt if fmt else _sniff_format(path)
    if resolved == "csv":
        return adapters.load_csv(path, model=model)
    if resolved == "helm":
        return adapters.load_helm(path, model=model, metric_key=metric_key)
    if resolved == "lm-eval-harness":
        return adapters.load_lm_eval_harness(path, model=model, metric_key=metric_key)
    raise ValueError(f"unknown format: {resolved!r}")
40
+
41
+
42
def build_parser():
    """Build the ``evalci`` argument parser with its single ``compare`` sub-command."""
    root = argparse.ArgumentParser(prog="evalci")
    commands = root.add_subparsers(dest="command", required=True)

    compare_cmd = commands.add_parser("compare", help="Compare two models' per-item results")
    for positional in ("results_a", "results_b"):
        compare_cmd.add_argument(positional)

    # Declared in display order; each entry is (flag, add_argument keyword arguments).
    options = (
        ("--format", {"choices": ["lm-eval-harness", "helm", "csv"], "default": None}),
        ("--method", {"choices": ["permutation", "bootstrap", "mcnemar"], "default": "permutation"}),
        ("--unpaired", {"action": "store_true", "help": "treat samples as independent, not item-paired"}),
        ("--metric-key", {"default": None}),
        ("--confidence", {"type": float, "default": 0.95}),
        ("--n-resamples", {"type": int, "default": 9999}),
        ("--output-format", {"choices": ["markdown", "latex"], "default": "markdown"}),
        ("--model-a", {"default": None, "help": "label for results_a (default: filename)"}),
        ("--model-b", {"default": None, "help": "label for results_b (default: filename)"}),
    )
    for flag, settings in options:
        compare_cmd.add_argument(flag, **settings)

    return root
60
+
61
+
62
def main(argv=None):
    """CLI entry point: parse arguments, run the comparison, print the report.

    Returns 0 on success and 1 when the two inputs share a model label or have
    no overlapping item ids.
    """
    parser = build_parser()
    args = parser.parse_args(argv)

    if args.command != "compare":
        parser.error(f"unknown command: {args.command}")
        return 2

    df_a = _load(args.results_a, args.format, args.model_a, args.metric_key)
    df_b = _load(args.results_b, args.format, args.model_b, args.metric_key)
    model_a = df_a["model"].iloc[0]
    model_b = df_b["model"].iloc[0]
    if model_a == model_b:
        # Identical labels would collapse into one pivot column downstream.
        print(
            f"error: both inputs resolved to the same model label {model_a!r}; "
            "pass --model-a/--model-b to disambiguate",
            file=sys.stderr,
        )
        return 1
    import pandas as pd

    merged = pd.concat([df_a, df_b], ignore_index=True)
    a, b = to_paired_arrays(merged, model_a, model_b)
    if len(a) == 0:
        print(f"error: no overlapping item_ids between {args.results_a} and {args.results_b}", file=sys.stderr)
        return 1
    result = compare(
        a,
        b,
        paired=not args.unpaired,
        method=args.method,
        confidence=args.confidence,
        n_resamples=args.n_resamples,
    )
    print(report(result, format=args.output_format))
    return 0
93
+
94
+
95
+ if __name__ == "__main__":
96
+ sys.exit(main())
evalci/report.py ADDED
@@ -0,0 +1,86 @@
1
+ """Render ci()/compare()/multi_compare() results as Markdown or LaTeX."""
2
+ import pandas as pd
3
+
4
+ from .stats import CIResult, CompareResult
5
+
6
+
7
def _stars(p):
    """Map a p-value to significance stars: < 0.001 "***", < 0.01 "**", < 0.05 "*", else ""."""
    for cutoff, marker in ((0.001, "***"), (0.01, "**"), (0.05, "*")):
        if p < cutoff:
            return marker
    return ""
15
+
16
+
17
def _fmt(x, precision):
    """Format ``x`` as fixed-point text with ``precision`` decimal places."""
    spec = f".{precision}f"
    return format(x, spec)
19
+
20
+
21
def _report_ci(result, precision):
    """Render a CIResult-like object as "<estimate> [<level>% CI <low>, <high>], n=<n>".

    Reads ``estimate``, ``confidence``, ``lower``, ``upper`` and ``n`` from
    ``result``; numbers are formatted to ``precision`` decimals.
    """
    # ":g" rather than int(): truncation printed 0.999 as "99%" and, through
    # float round-off, 0.58 as "57%".
    level = f"{result.confidence * 100:g}"
    return (
        f"{_fmt(result.estimate, precision)} "
        f"[{level}% CI {_fmt(result.lower, precision)}, "
        f"{_fmt(result.upper, precision)}], n={result.n}"
    )
27
+
28
+
29
def _report_compare(result, stars, precision):
    """Render a CompareResult-like object as a one-line summary.

    Reads ``delta``, ``confidence``, ``ci``, ``paired``, ``method``,
    ``p_value`` and ``n`` from ``result``. Significance stars are appended to
    the p-value when ``stars`` is truthy.
    """
    star = f"{_stars(result.p_value)}" if stars else ""
    kind = "paired" if result.paired else "unpaired"
    # ":g" rather than int(): truncation printed 0.999 as "99%" and, through
    # float round-off, 0.58 as "57%".
    level = f"{result.confidence * 100:g}"
    return (
        f"Δ={_fmt(result.delta, precision)}, "
        f"{level}% CI [{_fmt(result.ci[0], precision)}, {_fmt(result.ci[1], precision)}], "
        f"{kind} {result.method} p={result.p_value:.4g}{star}, n={result.n}"
    )
37
+
38
+
39
def _markdown_table(headers, rows):
    """Render headers and rows as a pipe-delimited Markdown table string."""

    def as_line(cells):
        return "| " + " | ".join(cells) + " |"

    rendered = [as_line(headers), as_line(["---"] * len(headers))]
    rendered.extend(as_line(str(cell) for cell in row) for row in rows)
    return "\n".join(rendered)
44
+
45
+
46
def _latex_table(headers, rows):
    """Render headers and rows as a booktabs-style LaTeX ``tabular``.

    All columns are left-aligned. Header and cell text is escaped so that
    LaTeX special characters (for example the underscores in "model_a" and
    "p_adj") do not break compilation.
    """
    specials = {
        "\\": r"\textbackslash{}",
        "&": r"\&",
        "%": r"\%",
        "$": r"\$",
        "#": r"\#",
        "_": r"\_",
        "{": r"\{",
        "}": r"\}",
        "~": r"\textasciitilde{}",
        "^": r"\textasciicircum{}",
    }

    def escape(cell):
        # Single pass per character, so inserted backslashes are never re-escaped.
        return "".join(specials.get(ch, ch) for ch in str(cell))

    lines = [
        r"\begin{tabular}{" + "l" * len(headers) + "}",
        r"\toprule",
        " & ".join(escape(h) for h in headers) + r" \\",
        r"\midrule",
    ]
    for row in rows:
        lines.append(" & ".join(escape(c) for c in row) + r" \\")
    lines += [r"\bottomrule", r"\end{tabular}"]
    return "\n".join(lines)
53
+
54
+
55
def _report_table(df, format, stars, precision):
    """Render a multi_compare()-style DataFrame as a Markdown or LaTeX table.

    Columns shown: optional "subset", then model_a, model_b, delta, the
    [ci_low, ci_high] interval, p_adj and (when ``stars``) a significance
    column. ``format == "latex"`` selects LaTeX; anything else gives Markdown.
    """
    with_subset = "subset" in df.columns
    headers = ["model_a", "model_b", "delta", "ci", "p_adj"]
    if with_subset:
        headers.insert(0, "subset")
    if stars:
        headers.append("sig")

    def render(record):
        cells = [record["subset"]] if with_subset else []
        interval = f"[{_fmt(record['ci_low'], precision)}, {_fmt(record['ci_high'], precision)}]"
        cells += [
            record["model_a"],
            record["model_b"],
            _fmt(record["delta"], precision),
            interval,
            f"{record['p_adj']:.4g}",
        ]
        if stars:
            cells.append(_stars(record["p_adj"]))
        return cells

    rows = [render(record) for _, record in df.iterrows()]
    builder = _latex_table if format == "latex" else _markdown_table
    return builder(headers, rows)
76
+
77
+
78
def report(result, format="markdown", stars=True, precision=3):
    """Render a CIResult, CompareResult, or multi_compare() DataFrame as text.

    ``format`` only affects DataFrame input; ``stars`` is ignored for CIResult.
    Raises TypeError for any other input type.
    """
    if isinstance(result, pd.DataFrame):
        rendered = _report_table(result, format, stars, precision)
    elif isinstance(result, CompareResult):
        rendered = _report_compare(result, stars, precision)
    elif isinstance(result, CIResult):
        rendered = _report_ci(result, precision)
    else:
        raise TypeError(f"don't know how to report a {type(result)!r}")
    return rendered
evalci/schema.py ADDED
@@ -0,0 +1,45 @@
1
+ """The per-item results schema shared by multi_compare(), cluster_ci(), and adapters."""
2
+ import numpy as np
3
+ import pandas as pd
4
+
5
+ REQUIRED_COLUMNS = ("item_id", "model", "score")
6
+ OPTIONAL_COLUMNS = ("subset", "sample_idx")
7
+ PER_ITEM_COLUMNS = REQUIRED_COLUMNS + OPTIONAL_COLUMNS
8
+
9
+
10
def validate_schema(df):
    """Check that ``df`` is a DataFrame containing every required column.

    Raises TypeError for a non-DataFrame and ValueError listing any missing
    required columns; returns None otherwise.
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("expected a pandas DataFrame with columns "
                        f"{REQUIRED_COLUMNS} (+ optional {OPTIONAL_COLUMNS})")
    absent = []
    for column in REQUIRED_COLUMNS:
        if column not in df.columns:
            absent.append(column)
    if absent:
        raise ValueError(f"per-item DataFrame is missing required column(s): {absent}")
17
+
18
+
19
def to_paired_arrays(df, model_a, model_b, subset=None, subset_col="subset"):
    """Return two item-aligned float arrays of scores for model_a and model_b.

    Rows are optionally restricted to ``subset`` (when ``subset_col`` exists),
    scores are averaged per (item_id, model), and only items scored by both
    models are kept. Raises ValueError if either model is absent.
    """
    validate_schema(df)
    if subset is not None and subset_col in df.columns:
        rows = df[df[subset_col] == subset]
    else:
        rows = df
    # One row per item, one column per model; repeated samples are averaged.
    wide = rows.pivot_table(index="item_id", columns="model", values="score", aggfunc="mean")
    absent = [name for name in (model_a, model_b) if name not in wide.columns]
    if absent:
        raise ValueError(f"model(s) not found in DataFrame: {absent}")
    complete = wide[[model_a, model_b]].dropna()
    scores_a = complete[model_a].to_numpy(dtype=float)
    scores_b = complete[model_b].to_numpy(dtype=float)
    return scores_a, scores_b
33
+
34
+
35
def from_records(item_ids, models, scores, subsets=None, sample_idxs=None):
    """Assemble the per-item DataFrame from parallel sequences.

    A string ``models`` is repeated for every item. Missing ``subsets`` become
    None and missing ``sample_idxs`` become 0 for every row. Scores are stored
    as floats.
    """
    count = len(item_ids)
    if isinstance(models, str):
        model_column = [models] * count
    else:
        model_column = list(models)
    subset_column = [None] * count if subsets is None else list(subsets)
    sample_column = [0] * count if sample_idxs is None else list(sample_idxs)
    return pd.DataFrame(
        {
            "item_id": list(item_ids),
            "model": model_column,
            "score": np.asarray(scores, dtype=float),
            "subset": subset_column,
            "sample_idx": sample_column,
        }
    )
evalci/stats.py ADDED
@@ -0,0 +1,278 @@
1
+ """Public statistics API: ci, compare, power, multi_compare, cluster_ci."""
2
+ import itertools
3
+ from dataclasses import dataclass, field
4
+ from typing import Optional, Tuple
5
+
6
+ import numpy as np
7
+ import pandas as pd
8
+
9
+ from . import _power
10
+ from ._correction import correct_pvalues
11
+ from ._intervals import bootstrap_interval, clopper_pearson_interval, wilson_interval
12
+ from ._significance import (
13
+ mcnemar_test,
14
+ paired_bootstrap_ci,
15
+ paired_bootstrap_p,
16
+ paired_permutation_p,
17
+ unpaired_bootstrap_ci,
18
+ unpaired_bootstrap_p,
19
+ unpaired_permutation_p,
20
+ )
21
+ from .schema import to_paired_arrays, validate_schema
22
+
23
+ _BINARY_CI_METHODS = {"wilson", "clopper-pearson", "exact"}
24
+
25
+
26
@dataclass
class CIResult:
    """Point estimate with a confidence interval; iterating yields (lower, upper)."""

    estimate: float
    lower: float
    upper: float
    method: str
    n: int
    confidence: float = 0.95

    def __iter__(self):
        # Allows ``lo, hi = result`` unpacking.
        bounds = (self.lower, self.upper)
        return iter(bounds)

    def __repr__(self):
        parts = [
            f"estimate={self.estimate:.4f}",
            f"ci=[{self.lower:.4f}, {self.upper:.4f}]",
            f"method={self.method!r}",
            f"n={self.n}",
        ]
        return "CIResult(" + ", ".join(parts) + ")"
44
+
45
+
46
@dataclass
class CompareResult:
    """Outcome of a two-model comparison: effect, interval, p-value and metadata."""

    delta: float
    ci: Tuple[float, float]
    p_value: float
    method: str
    paired: bool
    n: int
    confidence: float = 0.95
    # Method-specific details; a fresh dict per instance.
    extra: dict = field(default_factory=dict)

    def __repr__(self):
        low, high = self.ci[0], self.ci[1]
        parts = [
            f"delta={self.delta:.4f}",
            f"ci=[{low:.4f}, {high:.4f}]",
            f"p_value={self.p_value:.4g}",
            f"method={self.method!r}",
            f"paired={self.paired}",
            f"n={self.n}",
        ]
        return "CompareResult(" + ", ".join(parts) + ")"
64
+
65
+
66
@dataclass
class PowerResult:
    """Result of a power or sample-size calculation."""

    delta: float  # effect size the calculation was run for
    n: int  # sample size (solved for, or as supplied)
    power: float  # power (target, or achieved)
    alpha: float  # significance level
    method: str  # name of the calculation method
74
+
75
def ci(scores, method="wilson", confidence=0.95, n_resamples=9999, random_state=None):
    """Confidence interval on a single model's score.

    method: "wilson" or "clopper-pearson"/"exact" for binary (0/1) scores,
    or "bootstrap" (on the mean) for continuous scores.

    Raises ValueError for empty input, an unknown method, or non-binary scores
    passed to a binary-only method.
    """
    values = np.asarray(scores, dtype=float)
    count = len(values)
    if count == 0:
        raise ValueError("scores must be non-empty")
    kind = method.lower()

    if kind == "bootstrap":
        mean = float(values.mean())
        low, high = bootstrap_interval(
            values, np.mean, confidence=confidence, n_resamples=n_resamples, random_state=random_state
        )
        return CIResult(estimate=mean, lower=low, upper=high, method="bootstrap", n=count, confidence=confidence)

    if kind not in _BINARY_CI_METHODS:
        raise ValueError(f"unknown method: {method!r}")

    if not np.all(np.isin(values, [0.0, 1.0])):
        raise ValueError(
            f"method={method!r} requires binary 0/1 scores; use method='bootstrap' "
            "for continuous scores"
        )
    successes = int(values.sum())
    # Every binary method other than "wilson" maps to Clopper-Pearson.
    interval = wilson_interval if kind == "wilson" else clopper_pearson_interval
    low, high = interval(successes, count, confidence)
    return CIResult(estimate=successes / count, lower=low, upper=high, method=kind, n=count, confidence=confidence)
105
+
106
+
107
def compare(
    a,
    b,
    paired=True,
    method="permutation",
    confidence=0.95,
    n_resamples=9999,
    correction=True,
    random_state=None,
):
    """Model-vs-model comparison. method: "permutation", "bootstrap", or "mcnemar" (paired only).

    Parameters:
        a, b: per-item scores for the two models (array-like).
        paired: treat ``a[i]`` and ``b[i]`` as the same item; requires equal
            lengths.
        method: significance test used for the p-value (case-insensitive).
        confidence: level of the bootstrap interval on the mean difference.
        n_resamples: resamples used by the bootstrap and permutation routines.
        correction: continuity correction, forwarded to McNemar's test.
        random_state: seed for the random generator shared by the interval and
            the test.

    Returns:
        CompareResult with ``delta = mean(a) - mean(b)``. ``n`` is the number
        of pairs when paired, otherwise ``len(a) + len(b)``.

    Raises:
        ValueError: unknown method, mcnemar with ``paired=False``, unequal
            lengths when paired, empty input, or non-binary scores with mcnemar.
    """
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    method_l = method.lower()

    # Validate everything before resampling so bad arguments fail fast and
    # with a clear message instead of after (or inside) the bootstrap.
    if not paired and method_l == "mcnemar":
        raise ValueError("mcnemar requires paired=True")
    if method_l not in ("permutation", "bootstrap", "mcnemar"):
        raise ValueError(f"unknown method: {method!r}")
    if paired and len(a) != len(b):
        raise ValueError("paired comparison requires equal-length arrays")
    if len(a) == 0 or len(b) == 0:
        raise ValueError("a and b must be non-empty")
    if method_l == "mcnemar" and not (
        np.all(np.isin(a, [0.0, 1.0])) and np.all(np.isin(b, [0.0, 1.0]))
    ):
        # Mirrors ci(): non-0/1 items would otherwise be silently ignored.
        raise ValueError("method='mcnemar' requires binary 0/1 scores")

    rng = np.random.default_rng(random_state)
    extra = {}

    if paired:
        diffs = a - b
        n = len(diffs)
        delta = float(diffs.mean())
        # The interval is drawn before the test so seeded results are stable.
        lo, hi = paired_bootstrap_ci(diffs, confidence=confidence, n_resamples=n_resamples, rng=rng)
        if method_l == "permutation":
            p = paired_permutation_p(diffs, n_resamples=n_resamples, rng=rng)
        elif method_l == "bootstrap":
            p = paired_bootstrap_p(diffs, n_resamples=n_resamples, rng=rng)
        else:
            p, n01, n10, statistic = mcnemar_test(a, b, correction=correction)
            extra = {"n01": n01, "n10": n10, "statistic": statistic}
    else:
        n = len(a) + len(b)
        delta = float(a.mean() - b.mean())
        lo, hi = unpaired_bootstrap_ci(a, b, confidence=confidence, n_resamples=n_resamples, rng=rng)
        if method_l == "permutation":
            p = unpaired_permutation_p(a, b, n_resamples=n_resamples, rng=rng)
        else:
            p = unpaired_bootstrap_p(a, b, n_resamples=n_resamples, rng=rng)

    return CompareResult(
        delta=delta,
        ci=(lo, hi),
        p_value=p,
        method=method_l,
        paired=paired,
        n=n,
        confidence=confidence,
        extra=extra,
    )
164
+
165
+
166
def power(
    delta,
    n=None,
    power=0.8,
    baseline=0.5,
    alpha=0.05,
    method="analytic",
    rho=0.0,
    sims=5000,
    random_state=None,
):
    """Sample-size (n is None) or achieved-power (n given) calculator.

    method="analytic" uses a two-proportion normal approximation (fast, ignores
    pairing correlation). method="simulation" simulates a paired design with a
    `rho` correlation knob between the two models' item-level correctness.

    Raises ValueError for any other method name.
    """
    kind = method.lower()
    if kind not in ("analytic", "simulation"):
        raise ValueError(f"unknown method: {method!r}")

    solving_for_n = n is None
    if kind == "analytic":
        if solving_for_n:
            size = _power.analytic_n(delta, target_power=power, baseline=baseline, alpha=alpha)
            level = power
        else:
            size = n
            level = _power.analytic_power(delta, n, baseline=baseline, alpha=alpha)
    elif solving_for_n:
        size = _power.simulate_n(
            delta, target_power=power, baseline=baseline, alpha=alpha, rho=rho,
            sims=sims, random_state=random_state,
        )
        level = power
    else:
        size = n
        level = _power.simulate_power(
            delta, n, baseline=baseline, alpha=alpha, rho=rho, sims=sims, random_state=random_state
        )
    return PowerResult(delta=delta, n=size, power=level, alpha=alpha, method=kind)
202
+
203
+
204
def multi_compare(
    df,
    correction="holm",
    method="permutation",
    paired=True,
    alpha=0.05,
    n_resamples=9999,
    random_state=None,
    subset_col="subset",
):
    """Run every pairwise model comparison, per subset when subsets are present,
    and adjust all resulting p-values together with ``correction``.

    Returns a DataFrame with one row per comparison (delta, interval bounds,
    raw and adjusted p-values, n, and a ``significant`` flag at ``alpha``);
    the "subset" column is dropped when no subset values exist. Raises
    ValueError when fewer than two models are present.
    """
    validate_schema(df)
    models = sorted(df["model"].unique())
    if len(models) < 2:
        raise ValueError("need at least 2 models to compare")
    has_subset = subset_col in df.columns and df[subset_col].notna().any()
    if has_subset:
        subsets = sorted(df[subset_col].dropna().unique())
    else:
        subsets = [None]
    pairs = list(itertools.combinations(models, 2))

    records = []
    for subset, (model_a, model_b) in itertools.product(subsets, pairs):
        a, b = to_paired_arrays(df, model_a, model_b, subset=subset, subset_col=subset_col)
        if len(a) == 0:
            # No shared items for this pair in this subset.
            continue
        outcome = compare(
            a, b, paired=paired, method=method, confidence=1 - alpha,
            n_resamples=n_resamples, random_state=random_state,
        )
        low, high = outcome.ci[0], outcome.ci[1]
        records.append(
            {
                "subset": subset,
                "model_a": model_a,
                "model_b": model_b,
                "delta": outcome.delta,
                "ci_low": low,
                "ci_high": high,
                "p_value": outcome.p_value,
                "n": outcome.n,
            }
        )

    table = pd.DataFrame(records)
    if table.empty:
        return table
    table["p_adj"] = correct_pvalues(table["p_value"].to_numpy(), method=correction)
    table["significant"] = table["p_adj"] < alpha
    if has_subset:
        return table
    return table.drop(columns=["subset"])
253
+
254
+
255
def cluster_ci(scores, clusters, statistic=np.mean, confidence=0.95, n_resamples=9999, random_state=None):
    """Clustered percentile-bootstrap CI: whole clusters (e.g. repeated decodes
    or grouped questions) are resampled with replacement instead of items.

    The point estimate is ``statistic`` on the full data. Raises ValueError
    when ``scores`` and ``clusters`` differ in length.
    """
    values = np.asarray(scores, dtype=float)
    labels = np.asarray(clusters)
    if len(values) != len(labels):
        raise ValueError("scores and clusters must be the same length")
    rng = np.random.default_rng(random_state)
    members = [values[labels == label] for label in np.unique(labels)]
    cluster_count = len(members)
    point = float(statistic(values))

    def one_resample():
        # Draw as many clusters as exist, with replacement, and pool their items.
        picks = rng.integers(0, cluster_count, size=cluster_count)
        return statistic(np.concatenate([members[pick] for pick in picks]))

    draws = np.fromiter((one_resample() for _ in range(n_resamples)), dtype=float, count=n_resamples)
    tail = 1 - confidence
    low, high = np.percentile(draws, [100 * tail / 2, 100 * (1 - tail / 2)])
    return CIResult(
        estimate=point, lower=float(low), upper=float(high),
        method="cluster_bootstrap", n=len(values), confidence=confidence,
    )
@@ -0,0 +1,159 @@
1
+ Metadata-Version: 2.4
2
+ Name: evalci
3
+ Version: 0.1.0
4
+ Summary: Statistical significance & confidence intervals for LLM evals
5
+ Author: Shreyas
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/Shreyaskc/evalci
8
+ Project-URL: Repository, https://github.com/Shreyaskc/evalci
9
+ Project-URL: Issues, https://github.com/Shreyaskc/evalci/issues
10
+ Keywords: llm,evaluation,statistics,confidence-interval,significance-testing,benchmarking
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Science/Research
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3 :: Only
15
+ Classifier: Topic :: Scientific/Engineering
16
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
17
+ Requires-Python: >=3.9
18
+ Description-Content-Type: text/markdown
19
+ License-File: LICENSE
20
+ Requires-Dist: numpy
21
+ Requires-Dist: scipy
22
+ Requires-Dist: pandas
23
+ Provides-Extra: test
24
+ Requires-Dist: pytest; extra == "test"
25
+ Requires-Dist: statsmodels; extra == "test"
26
+ Dynamic: license-file
27
+
28
+ # evalci
29
+
30
+ Statistically sound comparisons between LLMs on benchmarks: confidence
31
+ intervals on accuracy, paired significance tests, power analysis, clustered
32
+ standard errors for multi-sample decoding, and multiple-comparison correction
33
+ across many models/benchmarks — all validated against `statsmodels`/exact
34
+ enumeration fixtures.
35
+
36
+ ```python
37
+ >>> import evalci
38
+ >>> result = evalci.compare(model_a_scores, model_b_scores, method="permutation")
39
+ >>> evalci.report(result)
40
+ 'Δ=0.034, 95% CI [0.005, 0.063], paired permutation p=0.025*, n=1319' # exact numbers depend on your data
41
+ ```
42
+
43
+ ## Status
44
+
45
+ Core library (statistics, eval-shaped workflows, adapters, CLI) is implemented
46
+ and tested. Not yet released to PyPI; no arXiv paper or DOI yet.
47
+
48
+ ## Install
49
+
50
+ Not yet on PyPI. Install from source:
51
+
52
+ ```bash
53
+ git clone https://github.com/Shreyaskc/evalci.git
54
+ cd evalci
55
+ pip install -e ".[test]" # the [test] extra also installs pytest/statsmodels for running the test suite
56
+ ```
57
+
58
+ Requires Python ≥3.9. Runtime dependencies are numpy, scipy, and pandas only.
59
+
60
+ ## Usage
61
+
62
+ ### Confidence interval on a single model
63
+
64
+ ```python
65
+ import evalci
66
+
67
+ # binary (0/1) per-item correctness
68
+ evalci.ci(scores, method="wilson") # Wilson score interval
69
+ evalci.ci(scores, method="clopper-pearson") # exact interval
70
+
71
+ # continuous scores (e.g. a similarity metric)
72
+ evalci.ci(scores, method="bootstrap") # percentile/BCa bootstrap on the mean
73
+ ```
74
+
75
+ ### Comparing two models on the same items
76
+
77
+ ```python
78
+ result = evalci.compare(model_a_scores, model_b_scores, paired=True, method="permutation")
79
+ # result.delta, result.ci, result.p_value, result.n
80
+
81
+ evalci.compare(a, b, method="bootstrap") # null-shifted bootstrap hypothesis test
82
+ evalci.compare(a, b, method="mcnemar") # McNemar's test for paired binary outcomes
83
+ evalci.compare(a, b, paired=False, method="permutation") # independent samples
84
+ ```
85
+
86
+ ### Sample-size / power calculator
87
+
88
+ ```python
89
+ evalci.power(delta=0.03, power=0.8) # required n to detect a 3-point gap at 80% power
90
+ evalci.power(delta=0.03, n=1500) # achieved power at n=1500
91
+ evalci.power(delta=0.03, power=0.8, method="simulation", rho=0.3) # correlated-items simulation
92
+ ```
93
+
94
+ ### Many models × many benchmarks, with correction
95
+
96
+ ```python
97
+ import pandas as pd
98
+
99
+ # per-item schema: item_id, model, score, [subset], [sample_idx]
100
+ df = pd.DataFrame(...)
101
+ table = evalci.multi_compare(df, correction="holm")
102
+ print(evalci.report(table, format="markdown"))
103
+ ```
104
+
105
+ ### Clustered standard errors (repeated decoding, grouped questions)
106
+
107
+ ```python
108
+ # `clusters` holds one label per score; samples of the same underlying item share a label
109
+ evalci.cluster_ci(scores, clusters)
110
+ ```
111
+
112
+ ### Loading results from eval harnesses
113
+
114
+ ```python
115
+ from evalci.adapters import load_lm_eval_harness, load_helm, load_csv
116
+
117
+ df_a = load_lm_eval_harness("results_a.json", model="model-a")
118
+ df_b = load_helm("per_instance_stats.json", model="model-b")
119
+ ```
120
+
121
+ ### CLI
122
+
123
+ ```bash
124
+ evalci compare results_a.json results_b.json --method permutation
125
+ evalci compare results_a.json results_b.json --format helm --method mcnemar
126
+ ```
127
+
128
+ The CLI auto-detects the lm-evaluation-harness / HELM / CSV format from the file
129
+ extension/content; pass `--format` to override it, and `--model-a`/`--model-b` to
130
+ label the two runs explicitly.
131
+
132
+ ## What's validated, and how
133
+
134
+ Statistical correctness is the point of this library, so the test suite
135
+ cross-checks every routine against an independent reference rather than just
136
+ re-testing its own math:
137
+
138
+ - Wilson and Clopper-Pearson intervals against `statsmodels.stats.proportion.proportion_confint`
139
+ - McNemar's test (exact and asymptotic) against `statsmodels.stats.contingency_tables.mcnemar`
140
+ - Holm and Benjamini-Hochberg correction against `statsmodels.stats.multitest.multipletests`
141
+ - The paired permutation test against brute-force exact enumeration of all sign flips (small n)
142
+ - Bootstrap CIs via a coverage simulation (nominal 95% CIs should contain the true parameter ~95% of the time)
143
+
144
+ `statsmodels` is a test-only dependency (`pip install -e ".[test]"`), not a
145
+ runtime dependency.
146
+
147
+ ```bash
148
+ pytest tests/
149
+ ```
150
+
151
+ ## API surface
152
+
153
+ `evalci.ci`, `evalci.compare`, `evalci.power`, `evalci.multi_compare`,
154
+ `evalci.cluster_ci`, `evalci.report`, `evalci.adapters.{load_lm_eval_harness,
155
+ load_helm, load_csv}`.
156
+
157
+ ## License
158
+
159
+ MIT — see [LICENSE](LICENSE).
@@ -0,0 +1,19 @@
1
+ evalci/__init__.py,sha256=BIGglYzvKq-4pxZLaHH6aKC6StQorG8XxoleIvPDHpw,368
2
+ evalci/_correction.py,sha256=R-meWXZ37Dq8Wc5XZ-BIr6wN1FRn_-B7ZdBD9iN_tz0,1318
3
+ evalci/_intervals.py,sha256=86Bn6fM3PBixRMgmJ_D-Vp_o6GckW70E0B_ePWjMQZI,1902
4
+ evalci/_power.py,sha256=zj6aMc4Ac792Ji8YbyMQCdtxZV1LnQ5nCTnMCcP9j7A,2616
5
+ evalci/_significance.py,sha256=qPppQSUut_iG-lmRnAyziE_HMWn2gCtp5svRDujO5UM,4337
6
+ evalci/cli.py,sha256=7ahTjHMO9P1U6AISv39b-QWvxUCjEak-f2Szpa1XYT4,3620
7
+ evalci/report.py,sha256=BC9HDdF3Ij8sWaQp8Sv7VFM0OQvm5GrHgIj7KzJlhK4,2864
8
+ evalci/schema.py,sha256=HaUQ0d7dvIqJTzHrqeCmokBFWQau4_3enxyRUIqI_KQ,2004
9
+ evalci/stats.py,sha256=QrQZ4hnw1yEkqduskVtE0qDT-En1zplMYJ_ch9Jchyk,9697
10
+ evalci/adapters/__init__.py,sha256=qjTGbejXDzF8hSuY9lqNSnH0f7PR-d1Ms_UZyQYBkD0,352
11
+ evalci/adapters/csv.py,sha256=oOt26sAt8dVV5aD7xognQrtBMn3mflzUP76jNh6qGs0,699
12
+ evalci/adapters/helm.py,sha256=nPstY8VawTmI9Mc-I4o-swqHd7lwhq1wEOHQknP9yCw,2221
13
+ evalci/adapters/lm_eval_harness.py,sha256=CvfYLej08y6LhKlZHJDLACdlCKLUNpY9Is00ERqCnyY,2694
14
+ evalci-0.1.0.dist-info/licenses/LICENSE,sha256=E5J5RZEY_2ybAJCvq79TWdtnQy4nO-IDeeyQ2LMF2l4,1064
15
+ evalci-0.1.0.dist-info/METADATA,sha256=O6K9eOMn790pLQ-F-lYzCdbccqkLt0qtgIWGFZe2fPA,5327
16
+ evalci-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
17
+ evalci-0.1.0.dist-info/entry_points.txt,sha256=yL6bWF7XTvcJba_eK0dzi9bXIORTvw7ahqL3ioKqWbY,43
18
+ evalci-0.1.0.dist-info/top_level.txt,sha256=HAQ7N_S3eZUMnC33FI6oywwl66lJ2Mx9kduKb6ukPEs,7
19
+ evalci-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ evalci = evalci.cli:main
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Shreyas
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1 @@
1
+ evalci