e-valuator 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,33 @@
1
+ Metadata-Version: 2.4
2
+ Name: e-valuator
3
+ Version: 0.1.0
4
+ Summary: Sequential evaluator for LLM trajectories
5
+ Author: Shuvom Sadhuka
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/shuvom-s/e-valuator
8
+ Project-URL: Repository, https://github.com/shuvom-s/e-valuator
9
+ Requires-Python: >=3.9
10
+ Description-Content-Type: text/markdown
11
+ Requires-Dist: numpy
12
+ Requires-Dist: pandas
13
+ Requires-Dist: scikit-learn
14
+ Requires-Dist: scipy
15
+ Requires-Dist: tqdm
16
+ Provides-Extra: dev
17
+ Requires-Dist: pytest; extra == "dev"
18
+
19
+ # E-valuator
20
+ Code for paper _E-valuator: Reliable Agent Verifiers with Sequential Hypothesis Testing_. We build a sequential evaluator that can convert any black-box verifier/agent system into one with statistical guarantees. At deployment time, our system can flag and terminate agent trajectories that are likely to be unsuccessful without access to anything but a verifier's (black-box) scores.
21
+
22
+ ## Install
23
+ To start, please install our package:
24
+
25
+ ```bash
26
+ pip install e-valuator
27
+ ```
28
+
29
+ ## Quick start
30
+ Once installed, you can boot up e-valuator with `from evaluator import EValuator`. We provide two demo notebooks (and corresponding datasets) in `demos/notebooks/hotpot_example.ipynb` (corresponding dataset in `data/hotpotqa_cleaned_w_scores.csv`) and `demos/notebooks/math_example_tokens.ipynb` (corresponding dataset in `data/math_cleaned_w_scores.csv`).
31
+
32
+ These notebooks provide examples of the input data format required and evaluation pipeline.
33
+
@@ -0,0 +1,15 @@
1
+ # E-valuator
2
+ Code for paper _E-valuator: Reliable Agent Verifiers with Sequential Hypothesis Testing_. We build a sequential evaluator that can convert any black-box verifier/agent system into one with statistical guarantees. At deployment time, our system can flag and terminate agent trajectories that are likely to be unsuccessful without access to anything but a verifier's (black-box) scores.
3
+
4
+ ## Install
5
+ To start, please install our package:
6
+
7
+ ```bash
8
+ pip install e-valuator
9
+ ```
10
+
11
+ ## Quick start
12
+ Once installed, import the evaluator with `from evaluator import EValuator` (the import name is `evaluator`, not `e_valuator`). We provide two demo notebooks and their corresponding datasets in the source repository (they are not bundled with the PyPI package): `demos/notebooks/hotpot_example.ipynb` (dataset: `data/hotpotqa_cleaned_w_scores.csv`) and `demos/notebooks/math_example_tokens.ipynb` (dataset: `data/math_cleaned_w_scores.csv`).
13
+
14
+ These notebooks provide examples of the required input data format and of the evaluation pipeline.
15
+
@@ -0,0 +1,39 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ # PyPI project name (with hyphen)
7
+ name = "e-valuator"
8
+
9
+ # NOTE: the import name is "evaluator" (the package under src/), not "e-valuator".
10
+ version = "0.1.0"
11
+ description = "Sequential evaluator for LLM trajectories"
12
+ readme = "README.md"
13
+ requires-python = ">=3.9"
14
+ license = { text = "MIT" }
15
+
16
+ authors = [
17
+ { name = "Shuvom Sadhuka" }
18
+ ]
19
+
20
+ dependencies = [
21
+ "numpy",
22
+ "pandas",
23
+ "scikit-learn",
24
+ "scipy",
25
+ "tqdm"
26
+ ]
27
+
28
+ [project.optional-dependencies]
29
+ dev = ["pytest"]
30
+
31
+ [tool.setuptools]
32
+ package-dir = {"" = "src"}
33
+
34
+ [tool.setuptools.packages.find]
35
+ where = ["src"]
36
+
37
+ [project.urls]
38
+ Homepage = "https://github.com/shuvom-s/e-valuator"
39
+ Repository = "https://github.com/shuvom-s/e-valuator"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,33 @@
1
+ Metadata-Version: 2.4
2
+ Name: e-valuator
3
+ Version: 0.1.0
4
+ Summary: Sequential evaluator for LLM trajectories
5
+ Author: Shuvom Sadhuka
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/shuvom-s/e-valuator
8
+ Project-URL: Repository, https://github.com/shuvom-s/e-valuator
9
+ Requires-Python: >=3.9
10
+ Description-Content-Type: text/markdown
11
+ Requires-Dist: numpy
12
+ Requires-Dist: pandas
13
+ Requires-Dist: scikit-learn
14
+ Requires-Dist: scipy
15
+ Requires-Dist: tqdm
16
+ Provides-Extra: dev
17
+ Requires-Dist: pytest; extra == "dev"
18
+
19
+ # E-valuator
20
+ Code for paper _E-valuator: Reliable Agent Verifiers with Sequential Hypothesis Testing_. We build a sequential evaluator that can convert any black-box verifier/agent system into one with statistical guarantees. At deployment time, our system can flag and terminate agent trajectories that are likely to be unsuccessful without access to anything but a verifier's (black-box) scores.
21
+
22
+ ## Install
23
+ To start, please install our package:
24
+
25
+ ```bash
26
+ pip install e-valuator
27
+ ```
28
+
29
+ ## Quick start
30
+ Once installed, you can boot up e-valuator with `from evaluator import EValuator`. We provide two demo notebooks (and corresponding datasets) in `demos/notebooks/hotpot_example.ipynb` (corresponding dataset in `data/hotpotqa_cleaned_w_scores.csv`) and `demos/notebooks/math_example_tokens.ipynb` (corresponding dataset in `data/math_cleaned_w_scores.csv`).
31
+
32
+ These notebooks provide examples of the input data format required and evaluation pipeline.
33
+
@@ -0,0 +1,11 @@
1
+ README.md
2
+ pyproject.toml
3
+ src/e_valuator.egg-info/PKG-INFO
4
+ src/e_valuator.egg-info/SOURCES.txt
5
+ src/e_valuator.egg-info/dependency_links.txt
6
+ src/e_valuator.egg-info/requires.txt
7
+ src/e_valuator.egg-info/top_level.txt
8
+ src/evaluator/__init__.py
9
+ src/evaluator/evaluator.py
10
+ src/evaluator/utils.py
11
+ tests/test_workflow.py
@@ -0,0 +1,8 @@
1
+ numpy
2
+ pandas
3
+ scikit-learn
4
+ scipy
5
+ tqdm
6
+
7
+ [dev]
8
+ pytest
@@ -0,0 +1 @@
1
+ evaluator
@@ -0,0 +1,4 @@
1
+ from .evaluator import EValuator
2
+ from . import utils
3
+
4
+ __all__ = ["EValuator", "utils"]
@@ -0,0 +1,328 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+ from sklearn.linear_model import LogisticRegression
4
+ from sklearn.ensemble import RandomForestClassifier
5
+ from sklearn.preprocessing import StandardScaler
6
+ from scipy.stats import binom
7
+
8
+
9
class EValuator:
    """
    Stepwise evaluator using density ratio estimation + e-values.

    One classifier is fitted per step index on the "judge_probability_series"
    features of that step, and its predicted class probabilities are turned
    into a density ratio Pr(S | Y=0) / Pr(S | Y=1) that serves as the e-value
    of the row. Rows are flagged when their e-value exceeds a per-alpha
    threshold.

    Args:
        model_type: "logistic" or "random_forest"
        mt_variant: "anytime", "split", or "both"
        alphas: list of alpha levels (defaults to [0.05])
        delta: tolerance-bound failure probability for split mode (1 - confidence)
        problem_col: column name for problem identifier (default "uq_problem_idx")
        step_col: column name for step index (default "num_steps")
        split_fraction: fraction of problems used for log-training in split mode
        random_state: RNG seed for the internal log/cal split

    Raises:
        AssertionError: if any argument fails validation.
    """

    def __init__(
        self,
        model_type: str = "logistic",
        mt_variant: str = "split",
        alphas=None,
        delta: float = 0.1,
        problem_col: str = "uq_problem_idx",
        step_col: str = "num_steps",
        split_fraction: float = 0.5,
        random_state: int = 42,
    ):
        self.model_type = model_type
        self.mt_variant = mt_variant
        self.alphas = alphas if alphas is not None else [0.05]
        self.delta = delta
        self.problem_col = problem_col
        self.step_col = step_col
        self.split_fraction = split_fraction
        self.random_state = random_state

        ## Validation raises AssertionError explicitly (instead of using the
        ## `assert` statement) so the checks still run under `python -O`,
        ## while callers that catch AssertionError keep working.
        self._require(isinstance(self.problem_col, str), "problem_col must be a string.")
        self._require(isinstance(self.step_col, str), "step_col must be a string.")

        self._require(len(self.alphas) > 0, "Must specify at least one alpha.")
        for a in self.alphas:
            self._require(0 < a < 1, f"Alpha must be in (0,1). Got {a}")

        self._require(0 < self.delta < 1, f"Delta must be in (0,1). Got {self.delta}")
        self._require(
            0 < self.split_fraction < 1,
            f"split_fraction must be in (0,1). Got {self.split_fraction}",
        )
        self._require(
            self.mt_variant in {"anytime", "split", "both"},
            "mt_variant must be 'anytime', 'split', or 'both'. "
            f"Got {self.mt_variant}",
        )

        self._reset_fitted_state()

    @staticmethod
    def _require(condition, message):
        """Raise AssertionError(message) unless `condition` is truthy."""
        if not condition:
            raise AssertionError(message)

    @staticmethod
    def _reject_column(variant, alpha):
        """Name of the boolean reject column for a variant/alpha pair."""
        base = str(alpha).replace(".", "_")
        return f"reject_{variant}_alpha_{base}"

    def _reset_fitted_state(self):
        """Clear every fitted artefact so a (re)fit starts from scratch."""
        ## Model families for anytime and split
        self.step_models_anytime = {}
        self.step_scalers_anytime = {}
        self.step_base_probs_anytime = {}
        self.max_trained_step_anytime = 0

        self.step_models_split = {}
        self.step_scalers_split = {}
        self.step_base_probs_split = {}
        self.max_trained_step_split = 0

        ## For "anytime" or "split": dict[alpha] -> threshold
        ## For "both": {"anytime": {alpha: thr}, "split": {alpha: thr}}
        self.thresholds = {}

    def _variant_store(self, which: str):
        """
        Return the (models, scalers, base_probs) dicts of a variant.

        Raises:
            ValueError: if `which` is neither "anytime" nor "split".
        """
        if which == "anytime":
            return (
                self.step_models_anytime,
                self.step_scalers_anytime,
                self.step_base_probs_anytime,
            )
        if which == "split":
            return (
                self.step_models_split,
                self.step_scalers_split,
                self.step_base_probs_split,
            )
        raise ValueError(f"Unknown which={which}")

    def _new_model(self):
        """
        Build an unfitted classifier for `self.model_type`.

        Raises:
            ValueError: if `self.model_type` is not a known model family.
        """
        if self.model_type == "logistic":
            return LogisticRegression(max_iter=200)
        elif self.model_type == "random_forest":
            return RandomForestClassifier(n_estimators=200)
        raise ValueError(f"Unknown model type: {self.model_type}")

    def _upper_tolerance_bound(self, values, alpha, delta):
        """
        Distribution-free one-sided upper bound for the (1-alpha)-quantile
        with confidence (1-delta).

        Returns the smallest order statistic X_(k) such that
        Pr(Binomial(n, 1-alpha) >= k) <= delta.

        Edge cases:
            - empty `values` -> 0.0
            - no order statistic satisfies the condition (sample too small)
              -> the sample maximum is returned as a fallback.
        """
        Xs = np.sort(np.asarray(values))
        n = Xs.size
        if n == 0:
            return 0.0

        p = 1.0 - alpha
        ## tail[k-1] = Pr(Binomial(n, p) >= k) for k = 1..n
        tail = binom.sf(np.arange(n), n, p)
        hits = np.flatnonzero(tail <= delta)
        if hits.size > 0:
            return float(Xs[hits[0]])
        return float(Xs[-1])

    def _fit_step_models(self, df: pd.DataFrame, which: str):
        """
        Fit one density-ratio model per step on df, for the given variant.

        Args:
            df: frame with self.step_col, "solved" and
                "judge_probability_series" columns.
            which: one of {"anytime", "split"}; selects where models are stored.

        Steps beyond the first that lack two classes with >= 5 examples each
        are skipped (no model is stored for them).

        Raises:
            ValueError: if `which` is unknown.
            AssertionError: if step 1 lacks two classes with >= 5 examples
                each (this includes an empty df).
        """
        models, scalers, base_probs = self._variant_store(which)
        max_attr = f"max_trained_step_{which}"

        step_values = df[self.step_col]
        max_steps = step_values.max()
        if pd.isna(max_steps):
            ## Empty frame (or all-missing steps): step 1 has no examples.
            raise AssertionError("Step 1 requires ≥5 examples per class.")

        ## int(...) lets float-typed step columns (e.g. 3.0) drive range().
        for step in range(1, int(max_steps) + 1):
            step_df = df[step_values == step]
            y = step_df["solved"].values
            _, counts = np.unique(y, return_counts=True)

            enough_data = len(counts) >= 2 and np.min(counts) >= 5
            if not enough_data:
                if step == 1:
                    raise AssertionError("Step 1 requires ≥5 examples per class.")
                continue

            X = np.array([list(s) for s in step_df["judge_probability_series"].values])
            scaler = StandardScaler()
            Xs = scaler.fit_transform(X)
            model = self._new_model()
            model.fit(Xs, y)

            ## Empirical class priors at this step.
            ## NOTE(review): assumes "solved" is coded 0/1 (or bool) - confirm.
            p1 = float(np.mean(y))
            p0 = 1.0 - p1

            models[step] = model
            scalers[step] = scaler
            base_probs[step] = {"p0": p0, "p1": p1}
            setattr(self, max_attr, max(getattr(self, max_attr), step))

    def _compute_e_vals_for_variant(self, df: pd.DataFrame, which: str) -> np.ndarray:
        """
        Compute density ratios per row for the requested variant ("anytime" or "split").

        Rows are processed in frame order. A row whose step has a fitted model
        gets that model's density ratio; a row whose step has no model reuses
        the most recent ratio seen for the same problem (1.0 if none yet).
        NOTE(review): the carry-forward is only meaningful if each problem's
        rows appear in increasing step order - confirm against callers.

        Returns:
            Float array aligned with df's rows *by position*, so it is valid
            for any DataFrame index (not only a default RangeIndex).

        Raises:
            ValueError: if `which` is unknown.
        """
        models, scalers, base_probs = self._variant_store(which)

        e_vals = np.full(len(df), np.nan, dtype=float)
        last_ratio = {}

        rows = zip(
            df[self.problem_col].tolist(),
            df[self.step_col].tolist(),
            df["judge_probability_series"].tolist(),
        )
        ## Position, not index label, addresses e_vals: labels may be
        ## arbitrary once a caller has filtered or re-indexed the frame.
        for pos, (pid, raw_step, raw_series) in enumerate(rows):
            step = int(raw_step)

            if pid not in last_ratio:
                last_ratio[pid] = 1.0

            if step in models:
                series = np.array(raw_series, dtype=float)
                X = series.reshape(1, -1)

                Xs = scalers[step].transform(X)
                proba = models[step].predict_proba(Xs)[0]
                ## NOTE(review): proba[0] / proba[1] are taken as
                ## Pr(Y=0 | S) / Pr(Y=1 | S); assumes 0/1 labels - confirm.
                p1_s = np.clip(proba[1], 1e-6, 1 - 1e-6)
                p0_s = np.clip(proba[0], 1e-6, 1 - 1e-6)

                base = base_probs[step]
                p0 = base["p0"]
                p1 = base["p1"]

                ## Density ratio (Bayes' rule):
                ##   Pr(S | Y=0) / Pr(S | Y=1)
                ##     = (Pr(Y=0 | S) / Pr(Y=1 | S)) * (Pr(Y=1) / Pr(Y=0))
                ##     = (p0_s / p1_s) * (p1 / p0)
                ratio = (p0_s / p1_s) * (p1 / p0)
                last_ratio[pid] = ratio
            else:
                ratio = last_ratio[pid]

            e_vals[pos] = ratio

        return e_vals

    def _compute_split_thresholds(self, calib_df: pd.DataFrame) -> dict:
        """
        Use the held-out calibration split to get tolerance-bound thresholds
        for split_e_val.

        For every solved problem (rows with "solved" == 1) the maximum split
        e-value over its rows is taken; each alpha's threshold is the upper
        tolerance bound of those maxima. With no solved problems every
        threshold is 0.0.

        Returns:
            dict mapping alpha -> threshold.
        """
        tmp = calib_df.copy()
        tmp["split_e_val"] = self._compute_e_vals_for_variant(tmp, which="split")
        solved = tmp[tmp["solved"] == 1]

        ## groupby().max() skips NaN; groups with no valid value are dropped.
        per_problem_max = (
            pd.to_numeric(solved["split_e_val"], errors="coerce")
            .groupby(solved[self.problem_col])
            .max()
            .dropna()
            .tolist()
        )

        thresholds = {}
        for a in self.alphas:
            if len(per_problem_max) == 0:
                thresholds[a] = 0.0
            else:
                thresholds[a] = self._upper_tolerance_bound(
                    per_problem_max, alpha=a, delta=self.delta
                )
        return thresholds

    def fit(self, calib_df: pd.DataFrame):
        """
        Fit one model per step and compute per-alpha thresholds.

        Any state from a previous fit is discarded first.

        calib_df must contain:
          - self.step_col
          - "solved"
          - "judge_probability_series"
          - self.problem_col (for split/both threshold computation)

        After fitting, `self.thresholds` is {alpha: thr} for "anytime" and
        "split", and {"anytime": {...}, "split": {...}} for "both".

        Raises:
            AssertionError: if step 1 lacks two classes with >= 5 examples each.
            ValueError: if `self.model_type` is unknown.
        """
        calib_df = calib_df.copy()
        self._reset_fitted_state()

        ## Anytime-only: train on full calib, Ville thresholds 1/alpha
        if self.mt_variant == "anytime":
            self._fit_step_models(calib_df, which="anytime")
            self.thresholds = {a: 1.0 / a for a in self.alphas}
            return

        ## For split version, first build a log/cal split by problem
        unique_ids = calib_df[self.problem_col].unique()
        rng = np.random.default_rng(self.random_state)
        n_log = max(1, int(len(unique_ids) * self.split_fraction))
        log_ids = rng.choice(unique_ids, size=n_log, replace=False)
        cal_ids = np.setdiff1d(unique_ids, log_ids)

        in_log = calib_df[self.problem_col].isin(log_ids)
        in_cal = calib_df[self.problem_col].isin(cal_ids)
        log_df = calib_df[in_log].reset_index(drop=True)
        cal_df = calib_df[in_cal].reset_index(drop=True)

        if self.mt_variant == "split":
            ## Split-only: train on one half, thresholds from held-out other half
            self._fit_step_models(log_df, which="split")
            self.thresholds = self._compute_split_thresholds(cal_df)
            return

        ## mt_variant == "both"
        ## Anytime side: train on full calib set, thresholds 1/alpha
        self._fit_step_models(calib_df, which="anytime")
        anytime_thresholds = {a: 1.0 / a for a in self.alphas}

        ## Split side: train on one half, thresholds from held-out other half
        self._fit_step_models(log_df, which="split")
        split_thresholds = self._compute_split_thresholds(cal_df)

        self.thresholds = {
            "anytime": anytime_thresholds,
            "split": split_thresholds,
        }

    def apply(self, df: pd.DataFrame, compute_rejects: bool = True) -> pd.DataFrame:
        """
        Compute e-values per row.

        df must contain:
          - self.step_col
          - self.problem_col
          - "judge_probability_series"

        Returns a copy of df (index preserved) with:
          - "anytime_e_val" if mt_variant in {"anytime", "both"}
          - "split_e_val"   if mt_variant in {"split", "both"}
          - reject columns (if compute_rejects=True):
              * "reject_anytime_alpha_{a}" for anytime/both
              * "reject_split_alpha_{a}"   for split/both
            where {a} is str(alpha) with "." replaced by "_", and a row is
            rejected when its e-value is strictly greater than the threshold.

        Raises:
            KeyError: if compute_rejects=True and thresholds are missing
                (e.g. fit() has not been called).
        """
        df = df.copy()

        has_anytime = self.mt_variant in {"anytime", "both"}
        has_split = self.mt_variant in {"split", "both"}

        if has_anytime:
            df["anytime_e_val"] = self._compute_e_vals_for_variant(df, which="anytime")

        if has_split:
            df["split_e_val"] = self._compute_e_vals_for_variant(df, which="split")

        if not compute_rejects:
            return df

        ## Normalise thresholds to {variant: {alpha: thr}} regardless of mode.
        if self.mt_variant == "both":
            per_variant = {
                "anytime": self.thresholds["anytime"],
                "split": self.thresholds["split"],
            }
        else:
            per_variant = {self.mt_variant: self.thresholds}

        for a in self.alphas:
            for variant, thr_by_alpha in per_variant.items():
                col = self._reject_column(variant, a)
                df[col] = df[f"{variant}_e_val"] > thr_by_alpha[a]

        return df
@@ -0,0 +1,62 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ from tqdm import tqdm
4
+
5
+
6
def add_judge_probability_series(
    df: pd.DataFrame,
    prob_col: str = "judge_probability",
    problem_col: str = "uq_problem_idx",
    step_col: str = "num_steps",
) -> pd.DataFrame:
    """
    Add a column `judge_probability_series`, where each entry is a list of
    judge probabilities from the earliest step up to that row's step for the
    same problem.

    Each problem is processed once (sort + de-duplicate its steps, then slice
    a prefix per row) instead of re-filtering the whole frame for every row,
    so the cost is roughly O(n log n) rather than O(n^2). Because the pass is
    fast, no progress bar is shown.

    Args:
        df:
            Input DataFrame. Must contain:
              - problem_col (default: "uq_problem_idx")
              - step_col    (default: "num_steps")
              - prob_col    (default: "judge_probability")
        prob_col:
            Column containing the per-step judge probability.
        problem_col:
            Column identifying the problem / trajectory.
        step_col:
            Column giving the step index (1, 2, 3, ...).

    Returns:
        A copy of df with an extra column:
          - judge_probability_series: list of probabilities, ordered by step,
            for all steps <= the current step of that (problem_col). When a
            (problem, step) pair occurs more than once, only its first
            occurrence (in frame order) contributes. Rows with a missing
            problem id or a missing step get an empty list.

    Raises:
        KeyError: if any of the three required columns is absent.
    """
    if prob_col not in df.columns:
        raise KeyError(f"Probability column '{prob_col}' not found in df.")
    if problem_col not in df.columns:
        raise KeyError(f"problem_col '{problem_col}' not found in df.")
    if step_col not in df.columns:
        raise KeyError(f"step_col '{step_col}' not found in df.")

    df = df.copy()

    # Default for rows that never match anything (missing problem id / step).
    series_out = [[] for _ in range(len(df))]

    steps_all = df[step_col].to_numpy()
    probs_all = df[prob_col].to_numpy()

    # `.indices` maps each problem id to the *positions* of its rows, so the
    # result is independent of the frame's index labels.
    groups = df.groupby(problem_col, sort=False).indices
    for positions in groups.values():
        positions = np.sort(np.asarray(positions))

        steps = steps_all[positions]
        has_step = ~pd.isna(steps)
        positions = positions[has_step]
        if positions.size == 0:
            continue
        steps = steps[has_step]

        # Stable sort keeps frame order among equal steps, so "first
        # occurrence" is well defined when de-duplicating below.
        order = np.argsort(steps, kind="stable")
        sorted_steps = steps[order]
        sorted_probs = probs_all[positions][order]

        is_first = np.ones(sorted_steps.size, dtype=bool)
        is_first[1:] = sorted_steps[1:] != sorted_steps[:-1]
        uniq_steps = sorted_steps[is_first]
        uniq_probs = sorted_probs[is_first].tolist()

        # Number of distinct steps <= each row's own step.
        prefix_len = np.searchsorted(uniq_steps, steps, side="right")
        for pos, count in zip(positions.tolist(), prefix_len.tolist()):
            series_out[pos] = uniq_probs[:count]

    df["judge_probability_series"] = series_out
    return df
62
+
@@ -0,0 +1,143 @@
1
+ # tests/test_workflow.py
2
+
3
+ import sys
4
+ from pathlib import Path
5
+
6
+ import numpy as np
7
+ import pandas as pd
8
+ import pytest
9
+
10
+ ROOT = Path(__file__).resolve().parents[1]
11
+ SRC = ROOT / "src"
12
+ if str(SRC) not in sys.path:
13
+ sys.path.insert(0, str(SRC))
14
+
15
+ from evaluator import EValuator
16
+ from evaluator.utils import add_judge_probability_series
17
+
18
+
19
def test_invalid_alpha_raises():
    """The constructor rejects an empty alpha list and any alpha outside (0, 1)."""
    invalid_alpha_lists = (
        [0.0],          # alpha <= 0
        [1.0],          # alpha >= 1
        [-0.1, 0.05],   # one alpha outside (0,1)
        [],             # empty alphas
    )
    for bad_alphas in invalid_alpha_lists:
        with pytest.raises(AssertionError):
            EValuator(alphas=bad_alphas)
35
+
36
+
37
def test_invalid_delta_raises():
    """The constructor rejects delta values outside the open interval (0, 1)."""
    # 0.0 sits on the excluded boundary; 1.5 is above the interval.
    for bad_delta in (0.0, 1.5):
        with pytest.raises(AssertionError):
            EValuator(delta=bad_delta)
43
+
44
+
45
def test_invalid_mt_variant_raises():
    """The constructor rejects an mt_variant other than 'anytime', 'split' or 'both'."""
    bogus_variant = "not_a_valid_variant"
    with pytest.raises(AssertionError):
        EValuator(mt_variant=bogus_variant)
48
+
49
+
50
+
51
def test_insufficient_data_step1_raises():
    """
    Fitting must fail when step 1 lacks two classes with >= 5 examples each.

    The calibration frame below has only four rows, all at step 1 and all
    labelled solved, so both conditions are violated.
    """
    n_rows = 4
    calib_df = pd.DataFrame(
        {
            "uq_problem_idx": [f"p{i}" for i in range(n_rows)],
            "num_steps": [1] * n_rows,
            "solved": [1] * n_rows,
            "judge_probability_series": [[0.5] for _ in range(n_rows)],
        }
    )

    evaluator = EValuator(mt_variant="anytime")

    with pytest.raises(AssertionError):
        evaluator.fit(calib_df)
67
+
68
+
69
@pytest.mark.slow
@pytest.mark.parametrize("mt_variant", ["anytime", "split", "both"])
def test_evaluator_workflow_from_csv(mt_variant):
    """
    End-to-end check on the HotpotQA demo CSV: build the probability series,
    fit on 80% of the problems and apply to the rest, for each mt_variant.

    Skipped when the CSV is not present next to the test tree.
    """
    data_path = ROOT / "data" / "hotpotqa_cleaned_w_scores.csv"
    if not data_path.exists():
        pytest.skip(f"Data file not found: {data_path}")

    # Load raw data
    df = pd.read_csv(data_path)
    assert "uq_problem_idx" in df.columns, "Expected 'uq_problem_idx' in CSV"
    assert "num_steps" in df.columns, "Expected 'num_steps' in CSV"
    assert "solved" in df.columns, "Expected 'solved' in CSV"
    assert "judge_probability" in df.columns or "judge_probability_expanded" in df.columns

    # Use whichever probability column the CSV actually has; relying on the
    # default prob_col would raise KeyError for a CSV that only carries
    # "judge_probability_expanded", even though the assertion above allows it.
    prob_col = (
        "judge_probability"
        if "judge_probability" in df.columns
        else "judge_probability_expanded"
    )

    # Add judge_probability_series
    df = add_judge_probability_series(df, prob_col=prob_col)
    assert "judge_probability_series" in df.columns
    # Every entry should be a non-empty list
    assert df["judge_probability_series"].apply(lambda x: isinstance(x, list) and len(x) > 0).all()

    # Split into calibration / test by uq_problem_idx
    unique_ids = df["uq_problem_idx"].unique()
    rng = np.random.default_rng(42)
    n_cal = int(0.8 * len(unique_ids))
    cal_ids = rng.choice(unique_ids, size=n_cal, replace=False)
    test_ids = np.setdiff1d(unique_ids, cal_ids)

    cal_df = df[df["uq_problem_idx"].isin(cal_ids)].reset_index(drop=True)
    test_df = df[df["uq_problem_idx"].isin(test_ids)].reset_index(drop=True)

    assert len(cal_df) > 0
    assert len(test_df) > 0

    # Instantiate evaluator
    ev = EValuator(
        model_type="logistic",
        mt_variant=mt_variant,
        alphas=[0.01, 0.05, 0.1],
        delta=0.05,
    )

    # Fit on calibration set
    ev.fit(cal_df)

    # Basic sanity on thresholds
    if mt_variant == "anytime":
        assert isinstance(ev.thresholds, dict)
        assert set(ev.thresholds.keys()) == set(ev.alphas)
        for a, thr in ev.thresholds.items():
            assert thr == pytest.approx(1.0 / a)
    elif mt_variant == "split":
        assert isinstance(ev.thresholds, dict)
        assert set(ev.thresholds.keys()) == set(ev.alphas)
        # All thresholds should be non-negative
        for thr in ev.thresholds.values():
            assert thr >= 0.0
    else:  # "both"
        assert isinstance(ev.thresholds, dict)
        assert set(ev.thresholds.keys()) == {"anytime", "split"}
        assert set(ev.thresholds["anytime"].keys()) == set(ev.alphas)
        assert set(ev.thresholds["split"].keys()) == set(ev.alphas)

    # Apply to test set
    test_with_scores = ev.apply(test_df)

    ## Check columns by variant
    if mt_variant in {"anytime", "both"}:
        assert "anytime_e_val" in test_with_scores.columns
        ## e-values should be positive
        assert (test_with_scores["anytime_e_val"] > 0).all()

    if mt_variant in {"split", "both"}:
        assert "split_e_val" in test_with_scores.columns
        assert (test_with_scores["split_e_val"] > 0).all()
143
+