e-valuator 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- e_valuator-0.1.0/PKG-INFO +33 -0
- e_valuator-0.1.0/README.md +15 -0
- e_valuator-0.1.0/pyproject.toml +39 -0
- e_valuator-0.1.0/setup.cfg +4 -0
- e_valuator-0.1.0/src/e_valuator.egg-info/PKG-INFO +33 -0
- e_valuator-0.1.0/src/e_valuator.egg-info/SOURCES.txt +11 -0
- e_valuator-0.1.0/src/e_valuator.egg-info/dependency_links.txt +1 -0
- e_valuator-0.1.0/src/e_valuator.egg-info/requires.txt +8 -0
- e_valuator-0.1.0/src/e_valuator.egg-info/top_level.txt +1 -0
- e_valuator-0.1.0/src/evaluator/__init__.py +4 -0
- e_valuator-0.1.0/src/evaluator/evaluator.py +328 -0
- e_valuator-0.1.0/src/evaluator/utils.py +62 -0
- e_valuator-0.1.0/tests/test_workflow.py +143 -0
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: e-valuator
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Sequential evaluator for LLM trajectories
|
|
5
|
+
Author: Shuvom Sadhuka
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/shuvom-s/e-valuator
|
|
8
|
+
Project-URL: Repository, https://github.com/shuvom-s/e-valuator
|
|
9
|
+
Requires-Python: >=3.9
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
Requires-Dist: numpy
|
|
12
|
+
Requires-Dist: pandas
|
|
13
|
+
Requires-Dist: scikit-learn
|
|
14
|
+
Requires-Dist: scipy
|
|
15
|
+
Requires-Dist: tqdm
|
|
16
|
+
Provides-Extra: dev
|
|
17
|
+
Requires-Dist: pytest; extra == "dev"
|
|
18
|
+
|
|
19
|
+
# E-valuator
|
|
20
|
+
Code for paper _E-valuator: Reliable Agent Verifiers with Sequential Hypothesis Testing_. We build a sequential evaluator that can convert any black-box verifier/agent system into one with statistical guarantees. At deployment time, our system can flag and terminate agent trajectories that are likely to be unsuccessful without access to anything but a verifier's (black-box) scores.
|
|
21
|
+
|
|
22
|
+
## Install
|
|
23
|
+
To start, please install our package:
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
pip install e-valuator
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
## Quick start
|
|
30
|
+
Once installed, you can boot up e-valuator with `from evaluator import EValuator`. We provide two demo notebooks (and corresponding datasets) in `demos/notebooks/hotpot_example.ipynb` (corresponding dataset in `data/hotpotqa_cleaned_w_scores.csv`) and `demos/notebooks/math_example_tokens.ipynb` (corresponding dataset in `data/math_cleaned_w_scores.csv`).
|
|
31
|
+
|
|
32
|
+
These notebooks provide examples of the input data format required and evaluation pipeline.
|
|
33
|
+
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
# E-valuator
|
|
2
|
+
Code for the paper _E-valuator: Reliable Agent Verifiers with Sequential Hypothesis Testing_. We build a sequential evaluator that can convert any black-box verifier/agent system into one with statistical guarantees. At deployment time, our system can flag and terminate agent trajectories that are likely to be unsuccessful, using nothing but a verifier's (black-box) scores.
|
|
3
|
+
|
|
4
|
+
## Install
|
|
5
|
+
To start, please install our package:
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pip install e-valuator
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Quick start
|
|
12
|
+
Once installed, you can boot up e-valuator with `from evaluator import EValuator`. We provide two demo notebooks (and corresponding datasets) in `demos/notebooks/hotpot_example.ipynb` (corresponding dataset in `data/hotpotqa_cleaned_w_scores.csv`) and `demos/notebooks/math_example_tokens.ipynb` (corresponding dataset in `data/math_cleaned_w_scores.csv`).
|
|
13
|
+
|
|
14
|
+
These notebooks provide examples of the required input data format and of the evaluation pipeline.
|
|
15
|
+
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
# PyPI project name (with hyphen)
|
|
7
|
+
name = "e-valuator"
|
|
8
|
+
|
|
9
|
+
# Note: the import name stays "evaluator"; only the PyPI project name is hyphenated.
|
|
10
|
+
version = "0.1.0"
|
|
11
|
+
description = "Sequential evaluator for LLM trajectories"
|
|
12
|
+
readme = "README.md"
|
|
13
|
+
requires-python = ">=3.9"
|
|
14
|
+
license = { text = "MIT" }
|
|
15
|
+
|
|
16
|
+
authors = [
|
|
17
|
+
{ name = "Shuvom Sadhuka" }
|
|
18
|
+
]
|
|
19
|
+
|
|
20
|
+
dependencies = [
|
|
21
|
+
"numpy",
|
|
22
|
+
"pandas",
|
|
23
|
+
"scikit-learn",
|
|
24
|
+
"scipy",
|
|
25
|
+
"tqdm"
|
|
26
|
+
]
|
|
27
|
+
|
|
28
|
+
[project.optional-dependencies]
|
|
29
|
+
dev = ["pytest"]
|
|
30
|
+
|
|
31
|
+
[tool.setuptools]
|
|
32
|
+
package-dir = {"" = "src"}
|
|
33
|
+
|
|
34
|
+
[tool.setuptools.packages.find]
|
|
35
|
+
where = ["src"]
|
|
36
|
+
|
|
37
|
+
[project.urls]
|
|
38
|
+
Homepage = "https://github.com/shuvom-s/e-valuator"
|
|
39
|
+
Repository = "https://github.com/shuvom-s/e-valuator"
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: e-valuator
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Sequential evaluator for LLM trajectories
|
|
5
|
+
Author: Shuvom Sadhuka
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/shuvom-s/e-valuator
|
|
8
|
+
Project-URL: Repository, https://github.com/shuvom-s/e-valuator
|
|
9
|
+
Requires-Python: >=3.9
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
Requires-Dist: numpy
|
|
12
|
+
Requires-Dist: pandas
|
|
13
|
+
Requires-Dist: scikit-learn
|
|
14
|
+
Requires-Dist: scipy
|
|
15
|
+
Requires-Dist: tqdm
|
|
16
|
+
Provides-Extra: dev
|
|
17
|
+
Requires-Dist: pytest; extra == "dev"
|
|
18
|
+
|
|
19
|
+
# E-valuator
|
|
20
|
+
Code for paper _E-valuator: Reliable Agent Verifiers with Sequential Hypothesis Testing_. We build a sequential evaluator that can convert any black-box verifier/agent system into one with statistical guarantees. At deployment time, our system can flag and terminate agent trajectories that are likely to be unsuccessful without access to anything but a verifier's (black-box) scores.
|
|
21
|
+
|
|
22
|
+
## Install
|
|
23
|
+
To start, please install our package:
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
pip install e-valuator
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
## Quick start
|
|
30
|
+
Once installed, you can boot up e-valuator with `from evaluator import EValuator`. We provide two demo notebooks (and corresponding datasets) in `demos/notebooks/hotpot_example.ipynb` (corresponding dataset in `data/hotpotqa_cleaned_w_scores.csv`) and `demos/notebooks/math_example_tokens.ipynb` (corresponding dataset in `data/math_cleaned_w_scores.csv`).
|
|
31
|
+
|
|
32
|
+
These notebooks provide examples of the input data format required and evaluation pipeline.
|
|
33
|
+
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
src/e_valuator.egg-info/PKG-INFO
|
|
4
|
+
src/e_valuator.egg-info/SOURCES.txt
|
|
5
|
+
src/e_valuator.egg-info/dependency_links.txt
|
|
6
|
+
src/e_valuator.egg-info/requires.txt
|
|
7
|
+
src/e_valuator.egg-info/top_level.txt
|
|
8
|
+
src/evaluator/__init__.py
|
|
9
|
+
src/evaluator/evaluator.py
|
|
10
|
+
src/evaluator/utils.py
|
|
11
|
+
tests/test_workflow.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
evaluator
|
|
@@ -0,0 +1,328 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import pandas as pd
|
|
3
|
+
from sklearn.linear_model import LogisticRegression
|
|
4
|
+
from sklearn.ensemble import RandomForestClassifier
|
|
5
|
+
from sklearn.preprocessing import StandardScaler
|
|
6
|
+
from scipy.stats import binom
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class EValuator:
    """
    Stepwise evaluator using density ratio estimation + e-values.

    For every step index a classifier is fitted on the judge-score prefix
    observed up to that step.  Its predicted class probabilities are turned
    into a density ratio Pr(S | Y=0) / Pr(S | Y=1), which is used as the
    e-value of the row.  Rows whose e-value exceeds a per-alpha threshold are
    marked as rejected.

    Args:
        model_type: "logistic" or "random_forest"
        mt_variant: "anytime", "split", or "both"
        alphas: list of alpha levels (defaults to [0.05])
        delta: tolerance-bound failure probability for split mode (1 - confidence)
        problem_col: column name for problem identifier (default "uq_problem_idx")
        step_col: column name for step index (default "num_steps")
        split_fraction: fraction of problems used for log-training in split mode
        random_state: RNG seed for the internal log/cal split

    Raises:
        AssertionError: if any argument fails validation.  The checks are
            explicit ``raise`` statements, so they are still enforced when
            Python runs with ``-O``.
    """

    def __init__(
        self,
        model_type: str = "logistic",
        mt_variant: str = "split",
        alphas=None,
        delta: float = 0.1,
        problem_col: str = "uq_problem_idx",
        step_col: str = "num_steps",
        split_fraction: float = 0.5,
        random_state: int = 42,
    ):
        self.model_type = model_type
        self.mt_variant = mt_variant
        self.alphas = alphas if alphas is not None else [0.05]
        self.delta = delta
        self.problem_col = problem_col
        self.step_col = step_col
        self.split_fraction = split_fraction
        self.random_state = random_state

        self._require(isinstance(self.problem_col, str), "problem_col must be a string.")
        self._require(isinstance(self.step_col, str), "step_col must be a string.")

        self._require(len(self.alphas) > 0, "Must specify at least one alpha.")
        for a in self.alphas:
            self._require(0 < a < 1, f"Alpha must be in (0,1). Got {a}")

        self._require(0 < self.delta < 1, f"Delta must be in (0,1). Got {self.delta}")
        self._require(
            0 < self.split_fraction < 1,
            f"split_fraction must be in (0,1). Got {self.split_fraction}",
        )
        self._require(
            self.mt_variant in {"anytime", "split", "both"},
            "mt_variant must be 'anytime', 'split', or 'both'. "
            f"Got {self.mt_variant}",
        )

        self._reset_fitted_state()

    @staticmethod
    def _require(condition, message):
        """Raise ``AssertionError(message)`` unless ``condition`` is truthy.

        Used instead of the ``assert`` statement so that argument validation
        is not stripped under ``python -O`` while callers that catch
        ``AssertionError`` keep working.
        """
        if not condition:
            raise AssertionError(message)

    def _reset_fitted_state(self):
        """(Re)initialise everything that ``fit`` learns."""
        ## Model families for anytime and split
        self.step_models_anytime = {}
        self.step_scalers_anytime = {}
        self.step_base_probs_anytime = {}
        self.max_trained_step_anytime = 0

        self.step_models_split = {}
        self.step_scalers_split = {}
        self.step_base_probs_split = {}
        self.max_trained_step_split = 0

        ## For "anytime" or "split": dict[alpha] -> threshold
        ## For "both": {"anytime": {alpha: thr}, "split": {alpha: thr}}
        self.thresholds = {}

    def _variant_state(self, which: str):
        """Return ``(models, scalers, base_probs)`` dicts for a variant.

        Raises:
            ValueError: if ``which`` is neither "anytime" nor "split".
        """
        if which == "anytime":
            return (
                self.step_models_anytime,
                self.step_scalers_anytime,
                self.step_base_probs_anytime,
            )
        if which == "split":
            return (
                self.step_models_split,
                self.step_scalers_split,
                self.step_base_probs_split,
            )
        raise ValueError(f"Unknown which={which}")

    def _new_model(self):
        """Create an unfitted classifier according to ``self.model_type``.

        Raises:
            ValueError: if ``self.model_type`` is not a supported family.
        """
        if self.model_type == "logistic":
            return LogisticRegression(max_iter=200)
        elif self.model_type == "random_forest":
            return RandomForestClassifier(n_estimators=200)
        raise ValueError(f"Unknown model type: {self.model_type}")

    def _upper_tolerance_bound(self, values, alpha, delta):
        """
        Distribution-free one-sided upper bound for the (1−alpha)-quantile
        with confidence (1−delta).

        Returns the k-th smallest value for the smallest k (1-based) such
        that Pr(Binomial(n, 1−alpha) >= k) <= delta.

        Fallbacks (kept from the original behaviour):
            * no values at all -> 0.0
            * no k satisfies the condition (too few samples for the requested
              alpha/delta) -> the sample maximum.
              NOTE(review): in this case the sample maximum is merely the
              largest available order statistic; the binomial condition is
              not met, so the stated confidence is presumably not
              guaranteed - confirm whether +inf would be the intended
              conservative choice.
        """
        Xs = np.sort(np.asarray(values))
        n = Xs.size
        if n == 0:
            return 0.0

        p = 1.0 - alpha

        # binom.sf(k - 1, n, p) == Pr(Binomial(n, p) >= k) for k = 1..n,
        # evaluated for all k in one vectorised call.
        tail = binom.sf(np.arange(n), n, p)
        hits = np.flatnonzero(tail <= delta)
        if hits.size:
            return float(Xs[hits[0]])
        return float(Xs[-1])

    def _fit_step_models(self, df: pd.DataFrame, which: str):
        """
        Fit one density-ratio model per step on df, for the given variant.

        ``which`` should be one of {"anytime", "split"}.

        For every step from 1 to the largest step in ``df`` a scaler and a
        classifier are fitted on the rows of that step, and the empirical
        class frequencies of "solved" are stored as base probabilities.
        Steps after the first that lack two classes with at least 5 examples
        each are skipped.

        Raises:
            ValueError: if ``which`` is unknown.
            AssertionError: if step 1 does not have two classes with at
                least 5 examples each (including when ``df`` has no step-1
                rows at all).
        """
        models, scalers, base_probs = self._variant_state(which)

        raw_max = df[self.step_col].max()
        # Always inspect step 1, even for an empty frame or one whose largest
        # step is below 1, so that unusable data fails loudly instead of
        # leaving the evaluator silently untrained.
        max_steps = 1 if pd.isna(raw_max) else max(int(raw_max), 1)

        for step in range(1, max_steps + 1):
            step_df = df[df[self.step_col] == step]
            y = step_df["solved"].values
            uniq, counts = np.unique(y, return_counts=True)

            if len(uniq) < 2 or np.min(counts) < 5:
                if step == 1:
                    raise AssertionError("Step 1 requires ≥5 examples per class.")
                continue

            X = np.array([list(s) for s in step_df["judge_probability_series"].values])
            scaler = StandardScaler()
            Xs = scaler.fit_transform(X)
            model = self._new_model()
            model.fit(Xs, y)

            p1 = float(np.mean(y))
            p0 = 1.0 - p1

            models[step] = model
            scalers[step] = scaler
            base_probs[step] = {"p0": p0, "p1": p1}
            if which == "anytime":
                self.max_trained_step_anytime = max(self.max_trained_step_anytime, step)
            else:
                self.max_trained_step_split = max(self.max_trained_step_split, step)

    def _compute_e_vals_for_variant(self, df: pd.DataFrame, which: str) -> np.ndarray:
        """
        Compute density ratios per row for the requested variant ("anytime" or "split").

        Rows are processed in the order they appear in ``df``.  When no model
        was trained for a row's step, the most recent ratio already computed
        for the same problem is reused (1.0 if there is none yet).

        Returns:
            A float array with one entry per row of ``df``, aligned by row
            position (independent of the DataFrame index labels).

        Raises:
            ValueError: if ``which`` is unknown.
        """
        models, scalers, base_probs = self._variant_state(which)

        e_vals = np.full(len(df), np.nan, dtype=float)
        last_ratio = {}

        rows = zip(
            df[self.problem_col].tolist(),
            df[self.step_col].tolist(),
            df["judge_probability_series"].tolist(),
        )
        # Index by row *position*: the DataFrame index may hold arbitrary
        # labels (e.g. after filtering without reset_index), which must not
        # be used to address the output array.
        for pos, (pid, raw_step, raw_series) in enumerate(rows):
            step = int(raw_step)
            ratio = last_ratio.get(pid, 1.0)

            if step in models:
                series = np.array(raw_series, dtype=float)
                X = series.reshape(1, -1)

                Xs = scalers[step].transform(X)
                proba = models[step].predict_proba(Xs)[0]
                # Clip to keep the ratio finite.
                p1_s = np.clip(proba[1], 1e-6, 1 - 1e-6)
                p0_s = np.clip(proba[0], 1e-6, 1 - 1e-6)

                base = base_probs[step]
                p0 = base["p0"]
                p1 = base["p1"]

                ## Density ratio (Bayes' rule):
                ##   Pr(S | Y=0) / Pr(S | Y=1)
                ##     = (Pr(Y=0 | S) / Pr(Y=1 | S)) * (Pr(Y=1) / Pr(Y=0))
                ##     = (p0_s / p1_s) * (p1 / p0)
                ratio = (p0_s / p1_s) * (p1 / p0)

            last_ratio[pid] = ratio
            e_vals[pos] = ratio

        return e_vals

    def _compute_split_thresholds(self, calib_df: pd.DataFrame) -> dict:
        """
        Use the held-out calibration split to get tolerance-bound thresholds
        for split_e_val.

        For every solved problem (rows with ``solved == 1``) the maximum
        split e-value over its rows is taken; the threshold for each alpha is
        the upper tolerance bound of those maxima.  With no solved problems
        every threshold is 0.0.

        Returns:
            dict mapping each alpha to its threshold.
        """
        tmp = calib_df.copy()
        tmp["split_e_val"] = self._compute_e_vals_for_variant(tmp, which="split")
        solved = tmp[tmp["solved"] == 1]

        # Per-problem maximum; problems without any usable value are dropped.
        per_problem_max = (
            pd.to_numeric(solved["split_e_val"], errors="coerce")
            .groupby(solved[self.problem_col])
            .max()
            .dropna()
            .tolist()
        )

        # _upper_tolerance_bound returns 0.0 for an empty sample.
        return {
            a: self._upper_tolerance_bound(per_problem_max, alpha=a, delta=self.delta)
            for a in self.alphas
        }

    def fit(self, calib_df: pd.DataFrame):
        """
        Fit one model per step and compute per-alpha thresholds.

        Any state left over from a previous call to ``fit`` is discarded
        first, so stale per-step models can never leak into a new fit.

        calib_df must contain:
          - self.step_col
          - "solved"
          - "judge_probability_series"
          - self.problem_col (for split/both threshold computation)

        Raises:
            AssertionError: if step 1 lacks two classes with >= 5 examples.
        """
        calib_df = calib_df.copy()
        self._reset_fitted_state()

        ## Anytime-only: train on full calib, Ville thresholds 1/alpha
        if self.mt_variant == "anytime":
            self._fit_step_models(calib_df, which="anytime")
            self.thresholds = {a: 1.0 / a for a in self.alphas}
            return

        ## For split version, first build a log/cal split by problem
        unique_ids = calib_df[self.problem_col].unique()
        rng = np.random.default_rng(self.random_state)
        n_log = max(1, int(len(unique_ids) * self.split_fraction))
        log_ids = rng.choice(unique_ids, size=n_log, replace=False)
        cal_ids = np.setdiff1d(unique_ids, log_ids)

        log_df = calib_df[calib_df[self.problem_col].isin(log_ids)].reset_index(drop=True)
        cal_df = calib_df[calib_df[self.problem_col].isin(cal_ids)].reset_index(drop=True)

        if self.mt_variant == "split":
            ## Split-only: train on one half, thresholds from held-out other half
            self._fit_step_models(log_df, which="split")
            self.thresholds = self._compute_split_thresholds(cal_df)
            return

        ## mt_variant == "both"
        ## Anytime side: train on full calib set, thresholds 1/alpha
        self._fit_step_models(calib_df, which="anytime")
        anytime_thresholds = {a: 1.0 / a for a in self.alphas}

        ## Split side: train on one half, thresholds from held-out other half
        self._fit_step_models(log_df, which="split")
        split_thresholds = self._compute_split_thresholds(cal_df)

        self.thresholds = {
            "anytime": anytime_thresholds,
            "split": split_thresholds,
        }

    def apply(self, df: pd.DataFrame, compute_rejects: bool = True) -> pd.DataFrame:
        """
        Compute e-values per row.

        df must contain:
          - self.step_col
          - self.problem_col
          - "judge_probability_series"

        The index of ``df`` may be arbitrary; it is preserved in the result.

        Returns a copy of df with:
          - "anytime_e_val" if mt_variant in {"anytime", "both"}
          - "split_e_val"   if mt_variant in {"split", "both"}
          - reject columns (if compute_rejects=True), where "." in the alpha
            is replaced by "_" (e.g. alpha 0.05 -> "..._alpha_0_05"):
              * "reject_anytime_alpha_{a}" for anytime/both
              * "reject_split_alpha_{a}"   for split/both
        """
        df = df.copy()

        variants = [v for v in ("anytime", "split") if self.mt_variant in {v, "both"}]

        for variant in variants:
            df[f"{variant}_e_val"] = self._compute_e_vals_for_variant(df, which=variant)

        if not compute_rejects:
            return df

        # Alphas in the outer loop keeps the original column order
        # (anytime/split interleaved per alpha when mt_variant == "both").
        for a in self.alphas:
            base = str(a).replace(".", "_")
            for variant in variants:
                if self.mt_variant == "both":
                    thr = self.thresholds[variant][a]
                else:
                    thr = self.thresholds[a]
                df[f"reject_{variant}_alpha_{base}"] = df[f"{variant}_e_val"] > thr

        return df
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import numpy as np
|
|
3
|
+
from tqdm import tqdm
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def add_judge_probability_series(
    df: pd.DataFrame,
    prob_col: str = "judge_probability",
    problem_col: str = "uq_problem_idx",
    step_col: str = "num_steps",
    show_progress: bool = True,
) -> pd.DataFrame:
    """
    Add a column `judge_probability_series`, where each entry is a list of
    judge probabilities from step 1 up to that row's step for the same problem.

    The series for a row contains one probability per distinct step value
    that is <= the row's step, ordered by step.  If a (problem, step) pair
    occurs more than once, the probability of the first such row (in the
    row order of `df`) is used.  Rows whose problem id or step is missing
    get an empty list.

    Each problem is grouped and sorted once, so the work grows with the
    number of rows rather than with its square.

    Args:
        df:
            Input DataFrame. Must contain:
              - problem_col (default: "uq_problem_idx")
              - step_col    (default: "num_steps")
              - prob_col    (default: "judge_probability")
        prob_col:
            Column containing the per-step judge probability.
        problem_col:
            Column identifying the problem / trajectory.
        step_col:
            Column giving the step index (1, 2, 3, ...).
        show_progress:
            If True (default) a tqdm progress bar is shown; it advances once
            per problem.

    Returns:
        A copy of df with an extra column:
          - judge_probability_series: list of probabilities from step 1
            up to the current step for that (problem_col).

    Raises:
        KeyError: if any of the three required columns is missing.
    """
    if prob_col not in df.columns:
        raise KeyError(f"Probability column '{prob_col}' not found in df.")
    if problem_col not in df.columns:
        raise KeyError(f"problem_col '{problem_col}' not found in df.")
    if step_col not in df.columns:
        raise KeyError(f"step_col '{step_col}' not found in df.")

    df = df.copy()

    problems = df[problem_col].tolist()
    steps = df[step_col].tolist()
    probs = df[prob_col].tolist()

    # Default for rows that cannot be matched (missing problem id or step).
    series_out = [[] for _ in range(len(df))]

    # Row positions per problem, in original row order.
    rows_by_problem = {}
    for pos, (pid, step) in enumerate(zip(problems, steps)):
        if pd.isna(pid) or pd.isna(step):
            continue
        rows_by_problem.setdefault(pid, []).append(pos)

    groups = rows_by_problem.values()
    if show_progress:
        groups = tqdm(
            groups,
            total=len(rows_by_problem),
            desc="Building judge_probability_series",
        )

    for positions in groups:
        # list.sort is stable, so among rows sharing a step the earliest row
        # comes first and supplies the probability for that step.
        positions.sort(key=steps.__getitem__)

        prefix = []
        last_step = None
        for pos in positions:
            step = steps[pos]
            if not prefix or step != last_step:
                prefix.append(probs[pos])
                last_step = step
            # Copy so that every row owns an independent list.
            series_out[pos] = list(prefix)

    df["judge_probability_series"] = series_out
    return df
|
|
62
|
+
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
# tests/test_workflow.py
|
|
2
|
+
|
|
3
|
+
import sys
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
import numpy as np
|
|
7
|
+
import pandas as pd
|
|
8
|
+
import pytest
|
|
9
|
+
|
|
10
|
+
# Put the repository's ``src`` directory at the front of the import path so
# the tests exercise the working tree even when the package is not installed.
ROOT = Path(__file__).resolve().parents[1]
SRC = ROOT / "src"
_src_entry = str(SRC)
if _src_entry not in sys.path:
    sys.path.insert(0, _src_entry)
|
|
14
|
+
|
|
15
|
+
from evaluator import EValuator
|
|
16
|
+
from evaluator.utils import add_judge_probability_series
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def test_invalid_alpha_raises():
    """The constructor must reject empty alpha lists and alphas outside (0, 1)."""
    invalid_alpha_lists = (
        [0.0],         # alpha <= 0
        [1.0],         # alpha >= 1
        [-0.1, 0.05],  # one entry outside (0, 1)
        [],            # empty alphas
    )
    for bad_alphas in invalid_alpha_lists:
        with pytest.raises(AssertionError):
            EValuator(alphas=bad_alphas)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def test_invalid_delta_raises():
    """The constructor must reject a delta that is not strictly inside (0, 1)."""
    for bad_delta in (0.0, 1.5):
        with pytest.raises(AssertionError):
            EValuator(delta=bad_delta)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def test_invalid_mt_variant_raises():
    """An unrecognised mt_variant string must be rejected at construction time."""
    bogus_variant = "not_a_valid_variant"
    with pytest.raises(AssertionError):
        EValuator(mt_variant=bogus_variant)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def test_insufficient_data_step1_raises():
    """
    Fitting must fail when step 1 lacks two classes with >= 5 examples each.

    The calibration frame built here has only four rows, all of them solved,
    so step 1 has a single class and too few examples.
    """
    n_rows = 4
    calib_df = pd.DataFrame(
        {
            "uq_problem_idx": [f"p{i}" for i in range(n_rows)],
            "num_steps": [1] * n_rows,
            "solved": [1] * n_rows,
            "judge_probability_series": [[0.5]] * n_rows,
        }
    )

    evaluator = EValuator(mt_variant="anytime")

    with pytest.raises(AssertionError):
        evaluator.fit(calib_df)
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
@pytest.mark.slow
@pytest.mark.parametrize("mt_variant", ["anytime", "split", "both"])
def test_evaluator_workflow_from_csv(mt_variant):
    """
    End-to-end workflow on the HotpotQA demo CSV for every mt_variant.

    Loads the data, builds the probability series, splits problems 80/20
    into calibration and test sets, fits the evaluator, and checks the shape
    of the thresholds, the e-value columns and the reject columns.  The test
    is skipped when the CSV is not available.
    """
    data_path = ROOT / "data" / "hotpotqa_cleaned_w_scores.csv"
    if not data_path.exists():
        pytest.skip(f"Data file not found: {data_path}")

    # Load raw data
    df = pd.read_csv(data_path)
    assert "uq_problem_idx" in df.columns, "Expected 'uq_problem_idx' in CSV"
    assert "num_steps" in df.columns, "Expected 'num_steps' in CSV"
    assert "solved" in df.columns, "Expected 'solved' in CSV"
    assert "judge_probability" in df.columns or "judge_probability_expanded" in df.columns

    # Add judge_probability_series
    df = add_judge_probability_series(df)
    assert "judge_probability_series" in df.columns
    # Every entry should be a non-empty list
    assert df["judge_probability_series"].apply(lambda x: isinstance(x, list) and len(x) > 0).all()

    # Split into calibration / test by uq_problem_idx
    unique_ids = df["uq_problem_idx"].unique()
    rng = np.random.default_rng(42)
    n_cal = int(0.8 * len(unique_ids))
    cal_ids = rng.choice(unique_ids, size=n_cal, replace=False)
    test_ids = np.setdiff1d(unique_ids, cal_ids)

    cal_df = df[df["uq_problem_idx"].isin(cal_ids)].reset_index(drop=True)
    test_df = df[df["uq_problem_idx"].isin(test_ids)].reset_index(drop=True)

    assert len(cal_df) > 0
    assert len(test_df) > 0

    # Instantiate evaluator
    ev = EValuator(
        model_type="logistic",
        mt_variant=mt_variant,
        alphas=[0.01, 0.05, 0.1],
        delta=0.05,
    )

    # Fit on calibration set
    ev.fit(cal_df)

    # Basic sanity on thresholds
    if mt_variant == "anytime":
        assert isinstance(ev.thresholds, dict)
        assert set(ev.thresholds.keys()) == set(ev.alphas)
        for a, thr in ev.thresholds.items():
            assert thr == pytest.approx(1.0 / a)
    elif mt_variant == "split":
        assert isinstance(ev.thresholds, dict)
        assert set(ev.thresholds.keys()) == set(ev.alphas)
        # All thresholds should be non-negative
        for thr in ev.thresholds.values():
            assert thr >= 0.0
    else:  # "both"
        assert isinstance(ev.thresholds, dict)
        assert set(ev.thresholds.keys()) == {"anytime", "split"}
        assert set(ev.thresholds["anytime"].keys()) == set(ev.alphas)
        assert set(ev.thresholds["split"].keys()) == set(ev.alphas)

    # Apply to test set
    test_with_scores = ev.apply(test_df)
    assert len(test_with_scores) == len(test_df)

    # Reject columns are named after the alpha with "." replaced by "_"
    alpha_suffixes = [str(a).replace(".", "_") for a in ev.alphas]

    ## Check columns by variant
    if mt_variant in {"anytime", "both"}:
        assert "anytime_e_val" in test_with_scores.columns
        ## e-values should be positive
        assert (test_with_scores["anytime_e_val"] > 0).all()
        for suffix in alpha_suffixes:
            col = f"reject_anytime_alpha_{suffix}"
            assert col in test_with_scores.columns
            assert test_with_scores[col].dtype == bool

    if mt_variant in {"split", "both"}:
        assert "split_e_val" in test_with_scores.columns
        assert (test_with_scores["split_e_val"] > 0).all()
        for suffix in alpha_suffixes:
            col = f"reject_split_alpha_{suffix}"
            assert col in test_with_scores.columns
            assert test_with_scores[col].dtype == bool
|
|
143
|
+
|