agent-eval 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agent_eval-0.1.1.dist-info/METADATA +74 -0
- agent_eval-0.1.1.dist-info/RECORD +16 -0
- agent_eval-0.1.1.dist-info/WHEEL +5 -0
- agent_eval-0.1.1.dist-info/entry_points.txt +2 -0
- agent_eval-0.1.1.dist-info/top_level.txt +1 -0
- agenteval/__init__.py +14 -0
- agenteval/cli.py +412 -0
- agenteval/config.py +87 -0
- agenteval/dataset_features.yml +71 -0
- agenteval/io.py +38 -0
- agenteval/log.py +58 -0
- agenteval/models.py +89 -0
- agenteval/schema_generator.py +95 -0
- agenteval/score.py +152 -0
- agenteval/summary.py +110 -0
- agenteval/upload.py +175 -0
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: agent-eval
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: Agent evaluation toolkit
|
|
5
|
+
Project-URL: Homepage, https://github.com/allenai/agent-eval
|
|
6
|
+
Requires-Python: >=3.10
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
Requires-Dist: click
|
|
9
|
+
Requires-Dist: inspect-ai
|
|
10
|
+
Requires-Dist: litellm
|
|
11
|
+
Requires-Dist: pydantic>=2.0.0
|
|
12
|
+
Requires-Dist: huggingface_hub
|
|
13
|
+
Requires-Dist: pyarrow
|
|
14
|
+
Requires-Dist: datasets
|
|
15
|
+
Provides-Extra: dev
|
|
16
|
+
Requires-Dist: black==24.2.0; extra == "dev"
|
|
17
|
+
Requires-Dist: isort; extra == "dev"
|
|
18
|
+
Requires-Dist: autoflake; extra == "dev"
|
|
19
|
+
Requires-Dist: pytest; extra == "dev"
|
|
20
|
+
Requires-Dist: pytest-asyncio; extra == "dev"
|
|
21
|
+
Requires-Dist: mypy==1.15; extra == "dev"
|
|
22
|
+
Requires-Dist: types-PyYAML; extra == "dev"
|
|
23
|
+
Requires-Dist: types-setuptools; extra == "dev"
|
|
24
|
+
|
|
25
|
+
# agent-eval
|
|
26
|
+
|
|
27
|
+
A utility for evaluating agents on a suite of [Inspect](https://github.com/UKGovernmentBEIS/inspect_ai)-formatted evals, with the following primary benefits:
|
|
28
|
+
1. Task suite specifications as config.
|
|
29
|
+
2. Extracts the token usage of the agent from log files, and computes cost using `litellm`.
|
|
30
|
+
3. Submits task suite results to a leaderboard, with submission metadata and easy upload to a HuggingFace repo for distribution of scores and logs.
|
|
31
|
+
|
|
32
|
+
# Installation
|
|
33
|
+
|
|
34
|
+
To install from pypi, use `pip install agent-eval`.
|
|
35
|
+
|
|
36
|
+
# Usage
|
|
37
|
+
|
|
38
|
+
## Run evaluation suite
|
|
39
|
+
```shell
|
|
40
|
+
agenteval eval --config-path CONFIG_PATH --split SPLIT LOG_DIR
|
|
41
|
+
```
|
|
42
|
+
Evaluate an agent on the supplied eval suite configuration. Results are written to `agenteval.json` in the log directory.
|
|
43
|
+
|
|
44
|
+
See [sample-config.yml](sample-config.yml) for a sample configuration file.
|
|
45
|
+
|
|
46
|
+
For aggregation in a leaderboard, each task specifies a `primary_metric` as `{scorer_name}/{metric_name}`.
|
|
47
|
+
The scoring utils will look for a corresponding stderr metric,
|
|
48
|
+
by looking for another metric with the same `scorer_name` and with a `metric_name` containing the string "stderr".
|
|
49
|
+
|
|
50
|
+
## Score results
|
|
51
|
+
```shell
|
|
52
|
+
agenteval score [OPTIONS] LOG_DIR
|
|
53
|
+
```
|
|
54
|
+
Compute scores for the results in `agenteval.json` and update the file with the computed scores.
|
|
55
|
+
|
|
56
|
+
## Publish scores
|
|
57
|
+
```shell
|
|
58
|
+
agenteval publish [OPTIONS] LOG_DIR
|
|
59
|
+
```
|
|
60
|
+
Upload the scored results to HuggingFace datasets.
|
|
61
|
+
|
|
62
|
+
# Administer the HuggingFace datasets
|
|
63
|
+
Prior to publishing scores, two HuggingFace datasets should be set up, one for full submissions and one for results files.
|
|
64
|
+
|
|
65
|
+
If you want to call `load_dataset()` on the results dataset (e.g., for populating a leaderboard), you probably want to explicitly tell HuggingFace about the schema and dataset structure (otherwise, HuggingFace may fail to properly auto-convert to Parquet).
|
|
66
|
+
This is done by updating the `configs` attribute in the YAML metadata block at the top of the `README.md` file at the root of the results dataset (the metadata block is identified by lines with just `---` above and below it).
|
|
67
|
+
This attribute should contain a list of configs, each of which specifies the schema (under the `features` key) and dataset structure (under the `data_files` key).
|
|
68
|
+
See [sample-config-hf-readme-metadata.yml](sample-config-hf-readme-metadata.yml) for a sample metadata block corresponding to [sample-config.yml](sample-config.yml) (note that the metadata references the [raw schema data](src/agenteval/dataset_features.yml), which must be copied).
|
|
69
|
+
|
|
70
|
+
To facilitate initializing new configs, `agenteval publish` will automatically add this metadata if it is missing.
|
|
71
|
+
|
|
72
|
+
# Development
|
|
73
|
+
|
|
74
|
+
See [Development.md](Development.md) for development instructions.
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
agenteval/__init__.py,sha256=nEm7fcpqtTId-83MMLiVDKkszvaGEeFUQ_IxrdA63bo,372
|
|
2
|
+
agenteval/cli.py,sha256=zzx3aceprFcQQ2fNUGBbEZ7RLt3e_gWGoR8OPDgDXaU,13767
|
|
3
|
+
agenteval/config.py,sha256=jrqyAPfC728ymfYUzwWJehzeW8Dpnsmpn5utAySz0uo,2294
|
|
4
|
+
agenteval/dataset_features.yml,sha256=UVsQUxohyjD9YZGw_HmIg6RFbdrkUQRZvmQSADffPS0,1439
|
|
5
|
+
agenteval/io.py,sha256=QjxS2Ta_-gWRBLWOog0sX2Q-WlXmsD5iF3dsVktETR0,1004
|
|
6
|
+
agenteval/log.py,sha256=lJoGkeRpuo017vAAM77mP9BIRexpWuIE8JwIKCVNOto,1958
|
|
7
|
+
agenteval/models.py,sha256=UMGas9wgLT9nM1cyaVEh5PwkUURYSb02dnJDmRXI4Vk,2671
|
|
8
|
+
agenteval/schema_generator.py,sha256=S36u6SSjFbAdcDqJvuzTQkFF2PG7Yq8A0diUsB8yWJg,3037
|
|
9
|
+
agenteval/score.py,sha256=Pb3PofFdW0IobvZu-tv7313u_kTX3tgGw6nmlliMTak,4803
|
|
10
|
+
agenteval/summary.py,sha256=LzkC2ZDf0MNLlXzL3bugAdv3VLkzCMBP85_52fsnToA,3599
|
|
11
|
+
agenteval/upload.py,sha256=B17OK_ZD7q82-MjKCI63NQozB7mksaLknIdqm5ETAAU,5827
|
|
12
|
+
agent_eval-0.1.1.dist-info/METADATA,sha256=mzwemf3uIdIpFHMpkJPNE7MEtiUNA_G4a0XSlcQzjkQ,3447
|
|
13
|
+
agent_eval-0.1.1.dist-info/WHEEL,sha256=DnLRTWE75wApRYVsjgc6wsVswC54sMSJhAEd4xhDpBk,91
|
|
14
|
+
agent_eval-0.1.1.dist-info/entry_points.txt,sha256=DxxfiIbldqK82WRgaMOL4BjPJcnr7JOkDYTch6xNahs,48
|
|
15
|
+
agent_eval-0.1.1.dist-info/top_level.txt,sha256=k6izISLxVoNnLxZHqnS3X-0eDdUD8LsV-OoM0afYdew,10
|
|
16
|
+
agent_eval-0.1.1.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
agenteval
|
agenteval/__init__.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
"""Public API surface for the agenteval package."""

from importlib.metadata import version as get_version

# Resolve the installed distribution's version at import time so
# `agenteval.__version__` always matches the installed wheel.
__version__ = get_version("agent-eval")

from .score import process_eval_logs
from .summary import compute_summary_statistics
from .upload import upload_folder_to_hf, upload_summary_to_hf

# Names re-exported as the package's public API.
__all__ = [
    "process_eval_logs",
    "compute_summary_statistics",
    "upload_folder_to_hf",
    "upload_summary_to_hf",
]
|
agenteval/cli.py
ADDED
|
@@ -0,0 +1,412 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
import json
|
|
3
|
+
import os
|
|
4
|
+
import subprocess
|
|
5
|
+
from datetime import datetime, timezone
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
import click
|
|
9
|
+
|
|
10
|
+
from .config import load_suite_config
|
|
11
|
+
from .models import EvalConfig, EvalResult
|
|
12
|
+
from .score import process_eval_logs
|
|
13
|
+
from .summary import compute_summary_statistics
|
|
14
|
+
from .upload import sanitize_path_component, upload_folder_to_hf, upload_summary_to_hf
|
|
15
|
+
|
|
16
|
+
EVAL_FILENAME = "agenteval.json"
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def verify_git_reproducibility(ignore_git: bool) -> None:
    """Abort unless the current git state is reproducible.

    Verifies that the working tree has no uncommitted changes and that the
    current commit is reachable from a remote-tracking branch, so the
    origin/sha metadata Inspect records points at code others can fetch.
    Untracked files only produce a warning.

    Args:
        ignore_git: When True, skip all checks and return immediately.

    Raises:
        click.ClickException: If the worktree is dirty, the commit is not on
            the remote, or git status cannot be determined at all.
    """
    if ignore_git:
        return
    try:
        # Get current commit SHA and origin. check=True raises on a non-zero
        # exit, so stdout is valid whenever execution reaches past these calls
        # (the original's `returncode == 0` re-checks were dead code).
        sha = subprocess.run(
            ["git", "rev-parse", "--short", "HEAD"],
            capture_output=True,
            text=True,
            check=True,
        ).stdout.strip()
        origin = subprocess.run(
            ["git", "remote", "get-url", "origin"],
            capture_output=True,
            text=True,
            check=True,
        ).stdout.strip()

        # Check for a dirty working directory.
        # NOTE(review): `git diff --quiet` only sees unstaged changes; staged
        # but uncommitted changes pass this check — confirm whether that is
        # intended (comparing against HEAD would catch both).
        git_dirty = (
            subprocess.run(
                ["git", "diff", "--quiet", "--exit-code"],
                capture_output=True,
                check=False,
            ).returncode
            != 0
        )

        # Warn (but do not fail) about untracked, non-ignored files.
        untracked_files = subprocess.run(
            ["git", "ls-files", "--others", "--exclude-standard"],
            capture_output=True,
            text=True,
            check=True,
        ).stdout.strip().splitlines()
        if untracked_files:
            click.echo(
                f"Warning: Untracked files present: {', '.join(untracked_files)}. "
                "For reproducibility, please add, ignore, or remove these files."
            )

        # Abort if the worktree is dirty.
        if git_dirty:
            raise click.ClickException(
                f"Git working directory contains uncommitted changes. "
                f"For reproducibility, Inspect will save: origin={origin}, sha={sha}. "
                "Please commit your changes or use --ignore-git to bypass this check (not recommended)."
            )

        # Verify the commit is reachable from at least one remote branch.
        remote_exists = subprocess.run(
            ["git", "branch", "-r", "--contains", sha],
            capture_output=True,
            text=True,
            check=True,
        ).stdout.strip()
        if not remote_exists:
            raise click.ClickException(
                f"Commit {sha} not found on remote '{origin}'. Others won't be able to "
                "access this code version. Please push your changes or use --ignore-git "
                "to bypass this check (not recommended)."
            )
    except (subprocess.SubprocessError, FileNotFoundError) as e:
        # ClickExceptions raised above are not subprocess errors, so they
        # propagate untouched (the original's isinstance re-raise here was
        # unreachable). This branch only handles git invocation failures.
        raise click.ClickException(
            f"Unable to verify git status for reproducibility: {e}. "
            "Use --ignore-git to bypass this check if git is not available."
        ) from e
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
@click.group()
def cli():
    """Top-level command group for the agenteval CLI (eval / score / publish)."""
    pass
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
@click.command(
    name="score",
    help="Score a directory of evaluation logs.",
)
@click.argument("log_dir", type=click.Path(exists=True, file_okay=False))
@click.option(
    "--config-path",
    "config_path",
    type=str,
    help=f"Path to a yml config file. Ignored if {EVAL_FILENAME} exists.",
    default=None,
)
@click.option(
    "--split",
    type=str,
    help=f"Config data split. Ignored if {EVAL_FILENAME} exists.",
    default=None,
)
def score_command(
    log_dir: str,
    config_path: str | None,
    split: str | None,
):
    """Score the eval logs in LOG_DIR and persist results to agenteval.json.

    If an agenteval.json already exists in LOG_DIR it is loaded (and any
    --config-path/--split are only cross-checked against it); otherwise both
    options are required to build a fresh EvalResult.
    """
    # Load or create EvalResult and process logs (inlined from processor)
    json_path = Path(log_dir) / EVAL_FILENAME
    if json_path.exists():
        try:
            raw = json_path.read_text(encoding="utf-8")
            eval_result = EvalResult.model_validate_json(raw)
        except Exception as e:
            raise click.ClickException(
                f"Failed to load existing '{EVAL_FILENAME}' at {json_path}: {e}"
            )
        if config_path:
            # CLI config is only compared for a version mismatch warning; the
            # JSON's embedded suite config remains authoritative.
            try:
                cli_cfg = load_suite_config(config_path)
                if cli_cfg.version != eval_result.suite_config.version:
                    click.echo(
                        f"Warning: CLI config version '{cli_cfg.version}' "
                        f"does not match JSON config version '{eval_result.suite_config.version}'."
                    )
            except Exception as e:
                click.echo(
                    f"Warning: could not load CLI config '{config_path}' for comparison: {e}"
                )
        # A split mismatch is fatal: scores would be attributed to the wrong split.
        if split and split != eval_result.split:
            raise click.ClickException(
                f"Split mismatch: JSON split '{eval_result.split}' != CLI split '{split}'"
            )
    else:
        if not config_path or not split:
            raise click.ClickException(
                "--config-path and --split must be provided when no existing result JSON"
            )
        suite_cfg = load_suite_config(config_path)
        eval_result = EvalResult(suite_config=suite_cfg, split=split)

    # Extract per-task results and run specs from the Inspect log files.
    task_results, eval_specs = process_eval_logs(log_dir)
    eval_result.eval_specs = eval_specs
    eval_result.results = task_results

    # Warn if multiple evaluation specs present (logs may come from mixed runs)
    if eval_result.eval_specs and len(eval_result.eval_specs) > 1:
        click.echo(
            f"Warning: Found {len(eval_result.eval_specs)} different eval specs. "
            "Logs may come from mixed runs."
        )

    # Warn about any missing tasks
    missing_tasks = eval_result.find_missing_tasks()
    if missing_tasks:
        click.echo(f"Warning: Missing tasks in result set: {', '.join(missing_tasks)}")

    # Compute and display summary statistics
    stats = compute_summary_statistics(
        eval_result.suite_config,
        eval_result.split,
        eval_result.results or [],
    )
    click.echo("Summary statistics:")
    click.echo(json.dumps({k: v.model_dump() for k, v in stats.items()}, indent=2))

    # Persist updated EvalResult JSON
    eval_result.save_json(Path(log_dir) / EVAL_FILENAME)

    click.echo(f"Saved results to {log_dir}/{EVAL_FILENAME}")
    # Use the parent group's invocation name so the hint matches however the
    # CLI was launched.
    ctx = click.get_current_context()
    click.echo(
        f"You can now run '{ctx.parent.info_name if ctx.parent else 'cli'} publish {log_dir}' to publish the results"
    )


cli.add_command(score_command)
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
@click.command(
    name="publish",
    help="Publish scored results in log_dir to Hugging Face leaderboard.",
)
@click.argument("log_dir", type=click.Path(exists=True, file_okay=False))
@click.option(
    "--submissions-repo-id",
    type=str,
    default=lambda: os.environ.get("SUBMISSIONS_REPO_ID", ""),
    help="HF repo id for submissions. Defaults to SUBMISSIONS_REPO_ID env var.",
)
@click.option(
    "--results-repo-id",
    type=str,
    default=lambda: os.environ.get("RESULTS_REPO_ID", ""),
    help="HF repo id for result stats. Defaults to RESULTS_REPO_ID env var.",
)
@click.option(
    "--username",
    type=str,
    default=None,
    help="HF username/org for submission. Defaults to your HF account name.",
)
@click.option(
    "--agent-name",
    type=str,
    required=True,
    help="Descriptive agent name for submission.",
)
@click.option(
    "--agent-description",
    type=str,
    default=None,
    help="Description of the agent being submitted.",
)
@click.option(
    "--agent-url",
    type=str,
    default=None,
    help="URL to the agent's repository or documentation.",
)
def publish_command(
    log_dir: str,
    submissions_repo_id: str,
    results_repo_id: str,
    username: str | None,
    agent_name: str,
    agent_description: str | None,
    agent_url: str | None,
):
    """Upload scored results from LOG_DIR to the Hugging Face datasets.

    Requires a scored agenteval.json in LOG_DIR (run 'score' first). Uploads
    the full log directory to the submissions repo and the summary to the
    results repo, then writes the upload URLs back into agenteval.json.
    """
    # Allow huggingface imports to be optional
    from huggingface_hub import HfApi

    # Derive a filesafe agent_name
    safe_agent_name = sanitize_path_component(agent_name)
    if safe_agent_name != agent_name:
        click.echo(
            f"Note: agent_name '{agent_name}' contains unsafe characters; "
            f"using '{safe_agent_name}' for submission filenames."
        )

    # Load existing scored results from JSON
    json_path = Path(log_dir) / EVAL_FILENAME
    if not json_path.exists():
        raise click.ClickException(f"No scored results found at {json_path}")
    raw = json_path.read_text(encoding="utf-8")
    eval_result = EvalResult.model_validate_json(raw)

    # Validate eval result: refuse to publish unscored results
    if not eval_result.is_scored():
        raise click.ClickException(
            f"{EVAL_FILENAME} is not scored. Please run 'score {log_dir}' first."
        )
    # Missing tasks only warn; partial submissions are allowed.
    missing_tasks = eval_result.find_missing_tasks()
    if missing_tasks:
        click.echo(f"Warning: Missing tasks in result set: {', '.join(missing_tasks)}")

    # Determine HF user, falling back to the authenticated account name
    hf_api = HfApi()
    if not username:
        try:
            username = hf_api.whoami()["name"]
            assert isinstance(username, str), "Invalid username type from HF API"
            click.echo(f"Defaulting username to Hugging Face account: {username}")
        except Exception:
            raise click.ClickException(
                "--username must be provided or ensure HF authentication is configured"
            )

    # Derive a filesafe username
    safe_username = sanitize_path_component(username)
    if safe_username != username:
        click.echo(
            f"Note: username '{username}' contains unsafe characters; "
            f"using '{safe_username}' for submission filenames."
        )

    # Fill submission metadata (original, unsanitized names are recorded)
    eval_result.submission.username = username
    eval_result.submission.agent_name = agent_name
    eval_result.submission.agent_description = agent_description
    eval_result.submission.agent_url = agent_url
    eval_result.submission.submit_time = datetime.now(timezone.utc)

    # Validate suite config version; it namespaces uploads in the HF repos
    config_name = eval_result.suite_config.version
    if not config_name:
        raise click.ClickException("Suite config version is required for upload.")

    # Build a unique submission name from user, agent, and UTC timestamp
    ts = eval_result.submission.submit_time.strftime("%Y-%m-%dT%H-%M-%S")
    subm_name = f"{safe_username}_{safe_agent_name}_{ts}"

    # Upload logs and summary, recording the resulting URLs
    logs_url = upload_folder_to_hf(
        hf_api, log_dir, submissions_repo_id, config_name, eval_result.split, subm_name
    )
    click.echo(f"Uploaded submission logs dir to {logs_url}")
    eval_result.submission.logs_url = logs_url

    summary_url = upload_summary_to_hf(
        hf_api, eval_result, results_repo_id, config_name, eval_result.split, subm_name
    )
    click.echo(f"Uploaded results summary file to {summary_url}")
    eval_result.submission.summary_url = summary_url

    # Save updated JSON so the local copy records where it was published
    eval_result.save_json(Path(log_dir) / EVAL_FILENAME)
    click.echo(f"Updated {EVAL_FILENAME} with publication metadata.")


cli.add_command(publish_command)
|
|
326
|
+
|
|
327
|
+
|
|
328
|
+
@cli.command(
    name="eval",
    help="Run inspect eval-set on specified tasks with the given arguments",
    context_settings={"ignore_unknown_options": True},
)
@click.option(
    "--log-dir",
    type=str,
    help="Log directory. Defaults to INSPECT_LOG_DIR or auto-generated under ./logs.",
)
@click.option(
    "--config-path",
    "config_path",
    type=str,
    help="Path to a yml config file.",
    required=True,
)
@click.option(
    "--split",
    type=str,
    help="Config data split.",
    required=True,
)
@click.option(
    "--ignore-git",
    is_flag=True,
    help="Ignore git reproducibility checks (not recommended).",
)
@click.argument("args", nargs=-1, type=click.UNPROCESSED)
def eval_command(
    log_dir: str | None,
    config_path: str,
    split: str,
    ignore_git: bool,
    args: tuple[str],
):
    """Run inspect eval-set with arguments and append tasks"""
    # Resolve the task list for the requested split from the suite config.
    suite_config = load_suite_config(config_path)
    tasks = suite_config.get_tasks(split)

    # Verify git status for reproducibility (aborts unless --ignore-git)
    verify_git_reproducibility(ignore_git)

    # Resolve the log dir: explicit flag > INSPECT_LOG_DIR env > generated path
    if not log_dir:
        log_dir = os.environ.get("INSPECT_LOG_DIR")
        if not log_dir:
            timestamp = datetime.now().strftime("%Y-%m-%dT%H-%M-%S")
            log_dir = os.path.join(
                ".",
                "logs",
                f"{suite_config.name}_{suite_config.version}_{split}_{timestamp}",
            )
        click.echo(f"No log dir was manually set; using {log_dir}")
    logd_args = ["--log-dir", log_dir]

    # We use subprocess here to keep arg management simple; an alternative
    # would be calling `inspect_ai.eval_set()` directly, which would allow for
    # programmatic execution
    full_command = (
        ["inspect", "eval-set"] + list(args) + logd_args + [x.path for x in tasks]
    )
    click.echo(f"Running {config_path}: {' '.join(full_command)}")
    proc = subprocess.run(full_command)

    if proc.returncode != 0:
        raise click.ClickException(
            f"inspect eval-set failed while running {config_path}"
        )

    # Write the config portion of the results file; scoring fills in the rest
    with open(os.path.join(log_dir, EVAL_FILENAME), "w", encoding="utf-8") as f:
        unscored_eval_config = EvalConfig(suite_config=suite_config, split=split)
        f.write(unscored_eval_config.model_dump_json(indent=2))

    # Use the parent group's invocation name so the hint matches however the
    # CLI was launched.
    ctx = click.get_current_context()
    click.echo(
        f"You can now run '{ctx.parent.info_name if ctx.parent else 'cli'} score {log_dir}' to score the results"
    )


# NOTE(review): eval_command is already registered via the @cli.command
# decorator above, so this add_command is redundant (though harmless).
cli.add_command(eval_command)


if __name__ == "__main__":
    cli()
|
agenteval/config.py
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Configuration management for agent evaluation.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import yaml
|
|
6
|
+
from pydantic import BaseModel, ValidationError
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class Task(BaseModel):
    """A single evaluation task within a suite split."""

    name: str
    """Canonical task name (used by the leaderboard)."""

    path: str
    """Path to the task definition (used by Inspect)."""

    primary_metric: str
    """Primary metric for the task, used for summary scores."""

    tags: list[str] | None = None
    """List of tags, used for computing summary scores for task groups."""
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class Split(BaseModel):
    """A named group of tasks within a suite (e.g. validation vs. test)."""

    name: str
    """Name of the split."""

    tasks: list[Task]
    """List of tasks associated with the split."""
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class SuiteConfig(BaseModel):
    """Top-level suite configuration: a named, versioned collection of splits."""

    name: str
    """Name of the suite."""

    version: str | None = None
    """Version of the suite, e.g. '1.0.0.dev1'."""

    splits: list[Split]
    """List of splits in the suite."""

    def get_tasks(self, split_name: str) -> list[Task]:
        """Return the task list for the split named *split_name*.

        Args:
            split_name: Name of the split to retrieve tasks from

        Returns:
            List of Task objects for the specified split

        Raises:
            ValueError: If the split is not found
        """
        # Scan for the first split whose name matches; None if absent.
        matched = next((s for s in self.splits if s.name == split_name), None)
        if matched is not None:
            return matched.tasks

        available_splits = ", ".join(s.name for s in self.splits)
        raise ValueError(
            f"Split '{split_name}' not found. Available splits: {available_splits}"
        )
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def load_suite_config(file_path: str) -> SuiteConfig:
    """
    Load the suite configuration from the specified YAML file.

    Args:
        file_path: Path to the YAML file containing the suite/tasks configuration

    Returns:
        A validated SuiteConfig object

    Raises:
        FileNotFoundError: If the file does not exist.
        ValueError: If the YAML cannot be parsed or fails schema validation.
    """
    try:
        # Explicit encoding so configs parse identically across platforms.
        with open(file_path, "r", encoding="utf-8") as f:
            config_data = yaml.safe_load(f)
    except FileNotFoundError:
        # Re-raise with a friendlier message; no chain needed for the same type.
        raise FileNotFoundError(
            f"Task configuration file not found: {file_path}"
        ) from None
    except yaml.YAMLError as e:
        # Chain the cause so tracebacks show the underlying parse error.
        raise ValueError(f"Failed to parse YAML file: {e}") from e

    try:
        return SuiteConfig.model_validate(config_data)
    except ValidationError as e:
        raise ValueError(
            f"Invalid task configuration: {e}\nPlease refer to the config spec."
        ) from e
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
- name: suite_config
|
|
2
|
+
struct:
|
|
3
|
+
- name: name
|
|
4
|
+
dtype: string
|
|
5
|
+
- name: version
|
|
6
|
+
dtype: string
|
|
7
|
+
- name: splits
|
|
8
|
+
list:
|
|
9
|
+
- name: name
|
|
10
|
+
dtype: string
|
|
11
|
+
- name: tasks
|
|
12
|
+
list:
|
|
13
|
+
- name: name
|
|
14
|
+
dtype: string
|
|
15
|
+
- name: path
|
|
16
|
+
dtype: string
|
|
17
|
+
- name: primary_metric
|
|
18
|
+
dtype: string
|
|
19
|
+
- name: tags
|
|
20
|
+
sequence: string
|
|
21
|
+
- name: split
|
|
22
|
+
dtype: string
|
|
23
|
+
- name: results
|
|
24
|
+
list:
|
|
25
|
+
- name: task_name
|
|
26
|
+
dtype: string
|
|
27
|
+
- name: metrics
|
|
28
|
+
list:
|
|
29
|
+
- name: name
|
|
30
|
+
dtype: string
|
|
31
|
+
- name: value
|
|
32
|
+
dtype: float64
|
|
33
|
+
- name: model_usages
|
|
34
|
+
list:
|
|
35
|
+
list:
|
|
36
|
+
- name: model
|
|
37
|
+
dtype: string
|
|
38
|
+
- name: usage
|
|
39
|
+
struct:
|
|
40
|
+
- name: input_tokens
|
|
41
|
+
dtype: int64
|
|
42
|
+
- name: output_tokens
|
|
43
|
+
dtype: int64
|
|
44
|
+
- name: total_tokens
|
|
45
|
+
dtype: int64
|
|
46
|
+
- name: input_tokens_cache_write
|
|
47
|
+
dtype: int64
|
|
48
|
+
- name: input_tokens_cache_read
|
|
49
|
+
dtype: int64
|
|
50
|
+
- name: reasoning_tokens
|
|
51
|
+
dtype: int64
|
|
52
|
+
- name: model_costs
|
|
53
|
+
sequence: float64
|
|
54
|
+
- name: submission
|
|
55
|
+
struct:
|
|
56
|
+
- name: submit_time
|
|
57
|
+
dtype: timestamp[us, tz=UTC]
|
|
58
|
+
- name: username
|
|
59
|
+
dtype: string
|
|
60
|
+
- name: agent_name
|
|
61
|
+
dtype: string
|
|
62
|
+
- name: agent_description
|
|
63
|
+
dtype: string
|
|
64
|
+
- name: agent_url
|
|
65
|
+
dtype: string
|
|
66
|
+
- name: logs_url
|
|
67
|
+
dtype: string
|
|
68
|
+
- name: logs_url_public
|
|
69
|
+
dtype: string
|
|
70
|
+
- name: summary_url
|
|
71
|
+
dtype: string
|