agent-eval 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,74 @@
1
+ Metadata-Version: 2.4
2
+ Name: agent-eval
3
+ Version: 0.1.1
4
+ Summary: Agent evaluation toolkit
5
+ Project-URL: Homepage, https://github.com/allenai/agent-eval
6
+ Requires-Python: >=3.10
7
+ Description-Content-Type: text/markdown
8
+ Requires-Dist: click
9
+ Requires-Dist: inspect-ai
10
+ Requires-Dist: litellm
11
+ Requires-Dist: pydantic>=2.0.0
12
+ Requires-Dist: huggingface_hub
13
+ Requires-Dist: pyarrow
14
+ Requires-Dist: datasets
15
+ Provides-Extra: dev
16
+ Requires-Dist: black==24.2.0; extra == "dev"
17
+ Requires-Dist: isort; extra == "dev"
18
+ Requires-Dist: autoflake; extra == "dev"
19
+ Requires-Dist: pytest; extra == "dev"
20
+ Requires-Dist: pytest-asyncio; extra == "dev"
21
+ Requires-Dist: mypy==1.15; extra == "dev"
22
+ Requires-Dist: types-PyYAML; extra == "dev"
23
+ Requires-Dist: types-setuptools; extra == "dev"
24
+
25
+ # agent-eval
26
+
27
+ A utility for evaluating agents on a suite of [Inspect](https://github.com/UKGovernmentBEIS/inspect_ai)-formatted evals, with the following primary benefits:
28
+ 1. Task suite specifications as config.
29
+ 2. Extracts the token usage of the agent from log files, and computes cost using `litellm`.
30
+ 3. Submits task suite results to a leaderboard, with submission metadata and easy upload to a HuggingFace repo for distribution of scores and logs.
31
+
32
+ # Installation
33
+
34
+ To install from pypi, use `pip install agent-eval`.
35
+
36
+ # Usage
37
+
38
+ ## Run evaluation suite
39
+ ```shell
40
+ agenteval eval --config-path CONFIG_PATH --split SPLIT LOG_DIR
41
+ ```
42
+ Evaluate an agent on the supplied eval suite configuration. Results are written to `agenteval.json` in the log directory.
43
+
44
+ See [sample-config.yml](sample-config.yml) for a sample configuration file.
45
+
46
+ For aggregation in a leaderboard, each task specifies a `primary_metric` as `{scorer_name}/{metric_name}`.
47
+ The scoring utils will look for a corresponding stderr metric,
48
+ by looking for another metric with the same `scorer_name` and with a `metric_name` containing the string "stderr".
49
+
50
+ ## Score results
51
+ ```shell
52
+ agenteval score [OPTIONS] LOG_DIR
53
+ ```
54
+ Compute scores for the results in `agenteval.json` and update the file with the computed scores.
55
+
56
+ ## Publish scores
57
+ ```shell
58
+ agenteval publish [OPTIONS] LOG_DIR
59
+ ```
60
+ Upload the scored results to HuggingFace datasets.
61
+
62
+ # Administer the HuggingFace datasets
63
+ Prior to publishing scores, two HuggingFace datasets should be set up, one for full submissions and one for results files.
64
+
65
+ If you want to call `load_dataset()` on the results dataset (e.g., for populating a leaderboard), you probably want to explicitly tell HuggingFace about the schema and dataset structure (otherwise, HuggingFace may fail to properly auto-convert to Parquet).
66
+ This is done by updating the `configs` attribute in the YAML metadata block at the top of the `README.md` file at the root of the results dataset (the metadata block is identified by lines with just `---` above and below it).
67
+ This attribute should contain a list of configs, each of which specifies the schema (under the `features` key) and dataset structure (under the `data_files` key).
68
+ See [sample-config-hf-readme-metadata.yml](sample-config-hf-readme-metadata.yml) for a sample metadata block corresponding to [sample-config.yml](sample-config.yml) (note that the metadata references the [raw schema data](src/agenteval/dataset_features.yml), which must be copied).
69
+
70
+ To facilitate initializing new configs, `agenteval publish` will automatically add this metadata if it is missing.
71
+
72
+ # Development
73
+
74
+ See [Development.md](Development.md) for development instructions.
@@ -0,0 +1,16 @@
1
+ agenteval/__init__.py,sha256=nEm7fcpqtTId-83MMLiVDKkszvaGEeFUQ_IxrdA63bo,372
2
+ agenteval/cli.py,sha256=zzx3aceprFcQQ2fNUGBbEZ7RLt3e_gWGoR8OPDgDXaU,13767
3
+ agenteval/config.py,sha256=jrqyAPfC728ymfYUzwWJehzeW8Dpnsmpn5utAySz0uo,2294
4
+ agenteval/dataset_features.yml,sha256=UVsQUxohyjD9YZGw_HmIg6RFbdrkUQRZvmQSADffPS0,1439
5
+ agenteval/io.py,sha256=QjxS2Ta_-gWRBLWOog0sX2Q-WlXmsD5iF3dsVktETR0,1004
6
+ agenteval/log.py,sha256=lJoGkeRpuo017vAAM77mP9BIRexpWuIE8JwIKCVNOto,1958
7
+ agenteval/models.py,sha256=UMGas9wgLT9nM1cyaVEh5PwkUURYSb02dnJDmRXI4Vk,2671
8
+ agenteval/schema_generator.py,sha256=S36u6SSjFbAdcDqJvuzTQkFF2PG7Yq8A0diUsB8yWJg,3037
9
+ agenteval/score.py,sha256=Pb3PofFdW0IobvZu-tv7313u_kTX3tgGw6nmlliMTak,4803
10
+ agenteval/summary.py,sha256=LzkC2ZDf0MNLlXzL3bugAdv3VLkzCMBP85_52fsnToA,3599
11
+ agenteval/upload.py,sha256=B17OK_ZD7q82-MjKCI63NQozB7mksaLknIdqm5ETAAU,5827
12
+ agent_eval-0.1.1.dist-info/METADATA,sha256=mzwemf3uIdIpFHMpkJPNE7MEtiUNA_G4a0XSlcQzjkQ,3447
13
+ agent_eval-0.1.1.dist-info/WHEEL,sha256=DnLRTWE75wApRYVsjgc6wsVswC54sMSJhAEd4xhDpBk,91
14
+ agent_eval-0.1.1.dist-info/entry_points.txt,sha256=DxxfiIbldqK82WRgaMOL4BjPJcnr7JOkDYTch6xNahs,48
15
+ agent_eval-0.1.1.dist-info/top_level.txt,sha256=k6izISLxVoNnLxZHqnS3X-0eDdUD8LsV-OoM0afYdew,10
16
+ agent_eval-0.1.1.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (80.4.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ agenteval = agenteval.cli:cli
@@ -0,0 +1 @@
1
+ agenteval
agenteval/__init__.py ADDED
@@ -0,0 +1,14 @@
1
+ from importlib.metadata import version as get_version
2
+
3
+ __version__ = get_version("agent-eval")
4
+
5
+ from .score import process_eval_logs
6
+ from .summary import compute_summary_statistics
7
+ from .upload import upload_folder_to_hf, upload_summary_to_hf
8
+
9
+ __all__ = [
10
+ "process_eval_logs",
11
+ "compute_summary_statistics",
12
+ "upload_folder_to_hf",
13
+ "upload_summary_to_hf",
14
+ ]
agenteval/cli.py ADDED
@@ -0,0 +1,412 @@
1
+ #!/usr/bin/env python3
2
+ import json
3
+ import os
4
+ import subprocess
5
+ from datetime import datetime, timezone
6
+ from pathlib import Path
7
+
8
+ import click
9
+
10
+ from .config import load_suite_config
11
+ from .models import EvalConfig, EvalResult
12
+ from .score import process_eval_logs
13
+ from .summary import compute_summary_statistics
14
+ from .upload import sanitize_path_component, upload_folder_to_hf, upload_summary_to_hf
15
+
16
+ EVAL_FILENAME = "agenteval.json"
17
+
18
+
19
+ def verify_git_reproducibility(ignore_git: bool) -> None:
20
+ if ignore_git:
21
+ return
22
+ try:
23
+ # Get current commit SHA and origin
24
+ sha_result = subprocess.run(
25
+ ["git", "rev-parse", "--short", "HEAD"],
26
+ capture_output=True,
27
+ text=True,
28
+ check=True,
29
+ )
30
+ origin_result = subprocess.run(
31
+ ["git", "remote", "get-url", "origin"],
32
+ capture_output=True,
33
+ text=True,
34
+ check=True,
35
+ )
36
+ sha = sha_result.stdout.strip() if sha_result.returncode == 0 else None
37
+ origin = origin_result.stdout.strip() if origin_result.returncode == 0 else None
38
+
39
+ # Check for dirty working directory
40
+ git_dirty = (
41
+ subprocess.run(
42
+ ["git", "diff", "--quiet", "--exit-code"],
43
+ capture_output=True,
44
+ check=False,
45
+ ).returncode
46
+ != 0
47
+ )
48
+
49
+ # Warn about untracked (non-ignored) files
50
+ untracked_result = subprocess.run(
51
+ ["git", "ls-files", "--others", "--exclude-standard"],
52
+ capture_output=True,
53
+ text=True,
54
+ check=True,
55
+ )
56
+ untracked_files = untracked_result.stdout.strip().splitlines()
57
+ if untracked_files:
58
+ click.echo(
59
+ f"Warning: Untracked files present: {', '.join(untracked_files)}. "
60
+ "For reproducibility, please add, ignore, or remove these files."
61
+ )
62
+
63
+ # Abort if worktree is dirty
64
+ if git_dirty:
65
+ raise click.ClickException(
66
+ f"Git working directory contains uncommitted changes. "
67
+ f"For reproducibility, Inspect will save: origin={origin}, sha={sha}. "
68
+ "Please commit your changes or use --ignore-git to bypass this check (not recommended)."
69
+ )
70
+
71
+ # Check if commit exists on remote
72
+ if sha:
73
+ remote_exists = subprocess.run(
74
+ ["git", "branch", "-r", "--contains", sha],
75
+ capture_output=True,
76
+ text=True,
77
+ check=True,
78
+ ).stdout.strip()
79
+ if not remote_exists:
80
+ raise click.ClickException(
81
+ f"Commit {sha} not found on remote '{origin}'. Others won't be able to "
82
+ "access this code version. Please push your changes or use --ignore-git "
83
+ "to bypass this check (not recommended)."
84
+ )
85
+ except (subprocess.SubprocessError, FileNotFoundError) as e:
86
+ if isinstance(e, click.ClickException):
87
+ raise
88
+ raise click.ClickException(
89
+ f"Unable to verify git status for reproducibility: {e}. "
90
+ "Use --ignore-git to bypass this check if git is not available."
91
+ )
92
+
93
+
94
+ @click.group()
95
+ def cli():
96
+ pass
97
+
98
+
99
+ @click.command(
100
+ name="score",
101
+ help="Score a directory of evaluation logs.",
102
+ )
103
+ @click.argument("log_dir", type=click.Path(exists=True, file_okay=False))
104
+ @click.option(
105
+ "--config-path",
106
+ "config_path",
107
+ type=str,
108
+ help=f"Path to a yml config file. Ignored if {EVAL_FILENAME} exists.",
109
+ default=None,
110
+ )
111
+ @click.option(
112
+ "--split",
113
+ type=str,
114
+ help=f"Config data split. Ignored if {EVAL_FILENAME} exists.",
115
+ default=None,
116
+ )
117
+ def score_command(
118
+ log_dir: str,
119
+ config_path: str | None,
120
+ split: str | None,
121
+ ):
122
+ # Load or create EvalResult and process logs (inlined from processor)
123
+ json_path = Path(log_dir) / EVAL_FILENAME
124
+ if json_path.exists():
125
+ try:
126
+ raw = json_path.read_text(encoding="utf-8")
127
+ eval_result = EvalResult.model_validate_json(raw)
128
+ except Exception as e:
129
+ raise click.ClickException(
130
+ f"Failed to load existing '{EVAL_FILENAME}' at {json_path}: {e}"
131
+ )
132
+ if config_path:
133
+ try:
134
+ cli_cfg = load_suite_config(config_path)
135
+ if cli_cfg.version != eval_result.suite_config.version:
136
+ click.echo(
137
+ f"Warning: CLI config version '{cli_cfg.version}' "
138
+ f"does not match JSON config version '{eval_result.suite_config.version}'."
139
+ )
140
+ except Exception as e:
141
+ click.echo(
142
+ f"Warning: could not load CLI config '{config_path}' for comparison: {e}"
143
+ )
144
+ if split and split != eval_result.split:
145
+ raise click.ClickException(
146
+ f"Split mismatch: JSON split '{eval_result.split}' != CLI split '{split}'"
147
+ )
148
+ else:
149
+ if not config_path or not split:
150
+ raise click.ClickException(
151
+ "--config-path and --split must be provided when no existing result JSON"
152
+ )
153
+ suite_cfg = load_suite_config(config_path)
154
+ eval_result = EvalResult(suite_config=suite_cfg, split=split)
155
+
156
+ task_results, eval_specs = process_eval_logs(log_dir)
157
+ eval_result.eval_specs = eval_specs
158
+ eval_result.results = task_results
159
+
160
+ # Warn if multiple evaluation specs present
161
+ if eval_result.eval_specs and len(eval_result.eval_specs) > 1:
162
+ click.echo(
163
+ f"Warning: Found {len(eval_result.eval_specs)} different eval specs. "
164
+ "Logs may come from mixed runs."
165
+ )
166
+
167
+ # Warn about any missing tasks
168
+ missing_tasks = eval_result.find_missing_tasks()
169
+ if missing_tasks:
170
+ click.echo(f"Warning: Missing tasks in result set: {', '.join(missing_tasks)}")
171
+
172
+ # Compute and display summary statistics
173
+ stats = compute_summary_statistics(
174
+ eval_result.suite_config,
175
+ eval_result.split,
176
+ eval_result.results or [],
177
+ )
178
+ click.echo("Summary statistics:")
179
+ click.echo(json.dumps({k: v.model_dump() for k, v in stats.items()}, indent=2))
180
+
181
+ # Persist updated EvalResult JSON
182
+ eval_result.save_json(Path(log_dir) / EVAL_FILENAME)
183
+
184
+ click.echo(f"Saved results to {log_dir}/{EVAL_FILENAME}")
185
+ ctx = click.get_current_context()
186
+ click.echo(
187
+ f"You can now run '{ctx.parent.info_name if ctx.parent else 'cli'} publish {log_dir}' to publish the results"
188
+ )
189
+
190
+
191
+ cli.add_command(score_command)
192
+
193
+
194
+ @click.command(
195
+ name="publish",
196
+ help="Publish scored results in log_dir to Hugging Face leaderboard.",
197
+ )
198
+ @click.argument("log_dir", type=click.Path(exists=True, file_okay=False))
199
+ @click.option(
200
+ "--submissions-repo-id",
201
+ type=str,
202
+ default=lambda: os.environ.get("SUBMISSIONS_REPO_ID", ""),
203
+ help="HF repo id for submissions. Defaults to SUBMISSIONS_REPO_ID env var.",
204
+ )
205
+ @click.option(
206
+ "--results-repo-id",
207
+ type=str,
208
+ default=lambda: os.environ.get("RESULTS_REPO_ID", ""),
209
+ help="HF repo id for result stats. Defaults to RESULTS_REPO_ID env var.",
210
+ )
211
+ @click.option(
212
+ "--username",
213
+ type=str,
214
+ default=None,
215
+ help="HF username/org for submission. Defaults to your HF account name.",
216
+ )
217
+ @click.option(
218
+ "--agent-name",
219
+ type=str,
220
+ required=True,
221
+ help="Descriptive agent name for submission.",
222
+ )
223
+ @click.option(
224
+ "--agent-description",
225
+ type=str,
226
+ default=None,
227
+ help="Description of the agent being submitted.",
228
+ )
229
+ @click.option(
230
+ "--agent-url",
231
+ type=str,
232
+ default=None,
233
+ help="URL to the agent's repository or documentation.",
234
+ )
235
+ def publish_command(
236
+ log_dir: str,
237
+ submissions_repo_id: str,
238
+ results_repo_id: str,
239
+ username: str | None,
240
+ agent_name: str,
241
+ agent_description: str | None,
242
+ agent_url: str | None,
243
+ ):
244
+ # Allow huggingface imports to be optional
245
+ from huggingface_hub import HfApi
246
+
247
+ # Derive a filesafe agent_name
248
+ safe_agent_name = sanitize_path_component(agent_name)
249
+ if safe_agent_name != agent_name:
250
+ click.echo(
251
+ f"Note: agent_name '{agent_name}' contains unsafe characters; "
252
+ f"using '{safe_agent_name}' for submission filenames."
253
+ )
254
+
255
+ # Load existing scored results from JSON
256
+ json_path = Path(log_dir) / EVAL_FILENAME
257
+ if not json_path.exists():
258
+ raise click.ClickException(f"No scored results found at {json_path}")
259
+ raw = json_path.read_text(encoding="utf-8")
260
+ eval_result = EvalResult.model_validate_json(raw)
261
+
262
+ # Validate eval result
263
+ if not eval_result.is_scored():
264
+ raise click.ClickException(
265
+ f"{EVAL_FILENAME} is not scored. Please run 'score {log_dir}' first."
266
+ )
267
+ missing_tasks = eval_result.find_missing_tasks()
268
+ if missing_tasks:
269
+ click.echo(f"Warning: Missing tasks in result set: {', '.join(missing_tasks)}")
270
+
271
+ # Determine HF user
272
+ hf_api = HfApi()
273
+ if not username:
274
+ try:
275
+ username = hf_api.whoami()["name"]
276
+ assert isinstance(username, str), "Invalid username type from HF API"
277
+ click.echo(f"Defaulting username to Hugging Face account: {username}")
278
+ except Exception:
279
+ raise click.ClickException(
280
+ "--username must be provided or ensure HF authentication is configured"
281
+ )
282
+
283
+ # Derive a filesafe username
284
+ safe_username = sanitize_path_component(username)
285
+ if safe_username != username:
286
+ click.echo(
287
+ f"Note: username '{username}' contains unsafe characters; "
288
+ f"using '{safe_username}' for submission filenames."
289
+ )
290
+
291
+ # Fill submission metadata
292
+ eval_result.submission.username = username
293
+ eval_result.submission.agent_name = agent_name
294
+ eval_result.submission.agent_description = agent_description
295
+ eval_result.submission.agent_url = agent_url
296
+ eval_result.submission.submit_time = datetime.now(timezone.utc)
297
+
298
+ # Validate suite config version
299
+ config_name = eval_result.suite_config.version
300
+ if not config_name:
301
+ raise click.ClickException("Suite config version is required for upload.")
302
+
303
+ # Build submission name
304
+ ts = eval_result.submission.submit_time.strftime("%Y-%m-%dT%H-%M-%S")
305
+ subm_name = f"{safe_username}_{safe_agent_name}_{ts}"
306
+
307
+ # Upload logs and summary
308
+ logs_url = upload_folder_to_hf(
309
+ hf_api, log_dir, submissions_repo_id, config_name, eval_result.split, subm_name
310
+ )
311
+ click.echo(f"Uploaded submission logs dir to {logs_url}")
312
+ eval_result.submission.logs_url = logs_url
313
+
314
+ summary_url = upload_summary_to_hf(
315
+ hf_api, eval_result, results_repo_id, config_name, eval_result.split, subm_name
316
+ )
317
+ click.echo(f"Uploaded results summary file to {summary_url}")
318
+ eval_result.submission.summary_url = summary_url
319
+
320
+ # Save updated JSON
321
+ eval_result.save_json(Path(log_dir) / EVAL_FILENAME)
322
+ click.echo(f"Updated {EVAL_FILENAME} with publication metadata.")
323
+
324
+
325
+ cli.add_command(publish_command)
326
+
327
+
328
+ @cli.command(
329
+ name="eval",
330
+ help="Run inspect eval-set on specified tasks with the given arguments",
331
+ context_settings={"ignore_unknown_options": True},
332
+ )
333
+ @click.option(
334
+ "--log-dir",
335
+ type=str,
336
+ help="Log directory. Defaults to INSPECT_LOG_DIR or auto-generated under ./logs.",
337
+ )
338
+ @click.option(
339
+ "--config-path",
340
+ "config_path",
341
+ type=str,
342
+ help="Path to a yml config file.",
343
+ required=True,
344
+ )
345
+ @click.option(
346
+ "--split",
347
+ type=str,
348
+ help="Config data split.",
349
+ required=True,
350
+ )
351
+ @click.option(
352
+ "--ignore-git",
353
+ is_flag=True,
354
+ help="Ignore git reproducibility checks (not recommended).",
355
+ )
356
+ @click.argument("args", nargs=-1, type=click.UNPROCESSED)
357
+ def eval_command(
358
+ log_dir: str | None,
359
+ config_path: str,
360
+ split: str,
361
+ ignore_git: bool,
362
+ args: tuple[str],
363
+ ):
364
+ """Run inspect eval-set with arguments and append tasks"""
365
+ suite_config = load_suite_config(config_path)
366
+ tasks = suite_config.get_tasks(split)
367
+
368
+ # Verify git status for reproducibility
369
+ verify_git_reproducibility(ignore_git)
370
+
371
+ if not log_dir:
372
+ log_dir = os.environ.get("INSPECT_LOG_DIR")
373
+ if not log_dir:
374
+ timestamp = datetime.now().strftime("%Y-%m-%dT%H-%M-%S")
375
+ log_dir = os.path.join(
376
+ ".",
377
+ "logs",
378
+ f"{suite_config.name}_{suite_config.version}_{split}_{timestamp}",
379
+ )
380
+ click.echo(f"No log dir was manually set; using {log_dir}")
381
+ logd_args = ["--log-dir", log_dir]
382
+
383
+ # We use subprocess here to keep arg management simple; an alternative
384
+ # would be calling `inspect_ai.eval_set()` directly, which would allow for
385
+ # programmatic execution
386
+ full_command = (
387
+ ["inspect", "eval-set"] + list(args) + logd_args + [x.path for x in tasks]
388
+ )
389
+ click.echo(f"Running {config_path}: {' '.join(full_command)}")
390
+ proc = subprocess.run(full_command)
391
+
392
+ if proc.returncode != 0:
393
+ raise click.ClickException(
394
+ f"inspect eval-set failed while running {config_path}"
395
+ )
396
+
397
+ # Write the config portion of the results file
398
+ with open(os.path.join(log_dir, EVAL_FILENAME), "w", encoding="utf-8") as f:
399
+ unscored_eval_config = EvalConfig(suite_config=suite_config, split=split)
400
+ f.write(unscored_eval_config.model_dump_json(indent=2))
401
+
402
+ ctx = click.get_current_context()
403
+ click.echo(
404
+ f"You can now run '{ctx.parent.info_name if ctx.parent else 'cli'} score {log_dir}' to score the results"
405
+ )
406
+
407
+
408
+ cli.add_command(eval_command)
409
+
410
+
411
+ if __name__ == "__main__":
412
+ cli()
agenteval/config.py ADDED
@@ -0,0 +1,87 @@
1
+ """
2
+ Configuration management for agent evaluation.
3
+ """
4
+
5
+ import yaml
6
+ from pydantic import BaseModel, ValidationError
7
+
8
+
9
+ class Task(BaseModel):
10
+ name: str
11
+ """Canonical task name (used by the leaderboard)."""
12
+
13
+ path: str
14
+ """Path to the task definition (used by Inspect)."""
15
+
16
+ primary_metric: str
17
+ """Primary metric for the task, used for summary scores."""
18
+
19
+ tags: list[str] | None = None
20
+ """List of tags, used for computing summary scores for task groups."""
21
+
22
+
23
+ class Split(BaseModel):
24
+ name: str
25
+ """Name of the split."""
26
+
27
+ tasks: list[Task]
28
+ """List of tasks associated with the split."""
29
+
30
+
31
+ class SuiteConfig(BaseModel):
32
+ name: str
33
+ """Name of the suite."""
34
+
35
+ version: str | None = None
36
+ """Version of the suite, e.g. '1.0.0.dev1'."""
37
+
38
+ splits: list[Split]
39
+ """List of splits in the suite."""
40
+
41
+ def get_tasks(self, split_name: str) -> list[Task]:
42
+ """
43
+ Get the tasks for a specific split.
44
+
45
+ Args:
46
+ split_name: Name of the split to retrieve tasks from
47
+
48
+ Returns:
49
+ List of Task objects for the specified split
50
+
51
+ Raises:
52
+ ValueError: If the split is not found
53
+ """
54
+ for split in self.splits:
55
+ if split.name == split_name:
56
+ return split.tasks
57
+
58
+ available_splits = ", ".join(split.name for split in self.splits)
59
+ raise ValueError(
60
+ f"Split '{split_name}' not found. Available splits: {available_splits}"
61
+ )
62
+
63
+
64
+ def load_suite_config(file_path: str) -> SuiteConfig:
65
+ """
66
+ Load the suite configuration from the specified YAML file.
67
+
68
+ Args:
69
+ file_path: Path to the YAML file containing the suite/tasks configuration
70
+
71
+ Returns:
72
+ A validated SuiteConfig object
73
+ """
74
+ try:
75
+ with open(file_path, "r") as f:
76
+ config_data = yaml.safe_load(f)
77
+ except FileNotFoundError:
78
+ raise FileNotFoundError(f"Task configuration file not found: {file_path}")
79
+ except yaml.YAMLError as e:
80
+ raise ValueError(f"Failed to parse YAML file: {e}")
81
+
82
+ try:
83
+ return SuiteConfig.model_validate(config_data)
84
+ except ValidationError as e:
85
+ raise ValueError(
86
+ f"Invalid task configuration: {e}\nPlease refer to the config spec."
87
+ )
@@ -0,0 +1,71 @@
1
+ - name: suite_config
2
+ struct:
3
+ - name: name
4
+ dtype: string
5
+ - name: version
6
+ dtype: string
7
+ - name: splits
8
+ list:
9
+ - name: name
10
+ dtype: string
11
+ - name: tasks
12
+ list:
13
+ - name: name
14
+ dtype: string
15
+ - name: path
16
+ dtype: string
17
+ - name: primary_metric
18
+ dtype: string
19
+ - name: tags
20
+ sequence: string
21
+ - name: split
22
+ dtype: string
23
+ - name: results
24
+ list:
25
+ - name: task_name
26
+ dtype: string
27
+ - name: metrics
28
+ list:
29
+ - name: name
30
+ dtype: string
31
+ - name: value
32
+ dtype: float64
33
+ - name: model_usages
34
+ list:
35
+ list:
36
+ - name: model
37
+ dtype: string
38
+ - name: usage
39
+ struct:
40
+ - name: input_tokens
41
+ dtype: int64
42
+ - name: output_tokens
43
+ dtype: int64
44
+ - name: total_tokens
45
+ dtype: int64
46
+ - name: input_tokens_cache_write
47
+ dtype: int64
48
+ - name: input_tokens_cache_read
49
+ dtype: int64
50
+ - name: reasoning_tokens
51
+ dtype: int64
52
+ - name: model_costs
53
+ sequence: float64
54
+ - name: submission
55
+ struct:
56
+ - name: submit_time
57
+ dtype: timestamp[us, tz=UTC]
58
+ - name: username
59
+ dtype: string
60
+ - name: agent_name
61
+ dtype: string
62
+ - name: agent_description
63
+ dtype: string
64
+ - name: agent_url
65
+ dtype: string
66
+ - name: logs_url
67
+ dtype: string
68
+ - name: logs_url_public
69
+ dtype: string
70
+ - name: summary_url
71
+ dtype: string