fableforge-bench-agent 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bench_agent/__init__.py +3 -0
- bench_agent/cli.py +199 -0
- bench_agent/evaluator.py +129 -0
- bench_agent/leaderboard.py +99 -0
- bench_agent/models.py +67 -0
- bench_agent/runner.py +159 -0
- bench_agent/scorer.py +180 -0
- bench_agent/tasks.py +1455 -0
- fableforge_bench_agent-0.1.0.dist-info/METADATA +224 -0
- fableforge_bench_agent-0.1.0.dist-info/RECORD +13 -0
- fableforge_bench_agent-0.1.0.dist-info/WHEEL +4 -0
- fableforge_bench_agent-0.1.0.dist-info/entry_points.txt +2 -0
- fableforge_bench_agent-0.1.0.dist-info/licenses/LICENSE +21 -0
bench_agent/__init__.py
ADDED
bench_agent/cli.py
ADDED
|
@@ -0,0 +1,199 @@
|
|
|
1
|
+
"""CLI interface for BenchAgent benchmark runner."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import sys
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
import click
|
|
10
|
+
from rich.console import Console
|
|
11
|
+
from rich.table import Table
|
|
12
|
+
|
|
13
|
+
from bench_agent.evaluator import ModelProvider, evaluate_model, evaluate_with_retry
|
|
14
|
+
from bench_agent.leaderboard import (
|
|
15
|
+
export_leaderboard,
|
|
16
|
+
export_markdown,
|
|
17
|
+
load_leaderboard,
|
|
18
|
+
save_leaderboard,
|
|
19
|
+
update_leaderboard,
|
|
20
|
+
)
|
|
21
|
+
from bench_agent.models import Leaderboard, TaskCategory
|
|
22
|
+
from bench_agent.runner import TaskRunner
|
|
23
|
+
from bench_agent.tasks import ALL_TASKS, TASKS_BY_CATEGORY, get_task_count
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# Shared Rich console that every command in this module prints through.
console = Console()
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
# Root command group. The docstring is user-facing: Click prints it as the
# top-level --help text, so it must stay exactly as written.
@click.group()
@click.version_option(version="0.1.0")
def main() -> None:
    """BenchAgent — HumanEval for tool use."""
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@main.command("list-tasks")
@click.option("--category", "-c", type=click.Choice([c.value for c in TaskCategory]), default=None)
@click.option("--difficulty", "-d", type=click.Choice(["easy", "medium", "hard"]), default=None)
def list_tasks(category: str | None, difficulty: str | None) -> None:
    """List available benchmark tasks."""
    # A filter that was not supplied (None/empty) matches every task.
    selected = [
        candidate
        for candidate in ALL_TASKS
        if (not category or candidate.category.value == category)
        and (not difficulty or candidate.difficulty.value == difficulty)
    ]

    table = Table(title="BenchAgent Tasks")
    column_specs = (
        ("ID", "cyan"),
        ("Category", "green"),
        ("Difficulty", "yellow"),
        ("Description", "white"),
        ("Max Turns", "magenta"),
    )
    for heading, colour in column_specs:
        table.add_column(heading, style=colour)

    for task in selected:
        # Descriptions are clipped to 60 characters; an ellipsis marks the cut.
        summary = task.description[:60]
        if len(task.description) > 60:
            summary += "..."
        table.add_row(
            task.task_id,
            task.category.value,
            task.difficulty.value,
            summary,
            str(task.max_turns),
        )

    console.print(table)
    console.print(f"\nTotal: {len(selected)} tasks (of {get_task_count()} total)")
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
@main.command("run")
@click.option("--model", "-m", required=True, help="Model name to evaluate")
@click.option("--category", "-c", type=click.Choice([c.value for c in TaskCategory]), default=None, help="Task category")
@click.option("--all", "run_all", is_flag=True, help="Run all categories")
@click.option("--num-tasks", "-n", type=int, default=None, help="Number of tasks to run")
@click.option("--output", "-o", type=click.Path(), default=None, help="Output file for results")
@click.option("--provider", "-p", type=click.Choice(["openai", "anthropic", "local", "huggingface"]), default="openai")
@click.option("--timeout", type=float, default=300.0, help="Total timeout per task in seconds")
@click.option("--retries", type=int, default=3, help="Max retries on API errors")
def run(
    model: str,
    category: str | None,
    run_all: bool,
    num_tasks: int | None,
    output: str | None,
    provider: str,
    timeout: float,
    retries: int,
) -> None:
    """Run benchmark tasks against a model."""
    # --all takes precedence over --category; with neither, no category
    # filter is passed on and the evaluator picks its default task set.
    if run_all:
        categories = list(TaskCategory)
    elif category:
        categories = [TaskCategory(category)]
    else:
        categories = None

    console.print(f"[bold blue]Running BenchAgent for model: {model}[/bold blue]")
    console.print(f"Provider: {provider}")

    runner = TaskRunner(total_timeout=timeout)

    category_label = ", ".join(c.value for c in categories) if categories else "all"
    console.print(f"Categories: {category_label}")

    try:
        report = evaluate_with_retry(
            model_name=model,
            provider=provider,
            categories=categories,
            num_tasks=num_tasks,
            max_retries=retries,
            runner=runner,
        )
    except Exception as e:
        # Top-level boundary: report the failure and exit non-zero.
        console.print(f"[bold red]Error: {e}[/bold red]")
        sys.exit(1)

    console.print(f"\n[bold green]Results for {model}[/bold green]")
    console.print(f"Total Score: {report.total_score}")
    console.print(f"Error Recovery Rate: {report.error_recovery_rate}")

    if report.category_scores:
        score_table = Table(title="Category Scores")
        score_table.add_column("Category", style="cyan")
        score_table.add_column("Score", style="green")
        for cat, score in report.category_scores.items():
            score_table.add_row(cat, f"{score:.1f}")
        console.print(score_table)

    if not output:
        return

    # Persist the report as JSON, creating parent directories as needed.
    destination = Path(output)
    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_text(report.model_dump_json(indent=2))
    console.print(f"\nResults saved to: {output}")
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
@main.command("leaderboard")
@click.option("--update", is_flag=True, help="Update leaderboard with new results")
@click.option("--results", type=click.Path(exists=True), help="Results file to add")
@click.option("--path", type=click.Path(), default="leaderboard.json", help="Leaderboard file path")
@click.option("--format", "-f", type=click.Choice(["json", "markdown"]), default="json", help="Export format")
def leaderboard(update: bool, results: str | None, path: str, format: str) -> None:
    """Show or update the leaderboard."""
    # The results file may hold a single JSON object or a list of objects.
    # Objects containing "task_id" are treated as per-task results and are
    # re-scored per model via update_leaderboard(); all other objects are
    # treated as already-scored reports (the shape the `run` command writes)
    # and replace/append the entry for their model with scores intact.
    #
    # Function-scope imports: the module header does not import these names,
    # and importing here keeps this command self-contained.
    from datetime import datetime, timezone

    from bench_agent.leaderboard import sort_leaderboard
    from bench_agent.models import ScoreReport, TaskResult

    lb = load_leaderboard(path)

    if update and not results:
        # Previously a silent no-op; tell the user why nothing changed.
        console.print("[yellow]--update has no effect without --results[/yellow]")
    elif update and results:
        payload = json.loads(Path(results).read_text())
        records = payload if isinstance(payload, list) else [payload]

        reports: list[ScoreReport] = []
        results_by_model: dict[str, list[TaskResult]] = {}
        try:
            for record in records:
                if not isinstance(record, dict):
                    raise ValueError(f"expected a JSON object, got {type(record).__name__}")
                if "task_id" in record:
                    task_result = TaskResult(**record)
                    results_by_model.setdefault(task_result.model, []).append(task_result)
                else:
                    reports.append(ScoreReport(**record))
        except (TypeError, ValueError) as exc:
            # Model validation failures surface as ValueError subclasses.
            raise click.ClickException(f"Invalid results file {results}: {exc}") from exc

        # Raw task results: let the leaderboard module compute the scores.
        for model_name, model_results in results_by_model.items():
            lb = update_leaderboard(lb, model_name, model_results)

        # Pre-scored reports: keep their scores, one entry per model.
        for report in reports:
            for idx, existing in enumerate(lb.entries):
                if existing.model == report.model:
                    lb.entries[idx] = report
                    break
            else:
                lb.entries.append(report)

        if reports:
            # Re-rank after direct insertion; update_leaderboard() already
            # does this for the task-result path.
            lb = sort_leaderboard(lb)
            lb.last_updated = datetime.now(timezone.utc).isoformat()

        save_leaderboard(lb, path)
        console.print(f"[green]Leaderboard updated and saved to {path}[/green]")

    if format == "markdown":
        console.print(export_markdown(lb))
    else:
        table = Table(title="BenchAgent Leaderboard")
        table.add_column("Rank", style="bold")
        table.add_column("Model", style="cyan")
        table.add_column("Score", style="green")
        table.add_column("Recovery Rate", style="yellow")

        for entry in lb.entries:
            table.add_row(
                str(entry.leaderboard_rank),
                entry.model,
                f"{entry.total_score:.1f}",
                f"{entry.error_recovery_rate:.3f}",
            )

        console.print(table)
        console.print(f"\nLast updated: {lb.last_updated}")
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
@main.command("export")
@click.option("--path", type=click.Path(), default="leaderboard.json", help="Leaderboard file path")
@click.option("--format", "-f", type=click.Choice(["json", "markdown"]), default="json")
@click.option("--output", "-o", type=click.Path(), default=None)
def export(path: str, format: str, output: str | None) -> None:
    """Export leaderboard in the specified format."""
    rendered = export_leaderboard(load_leaderboard(path), format=format)

    # Without --output the rendered leaderboard goes to the console.
    if not output:
        console.print(rendered)
        return

    Path(output).write_text(rendered)
    console.print(f"Exported to: {output}")
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
# Run the CLI when this module is executed directly.
if __name__ == "__main__":
    main()
|
bench_agent/evaluator.py
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
"""Model evaluation orchestrator for BenchAgent benchmark."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import time
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
from bench_agent.models import ScoreReport, Task, TaskCategory, TaskResult
|
|
10
|
+
from bench_agent.runner import TaskRunner
|
|
11
|
+
from bench_agent.scorer import (
|
|
12
|
+
calculate_category_scores,
|
|
13
|
+
calculate_overall_score,
|
|
14
|
+
error_recovery_score,
|
|
15
|
+
)
|
|
16
|
+
from bench_agent.tasks import ALL_TASKS, TASKS_BY_CATEGORY
|
|
17
|
+
|
|
18
|
+
# Instruction template for one task. Placeholders ({tools}, {description},
# {workdir}, {initial_files}, {max_turns}) are filled in with str.format.
PROMPT_TEMPLATE = (
    "You are an AI assistant being evaluated on your tool-use capabilities.\n"
    "\n"
    "You have access to the following tools: {tools}\n"
    "\n"
    "Your task: {description}\n"
    "\n"
    "Working directory: {workdir}\n"
    "\n"
    "{initial_files}\n"
    "\n"
    "You must complete this task using the tools available. You have {max_turns} turns maximum.\n"
    "Be precise and thorough. After completing the task, verify your work.\n"
)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class ModelProvider:
    """Namespace of string constants used as ``provider`` argument values."""

    OPENAI = "openai"
    ANTHROPIC = "anthropic"
    LOCAL = "local"
    HUGGINGFACE = "huggingface"
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _build_prompt(task: Task, workdir: str) -> str:
    """Fill ``PROMPT_TEMPLATE`` with the details of *task*.

    Args:
        task: Task whose description, tools, turn limit and initial files
            are rendered into the prompt.
        workdir: Working-directory path shown to the model.

    Returns:
        The formatted prompt text. When the task has initial files, each one
        is listed under an "Initial files:" heading; otherwise that section
        is empty.
    """
    if task.initial_state:
        listing = "".join(
            f"\n--- {name} ---\n{body}\n" for name, body in task.initial_state.items()
        )
        files_section = "Initial files:\n" + listing
    else:
        files_section = ""

    return PROMPT_TEMPLATE.format(
        tools=", ".join(task.tools_required),
        description=task.description,
        workdir=workdir,
        initial_files=files_section,
        max_turns=task.max_turns,
    )
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def evaluate_model(
    model_name: str,
    provider: str = ModelProvider.OPENAI,
    categories: list[TaskCategory] | None = None,
    num_tasks: int | None = None,
    api_key: str | None = None,
    base_url: str | None = None,
    runner: TaskRunner | None = None,
) -> ScoreReport:
    """Run the selected tasks for *model_name* and summarise the scores.

    Args:
        model_name: Name recorded on every result and on the report.
        provider: Accepted for API compatibility; not read by this function.
        categories: Restrict the run to these categories; a falsy value
            selects ``ALL_TASKS``.
        num_tasks: If truthy, keep only the first ``num_tasks`` tasks.
        api_key: Accepted for API compatibility; not read by this function.
        base_url: Accepted for API compatibility; not read by this function.
        runner: Runner used to execute tasks; a default ``TaskRunner`` is
            created when omitted.

    Returns:
        A ``ScoreReport`` with the overall score, per-category scores and the
        mean error-recovery score rounded to three decimals.
    """
    active_runner = runner or TaskRunner()

    if categories:
        selected: list[Task] = [
            task for cat in categories for task in TASKS_BY_CATEGORY.get(cat, [])
        ]
    else:
        selected = list(ALL_TASKS)

    if num_tasks:
        selected = selected[:num_tasks]

    outcomes: list[TaskResult] = []
    for task in selected:
        try:
            outcome = active_runner.run_task(task, model_name)
        except Exception as exc:
            # A crashing task is recorded as a failure instead of aborting
            # the whole evaluation.
            outcome = TaskResult(
                task_id=task.task_id,
                model=model_name,
                success=False,
                errors=[str(exc)],
            )
        outcomes.append(outcome)

    total_score = calculate_overall_score(outcomes)
    category_scores = calculate_category_scores(outcomes)

    recovery = [error_recovery_score(item) for item in outcomes]
    if recovery:
        avg_recovery = sum(recovery) / len(recovery)
    else:
        avg_recovery = 0.0

    return ScoreReport(
        model=model_name,
        total_score=total_score,
        category_scores=category_scores,
        error_recovery_rate=round(avg_recovery, 3),
    )
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def evaluate_with_retry(
    model_name: str,
    provider: str = ModelProvider.OPENAI,
    categories: list[TaskCategory] | None = None,
    num_tasks: int | None = None,
    max_retries: int = 3,
    retry_delay: float = 5.0,
    **kwargs: Any,
) -> ScoreReport:
    """Call ``evaluate_model`` with exponential back-off on failure.

    Args:
        model_name: Forwarded to ``evaluate_model``.
        provider: Forwarded to ``evaluate_model``.
        categories: Forwarded to ``evaluate_model``.
        num_tasks: Forwarded to ``evaluate_model``.
        max_retries: Total number of attempts. Values below 1 are treated as
            a single attempt so the evaluation always runs at least once.
        retry_delay: Base delay in seconds; attempt ``k`` (0-based) is
            followed by a sleep of ``retry_delay * 2**k`` before retrying.
        **kwargs: Extra keyword arguments forwarded to ``evaluate_model``.

    Returns:
        The ``ScoreReport`` from the first successful attempt.

    Raises:
        Exception: Whatever the final attempt raised, re-raised unchanged.
    """
    # Guard against max_retries <= 0, which previously skipped the loop and
    # fell through to the misleading "Unreachable" error without evaluating.
    attempts = max(1, max_retries)

    for attempt in range(attempts):
        try:
            return evaluate_model(
                model_name=model_name,
                provider=provider,
                categories=categories,
                num_tasks=num_tasks,
                **kwargs,
            )
        except Exception:
            if attempt == attempts - 1:
                raise
            time.sleep(retry_delay * (2**attempt))

    # Kept for static analysers: the loop always returns or raises.
    raise RuntimeError("Unreachable")
|
bench_agent/leaderboard.py
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
"""Leaderboard management for BenchAgent benchmark results."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from datetime import datetime, timezone
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
from bench_agent.models import Leaderboard, ScoreReport, TaskResult
|
|
10
|
+
from bench_agent.scorer import calculate_overall_score, calculate_category_scores, error_recovery_score
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def load_leaderboard(path: str | Path) -> Leaderboard:
    """Read a leaderboard from *path*, or start a new one if it is absent.

    Args:
        path: Location of the leaderboard JSON file.

    Returns:
        The ``Leaderboard`` parsed from the file, or an empty leaderboard
        stamped with the current UTC time when the file does not exist.
    """
    source = Path(path)
    if source.exists():
        return Leaderboard(**json.loads(source.read_text()))
    return Leaderboard(last_updated=datetime.now(timezone.utc).isoformat())
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def update_leaderboard(
    leaderboard: Leaderboard, model_name: str, results: list[TaskResult]
) -> Leaderboard:
    """Score *results* and store them as the entry for *model_name*.

    Args:
        leaderboard: Leaderboard to modify (it is mutated and returned).
        model_name: Model the results belong to.
        results: Task results to score; an empty list yields a recovery
            rate of 0.0.

    Returns:
        The same leaderboard, re-sorted, with ``last_updated`` set to the
        current UTC time.
    """
    total_score = calculate_overall_score(results)
    category_scores = calculate_category_scores(results)

    recovery = [error_recovery_score(item) for item in results]
    if recovery:
        avg_recovery = sum(recovery) / len(recovery)
    else:
        avg_recovery = 0.0

    report = ScoreReport(
        model=model_name,
        total_score=total_score,
        category_scores=category_scores,
        error_recovery_rate=round(avg_recovery, 3),
    )

    # Replace the first entry for this model if one exists, else append.
    position = next(
        (idx for idx, entry in enumerate(leaderboard.entries) if entry.model == model_name),
        None,
    )
    if position is None:
        leaderboard.entries.append(report)
    else:
        leaderboard.entries[position] = report

    leaderboard = sort_leaderboard(leaderboard)
    leaderboard.last_updated = datetime.now(timezone.utc).isoformat()
    return leaderboard
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def sort_leaderboard(leaderboard: Leaderboard) -> Leaderboard:
    """Order entries by descending total score and renumber their ranks.

    The leaderboard is modified in place and also returned. Ranks start at 1;
    entries with equal scores keep their relative order (stable sort).
    """
    ranked = sorted(leaderboard.entries, key=lambda item: item.total_score, reverse=True)
    for rank, item in enumerate(ranked, start=1):
        item.leaderboard_rank = rank
    leaderboard.entries = ranked
    return leaderboard
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def export_leaderboard(leaderboard: Leaderboard, format: str = "json") -> str:
    """Serialise *leaderboard* as text.

    Args:
        leaderboard: Leaderboard to serialise.
        format: ``"json"`` (indented JSON) or ``"markdown"`` (table).

    Returns:
        The serialised leaderboard.

    Raises:
        ValueError: If *format* is neither ``"json"`` nor ``"markdown"``.
    """
    if format == "json":
        return leaderboard.model_dump_json(indent=2)
    if format == "markdown":
        return export_markdown(leaderboard)
    raise ValueError(f"Unknown format: {format}")
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def export_markdown(leaderboard: Leaderboard) -> str:
    """Render *leaderboard* as a Markdown document with one table row per entry.

    Category scores missing from an entry are shown as ``0.0``. The returned
    text ends with a trailing newline.
    """
    category_keys = ("bash", "edit", "read", "write", "multi_tool", "error_recovery")

    lines = [
        "# BenchAgent Leaderboard",
        "",
        f"Last updated: {leaderboard.last_updated}",
        "",
        "| Rank | Model | Total Score | Error Recovery | Bash | Edit | Read | Write | Multi-Tool | Error Recovery Cat |",
        "|------|-------|-------------|-----------------|------|------|------|-------|------------|-------------------|",
    ]

    for entry in leaderboard.entries:
        scores = entry.category_scores
        category_cells = " | ".join(f"{scores.get(key, 0):.1f}" for key in category_keys)
        lines.append(
            f"| {entry.leaderboard_rank} | {entry.model} | {entry.total_score:.1f} | "
            f"{entry.error_recovery_rate:.3f} | {category_cells} |"
        )

    lines.append("")
    return "\n".join(lines)
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def save_leaderboard(leaderboard: Leaderboard, path: str | Path) -> None:
    """Write *leaderboard* to *path* as JSON, creating parent directories."""
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    serialized = export_leaderboard(leaderboard, format="json")
    target.write_text(serialized)
|
bench_agent/models.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
"""Pydantic models for BenchAgent benchmark definitions and results."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from enum import Enum
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from pydantic import BaseModel, Field
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
# Closed set of task categories. Members are also plain strings (str mixin),
# so they compare equal to their values; declaration order is the iteration
# order.
class TaskCategory(str, Enum):
    BASH = "bash"
    EDIT = "edit"
    READ = "read"
    WRITE = "write"
    MULTI_TOOL = "multi_tool"
    ERROR_RECOVERY = "error_recovery"
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
# Closed set of task difficulty levels; members double as plain strings.
class Difficulty(str, Enum):
    EASY = "easy"
    MEDIUM = "medium"
    HARD = "hard"
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# Definition of one benchmark task: what to do, the files it starts from, and
# how success is checked.
class Task(BaseModel):
    # Identity and classification.
    task_id: str
    category: TaskCategory
    difficulty: Difficulty
    description: str

    # Starting files for the task.
    initial_state: dict[str, str] = Field(
        default_factory=dict,
        description="Mapping of filename -> file content to create before the task",
    )

    # Declarative success criteria.
    expected_outcome: dict[str, Any] = Field(
        default_factory=dict,
        description="Expected result: keys may include 'files', 'exit_code', 'stdout', 'file_exists'",
    )

    tools_required: list[str] = Field(default_factory=list)
    max_turns: int = 10
    # Optional verification source; empty by default.
    verification_script: str = ""
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
# Outcome of running one task for one model. Only task_id and model are
# required; every other field has a zero/empty default.
class TaskResult(BaseModel):
    task_id: str
    model: str

    success: bool = False
    turns_used: int = 0
    tokens_used: int = 0
    errors: list[str] = Field(default_factory=list)
    recovery_attempts: int = 0
    duration_seconds: float = 0.0
    actual_output: dict[str, Any] = Field(default_factory=dict)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
# Aggregated scores for one model. Score maps are keyed by name;
# leaderboard_rank defaults to 0 until a rank is assigned.
class ScoreReport(BaseModel):
    model: str

    total_score: float = 0.0
    category_scores: dict[str, float] = Field(default_factory=dict)
    tool_scores: dict[str, float] = Field(default_factory=dict)
    error_recovery_rate: float = 0.0
    leaderboard_rank: int = 0
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
# Collection of score reports plus a free-form "last updated" string.
class Leaderboard(BaseModel):
    entries: list[ScoreReport] = Field(default_factory=list)
    last_updated: str = ""
|
bench_agent/runner.py
ADDED
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
"""Task execution runner with sandboxing, timeouts, and token tracking."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
import shutil
|
|
7
|
+
import subprocess
|
|
8
|
+
import tempfile
|
|
9
|
+
import time
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import Any
|
|
12
|
+
|
|
13
|
+
from bench_agent.models import Task, TaskCategory, TaskResult
|
|
14
|
+
from bench_agent.tasks import TASKS_BY_CATEGORY, ALL_TASKS
|
|
15
|
+
|
|
16
|
+
import json
|
|
17
|
+
|
|
18
|
+
class TaskRunner:
    """Run benchmark tasks inside per-task sandbox directories.

    NOTE(review): ``run_task`` never calls a model. Its turn loop only
    enforces the total timeout, so the reported outcome comes from verifying
    the freshly seeded sandbox. ``per_turn_timeout`` is stored but not read
    anywhere in this class.
    """

    def __init__(
        self,
        sandbox_root: str | Path | None = None,
        per_turn_timeout: float = 30.0,
        total_timeout: float = 300.0,
    ) -> None:
        """Configure the runner.

        Args:
            sandbox_root: Directory under which per-task sandboxes are
                created; a fresh temporary directory is made when omitted.
            per_turn_timeout: Per-turn time budget in seconds (stored only).
            total_timeout: Wall-clock budget in seconds for one task.
        """
        self.sandbox_root = Path(sandbox_root) if sandbox_root else Path(tempfile.mkdtemp())
        self.per_turn_timeout = per_turn_timeout
        self.total_timeout = total_timeout

    def _setup_sandbox(self, task: Task, sandbox_dir: Path) -> Path:
        """Create *sandbox_dir* and write the task's initial files into it."""
        sandbox_dir.mkdir(parents=True, exist_ok=True)
        for filename, content in task.initial_state.items():
            filepath = sandbox_dir / filename
            # File names may contain sub-directories.
            filepath.parent.mkdir(parents=True, exist_ok=True)
            filepath.write_text(content)
        return sandbox_dir

    def _cleanup_sandbox(self, sandbox_dir: Path) -> None:
        """Remove *sandbox_dir* and its contents, ignoring removal errors."""
        if sandbox_dir.exists():
            shutil.rmtree(sandbox_dir, ignore_errors=True)

    def _verify_task(self, task: Task, sandbox_dir: Path) -> bool:
        """Return True when the task's success criteria hold in the sandbox.

        Uses the task's verification script when present (success means exit
        code 0 within 15 seconds); otherwise falls back to the declarative
        ``expected_outcome`` checks.
        """
        if not task.verification_script:
            return self._verify_expected_outcome(task, sandbox_dir)
        try:
            result = subprocess.run(
                ["python3", "-c", task.verification_script],
                cwd=str(sandbox_dir),
                capture_output=True,
                text=True,
                timeout=15,
            )
            return result.returncode == 0
        except Exception:
            # Best effort: a timeout, a missing interpreter or any other
            # failure to run the script counts as "not verified".
            return False

    def _verify_expected_outcome(self, task: Task, sandbox_dir: Path) -> bool:
        """Check ``task.expected_outcome`` against the sandbox contents.

        Supported keys: ``files`` (name -> expected content, compared after
        stripping surrounding whitespace) and ``file_exists`` (names that
        must exist). An empty mapping verifies trivially.
        """
        expected = task.expected_outcome
        if not expected:
            return True

        if "files" in expected:
            for filename, expected_content in expected["files"].items():
                filepath = sandbox_dir / filename
                if not filepath.exists():
                    return False
                actual = filepath.read_text()
                if actual.strip() != expected_content.strip():
                    return False

        if "file_exists" in expected:
            for filename in expected["file_exists"]:
                if not (sandbox_dir / filename).exists():
                    return False

        if "exit_code" in expected:
            # Not enforced: no command exit code is available here.
            pass

        return True

    def _count_tokens(self, text: str) -> int:
        """Estimate the token count as one per four characters, minimum 1."""
        return max(1, len(text) // 4)

    def run_task(
        self, task: Task, model: str, max_turns: int | None = None
    ) -> TaskResult:
        """Run *task* in a fresh sandbox and report the outcome.

        Args:
            task: Task to execute.
            model: Model name recorded on the result.
            max_turns: Turn limit; a falsy value uses ``task.max_turns``.

        Returns:
            A ``TaskResult``. Any exception raised while preparing, running
            or verifying the task is recorded in ``errors`` with
            ``success=False`` rather than propagated.
        """
        max_turns = max_turns or task.max_turns
        sandbox_dir = self.sandbox_root / f"task_{task.task_id}"
        errors: list[str] = []
        recovery_attempts = 0
        # Bound before the try block: if sandbox setup raises, the except
        # branch must still leave every field of the result defined
        # (tokens_used was previously unbound on that path).
        success = False
        turns_used = 1
        tokens_used = 0
        start_time = time.time()

        try:
            self._setup_sandbox(task, sandbox_dir)

            prompt = f"You are a coding assistant. Complete this task:\n\n{task.description}\n\n"
            prompt += f"Working directory: {sandbox_dir}\n"
            prompt += f"Tools allowed: {', '.join(task.tools_required)}\n"
            prompt += f"Max turns: {max_turns}\n"

            if task.initial_state:
                prompt += "\nInitial files:\n"
                for fname, content in task.initial_state.items():
                    prompt += f"\n--- {fname} ---\n{content}\n"

            tokens_used += self._count_tokens(prompt)

            for turn in range(max_turns):
                turns_used = turn + 1
                if time.time() - start_time > self.total_timeout:
                    errors.append(f"Total timeout exceeded after {turns_used} turns")
                    break

            success = self._verify_task(task, sandbox_dir)

        except Exception as e:
            errors.append(str(e))
            success = False
            turns_used = 1
        finally:
            self._cleanup_sandbox(sandbox_dir)

        duration = time.time() - start_time

        return TaskResult(
            task_id=task.task_id,
            model=model,
            success=success,
            turns_used=turns_used,
            tokens_used=tokens_used,
            errors=errors,
            recovery_attempts=recovery_attempts,
            duration_seconds=round(duration, 2),
        )

    def run_benchmark(
        self,
        model: str,
        categories: list[TaskCategory] | None = None,
        num_tasks: int | None = None,
    ) -> list[TaskResult]:
        """Run a selection of tasks sequentially and collect their results.

        Args:
            model: Model name recorded on each result.
            categories: Restrict the run to these categories; a falsy value
                selects ``ALL_TASKS``.
            num_tasks: If truthy, keep only the first ``num_tasks`` tasks.

        Returns:
            One ``TaskResult`` per executed task, in execution order.
        """
        if categories:
            tasks: list[Task] = []
            for cat in categories:
                tasks.extend(TASKS_BY_CATEGORY.get(cat, []))
        else:
            tasks = list(ALL_TASKS)

        if num_tasks:
            tasks = tasks[:num_tasks]

        return [self.run_task(task, model) for task in tasks]
|