fableforge-bench-agent 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,3 @@
1
+ """BenchAgent — HumanEval for tool use."""
2
+
3
# Package version string; the CLI's --version option in cli.py reports the same "0.1.0" literal.
+ __version__ = "0.1.0"
bench_agent/cli.py ADDED
@@ -0,0 +1,199 @@
1
+ """CLI interface for BenchAgent benchmark runner."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import sys
7
+ from pathlib import Path
8
+
9
+ import click
10
+ from rich.console import Console
11
+ from rich.table import Table
12
+
13
+ from bench_agent.evaluator import ModelProvider, evaluate_model, evaluate_with_retry
14
+ from bench_agent.leaderboard import (
15
+ export_leaderboard,
16
+ export_markdown,
17
+ load_leaderboard,
18
+ save_leaderboard,
19
+ update_leaderboard,
20
+ )
21
+ from bench_agent.models import Leaderboard, TaskCategory
22
+ from bench_agent.runner import TaskRunner
23
+ from bench_agent.tasks import ALL_TASKS, TASKS_BY_CATEGORY, get_task_count
24
+
25
+
26
# Module-wide Rich console; every CLI command below prints through it.
+ console = Console()
27
+
28
+
29
# Root Click command group; subcommands register themselves with @main.command(...).
# The function body is intentionally empty: the docstring doubles as the --help text.
+ @click.group()
30
+ @click.version_option(version="0.1.0")  # adds a --version flag reporting 0.1.0
31
+ def main() -> None:
32
+ """BenchAgent — HumanEval for tool use."""
33
+
34
+
35
@main.command("list-tasks")
@click.option("--category", "-c", type=click.Choice([c.value for c in TaskCategory]), default=None)
@click.option("--difficulty", "-d", type=click.Choice(["easy", "medium", "hard"]), default=None)
def list_tasks(category: str | None, difficulty: str | None) -> None:
    """List available benchmark tasks."""
    # Table skeleton: one (header, style) pair per column, in display order.
    table = Table(title="BenchAgent Tasks")
    for header, style in (
        ("ID", "cyan"),
        ("Category", "green"),
        ("Difficulty", "yellow"),
        ("Description", "white"),
        ("Max Turns", "magenta"),
    ):
        table.add_column(header, style=style)

    # A filter that was not supplied (None / empty) matches every task.
    selected = [
        item
        for item in ALL_TASKS
        if (not category or item.category.value == category)
        and (not difficulty or item.difficulty.value == difficulty)
    ]

    for item in selected:
        text = item.description
        # Descriptions longer than 60 characters are cut and marked with "...".
        summary = text if len(text) <= 60 else text[:60] + "..."
        table.add_row(
            item.task_id,
            item.category.value,
            item.difficulty.value,
            summary,
            str(item.max_turns),
        )

    console.print(table)
    console.print(f"\nTotal: {len(selected)} tasks (of {get_task_count()} total)")
64
+
65
+
66
@main.command("run")
@click.option("--model", "-m", required=True, help="Model name to evaluate")
@click.option("--category", "-c", type=click.Choice([c.value for c in TaskCategory]), default=None, help="Task category")
@click.option("--all", "run_all", is_flag=True, help="Run all categories")
@click.option("--num-tasks", "-n", type=int, default=None, help="Number of tasks to run")
@click.option("--output", "-o", type=click.Path(), default=None, help="Output file for results")
@click.option("--provider", "-p", type=click.Choice(["openai", "anthropic", "local", "huggingface"]), default="openai")
@click.option("--timeout", type=float, default=300.0, help="Total timeout per task in seconds")
@click.option("--retries", type=int, default=3, help="Max retries on API errors")
def run(
    model: str,
    category: str | None,
    run_all: bool,
    num_tasks: int | None,
    output: str | None,
    provider: str,
    timeout: float,
    retries: int,
) -> None:
    """Run benchmark tasks against a model."""
    # Local import: user-supplied text (model name, exception message, output
    # path) is interpolated into Rich markup below and must be escaped, else
    # text such as "[/tmp/x]" is parsed as a style tag and raises MarkupError.
    from rich.markup import escape

    # --all wins over --category; with neither flag, `categories` stays None
    # and the evaluator falls back to its own "all tasks" default.
    categories = None
    if run_all:
        categories = list(TaskCategory)
    elif category:
        categories = [TaskCategory(category)]

    console.print(f"[bold blue]Running BenchAgent for model: {escape(model)}[/bold blue]")
    console.print(f"Provider: {provider}")

    runner = TaskRunner(total_timeout=timeout)

    if categories:
        cat_names = [c.value for c in categories]
        console.print(f"Categories: {', '.join(cat_names)}")
    else:
        console.print("Categories: all")

    # Top-level CLI boundary: report any failure and exit non-zero instead of
    # dumping a traceback.
    try:
        report = evaluate_with_retry(
            model_name=model,
            provider=provider,
            categories=categories,
            num_tasks=num_tasks,
            max_retries=retries,
            runner=runner,
        )
    except Exception as e:
        console.print(f"[bold red]Error: {escape(str(e))}[/bold red]")
        sys.exit(1)

    console.print(f"\n[bold green]Results for {escape(model)}[/bold green]")
    console.print(f"Total Score: {report.total_score}")
    console.print(f"Error Recovery Rate: {report.error_recovery_rate}")

    if report.category_scores:
        table = Table(title="Category Scores")
        table.add_column("Category", style="cyan")
        table.add_column("Score", style="green")
        for cat, score in report.category_scores.items():
            table.add_row(cat, f"{score:.1f}")
        console.print(table)

    if output:
        # Persist the report as JSON, creating missing parent directories.
        output_path = Path(output)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        output_path.write_text(report.model_dump_json(indent=2))
        console.print(f"\nResults saved to: {escape(output)}")
133
+
134
+
135
def _apply_results(lb: Leaderboard, data: object) -> Leaderboard:
    """Merge parsed results-file content into ``lb`` and return it re-ranked.

    ``data`` may be a single JSON object or a list of them. An object with a
    ``task_id`` key is treated as one task result; results are grouped per
    model and scored through ``update_leaderboard``. Any other object is
    treated as an already-computed score report and replaces (or adds) the
    entry for its model. A missing ``model`` key falls back to ``"unknown"``.

    Raises:
        ValueError: if an item is not a JSON object or fails model validation.
    """
    from datetime import datetime, timezone

    from bench_agent.leaderboard import sort_leaderboard
    from bench_agent.models import ScoreReport, TaskResult

    items = data if isinstance(data, list) else [data]
    results_by_model: dict[str, list[TaskResult]] = {}
    reports: list[ScoreReport] = []

    for item in items:
        if not isinstance(item, dict):
            raise ValueError(f"Unsupported results entry: {item!r}")
        payload = {"model": "unknown", **item}
        if "task_id" in item:
            result = TaskResult(**payload)
            results_by_model.setdefault(result.model, []).append(result)
        else:
            reports.append(ScoreReport(**payload))

    # Raw task results are scored from the actual result list, never an empty
    # one, so an update cannot zero out a model's entry.
    for model_name, task_results in results_by_model.items():
        lb = update_leaderboard(lb, model_name, task_results)

    # Pre-computed reports keep their scores verbatim; one entry per model.
    for report in reports:
        for idx, existing in enumerate(lb.entries):
            if existing.model == report.model:
                lb.entries[idx] = report
                break
        else:
            lb.entries.append(report)

    if reports:
        lb = sort_leaderboard(lb)
        lb.last_updated = datetime.now(timezone.utc).isoformat()
    return lb


@main.command("leaderboard")
@click.option("--update", is_flag=True, help="Update leaderboard with new results")
@click.option("--results", type=click.Path(exists=True), help="Results file to add")
@click.option("--path", type=click.Path(), default="leaderboard.json", help="Leaderboard file path")
@click.option("--format", "-f", type=click.Choice(["json", "markdown"]), default="json", help="Export format")
def leaderboard(update: bool, results: str | None, path: str, format: str) -> None:
    """Show or update the leaderboard."""
    # Model names are free-form text; escape them before Rich renders markup.
    from rich.markup import escape

    lb = load_leaderboard(path)

    if update:
        if not results:
            # Previously a silent no-op; make the missing argument explicit.
            raise click.UsageError("--update requires --results <file>")
        try:
            data = json.loads(Path(results).read_text())
            lb = _apply_results(lb, data)
        except (OSError, ValueError, TypeError) as exc:
            # JSON decode errors and model validation errors are ValueErrors.
            raise click.ClickException(f"Could not load results from {results}: {exc}") from exc
        save_leaderboard(lb, path)
        console.print(f"[green]Leaderboard updated and saved to {escape(path)}[/green]")

    if format == "markdown":
        console.print(export_markdown(lb))
    else:
        table = Table(title="BenchAgent Leaderboard")
        table.add_column("Rank", style="bold")
        table.add_column("Model", style="cyan")
        table.add_column("Score", style="green")
        table.add_column("Recovery Rate", style="yellow")

        for entry in lb.entries:
            table.add_row(
                str(entry.leaderboard_rank),
                escape(entry.model),
                f"{entry.total_score:.1f}",
                f"{entry.error_recovery_rate:.3f}",
            )

        console.print(table)
        console.print(f"\nLast updated: {lb.last_updated}")
180
+
181
+
182
@main.command("export")
@click.option("--path", type=click.Path(), default="leaderboard.json", help="Leaderboard file path")
@click.option("--format", "-f", type=click.Choice(["json", "markdown"]), default="json")
@click.option("--output", "-o", type=click.Path(), default=None)
def export(path: str, format: str, output: str | None) -> None:
    """Export leaderboard in the specified format."""
    lb = load_leaderboard(path)
    content = export_leaderboard(lb, format=format)

    if output:
        destination = Path(output)
        # Create missing parent directories, as the `run` command does for its
        # own --output file.
        destination.parent.mkdir(parents=True, exist_ok=True)
        destination.write_text(content)
        console.print(f"Exported to: {output}")
    else:
        # Emit the payload verbatim: Rich would wrap long table rows, apply
        # highlighting and parse "[...]" sequences as markup.
        click.echo(content)
196
+
197
+
198
# Script entry point: running this module directly invokes the Click group.
+ if __name__ == "__main__":
199
+ main()
@@ -0,0 +1,129 @@
1
+ """Model evaluation orchestrator for BenchAgent benchmark."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import time
7
+ from typing import Any
8
+
9
+ from bench_agent.models import ScoreReport, Task, TaskCategory, TaskResult
10
+ from bench_agent.runner import TaskRunner
11
+ from bench_agent.scorer import (
12
+ calculate_category_scores,
13
+ calculate_overall_score,
14
+ error_recovery_score,
15
+ )
16
+ from bench_agent.tasks import ALL_TASKS, TASKS_BY_CATEGORY
17
+
18
# Prompt skeleton for one benchmark task. Placeholders filled via str.format:
# {tools}, {description}, {workdir}, {initial_files}, {max_turns}.
PROMPT_TEMPLATE = (
    "You are an AI assistant being evaluated on your tool-use capabilities.\n"
    "\n"
    "You have access to the following tools: {tools}\n"
    "\n"
    "Your task: {description}\n"
    "\n"
    "Working directory: {workdir}\n"
    "\n"
    "{initial_files}\n"
    "\n"
    "You must complete this task using the tools available. You have {max_turns} turns maximum.\n"
    "Be precise and thorough. After completing the task, verify your work.\n"
)
31
+
32
+
33
class ModelProvider:
    """Namespace of provider identifier strings accepted by the evaluator."""

    # Plain string constants (not an Enum), so they compare equal to raw CLI values.
    OPENAI = "openai"
    ANTHROPIC = "anthropic"
    LOCAL = "local"
    HUGGINGFACE = "huggingface"
38
+
39
+
40
def _build_prompt(task: Task, workdir: str) -> str:
    """Render PROMPT_TEMPLATE for ``task``, listing its initial files if any."""
    if task.initial_state:
        # One "--- name ---" section per seeded file, in mapping order.
        listing = "".join(
            f"\n--- {name} ---\n{body}\n" for name, body in task.initial_state.items()
        )
        files_section = "Initial files:\n" + listing
    else:
        files_section = ""

    fields = {
        "tools": ", ".join(task.tools_required),
        "description": task.description,
        "workdir": workdir,
        "initial_files": files_section,
        "max_turns": task.max_turns,
    }
    return PROMPT_TEMPLATE.format(**fields)
54
+
55
+
56
def evaluate_model(
    model_name: str,
    provider: str = ModelProvider.OPENAI,
    categories: list[TaskCategory] | None = None,
    num_tasks: int | None = None,
    api_key: str | None = None,
    base_url: str | None = None,
    runner: TaskRunner | None = None,
) -> ScoreReport:
    """Run the selected tasks for ``model_name`` and summarise them as a ScoreReport.

    Tasks come from the given categories (in order) or from ALL_TASKS when no
    categories are given; a truthy ``num_tasks`` keeps only that many from the
    front. A task whose run raises is recorded as a failed result carrying the
    exception text. ``provider``, ``api_key`` and ``base_url`` are accepted but
    not read by this function.
    """
    if not runner:
        runner = TaskRunner()

    if categories:
        selected: list[Task] = [
            task for cat in categories for task in TASKS_BY_CATEGORY.get(cat, [])
        ]
    else:
        selected = list(ALL_TASKS)

    if num_tasks:
        selected = selected[:num_tasks]

    def _attempt(task: Task) -> TaskResult:
        # Convert any runner failure into a failed result so one bad task
        # does not abort the whole evaluation.
        try:
            return runner.run_task(task, model_name)
        except Exception as exc:
            return TaskResult(
                task_id=task.task_id,
                model=model_name,
                success=False,
                errors=[str(exc)],
            )

    outcomes: list[TaskResult] = [_attempt(task) for task in selected]

    overall = calculate_overall_score(outcomes)
    per_category = calculate_category_scores(outcomes)

    recoveries = [error_recovery_score(outcome) for outcome in outcomes]
    if recoveries:
        mean_recovery = sum(recoveries) / len(recoveries)
    else:
        mean_recovery = 0.0

    return ScoreReport(
        model=model_name,
        total_score=overall,
        category_scores=per_category,
        error_recovery_rate=round(mean_recovery, 3),
    )
104
+
105
+
106
def evaluate_with_retry(
    model_name: str,
    provider: str = ModelProvider.OPENAI,
    categories: list[TaskCategory] | None = None,
    num_tasks: int | None = None,
    max_retries: int = 3,
    retry_delay: float = 5.0,
    **kwargs: Any,
) -> ScoreReport:
    """Call ``evaluate_model``, retrying with exponential backoff on failure.

    Args:
        model_name: Model to evaluate.
        provider: Provider identifier forwarded to ``evaluate_model``.
        categories: Optional category filter forwarded to ``evaluate_model``.
        num_tasks: Optional task limit forwarded to ``evaluate_model``.
        max_retries: Total number of attempts. Values below 1 are treated as
            1 so the evaluation always runs at least once.
        retry_delay: Base delay in seconds; attempt ``i`` (0-based) waits
            ``retry_delay * 2**i`` before the next try.
        **kwargs: Extra keyword arguments forwarded to ``evaluate_model``.

    Returns:
        The report from the first successful attempt.

    Raises:
        Exception: whatever the final attempt raised, re-raised unchanged.
    """
    # Guard against max_retries <= 0: the loop would otherwise never run and
    # the caller would get a misleading "Unreachable" error without any attempt.
    attempts = max(1, max_retries)

    for attempt in range(attempts):
        try:
            return evaluate_model(
                model_name=model_name,
                provider=provider,
                categories=categories,
                num_tasks=num_tasks,
                **kwargs,
            )
        except Exception:
            if attempt == attempts - 1:
                raise
            time.sleep(retry_delay * (2**attempt))

    # The loop always returns or raises; kept for static analysers.
    raise RuntimeError("Unreachable")
@@ -0,0 +1,99 @@
1
+ """Leaderboard management for BenchAgent benchmark results."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from datetime import datetime, timezone
7
+ from pathlib import Path
8
+
9
+ from bench_agent.models import Leaderboard, ScoreReport, TaskResult
10
+ from bench_agent.scorer import calculate_overall_score, calculate_category_scores, error_recovery_score
11
+
12
+
13
def load_leaderboard(path: str | Path) -> Leaderboard:
    """Read a leaderboard from ``path``; return an empty, freshly stamped one if the file is absent."""
    source = Path(path)
    if source.exists():
        payload = json.loads(source.read_text())
        return Leaderboard(**payload)

    # No file yet: start with no entries and the current UTC time as timestamp.
    stamp = datetime.now(timezone.utc).isoformat()
    return Leaderboard(last_updated=stamp)
20
+
21
+
22
def update_leaderboard(
    leaderboard: Leaderboard, model_name: str, results: list[TaskResult]
) -> Leaderboard:
    """Score ``results`` for ``model_name`` and upsert the report into ``leaderboard``.

    An existing entry for the same model is replaced in place, otherwise the
    report is appended. The leaderboard is then re-ranked and its timestamp
    refreshed. The passed-in leaderboard object is mutated and returned.
    """
    overall = calculate_overall_score(results)
    per_category = calculate_category_scores(results)

    recoveries = [error_recovery_score(item) for item in results]
    mean_recovery = sum(recoveries) / len(recoveries) if recoveries else 0.0

    # Position of the first entry already recorded for this model, if any.
    slot = next(
        (pos for pos, row in enumerate(leaderboard.entries) if row.model == model_name),
        None,
    )

    fresh = ScoreReport(
        model=model_name,
        total_score=overall,
        category_scores=per_category,
        error_recovery_rate=round(mean_recovery, 3),
    )

    if slot is None:
        leaderboard.entries.append(fresh)
    else:
        leaderboard.entries[slot] = fresh

    leaderboard = sort_leaderboard(leaderboard)
    leaderboard.last_updated = datetime.now(timezone.utc).isoformat()
    return leaderboard
53
+
54
+
55
def sort_leaderboard(leaderboard: Leaderboard) -> Leaderboard:
    """Order entries by descending total score and assign 1-based ranks in place."""
    ranked = list(leaderboard.entries)
    # Stable sort: entries with equal scores keep their previous relative order.
    ranked.sort(key=lambda row: row.total_score, reverse=True)

    for position, row in enumerate(ranked, start=1):
        row.leaderboard_rank = position

    leaderboard.entries = ranked
    return leaderboard
61
+
62
+
63
def export_leaderboard(leaderboard: Leaderboard, format: str = "json") -> str:
    """Serialise ``leaderboard`` as ``"json"`` or ``"markdown"`` text.

    Raises:
        ValueError: for any other ``format`` value.
    """
    if format == "json":
        return leaderboard.model_dump_json(indent=2)
    if format == "markdown":
        return export_markdown(leaderboard)
    raise ValueError(f"Unknown format: {format}")
70
+
71
+
72
def export_markdown(leaderboard: Leaderboard) -> str:
    """Render ``leaderboard`` as a Markdown document with one table row per entry.

    Category columns missing from an entry's ``category_scores`` are shown as 0.0.
    The returned text ends with a trailing newline.
    """
    # Category keys in the same left-to-right order as the table header.
    category_keys = ("bash", "edit", "read", "write", "multi_tool", "error_recovery")

    out = [
        "# BenchAgent Leaderboard",
        "",
        f"Last updated: {leaderboard.last_updated}",
        "",
        "| Rank | Model | Total Score | Error Recovery | Bash | Edit | Read | Write | Multi-Tool | Error Recovery Cat |",
        "|------|-------|-------------|-----------------|------|------|------|-------|------------|-------------------|",
    ]

    for row in leaderboard.entries:
        scores = row.category_scores
        cells = [
            f"{row.leaderboard_rank}",
            f"{row.model}",
            f"{row.total_score:.1f}",
            f"{row.error_recovery_rate:.3f}",
        ]
        cells.extend(f"{scores.get(key, 0):.1f}" for key in category_keys)
        out.append("| " + " | ".join(cells) + " |")

    out.append("")
    return "\n".join(out)
94
+
95
+
96
def save_leaderboard(leaderboard: Leaderboard, path: str | Path) -> None:
    """Write ``leaderboard`` to ``path`` as JSON, creating parent directories as needed."""
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    serialized = export_leaderboard(leaderboard, format="json")
    target.write_text(serialized)
bench_agent/models.py ADDED
@@ -0,0 +1,67 @@
1
+ """Pydantic models for BenchAgent benchmark definitions and results."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from enum import Enum
6
+ from typing import Any
7
+
8
+ from pydantic import BaseModel, Field
9
+
10
+
11
class TaskCategory(str, Enum):
    """Benchmark task categories; each member's value is its lowercase string name."""

    BASH = "bash"
    EDIT = "edit"
    READ = "read"
    WRITE = "write"
    MULTI_TOOL = "multi_tool"
    ERROR_RECOVERY = "error_recovery"
18
+
19
+
20
class Difficulty(str, Enum):
    """Task difficulty levels, declared from easiest to hardest."""

    EASY = "easy"
    MEDIUM = "medium"
    HARD = "hard"
24
+
25
+
26
# Definition of a single benchmark task: what to do, the files to seed the
# sandbox with, and how success is verified.
+ class Task(BaseModel):
27
+ task_id: str
28
+ category: TaskCategory
29
+ difficulty: Difficulty
30
+ description: str
31
# Files written into the sandbox before the task starts.
+ initial_state: dict[str, str] = Field(
32
+ default_factory=dict,
33
+ description="Mapping of filename -> file content to create before the task",
34
+ )
35
# Declarative success criteria; consulted by the runner only when
# verification_script is empty.
# NOTE(review): the runner in this package checks only 'files' and
# 'file_exists'; 'exit_code' and 'stdout' are not evaluated there.
+ expected_outcome: dict[str, Any] = Field(
36
+ default_factory=dict,
37
+ description="Expected result: keys may include 'files', 'exit_code', 'stdout', 'file_exists'",
38
+ )
39
+ tools_required: list[str] = Field(default_factory=list)
40
+ max_turns: int = 10
41
# Python source the runner executes with `python3 -c` inside the sandbox;
# exit code 0 means the task passed.
+ verification_script: str = ""
42
+
43
+
44
# Outcome of running one task for one model.
+ class TaskResult(BaseModel):
45
+ task_id: str
46
+ model: str
47
+ success: bool = False
48
+ turns_used: int = 0
49
+ tokens_used: int = 0
50
# Human-readable error messages collected during the run (exception text, timeouts).
+ errors: list[str] = Field(default_factory=list)
51
+ recovery_attempts: int = 0
52
# Wall-clock duration of the run in seconds.
+ duration_seconds: float = 0.0
53
+ actual_output: dict[str, Any] = Field(default_factory=dict)
54
+
55
+
56
# Aggregated scores for one model; also the row type stored in the leaderboard.
+ class ScoreReport(BaseModel):
57
+ model: str
58
+ total_score: float = 0.0
59
# Per-category scores keyed by category value (e.g. "bash", "multi_tool").
+ category_scores: dict[str, float] = Field(default_factory=dict)
60
+ tool_scores: dict[str, float] = Field(default_factory=dict)
61
# Mean per-result error-recovery score, rounded to 3 decimals by the producers.
+ error_recovery_rate: float = 0.0
62
# 1-based position assigned when the leaderboard is sorted; 0 means not yet ranked.
+ leaderboard_rank: int = 0
63
+
64
+
65
# Collection of per-model score reports plus a last-modified timestamp.
+ class Leaderboard(BaseModel):
66
+ entries: list[ScoreReport] = Field(default_factory=list)
67
# ISO-8601 timestamp string; the leaderboard helpers set it from the current UTC time.
+ last_updated: str = ""
bench_agent/runner.py ADDED
@@ -0,0 +1,159 @@
1
+ """Task execution runner with sandboxing, timeouts, and token tracking."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ import shutil
7
+ import subprocess
8
+ import tempfile
9
+ import time
10
+ from pathlib import Path
11
+ from typing import Any
12
+
13
+ from bench_agent.models import Task, TaskCategory, TaskResult
14
+ from bench_agent.tasks import TASKS_BY_CATEGORY, ALL_TASKS
15
+
16
+ import json
17
+
18
class TaskRunner:
    """Run benchmark tasks inside throwaway per-task sandbox directories.

    For each task the runner seeds a sandbox directory with the task's initial
    files, builds the prompt (to estimate token usage), steps through the turn
    budget while enforcing the total timeout, verifies the sandbox state and
    finally deletes the sandbox.

    NOTE(review): no model is actually invoked inside the turn loop in this
    class; the loop only tracks turns and the timeout.
    """

    def __init__(
        self,
        sandbox_root: str | Path | None = None,
        per_turn_timeout: float = 30.0,
        total_timeout: float = 300.0,
    ) -> None:
        """Configure the runner.

        Args:
            sandbox_root: Directory under which per-task sandboxes are created.
                A fresh temporary directory is made when omitted.
            per_turn_timeout: Per-turn time limit in seconds (stored, but not
                read by any method of this class).
            total_timeout: Wall-clock limit in seconds for one task's turn loop.
        """
        self.sandbox_root = Path(sandbox_root) if sandbox_root else Path(tempfile.mkdtemp())
        self.per_turn_timeout = per_turn_timeout
        self.total_timeout = total_timeout

    def _setup_sandbox(self, task: Task, sandbox_dir: Path) -> Path:
        """Create ``sandbox_dir`` and write the task's initial files into it."""
        sandbox_dir.mkdir(parents=True, exist_ok=True)
        for filename, content in task.initial_state.items():
            filepath = sandbox_dir / filename
            # Filenames may contain sub-directories.
            filepath.parent.mkdir(parents=True, exist_ok=True)
            filepath.write_text(content)
        return sandbox_dir

    def _cleanup_sandbox(self, sandbox_dir: Path) -> None:
        """Best-effort removal of the sandbox directory tree."""
        if sandbox_dir.exists():
            shutil.rmtree(sandbox_dir, ignore_errors=True)

    def _verify_task(self, task: Task, sandbox_dir: Path) -> bool:
        """Return True when the sandbox satisfies the task.

        Uses the task's verification script when it has one (exit code 0 means
        success), otherwise falls back to the declarative expected outcome.
        """
        if not task.verification_script:
            return self._verify_expected_outcome(task, sandbox_dir)

        # Local import: run the script with the interpreter executing this
        # code; a bare "python3" is not guaranteed to exist on PATH
        # (e.g. on Windows).
        import sys

        try:
            result = subprocess.run(
                [sys.executable or "python3", "-c", task.verification_script],
                cwd=str(sandbox_dir),
                capture_output=True,
                text=True,
                timeout=15,
            )
        except (subprocess.SubprocessError, OSError, ValueError):
            # Timeout, missing interpreter or an unrunnable script all count
            # as a failed verification.
            return False
        return result.returncode == 0

    def _verify_expected_outcome(self, task: Task, sandbox_dir: Path) -> bool:
        """Check the sandbox against ``task.expected_outcome``.

        Supported keys: ``files`` (name -> expected content, compared after
        stripping surrounding whitespace) and ``file_exists`` (names that must
        exist). An empty outcome always passes. ``exit_code`` is accepted but
        not checked, because no command is executed here.
        """
        expected = task.expected_outcome
        if not expected:
            return True

        for filename, expected_content in expected.get("files", {}).items():
            filepath = sandbox_dir / filename
            if not filepath.exists():
                return False
            if filepath.read_text().strip() != expected_content.strip():
                return False

        for filename in expected.get("file_exists", ()):
            if not (sandbox_dir / filename).exists():
                return False

        return True

    def _count_tokens(self, text: str) -> int:
        """Estimate the token count as one token per four characters, minimum 1."""
        return max(1, len(text) // 4)

    def _render_prompt(self, task: Task, sandbox_dir: Path, max_turns: int) -> str:
        """Build the task prompt text used for token accounting."""
        prompt = f"You are a coding assistant. Complete this task:\n\n{task.description}\n\n"
        prompt += f"Working directory: {sandbox_dir}\n"
        prompt += f"Tools allowed: {', '.join(task.tools_required)}\n"
        prompt += f"Max turns: {max_turns}\n"

        if task.initial_state:
            prompt += "\nInitial files:\n"
            for fname, content in task.initial_state.items():
                prompt += f"\n--- {fname} ---\n{content}\n"
        return prompt

    def run_task(
        self, task: Task, model: str, max_turns: int | None = None
    ) -> TaskResult:
        """Run one task in a fresh sandbox and return its result.

        Args:
            task: Task to execute.
            model: Model name recorded in the result.
            max_turns: Turn budget; falls back to ``task.max_turns`` when
                omitted or 0.

        Returns:
            A TaskResult. Exceptions raised during setup, the turn loop or
            verification are captured in ``errors`` with ``success=False``
            rather than propagated.
        """
        max_turns = max_turns or task.max_turns
        sandbox_dir = self.sandbox_root / f"task_{task.task_id}"
        errors: list[str] = []
        recovery_attempts = 0
        # Initialised before the try block so a result can always be built,
        # even when sandbox setup itself fails (previously ``tokens_used`` was
        # unbound on that path and the return raised UnboundLocalError).
        success = False
        turns_used = 1
        tokens_used = 0
        start_time = time.time()

        try:
            self._setup_sandbox(task, sandbox_dir)

            prompt = self._render_prompt(task, sandbox_dir, max_turns)
            tokens_used += self._count_tokens(prompt)

            for turn in range(max_turns):
                turns_used = turn + 1
                if time.time() - start_time > self.total_timeout:
                    errors.append(f"Total timeout exceeded after {turns_used} turns")
                    break

            success = self._verify_task(task, sandbox_dir)

        except Exception as e:
            errors.append(str(e))
            success = False
            turns_used = 1
        finally:
            self._cleanup_sandbox(sandbox_dir)

        duration = time.time() - start_time

        return TaskResult(
            task_id=task.task_id,
            model=model,
            success=success,
            turns_used=turns_used,
            tokens_used=tokens_used,
            errors=errors,
            recovery_attempts=recovery_attempts,
            duration_seconds=round(duration, 2),
        )

    def run_benchmark(
        self,
        model: str,
        categories: list[TaskCategory] | None = None,
        num_tasks: int | None = None,
    ) -> list[TaskResult]:
        """Run every selected task for ``model`` and return the results in order.

        Tasks come from the given categories, or from all tasks when none are
        given; a truthy ``num_tasks`` keeps only that many from the front.
        """
        if categories:
            tasks: list[Task] = []
            for cat in categories:
                tasks.extend(TASKS_BY_CATEGORY.get(cat, []))
        else:
            tasks = list(ALL_TASKS)

        if num_tasks:
            tasks = tasks[:num_tasks]

        return [self.run_task(task, model) for task in tasks]