clawsbench 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- clawsbench-0.1.0/PKG-INFO +15 -0
- clawsbench-0.1.0/clawbench/__init__.py +1 -0
- clawsbench-0.1.0/clawbench/_env.py +74 -0
- clawsbench-0.1.0/clawbench/_paths.py +25 -0
- clawsbench-0.1.0/clawbench/backend.py +515 -0
- clawsbench-0.1.0/clawbench/cli.py +535 -0
- clawsbench-0.1.0/clawbench/dashboard.py +905 -0
- clawsbench-0.1.0/clawbench/eval.py +334 -0
- clawsbench-0.1.0/clawbench/executor.py +278 -0
- clawsbench-0.1.0/clawbench/export.py +115 -0
- clawsbench-0.1.0/clawbench/optimize.py +696 -0
- clawsbench-0.1.0/clawbench/output_schema.py +152 -0
- clawsbench-0.1.0/clawbench/proposer.py +405 -0
- clawsbench-0.1.0/clawbench/score_cache.py +108 -0
- clawsbench-0.1.0/clawbench/scoring.py +68 -0
- clawsbench-0.1.0/clawbench/task_check.py +446 -0
- clawsbench-0.1.0/clawbench/task_contract.py +171 -0
- clawsbench-0.1.0/clawbench/task_utils.py +177 -0
- clawsbench-0.1.0/clawbench/trajectory.py +223 -0
- clawsbench-0.1.0/clawsbench.egg-info/PKG-INFO +15 -0
- clawsbench-0.1.0/clawsbench.egg-info/SOURCES.txt +42 -0
- clawsbench-0.1.0/clawsbench.egg-info/dependency_links.txt +1 -0
- clawsbench-0.1.0/clawsbench.egg-info/entry_points.txt +2 -0
- clawsbench-0.1.0/clawsbench.egg-info/requires.txt +11 -0
- clawsbench-0.1.0/clawsbench.egg-info/top_level.txt +1 -0
- clawsbench-0.1.0/pyproject.toml +31 -0
- clawsbench-0.1.0/setup.cfg +4 -0
- clawsbench-0.1.0/tests/test_backend.py +62 -0
- clawsbench-0.1.0/tests/test_benchflow_backend.py +349 -0
- clawsbench-0.1.0/tests/test_concurrency.py +47 -0
- clawsbench-0.1.0/tests/test_daytona_smoke.py +70 -0
- clawsbench-0.1.0/tests/test_engine_switch.py +237 -0
- clawsbench-0.1.0/tests/test_eval.py +825 -0
- clawsbench-0.1.0/tests/test_executor.py +291 -0
- clawsbench-0.1.0/tests/test_export.py +118 -0
- clawsbench-0.1.0/tests/test_litellm.py +326 -0
- clawsbench-0.1.0/tests/test_optimize.py +311 -0
- clawsbench-0.1.0/tests/test_optimize_daytona.py +351 -0
- clawsbench-0.1.0/tests/test_output_schema.py +145 -0
- clawsbench-0.1.0/tests/test_scoring.py +65 -0
- clawsbench-0.1.0/tests/test_task_contract.py +154 -0
- clawsbench-0.1.0/tests/test_task_utils.py +44 -0
- clawsbench-0.1.0/tests/test_trajectory.py +285 -0
- clawsbench-0.1.0/tests/test_web.py +235 -0
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: clawsbench
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Auto-improve agent skills via task completion optimization
|
|
5
|
+
Requires-Python: >=3.12
|
|
6
|
+
Requires-Dist: anthropic>=0.40.0
|
|
7
|
+
Requires-Dist: httpx>=0.27.0
|
|
8
|
+
Requires-Dist: click>=8.0
|
|
9
|
+
Requires-Dist: fastapi>=0.115.0
|
|
10
|
+
Requires-Dist: uvicorn[standard]>=0.30.0
|
|
11
|
+
Requires-Dist: litellm>=1.40.0
|
|
12
|
+
Requires-Dist: cloudpickle>=3.0
|
|
13
|
+
Provides-Extra: dev
|
|
14
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
15
|
+
Requires-Dist: ruff>=0.3.0; extra == "dev"
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""clawbench — Auto-improve agent skills via task completion optimization."""
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
"""Environment loading helpers for pipeline LLM/API keys.
|
|
2
|
+
|
|
3
|
+
Policy:
|
|
4
|
+
- Load project-local env files for CLI/web commands.
|
|
5
|
+
- Do not overwrite keys that are already exported in the shell.
|
|
6
|
+
- Canonical Gemini key variable is GOOGLE_API_KEY.
|
|
7
|
+
- DEFAULT_OPTIMIZER_MODEL is used by proposer + trajectory summarizer (not the agent).
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import os
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
|
|
15
|
+
from clawbench._paths import _PROJECT_ROOT
|
|
16
|
+
|
|
17
|
+
# Default model for optimizer LLM calls (proposer + trajectory summarizer).
# The agent model is separate and controlled by CLI --model.
DEFAULT_OPTIMIZER_MODEL: str = "gemini/gemini-3.1-flash-lite-preview"

# Score threshold for PASS/FAIL classification.
# Used consistently across eval display, JSON artifacts, and dashboard data.
PASS_THRESHOLD: float = 0.8
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def load_local_env_files(project_root: Path | None = None) -> list[Path]:
    """Load local env files into os.environ without overriding existing keys.

    Files are loaded in this order (keys already exported in the shell
    always win, and the first file to define a key wins over later files):
      1) <repo>/.gemini/.env
      2) <repo>/.env

    Args:
        project_root: Root directory to search under; defaults to the
            detected project root.

    Returns:
        The env files that existed and were read, in load order (a file is
        reported as loaded even if every key in it was already set).
    """
    root = project_root or _PROJECT_ROOT
    candidates = [
        root / ".gemini" / ".env",
        root / ".env",
    ]
    loaded: list[Path] = []

    for env_path in candidates:
        # is_file() is False for missing paths, so no separate exists() check needed.
        if not env_path.is_file():
            continue
        # Decode explicitly as UTF-8: .env files are conventionally UTF-8 and
        # the platform default codec is unreliable (e.g. cp1252 on Windows
        # would mangle or reject non-ASCII values).
        for line in env_path.read_text(encoding="utf-8").splitlines():
            parsed = _parse_env_line(line)
            if parsed is None:
                continue
            key, value = parsed
            # Never clobber keys already exported in the shell.
            if key not in os.environ:
                os.environ[key] = value
        loaded.append(env_path)

    # Alias: gemini-cli uses GEMINI_API_KEY; pipeline uses GOOGLE_API_KEY
    gemini_key = os.environ.get("GEMINI_API_KEY")
    if gemini_key and "GOOGLE_API_KEY" not in os.environ:
        os.environ["GOOGLE_API_KEY"] = gemini_key

    return loaded
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def _parse_env_line(line: str) -> tuple[str, str] | None:
|
|
61
|
+
line = line.strip()
|
|
62
|
+
if not line or line.startswith("#"):
|
|
63
|
+
return None
|
|
64
|
+
if line.startswith("export "):
|
|
65
|
+
line = line[len("export ") :].strip()
|
|
66
|
+
if "=" not in line:
|
|
67
|
+
return None
|
|
68
|
+
key, value = line.split("=", 1)
|
|
69
|
+
key = key.strip()
|
|
70
|
+
value = value.strip().strip('"').strip("'")
|
|
71
|
+
if not key:
|
|
72
|
+
return None
|
|
73
|
+
return key, value
|
|
74
|
+
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
"""Path helpers for project layout."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def _find_project_root() -> Path:
    """Find project root by looking for config.toml.

    1. Walk up from cwd (handles `uv tool install` — package lives outside repo)
    2. Fall back to __file__-relative (handles editable install from packages/pipeline/)
    """
    # Probe cwd and each ancestor — works when gskills runs from the repo
    # root or any subdirectory of it.
    cwd = Path.cwd().resolve()
    for directory in (cwd, *cwd.parents):
        if directory == directory.parent:
            break  # never probe the filesystem root (mirrors the walk-up loop)
        if (directory / "config.toml").exists():
            return directory

    # Fallback: __file__-relative (3 levels up: clawbench → clawbench → packages → root)
    return Path(__file__).resolve().parents[3]
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# Resolved once at import time; shared by every module that needs the repo root.
_PROJECT_ROOT = _find_project_root()
|
|
@@ -0,0 +1,515 @@
|
|
|
1
|
+
"""Runtime backend — Backend protocol, TrialResult, and BenchflowBackend.
|
|
2
|
+
|
|
3
|
+
Called by: executor.py, eval.py, optimize.py
|
|
4
|
+
|
|
5
|
+
BenchflowBackend is the sole backend implementation. It uses benchflow's
|
|
6
|
+
ACP-native SDK for agent execution, skills deployment, trajectory capture,
|
|
7
|
+
and agent installation.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import asyncio
|
|
13
|
+
import datetime
|
|
14
|
+
import hashlib
|
|
15
|
+
import json
|
|
16
|
+
import os
|
|
17
|
+
import shutil
|
|
18
|
+
import tempfile
|
|
19
|
+
import uuid
|
|
20
|
+
from dataclasses import dataclass, field
|
|
21
|
+
from pathlib import Path
|
|
22
|
+
from typing import Protocol, runtime_checkable
|
|
23
|
+
|
|
24
|
+
import click
|
|
25
|
+
|
|
26
|
+
from clawbench._paths import _PROJECT_ROOT
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
# ---------------------------------------------------------------------------
|
|
30
|
+
# Protocol & data types
|
|
31
|
+
# ---------------------------------------------------------------------------
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@dataclass
class TrialResult:
    """What every backend must return from a trial run.

    The backend is responsible for extracting the score from verifier rewards.
    Use scoring.extract_score() for the standard normalization.
    """

    # Normalized task score (BenchflowBackend clamps to [-1.0, 1.0]).
    score: float
    # Structured context for logs/proposer; this file uses the keys
    # "Input", "Feedback", "Generated Outputs", and "scores" by convention.
    side_info: dict = field(default_factory=dict)
    # On-disk trial artifact directory, when the backend persisted one.
    trial_dir: Path | None = None
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
@runtime_checkable
class Backend(Protocol):
    """Interface for trial execution backends.

    Lifecycle: setup() → run_trial / run_trial_async (many) → cleanup()

    Both sync and async are required. The executor picks which to call
    based on --env and --workers:
      - Docker (single worker): sync path — run_trial()
      - Cloud envs / multi-worker: async path — run_trial_async()
    """

    def setup(self) -> None:
        """One-time initialization before trials (monkey-patches, SDK init, etc.)."""
        ...

    def cleanup(self) -> None:
        """Resource teardown after all trials complete."""
        ...

    def run_trial(
        self,
        candidate: dict[str, str],
        task: dict,
        *,
        skills_dir: Path | None = None,
        run_id: str | None = None,
    ) -> TrialResult:
        """Sync trial execution. Default: asyncio.run(run_trial_async(...))."""
        ...

    async def run_trial_async(
        self,
        candidate: dict[str, str],
        task: dict,
        *,
        skills_dir: Path | None = None,
        run_id: str | None = None,
    ) -> TrialResult:
        """Async trial execution. Backends must implement this."""
        ...
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
# ---------------------------------------------------------------------------
|
|
91
|
+
# Agent environment helpers
|
|
92
|
+
# ---------------------------------------------------------------------------
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def build_agent_env(agent_name: str | None = None) -> dict[str, str]:
|
|
96
|
+
"""Forward provider credentials that installed agents expect.
|
|
97
|
+
|
|
98
|
+
Passthrough strategy: forward every key that any supported agent
|
|
99
|
+
might need. Keys absent from the host environment are silently
|
|
100
|
+
skipped, so this is safe for all agents.
|
|
101
|
+
"""
|
|
102
|
+
agent_env: dict[str, str] = {}
|
|
103
|
+
|
|
104
|
+
passthrough_keys = (
|
|
105
|
+
"ANTHROPIC_API_KEY",
|
|
106
|
+
"GEMINI_API_KEY",
|
|
107
|
+
"GOOGLE_API_KEY",
|
|
108
|
+
"GOOGLE_APPLICATION_CREDENTIALS",
|
|
109
|
+
"GOOGLE_CLOUD_PROJECT",
|
|
110
|
+
"GOOGLE_CLOUD_LOCATION",
|
|
111
|
+
"GOOGLE_GENAI_USE_VERTEXAI",
|
|
112
|
+
"OPENAI_API_KEY",
|
|
113
|
+
"OPENAI_BASE_URL",
|
|
114
|
+
"OPENAI_MODEL",
|
|
115
|
+
"OPENAI_ORG_ID",
|
|
116
|
+
"OPENAI_PROJECT",
|
|
117
|
+
"AZURE_OPENAI_API_KEY",
|
|
118
|
+
"AZURE_OPENAI_ENDPOINT",
|
|
119
|
+
"AZURE_API_KEY",
|
|
120
|
+
"AZURE_RESOURCE_NAME",
|
|
121
|
+
)
|
|
122
|
+
|
|
123
|
+
for key in passthrough_keys:
|
|
124
|
+
val = os.environ.get(key)
|
|
125
|
+
if val:
|
|
126
|
+
agent_env[key] = val
|
|
127
|
+
|
|
128
|
+
# Azure fallback: many agents expect OPENAI_API_KEY even for Azure
|
|
129
|
+
if "OPENAI_API_KEY" not in agent_env and agent_env.get("AZURE_OPENAI_API_KEY"):
|
|
130
|
+
agent_env["OPENAI_API_KEY"] = agent_env["AZURE_OPENAI_API_KEY"]
|
|
131
|
+
|
|
132
|
+
return agent_env
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
# ---------------------------------------------------------------------------
|
|
136
|
+
# BenchflowBackend
|
|
137
|
+
# ---------------------------------------------------------------------------
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
class BenchflowBackend:
    """Benchflow SDK backend — runs trials via ACP-native agent communication.

    Lifecycle:
        setup() — validate benchflow is importable
        run_trial() / run_trial_async() — execute a single trial
        cleanup() — no-op
    """

    def __init__(
        self,
        agent_name: str = "claude-agent-acp",
        model_name: str = "claude-haiku-4-5-20251001",
        environment_type: str = "docker",
        trials_dir: Path | None = None,
        trajectory_store: list | None = None,
        trajectory_lock=None,
        agent_import_path: str | None = None,
        keep_trials: bool = False,
    ):
        # Agent and model identity forwarded to the SDK on every run.
        self.agent_name = agent_name
        self.model_name = model_name
        # Passed through to SDK.run(environment=...), e.g. "docker".
        self.environment_type = environment_type
        # Artifact root; trials land at <trials_dir>/<job_name>/<trial_name>.
        self.trials_dir = trials_dir or (_PROJECT_ROOT / ".local" / "trials")
        # Optional shared list (plus lock for multi-worker runs) collecting
        # per-trial trajectory summaries for the dashboard/proposer.
        self.trajectory_store = trajectory_store
        self.trajectory_lock = trajectory_lock
        # NOTE(review): agent_import_path and keep_trials are stored but never
        # read inside this class — presumably consumed elsewhere; confirm.
        self.agent_import_path = agent_import_path
        self.keep_trials = keep_trials
        # Lazily-created benchflow SDK handle; see setup().
        self._sdk = None

    def setup(self) -> None:
        """Import and initialize benchflow SDK (idempotent)."""
        if self._sdk is not None:
            return
        try:
            from benchflow import SDK
            self._sdk = SDK()
        except ImportError:
            # Surface a copy-pasteable install command instead of a bare ImportError.
            raise RuntimeError(
                "benchflow SDK not installed. Install with: "
                "pip install 'benchflow @ git+https://github.com/benchflow-ai/benchflow.git'"
            )

    def cleanup(self) -> None:
        """No-op — benchflow handles cleanup per-trial."""
        pass

    def run_trial(
        self,
        candidate: dict[str, str],
        task: dict,
        *,
        skills_dir: Path | None = None,
        run_id: str | None = None,
    ) -> TrialResult:
        """Sync trial execution — wraps async via asyncio.run()."""
        self.setup()
        return asyncio.run(
            self.run_trial_async(
                candidate, task, skills_dir=skills_dir, run_id=run_id,
            )
        )

    async def run_trial_async(
        self,
        candidate: dict[str, str],
        task: dict,
        *,
        skills_dir: Path | None = None,
        run_id: str | None = None,
    ) -> TrialResult:
        """Async trial execution via benchflow SDK.

        Maps the clawbench (candidate, task) → (score, side_info) contract
        onto benchflow's SDK.run() → RunResult.
        """
        self.setup()
        if task is None:
            return TrialResult(score=0.0, side_info={"Feedback": {"Status": "no_example"}})

        task_name = task.get("task_name", "?")
        task_path = task["task_path"]
        skills_content = candidate.get("skills", "")

        # Always copy task dir — SDK's context_root staging mutates in-place
        tmp_task = _copy_task(task_path, skills_content, run_id=run_id)

        try:
            # Resolve model name — benchflow uses bare model IDs,
            # clawbench/Harbor uses "provider/model" format
            model = self.model_name
            if model and "/" in model:
                model = model.split("/", 1)[1]

            # Build agent env — auto-inherit API keys
            agent_env = build_agent_env(self.agent_name)

            # Generate unique trial/job names
            trial_name = f"{task_name}__{uuid.uuid4().hex[:8]}"
            _ts = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
            job_name = run_id or f"cb-{_ts}-{uuid.uuid4().hex[:8]}"
            self.trials_dir.mkdir(parents=True, exist_ok=True)
            jobs_dir = str(self.trials_dir)

            # Build pre-agent hooks for claw-* services
            pre_agent_hooks = _build_service_hooks(tmp_task)

            result = await self._sdk.run(
                task_path=tmp_task,
                agent=self.agent_name,
                model=model,
                agent_env=agent_env,
                job_name=job_name,
                trial_name=trial_name,
                jobs_dir=jobs_dir,
                environment=self.environment_type,
                skills_dir=str(skills_dir) if skills_dir else None,
                # The oracle agent runs unsandboxed (no dedicated user).
                sandbox_user="agent" if self.agent_name != "oracle" else None,
                pre_agent_hooks=pre_agent_hooks or None,
                context_root=str(_PROJECT_ROOT),
            )
        finally:
            # The temp task copy is disposable regardless of outcome.
            shutil.rmtree(tmp_task, ignore_errors=True)

        # Write metadata files for export.py compatibility
        trial_dir = self.trials_dir / job_name / trial_name
        if trial_dir.is_dir():
            (trial_dir / "task_name.txt").write_text(task_name + "\n")
            if run_id:
                (trial_dir / "run_id.txt").write_text(run_id + "\n")

            # Write agent/trajectory.json in ATIF format for export.py
            trajectory = result.trajectory or []
            if trajectory:
                agent_dir = trial_dir / "agent"
                agent_dir.mkdir(exist_ok=True)
                atif = _acp_to_atif(trajectory, self.agent_name, model or "")
                (agent_dir / "trajectory.json").write_text(
                    json.dumps(atif, indent=2) + "\n"
                )

        # Map RunResult → TrialResult with clawbench side_info contract
        return self._to_trial_result(result, task_name, skills_content)

    def _to_trial_result(self, result, task_name: str, skills_content: str) -> TrialResult:
        """Convert benchflow RunResult to clawbench TrialResult."""
        score = 0.0
        side_info: dict = {"Input": {"task_name": task_name}}

        # Honor verifier result even if agent timed out (verifier runs after timeout)
        if result.rewards:
            rewards = result.rewards
            # Verifiers may report under "score" or "reward"; prefer "score".
            raw = rewards.get("score", rewards.get("reward", 0.0))
            # Clamp into [-1, 1].
            score = max(-1.0, min(1.0, float(raw)))
            side_info["Feedback"] = {"Status": "completed", "Rewards": rewards}
            side_info["scores"] = rewards
            if result.error:
                click.echo(f"  [{task_name}] WARN: agent error ({result.error}) but verifier returned score={score}", err=True)
        elif result.error:
            click.echo(f"  [{task_name}] ERROR: {result.error}", err=True)
            side_info["Feedback"] = {
                "Status": "trial_error",
                "Error": result.error,
            }
        else:
            click.echo(f"  [{task_name}] WARNING: no verifier result", err=True)
            side_info["Feedback"] = {"Status": "no_verifier_result"}

        # Build trajectory summary from ACP trajectory
        trajectory = result.trajectory or []
        trajectory_text = _format_acp_trajectory(trajectory)
        side_info["Generated Outputs"] = {
            "Agent Trace": trajectory_text,
        }

        # Timing
        timing = {}
        if result.started_at and result.finished_at:
            timing["total"] = round(
                (result.finished_at - result.started_at).total_seconds(), 1
            )
        side_info["Input"]["timing"] = timing
        if timing.get("total"):
            side_info["Input"]["duration_sec"] = timing["total"]

        # Echo a score line unless this was a pure error (error + zero score).
        if not result.error or score != 0.0:
            duration = f" ({timing['total']:.0f}s)" if timing.get("total") else ""
            click.echo(f"  [{task_name}] score={score}{duration}")

        # Save to trajectory store (for dashboard/proposer)
        if self.trajectory_store is not None:
            # Hash identifies which candidate skill text produced this trial.
            candidate_hash = hashlib.md5(skills_content.encode()).hexdigest()[:12]
            raw_steps = []
            for event in trajectory:
                etype = event.get("type", "")
                if etype == "tool_call":
                    raw_steps.append({
                        "type": "tool",
                        "tool": event.get("kind", ""),
                        "command": event.get("title", ""),
                        "content": json.dumps(event.get("content", []))[:300],
                    })
                elif etype == "agent_message":
                    raw_steps.append({
                        "type": "thought",
                        "content": event.get("text", "")[:500],
                    })

            entry = {
                "candidate_hash": candidate_hash,
                "task_name": task_name,
                "score": score,
                "steps": raw_steps,
                "summary": trajectory_text[:500] if trajectory_text else "",
                "duration_sec": timing.get("total", 0),
            }
            # Lock only when one was provided (multi-worker runs).
            if self.trajectory_lock:
                with self.trajectory_lock:
                    self.trajectory_store.append(entry)
            else:
                self.trajectory_store.append(entry)

        return TrialResult(score=score, side_info=side_info)
|
|
363
|
+
|
|
364
|
+
|
|
365
|
+
# ---------------------------------------------------------------------------
|
|
366
|
+
# Helpers
|
|
367
|
+
# ---------------------------------------------------------------------------
|
|
368
|
+
|
|
369
|
+
|
|
370
|
+
def _copy_task(task_path: str, skills_content: str = "", run_id: str | None = None) -> str:
|
|
371
|
+
"""Copy task dir to a temp location, optionally injecting skill content.
|
|
372
|
+
|
|
373
|
+
Always copies so that SDK operations (context_root staging, skills_dir
|
|
374
|
+
injection) don't mutate the original task directory.
|
|
375
|
+
"""
|
|
376
|
+
tmpdir_name = f"bf{uuid.uuid4().hex[:12]}"
|
|
377
|
+
tmpdir = os.path.join(tempfile.gettempdir(), tmpdir_name)
|
|
378
|
+
os.makedirs(tmpdir)
|
|
379
|
+
src = Path(task_path).resolve()
|
|
380
|
+
dst = Path(tmpdir)
|
|
381
|
+
|
|
382
|
+
for item in src.iterdir():
|
|
383
|
+
dest = dst / item.name
|
|
384
|
+
if item.is_dir():
|
|
385
|
+
shutil.copytree(item, dest)
|
|
386
|
+
else:
|
|
387
|
+
shutil.copy2(item, dest)
|
|
388
|
+
|
|
389
|
+
# Inject a per-eval-run cache-bust ARG so Daytona rebuilds images between
|
|
390
|
+
# runs but reuses the same image for all repeats within a single run.
|
|
391
|
+
dockerfile = dst / "environment" / "Dockerfile"
|
|
392
|
+
if dockerfile.exists() and run_id:
|
|
393
|
+
content = dockerfile.read_text()
|
|
394
|
+
content = content.replace("FROM ", f"ARG EVAL_RUN={run_id}\nFROM ", 1)
|
|
395
|
+
dockerfile.write_text(content)
|
|
396
|
+
|
|
397
|
+
if skills_content:
|
|
398
|
+
instruction_path = dst / "instruction.md"
|
|
399
|
+
original = instruction_path.read_text() if instruction_path.exists() else ""
|
|
400
|
+
instruction_path.write_text(
|
|
401
|
+
f"# Agent Skill\n\n{skills_content}\n\n---\n\n# Task\n\n{original}"
|
|
402
|
+
)
|
|
403
|
+
|
|
404
|
+
return tmpdir
|
|
405
|
+
|
|
406
|
+
|
|
407
|
+
def _build_service_hooks(task_path: str) -> list:
    """Auto-detect claw-* services from Dockerfile and build pre-agent hooks.

    Consults config.toml for the claw-* service registry, matches registry
    names against the task's Dockerfile text, and returns async hooks that
    launch each matched service and block on its health endpoint.
    """
    import tomllib

    env_dir = Path(task_path) / "environment"
    dockerfile_path = env_dir / "Dockerfile"
    if not dockerfile_path.exists():
        return []
    # docker-compose tasks handle their own services
    if (env_dir / "docker-compose.yaml").exists():
        return []

    config_path = _PROJECT_ROOT / "config.toml"
    if not config_path.exists():
        return []

    with open(config_path, "rb") as fh:
        env_cfg = tomllib.load(fh)

    # Registry of every claw-* service: name → (db path, health/serve port).
    registry: dict[str, tuple[str, int]] = {
        name: (cfg["db_path"], cfg["port"])
        for name, cfg in env_cfg.items()
        if name.startswith("claw-")
    }

    dockerfile_text = dockerfile_path.read_text()
    matched = [
        (name, db, port)
        for name, (db, port) in registry.items()
        if name in dockerfile_text
    ]
    if not matched:
        return []

    async def _launch_and_wait(env):
        # Launch every service in the background first...
        for cli, db, port in matched:
            await env.exec(
                f"{cli} --db {db} serve --host 0.0.0.0 --port {port} --no-mcp &"
            )
        # ...then poll each health endpoint (up to ~30s apiece).
        for cli, _db, port in matched:
            await env.exec(
                f"for i in $(seq 1 30); do curl -sf http://localhost:{port}/health > /dev/null && break; sleep 1; done"
            )

    return [_launch_and_wait]
|
|
458
|
+
|
|
459
|
+
|
|
460
|
+
def _acp_to_atif(events: list[dict], agent_name: str, model_name: str) -> dict:
|
|
461
|
+
"""Convert ACP trajectory events → ATIF-v1.6 format for export.py."""
|
|
462
|
+
steps = []
|
|
463
|
+
for i, event in enumerate(events, start=1):
|
|
464
|
+
etype = event.get("type", "")
|
|
465
|
+
step: dict = {"step_id": i, "source": "agent"}
|
|
466
|
+
if etype == "tool_call":
|
|
467
|
+
step["message"] = ""
|
|
468
|
+
step["tool_calls"] = [{
|
|
469
|
+
"tool_call_id": event.get("tool_call_id", ""),
|
|
470
|
+
"function_name": event.get("kind", ""),
|
|
471
|
+
"arguments": {"command": event.get("title", "")},
|
|
472
|
+
}]
|
|
473
|
+
content = event.get("content")
|
|
474
|
+
if content:
|
|
475
|
+
step["observation"] = {"results": [{"content": str(content)[:2000]}]}
|
|
476
|
+
elif etype == "agent_message":
|
|
477
|
+
step["message"] = event.get("text", "")
|
|
478
|
+
elif etype == "agent_thought":
|
|
479
|
+
step["source"] = "agent_thought"
|
|
480
|
+
step["message"] = event.get("text", "")
|
|
481
|
+
else:
|
|
482
|
+
continue
|
|
483
|
+
steps.append(step)
|
|
484
|
+
|
|
485
|
+
return {
|
|
486
|
+
"schema_version": "ATIF-v1.6",
|
|
487
|
+
"agent": {"name": agent_name, "model_name": model_name},
|
|
488
|
+
"steps": steps,
|
|
489
|
+
}
|
|
490
|
+
|
|
491
|
+
|
|
492
|
+
def _format_acp_trajectory(trajectory: list[dict], max_chars: int = 8000) -> str:
|
|
493
|
+
"""Format ACP trajectory events into a human-readable summary."""
|
|
494
|
+
if not trajectory:
|
|
495
|
+
return "(no trajectory)"
|
|
496
|
+
|
|
497
|
+
lines = []
|
|
498
|
+
for event in trajectory:
|
|
499
|
+
etype = event.get("type", "")
|
|
500
|
+
if etype == "tool_call":
|
|
501
|
+
kind = event.get("kind", "")
|
|
502
|
+
title = event.get("title", "")
|
|
503
|
+
status = event.get("status", "")
|
|
504
|
+
lines.append(f"[{kind}] {title} → {status}")
|
|
505
|
+
elif etype == "agent_message":
|
|
506
|
+
text = event.get("text", "")
|
|
507
|
+
lines.append(f"[Message]: {text[:300]}")
|
|
508
|
+
elif etype == "agent_thought":
|
|
509
|
+
text = event.get("text", "")
|
|
510
|
+
lines.append(f"[Thought]: {text[:200]}")
|
|
511
|
+
|
|
512
|
+
full = "\n".join(lines)
|
|
513
|
+
if len(full) > max_chars:
|
|
514
|
+
return full[:max_chars] + f"\n... ({len(lines)} events total, truncated)"
|
|
515
|
+
return full + f"\n[Summary: {len(trajectory)} events]"
|