clawsbench 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. clawsbench-0.1.0/PKG-INFO +15 -0
  2. clawsbench-0.1.0/clawbench/__init__.py +1 -0
  3. clawsbench-0.1.0/clawbench/_env.py +74 -0
  4. clawsbench-0.1.0/clawbench/_paths.py +25 -0
  5. clawsbench-0.1.0/clawbench/backend.py +515 -0
  6. clawsbench-0.1.0/clawbench/cli.py +535 -0
  7. clawsbench-0.1.0/clawbench/dashboard.py +905 -0
  8. clawsbench-0.1.0/clawbench/eval.py +334 -0
  9. clawsbench-0.1.0/clawbench/executor.py +278 -0
  10. clawsbench-0.1.0/clawbench/export.py +115 -0
  11. clawsbench-0.1.0/clawbench/optimize.py +696 -0
  12. clawsbench-0.1.0/clawbench/output_schema.py +152 -0
  13. clawsbench-0.1.0/clawbench/proposer.py +405 -0
  14. clawsbench-0.1.0/clawbench/score_cache.py +108 -0
  15. clawsbench-0.1.0/clawbench/scoring.py +68 -0
  16. clawsbench-0.1.0/clawbench/task_check.py +446 -0
  17. clawsbench-0.1.0/clawbench/task_contract.py +171 -0
  18. clawsbench-0.1.0/clawbench/task_utils.py +177 -0
  19. clawsbench-0.1.0/clawbench/trajectory.py +223 -0
  20. clawsbench-0.1.0/clawsbench.egg-info/PKG-INFO +15 -0
  21. clawsbench-0.1.0/clawsbench.egg-info/SOURCES.txt +42 -0
  22. clawsbench-0.1.0/clawsbench.egg-info/dependency_links.txt +1 -0
  23. clawsbench-0.1.0/clawsbench.egg-info/entry_points.txt +2 -0
  24. clawsbench-0.1.0/clawsbench.egg-info/requires.txt +11 -0
  25. clawsbench-0.1.0/clawsbench.egg-info/top_level.txt +1 -0
  26. clawsbench-0.1.0/pyproject.toml +31 -0
  27. clawsbench-0.1.0/setup.cfg +4 -0
  28. clawsbench-0.1.0/tests/test_backend.py +62 -0
  29. clawsbench-0.1.0/tests/test_benchflow_backend.py +349 -0
  30. clawsbench-0.1.0/tests/test_concurrency.py +47 -0
  31. clawsbench-0.1.0/tests/test_daytona_smoke.py +70 -0
  32. clawsbench-0.1.0/tests/test_engine_switch.py +237 -0
  33. clawsbench-0.1.0/tests/test_eval.py +825 -0
  34. clawsbench-0.1.0/tests/test_executor.py +291 -0
  35. clawsbench-0.1.0/tests/test_export.py +118 -0
  36. clawsbench-0.1.0/tests/test_litellm.py +326 -0
  37. clawsbench-0.1.0/tests/test_optimize.py +311 -0
  38. clawsbench-0.1.0/tests/test_optimize_daytona.py +351 -0
  39. clawsbench-0.1.0/tests/test_output_schema.py +145 -0
  40. clawsbench-0.1.0/tests/test_scoring.py +65 -0
  41. clawsbench-0.1.0/tests/test_task_contract.py +154 -0
  42. clawsbench-0.1.0/tests/test_task_utils.py +44 -0
  43. clawsbench-0.1.0/tests/test_trajectory.py +285 -0
  44. clawsbench-0.1.0/tests/test_web.py +235 -0
@@ -0,0 +1,15 @@
1
+ Metadata-Version: 2.4
2
+ Name: clawsbench
3
+ Version: 0.1.0
4
+ Summary: Auto-improve agent skills via task completion optimization
5
+ Requires-Python: >=3.12
6
+ Requires-Dist: anthropic>=0.40.0
7
+ Requires-Dist: httpx>=0.27.0
8
+ Requires-Dist: click>=8.0
9
+ Requires-Dist: fastapi>=0.115.0
10
+ Requires-Dist: uvicorn[standard]>=0.30.0
11
+ Requires-Dist: litellm>=1.40.0
12
+ Requires-Dist: cloudpickle>=3.0
13
+ Provides-Extra: dev
14
+ Requires-Dist: pytest>=8.0; extra == "dev"
15
+ Requires-Dist: ruff>=0.3.0; extra == "dev"
@@ -0,0 +1 @@
1
+ """clawbench — Auto-improve agent skills via task completion optimization."""
@@ -0,0 +1,74 @@
1
+ """Environment loading helpers for pipeline LLM/API keys.
2
+
3
+ Policy:
4
+ - Load project-local env files for CLI/web commands.
5
+ - Do not overwrite keys that are already exported in the shell.
6
+ - Canonical Gemini key variable is GOOGLE_API_KEY.
7
+ - DEFAULT_OPTIMIZER_MODEL is used by proposer + trajectory summarizer (not the agent).
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import os
13
+ from pathlib import Path
14
+
15
+ from clawbench._paths import _PROJECT_ROOT
16
+
17
# Default model for optimizer LLM calls (proposer + trajectory summarizer).
# The agent model is separate and controlled by CLI --model.
DEFAULT_OPTIMIZER_MODEL = "gemini/gemini-3.1-flash-lite-preview"

# Score threshold for PASS/FAIL classification.
# Used consistently across eval display, JSON artifacts, and dashboard data.
# NOTE(review): presumably score >= PASS_THRESHOLD counts as PASS — confirm
# against the eval display code that consumes this constant.
PASS_THRESHOLD = 0.8
24
+
25
+
26
def load_local_env_files(project_root: Path | None = None) -> list[Path]:
    """Load local env files into os.environ without overriding existing keys.

    Files are loaded in this order:
      1) <repo>/.gemini/.env
      2) <repo>/.env

    Returns the list of env files that were actually read.
    """
    root = project_root or _PROJECT_ROOT
    loaded: list[Path] = []

    for env_path in (root / ".gemini" / ".env", root / ".env"):
        if not (env_path.exists() and env_path.is_file()):
            continue
        for raw_line in env_path.read_text().splitlines():
            entry = _parse_env_line(raw_line)
            if entry is None:
                continue
            name, value = entry
            # Shell-exported keys win: never overwrite an existing variable.
            os.environ.setdefault(name, value)
        loaded.append(env_path)

    # Alias: gemini-cli uses GEMINI_API_KEY; pipeline uses GOOGLE_API_KEY
    gemini_key = os.environ.get("GEMINI_API_KEY")
    if gemini_key and "GOOGLE_API_KEY" not in os.environ:
        os.environ["GOOGLE_API_KEY"] = gemini_key

    return loaded
58
+
59
+
60
+ def _parse_env_line(line: str) -> tuple[str, str] | None:
61
+ line = line.strip()
62
+ if not line or line.startswith("#"):
63
+ return None
64
+ if line.startswith("export "):
65
+ line = line[len("export ") :].strip()
66
+ if "=" not in line:
67
+ return None
68
+ key, value = line.split("=", 1)
69
+ key = key.strip()
70
+ value = value.strip().strip('"').strip("'")
71
+ if not key:
72
+ return None
73
+ return key, value
74
+
@@ -0,0 +1,25 @@
1
+ """Path helpers for project layout."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+
7
+
8
def _find_project_root() -> Path:
    """Find project root by looking for config.toml.

    Search order:
      1. Walk up from cwd (handles `uv tool install` — package lives outside repo)
      2. Fall back to __file__-relative (handles editable install from packages/pipeline/)
    """
    # Try cwd first — works when gskills is run from repo root (or any subdir).
    cwd = Path.cwd().resolve()
    candidates = [cwd, *cwd.parents]
    # Exclude the filesystem root itself, matching the original upward walk.
    for candidate in candidates[:-1]:
        if (candidate / "config.toml").exists():
            return candidate

    # Fallback: __file__-relative (3 levels up: clawbench → clawbench → packages → root)
    return Path(__file__).resolve().parents[3]


_PROJECT_ROOT = _find_project_root()
@@ -0,0 +1,515 @@
1
+ """Runtime backend — Backend protocol, TrialResult, and BenchflowBackend.
2
+
3
+ Called by: executor.py, eval.py, optimize.py
4
+
5
+ BenchflowBackend is the sole backend implementation. It uses benchflow's
6
+ ACP-native SDK for agent execution, skills deployment, trajectory capture,
7
+ and agent installation.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import asyncio
13
+ import datetime
14
+ import hashlib
15
+ import json
16
+ import os
17
+ import shutil
18
+ import tempfile
19
+ import uuid
20
+ from dataclasses import dataclass, field
21
+ from pathlib import Path
22
+ from typing import Protocol, runtime_checkable
23
+
24
+ import click
25
+
26
+ from clawbench._paths import _PROJECT_ROOT
27
+
28
+
29
+ # ---------------------------------------------------------------------------
30
+ # Protocol & data types
31
+ # ---------------------------------------------------------------------------
32
+
33
+
34
@dataclass
class TrialResult:
    """What every backend must return from a trial run.

    The backend is responsible for extracting the score from verifier rewards.
    Use scoring.extract_score() for the standard normalization.
    """

    # Normalized trial score; BenchflowBackend clamps it to [-1.0, 1.0].
    score: float
    # Structured trial metadata (keys like "Input", "Feedback",
    # "Generated Outputs", "scores" — see BenchflowBackend._to_trial_result).
    side_info: dict = field(default_factory=dict)
    # On-disk trial artifacts directory, when a backend chooses to expose one.
    trial_dir: Path | None = None
45
+
46
+
47
@runtime_checkable
class Backend(Protocol):
    """Interface for trial execution backends.

    Lifecycle: setup() → run_trial / run_trial_async (many) → cleanup()

    Both sync and async are required. The executor picks which to call
    based on --env and --workers:
      - Docker (single worker): sync path — run_trial()
      - Cloud envs / multi-worker: async path — run_trial_async()

    NOTE: @runtime_checkable only lets isinstance() verify that the method
    names exist; signatures are not checked at runtime.
    """

    def setup(self) -> None:
        """One-time initialization before trials (monkey-patches, SDK init, etc.)."""
        ...

    def cleanup(self) -> None:
        """Resource teardown after all trials complete."""
        ...

    def run_trial(
        self,
        candidate: dict[str, str],
        task: dict,
        *,
        skills_dir: Path | None = None,
        run_id: str | None = None,
    ) -> TrialResult:
        """Sync trial execution. Default: asyncio.run(run_trial_async(...)).

        Args:
            candidate: artifact name → content; the "skills" entry is read
                by BenchflowBackend and injected into the task instruction.
            task: task descriptor; "task_name" and "task_path" are read.
            skills_dir: optional directory of skills deployed into the env.
            run_id: optional stable identifier grouping repeats of one run.
        """
        ...

    async def run_trial_async(
        self,
        candidate: dict[str, str],
        task: dict,
        *,
        skills_dir: Path | None = None,
        run_id: str | None = None,
    ) -> TrialResult:
        """Async trial execution. Backends must implement this."""
        ...
88
+
89
+
90
+ # ---------------------------------------------------------------------------
91
+ # Agent environment helpers
92
+ # ---------------------------------------------------------------------------
93
+
94
+
95
+ def build_agent_env(agent_name: str | None = None) -> dict[str, str]:
96
+ """Forward provider credentials that installed agents expect.
97
+
98
+ Passthrough strategy: forward every key that any supported agent
99
+ might need. Keys absent from the host environment are silently
100
+ skipped, so this is safe for all agents.
101
+ """
102
+ agent_env: dict[str, str] = {}
103
+
104
+ passthrough_keys = (
105
+ "ANTHROPIC_API_KEY",
106
+ "GEMINI_API_KEY",
107
+ "GOOGLE_API_KEY",
108
+ "GOOGLE_APPLICATION_CREDENTIALS",
109
+ "GOOGLE_CLOUD_PROJECT",
110
+ "GOOGLE_CLOUD_LOCATION",
111
+ "GOOGLE_GENAI_USE_VERTEXAI",
112
+ "OPENAI_API_KEY",
113
+ "OPENAI_BASE_URL",
114
+ "OPENAI_MODEL",
115
+ "OPENAI_ORG_ID",
116
+ "OPENAI_PROJECT",
117
+ "AZURE_OPENAI_API_KEY",
118
+ "AZURE_OPENAI_ENDPOINT",
119
+ "AZURE_API_KEY",
120
+ "AZURE_RESOURCE_NAME",
121
+ )
122
+
123
+ for key in passthrough_keys:
124
+ val = os.environ.get(key)
125
+ if val:
126
+ agent_env[key] = val
127
+
128
+ # Azure fallback: many agents expect OPENAI_API_KEY even for Azure
129
+ if "OPENAI_API_KEY" not in agent_env and agent_env.get("AZURE_OPENAI_API_KEY"):
130
+ agent_env["OPENAI_API_KEY"] = agent_env["AZURE_OPENAI_API_KEY"]
131
+
132
+ return agent_env
133
+
134
+
135
+ # ---------------------------------------------------------------------------
136
+ # BenchflowBackend
137
+ # ---------------------------------------------------------------------------
138
+
139
+
140
class BenchflowBackend:
    """Benchflow SDK backend — runs trials via ACP-native agent communication.

    Lifecycle:
      setup() — validate benchflow is importable
      run_trial() / run_trial_async() — execute a single trial
      cleanup() — no-op
    """

    def __init__(
        self,
        agent_name: str = "claude-agent-acp",
        model_name: str = "claude-haiku-4-5-20251001",
        environment_type: str = "docker",
        trials_dir: Path | None = None,
        trajectory_store: list | None = None,
        trajectory_lock=None,
        agent_import_path: str | None = None,
        keep_trials: bool = False,
    ):
        """Store trial configuration; no SDK work happens until setup().

        Args:
            agent_name: benchflow agent to run ("oracle" disables the sandbox user).
            model_name: agent model; a "provider/" prefix is stripped before
                being handed to benchflow (see run_trial_async).
            environment_type: benchflow environment backend (e.g. "docker").
            trials_dir: where job/trial artifacts go; defaults to
                <project root>/.local/trials.
            trajectory_store: optional shared list receiving per-trial entries
                (for dashboard/proposer).
            trajectory_lock: optional lock guarding trajectory_store appends.
            agent_import_path: stored but not read in this class — presumably
                consumed elsewhere; confirm before removing.
            keep_trials: stored but not read in this class — presumably
                consumed elsewhere; confirm before removing.
        """
        self.agent_name = agent_name
        self.model_name = model_name
        self.environment_type = environment_type
        self.trials_dir = trials_dir or (_PROJECT_ROOT / ".local" / "trials")
        self.trajectory_store = trajectory_store
        self.trajectory_lock = trajectory_lock
        self.agent_import_path = agent_import_path
        self.keep_trials = keep_trials
        # Lazily-initialized benchflow SDK handle; set once by setup().
        self._sdk = None

    def setup(self) -> None:
        """Import and initialize benchflow SDK (idempotent)."""
        if self._sdk is not None:
            return
        try:
            from benchflow import SDK
            self._sdk = SDK()
        except ImportError:
            raise RuntimeError(
                "benchflow SDK not installed. Install with: "
                "pip install 'benchflow @ git+https://github.com/benchflow-ai/benchflow.git'"
            )

    def cleanup(self) -> None:
        """No-op — benchflow handles cleanup per-trial."""
        pass

    def run_trial(
        self,
        candidate: dict[str, str],
        task: dict,
        *,
        skills_dir: Path | None = None,
        run_id: str | None = None,
    ) -> TrialResult:
        """Sync trial execution — wraps async via asyncio.run()."""
        self.setup()
        return asyncio.run(
            self.run_trial_async(
                candidate, task, skills_dir=skills_dir, run_id=run_id,
            )
        )

    async def run_trial_async(
        self,
        candidate: dict[str, str],
        task: dict,
        *,
        skills_dir: Path | None = None,
        run_id: str | None = None,
    ) -> TrialResult:
        """Async trial execution via benchflow SDK.

        Maps the clawbench (candidate, task) → (score, side_info) contract
        onto benchflow's SDK.run() → RunResult.

        Args:
            candidate: artifact name → content; the "skills" entry (if any)
                is prepended to the task's instruction.md via _copy_task.
            task: must carry "task_path"; "task_name" is used for labels.
            skills_dir: optional directory of skills deployed into the env.
            run_id: stable per-eval-run id; used as job name and as a
                Dockerfile cache-bust ARG so repeats share one image.
        """
        self.setup()
        if task is None:
            return TrialResult(score=0.0, side_info={"Feedback": {"Status": "no_example"}})

        task_name = task.get("task_name", "?")
        task_path = task["task_path"]
        skills_content = candidate.get("skills", "")

        # Always copy task dir — SDK's context_root staging mutates in-place
        tmp_task = _copy_task(task_path, skills_content, run_id=run_id)

        try:
            # Resolve model name — benchflow uses bare model IDs,
            # clawbench/Harbor uses "provider/model" format
            model = self.model_name
            if model and "/" in model:
                model = model.split("/", 1)[1]

            # Build agent env — auto-inherit API keys
            agent_env = build_agent_env(self.agent_name)

            # Generate unique trial/job names
            trial_name = f"{task_name}__{uuid.uuid4().hex[:8]}"
            _ts = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
            job_name = run_id or f"cb-{_ts}-{uuid.uuid4().hex[:8]}"
            self.trials_dir.mkdir(parents=True, exist_ok=True)
            jobs_dir = str(self.trials_dir)

            # Build pre-agent hooks for claw-* services
            pre_agent_hooks = _build_service_hooks(tmp_task)

            result = await self._sdk.run(
                task_path=tmp_task,
                agent=self.agent_name,
                model=model,
                agent_env=agent_env,
                job_name=job_name,
                trial_name=trial_name,
                jobs_dir=jobs_dir,
                environment=self.environment_type,
                skills_dir=str(skills_dir) if skills_dir else None,
                sandbox_user="agent" if self.agent_name != "oracle" else None,
                pre_agent_hooks=pre_agent_hooks or None,
                context_root=str(_PROJECT_ROOT),
            )
        finally:
            # Temp copy is always removed, even if the SDK run raises.
            shutil.rmtree(tmp_task, ignore_errors=True)

        # Write metadata files for export.py compatibility
        trial_dir = self.trials_dir / job_name / trial_name
        if trial_dir.is_dir():
            (trial_dir / "task_name.txt").write_text(task_name + "\n")
            if run_id:
                (trial_dir / "run_id.txt").write_text(run_id + "\n")

        # Write agent/trajectory.json in ATIF format for export.py
        # NOTE(review): assumes the SDK created trial_dir whenever a
        # trajectory exists — mkdir(exist_ok=True) fails on a missing
        # parent; confirm against SDK.run behavior.
        trajectory = result.trajectory or []
        if trajectory:
            agent_dir = trial_dir / "agent"
            agent_dir.mkdir(exist_ok=True)
            atif = _acp_to_atif(trajectory, self.agent_name, model or "")
            (agent_dir / "trajectory.json").write_text(
                json.dumps(atif, indent=2) + "\n"
            )

        # Map RunResult → TrialResult with clawbench side_info contract
        return self._to_trial_result(result, task_name, skills_content)

    def _to_trial_result(self, result, task_name: str, skills_content: str) -> TrialResult:
        """Convert benchflow RunResult to clawbench TrialResult.

        Also echoes per-task status lines and, when a trajectory_store is
        configured, appends a condensed trajectory entry to it.
        """
        score = 0.0
        side_info: dict = {"Input": {"task_name": task_name}}

        # Honor verifier result even if agent timed out (verifier runs after timeout)
        if result.rewards:
            rewards = result.rewards
            # "score" takes precedence over the legacy "reward" key.
            raw = rewards.get("score", rewards.get("reward", 0.0))
            # Clamp to the [-1.0, 1.0] contract expected downstream.
            score = max(-1.0, min(1.0, float(raw)))
            side_info["Feedback"] = {"Status": "completed", "Rewards": rewards}
            side_info["scores"] = rewards
            if result.error:
                click.echo(f" [{task_name}] WARN: agent error ({result.error}) but verifier returned score={score}", err=True)
        elif result.error:
            click.echo(f" [{task_name}] ERROR: {result.error}", err=True)
            side_info["Feedback"] = {
                "Status": "trial_error",
                "Error": result.error,
            }
        else:
            click.echo(f" [{task_name}] WARNING: no verifier result", err=True)
            side_info["Feedback"] = {"Status": "no_verifier_result"}

        # Build trajectory summary from ACP trajectory
        trajectory = result.trajectory or []
        trajectory_text = _format_acp_trajectory(trajectory)
        side_info["Generated Outputs"] = {
            "Agent Trace": trajectory_text,
        }

        # Timing
        timing = {}
        if result.started_at and result.finished_at:
            timing["total"] = round(
                (result.finished_at - result.started_at).total_seconds(), 1
            )
        side_info["Input"]["timing"] = timing
        if timing.get("total"):
            side_info["Input"]["duration_sec"] = timing["total"]

        # Only echo a score line for trials that produced something useful.
        if not result.error or score != 0.0:
            duration = f" ({timing['total']:.0f}s)" if timing.get("total") else ""
            click.echo(f" [{task_name}] score={score}{duration}")

        # Save to trajectory store (for dashboard/proposer)
        if self.trajectory_store is not None:
            # Hash identifies the candidate's skills text (not security-sensitive).
            candidate_hash = hashlib.md5(skills_content.encode()).hexdigest()[:12]
            raw_steps = []
            for event in trajectory:
                etype = event.get("type", "")
                if etype == "tool_call":
                    raw_steps.append({
                        "type": "tool",
                        "tool": event.get("kind", ""),
                        "command": event.get("title", ""),
                        "content": json.dumps(event.get("content", []))[:300],
                    })
                elif etype == "agent_message":
                    raw_steps.append({
                        "type": "thought",
                        "content": event.get("text", "")[:500],
                    })

            entry = {
                "candidate_hash": candidate_hash,
                "task_name": task_name,
                "score": score,
                "steps": raw_steps,
                "summary": trajectory_text[:500] if trajectory_text else "",
                "duration_sec": timing.get("total", 0),
            }
            # Lock is optional — only needed when the store is shared
            # across worker threads.
            if self.trajectory_lock:
                with self.trajectory_lock:
                    self.trajectory_store.append(entry)
            else:
                self.trajectory_store.append(entry)

        return TrialResult(score=score, side_info=side_info)
363
+
364
+
365
+ # ---------------------------------------------------------------------------
366
+ # Helpers
367
+ # ---------------------------------------------------------------------------
368
+
369
+
370
+ def _copy_task(task_path: str, skills_content: str = "", run_id: str | None = None) -> str:
371
+ """Copy task dir to a temp location, optionally injecting skill content.
372
+
373
+ Always copies so that SDK operations (context_root staging, skills_dir
374
+ injection) don't mutate the original task directory.
375
+ """
376
+ tmpdir_name = f"bf{uuid.uuid4().hex[:12]}"
377
+ tmpdir = os.path.join(tempfile.gettempdir(), tmpdir_name)
378
+ os.makedirs(tmpdir)
379
+ src = Path(task_path).resolve()
380
+ dst = Path(tmpdir)
381
+
382
+ for item in src.iterdir():
383
+ dest = dst / item.name
384
+ if item.is_dir():
385
+ shutil.copytree(item, dest)
386
+ else:
387
+ shutil.copy2(item, dest)
388
+
389
+ # Inject a per-eval-run cache-bust ARG so Daytona rebuilds images between
390
+ # runs but reuses the same image for all repeats within a single run.
391
+ dockerfile = dst / "environment" / "Dockerfile"
392
+ if dockerfile.exists() and run_id:
393
+ content = dockerfile.read_text()
394
+ content = content.replace("FROM ", f"ARG EVAL_RUN={run_id}\nFROM ", 1)
395
+ dockerfile.write_text(content)
396
+
397
+ if skills_content:
398
+ instruction_path = dst / "instruction.md"
399
+ original = instruction_path.read_text() if instruction_path.exists() else ""
400
+ instruction_path.write_text(
401
+ f"# Agent Skill\n\n{skills_content}\n\n---\n\n# Task\n\n{original}"
402
+ )
403
+
404
+ return tmpdir
405
+
406
+
407
+ def _build_service_hooks(task_path: str) -> list:
408
+ """Auto-detect claw-* services from Dockerfile and build pre-agent hooks.
409
+
410
+ Reads config.toml for the service registry, scans the Dockerfile for
411
+ service names, and returns async hook functions that start each matched
412
+ service and wait for its health check.
413
+ """
414
+ import tomllib
415
+
416
+ env_dir = Path(task_path) / "environment"
417
+ dockerfile_path = env_dir / "Dockerfile"
418
+ if not dockerfile_path.exists():
419
+ return []
420
+ # docker-compose tasks handle their own services
421
+ if (env_dir / "docker-compose.yaml").exists():
422
+ return []
423
+
424
+ toml_path = _PROJECT_ROOT / "config.toml"
425
+ if not toml_path.exists():
426
+ return []
427
+
428
+ with open(toml_path, "rb") as f:
429
+ env_cfg = tomllib.load(f)
430
+
431
+ service_registry: dict[str, tuple[str, int]] = {
432
+ name: (cfg["db_path"], cfg["port"])
433
+ for name, cfg in env_cfg.items()
434
+ if name.startswith("claw-")
435
+ }
436
+
437
+ dockerfile_text = dockerfile_path.read_text()
438
+ services = [
439
+ (name, db_path, port)
440
+ for name, (db_path, port) in service_registry.items()
441
+ if name in dockerfile_text
442
+ ]
443
+
444
+ if not services:
445
+ return []
446
+
447
+ async def start_services(env):
448
+ for cli_name, db_path, port in services:
449
+ await env.exec(
450
+ f"{cli_name} --db {db_path} serve --host 0.0.0.0 --port {port} --no-mcp &"
451
+ )
452
+ for cli_name, _db_path, port in services:
453
+ await env.exec(
454
+ f"for i in $(seq 1 30); do curl -sf http://localhost:{port}/health > /dev/null && break; sleep 1; done"
455
+ )
456
+
457
+ return [start_services]
458
+
459
+
460
+ def _acp_to_atif(events: list[dict], agent_name: str, model_name: str) -> dict:
461
+ """Convert ACP trajectory events → ATIF-v1.6 format for export.py."""
462
+ steps = []
463
+ for i, event in enumerate(events, start=1):
464
+ etype = event.get("type", "")
465
+ step: dict = {"step_id": i, "source": "agent"}
466
+ if etype == "tool_call":
467
+ step["message"] = ""
468
+ step["tool_calls"] = [{
469
+ "tool_call_id": event.get("tool_call_id", ""),
470
+ "function_name": event.get("kind", ""),
471
+ "arguments": {"command": event.get("title", "")},
472
+ }]
473
+ content = event.get("content")
474
+ if content:
475
+ step["observation"] = {"results": [{"content": str(content)[:2000]}]}
476
+ elif etype == "agent_message":
477
+ step["message"] = event.get("text", "")
478
+ elif etype == "agent_thought":
479
+ step["source"] = "agent_thought"
480
+ step["message"] = event.get("text", "")
481
+ else:
482
+ continue
483
+ steps.append(step)
484
+
485
+ return {
486
+ "schema_version": "ATIF-v1.6",
487
+ "agent": {"name": agent_name, "model_name": model_name},
488
+ "steps": steps,
489
+ }
490
+
491
+
492
+ def _format_acp_trajectory(trajectory: list[dict], max_chars: int = 8000) -> str:
493
+ """Format ACP trajectory events into a human-readable summary."""
494
+ if not trajectory:
495
+ return "(no trajectory)"
496
+
497
+ lines = []
498
+ for event in trajectory:
499
+ etype = event.get("type", "")
500
+ if etype == "tool_call":
501
+ kind = event.get("kind", "")
502
+ title = event.get("title", "")
503
+ status = event.get("status", "")
504
+ lines.append(f"[{kind}] {title} → {status}")
505
+ elif etype == "agent_message":
506
+ text = event.get("text", "")
507
+ lines.append(f"[Message]: {text[:300]}")
508
+ elif etype == "agent_thought":
509
+ text = event.get("text", "")
510
+ lines.append(f"[Thought]: {text[:200]}")
511
+
512
+ full = "\n".join(lines)
513
+ if len(full) > max_chars:
514
+ return full[:max_chars] + f"\n... ({len(lines)} events total, truncated)"
515
+ return full + f"\n[Summary: {len(trajectory)} events]"