PyPI - clawbench-cli - Versions diffs - 0.1.2__py3-none-any.whl - Mend

clawbench-cli 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (226) hide show

clawbench/data/docker/setup-openclaw.sh ADDED Viewed

@@ -0,0 +1,112 @@
#!/bin/bash
# Writes the OpenClaw configuration for a benchmark run:
#   ~/.openclaw/openclaw.json                         gateway, tools, model and browser settings
#   ~/.openclaw/agents/main/agent/auth-profiles.json  API key profiles
#
# All config comes from env vars set by the test driver (sourced from models.yaml).
#   Required: BASE_URL, API_TYPE, MODEL_NAME
#   Optional: TEMPERATURE, MAX_TOKENS (inserted verbatim as JSON values),
#             API_KEYS (JSON array of key strings),
#             API_KEY  (single key, used when API_KEYS yields no usable key)
set -e

# MODEL_NAME is interpolated into the model id below, so it is required too:
# without it the config would point at the meaningless model "api/".
if [ -z "$BASE_URL" ] || [ -z "$API_TYPE" ] || [ -z "$MODEL_NAME" ]; then
  echo "ERROR: BASE_URL, API_TYPE and MODEL_NAME must be set" >&2
  exit 1
fi

PROVIDER="api"
MODEL="api/$MODEL_NAME"
MODEL_ID="$MODEL_NAME"

# Build optional model parameters
MODEL_OPTS=""
if [ -n "$TEMPERATURE" ]; then
  MODEL_OPTS="$MODEL_OPTS, \"temperature\": $TEMPERATURE"
fi
if [ -n "$MAX_TOKENS" ]; then
  MODEL_OPTS="$MODEL_OPTS, \"maxOutputTokens\": $MAX_TOKENS"
fi

mkdir -p ~/.openclaw/agents/main/agent

# Restrict exec to safe read-only commands (allowlist mode).
# The agent cannot run curl, python, node, etc. — only ls/cat/grep and default safe bins.
cat > ~/.openclaw/openclaw.json << JSONEOF
{
  "gateway": {
    "port": 18789,
    "mode": "local"
  },
  "tools": {
    "exec": {
      "security": "allowlist",
      "safeBins": ["ls", "cat", "find", "file", "jq", "cut", "uniq", "head", "tail", "tr", "wc", "grep", "sort"]
    }
  },
  "agents": {
    "defaults": {
      "workspace": "/root/workspace",
      "skipBootstrap": true,
      "model": {
        "primary": "$MODEL"
      }
    }
  },
  "models": {
    "providers": {
      "$PROVIDER": {
        "baseUrl": "$BASE_URL",
        "api": "$API_TYPE",
        "models": [
          { "id": "$MODEL_ID", "name": "$MODEL_ID", "reasoning": true$MODEL_OPTS }
        ]
      }
    }
  },
  "browser": {
    "enabled": true,
    "defaultProfile": "container",
    "profiles": {
      "container": {
        "cdpUrl": "http://127.0.0.1:9222",
        "color": "#FB542B"
      }
    }
  }
}
JSONEOF

# The heredoc interpolates env vars without escaping, so a stray quote or a
# non-numeric TEMPERATURE/MAX_TOKENS would silently produce a broken config.
# Fail here rather than when the agent tries to load it.
if ! python3 -c 'import json, sys; json.load(open(sys.argv[1]))' ~/.openclaw/openclaw.json; then
  echo "ERROR: generated ~/.openclaw/openclaw.json is not valid JSON; check BASE_URL, API_TYPE, MODEL_NAME, TEMPERATURE and MAX_TOKENS" >&2
  exit 1
fi

# Generate auth-profiles.json with multi-key rotation support.
# The provider is passed through the environment and the heredoc is quoted,
# so no shell value is ever spliced into the Python source.
PROVIDER="$PROVIDER" python3 - << 'PYEOF'
import json
import os
import sys

provider = os.environ['PROVIDER']

# Parse keys from API_KEYS env var, fall back to API_KEY
keys = []
keys_json = os.environ.get('API_KEYS', '')
if keys_json:
    try:
        parsed = json.loads(keys_json)
    except json.JSONDecodeError:
        print('WARNING: API_KEYS is not valid JSON, ignoring it', file=sys.stderr)
        parsed = []
    if isinstance(parsed, list):
        # Keep only usable entries; a bare JSON string would otherwise be
        # iterated character by character and a number would crash.
        keys = [k for k in parsed if isinstance(k, str) and k]
    else:
        print('WARNING: API_KEYS must be a JSON array of strings, ignoring it',
              file=sys.stderr)

single_key = os.environ.get('API_KEY', '')
if not keys and single_key:
    keys = [single_key]

profiles = {}
order = []
for i, key in enumerate(keys, 1):
    name = f'{provider}:api-{i}'
    profiles[name] = {
        'provider': provider,
        'type': 'api_key',
        'key': key,
    }
    order.append(name)

result = {'profiles': profiles, 'order': {provider: order}}
path = os.path.expanduser('~/.openclaw/agents/main/agent/auth-profiles.json')

# Create the file owner-only from the start so the keys are never readable
# through default permissions; chmod still tightens a pre-existing file.
fd = os.open(path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
with os.fdopen(fd, 'w') as f:
    json.dump(result, f, indent=2)
os.chmod(path, 0o600)

print(f'Auth profiles: {len(keys)} API key(s) for {provider}')
PYEOF

clawbench/data/eval/README.md ADDED Viewed

@@ -0,0 +1,95 @@
+# ClawBench Evaluation
+ClawBench evaluation is a **post-session** step: first you run agents to collect trajectories, then you evaluate those trajectories against human reference runs.
+```
+Step 1: Run agents          Step 2: Evaluate
+(test-driver)               (this directory)
+./run.sh                    Claude Code subagents compare
+  or                        agent traces vs human references
+test-driver/batch.py        under eval/agentic_eval.md rubric
+       │                              │
+       ▼                              ▼
+  test-output/                {model}-eval-results.csv
+    {model}/{run}/            {model}-eval-results.json
+      data/
+        actions.jsonl
+        requests.jsonl
+        screenshots/
+        recording.mp4
+        interception.json
+        agent-messages.jsonl
+```
+## How It Works
+The evaluator is a Claude Code subagent that compares two trajectories side by side:
+- **Agent trajectory** -- the five-layer recording from the AI agent's run
+- **Human reference trajectory** -- the same five layers recorded by a human annotator completing the task correctly
+The evaluator follows a fixed rubric ([`agentic_eval.md`](agentic_eval.md)) to determine PASS or FAIL for each task. This comparative approach means the evaluator has a concrete ground truth -- it knows exactly which form fields to fill, which buttons to click, and which endpoint the final submission hits.
+## Prerequisites
+- Agent run outputs in `test-output/{model}/` (produced by `test-driver/run.py` or `batch.py`)
+- Human reference runs in a separate directory (same five-layer format)
+- [Claude Code](https://docs.anthropic.com/en/docs/claude-code) installed
+## Running Evaluation
+Open Claude Code at the project root and send the following prompt. Replace the three placeholders with your actual values:
+- `{agent_dir}` -- path to the model's output directory (e.g., `test-output/claude-sonnet-4-6/`)
+- `{human_dir}` -- path to the human reference directory (e.g., `test-output/human/`)
+- `{model}` -- model name for output file naming (e.g., `claude-sonnet-4-6`)
+```
+Read the evaluation rubric at eval/agentic_eval.md and follow it strictly.
+Evaluate all 153 agent runs against their corresponding human reference runs.
+Agent runs directory: {agent_dir}
+Human reference directory: {human_dir}
+Each directory contains multiple run subdirectories (one per task). Each run subdirectory contains:
+- run-meta.json
+- data/actions.jsonl
+- data/requests.jsonl
+- data/screenshots/
+- data/recording.mp4
+- data/interception.json
+Agent runs also contain data/agent-messages.jsonl.
+Dispatch 16 subagents to evaluate in parallel, each subagent handling ~10 tasks. Each subagent should:
+1. Match agent run to human run by task_id in run-meta.json
+2. Read both run-meta.json files to get the task instruction and context
+3. Compare the agent trajectory against the human reference trajectory
+4. Determine PASS or FAIL with justification, noting which evidence files and lines support the decision
+Dispatch 3 supervisor agents to monitor the work of the 16 evaluation subagents, checking for consistency and correctness.
+After all subagents complete, merge their results and output two files:
+1. {model}-eval-results.csv — columns: task_id, task_name, model, pass, brief_justification
+2. {model}-eval-results.json — detailed results per task, each entry including: task_id, task_name, model, pass, justification, and evidence references (file path and line numbers that support the verdict)
+```
+## Output
+The evaluation produces two files at the project root:
+| File | Format | Description |
+|------|--------|-------------|
+| `{model}-eval-results.csv` | CSV | Quick summary -- one row per task with PASS/FAIL and a brief justification |
+| `{model}-eval-results.json` | JSON | Detailed results with full justification and evidence references (file paths + line numbers) |
+## Evaluation Rubric
+The full rubric is in [`agentic_eval.md`](agentic_eval.md). Key rules:
+- **Interceptor block = PASS** if all prior steps are correct (the interceptor is designed to cut the session short)
+- **Payment must be attempted** -- the agent has a dummy credit card and must try to use it
+- **Phone verification wall = PASS** if all prior steps are complete (the agent has no phone number)
+- **CAPTCHA must be solved** -- the agent must attempt any CAPTCHA it encounters; skipping it or failing to solve it is FAIL
+- **Email must be used** when the task requires registration or verification

clawbench/data/eval/agentic_eval.md ADDED Viewed

@@ -0,0 +1,53 @@
+# ClawBench Agentic Evaluator Rubric
+This is the evaluation rubric used by the Claude Code evaluator subagent. It defines the PASS/FAIL criteria for judging whether an AI agent successfully completed a ClawBench task.
+## Role
+You are an evaluator for ClawBench, a real-world browser agent benchmark. You will determine whether an AI browser agent successfully completed a given web task by comparing the agent's trajectory against a human reference trajectory and examining all available evidence.
+## Evidence
+Each run directory contains five layers of synchronized data:
+| Layer | File | What it tells you |
+|-------|------|-------------------|
+| Session replay | `data/recording.mp4` | What the agent **saw** |
+| Action screenshots | `data/screenshots/*.png` | Step-by-step visual state |
+| HTTP traffic | `data/requests.jsonl` | What network requests the agent **triggered** |
+| Browser actions | `data/actions.jsonl` | What the agent **did** (clicks, keystrokes, scrolls) |
+| Agent messages | `data/agent-messages.jsonl` | What the agent **thought** (reasoning, tool calls) |
+The human reference run has the same layers (except agent messages), providing a concrete ground truth for what correct task completion looks like.
+Additional metadata:
+- `run-meta.json` -- run metadata including `task_id`, model, duration, and interception status
+- `data/interception.json` -- whether the request interceptor blocked the final action
+## PASS/FAIL Criteria
+1. **General rule.** If the agent completed the task as specified in the instruction, PASS. Otherwise, FAIL.
+2. **Email usage.** The agent has access to a disposable email and should use it to receive verification codes or register for accounts when needed. If the agent did not use the email when the task required it, FAIL.
+3. **Payment attempts.** The agent has access to a dummy credit card. If the task requires payment, the agent must attempt to fill in payment details and click the pay/checkout button. The payment will be declined, but the agent must attempt it. If the agent skipped the payment step, FAIL.
+4. **Checkout completion.** When asked to "proceed to checkout", the agent must attempt to fill out the checkout form and click the submit/checkout button. Simply adding items to the cart is not sufficient; if the agent stops there, FAIL.
+5. **Interceptor blocks.** The testing environment has a request interceptor that blocks irreversible actions (submitting applications, sending emails, etc.). The result is recorded in `data/interception.json`. If the interceptor blocked the final action **and** all prior inputs and steps are correct, PASS.
+6. **Phone verification.** The agent has an email but no phone number. If a task requires phone verification, the agent should complete all steps up to the phone verification wall. If all prior steps are correct, PASS -- even if the agent could not proceed past phone verification.
+7. **Phone number field awareness.** If the UI shows a required phone number field and the agent recognized this, it is acceptable for the agent to not click submit. If all other inputs are correct, PASS.
+8. **CAPTCHA.** If the agent encounters a CAPTCHA, it must attempt to solve it. If it cannot solve it, FAIL.
+9. **Interceptor and page state.** If the agent performed all actions correctly but the page does not display a success/result page because the interceptor blocked the final request, PASS -- the interceptor is expected to cut the session short.
+## Judgment Format
+For each task, output:
+- **task_id** -- from `run-meta.json`
+- **pass** -- `true` or `false`
+- **justification** -- brief explanation of the verdict
+- **evidence** -- specific file paths and line numbers that support the decision

clawbench/data/extension-server/.python-version ADDED Viewed

@@ -0,0 +1 @@
+3.12

clawbench/data/extension-server/README.md ADDED Viewed

@@ -0,0 +1,54 @@
+# ClawBench Extension Server
+The ClawBench Extension Server is a Python backend server that receives data from the ClawBench Chrome Extension and processes it for benchmarking purposes. It is responsible for:
+- Organizing and storing the data received from the extension in a structured format.
+- Receiving user actions and storing them in a jsonl format.
+- Receiving screenshots and storing them in a dedicated folder.
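+- Receiving and converting session recording chunks into .mp4 files when the session is complete.
+- Logging the browser's HTTP requests over CDP to `requests.jsonl` and, when an eval schema is present, blocking the matching request and recording it in `interception.json`.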
+The implementation is minimal, with only the necessary level of complexity and customization.
+## Implementation
+Single file: `server.py` — a FastAPI application run with uvicorn.
+### Endpoints
+| Method | Path | Content-Type | Description |
+|--------|------|-------------|-------------|
+| GET | `/api/status` | — | Returns `{"status": "ok", "eval_interceptor_ready": <bool>}` |
+| POST | `/api/action` | application/json | Appends action JSON to `actions.jsonl` |
+| POST | `/api/screenshot` | application/json | Decodes base64 PNG from `{"timestamp", "data"}`, saves to `screenshots/{timestamp}.png` |
+| POST | `/api/stop` | — | Signals session stop, returns session summary |
+| POST | `/api/stop-recording` | — | Stops ffmpeg recording, finalizes MP4 |
+### Screen Recording
+The server starts an ffmpeg process on startup that records the Xvfb virtual display (`DISPLAY=:99`) to `/data/recording.mp4` using H.264 at 15fps. On `/api/stop-recording`, the ffmpeg process is gracefully terminated with SIGINT to finalize the MP4 file. The `/api/stop` endpoint handles session bookkeeping (eval promotion, watchdog signaling) without stopping the recording, allowing a grace period to capture the final state.
+### Data Storage
+All data is written to the directory specified by `CLAWBENCH_DATA_DIR` (default: `/data`):
+```
+/data/
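+  actions.jsonl       # Append-only, one JSON object per line
+  requests.jsonl      # Append-only, one logged HTTP request per line
+  interception.json   # Written once, when the interceptor blocks a request
+  screenshots/        # {timestamp}.png files
+  recording.mp4       # H.264 screen recording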
+```
+### Running Locally
+```bash
+cd extension-server
+CLAWBENCH_DATA_DIR=./data DISPLAY=:99 uv run uvicorn server:app --host 0.0.0.0 --port 7878
+```
+### Dependencies
+Defined in `pyproject.toml`:
+- `fastapi[standard]` — web framework + uvicorn
+- `websocket-client` — WebSocket client for CDP communication
+System dependency: `ffmpeg` (for screen recording and MP4 encoding).

clawbench/data/extension-server/pyproject.toml ADDED Viewed

@@ -0,0 +1,7 @@
+[project]
+name = "extension-server"
+version = "0.1.0"
+description = "ClawBench extension server"
+readme = "README.md"
+requires-python = "==3.12.*"
+dependencies = ["fastapi[standard]>=0.115", "websocket-client>=1.8"]

clawbench/data/extension-server/server.py ADDED Viewed

@@ -0,0 +1,360 @@
+import base64
+import json
+import os
+import re
+import signal
+import subprocess
+import threading
+import time
+from contextlib import asynccontextmanager
+from pathlib import Path
+from urllib.parse import parse_qs, urlparse
+import urllib.request
+import websocket
+from fastapi import FastAPI
# Output locations: everything the server records lives under DATA_DIR.
DATA_DIR = Path(os.getenv("CLAWBENCH_DATA_DIR", "/data"))
ACTIONS_FILE = DATA_DIR.joinpath("actions.jsonl")
SCREENSHOTS_DIR = DATA_DIR.joinpath("screenshots")
RECORDING_PATH = DATA_DIR.joinpath("recording.mp4")
REQUESTS_FILE = DATA_DIR.joinpath("requests.jsonl")
INTERCEPTION_FILE = DATA_DIR.joinpath("interception.json")

# Optional evaluation schema describing which request to intercept.
EVAL_SCHEMA_PATH = Path("/eval-schema.json")

# HTTP endpoint of Chrome's DevTools Protocol.
CDP_URL = "http://127.0.0.1:9222"

# Mutable module state, reassigned at runtime through `global` statements.
ffmpeg_proc = None
eval_schema = None
eval_interceptor_ready = False
def _const_fields_match(expected, actual):
    """Report whether every key/value pair of ``expected`` appears in ``actual``.

    An empty or missing ``expected`` matches anything. A list ``actual``
    (e.g. a batched GraphQL body) matches when at least one of its items
    matches. Any other empty or non-dict ``actual`` never matches.
    """
    if not expected:
        return True
    if isinstance(actual, list):
        for item in actual:
            if _const_fields_match(expected, item):
                return True
        return False
    if not actual or not isinstance(actual, dict):
        return False
    for key, wanted in expected.items():
        if not actual.get(key) == wanted:
            return False
    return True
# URL prefixes excluded from the request log: the server's own port (7878)
# and browser-internal schemes.
FILTERED_PREFIXES = (
    "http://localhost:7878",
    "http://127.0.0.1:7878",
    "chrome-extension://",
    "devtools://",
    "chrome://",
)
def _parse_body(post_data):
    """Parse a request's ``postData`` into a structured body.

    Returns:
        ``None`` for a missing or empty body; the decoded value when the body
        is valid JSON; a ``{field: value}`` dict (repeated fields become
        lists) when the body looks like URL-encoded form data; otherwise the
        input unchanged.
    """
    if not post_data:
        return None
    try:
        return json.loads(post_data)
    except (ValueError, TypeError):
        # Not JSON (JSONDecodeError is a ValueError) — try form data next.
        pass
    # Only treat the payload as a form when it contains at least one
    # key=value pair. With keep_blank_values=True, parse_qs turns almost any
    # text into {"<text>": ""}, which would mangle plain-text bodies instead
    # of returning them raw.
    if isinstance(post_data, str) and "=" in post_data:
        try:
            fields = parse_qs(post_data, keep_blank_values=True)
        except ValueError:
            fields = None
        if fields:
            return {name: values[0] if len(values) == 1 else values
                    for name, values in fields.items()}
    return post_data
def _log_request(log_file, params):
    """Append one Fetch.requestPaused event to the request log as a JSON line.

    Requests whose URL starts with one of FILTERED_PREFIXES are skipped.
    The line is flushed right after it is written. Always returns None.
    """
    req = params["request"]
    url = req["url"]
    # str.startswith accepts a tuple, so this tests every prefix in one call.
    if url.startswith(FILTERED_PREFIXES):
        return

    # Single-valued query parameters are flattened; repeated ones stay lists.
    query = {}
    for name, values in parse_qs(urlparse(url).query).items():
        query[name] = values[0] if len(values) == 1 else values

    record = {
        "timestamp": time.time(),
        "url": url,
        "method": req["method"],
        "headers": req.get("headers", {}),
        "body": _parse_body(req.get("postData")),
        "query_params": query,
        "resource_type": params.get("resourceType", "Other"),
    }
    log_file.write(json.dumps(record) + "\n")
    log_file.flush()
def _match_intercept(request, url_regex, required_method, match_body, match_params):
    """Return a summary of ``request`` if it passes every block filter, else ``None``.

    The summary holds the URL, method, query parameters and parsed body of
    the request.
    """
    url = request["url"]
    if not url_regex.search(url):
        return None
    if required_method and request["method"] != required_method:
        return None
    # Parse request data for body/params matching
    query_params = {k: v[0] if len(v) == 1 else v
                    for k, v in parse_qs(urlparse(url).query).items()}
    body = _parse_body(request.get("postData"))
    if not _const_fields_match(match_body, body):
        return None
    if not _const_fields_match(match_params, query_params):
        return None
    return {
        "url": url,
        "method": request["method"],
        "params": query_params,
        "body": body,
    }


def start_cdp_handler(url_pattern=None, required_method=None,
                      match_body=None, match_params=None):
    """Connect to Chrome via CDP, log all requests, and optionally block one.

    Runs until the WebSocket stops delivering messages, so it is meant to be
    started in its own thread.

    Args:
        url_pattern: Regular expression searched in each request URL. When
            falsy, requests are only logged and never blocked.
        required_method: HTTP method a request must use to be blocked; falsy
            means any method.
        match_body: Key/value pairs the parsed request body must contain;
            falsy means any body.
        match_params: Key/value pairs the URL query parameters must contain;
            falsy means any parameters.

    Raises:
        re.error: If ``url_pattern`` is not a valid regular expression.
    """
    global eval_interceptor_ready

    # Compile once, before anything else: an invalid pattern now fails before
    # the interceptor is reported ready, instead of killing the loop on the
    # first paused request.
    url_regex = re.compile(url_pattern) if url_pattern else None

    # Wait for Chrome CDP to be ready
    ws_url = None
    for _ in range(30):
        try:
            with urllib.request.urlopen(f"{CDP_URL}/json/version") as resp:
                ws_url = json.loads(resp.read())["webSocketDebuggerUrl"]
            break
        except Exception:
            # Chrome may still be starting; any failure just means "retry".
            time.sleep(1)
    if not ws_url:
        print("[cdp] CDP not available, skipping handler", flush=True)
        return

    ws = websocket.create_connection(ws_url)
    next_id = 1

    def send(method, params=None, session_id=None):
        """Send one CDP command, optionally addressed to a child session."""
        nonlocal next_id
        msg = {"id": next_id, "method": method, "params": params or {}}
        if session_id:
            msg["sessionId"] = session_id
        ws.send(json.dumps(msg))
        next_id += 1

    def resume(request_id, session_id):
        """Let a paused request proceed unmodified."""
        send("Fetch.continueRequest", {"requestId": request_id}, session_id)

    # Everything after the connection is opened runs under try/finally so the
    # socket is closed even if setup (not just the receive loop) fails.
    try:
        # Auto-attach to all targets with flatten so events come on this connection.
        # waitForDebuggerOnStart=True pauses new targets until we explicitly resume
        # them, which prevents the "Debugger paused in another tab" Chrome banner
        # and ensures no requests slip through before Fetch.enable is active.
        send("Target.setAutoAttach", {
            "autoAttach": True,
            "waitForDebuggerOnStart": True,
            "flatten": True,
        })
        if url_regex is not None:
            eval_interceptor_ready = True
            print(f"[cdp] Interceptor connected, watching for: {url_pattern}", flush=True)
        else:
            print("[cdp] Request logger connected (no intercept pattern)", flush=True)

        # Track sessions where Fetch is enabled, and map sessions to target IDs
        # so we can bring the correct tab to front when it receives activity.
        fetch_sessions = set()
        session_to_target = {}  # sessionId -> targetId
        active_target = None    # targetId of the tab most recently brought to front

        with open(REQUESTS_FILE, "a") as log_file:
            while True:
                try:
                    raw = ws.recv()
                except Exception:
                    break
                if not raw:
                    # An empty frame carries no CDP message; treat it as the
                    # end of the stream rather than spinning on it.
                    break
                try:
                    msg = json.loads(raw)
                except (ValueError, TypeError):
                    # Skip a frame that is not JSON instead of losing the
                    # whole handler.
                    continue
                if not isinstance(msg, dict):
                    continue
                method = msg.get("method")
                session_id = msg.get("sessionId")

                # When a new target attaches, enable Fetch then resume execution.
                # Because waitForDebuggerOnStart=True, the target is paused until
                # we call Runtime.runIfWaitingForDebugger — this avoids the
                # "Debugger paused in another tab" banner and ensures Fetch is
                # active before any requests fire.
                if method == "Target.attachedToTarget":
                    child_session = msg["params"]["sessionId"]
                    target_type = msg["params"]["targetInfo"]["type"]
                    target_id = msg["params"]["targetInfo"]["targetId"]
                    if target_type == "page":
                        session_to_target[child_session] = target_id
                        if child_session not in fetch_sessions:
                            send("Fetch.enable", {
                                "patterns": [{"urlPattern": "*", "requestStage": "Request"}],
                            }, child_session)
                            fetch_sessions.add(child_session)
                            print(
                                f"[cdp] Fetch enabled on session {child_session[:12]}...", flush=True)
                    # Always resume the target so it doesn't stay paused
                    send("Runtime.runIfWaitingForDebugger", {}, child_session)
                    continue

                if method != "Fetch.requestPaused":
                    if "error" in msg and msg.get("id"):
                        print(f"[cdp] CDP error: {msg['error']}", flush=True)
                    continue

                params = msg["params"]
                request = params["request"]
                request_url = request["url"]
                request_id = params["requestId"]

                # Auto-focus: when a page navigation (Document request) happens on a
                # background tab, bring that tab to front so the screen recording and
                # screenshots always show the tab the agent is working on.
                resource_type = params.get("resourceType", "")
                if resource_type == "Document" and session_id:
                    target_id = session_to_target.get(session_id)
                    if target_id and target_id != active_target:
                        send("Target.activateTarget", {"targetId": target_id})
                        active_target = target_id
                        print(f"[cdp] Auto-focused tab {target_id[:12]}... (Document request)", flush=True)

                # Log every non-internal request
                _log_request(log_file, params)

                # --- Intercept: block if URL + method + body/params match ---
                request_obj = None
                if url_regex is not None:
                    request_obj = _match_intercept(
                        request, url_regex, required_method, match_body, match_params)
                if request_obj is None:
                    # No intercept pattern, or some filter did not match.
                    resume(request_id, session_id)
                    continue

                # All filters matched — block the request
                print(f"[interceptor] Blocked: {request_url[:100]}", flush=True)
                send("Fetch.failRequest", {
                     "requestId": request_id, "errorReason": "BlockedByClient"}, session_id)
                # Only the first blocked request is recorded.
                if not INTERCEPTION_FILE.exists():
                    result = {"intercepted": True, "request": request_obj,
                              "schema": eval_schema}
                    INTERCEPTION_FILE.write_text(json.dumps(result, indent=2))
                # Best-effort stop notification: a failure is reported but
                # must not end the handler, and the timeout keeps this thread
                # from hanging on an unresponsive server.
                try:
                    stop_request = urllib.request.Request(
                        "http://127.0.0.1:7878/api/stop", method="POST")
                    with urllib.request.urlopen(stop_request, timeout=10):
                        pass
                except Exception as exc:
                    print(f"[interceptor] Could not notify /api/stop: {exc}", flush=True)
    finally:
        ws.close()
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Prepare output files, start recording and the CDP handler; stop ffmpeg on exit.

    Startup:
      * creates the screenshots directory and empty action/request logs;
      * loads the eval schema from EVAL_SCHEMA_PATH when that file exists;
      * starts an ffmpeg x11grab recording of ``$DISPLAY`` (default ``:99``);
      * starts ``start_cdp_handler`` in a daemon thread.

    Shutdown: sends SIGINT to ffmpeg and waits up to 5 seconds for it to exit.
    """
    global ffmpeg_proc, eval_schema
    SCREENSHOTS_DIR.mkdir(parents=True, exist_ok=True)
    ACTIONS_FILE.touch(exist_ok=True)
    REQUESTS_FILE.touch(exist_ok=True)

    url_pattern = None
    required_method = None
    match_body = None
    match_params = None
    if EVAL_SCHEMA_PATH.exists():
        eval_schema = json.loads(EVAL_SCHEMA_PATH.read_text())
        # A missing or empty pattern means "log only, block nothing".
        url_pattern = eval_schema.get("url_pattern") or None
        required_method = eval_schema.get("method")
        match_body = eval_schema.get("body")
        match_params = eval_schema.get("params")

    # Start screen recording of the Xvfb display
    display = os.environ.get("DISPLAY", ":99")
    ffmpeg_proc = subprocess.Popen(
        [
            "ffmpeg", "-y",
            "-f", "x11grab",
            "-video_size", "1920x1080",
            "-framerate", "15",
            "-i", display,
            "-c:v", "libx264",
            "-preset", "ultrafast",
            "-crf", "28",
            str(RECORDING_PATH),
        ],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )

    # Start CDP handler: always logs requests, optionally blocks by URL pattern + method + body/params
    threading.Thread(target=start_cdp_handler,
                     args=(url_pattern, required_method, match_body, match_params),
                     daemon=True).start()

    try:
        yield
    finally:
        # Runs even if the application exits with an error.
        if ffmpeg_proc and ffmpeg_proc.poll() is None:
            ffmpeg_proc.send_signal(signal.SIGINT)
            try:
                ffmpeg_proc.wait(timeout=5)
            except subprocess.TimeoutExpired:
                # Report the slow exit instead of failing shutdown; ffmpeg is
                # deliberately not killed so it can still finish the file.
                print("[recording] ffmpeg did not exit within 5s of SIGINT", flush=True)


app = FastAPI(lifespan=lifespan)
@app.get("/api/status")
async def status():
    """Report liveness plus the current ``eval_interceptor_ready`` flag."""
    payload = {"status": "ok"}
    payload["eval_interceptor_ready"] = eval_interceptor_ready
    return payload
@app.post("/api/action")
async def action(data: dict):
    """Append the posted action to ACTIONS_FILE as one JSON line."""
    with open(ACTIONS_FILE, "a") as sink:
        # print() supplies the trailing newline that terminates the record.
        print(json.dumps(data), file=sink)
    return {"status": "ok"}
@app.post("/api/screenshot")
async def screenshot(data: dict):
    """Decode a base64 image and save it as ``{timestamp}.png`` in SCREENSHOTS_DIR.

    Args:
        data: JSON body with ``data`` (base64-encoded image bytes) and an
            optional ``timestamp`` used as the file name (default ``0``).

    Returns:
        ``{"status": "ok"}`` once the file is written.

    Raises:
        HTTPException: 400 when ``data`` is missing or is not valid base64.
    """
    # Local import: only this endpoint needs it, and fastapi is already a
    # dependency of this module.
    from fastapi import HTTPException

    try:
        img_bytes = base64.b64decode(data["data"])
    except (KeyError, TypeError, ValueError) as exc:
        # binascii.Error (bad padding) is a ValueError subclass.
        raise HTTPException(
            status_code=400,
            detail="'data' must be a base64-encoded image") from exc

    # The timestamp comes from the client and becomes a file name: neutralise
    # path separators so it cannot escape SCREENSHOTS_DIR. Ordinary
    # timestamps are unaffected.
    ts = re.sub(r"[/\\\x00]", "_", str(data.get("timestamp", 0)))
    (SCREENSHOTS_DIR / f"{ts}.png").write_bytes(img_bytes)
    return {"status": "ok"}
def _count_lines(path):
    """Return the number of lines in ``path``, or 0 when the file does not exist."""
    try:
        with open(path) as f:
            return sum(1 for _ in f)
    except FileNotFoundError:
        return 0


@app.post("/api/stop")
async def stop():
    """Flag the session as stopped and return a summary of what was recorded.

    Creates the ``.stop-requested`` marker file in DATA_DIR, then counts the
    logged actions, screenshots and requests. A missing log file counts as 0:
    the original checked ``exists()`` only after ``open()`` had already
    raised, so the check could never take effect.
    """
    # Signal the entrypoint watchdog to kill the agent
    (DATA_DIR / ".stop-requested").touch()
    return {
        "status": "stopped",
        "actions_count": _count_lines(ACTIONS_FILE),
        "screenshots_count": len(list(SCREENSHOTS_DIR.glob("*.png"))),
        "requests_count": _count_lines(REQUESTS_FILE),
        "has_recording": RECORDING_PATH.exists(),
    }
@app.post("/api/stop-recording")
async def stop_recording():
    """Ask ffmpeg to finish the recording and report whether the file exists.

    Sends SIGINT to the ffmpeg process if it is still running and waits up to
    10 seconds for it to exit. A slow exit is reported in the log rather than
    turned into a failed request.
    """
    if ffmpeg_proc and ffmpeg_proc.poll() is None:
        ffmpeg_proc.send_signal(signal.SIGINT)
        try:
            ffmpeg_proc.wait(timeout=10)
        except subprocess.TimeoutExpired:
            # ffmpeg is deliberately not killed so it can still finish
            # writing the file.
            print("[recording] ffmpeg did not exit within 10s of SIGINT", flush=True)
    return {"status": "recording_stopped", "has_recording": RECORDING_PATH.exists()}