PyPI - harness-browser - Versions diffs - 0.1.1__py3-none-any.whl - Mend

harness-browser 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) hide show

harness_browser/__init__.py +18 -0
harness_browser/actions/__init__.py +0 -0
harness_browser/actions/capture.py +228 -0
harness_browser/actions/interact.py +163 -0
harness_browser/actions/js_eval.py +59 -0
harness_browser/actions/navigate.py +76 -0
harness_browser/cdp/__init__.py +0 -0
harness_browser/cdp/client.py +135 -0
harness_browser/cdp/launcher.py +337 -0
harness_browser/dom/__init__.py +0 -0
harness_browser/dom/builder.py +273 -0
harness_browser/dom/refs.py +29 -0
harness_browser/hooks.py +50 -0
harness_browser/mcp_server.py +146 -0
harness_browser/mode.py +75 -0
harness_browser/models.py +42 -0
harness_browser/profile.py +51 -0
harness_browser/py.typed +0 -0
harness_browser/session.py +516 -0
harness_browser/settings.py +155 -0
harness_browser/tool_interface.py +122 -0
harness_browser-0.1.1.dist-info/METADATA +344 -0
harness_browser-0.1.1.dist-info/RECORD +24 -0
harness_browser-0.1.1.dist-info/WHEEL +4 -0

harness_browser/__init__.py ADDED Viewed

@@ -0,0 +1,18 @@
+"""harness-browser: AI-friendly browser automation via CDP."""
+from harness_browser.mode import BrowserMode
+from harness_browser.models import ActionMetrics, TabInfo, ToolResult
+from harness_browser.session import BrowserSession
+from harness_browser.settings import HarnessSettings, settings
+from harness_browser.tool_interface import browser_tool
+# Names re-exported as the package's public API; this list is what
+# ``from harness_browser import *`` exposes.
+__all__ = [
+    "BrowserSession",
+    "browser_tool",
+    "BrowserMode",
+    "HarnessSettings",
+    "settings",
+    "ToolResult",
+    "ActionMetrics",
+    "TabInfo",
+]

harness_browser/actions/__init__.py ADDED Viewed

File without changes

harness_browser/actions/capture.py ADDED Viewed

@@ -0,0 +1,228 @@
+"""Screenshot capture action.
+Writes a PNG file to disk and returns its path. We deliberately do **not**
+return the raw base64 payload to callers — base64 strings are large and
+costly to push through agent toolchains, and the file path is what
+downstream renderers (file_preview, dashboards, MCP clients) need anyway.
+Output location precedence:
+1. Explicit ``path=`` argument (absolute → as-is, relative → under
+   ``settings.screenshots_dir``)
+2. ``settings.screenshots_dir / harness-<timestamp_ms>.png``
+Optional flags:
+- ``full_page=True`` — capture the entire scrollable page using
+  ``Page.getLayoutMetrics().cssContentSize`` to size the clip rectangle.
+  **Use sparingly.** Real-world pages scroll for many screens (10+ viewports
+  is typical), so ``full_page`` PNGs are huge and rarely useful for agents
+  trying to locate one element — prefer the viewport default plus targeted
+  scrolling. ``full_page`` is appropriate only when the caller explicitly
+  needs a single-image archival/regression capture.
+- ``element_ref="..."`` — clip to one element's bounding box. Mutually
+  exclusive with ``full_page`` (element_ref wins to keep behavior
+  predictable).
+The action also enriches ``ToolResult.metadata`` with the page ``url`` and
+``title`` so the caller can surface them without an extra ``Runtime.evaluate``.
+"""
+from __future__ import annotations
+import base64
+import json
+import time
+from pathlib import Path
+from typing import Any
+from harness_browser.cdp.client import CDPClient, CDPSessionError
+from harness_browser.dom.refs import RefCache
+from harness_browser.models import ActionMetrics, ToolResult
+from harness_browser.settings import HarnessSettings
+from harness_browser.settings import settings as _default_settings
+def _resolve_path(
+    path: str | Path | None,
+    cfg: HarnessSettings,
+) -> Path:
+    """Compute the destination path for a screenshot."""
+    base = Path(cfg.screenshots_dir)
+    if path:
+        candidate = Path(path).expanduser()
+        if not candidate.is_absolute():
+            candidate = base / candidate
+        candidate.parent.mkdir(parents=True, exist_ok=True)
+        return candidate
+    base.mkdir(parents=True, exist_ok=True)
+    return base / f"harness-{int(time.time() * 1000)}.png"
+async def _page_context(client: CDPClient) -> dict[str, str]:
+    """Return ``{"url": ..., "title": ...}`` via a single Runtime.evaluate.
+    Failures are non-fatal — we just return empty strings.
+    """
+    try:
+        info = await client.send(
+            "Runtime.evaluate",
+            {
+                "expression": (
+                    "JSON.stringify({url: location.href, title: document.title})"
+                ),
+                "returnByValue": True,
+            },
+        )
+        raw = info.get("result", {}).get("value", "{}")
+        data = json.loads(raw) if isinstance(raw, str) else {}
+        return {
+            "url": str(data.get("url", "")),
+            "title": str(data.get("title", "")),
+        }
+    except Exception:
+        return {"url": "", "title": ""}
+async def screenshot(
+    client: CDPClient,
+    ref_cache: RefCache,
+    crop: bool = False,
+    element_ref: str | None = None,
+    full_page: bool = False,
+    path: str | Path | None = None,
+    settings: HarnessSettings | None = None,
+) -> ToolResult:
+    """Capture a screenshot, write it to disk, and return its path.
+    Args:
+        client: Connected CDP client for the active page.
+        ref_cache: Ref cache for ``element_ref`` lookup.
+        crop: Reserved for future use; currently has no independent effect.
+        element_ref: If set, crop to the bounding box of this element ref.
+            Takes precedence over ``full_page``.
+        full_page: If True (and ``element_ref`` is unset), capture the full
+            scrollable page, not just the visible viewport. **Default False
+            — almost always leave it that way.** Modern landing pages
+            routinely scroll for 10+ viewports; a ``full_page`` PNG is
+            then several MB of mostly-empty visual noise that drowns
+            agents in tokens. Reserve ``full_page=True`` for archival /
+            visual-regression captures where a single image is the
+            explicit deliverable.
+        path: Optional output path. Absolute paths used verbatim; relative
+            paths resolve under ``settings.screenshots_dir``.
+        settings: :class:`HarnessSettings` override. Defaults to the module
+            singleton (env-driven).
+    Returns:
+        :class:`ToolResult` whose ``content`` is the absolute path string of
+        the saved PNG. ``metadata`` carries ``{"url", "title", "full_page",
+        "width", "height", "size_kb", "path"}`` so callers can render context
+        without a follow-up CDP call.
+    """
+    cfg = settings or _default_settings
+    start = time.monotonic()
+    params: dict[str, Any] = {"format": "png"}
+    width = 0
+    height = 0
+    if element_ref is not None:
+        node_id = ref_cache.lookup(element_ref)
+        if node_id is None:
+            raise CDPSessionError(
+                f"Ref '{element_ref}' not found. Call dom_tree() first."
+            )
+        box = await client.send("DOM.getBoxModel", {"nodeId": node_id})
+        model = box.get("model", {})
+        content_pts = model.get("content", [0, 0, 100, 0, 100, 100, 0, 100])
+        x = min(content_pts[0::2])
+        y = min(content_pts[1::2])
+        width = max(content_pts[0::2]) - x
+        height = max(content_pts[1::2]) - y
+        params["clip"] = {
+            "x": x,
+            "y": y,
+            "width": width,
+            "height": height,
+            "scale": 1,
+        }
+    elif full_page:
+        # Use CSS content size so the clip captures the full scrollable page,
+        # not just the current viewport. ``captureBeyondViewport`` lets Chrome
+        # render outside the visible area into the PNG.
+        metrics = await client.send("Page.getLayoutMetrics", {})
+        # Newer CDP exposes cssContentSize / cssVisualViewport; fall back to
+        # the older snake-cased fields if needed.
+        css = metrics.get("cssContentSize") or metrics.get("contentSize") or {}
+        width = int(css.get("width", 0)) or 0
+        height = int(css.get("height", 0)) or 0
+        if width > 0 and height > 0:
+            params["clip"] = {
+                "x": 0,
+                "y": 0,
+                "width": width,
+                "height": height,
+                "scale": 1,
+            }
+            params["captureBeyondViewport"] = True
+    result = await client.send("Page.captureScreenshot", params)
+    raw_b64: str = result.get("data", "")
+    if not raw_b64:
+        return ToolResult(
+            success=False,
+            content="",
+            error="Page.captureScreenshot returned empty data",
+            metrics=ActionMetrics(
+                action="screenshot",
+                duration_ms=int((time.monotonic() - start) * 1000),
+                dom_nodes_scanned=0,
+                estimated_tokens=0,
+            ),
+        )
+    img_bytes = base64.b64decode(raw_b64)
+    target = _resolve_path(path, cfg)
+    target.write_bytes(img_bytes)
+    # Default (viewport) capture leaves width/height at 0 above. Try to fill
+    # them in from the visual viewport so callers can render a meaningful
+    # size badge without a follow-up CDP call. Failures are non-fatal.
+    if width == 0 and height == 0:
+        try:
+            metrics_resp = await client.send("Page.getLayoutMetrics", {})
+            vp = (
+                metrics_resp.get("cssVisualViewport")
+                or metrics_resp.get("visualViewport")
+                or {}
+            )
+            width = int(vp.get("clientWidth") or vp.get("width") or 0)
+            height = int(vp.get("clientHeight") or vp.get("height") or 0)
+        except Exception:
+            pass
+    ctx = await _page_context(client)
+    metadata: dict[str, object] = {
+        "path": str(target),
+        "url": ctx["url"],
+        "title": ctx["title"],
+        "full_page": bool(full_page and element_ref is None),
+        "element_ref": element_ref,
+        "width": width,
+        "height": height,
+        "size_kb": len(img_bytes) // 1024,
+    }
+    return ToolResult(
+        success=True,
+        content=str(target),
+        metrics=ActionMetrics(
+            action="screenshot",
+            duration_ms=int((time.monotonic() - start) * 1000),
+            dom_nodes_scanned=0,
+            estimated_tokens=0,
+            screenshot_size_kb=len(img_bytes) // 1024,
+        ),
+        metadata=metadata,
+    )

harness_browser/actions/interact.py ADDED Viewed

@@ -0,0 +1,163 @@
+"""Interaction actions: click, type_text, scroll, hover."""
+from __future__ import annotations
+import time
+from harness_browser.cdp.client import CDPClient, CDPSessionError
+from harness_browser.dom.refs import RefCache
+from harness_browser.models import ActionMetrics, ToolResult
+def _metrics(action: str, start: float) -> ActionMetrics:
+    return ActionMetrics(
+        action=action,
+        duration_ms=int((time.monotonic() - start) * 1000),
+        dom_nodes_scanned=1,
+        estimated_tokens=10,
+    )
+async def _get_center(client: CDPClient, node_id: int) -> tuple[float, float]:
+    """Get the center coordinates of a DOM node."""
+    box = await client.send("DOM.getBoxModel", {"nodeId": node_id})
+    model = box.get("model", {})
+    content = model.get("content", [0, 0, 0, 0, 0, 0, 0, 0])
+    # content is [x0,y0, x1,y1, x2,y2, x3,y3] (clockwise from top-left)
+    cx = (content[0] + content[4]) / 2
+    cy = (content[1] + content[5]) / 2
+    return cx, cy
+async def _resolve_coords(
+    client: CDPClient,
+    ref_cache: RefCache,
+    ref: str | None,
+    selector: str | None,
+    x: int | None,
+    y: int | None,
+) -> tuple[float, float, str]:
+    """Resolve click target to (x, y, description)."""
+    if ref is not None:
+        node_id = ref_cache.lookup(ref)
+        if node_id is None:
+            raise CDPSessionError(f"Ref '{ref}' not found. Call dom_tree() first.")
+        cx, cy = await _get_center(client, node_id)
+        return cx, cy, f"ref={ref}"
+    if selector is not None:
+        doc = await client.send("DOM.getDocument", {"depth": 0})
+        root_id = doc["root"]["nodeId"]
+        result = await client.send(
+            "DOM.querySelector", {"nodeId": root_id, "selector": selector}
+        )
+        node_id = result.get("nodeId", 0)
+        if not node_id:
+            raise CDPSessionError(f"Selector '{selector}' matched no elements.")
+        cx, cy = await _get_center(client, node_id)
+        return cx, cy, f"selector={selector}"
+    if x is not None and y is not None:
+        return float(x), float(y), f"({x}, {y})"
+    raise ValueError("Must provide ref, selector, or (x, y)")
+async def click(
+    client: CDPClient,
+    ref_cache: RefCache,
+    ref: str | None = None,
+    selector: str | None = None,
+    x: int | None = None,
+    y: int | None = None,
+) -> ToolResult:
+    """Click an element identified by ref, selector, or coordinates."""
+    start = time.monotonic()
+    cx, cy, desc = await _resolve_coords(client, ref_cache, ref, selector, x, y)
+    for event_type in ("mousePressed", "mouseReleased"):
+        await client.send(
+            "Input.dispatchMouseEvent",
+            {
+                "type": event_type,
+                "x": cx,
+                "y": cy,
+                "button": "left",
+                "clickCount": 1,
+            },
+        )
+    return ToolResult(
+        success=True,
+        content=f"Clicked {desc} at ({cx:.0f}, {cy:.0f})",
+        metrics=_metrics("click", start),
+    )
+async def type_text(
+    client: CDPClient,
+    ref_cache: RefCache,
+    text: str,
+    ref: str | None = None,
+) -> ToolResult:
+    """Type text, optionally clicking a target element first."""
+    start = time.monotonic()
+    if ref is not None:
+        await click(client, ref_cache, ref=ref)
+    for char in text:
+        await client.send("Input.dispatchKeyEvent", {"type": "char", "text": char})
+    return ToolResult(
+        success=True,
+        content=f"Typed {len(text)} character(s)",
+        metrics=_metrics("type", start),
+    )
+async def scroll(
+    client: CDPClient,
+    ref_cache: RefCache,
+    direction: str = "down",
+    amount: int = 300,
+) -> ToolResult:
+    """Scroll the page in a direction by pixel amount."""
+    start = time.monotonic()
+    delta_y = amount if direction == "down" else -amount
+    delta_x = (
+        amount if direction == "right" else (-amount if direction == "left" else 0)
+    )
+    await client.send(
+        "Input.dispatchMouseEvent",
+        {
+            "type": "mouseWheel",
+            "x": 400,
+            "y": 300,
+            "deltaX": delta_x,
+            "deltaY": delta_y,
+        },
+    )
+    return ToolResult(
+        success=True,
+        content=f"Scrolled {direction} by {amount}px",
+        metrics=_metrics("scroll", start),
+    )
+async def hover(
+    client: CDPClient,
+    ref_cache: RefCache,
+    ref: str,
+) -> ToolResult:
+    """Move the mouse over an element."""
+    start = time.monotonic()
+    node_id = ref_cache.lookup(ref)
+    if node_id is None:
+        raise CDPSessionError(f"Ref '{ref}' not found. Call dom_tree() first.")
+    cx, cy = await _get_center(client, node_id)
+    await client.send(
+        "Input.dispatchMouseEvent",
+        {
+            "type": "mouseMoved",
+            "x": cx,
+            "y": cy,
+        },
+    )
+    return ToolResult(
+        success=True,
+        content=f"Hovered over ref={ref} at ({cx:.0f}, {cy:.0f})",
+        metrics=_metrics("hover", start),
+    )

harness_browser/actions/js_eval.py ADDED Viewed

@@ -0,0 +1,59 @@
+"""JavaScript evaluation action."""
+from __future__ import annotations
+import json
+import time
+from harness_browser.cdp.client import CDPClient
+from harness_browser.models import ActionMetrics, ToolResult
+async def eval_js(client: CDPClient, expression: str) -> ToolResult:
+    """
+    Execute a JavaScript expression in the page context.
+    Returns the serialized result as a JSON string in content.
+    """
+    start = time.monotonic()
+    result = await client.send(
+        "Runtime.evaluate",
+        {
+            "expression": expression,
+            "returnByValue": True,
+            "awaitPromise": True,
+        },
+    )
+    value = result.get("result", {})
+    if value.get("subtype") == "error":
+        description = value.get("description", "Unknown JS error")
+        return ToolResult(
+            success=False,
+            content="",
+            error=description,
+            metrics=ActionMetrics(
+                action="eval_js",
+                duration_ms=int((time.monotonic() - start) * 1000),
+                dom_nodes_scanned=0,
+                estimated_tokens=0,
+            ),
+        )
+    raw = value.get("value")
+    content: str | dict[str, object]
+    if isinstance(raw, dict):
+        content = raw
+    elif isinstance(raw, list):
+        content = json.dumps(raw, ensure_ascii=False)
+    else:
+        content = json.dumps(raw, ensure_ascii=False) if raw is not None else "null"
+    tokens = len(str(content)) // 4
+    return ToolResult(
+        success=True,
+        content=content,
+        metrics=ActionMetrics(
+            action="eval_js",
+            duration_ms=int((time.monotonic() - start) * 1000),
+            dom_nodes_scanned=0,
+            estimated_tokens=tokens,
+        ),
+    )

harness_browser/actions/navigate.py ADDED Viewed

@@ -0,0 +1,76 @@
+"""Navigation actions: navigate, go_back, go_forward, reload."""
+from __future__ import annotations
+import asyncio
+import time
+from typing import Any
+from harness_browser.cdp.client import CDPClient
+from harness_browser.dom.refs import RefCache
+from harness_browser.models import ActionMetrics, ToolResult
+def _metrics(action: str, start: float) -> ActionMetrics:
+    return ActionMetrics(
+        action=action,
+        duration_ms=int((time.monotonic() - start) * 1000),
+        dom_nodes_scanned=0,
+        estimated_tokens=20,
+    )
+async def navigate(client: CDPClient, ref_cache: RefCache, url: str) -> ToolResult:
+    """Navigate the page to a URL and wait for load."""
+    start = time.monotonic()
+    ref_cache.invalidate()
+    result = await client.send("Page.navigate", {"url": url})
+    load_event: asyncio.Future[None] = asyncio.get_event_loop().create_future()
+    def on_load(_params: dict[str, Any]) -> None:
+        if not load_event.done():
+            load_event.set_result(None)
+    client.on("Page.loadEventFired", on_load)
+    try:
+        await asyncio.wait_for(load_event, timeout=30.0)
+    except asyncio.TimeoutError:
+        pass
+    finally:
+        client.off("Page.loadEventFired", on_load)
+    _ = result.get("frameId", url)
+    content = f"Navigated to {url}"
+    return ToolResult(
+        success=True, content=content, metrics=_metrics("navigate", start)
+    )
+async def go_back(client: CDPClient, ref_cache: RefCache) -> ToolResult:
+    """Navigate back in browser history."""
+    start = time.monotonic()
+    ref_cache.invalidate()
+    await client.send("Runtime.evaluate", {"expression": "history.back()"})
+    return ToolResult(
+        success=True, content="Navigated back", metrics=_metrics("go_back", start)
+    )
+async def go_forward(client: CDPClient, ref_cache: RefCache) -> ToolResult:
+    """Navigate forward in browser history."""
+    start = time.monotonic()
+    ref_cache.invalidate()
+    await client.send("Runtime.evaluate", {"expression": "history.forward()"})
+    return ToolResult(
+        success=True, content="Navigated forward", metrics=_metrics("go_forward", start)
+    )
+async def reload(client: CDPClient, ref_cache: RefCache) -> ToolResult:
+    """Reload the current page."""
+    start = time.monotonic()
+    ref_cache.invalidate()
+    await client.send("Page.reload", {"ignoreCache": False})
+    return ToolResult(
+        success=True, content="Page reloaded", metrics=_metrics("reload", start)
+    )

harness_browser/cdp/__init__.py ADDED Viewed

File without changes

harness_browser/cdp/client.py ADDED Viewed

@@ -0,0 +1,135 @@
+"""Pure asyncio Chrome DevTools Protocol WebSocket client."""
+from __future__ import annotations
+import asyncio
+import json
+import logging
+from collections.abc import Callable
+from typing import Any
+import websockets
+from websockets.asyncio.client import ClientConnection
+from harness_browser.settings import settings as _settings
+logger = logging.getLogger(__name__)
+class CDPSessionError(Exception):
+    """Raised when a CDP command fails or the session is invalid."""
+class CDPClient:
+    """
+    Manages a single CDP WebSocket connection to one browser page.
+    Usage::
+        client = CDPClient()
+        await client.connect("ws://localhost:9222/devtools/page/ABC")
+        result = await client.send("Page.navigate", {"url": "https://example.com"})
+        await client.close()
+    """
+    def __init__(self, timeout: float | None = None) -> None:
+        self._timeout = timeout if timeout is not None else _settings.cdp_timeout
+        self._ws: ClientConnection | None = None
+        self._pending: dict[int, asyncio.Future[dict[str, Any]]] = {}
+        self._listeners: dict[str, list[Callable[..., Any]]] = {}
+        self._recv_task: asyncio.Task[None] | None = None
+        self._id = 0
+    async def connect(self, ws_url: str) -> None:
+        """Connect to a CDP WebSocket endpoint.
+        ``websockets`` defaults to a 1 MiB receive frame cap; CDP responses
+        for ``Page.captureScreenshot`` routinely exceed that on real-world
+        pages (a base64 PNG of a 1440×900 viewport easily lands at 1.3-2 MiB).
+        Hitting the cap closes the socket with code 1009 and the agent sees
+        a confusing ``message too big`` error mid-action. Honor
+        ``settings.cdp_max_message_size`` so screenshots up to that size
+        come through cleanly.
+        """
+        ws = await websockets.connect(
+            ws_url,
+            max_size=_settings.cdp_max_message_size,
+        )
+        self._ws = ws
+        self._recv_task = asyncio.create_task(self._recv_loop())
+        logger.debug("CDP connected to %s", ws_url)
+    async def send(
+        self, method: str, params: dict[str, Any] | None = None
+    ) -> dict[str, Any]:
+        """Send a CDP command and await its response."""
+        if self._ws is None:
+            raise CDPSessionError("Not connected")
+        self._id += 1
+        msg_id = self._id
+        payload = json.dumps({"id": msg_id, "method": method, "params": params or {}})
+        loop = asyncio.get_event_loop()
+        future: asyncio.Future[dict[str, Any]] = loop.create_future()
+        self._pending[msg_id] = future
+        await self._ws.send(payload)
+        try:
+            result = await asyncio.wait_for(future, timeout=self._timeout)
+        except asyncio.TimeoutError as exc:
+            self._pending.pop(msg_id, None)
+            raise CDPSessionError(f"Timeout waiting for response to {method}") from exc
+        if "error" in result:
+            raise CDPSessionError(f"CDP error for {method}: {result['error']}")
+        inner: dict[str, Any] = result.get("result", {})
+        return inner
+    async def enable_domain(self, domain: str) -> None:
+        """Enable a CDP domain (e.g. 'DOM', 'Page', 'Input')."""
+        await self.send(f"{domain}.enable")
+    def on(self, event: str, callback: Callable[..., Any]) -> None:
+        """Register a listener for a CDP event."""
+        self._listeners.setdefault(event, []).append(callback)
+    def off(self, event: str, callback: Callable[..., Any]) -> None:
+        """Unregister a listener."""
+        listeners = self._listeners.get(event, [])
+        if callback in listeners:
+            listeners.remove(callback)
+    async def close(self) -> None:
+        """Close the WebSocket connection."""
+        if self._recv_task:
+            self._recv_task.cancel()
+            try:
+                await self._recv_task
+            except asyncio.CancelledError:
+                pass
+        if self._ws:
+            await self._ws.close()
+            self._ws = None
+        logger.debug("CDP connection closed")
+    async def _recv_loop(self) -> None:
+        """Background task: read messages and dispatch to pending futures or
+        listeners."""
+        assert self._ws is not None
+        try:
+            async for raw in self._ws:
+                msg: dict[str, Any] = json.loads(raw)
+                if "id" in msg:
+                    future = self._pending.pop(msg["id"], None)
+                    if future and not future.done():
+                        future.set_result(msg)
+                elif "method" in msg:
+                    method: str = msg["method"]
+                    params: dict[str, Any] = msg.get("params", {})
+                    for cb in self._listeners.get(method, []):
+                        result = cb(params)
+                        if asyncio.iscoroutine(result):
+                            asyncio.create_task(result)
+        except Exception as exc:  # noqa: BLE001
+            logger.debug("CDP recv loop ended: %s", exc)
+            for future in self._pending.values():
+                if not future.done():
+                    future.set_exception(CDPSessionError(f"Connection lost: {exc}"))
+            self._pending.clear()