PyPI - resurf - Versions diffs - 0.1.0__tar.gz - Mend

resurf 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

resurf-0.1.0/.gitignore +76 -0
resurf-0.1.0/PKG-INFO +34 -0
resurf-0.1.0/README.md +3 -0
resurf-0.1.0/pyproject.toml +47 -0
resurf-0.1.0/resurf/__init__.py +24 -0
resurf-0.1.0/resurf/adapters/__init__.py +4 -0
resurf-0.1.0/resurf/adapters/base.py +51 -0
resurf-0.1.0/resurf/adapters/browser_use.py +208 -0
resurf-0.1.0/resurf/adapters/stagehand.py +137 -0
resurf-0.1.0/resurf/adapters/vision_baseline.py +171 -0
resurf-0.1.0/resurf/cli.py +320 -0
resurf-0.1.0/resurf/env.py +175 -0
resurf-0.1.0/resurf/runner.py +88 -0
resurf-0.1.0/resurf/runtime.py +33 -0
resurf-0.1.0/resurf/schemas/task.schema.json +117 -0
resurf-0.1.0/resurf/task.py +236 -0
resurf-0.1.0/resurf/templating.py +83 -0
resurf-0.1.0/resurf/trajectory.py +92 -0
resurf-0.1.0/tests/test_task.py +76 -0

resurf-0.1.0/.gitignore ADDED Viewed

@@ -0,0 +1,76 @@
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+*.egg-info/
+*.egg
+.eggs/
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+# Python setuptools build dirs — anchored so they don't sweep up
+# legitimate nested "lib/" dirs (e.g. frontend/src/lib).
+/lib/
+/lib64/
+parts/
+sdist/
+var/
+wheels/
+.pytest_cache/
+.mypy_cache/
+.ruff_cache/
+.coverage
+htmlcov/
+.tox/
+.hypothesis/
+# Virtual environments
+.venv/
+venv/
+env/
+ENV/
+# Node
+node_modules/
+.pnpm-store/
+npm-debug.log*
+yarn-debug.log*
+yarn-error.log*
+pnpm-debug.log*
+# Build outputs
+dist/
+build/
+*.tsbuildinfo
+.parcel-cache/
+.next/
+.vite/
+.turbo/
+# Editors
+.vscode/
+.idea/
+*.swp
+*.swo
+.DS_Store
+# Project-specific
+trajectories/
+*.sqlite
+*.sqlite-shm
+*.sqlite-wal
+sites/shop_v1/seed/snapshots/*.sqlite
+sites/shop_v1/backend/data/
+# Env
+.env
+.env.local
+.env.*.local
+# Playwright
+playwright-report/
+test-results/

resurf-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,34 @@
+Metadata-Version: 2.4
+Name: resurf
+Version: 0.1.0
+Summary: A deterministic, reproducible test environment for AI browser agents
+Author: The resurf contributors
+License: Apache-2.0
+Keywords: agents,ai,browser,evaluation,playwright,testing
+Classifier: License :: OSI Approved :: Apache Software License
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Software Development :: Testing
+Requires-Python: >=3.11
+Requires-Dist: click>=8.1
+Requires-Dist: httpx>=0.27
+Requires-Dist: jinja2>=3.1
+Requires-Dist: jsonschema>=4.21
+Requires-Dist: playwright>=1.43
+Requires-Dist: pyyaml>=6.0
+Requires-Dist: resurf-models==0.1.0
+Requires-Dist: rich>=13.7
+Provides-Extra: browser-use
+Requires-Dist: browser-use<0.13,>=0.12; extra == 'browser-use'
+Provides-Extra: dev
+Requires-Dist: pytest-asyncio>=0.23; extra == 'dev'
+Requires-Dist: pytest>=8.0; extra == 'dev'
+Provides-Extra: stagehand
+Provides-Extra: vision
+Requires-Dist: openai>=1.30; extra == 'vision'
+Requires-Dist: pillow>=10.0; extra == 'vision'
+Description-Content-Type: text/markdown
+# resurf (Python SDK)
+The Python SDK for resurf. See the [repo root README](../../README.md) for the project overview and quickstart.

resurf-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,3 @@
+# resurf (Python SDK)
+The Python SDK for resurf. See the [repo root README](../../README.md) for the project overview and quickstart.

resurf-0.1.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,47 @@
+# SPDX-License-Identifier: Apache-2.0
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+[project]
+# PyPI distribution name. It is the same as the Python *import* name,
+# `resurf` (see `resurf/__init__.py`); the wheel target below packages that
+# same `resurf` directory.
+name = "resurf"
+version = "0.1.0"
+description = "A deterministic, reproducible test environment for AI browser agents"
+readme = "README.md"
+requires-python = ">=3.11"
+license = { text = "Apache-2.0" }
+authors = [{ name = "The resurf contributors" }]
+keywords = ["browser", "agents", "evaluation", "testing", "playwright", "ai"]
+classifiers = [
+    "License :: OSI Approved :: Apache Software License",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Topic :: Software Development :: Testing",
+]
+dependencies = [
+    # Pinned to the same version we ship in lockstep — see RELEASING.md.
+    "resurf-models==0.1.0",
+    "httpx>=0.27",
+    "playwright>=1.43",
+    "jsonschema>=4.21",
+    "jinja2>=3.1",
+    "pyyaml>=6.0",
+    "click>=8.1",
+    "rich>=13.7",
+]
+[project.optional-dependencies]
+dev = ["pytest>=8.0", "pytest-asyncio>=0.23"]
+"browser-use" = ["browser-use>=0.12,<0.13"]
+stagehand = []  # Stagehand is invoked via Node subprocess; only requires Node 20+ at runtime
+vision = ["openai>=1.30", "pillow>=10.0"]
+[project.scripts]
+resurf = "resurf.cli:main"
+[tool.hatch.build.targets.wheel]
+packages = ["resurf"]
+include = ["resurf/schemas/*.json"]

resurf-0.1.0/resurf/__init__.py ADDED Viewed

@@ -0,0 +1,24 @@
+# SPDX-License-Identifier: Apache-2.0
+"""resurf SDK."""
+from .adapters.base import Action, Adapter, AdapterResult
+from .env import Environment
+from .runner import Runner, RunResult
+from .task import EvalResult, Task, TaskGenerator
+from .trajectory import Step, Trajectory
+__version__ = "0.1.0"
+__all__ = [
+    "Action",
+    "Adapter",
+    "AdapterResult",
+    "Environment",
+    "EvalResult",
+    "RunResult",
+    "Runner",
+    "Step",
+    "Task",
+    "TaskGenerator",
+    "Trajectory",
+]

resurf-0.1.0/resurf/adapters/__init__.py ADDED Viewed

@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
+from .base import Action, Adapter, AdapterResult
+__all__ = ["Action", "Adapter", "AdapterResult"]

resurf-0.1.0/resurf/adapters/base.py ADDED Viewed

@@ -0,0 +1,51 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Adapter ABC: the interface every framework adapter implements."""
+from __future__ import annotations
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING, Any
+if TYPE_CHECKING:
+    from playwright.async_api import BrowserContext
+    from ..env import Environment
+    from ..task import Task
+    from ..trajectory import Trajectory
@dataclass
class Action:
    """One agent action, described in a framework-neutral shape."""

    # Short tag naming the kind of action:
    # nav | click | type | scroll | screenshot | extract | other
    type: str
    # Action-specific payload. Built by a factory so that every instance
    # gets its own dict rather than sharing one default.
    detail: dict[str, Any] = field(default_factory=dict)
@dataclass
class AdapterResult:
    """Summary an adapter returns once its run has finished.

    All fields default to "nothing happened" values, so an adapter only
    needs to fill in what it can actually measure.
    """

    # Number of actions the adapter reports having taken.
    actions_taken: int = 0
    # Token counts for the run (input side, then output side).
    tokens_in: int = 0
    tokens_out: int = 0
    # Free-form metadata; a fresh dict is created for each instance.
    notes: dict[str, Any] = field(default_factory=dict)
class Adapter(ABC):
    """Interface that every framework adapter implements.

    An adapter carries out a task in a browser and appends step records to
    the Trajectory it is given. It may drive the supplied Playwright
    BrowserContext or launch a browser of its own. The AdapterResult it
    returns carries token counts and metadata so the Runner can compute
    consistent metrics across frameworks.
    """

    # Identifier for the adapter; concrete adapters override it.
    name: str = "base"

    @abstractmethod
    async def run(
        self,
        *,
        task: Task,
        env: Environment,
        context: BrowserContext,
        trajectory: Trajectory,
    ) -> AdapterResult:
        """Execute ``task`` and return a summary of the run.

        All arguments are keyword-only.

        Args:
            task: The task to carry out.
            env: The environment the task runs against.
            context: The Playwright browser context made available to the
                adapter.
            trajectory: Receives the step records the adapter emits.

        Returns:
            An AdapterResult describing the run.
        """

resurf-0.1.0/resurf/adapters/browser_use.py ADDED Viewed

@@ -0,0 +1,208 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Browser Use adapter (v0.12+).
+Hands the agent the task goal as a natural-language prompt, lets browser-use
+own its browser, and converts its history into our Trajectory shape.
+Install with: pip install 'resurf[browser-use]'
+Compatibility notes
+-------------------
+- Targets browser-use >= 0.12. The 0.12 line is a hard break from 0.1: it
+  bundles its own LLM clients (no more langchain-openai), replaced
+  ``BrowserConfig`` with ``BrowserProfile``, and ``Browser`` is now an alias
+  for ``BrowserSession``.
+- browser-use launches its own browser via CDP. We do NOT hand it the
+  Environment's Playwright context (the two are separate). For tasks that
+  require pre-auth, we surface the seeded credentials in the task prompt
+  and let the agent sign in itself. A future revision can pass a
+  ``cdp_url`` to share a launched Chromium between resurf and browser-use.
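+- Token accounting: in browser-use 0.12 the run-level usage totals live on
+  ``AgentHistoryList.usage`` (the per-step history items only carry timing
+  metadata). We read those totals and, when they are zero, fall back to the
+  agent's ``token_cost_service`` on a best-effort basis.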
+"""
+from __future__ import annotations
+import contextlib
+import os
+import time
+from dataclasses import dataclass
+from .base import Adapter, AdapterResult
@dataclass
class BrowserUseAdapter(Adapter):
    """Run a task with a browser-use (>= 0.12) ``Agent`` and record its history.

    browser-use launches and owns its own browser, so the ``context`` passed
    to :meth:`run` is not used. The task goal (plus the task's seeded
    credentials, when it has any) reaches the agent as a natural-language
    prompt.

    Environment variables read by :meth:`run`:

    - ``OPENAI_API_KEY``: required; a missing or empty value raises.
    - ``REVAR_BROWSER_USE_NO_PRENAV``: ``1`` / ``true`` / ``yes`` disables
      pre-navigation, so the agent performs the first navigation itself
      (useful when you want to *see* the navigation, e.g. during a demo).
    - ``REVAR_DEBUG``: any non-empty value enables diagnostic prints.
    """

    name: str = "browser-use"
    # Model name handed to browser-use's ChatOpenAI client.
    model: str = "gpt-4o"
    max_steps: int | None = None  # falls back to task.budget.max_steps
    # Appended to the prompt as its own paragraph when non-empty.
    extra_instructions: str = ""
    headless: bool = True

    async def run(self, *, task, env, context, trajectory) -> AdapterResult:
        """Drive ``task`` with browser-use and append its steps to ``trajectory``.

        Args:
            task: Supplies ``goal``, ``viewport``, ``user_credentials`` and
                ``budget.max_steps``.
            env: Supplies ``base_url``.
            context: Unused; browser-use launches its own browser.
            trajectory: Receives one Step per browser-use history item.

        Returns:
            AdapterResult with the number of history items, the token totals
            that could be recovered, and the model name under ``notes``.

        Raises:
            RuntimeError: If browser-use is not installed or
                ``OPENAI_API_KEY`` is not set.
        """
        try:
            from browser_use import (
                Agent,
                BrowserProfile,
                BrowserSession,
                ChatOpenAI,
            )
        except ImportError as exc:  # pragma: no cover
            raise RuntimeError(
                "browser-use is not installed. "
                "Install with `pip install 'resurf[browser-use]'` "
                "(requires browser-use >= 0.12)."
            ) from exc
        if not os.environ.get("OPENAI_API_KEY"):
            raise RuntimeError(
                "OPENAI_API_KEY is not set. The browser-use adapter uses OpenAI by default."
            )

        # Pre-navigation: skip the LLM's first "go_to_url" step by warming
        # the session at base_url ourselves. Saves ~3-8s per task.
        prenav = os.environ.get("REVAR_BROWSER_USE_NO_PRENAV", "").lower() not in ("1", "true", "yes")

        browser_session = BrowserSession(
            browser_profile=BrowserProfile(
                headless=self.headless,
                viewport=_viewport_for(task.viewport),
            ),
        )
        # Everything after the session exists runs under try/finally so the
        # browser is shut down even when the agent run or the trajectory
        # recording raises.
        try:
            # Build the prompt only after pre-navigation has been attempted:
            # the agent must be told it is "already on the homepage" only
            # when that navigation really succeeded.
            on_homepage = prenav and await self._try_prenav(
                browser_session, env.base_url + "/"
            )
            agent = Agent(
                task=self._build_prompt(task, env, on_homepage=on_homepage),
                llm=ChatOpenAI(model=self.model, temperature=0),
                browser_session=browser_session,
            )
            max_steps = self.max_steps or task.budget.max_steps
            history = await agent.run(max_steps=max_steps)

            tokens_in, tokens_out = await self._token_totals(agent, history)
            items = getattr(history, "history", []) or []
            self._record_steps(items, trajectory)
        finally:
            await self._shutdown(browser_session)

        return AdapterResult(
            actions_taken=len(items),
            tokens_in=tokens_in,
            tokens_out=tokens_out,
            notes={"model": self.model},
        )

    def _build_prompt(self, task, env, *, on_homepage: bool) -> str:
        """Assemble the natural-language prompt handed to the agent."""
        lines = [
            f"You are interacting with a synthetic e-commerce site at {env.base_url}.",
        ]
        if on_homepage:
            lines.append(f"You are already on the homepage at {env.base_url}/.")
        else:
            lines.append(f"Begin by navigating to {env.base_url}/.")
        lines.append(f"Goal: {task.goal.strip()}")
        if task.user_credentials:
            # browser-use does not share the Environment's browser context,
            # so the agent signs in itself using the task's credentials.
            lines.append(
                f"If you need to sign in, use email '{task.user_credentials['email']}' "
                f"and password '{task.user_credentials['password']}'."
            )
        if self.extra_instructions:
            lines.append(self.extra_instructions)
        return "\n\n".join(lines)

    @staticmethod
    async def _try_prenav(browser_session, url: str) -> bool:
        """Start the session and open ``url``; return whether that succeeded."""
        try:
            await browser_session.start()
            await browser_session.navigate_to(url)
        except Exception as exc:
            # Pre-nav is best-effort; fall back to letting the agent navigate.
            if os.environ.get("REVAR_DEBUG"):
                print(f"[resurf.browser_use] prenav failed, agent will navigate: {exc!r}", flush=True)
            return False
        return True

    @staticmethod
    async def _token_totals(agent, history) -> tuple[int, int]:
        """Return ``(tokens_in, tokens_out)`` for the run, best-effort."""
        # Run-level token totals: in v0.12 usage lives on AgentHistoryList,
        # NOT on per-AgentHistory items (they only have StepMetadata for timing).
        run_usage = getattr(history, "usage", None)
        tokens_in = int(getattr(run_usage, "total_prompt_tokens", 0) or 0)
        tokens_out = int(getattr(run_usage, "total_completion_tokens", 0) or 0)
        if tokens_in == 0 and tokens_out == 0:
            # Fall back to querying the agent's token_cost_service directly
            # (which is what the agent itself uses to populate history.usage).
            try:
                tcs = getattr(agent, "token_cost_service", None)
                if tcs is not None:
                    summary = await tcs.get_usage_summary()
                    tokens_in = int(getattr(summary, "total_prompt_tokens", 0) or 0)
                    tokens_out = int(getattr(summary, "total_completion_tokens", 0) or 0)
                    if os.environ.get("REVAR_DEBUG"):
                        entries = len(getattr(tcs, "usage_history", []) or [])
                        print(
                            f"[resurf.browser_use] token_cost_service fallback: "
                            f"entries={entries} prompt={tokens_in} completion={tokens_out}",
                            flush=True,
                        )
            except Exception as exc:
                if os.environ.get("REVAR_DEBUG"):
                    print(f"[resurf.browser_use] token fallback failed: {exc!r}", flush=True)
        if os.environ.get("REVAR_DEBUG"):
            print(
                f"[resurf.browser_use] history.usage={run_usage!r} "
                f"final tokens_in={tokens_in} tokens_out={tokens_out}",
                flush=True,
            )
        return tokens_in, tokens_out

    @staticmethod
    def _record_steps(items, trajectory) -> None:
        """Append one Step to ``trajectory`` for each browser-use history item."""
        from resurf.trajectory import Step

        for i, item in enumerate(items):
            url = None
            with contextlib.suppress(Exception):
                url = item.state.url  # type: ignore[attr-defined]

            # ``model_output.action`` is a list[ActionModel]; pick the first
            # so we have *something* to record. Each ActionModel is a
            # discriminated union of the agent's tool calls
            # (click_element_by_index, input_text, navigate, scroll, done, ...).
            action_obj = None
            try:
                actions = item.model_output.action if item.model_output else []  # type: ignore[attr-defined]
                if actions:
                    action_obj = actions[0]
            except Exception:
                # History items without a readable model_output are still
                # recorded, just with the generic action type.
                pass

            action_type = "browser_use_action"
            if action_obj is not None:
                # ActionModel only ever has one field set (the chosen action's name)
                try:
                    dumped = action_obj.model_dump(exclude_none=True)
                    action_type = next(iter(dumped.keys()), "browser_use_action")
                except Exception:
                    action_type = type(action_obj).__name__

            trajectory.append(
                Step(
                    index=i,
                    timestamp=time.time(),
                    action_type=action_type,
                    action={"raw": str(action_obj)[:500]} if action_obj else {},
                    url=url,
                )
            )

    @staticmethod
    async def _shutdown(browser_session) -> None:
        """Shut the browser session down without ever raising."""
        try:
            await browser_session.kill()  # 0.12 API; falls back to close() below
        except AttributeError:  # pragma: no cover
            with contextlib.suppress(Exception):
                await browser_session.close()
        except Exception as exc:
            # Shutdown stays best-effort, but leave a trace when debugging.
            if os.environ.get("REVAR_DEBUG"):
                print(f"[resurf.browser_use] session shutdown failed: {exc!r}", flush=True)
+def _viewport_for(name: str | None) -> dict[str, int]:
+    """Map our task viewport names to browser-use ViewportSize dicts."""
+    presets = {
+        "desktop": {"width": 1280, "height": 800},
+        "mobile_iphone15": {"width": 390, "height": 844},
+        "mobile_pixel7": {"width": 412, "height": 915},
+    }
+    return presets.get(name or "desktop", presets["desktop"])

resurf-0.1.0/resurf/adapters/stagehand.py ADDED Viewed

@@ -0,0 +1,137 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Stagehand adapter (via Node subprocess).
+Stagehand is a Node.js framework. We invoke it via a small Node bridge script
+using JSON-RPC-over-stdio. The bridge script lives at
+`adapters/stagehand/bridge.mjs` in the repo and is shipped with resurf;
+users need Node 20+ on PATH but no Python-side `stagehand` install.
+We do NOT take over the Playwright context here — Stagehand owns its own
+browser (it integrates tightly with Playwright internally). For tasks that
+require pre-auth, we forward the seeded session cookie via Stagehand's CDP
+init; for v0 those tasks are out of scope for this adapter and we recommend
+browser-use instead.
+"""
+from __future__ import annotations
+import asyncio
+import json
+import os
+import shutil
+import time
+from dataclasses import dataclass
+from pathlib import Path
+from .base import Adapter, AdapterResult
@dataclass
class StagehandAdapter(Adapter):
    """Run a task through Stagehand by way of a Node bridge script.

    The bridge is started as a subprocess, receives one JSON request line on
    stdin, and is expected to print a JSON object as the last line of its
    stdout. Stagehand owns its own browser, so the ``context`` passed to
    :meth:`run` is not used.
    """

    name: str = "stagehand"
    # Explicit path to the bridge script; None means "search upward from
    # this file for adapters/stagehand/bridge.mjs".
    bridge_path: str | None = None
    # Node executable to launch; it must be resolvable on PATH.
    node_bin: str = "node"
    model: str = "gpt-4o"
    headless: bool = True

    async def run(self, *, task, env, context, trajectory) -> AdapterResult:
        """Execute ``task`` via the bridge and append its steps to ``trajectory``.

        Args:
            task: Supplies ``goal``, ``viewport``, ``user_credentials`` and
                ``budget.max_steps``.
            env: Supplies ``base_url``.
            context: Unused; Stagehand owns its own browser.
            trajectory: Receives one Step per entry of the bridge's ``steps``.

        Returns:
            AdapterResult with the step count, the token totals reported by
            the bridge (0 when absent or null), and the model name under
            ``notes``.

        Raises:
            RuntimeError: If Node or the bridge script cannot be found,
                ``OPENAI_API_KEY`` is not set, the bridge exits non-zero, or
                its output cannot be parsed as a JSON object.
        """
        if shutil.which(self.node_bin) is None:
            raise RuntimeError(
                f"Node binary `{self.node_bin}` not on PATH. "
                "Install Node 20+ to use the Stagehand adapter."
            )
        bridge = Path(self.bridge_path) if self.bridge_path else _default_bridge_path()
        if not bridge.exists():
            raise RuntimeError(
                f"Stagehand bridge not found at {bridge}. "
                "It should ship in adapters/stagehand/bridge.mjs."
            )
        if not os.environ.get("OPENAI_API_KEY"):
            raise RuntimeError("OPENAI_API_KEY is not set; required by Stagehand.")

        request = {
            "goal": task.goal.strip(),
            "base_url": env.base_url,
            "model": self.model,
            "max_steps": task.budget.max_steps,
            "viewport": task.viewport,
            "headless": self.headless,
        }
        if task.user_credentials:
            request["credentials"] = task.user_credentials

        proc = await asyncio.create_subprocess_exec(
            self.node_bin,
            str(bridge),
            stdin=asyncio.subprocess.PIPE,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        try:
            stdout, stderr = await proc.communicate(
                input=(json.dumps(request) + "\n").encode()
            )
        finally:
            # If communicate() was cancelled or raised, the child is still
            # running; kill it so it does not outlive this call.
            if proc.returncode is None:
                try:
                    proc.kill()
                except ProcessLookupError:
                    pass

        from resurf.trajectory import Step

        # The bridge writes its error JSON to stdout (so we can recover a
        # structured ``error`` field) and exits non-zero. Parse stdout FIRST
        # so we surface a useful message; stderr is only supplementary.
        payload, parse_err = self._parse_payload(stdout)

        if proc.returncode != 0:
            bridge_error = payload.get("error") if payload else None
            stderr_text = stderr.decode(errors="replace").strip()
            hint = ""
            if proc.returncode == 2 and not bridge_error:
                hint = (
                    " (exit 2 typically means Stagehand isn't installed — "
                    "run `cd adapters/stagehand && npm install @browserbasehq/stagehand`)"
                )
            raise RuntimeError(
                "Stagehand bridge exited with code "
                f"{proc.returncode}.{hint}\n"
                f"  bridge_error: {bridge_error!r}\n"
                f"  stderr: {stderr_text!r}"
            )
        if payload is None:
            raise RuntimeError(
                f"Could not parse stagehand bridge output: {parse_err}\n"
                f"stdout={stdout!r}\nstderr={stderr!r}"
            )

        steps = payload.get("steps") or []
        for i, item in enumerate(steps):
            trajectory.append(
                Step(
                    index=i,
                    timestamp=time.time(),
                    action_type=item.get("type", "stagehand_action"),
                    action=item.get("action", {}),
                    url=item.get("url"),
                    # ``or 0`` also covers an explicit JSON null.
                    tokens_in=item.get("tokens_in") or 0,
                    tokens_out=item.get("tokens_out") or 0,
                )
            )
        return AdapterResult(
            actions_taken=len(steps),
            tokens_in=int(payload.get("tokens_in") or 0),
            tokens_out=int(payload.get("tokens_out") or 0),
            notes={"model": self.model},
        )

    @staticmethod
    def _parse_payload(stdout: bytes) -> tuple[dict | None, Exception | None]:
        """Parse the last stdout line as a JSON object.

        Returns ``(payload, None)`` on success. Returns ``(None, error)``
        when the line cannot be decoded or parsed, or is valid JSON but not
        an object. Returns ``(None, None)`` when stdout is empty.
        """
        if not stdout:
            return None, None
        try:
            parsed = json.loads(stdout.decode().splitlines()[-1])
        except Exception as exc:
            return None, exc
        if not isinstance(parsed, dict):
            # A bare string/number/list cannot be queried with ``.get``;
            # report it as unparseable instead of crashing later.
            return None, TypeError(
                f"expected a JSON object, got {type(parsed).__name__}"
            )
        return parsed, None
+def _default_bridge_path() -> Path:
+    """Walk upward from this file looking for adapters/stagehand/bridge.mjs."""
+    here = Path(__file__).resolve()
+    for parent in [here.parent, *here.parents]:
+        candidate = parent / "adapters" / "stagehand" / "bridge.mjs"
+        if candidate.exists():
+            return candidate
+    return Path("adapters/stagehand/bridge.mjs")