resurf 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,76 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ *.egg-info/
7
+ *.egg
8
+ .eggs/
9
+ .Python
10
+ build/
11
+ develop-eggs/
12
+ dist/
13
+ downloads/
14
+ eggs/
15
+ # Python setuptools build dirs — anchored so they don't sweep up
16
+ # legitimate nested "lib/" dirs (e.g. frontend/src/lib).
17
+ /lib/
18
+ /lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ .pytest_cache/
24
+ .mypy_cache/
25
+ .ruff_cache/
26
+ .coverage
27
+ htmlcov/
28
+ .tox/
29
+ .hypothesis/
30
+
31
+ # Virtual environments
32
+ .venv/
33
+ venv/
34
+ env/
35
+ ENV/
36
+
37
+ # Node
38
+ node_modules/
39
+ .pnpm-store/
40
+ npm-debug.log*
41
+ yarn-debug.log*
42
+ yarn-error.log*
43
+ pnpm-debug.log*
44
+
45
+ # Build outputs
46
+ dist/
47
+ build/
48
+ *.tsbuildinfo
49
+ .parcel-cache/
50
+ .next/
51
+ .vite/
52
+ .turbo/
53
+
54
+ # Editors
55
+ .vscode/
56
+ .idea/
57
+ *.swp
58
+ *.swo
59
+ .DS_Store
60
+
61
+ # Project-specific
62
+ trajectories/
63
+ *.sqlite
64
+ *.sqlite-shm
65
+ *.sqlite-wal
66
+ sites/shop_v1/seed/snapshots/*.sqlite
67
+ sites/shop_v1/backend/data/
68
+
69
+ # Env
70
+ .env
71
+ .env.local
72
+ .env.*.local
73
+
74
+ # Playwright
75
+ playwright-report/
76
+ test-results/
resurf-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,34 @@
1
+ Metadata-Version: 2.4
2
+ Name: resurf
3
+ Version: 0.1.0
4
+ Summary: A deterministic, reproducible test environment for AI browser agents
5
+ Author: The resurf contributors
6
+ License: Apache-2.0
7
+ Keywords: agents,ai,browser,evaluation,playwright,testing
8
+ Classifier: License :: OSI Approved :: Apache Software License
9
+ Classifier: Programming Language :: Python :: 3.11
10
+ Classifier: Programming Language :: Python :: 3.12
11
+ Classifier: Topic :: Software Development :: Testing
12
+ Requires-Python: >=3.11
13
+ Requires-Dist: click>=8.1
14
+ Requires-Dist: httpx>=0.27
15
+ Requires-Dist: jinja2>=3.1
16
+ Requires-Dist: jsonschema>=4.21
17
+ Requires-Dist: playwright>=1.43
18
+ Requires-Dist: pyyaml>=6.0
19
+ Requires-Dist: resurf-models==0.1.0
20
+ Requires-Dist: rich>=13.7
21
+ Provides-Extra: browser-use
22
+ Requires-Dist: browser-use<0.13,>=0.12; extra == 'browser-use'
23
+ Provides-Extra: dev
24
+ Requires-Dist: pytest-asyncio>=0.23; extra == 'dev'
25
+ Requires-Dist: pytest>=8.0; extra == 'dev'
26
+ Provides-Extra: stagehand
27
+ Provides-Extra: vision
28
+ Requires-Dist: openai>=1.30; extra == 'vision'
29
+ Requires-Dist: pillow>=10.0; extra == 'vision'
30
+ Description-Content-Type: text/markdown
31
+
32
+ # resurf (Python SDK)
33
+
34
+ The Python SDK for resurf. See the [repo root README](../../README.md) for the project overview and quickstart.
resurf-0.1.0/README.md ADDED
@@ -0,0 +1,3 @@
1
+ # resurf (Python SDK)
2
+
3
+ The Python SDK for resurf. See the [repo root README](../../README.md) for the project overview and quickstart.
@@ -0,0 +1,47 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ [build-system]
3
+ requires = ["hatchling"]
4
+ build-backend = "hatchling.build"
5
+
6
+ [project]
7
+ # PyPI distribution name. It matches the Python *import* name `resurf` (see
8
+ # `resurf/__init__.py`), so `pip install resurf` and `import resurf` refer
9
+ # to the same package.
10
+ name = "resurf"
11
+ version = "0.1.0"
12
+ description = "A deterministic, reproducible test environment for AI browser agents"
13
+ readme = "README.md"
14
+ requires-python = ">=3.11"
15
+ license = { text = "Apache-2.0" }
16
+ authors = [{ name = "The resurf contributors" }]
17
+ keywords = ["browser", "agents", "evaluation", "testing", "playwright", "ai"]
18
+ classifiers = [
19
+ "License :: OSI Approved :: Apache Software License",
20
+ "Programming Language :: Python :: 3.11",
21
+ "Programming Language :: Python :: 3.12",
22
+ "Topic :: Software Development :: Testing",
23
+ ]
24
+ dependencies = [
25
+ # Pinned to the same version we ship in lockstep — see RELEASING.md.
26
+ "resurf-models==0.1.0",
27
+ "httpx>=0.27",
28
+ "playwright>=1.43",
29
+ "jsonschema>=4.21",
30
+ "jinja2>=3.1",
31
+ "pyyaml>=6.0",
32
+ "click>=8.1",
33
+ "rich>=13.7",
34
+ ]
35
+
36
+ [project.optional-dependencies]
37
+ dev = ["pytest>=8.0", "pytest-asyncio>=0.23"]
38
+ "browser-use" = ["browser-use>=0.12,<0.13"]
39
+ stagehand = [] # Stagehand is invoked via Node subprocess; only requires Node 20+ at runtime
40
+ vision = ["openai>=1.30", "pillow>=10.0"]
41
+
42
+ [project.scripts]
43
+ resurf = "resurf.cli:main"
44
+
45
+ [tool.hatch.build.targets.wheel]
46
+ packages = ["resurf"]
47
+ include = ["resurf/schemas/*.json"]
@@ -0,0 +1,24 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ """resurf SDK."""
3
+
4
+ from .adapters.base import Action, Adapter, AdapterResult
5
+ from .env import Environment
6
+ from .runner import Runner, RunResult
7
+ from .task import EvalResult, Task, TaskGenerator
8
+ from .trajectory import Step, Trajectory
9
+
10
+ __version__ = "0.1.0"
11
+
12
+ __all__ = [
13
+ "Action",
14
+ "Adapter",
15
+ "AdapterResult",
16
+ "Environment",
17
+ "EvalResult",
18
+ "RunResult",
19
+ "Runner",
20
+ "Step",
21
+ "Task",
22
+ "TaskGenerator",
23
+ "Trajectory",
24
+ ]
@@ -0,0 +1,4 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ from .base import Action, Adapter, AdapterResult
3
+
4
+ __all__ = ["Action", "Adapter", "AdapterResult"]
@@ -0,0 +1,51 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ """Adapter ABC: the interface every framework adapter implements."""
3
+
4
+ from __future__ import annotations
5
+
6
+ from abc import ABC, abstractmethod
7
+ from dataclasses import dataclass, field
8
+ from typing import TYPE_CHECKING, Any
9
+
10
+ if TYPE_CHECKING:
11
+ from playwright.async_api import BrowserContext
12
+
13
+ from ..env import Environment
14
+ from ..task import Task
15
+ from ..trajectory import Trajectory
16
+
17
+
18
@dataclass
class Action:
    """A single agent action: a type tag plus a free-form detail mapping."""

    # Kind of action performed; the trailing comment lists the expected tags.
    type: str  # nav | click | type | scroll | screenshot | extract | other
    # Action-specific payload; each instance gets its own empty dict by default.
    detail: dict[str, Any] = field(default_factory=dict)
22
+
23
+
24
@dataclass
class AdapterResult:
    """Run-level totals an adapter returns from ``Adapter.run``.

    Every field has a zero/empty default, so an adapter only needs to fill
    in what it was able to measure.
    """

    # Number of actions the adapter performed during the run.
    actions_taken: int = 0
    # Prompt-side token total for the run.
    tokens_in: int = 0
    # Completion-side token total for the run.
    tokens_out: int = 0
    # Free-form adapter metadata; each instance gets its own empty dict.
    notes: dict[str, Any] = field(default_factory=dict)
30
+
31
+
32
class Adapter(ABC):
    """Base class for framework adapters.

    Adapters drive the Playwright BrowserContext and emit step records into
    the provided Trajectory. Returning an AdapterResult with token counts and
    metadata lets the Runner compute consistent metrics across frameworks.
    """

    # Short identifier for the adapter; concrete adapters override it.
    name: str = "base"

    @abstractmethod
    async def run(
        self,
        *,
        task: Task,
        env: Environment,
        context: BrowserContext,
        trajectory: Trajectory,
    ) -> AdapterResult:
        """Execute ``task`` and return the run-level totals.

        All arguments are keyword-only.

        Args:
            task: The task to carry out.
            env: The environment the task runs against.
            context: The Playwright browser context made available to the
                adapter.
            trajectory: Collector that the adapter appends step records to.

        Returns:
            An ``AdapterResult`` with action/token totals and adapter notes.
        """
        ...
@@ -0,0 +1,208 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ """Browser Use adapter (v0.12+).
3
+
4
+ Hands the agent the task goal as a natural-language prompt, lets browser-use
5
+ own its browser, and converts its history into our Trajectory shape.
6
+
7
+ Install with: pip install 'resurf[browser-use]'
8
+
9
+ Compatibility notes
10
+ -------------------
11
+ - Targets browser-use >= 0.12. The 0.12 line is a hard break from 0.1: it
12
+ bundles its own LLM clients (no more langchain-openai), replaced
13
+ ``BrowserConfig`` with ``BrowserProfile``, and ``Browser`` is now an alias
14
+ for ``BrowserSession``.
15
+ - browser-use launches its own browser via CDP. We do NOT hand it the
16
+ Environment's Playwright context (the two are separate). For tasks that
17
+ require pre-auth, we surface the seeded credentials in the task prompt
18
+ and let the agent sign in itself. A future revision can pass a
19
+ ``cdp_url`` to share a launched Chromium between resurf and browser-use.
20
+ - Token accounting: browser-use exposes per-step usage on
21
+ ``AgentHistoryList.history``; we sum what's available best-effort.
22
+ """
23
+
24
+ from __future__ import annotations
25
+
26
+ import contextlib
27
+ import os
28
+ import time
29
+ from dataclasses import dataclass
30
+
31
+ from .base import Adapter, AdapterResult
32
+
33
+
34
@dataclass
class BrowserUseAdapter(Adapter):
    """Adapter that runs a task with a browser-use (>= 0.12) ``Agent``.

    browser-use launches and owns its own browser session; the Playwright
    ``context`` passed to :meth:`run` is not used. After the agent finishes,
    its history is converted into ``Step`` records on the trajectory.
    """

    name: str = "browser-use"
    model: str = "gpt-4o"
    max_steps: int | None = None  # falls back to task.budget.max_steps
    extra_instructions: str = ""
    headless: bool = True

    async def run(self, *, task, env, context, trajectory) -> AdapterResult:
        """Run ``task`` with a browser-use agent and record its steps.

        Args:
            task: Provides ``goal``, ``user_credentials``, ``viewport`` and
                ``budget.max_steps``.
            env: Provides ``base_url`` of the site under test.
            context: Unused; browser-use manages its own browser.
            trajectory: Receives one ``Step`` per agent history item.

        Returns:
            ``AdapterResult`` with the number of history items, best-effort
            token totals, and the model name in ``notes``.

        Raises:
            RuntimeError: If browser-use is not installed or
                ``OPENAI_API_KEY`` is not set.
        """
        try:
            from browser_use import (
                Agent,
                BrowserProfile,
                BrowserSession,
                ChatOpenAI,
            )
        except ImportError as exc:  # pragma: no cover
            raise RuntimeError(
                "browser-use is not installed. "
                "Install with `pip install 'resurf[browser-use]'` "
                "(requires browser-use >= 0.12)."
            ) from exc

        if not os.environ.get("OPENAI_API_KEY"):
            raise RuntimeError(
                "OPENAI_API_KEY is not set. The browser-use adapter uses OpenAI by default."
            )

        # Pre-navigation: skip the LLM's first "go_to_url" step by warming
        # the session at base_url ourselves. Saves ~3-8s per task. Disable
        # via REVAR_BROWSER_USE_NO_PRENAV=1 if you want to *see* the
        # navigation animate (e.g., during a demo).
        want_prenav = os.environ.get("REVAR_BROWSER_USE_NO_PRENAV", "").lower() not in (
            "1",
            "true",
            "yes",
        )

        browser_session = BrowserSession(
            browser_profile=BrowserProfile(
                headless=self.headless,
                viewport=_viewport_for(task.viewport),
            ),
        )

        # Everything that can fail after the session object exists runs
        # inside try/finally so the browser is always torn down.
        try:
            on_homepage = False
            if want_prenav:
                try:
                    await browser_session.start()
                    await browser_session.navigate_to(env.base_url + "/")
                    on_homepage = True
                except Exception as exc:
                    # Pre-nav is best-effort; fall back to letting the agent navigate.
                    if os.environ.get("REVAR_DEBUG"):
                        print(
                            f"[resurf.browser_use] prenav failed, agent will navigate: {exc!r}",
                            flush=True,
                        )

            # The prompt is built only now so that it reflects whether the
            # pre-navigation actually succeeded.
            agent = Agent(
                task=self._build_prompt(task, env, on_homepage=on_homepage),
                llm=ChatOpenAI(model=self.model, temperature=0),
                browser_session=browser_session,
            )

            max_steps = self.max_steps or task.budget.max_steps
            history = await agent.run(max_steps=max_steps)

            tokens_in, tokens_out = await self._token_totals(agent, history)
            actions_taken = self._record_steps(history, trajectory)
        finally:
            await self._shutdown(browser_session)

        return AdapterResult(
            actions_taken=actions_taken,
            tokens_in=tokens_in,
            tokens_out=tokens_out,
            notes={"model": self.model},
        )

    def _build_prompt(self, task, env, *, on_homepage: bool) -> str:
        """Compose the natural-language task prompt handed to the agent."""
        prompt_lines = [
            f"You are interacting with a synthetic e-commerce site at {env.base_url}.",
        ]
        if on_homepage:
            prompt_lines.append(f"You are already on the homepage at {env.base_url}/.")
        else:
            prompt_lines.append(f"Begin by navigating to {env.base_url}/.")
        prompt_lines.append(f"Goal: {task.goal.strip()}")
        if task.user_credentials:
            prompt_lines.append(
                f"If you need to sign in, use email '{task.user_credentials['email']}' "
                f"and password '{task.user_credentials['password']}'."
            )
        if self.extra_instructions:
            prompt_lines.append(self.extra_instructions)
        return "\n\n".join(prompt_lines)

    @staticmethod
    async def _token_totals(agent, history) -> tuple[int, int]:
        """Return best-effort ``(prompt, completion)`` token totals for the run."""
        # Run-level token totals: in v0.12 usage lives on AgentHistoryList,
        # NOT on per-AgentHistory items (they only have StepMetadata for timing).
        # Try history.usage first, then fall back to querying the agent's
        # token_cost_service directly.
        run_usage = getattr(history, "usage", None)
        tokens_in = int(getattr(run_usage, "total_prompt_tokens", 0) or 0)
        tokens_out = int(getattr(run_usage, "total_completion_tokens", 0) or 0)

        if tokens_in == 0 and tokens_out == 0:
            try:
                tcs = getattr(agent, "token_cost_service", None)
                if tcs is not None:
                    summary = await tcs.get_usage_summary()
                    tokens_in = int(getattr(summary, "total_prompt_tokens", 0) or 0)
                    tokens_out = int(getattr(summary, "total_completion_tokens", 0) or 0)
                    if os.environ.get("REVAR_DEBUG"):
                        entries = len(getattr(tcs, "usage_history", []) or [])
                        print(
                            f"[resurf.browser_use] token_cost_service fallback: "
                            f"entries={entries} prompt={tokens_in} completion={tokens_out}",
                            flush=True,
                        )
            except Exception as exc:
                # Token accounting must never fail the run.
                if os.environ.get("REVAR_DEBUG"):
                    print(f"[resurf.browser_use] token fallback failed: {exc!r}", flush=True)

        if os.environ.get("REVAR_DEBUG"):
            print(
                f"[resurf.browser_use] history.usage={run_usage!r} "
                f"final tokens_in={tokens_in} tokens_out={tokens_out}",
                flush=True,
            )
        return tokens_in, tokens_out

    @staticmethod
    def _record_steps(history, trajectory) -> int:
        """Append one ``Step`` per history item; return the item count."""
        from resurf.trajectory import Step

        items = getattr(history, "history", []) or []
        for i, item in enumerate(items):
            url = None
            with contextlib.suppress(Exception):
                url = item.state.url  # type: ignore[attr-defined]

            # ``model_output.action`` is a list[ActionModel]; pick the first
            # so we have *something* to record.
            action_obj = None
            with contextlib.suppress(Exception):
                actions = item.model_output.action if item.model_output else []  # type: ignore[attr-defined]
                if actions:
                    action_obj = actions[0]

            action_type = "browser_use_action"
            if action_obj is not None:
                # ActionModel only ever has one field set (the chosen action's name)
                try:
                    dumped = action_obj.model_dump(exclude_none=True)
                    action_type = next(iter(dumped.keys()), "browser_use_action")
                except Exception:
                    action_type = type(action_obj).__name__

            trajectory.append(
                Step(
                    index=i,
                    timestamp=time.time(),
                    action_type=action_type,
                    action={"raw": str(action_obj)[:500]} if action_obj is not None else {},
                    url=url,
                )
            )
        return len(items)

    @staticmethod
    async def _shutdown(browser_session) -> None:
        """Tear down the browser session, preferring ``kill()`` over ``close()``."""
        # Pick the method up front instead of catching AttributeError around
        # the call, so an AttributeError raised *inside* kill() is not
        # mistaken for "kill does not exist".
        closer = getattr(browser_session, "kill", None)  # 0.12 API
        if closer is None:
            closer = getattr(browser_session, "close", None)
        if closer is None:
            return
        try:
            await closer()
        except Exception as exc:
            # Shutdown is best-effort; never mask the run's own outcome.
            if os.environ.get("REVAR_DEBUG"):
                print(f"[resurf.browser_use] session shutdown failed: {exc!r}", flush=True)
199
+
200
+
201
+ def _viewport_for(name: str | None) -> dict[str, int]:
202
+ """Map our task viewport names to browser-use ViewportSize dicts."""
203
+ presets = {
204
+ "desktop": {"width": 1280, "height": 800},
205
+ "mobile_iphone15": {"width": 390, "height": 844},
206
+ "mobile_pixel7": {"width": 412, "height": 915},
207
+ }
208
+ return presets.get(name or "desktop", presets["desktop"])
@@ -0,0 +1,137 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ """Stagehand adapter (via Node subprocess).
3
+
4
+ Stagehand is a Node.js framework. We invoke it via a small Node bridge script
5
+ using JSON-RPC-over-stdio. The bridge script lives at
6
+ `adapters/stagehand/bridge.mjs` in the repo and is shipped with resurf;
7
+ users need Node 20+ on PATH but no Python-side `stagehand` install.
8
+
9
+ We do NOT take over the Playwright context here — Stagehand owns its own
10
+ browser (it integrates tightly with Playwright internally). For tasks that
11
+ require pre-auth, we forward the seeded session cookie via Stagehand's CDP
12
+ init; for v0 those tasks are out of scope for this adapter and we recommend
13
+ browser-use instead.
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import asyncio
19
+ import json
20
+ import os
21
+ import shutil
22
+ import time
23
+ from dataclasses import dataclass
24
+ from pathlib import Path
25
+
26
+ from .base import Adapter, AdapterResult
27
+
28
+
29
@dataclass
class StagehandAdapter(Adapter):
    """Adapter that runs a task through the Stagehand Node bridge script.

    The request is written to the bridge's stdin as one JSON line and the
    result is read back as JSON from the last non-empty line of its stdout.
    The Playwright ``context`` passed to :meth:`run` is not used.
    """

    name: str = "stagehand"
    bridge_path: str | None = None
    node_bin: str = "node"
    model: str = "gpt-4o"
    headless: bool = True

    async def run(self, *, task, env, context, trajectory) -> AdapterResult:
        """Run ``task`` via the bridge subprocess and record its steps.

        Args:
            task: Provides ``goal``, ``budget.max_steps``, ``viewport`` and
                ``user_credentials``.
            env: Provides ``base_url`` of the site under test.
            context: Unused by this adapter.
            trajectory: Receives one ``Step`` per entry in the bridge's
                ``steps`` list.

        Returns:
            ``AdapterResult`` with the step count, the bridge-reported token
            totals, and the model name in ``notes``.

        Raises:
            RuntimeError: If Node or the bridge script is missing,
                ``OPENAI_API_KEY`` is unset, the bridge exits non-zero, or
                its output cannot be parsed as a JSON object.
        """
        if shutil.which(self.node_bin) is None:
            raise RuntimeError(
                f"Node binary `{self.node_bin}` not on PATH. "
                "Install Node 20+ to use the Stagehand adapter."
            )

        bridge = Path(self.bridge_path) if self.bridge_path else _default_bridge_path()
        if not bridge.exists():
            raise RuntimeError(
                f"Stagehand bridge not found at {bridge}. "
                "It should ship in adapters/stagehand/bridge.mjs."
            )
        if not os.environ.get("OPENAI_API_KEY"):
            raise RuntimeError("OPENAI_API_KEY is not set; required by Stagehand.")

        request = {
            "goal": task.goal.strip(),
            "base_url": env.base_url,
            "model": self.model,
            "max_steps": task.budget.max_steps,
            "viewport": task.viewport,
            "headless": self.headless,
        }
        if task.user_credentials:
            request["credentials"] = task.user_credentials

        proc = await asyncio.create_subprocess_exec(
            self.node_bin,
            str(bridge),
            stdin=asyncio.subprocess.PIPE,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        try:
            stdout, stderr = await proc.communicate(
                input=(json.dumps(request) + "\n").encode()
            )
        finally:
            # If communicate() was cancelled or raised, the child is still
            # running; kill it so no Node/browser process is left behind.
            if proc.returncode is None:
                try:
                    proc.kill()
                except ProcessLookupError:
                    pass

        # The bridge writes its error JSON to stdout (so we can recover a
        # structured ``error`` field) and exits non-zero. Try to parse stdout
        # FIRST so we surface a useful message; fall back to stderr only if
        # stdout is empty/unparseable.
        from resurf.trajectory import Step

        payload, parse_err = self._parse_payload(stdout)

        if proc.returncode != 0:
            bridge_error = payload.get("error") if payload else None
            stderr_text = stderr.decode(errors="replace").strip()
            hint = ""
            if proc.returncode == 2 and not bridge_error:
                hint = (
                    " (exit 2 typically means Stagehand isn't installed — "
                    "run `cd adapters/stagehand && npm install @browserbasehq/stagehand`)"
                )
            raise RuntimeError(
                "Stagehand bridge exited with code "
                f"{proc.returncode}.{hint}\n"
                f"  bridge_error: {bridge_error!r}\n"
                f"  stderr: {stderr_text!r}"
            )

        if payload is None:
            raise RuntimeError(
                f"Could not parse stagehand bridge output: {parse_err}\n"
                f"stdout={stdout!r}\nstderr={stderr!r}"
            )

        steps = payload.get("steps") or []
        for i, item in enumerate(steps):
            trajectory.append(
                Step(
                    index=i,
                    timestamp=time.time(),
                    action_type=item.get("type", "stagehand_action"),
                    action=item.get("action", {}),
                    url=item.get("url"),
                    tokens_in=item.get("tokens_in", 0),
                    tokens_out=item.get("tokens_out", 0),
                )
            )

        return AdapterResult(
            actions_taken=len(steps),
            # ``or 0`` tolerates a JSON ``null`` as well as a missing key.
            tokens_in=int(payload.get("tokens_in") or 0),
            tokens_out=int(payload.get("tokens_out") or 0),
            notes={"model": self.model},
        )

    @staticmethod
    def _parse_payload(stdout: bytes) -> tuple[dict | None, Exception | None]:
        """Parse the last non-empty stdout line as a JSON object.

        Returns ``(payload, None)`` on success, ``(None, error)`` when the
        output cannot be decoded/parsed or is not a JSON object, and
        ``(None, None)`` when stdout is empty.
        """
        if not stdout:
            return None, None
        try:
            # Skip blank lines so a trailing empty line does not hide the
            # payload line that precedes it.
            lines = [ln for ln in stdout.decode().splitlines() if ln.strip()]
            if not lines:
                return None, ValueError("bridge stdout contained no non-empty lines")
            parsed = json.loads(lines[-1])
        except Exception as exc:
            return None, exc
        if not isinstance(parsed, dict):
            return None, TypeError(
                f"expected a JSON object, got {type(parsed).__name__}"
            )
        return parsed, None
128
+
129
+
130
+ def _default_bridge_path() -> Path:
131
+ """Walk upward from this file looking for adapters/stagehand/bridge.mjs."""
132
+ here = Path(__file__).resolve()
133
+ for parent in [here.parent, *here.parents]:
134
+ candidate = parent / "adapters" / "stagehand" / "bridge.mjs"
135
+ if candidate.exists():
136
+ return candidate
137
+ return Path("adapters/stagehand/bridge.mjs")