miniwob-cube 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,8 @@
1
+ Metadata-Version: 2.3
2
+ Name: miniwob-cube
3
+ Version: 1.0.0
4
+ Summary: MiniWob++ benchmark for cube
5
+ Requires-Dist: cube-standard
6
+ Requires-Dist: miniwob>=1.0
7
+ Requires-Dist: cube-browser-tool
8
+ Requires-Python: >=3.12
@@ -0,0 +1,25 @@
1
+ [project]
2
+ name = "miniwob-cube"
3
+ version = "1.0.0"
4
+ description = "MiniWob++ benchmark for cube"
5
+ requires-python = ">=3.12"
6
+ dependencies = [
7
+ "cube-standard",
8
+ "miniwob>=1.0",
9
+ "cube-browser-tool"
10
+ ]
11
+
12
+ [project.entry-points."cube.benchmarks"]
13
+ miniwob-cube = "miniwob_cube.benchmark:MiniWobBenchmark"
14
+
15
+ [build-system]
16
+ requires = ["uv_build>=0.8,<0.9"]
17
+ build-backend = "uv_build"
18
+
19
+ [tool.uv-build]
20
+ include = ["src/miniwob_cube/task_metadata.json"]
21
+
22
+ [tool.ruff]
23
+ fix = true
24
+ line-length = 120
25
+ indent-width = 4
@@ -0,0 +1,12 @@
1
+ from miniwob_cube.benchmark import MiniWobBenchmark
2
+ from miniwob_cube.debug import get_debug_benchmark, make_debug_agent
3
+ from miniwob_cube.task import MiniWobTask, MiniWobTaskConfig, MiniWobTaskMetadata
4
+
5
+ __all__ = [
6
+ "MiniWobBenchmark",
7
+ "MiniWobTask",
8
+ "MiniWobTaskMetadata",
9
+ "MiniWobTaskConfig",
10
+ "get_debug_benchmark",
11
+ "make_debug_agent",
12
+ ]
@@ -0,0 +1,113 @@
1
+ import logging
2
+ import subprocess
3
+ import sys
4
+ import tempfile
5
+ import time
6
+ import urllib.request
7
+ from importlib.resources import files
8
+ from pathlib import Path
9
+ from typing import ClassVar, Generator
10
+
11
+ from cube.benchmark import Benchmark, BenchmarkMetadata
12
+ from cube.task import TaskConfig
13
+
14
+ from miniwob_cube.task import MiniWobTaskConfig, MiniWobTaskMetadata
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
class MiniWobBenchmark(Benchmark):
    """MiniWob++ benchmark whose task pages are served by a local HTTP server.

    ``_setup`` launches ``python -m http.server`` with ``html_path`` as working
    directory and polls ``base_url`` until it answers. ``close`` stops the
    server process and closes the two log files opened by ``_setup``.
    """

    benchmark_metadata: ClassVar[BenchmarkMetadata] = BenchmarkMetadata(
        name="miniwob-cube",
        version="1.0.0",
        description="MiniWob++ browser automation benchmark tasks",
        num_tasks=125,
        tags=["browser", "web", "ui"],
    )
    task_metadata: ClassVar[dict[str, MiniWobTaskMetadata]]  # type: ignore - will be populated automatically at import time in Benchmark.__init_subclass__
    task_config_class: ClassVar[type[TaskConfig]] = MiniWobTaskConfig

    # Directory used as the HTTP server's working directory.
    html_path: str = files("miniwob").joinpath("html").as_posix()  # type: ignore
    port: int = 8000
    remove_human_display: bool = True
    episode_max_time: int = 1000000
    # Seconds to wait for the server to answer, and the pause between probes.
    server_start_timeout: float = 10.0
    server_start_poll_interval: float = 0.1

    # Runtime state (not serialized)
    _server_process: subprocess.Popen | None = None
    _stdout_file: object | None = None
    _stderr_file: object | None = None

    model_config = {"arbitrary_types_allowed": True}

    @property
    def base_url(self) -> str:
        """URL prefix under which the task pages are requested."""
        return f"http://localhost:{self.port}/miniwob"

    def _setup(self) -> None:
        """Start the HTTP server and block until it responds at ``base_url``.

        Server stdout/stderr are redirected to ``miniwob_server_stdout.log`` and
        ``miniwob_server_stderr.log`` in the system temporary directory.

        Raises:
            RuntimeError: If the server process exits during startup, or does
                not respond within ``server_start_timeout`` seconds.
        """
        tmp_dir = Path(tempfile.gettempdir())
        stderr_path = tmp_dir / "miniwob_server_stderr.log"
        logger.info(f"Starting MiniWob server at port {self.port} serving from {self.html_path}...")
        try:
            self._stdout_file = open(tmp_dir / "miniwob_server_stdout.log", "w")
            self._stderr_file = open(stderr_path, "w")
            self._server_process = subprocess.Popen(
                [sys.executable, "-m", "http.server", str(self.port)],
                cwd=self.html_path,
                stdout=self._stdout_file,
                stderr=self._stderr_file,
            )
        except Exception:
            # Nothing is running yet, but whatever log file was already opened
            # must not leak; close() tolerates partially initialised state.
            self.close()
            raise

        startup_deadline = time.monotonic() + self.server_start_timeout
        last_response_error: Exception | None = None

        while time.monotonic() < startup_deadline:
            if self._server_process.poll() is not None:
                # The server died during startup: surface its stderr to the caller.
                self._stderr_file.flush()
                stderr_content = stderr_path.read_text() if stderr_path.exists() else "No stderr available"
                returncode = self._server_process.returncode
                self.close()
                raise RuntimeError(f"MiniWob server failed to start (exit code {returncode}): {stderr_content}")

            try:
                # The context manager closes the probe response instead of leaving
                # the connection to the garbage collector.
                with urllib.request.urlopen(self.base_url, timeout=1):
                    pass
                logger.info(f"MiniWob server responding at {self.base_url}")
                break
            except Exception as e:
                # Best-effort probe: any failure just means "not ready yet".
                last_response_error = e
                time.sleep(self.server_start_poll_interval)
        else:
            # Loop ended without ``break``: the deadline passed with no response.
            self.close()
            raise RuntimeError(
                f"MiniWob server failed to respond at {self.base_url} within {self.server_start_timeout:.1f}s"
            ) from last_response_error

        self._runtime_context = {"base_url": self.base_url}

    def get_task_configs(self) -> Generator[MiniWobTaskConfig, None, None]:
        """Yield one ``MiniWobTaskConfig`` per entry in ``task_metadata``."""
        for tm in self.task_metadata.values():
            yield MiniWobTaskConfig(
                task_id=tm.id,
                tool_config=self.default_tool_config,
                base_url=self.base_url,
                remove_human_display=self.remove_human_display,
                episode_max_time=self.episode_max_time,
            )

    def close(self) -> None:
        """Stop the server process (if any) and close the log files.

        Safe to call repeatedly and on partially initialised state: every
        resource is checked for ``None`` and reset to ``None`` afterwards.
        """
        if self._server_process is not None:
            logger.info("Shutting down MiniWob server...")
            self._server_process.terminate()
            try:
                self._server_process.wait(timeout=5)
            except subprocess.TimeoutExpired:
                logger.warning("Server did not terminate gracefully, killing...")
                self._server_process.kill()
                # Reap the killed child so it does not linger as a zombie.
                self._server_process.wait()
            self._server_process = None

        if self._stdout_file is not None:
            self._stdout_file.close()
            self._stdout_file = None

        if self._stderr_file is not None:
            self._stderr_file.close()
            self._stderr_file = None
@@ -0,0 +1,106 @@
1
+ """Smoke-test script for miniwob-cube — validates infrastructure and task solving.
2
+
3
+ Verifies that the MiniWob HTTP server starts, the browser connects, the task
4
+ page loads and JS initialises, and the hardcoded agents achieve reward=1.0 for
5
+ the debug tasks.
6
+
7
+ Public API (cube.testing protocol)
8
+ -----------------------------------
9
+ get_debug_benchmark() -> MiniWobBenchmark
10
+ make_debug_agent(task_id: str) -> ClickButtonAgent | ClickCheckboxesAgent
11
+
12
+ Usage:
13
+ uv run python -m miniwob_cube.debug
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import logging
19
+ import re
20
+ import sys
21
+
22
+ from cube.core import Action, ActionSchema, Observation, TextContent
23
+ from cube.testing import run_debug_suite
24
+
25
+ from cube_browser_tool import PlaywrightConfig
26
+
27
+ from miniwob_cube.benchmark import Benchmark, MiniWobBenchmark
28
+
29
+
30
+ logger = logging.getLogger(__name__)
31
+
32
+ # A small set of representative tasks that cover the JS setup / observation path.
33
+ _DEBUG_TASK_IDS = ["click-button", "click-checkboxes"]
34
+
35
+
36
class ClickButtonAgent:
    """Hardcoded agent for the ``click-button`` debug task.

    The first call clicks the button named in the goal text; every later call
    returns ``final_step``.
    """

    def __init__(self) -> None:
        self._done = False

    def _parse_button_text(self, obs: Observation) -> str:
        """Return the button label quoted in the observation's text.

        Raises:
            ValueError: If no text content contains a
                ``Click on the "<label>" button`` instruction.
        """
        for content in obs.contents:
            if isinstance(content, TextContent):
                match = re.search(r'Click on the "(.+?)" button', content.data, re.IGNORECASE)
                if match:
                    return match.group(1)
        # Fail loudly instead of silently returning None (or relying on an
        # assert that disappears under ``python -O``).
        raise ValueError('No \'Click on the "<label>" button\' instruction found in observation')

    def __call__(self, obs: Observation, action_set: list[ActionSchema]) -> Action:
        """Return the click action on the first call and ``final_step`` afterwards."""
        if not self._done:
            self._done = True
            text = self._parse_button_text(obs)
            return Action(name="browser_click", arguments={"selector": f"button:has-text('{text}')"})
        return Action(name="final_step", arguments={})
53
+
54
+
55
class ClickCheckboxesAgent:
    """Hardcoded agent for the ``click-checkboxes`` debug task.

    Emits one checkbox click per target word parsed from the goal, then a click
    on the submit button, then ``final_step`` on every later call.
    """

    def __init__(self) -> None:
        self._step = 0
        self._targets: list[str] = []

    def _parse_targets(self, obs: Observation) -> list[str]:
        """Return the checkbox labels listed in the observation's text.

        An instruction of ``Select nothing and click Submit`` yields an empty list.

        Raises:
            ValueError: If no text content contains a
                ``Select ... and click Submit`` instruction.
        """
        for content in obs.contents:
            if isinstance(content, TextContent):
                match = re.search(r"Select (.+?) and click Submit", content.data, re.IGNORECASE)
                if not match:
                    continue
                words_str = match.group(1)
                if words_str.lower() == "nothing":
                    return []
                return [w.strip() for w in words_str.split(",")]
        # Fail loudly instead of silently returning None (or relying on an
        # assert that disappears under ``python -O``).
        raise ValueError("No 'Select ... and click Submit' instruction found in observation")

    def __call__(self, obs: Observation, action_set: list[ActionSchema]) -> Action:
        """Return the next action of the click-targets / submit / finish sequence."""
        if self._step == 0:
            # Targets are parsed once, from the first observation only.
            self._targets = self._parse_targets(obs)
        idx = self._step
        self._step += 1
        if idx < len(self._targets):
            word = self._targets[idx]
            return Action(
                name="browser_click", arguments={"selector": f"label:has-text('{word}') input[type='checkbox']"}
            )
        if idx == len(self._targets):
            return Action(name="browser_click", arguments={"selector": "button#subbtn"})
        return Action(name="final_step", arguments={})
83
+
84
+
85
+ def make_debug_agent(task_id: str) -> ClickButtonAgent | ClickCheckboxesAgent:
86
+ if task_id == "click-button":
87
+ return ClickButtonAgent()
88
+ if task_id == "click-checkboxes":
89
+ return ClickCheckboxesAgent()
90
+ raise ValueError(f"No hardcoded agent for task: {task_id}")
91
+
92
+
93
def get_debug_benchmark() -> Benchmark:
    """Build the benchmark subset used by the smoke test.

    A ``MiniWobBenchmark`` is created with a headless, HTML-only Playwright
    tool configuration and restricted to the ids in ``_DEBUG_TASK_IDS``.
    """
    tool_config = PlaywrightConfig(headless=True, use_html=True, use_axtree=False, use_screenshot=False)
    full_benchmark = MiniWobBenchmark(default_tool_config=tool_config)
    return full_benchmark.subset_from_list(_DEBUG_TASK_IDS, benchmark_name_suffix="debug")
97
+
98
+
99
if __name__ == "__main__":
    # Import this file under its package name; that module object is what gets
    # handed to run_debug_suite below.
    import miniwob_cube.debug as _this_module

    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)-8s %(name)s %(message)s")

    def _is_failure(result) -> bool:
        """Tell whether a result errored, did not finish, or scored below 1.0."""
        if result["error"]:
            return True
        if not result["done"]:
            return True
        return result["reward"] < 1.0

    results = run_debug_suite("miniwob-cube", _this_module)
    failures = list(filter(_is_failure, results))
    # Exit status 1 when at least one task failed, 0 otherwise.
    sys.exit(int(bool(failures)))
@@ -0,0 +1,202 @@
1
+ import logging
2
+ from typing import Any
3
+
4
+ from cube.benchmark import RuntimeContext
5
+ from cube.container import ContainerBackend
6
+ from cube.core import ActionSchema, Content, Observation
7
+ from cube.task import Task, TaskConfig, TaskMetadata
8
+ from cube.tools.browser import BrowserTool
9
+ from PIL import Image
10
+
11
+
12
class MiniWobTaskMetadata(TaskMetadata):
    """Task metadata for MiniWob++ tasks.

    Extends ``TaskMetadata`` with cube-specific public fields that are safe to
    ship in task_metadata.json.
    """

    # NOTE(review): presumably marks tasks whose outcome can differ between runs — confirm.
    nondeterministic: bool = False
18
+
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+ _SUPPORTED_ACTION_NAMES = frozenset(
23
+ {
24
+ "browser_press_key",
25
+ "browser_type",
26
+ "browser_click",
27
+ "browser_drag",
28
+ "browser_hover",
29
+ "browser_select_option",
30
+ "browser_mouse_click_xy",
31
+ }
32
+ )
33
+
34
+
35
class MiniWobTask(Task):
    """A single MiniWob++ task driven through a browser tool.

    The task page is ``<base_url>/<task id>.html``. Episode state (reward, done
    flag, ...) is read from the page's ``WOB_*`` JavaScript globals.
    """

    validate_per_step: bool = True
    base_url: str = "http://localhost:8000/miniwob"
    remove_human_display: bool = True
    episode_max_time: int = 1000000

    @property
    def tool(self) -> BrowserTool:  # type: ignore[override]
        """The task's tool, typed as a ``BrowserTool``."""
        return self._tool  # type: ignore[return-value]

    @property
    def url(self) -> str:
        """URL of this task's HTML page."""
        return f"{self.base_url}/{self.metadata.id}.html"

    def reset(self) -> tuple[Observation, dict[str, Any]]:
        """Load the task page, start an episode and return the first observation.

        Returns:
            ``(obs, info)`` where ``obs`` is the goal text followed by the
            post-processed page observation, and ``info`` carries ``task_id``,
            ``task_url`` and ``goal``.
        """
        self.tool.reset()
        self.tool.goto(self.url)
        setup_result = self.tool.evaluate_js(_build_setup_js(self.remove_human_display, self.episode_max_time))
        goal, info = _parse_setup_result(setup_result)
        obs = Observation.from_text(goal) + self.obs_postprocess(self.tool.page_obs())
        return obs, {**info, "task_id": self.id, "task_url": self.url, "goal": goal}

    def evaluate(self, obs: Observation | None = None) -> tuple[float, dict[str, Any]]:
        """Read the page's reward globals and return ``(reward, info)``."""
        result = self.tool.evaluate_js("""() => {
            return [WOB_REWARD_GLOBAL, WOB_RAW_REWARD_GLOBAL, WOB_REWARD_REASON, WOB_DONE_GLOBAL, WOB_EPISODE_ID, WOB_TASK_READY];}""")
        return _parse_validation_result(result)

    def finished(self, obs: Observation | None = None) -> bool:
        """Return whether the page reports the episode as done.

        The JS result is normalised to a real ``bool``. A string result is
        compared against ``"true"`` (case-insensitive), mirroring how
        ``_parse_validation_result`` reads the done flag, so that a textual
        ``"false"`` is not mistaken for a truthy value.
        """
        done = self.tool.evaluate_js("() => {return WOB_DONE_GLOBAL;}")
        if isinstance(done, str):
            return done.strip().lower() == "true"
        return bool(done)

    def filter_actions(self, actions: list[ActionSchema]) -> list[ActionSchema]:
        """Keep only the actions whose name is in ``_SUPPORTED_ACTION_NAMES``."""
        filtered = [a for a in actions if a.name in _SUPPORTED_ACTION_NAMES]
        logger.info(f"Chosen {len(filtered)} out of {len(actions)} actions for MiniWob task.")
        return filtered

    def obs_postprocess(self, obs: Observation) -> Observation:
        """Crop screenshot contents to the MiniWob viewport.

        ``obs`` is modified in place (its ``contents`` list is replaced) and
        returned.
        """
        contents = []
        for content in obs.contents:
            if content.name == "screenshot" and isinstance(content.data, Image.Image):
                # crop to 332x214 because this is the viewport size for MiniWob
                contents.append(Content.from_data(content.data.crop((0, 0, 332, 214)), name=content.name))
            else:
                contents.append(content)
        obs.contents = contents
        return obs
80
+
81
+
82
class MiniWobTaskConfig(TaskConfig):
    """Configuration from which a ``MiniWobTask`` is built."""

    base_url: str = "http://localhost:8000/miniwob"
    remove_human_display: bool = True
    episode_max_time: int = 1000000

    def make(
        self,
        runtime_context: RuntimeContext | None = None,
        container_backend: ContainerBackend | None = None,
    ) -> MiniWobTask:
        """Create the ``MiniWobTask`` described by this config.

        ``runtime_context`` and ``container_backend`` are accepted but unused.
        The task metadata is looked up by ``task_id`` in
        ``MiniWobBenchmark.task_metadata``.
        """
        # Deferred import: the benchmark module imports this one, so importing
        # it at module level would be circular.
        from miniwob_cube.benchmark import MiniWobBenchmark

        del runtime_context, container_backend  # deliberately unused

        metadata: TaskMetadata = MiniWobBenchmark.task_metadata[self.task_id]
        assert self.tool_config is not None, "tool_config must be set"
        return MiniWobTask(
            metadata=metadata,
            tool_config=self.tool_config,
            base_url=self.base_url,
            remove_human_display=self.remove_human_display,
            episode_max_time=self.episode_max_time,
        )
105
+
106
+
107
+ def _build_setup_js(remove_human_display: bool, episode_max_time: int) -> str:
108
+ if remove_human_display:
109
+ js = r"""
110
+ let __display_ids = ['reward-display', 'click-canvas', 'sync-task-cover'];
111
+ let __display_divs = {};
112
+ let __query_div_hidden_copy = null;
113
+
114
+ removeDisplay = function() {
115
+ core.clearTimer();
116
+ document.body.removeEventListener('click', core.canvasDrawClick);
117
+
118
+ __query_div_hidden_copy = document.getElementById('query').cloneNode(true);
119
+ document.getElementById('query').innerHTML = '';
120
+
121
+ for (i in __display_ids) {
122
+ elem_id = __display_ids[i];
123
+ elem = document.getElementById(elem_id);
124
+ // remove elem from the document
125
+ elem.remove();
126
+ // but keep it stored somewhere to bring back later
127
+ __display_divs[elem_id] = elem;
128
+ }
129
+ };
130
+
131
+ bringBackDisplay = function() {
132
+ document.getElementById('query').innerHTML = __query_div_hidden_copy.innerHTML;
133
+ for (var elem_id in __display_divs){
134
+ document.body.appendChild(__display_divs[elem_id]);
135
+ }
136
+ core.createDisplay();
137
+ };
138
+
139
+ core.endEpisode_legacy = core.endEpisode;
140
+ core.startEpisodeReal_legacy = core.startEpisodeReal;
141
+ core.getUtterance_legacy = core.getUtterance;
142
+
143
+ core.getUtterance = function () {
144
+ bringBackDisplay();
145
+ utterance = core.getUtterance_legacy();
146
+ removeDisplay();
147
+ return utterance;
148
+ };
149
+
150
+ core.endEpisode = function(reward, time_proportional, reason){
151
+ bringBackDisplay();
152
+ core.endEpisode_legacy(reward, time_proportional, reason);
153
+ removeDisplay();
154
+ };
155
+
156
+ core.startEpisodeReal = function() {
157
+ bringBackDisplay();
158
+ core.startEpisodeReal_legacy();
159
+ removeDisplay();
160
+ };
161
+
162
+ removeDisplay();
163
+ """
164
+ else:
165
+ js = ""
166
+ js += f"""
167
+ Math.seedrandom(42);
168
+ core.EPISODE_MAX_TIME = {episode_max_time};
169
+ core.startEpisodeReal();
170
+ while (!WOB_TASK_READY) {{
171
+ await new Promise(resolve => setTimeout(resolve, 100));
172
+ }}
173
+ return core.getUtterance();
174
+ """
175
+ return f"async () => {{{js}}}"
176
+
177
+
178
+ def _parse_setup_result(setup_result: str | dict) -> tuple[str, dict]:
179
+ if isinstance(setup_result, dict):
180
+ return setup_result["utterance"], {}
181
+ elif isinstance(setup_result, str):
182
+ return setup_result, {}
183
+ else:
184
+ raise ValueError(f"Unexpected setup_result type: {type(setup_result)}")
185
+
186
+
187
+ def _parse_validation_result(validation_result: str | dict | list) -> tuple[float, dict]:
188
+ if isinstance(validation_result, list):
189
+ chunks = validation_result
190
+ done = chunks[3]
191
+ elif isinstance(validation_result, dict):
192
+ raise ValueError("Validation result as dict is not supported")
193
+ else:
194
+ chunks = [c.strip() for c in validation_result.split(",")]
195
+ done = chunks[3].strip().lower() == "true"
196
+ raw_reward = float(chunks[1])
197
+ reward = float(raw_reward > 0)
198
+ return reward, {
199
+ "raw_reward": raw_reward,
200
+ "reward_reason": chunks[2],
201
+ "done": done,
202
+ }