PyPI - miniwob-cube - Versions diffs - 1.0.0__tar.gz - Mend

miniwob-cube 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

miniwob_cube-1.0.0/PKG-INFO +8 -0
miniwob_cube-1.0.0/pyproject.toml +25 -0
miniwob_cube-1.0.0/src/miniwob_cube/__init__.py +12 -0
miniwob_cube-1.0.0/src/miniwob_cube/benchmark.py +113 -0
miniwob_cube-1.0.0/src/miniwob_cube/debug.py +106 -0
miniwob_cube-1.0.0/src/miniwob_cube/task.py +202 -0
miniwob_cube-1.0.0/src/miniwob_cube/task_metadata.json +1252 -0

miniwob_cube-1.0.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,8 @@
+Metadata-Version: 2.3
+Name: miniwob-cube
+Version: 1.0.0
+Summary: MiniWob++ benchmark for cube
+Requires-Dist: cube-standard
+Requires-Dist: miniwob>=1.0
+Requires-Dist: cube-browser-tool
+Requires-Python: >=3.12

miniwob_cube-1.0.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,25 @@
+[project]
+name = "miniwob-cube"
+version = "1.0.0"
+description = "MiniWob++ benchmark for cube"
+requires-python = ">=3.12"
+dependencies = [
+    "cube-standard",
+    "miniwob>=1.0",
+    "cube-browser-tool"
+]
+[project.entry-points."cube.benchmarks"]
+miniwob-cube = "miniwob_cube.benchmark:MiniWobBenchmark"
+[build-system]
+requires = ["uv_build>=0.8,<0.9"]
+build-backend = "uv_build"
+[tool.uv-build]
+include = ["src/miniwob_cube/task_metadata.json"]
+[tool.ruff]
+fix = true
+line-length = 120
+indent-width = 4

miniwob_cube-1.0.0/src/miniwob_cube/__init__.py ADDED Viewed

@@ -0,0 +1,12 @@
+from miniwob_cube.benchmark import MiniWobBenchmark
+from miniwob_cube.debug import get_debug_benchmark, make_debug_agent
+from miniwob_cube.task import MiniWobTask, MiniWobTaskConfig, MiniWobTaskMetadata
+# Names re-exported at package level; each one is imported above from benchmark, debug, or task.
+__all__ = [
+    "MiniWobBenchmark",
+    "MiniWobTask",
+    "MiniWobTaskMetadata",
+    "MiniWobTaskConfig",
+    "get_debug_benchmark",
+    "make_debug_agent",
+]

miniwob_cube-1.0.0/src/miniwob_cube/benchmark.py ADDED Viewed

@@ -0,0 +1,113 @@
+import logging
+import subprocess
+import sys
+import tempfile
+import time
+import urllib.request
+from importlib.resources import files
+from pathlib import Path
+from typing import ClassVar, Generator
+from cube.benchmark import Benchmark, BenchmarkMetadata
+from cube.task import TaskConfig
+from miniwob_cube.task import MiniWobTaskConfig, MiniWobTaskMetadata
+logger = logging.getLogger(__name__)
+class MiniWobBenchmark(Benchmark):
+    """MiniWob++ benchmark backed by a local static file server.
+    ``_setup`` launches ``python -m http.server`` inside ``html_path`` and waits until ``base_url`` answers;
+    ``close`` stops that server and closes its log files.
+    """
+    benchmark_metadata: ClassVar[BenchmarkMetadata] = BenchmarkMetadata(
+        name="miniwob-cube",
+        version="1.0.0",
+        description="MiniWob++ browser automation benchmark tasks",
+        num_tasks=125,
+        tags=["browser", "web", "ui"],
+    )
+    task_metadata: ClassVar[dict[str, MiniWobTaskMetadata]]  # type: ignore - will be populated automatically at import time in Benchmark.__init_subclass__
+    task_config_class: ClassVar[type[TaskConfig]] = MiniWobTaskConfig
+    # Directory of the MiniWob HTML pages; used as the working directory of the server process.
+    html_path: str = files("miniwob").joinpath("html").as_posix()  # type: ignore
+    port: int = 8000
+    remove_human_display: bool = True
+    episode_max_time: int = 1000000
+    # Seconds to wait for the server to answer, and the pause between two probes.
+    server_start_timeout: float = 10.0
+    server_start_poll_interval: float = 0.1
+    # Runtime state (not serialized)
+    _server_process: subprocess.Popen | None = None
+    _stdout_file: object | None = None
+    _stderr_file: object | None = None
+    model_config = {"arbitrary_types_allowed": True}
+    @property
+    def base_url(self) -> str:
+        """Root URL of the MiniWob pages on the local server."""
+        return f"http://localhost:{self.port}/miniwob"
+    def _setup(self) -> None:
+        """Start the static file server and block until it responds at ``base_url``.
+        Server stdout/stderr are redirected to log files in the system temp directory.
+        Raises:
+            RuntimeError: if the server process exits during startup, or does not answer
+                within ``server_start_timeout`` seconds.
+        """
+        tmp_dir = Path(tempfile.gettempdir())
+        stderr_path = tmp_dir / "miniwob_server_stderr.log"
+        self._stdout_file = open(tmp_dir / "miniwob_server_stdout.log", "w")
+        self._stderr_file = open(stderr_path, "w")
+        logger.info(f"Starting MiniWob server at port {self.port} serving from {self.html_path}...")
+        try:
+            self._server_process = subprocess.Popen(
+                [sys.executable, "-m", "http.server", str(self.port)],
+                cwd=self.html_path,
+                stdout=self._stdout_file,
+                stderr=self._stderr_file,
+            )
+        except Exception:
+            # Spawning failed: release the two log handles opened above before propagating.
+            self.close()
+            raise
+        startup_deadline = time.monotonic() + self.server_start_timeout
+        last_response_error: Exception | None = None
+        while time.monotonic() < startup_deadline:
+            if self._server_process.poll() is not None:
+                # The server died (e.g. port already in use); surface what it wrote to stderr.
+                self._stderr_file.flush()
+                stderr_content = stderr_path.read_text() if stderr_path.exists() else "No stderr available"
+                returncode = self._server_process.returncode
+                self.close()
+                raise RuntimeError(f"MiniWob server failed to start (exit code {returncode}): {stderr_content}")
+            try:
+                # The response owns an open socket; the context manager closes it once the probe succeeds.
+                with urllib.request.urlopen(self.base_url, timeout=1):
+                    pass
+            except Exception as e:
+                last_response_error = e
+                time.sleep(self.server_start_poll_interval)
+            else:
+                logger.info(f"MiniWob server responding at {self.base_url}")
+                break
+        else:
+            # Deadline passed without a successful probe.
+            self.close()
+            raise RuntimeError(
+                f"MiniWob server failed to respond at {self.base_url} within {self.server_start_timeout:.1f}s"
+            ) from last_response_error
+        self._runtime_context = {"base_url": self.base_url}
+    def get_task_configs(self) -> Generator[MiniWobTaskConfig, None, None]:
+        """Yield one ``MiniWobTaskConfig`` per entry of ``task_metadata``, carrying this benchmark's settings."""
+        for tm in self.task_metadata.values():
+            yield MiniWobTaskConfig(
+                task_id=tm.id,
+                tool_config=self.default_tool_config,
+                base_url=self.base_url,
+                remove_human_display=self.remove_human_display,
+                episode_max_time=self.episode_max_time,
+            )
+    def close(self) -> None:
+        """Stop the server process (if any) and close its log files; safe to call repeatedly."""
+        if self._server_process is not None:
+            logger.info("Shutting down MiniWob server...")
+            self._server_process.terminate()
+            try:
+                self._server_process.wait(timeout=5)
+            except subprocess.TimeoutExpired:
+                logger.warning("Server did not terminate gracefully, killing...")
+                self._server_process.kill()
+                # Reap the killed child so it does not linger as a zombie process.
+                self._server_process.wait()
+            self._server_process = None
+        if self._stdout_file is not None:
+            self._stdout_file.close()
+            self._stdout_file = None
+        if self._stderr_file is not None:
+            self._stderr_file.close()
+            self._stderr_file = None

miniwob_cube-1.0.0/src/miniwob_cube/debug.py ADDED Viewed

@@ -0,0 +1,106 @@
+"""Smoke-test script for miniwob-cube — validates infrastructure and task solving.
+Verifies that the MiniWob HTTP server starts, the browser connects, the task
+page loads and JS initialises, and the hardcoded agents achieve reward=1.0 for
+the debug tasks.
+Public API (cube.testing protocol)
+-----------------------------------
+get_debug_benchmark()              -> MiniWobBenchmark
+make_debug_agent(task_id: str)     -> ClickButtonAgent | ClickCheckboxesAgent
+Usage:
+    uv run python -m miniwob_cube.debug
+"""
+from __future__ import annotations
+import logging
+import re
+import sys
+from cube.core import Action, ActionSchema, Observation, TextContent
+from cube.testing import run_debug_suite
+from cube_browser_tool import PlaywrightConfig
+from miniwob_cube.benchmark import Benchmark, MiniWobBenchmark
+logger = logging.getLogger(__name__)
+# A small set of representative tasks that cover the JS setup / observation path.
+_DEBUG_TASK_IDS = ["click-button", "click-checkboxes"]
+class ClickButtonAgent:
+    """Hardcoded agent for the ``click-button`` debug task.
+    The first call clicks the button named in the goal text; every later call returns ``final_step``.
+    """
+    def __init__(self) -> None:
+        self._done = False
+    def _parse_button_text(self, obs: Observation) -> str:
+        """Return the button label quoted in the first matching text content of ``obs``.
+        Raises:
+            ValueError: if no text content contains a ``Click on the "<label>" button`` instruction.
+        """
+        for content in obs.contents:
+            if not isinstance(content, TextContent):
+                continue
+            match = re.search(r'Click on the "(.+?)" button', content.data, re.IGNORECASE)
+            if match:
+                return match.group(1)
+        # Raise explicitly instead of asserting: asserts vanish under -O and the loop could fall through to None.
+        raise ValueError("No 'Click on the \"<label>\" button' instruction found in the observation")
+    def __call__(self, obs: Observation, action_set: list[ActionSchema]) -> Action:
+        """Return the click action on the first call and ``final_step`` afterwards; ``action_set`` is unused."""
+        if not self._done:
+            self._done = True
+            text = self._parse_button_text(obs)
+            return Action(name="browser_click", arguments={"selector": f"button:has-text('{text}')"})
+        return Action(name="final_step", arguments={})
+class ClickCheckboxesAgent:
+    """Hardcoded agent for the ``click-checkboxes`` debug task.
+    Ticks one requested checkbox per call, then clicks Submit, then returns ``final_step``.
+    """
+    def __init__(self) -> None:
+        self._step = 0
+        self._targets: list[str] = []
+    def _parse_targets(self, obs: Observation) -> list[str]:
+        """Return the checkbox labels listed in the goal text (empty when the goal says "nothing").
+        Raises:
+            ValueError: if no text content contains a ``Select ... and click Submit`` instruction.
+        """
+        for content in obs.contents:
+            if not isinstance(content, TextContent):
+                continue
+            match = re.search(r"Select (.+?) and click Submit", content.data, re.IGNORECASE)
+            if not match:
+                continue
+            words_str = match.group(1)
+            if words_str.lower() == "nothing":
+                return []
+            return [w.strip() for w in words_str.split(",")]
+        # Raise explicitly instead of asserting: asserts vanish under -O and the loop could fall through to None.
+        raise ValueError("No 'Select ... and click Submit' instruction found in the observation")
+    def __call__(self, obs: Observation, action_set: list[ActionSchema]) -> Action:
+        """Return the next scripted action; ``action_set`` is unused."""
+        if self._step == 0:
+            # Targets are read once, from the first observation only.
+            self._targets = self._parse_targets(obs)
+        idx = self._step
+        self._step += 1
+        if idx < len(self._targets):
+            word = self._targets[idx]
+            return Action(
+                name="browser_click", arguments={"selector": f"label:has-text('{word}') input[type='checkbox']"}
+            )
+        if idx == len(self._targets):
+            # All targets ticked: submit the form.
+            return Action(name="browser_click", arguments={"selector": "button#subbtn"})
+        return Action(name="final_step", arguments={})
+def make_debug_agent(task_id: str) -> ClickButtonAgent | ClickCheckboxesAgent:
+    """Return a fresh hardcoded agent for ``task_id``.
+    Raises:
+        ValueError: if no hardcoded agent exists for ``task_id``.
+    """
+    match task_id:
+        case "click-button":
+            return ClickButtonAgent()
+        case "click-checkboxes":
+            return ClickCheckboxesAgent()
+        case _:
+            raise ValueError(f"No hardcoded agent for task: {task_id}")
+def get_debug_benchmark() -> Benchmark:
+    """Build the MiniWob benchmark restricted to ``_DEBUG_TASK_IDS``, with a headless HTML-only Playwright config."""
+    tool_config = PlaywrightConfig(headless=True, use_html=True, use_axtree=False, use_screenshot=False)
+    full_benchmark = MiniWobBenchmark(default_tool_config=tool_config)
+    return full_benchmark.subset_from_list(_DEBUG_TASK_IDS, benchmark_name_suffix="debug")
+if __name__ == "__main__":
+    # Script entry point: run the debug suite and exit non-zero if any task failed.
+    # Import this module under its package name so it can be passed to run_debug_suite as a module object.
+    import miniwob_cube.debug as _this_module
+    logging.basicConfig(level=logging.INFO, format="%(asctime)s  %(levelname)-8s  %(name)s  %(message)s")
+    results = run_debug_suite("miniwob-cube", _this_module)
+    # A result counts as failed if it has an error, is not done, or scored below the full reward of 1.0.
+    failed = [r for r in results if r["error"] or not r["done"] or r["reward"] < 1.0]
+    sys.exit(1 if failed else 0)

miniwob_cube-1.0.0/src/miniwob_cube/task.py ADDED Viewed

@@ -0,0 +1,202 @@
+import logging
+from typing import Any
+from cube.benchmark import RuntimeContext
+from cube.container import ContainerBackend
+from cube.core import ActionSchema, Content, Observation
+from cube.task import Task, TaskConfig, TaskMetadata
+from cube.tools.browser import BrowserTool
+from PIL import Image
+class MiniWobTaskMetadata(TaskMetadata):
+    """TaskMetadata subclass for MiniWob++ tasks.
+    Adds cube-specific public fields that are safe to ship in task_metadata.json.
+    """
+    # NOTE(review): presumably marks tasks whose outcome can differ between runs - confirm against task_metadata.json.
+    nondeterministic: bool = False
+logger = logging.getLogger(__name__)
+# Names of the tool actions that MiniWobTask.filter_actions keeps; actions with any other name are dropped.
+_SUPPORTED_ACTION_NAMES = frozenset(
+    {
+        "browser_press_key",
+        "browser_type",
+        "browser_click",
+        "browser_drag",
+        "browser_hover",
+        "browser_select_option",
+        "browser_mouse_click_xy",
+    }
+)
+class MiniWobTask(Task):
+    """A single MiniWob++ task driven through a ``BrowserTool``.
+    ``reset`` loads the task page and starts an episode with injected JS; ``evaluate`` and ``finished``
+    read the ``WOB_*`` globals back from the page.
+    """
+    validate_per_step: bool = True
+    base_url: str = "http://localhost:8000/miniwob"
+    remove_human_display: bool = True
+    episode_max_time: int = 1000000
+    @property
+    def tool(self) -> BrowserTool:  # type: ignore[override]
+        """The task's tool, typed as a ``BrowserTool``."""
+        return self._tool  # type: ignore[return-value]
+    @property
+    def url(self) -> str:
+        """URL of this task's HTML page under ``base_url``."""
+        return f"{self.base_url}/{self.metadata.id}.html"
+    def reset(self) -> tuple[Observation, dict[str, Any]]:
+        """Reset the browser, open the task page, start an episode, and return ``(observation, info)``.
+        The observation is the goal text followed by the post-processed page observation.
+        """
+        browser = self.tool
+        browser.reset()
+        browser.goto(self.url)
+        setup_js = _build_setup_js(self.remove_human_display, self.episode_max_time)
+        goal, info = _parse_setup_result(browser.evaluate_js(setup_js))
+        goal_obs = Observation.from_text(goal)
+        page_obs = self.obs_postprocess(browser.page_obs())
+        extra_info = {"task_id": self.id, "task_url": self.url, "goal": goal}
+        return goal_obs + page_obs, {**info, **extra_info}
+    def evaluate(self, obs: Observation | None = None) -> tuple[float, dict[str, Any]]:
+        """Read the reward globals from the page and return ``(reward, info)``; ``obs`` is unused."""
+        read_globals_js = """() => {
+return [WOB_REWARD_GLOBAL, WOB_RAW_REWARD_GLOBAL, WOB_REWARD_REASON, WOB_DONE_GLOBAL, WOB_EPISODE_ID, WOB_TASK_READY];}"""
+        return _parse_validation_result(self.tool.evaluate_js(read_globals_js))
+    def finished(self, obs: Observation | None = None) -> bool:
+        """Return the page's ``WOB_DONE_GLOBAL`` value; ``obs`` is unused."""
+        is_done = self.tool.evaluate_js("() => {return WOB_DONE_GLOBAL;}")
+        return is_done
+    def filter_actions(self, actions: list[ActionSchema]) -> list[ActionSchema]:
+        """Keep only the actions whose name is in ``_SUPPORTED_ACTION_NAMES``."""
+        filtered: list[ActionSchema] = []
+        for schema in actions:
+            if schema.name in _SUPPORTED_ACTION_NAMES:
+                filtered.append(schema)
+        logger.info(f"Chosen {len(filtered)} out of {len(actions)} actions for MiniWob task.")
+        return filtered
+    def obs_postprocess(self, obs: Observation) -> Observation:
+        """Crop screenshot contents of ``obs`` to the MiniWob viewport; ``obs`` is modified in place and returned."""
+        # 332x214 is the viewport size for MiniWob, so anything outside that box is cropped away.
+        obs.contents = [
+            Content.from_data(item.data.crop((0, 0, 332, 214)), name=item.name)
+            if item.name == "screenshot" and isinstance(item.data, Image.Image)
+            else item
+            for item in obs.contents
+        ]
+        return obs
+class MiniWobTaskConfig(TaskConfig):
+    """Settings from which a ``MiniWobTask`` is built."""
+    base_url: str = "http://localhost:8000/miniwob"
+    remove_human_display: bool = True
+    episode_max_time: int = 1000000
+    def make(
+        self,
+        runtime_context: RuntimeContext | None = None,
+        container_backend: ContainerBackend | None = None,
+    ) -> MiniWobTask:
+        """Create the ``MiniWobTask`` for ``task_id``; both arguments are accepted but unused."""
+        # Imported lazily: miniwob_cube.benchmark imports this module, so a top-level import would be circular.
+        from miniwob_cube.benchmark import MiniWobBenchmark
+        del runtime_context, container_backend
+        metadata_for_task: TaskMetadata = MiniWobBenchmark.task_metadata[self.task_id]
+        assert self.tool_config is not None, "tool_config must be set"
+        task = MiniWobTask(
+            metadata=metadata_for_task,
+            tool_config=self.tool_config,
+            base_url=self.base_url,
+            remove_human_display=self.remove_human_display,
+            episode_max_time=self.episode_max_time,
+        )
+        return task
+def _build_setup_js(remove_human_display: bool, episode_max_time: int) -> str:
+    """Return the source of an async JS function that starts a MiniWob episode and returns the utterance.
+    When ``remove_human_display`` is true, the script first wraps ``core.getUtterance``, ``core.endEpisode``
+    and ``core.startEpisodeReal`` so the display elements are restored around each call and removed again after.
+    The episode part seeds the RNG with 42, sets ``core.EPISODE_MAX_TIME``, and polls ``WOB_TASK_READY``.
+    """
+    hide_display_js = ""
+    if remove_human_display:
+        hide_display_js = r"""
+let __display_ids = ['reward-display', 'click-canvas', 'sync-task-cover'];
+let __display_divs = {};
+let __query_div_hidden_copy = null;
+removeDisplay = function() {
+  core.clearTimer();
+  document.body.removeEventListener('click', core.canvasDrawClick);
+  __query_div_hidden_copy = document.getElementById('query').cloneNode(true);
+  document.getElementById('query').innerHTML = '';
+  for (i in __display_ids) {
+    elem_id = __display_ids[i];
+    elem = document.getElementById(elem_id);
+    // remove elem from the document
+    elem.remove();
+    // but keep it stored somewhere to bring back later
+    __display_divs[elem_id] = elem;
+  }
+};
+bringBackDisplay = function() {
+  document.getElementById('query').innerHTML = __query_div_hidden_copy.innerHTML;
+  for (var elem_id in __display_divs){
+    document.body.appendChild(__display_divs[elem_id]);
+  }
+  core.createDisplay();
+};
+core.endEpisode_legacy = core.endEpisode;
+core.startEpisodeReal_legacy = core.startEpisodeReal;
+core.getUtterance_legacy = core.getUtterance;
+core.getUtterance = function () {
+  bringBackDisplay();
+  utterance = core.getUtterance_legacy();
+  removeDisplay();
+  return utterance;
+};
+core.endEpisode = function(reward, time_proportional, reason){
+  bringBackDisplay();
+  core.endEpisode_legacy(reward, time_proportional, reason);
+  removeDisplay();
+};
+core.startEpisodeReal = function() {
+  bringBackDisplay();
+  core.startEpisodeReal_legacy();
+  removeDisplay();
+};
+removeDisplay();
+"""
+    start_episode_js = f"""
+Math.seedrandom(42);
+core.EPISODE_MAX_TIME = {episode_max_time};
+core.startEpisodeReal();
+while (!WOB_TASK_READY) {{
+  await new Promise(resolve => setTimeout(resolve, 100));
+}}
+return core.getUtterance();
+    """
+    # Wrap both parts in an async arrow function so the polling loop can use await.
+    return "async () => {" + hide_display_js + start_episode_js + "}"
+def _parse_setup_result(setup_result: str | dict) -> tuple[str, dict]:
+    """Extract the goal utterance from the setup script's result and pair it with an empty info dict.
+    A dict is read through its ``"utterance"`` key; a string is used as-is.
+    Raises:
+        ValueError: if ``setup_result`` is neither a dict nor a string.
+    """
+    match setup_result:
+        case dict():
+            return setup_result["utterance"], {}
+        case str():
+            return setup_result, {}
+        case _:
+            raise ValueError(f"Unexpected setup_result type: {type(setup_result)}")
+def _parse_validation_result(validation_result: str | dict | list) -> tuple[float, dict]:
+    """Convert the raw ``WOB_*`` values read from the page into ``(reward, info)``.
+    Accepts the values as a list, or as one comma-separated string whose "done" field is the text
+    ``true``/``false``. Field 1 is the raw reward, field 2 the reward reason, field 3 the done flag.
+    The returned reward is binary: 1.0 when the raw reward is positive, otherwise 0.0.
+    Raises:
+        ValueError: for dict input, any other unsupported type, or fewer than four fields.
+    """
+    if isinstance(validation_result, dict):
+        raise ValueError("Validation result as dict is not supported")
+    if isinstance(validation_result, list):
+        chunks = validation_result
+    elif isinstance(validation_result, str):
+        chunks = [c.strip() for c in validation_result.split(",")]
+    else:
+        # Fail with a clear message, as _parse_setup_result does, instead of an AttributeError on .split.
+        raise ValueError(f"Unexpected validation_result type: {type(validation_result)}")
+    if len(chunks) < 4:
+        raise ValueError(f"Expected at least 4 validation fields, got {len(chunks)}")
+    # List input carries the done flag as a real value; string input carries it as text.
+    done = chunks[3] if isinstance(validation_result, list) else chunks[3].lower() == "true"
+    raw_reward = float(chunks[1])
+    reward = float(raw_reward > 0)
+    return reward, {
+        "raw_reward": raw_reward,
+        "reward_reason": chunks[2],
+        "done": done,
+    }