workarena-cube 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,12 @@
1
+ Metadata-Version: 2.3
2
+ Name: workarena-cube
3
+ Version: 1.0.0
4
+ Summary: WorkArena ServiceNow benchmark for cube
5
+ Requires-Dist: cube-standard>=0.1.0rc5
6
+ Requires-Dist: browsergym-workarena
7
+ Requires-Dist: termcolor
8
+ Requires-Dist: cube-browser-tool>=0.2.0
9
+ Requires-Dist: cube-browser-playwright>=0.2.0
10
+ Requires-Dist: cube-chat-tool>=0.1.0
11
+ Requires-Dist: playwright==1.44
12
+ Requires-Python: >=3.12, <3.13
@@ -0,0 +1,29 @@
1
+ [project]
2
+ name = "workarena-cube"
3
+ version = "1.0.0"
4
+ description = "WorkArena ServiceNow benchmark for cube"
5
+ requires-python = ">=3.12,<3.13"
6
+ dependencies = [
7
+ "cube-standard>=0.1.0rc5",
8
+ "browsergym-workarena",
9
+ "termcolor",
10
+ "cube-browser-tool>=0.2.0",
11
+ "cube-browser-playwright>=0.2.0",
12
+ "cube-chat-tool>=0.1.0",
13
+ "playwright==1.44",
14
+ ]
15
+
16
+ [project.entry-points."cube.benchmarks"]
17
+ workarena-cube = "workarena_cube.benchmark:WorkArenaBenchmark"
18
+
19
+ [build-system]
20
+ requires = ["uv_build>=0.8,<0.9"]
21
+ build-backend = "uv_build"
22
+
23
+ [tool.uv-build]
24
+ include = ["src/workarena_cube/task_metadata.json"]
25
+
26
+ [tool.ruff]
27
+ fix = true
28
+ line-length = 120
29
+ indent-width = 4
@@ -0,0 +1,27 @@
1
+ from workarena_cube.benchmark import WorkArenaBenchmark
2
+ from workarena_cube.debug import CheatAgent, make_debug_agent, get_debug_benchmark
3
+ from workarena_cube.task import WorkArenaTask, WorkArenaTaskConfig, WorkArenaTaskMetadata
4
+ from workarena_cube.tools import (
5
+ WorkArenaBrowserTool,
6
+ WorkArenaCheatTool,
7
+ WorkArenaInfeasibleTool,
8
+ WorkarenaBrowserToolConfig,
9
+ WorkArenaInfeasibleToolConfig,
10
+ WorkArenaCheatToolConfig,
11
+ )
12
+
13
+ __all__ = [
14
+ "WorkArenaBenchmark",
15
+ "WorkArenaTask",
16
+ "WorkArenaTaskConfig",
17
+ "WorkArenaTaskMetadata",
18
+ "CheatAgent",
19
+ "make_debug_agent",
20
+ "get_debug_benchmark",
21
+ "WorkArenaBrowserTool",
22
+ "WorkArenaCheatTool",
23
+ "WorkArenaInfeasibleTool",
24
+ "WorkarenaBrowserToolConfig",
25
+ "WorkArenaInfeasibleToolConfig",
26
+ "WorkArenaCheatToolConfig",
27
+ ]
@@ -0,0 +1,125 @@
1
+ """WorkArena benchmark implementation for the CUBE framework."""
2
+
3
+ import logging
4
+ from typing import ClassVar
5
+
6
+ from browsergym.workarena import get_all_tasks_agents
7
+ from cube.benchmark import Benchmark, BenchmarkMetadata
8
+ from cube.seed import AbstractSeedGenerator
9
+ from cube.task import TaskConfig, TaskMetadata
10
+ from pydantic import PrivateAttr, model_validator
11
+
12
+ from workarena_cube.task import WorkArenaTaskConfig, WorkArenaTaskMetadata
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
class WorkArenaSeedGenerator(AbstractSeedGenerator):
    """Generate seeds for WorkArena tasks by delegating to get_all_tasks_agents().

    All three levels (l1, l2, l3) are enumerated, so the generator works with any
    subset produced by named_subset() or subset_from_glob().

    Seeds are taken from WorkArena's own enumeration (driven by ``meta_seed``) to stay
    compatible with the original benchmark's evaluation protocol.

    The ``{task_id: [seeds]}`` mapping is built lazily on the first call and cached
    for the lifetime of this generator.
    """

    # The three fields below are forwarded verbatim to get_all_tasks_agents().
    meta_seed: int = 42
    n_seeds_l1: int = 10
    is_agent_curriculum: bool = True

    # None means "not loaded yet"; an empty dict is a valid, loaded state.
    _cache: dict[str, list[int]] | None = PrivateAttr(default=None)

    def _ensure_loaded(self) -> None:
        """Populate ``_cache`` on first use; later calls return immediately."""
        if self._cache is not None:
            return
        cache: dict[str, list[int]] = {}
        for level in ("l1", "l2", "l3"):
            for task_class, seed in get_all_tasks_agents(
                filter=level,
                meta_seed=self.meta_seed,
                n_seed_l1=self.n_seeds_l1,
                is_agent_curriculum=self.is_agent_curriculum,
            ):
                task_id = task_class.get_task_id()
                cache.setdefault(task_id, []).append(seed)
        self._cache = cache

    def __call__(self, task_metadata: TaskMetadata) -> list[int]:
        """Return the seeds recorded for ``task_metadata.id``.

        Returns:
            A new list with the task's seeds, or an empty list when the task id was
            not produced by get_all_tasks_agents().
        """
        self._ensure_loaded()
        # Compare against None explicitly: an empty mapping is falsy but still a
        # legitimately loaded cache and must not trip the invariant check.
        assert self._cache is not None
        # Hand out a copy so callers cannot mutate the cached seed lists.
        return list(self._cache.get(task_metadata.id, ()))
55
+
56
+
57
class WorkArenaBenchmark(Benchmark):
    """CUBE Benchmark exposing the WorkArena ServiceNow tasks.

    Every task type from every level (l1, l2, l3) is loaded by default. Narrow the
    selection in user code with named_subset() or subset_from_glob():

        bench.named_subset("l1")                                                  # L1 only
        bench.named_subset("l2").subset_from_glob("in_human_curriculum", "True")  # L2 human curriculum

    Required environment variables:
        SNOW_INSTANCE_URL, SNOW_INSTANCE_UNAME, SNOW_INSTANCE_PWD
        or HUGGING_FACE_HUB_TOKEN for the hosted instance pool.

    task_metadata.json ships with the package and holds only lightweight public fields
    (level, in_human_curriculum, task_class_path). There is no heavy execution data:
    the task logic itself comes from the browsergym-workarena library at runtime.

    Developers can regenerate task_metadata.json with:
        scripts/generate_task_metadata.py
    """

    benchmark_metadata: ClassVar[BenchmarkMetadata] = BenchmarkMetadata(
        name="workarena-cube",
        version="1.0.0",
        description=(
            "WorkArena ServiceNow benchmark tasks across three levels. "
            "By default all task types from all levels are loaded. "
            "Use named_subset('l1'/'l2'/'l3') to filter by level. "
            "For human curriculum: bench.named_subset('l2').subset_from_glob('in_human_curriculum', 'True')."
        ),
        tags=["browser", "web", "servicenow"],
        named_subsets={
            "l1": ("level", "l1"),
            "l2": ("level", "l2"),
            "l3": ("level", "l3"),
        },
        num_tasks=333,
    )
    task_metadata: ClassVar[dict[str, WorkArenaTaskMetadata]]  # type: ignore - populated automatically at import time in Benchmark.__init_subclass__
    task_config_class: ClassVar[type[TaskConfig]] = WorkArenaTaskConfig

    # Forwarded to WorkArenaSeedGenerator when no seed_generator is supplied.
    meta_seed: int = 42
    n_seeds_l1: int = 10
    is_agent_curriculum: bool = True

    @model_validator(mode="after")
    def _init_seed_generator(self) -> "WorkArenaBenchmark":
        """Install a WorkArenaSeedGenerator built from this benchmark's fields, unless one is already set."""
        if self.seed_generator is not None:
            return self
        generator = WorkArenaSeedGenerator(
            meta_seed=self.meta_seed,
            n_seeds_l1=self.n_seeds_l1,
            is_agent_curriculum=self.is_agent_curriculum,
        )
        # Assigned through object.__setattr__, exactly as the field is set at construction time.
        object.__setattr__(self, "seed_generator", generator)
        return self

    # ── lifecycle ──────────────────────────────────────────────────

    def _setup(self) -> None:
        """Log readiness only; WorkArena tasks connect to a remote ServiceNow instance, so nothing is started here."""
        logger.info(f"WorkArena benchmark ready with {len(self.task_metadata)} tasks")

    def close(self) -> None:
        """Log shutdown only; there is no server process to stop."""
        logger.info("WorkArena benchmark closed.")
@@ -0,0 +1,69 @@
1
+ """Smoke-test script for workarena-cube — validates infrastructure without an LLM.
2
+
3
+ Verifies that WorkArena task configs can be enumerated, tasks can be instantiated,
4
+ and the tool + WorkArena episode lifecycle run without errors.
5
+
6
+ Requires ServiceNow credentials (SNOW_INSTANCE_URL, SNOW_INSTANCE_UNAME,
7
+ SNOW_INSTANCE_PWD) or HUGGING_FACE_HUB_TOKEN for the hosted instance pool.
8
+
9
+ Public API (cube.testing protocol)
10
+ -----------------------------------
11
+ get_debug_benchmark() -> WorkArenaBenchmark
12
+ make_debug_agent(task_id: str) -> CheatAgent
13
+
14
+ Usage:
15
+ uv run cube test workarena-cube
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import logging
21
+ import sys
22
+ from cube.core import Action, ActionSchema, Observation
23
+ from cube.testing import run_debug_suite
24
+
25
+ from workarena_cube.benchmark import WorkArenaBenchmark
26
+ from workarena_cube.tools import WorkArenaCheatToolConfig
27
+
28
+ logger = logging.getLogger(__name__)
29
+
30
+ _DEBUG_N_TASKS = 2
31
+
32
+
33
class CheatAgent:
    """Scripted agent: issues WorkArena's cheat action once, then ends the episode."""

    def __init__(self, task_id: str) -> None:
        self._task_id = task_id
        self._cheated: bool = False

    def __call__(self, obs: Observation, action_set: list[ActionSchema]) -> Action:
        """Return ``workarena_cheat`` on the first call and ``final_step`` on every later call."""
        action_name = "final_step" if self._cheated else "workarena_cheat"
        # Remember that the cheat has been issued so subsequent calls terminate.
        self._cheated = True
        return Action(name=action_name, arguments={})
45
+
46
+
47
def make_debug_agent(task_id: str) -> CheatAgent:
    """Build the CheatAgent used to drive the debug run for ``task_id``."""
    agent = CheatAgent(task_id)
    return agent
49
+
50
+
51
def get_debug_benchmark() -> WorkArenaBenchmark:
    """Return a small L1-only benchmark wired to the cheat tool.

    The benchmark uses one seed per L1 task and keeps only the first
    ``_DEBUG_N_TASKS`` task ids of the "l1" subset.
    """
    cheat_tool_config = WorkArenaCheatToolConfig()
    full_benchmark = WorkArenaBenchmark(n_seeds_l1=1, default_tool_config=cheat_tool_config)
    level_one = full_benchmark.named_subset("l1")
    # Iterating the mapping yields its keys (task ids) in insertion order.
    selected_ids = list(level_one.task_metadata)[:_DEBUG_N_TASKS]
    return level_one.subset_from_list(selected_ids)  # type: ignore
59
+
60
+
61
if __name__ == "__main__":
    # Import this module under its package name so it can be handed to run_debug_suite().
    import workarena_cube.debug as _this_module

    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)-8s %(name)s %(message)s")

    results = run_debug_suite("workarena-cube", _this_module)
    failed = [result for result in results if result["error"]]

    # Exit status 1 when at least one result carries an error, 0 otherwise.
    if failed:
        sys.exit(1)
    sys.exit(0)
@@ -0,0 +1,235 @@
1
+ """WorkArena task implementation for the CUBE framework."""
2
+
3
+ import importlib
4
+ import logging
5
+ import time
6
+ from typing import Any, List, Literal, override
7
+
8
+ from browsergym.workarena.tasks.base import AbstractServiceNowTask
9
+ from cube.benchmark import RuntimeContext
10
+ from cube.container import ContainerBackend
11
+ from cube.core import Action, ActionSchema, EnvironmentOutput, Observation
12
+ from cube.task import Task, TaskConfig, TaskMetadata
13
+ from cube.tool import Toolbox
14
+ from cube.tools.browser import BrowserTool
15
+ from cube_browser_playwright import Viewport
16
+ from cube_chat_tool import ChatTool
17
+ from workarena_cube.tools import WorkArenaCheatTool, WorkArenaInfeasibleTool, WorkArenaBrowserTool
18
+ from pydantic import PrivateAttr
19
+
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
class WorkArenaTaskMetadata(TaskMetadata):
    """Per-task metadata for WorkArena ServiceNow tasks.

    These are the public fields shipped in task_metadata.json and available at
    import time. No heavy execution data is stored: the task logic is resolved
    at runtime from the browsergym-workarena library through ``task_class_path``.
    """

    level: Literal["l1", "l2", "l3"]
    """Task level: l1 = atomic, l2 = compositional, l3 = extended compositional."""

    in_human_curriculum: bool
    """Whether this task type is part of the human evaluation curriculum."""

    task_class_path: str
    """Dotted path to the WorkArena task class, e.g. 'browsergym.workarena.tasks.dashboard.MultiChartValueRetrievalTask'."""
40
+
41
+
42
class WorkArenaTask(Task):
    """CUBE Task wrapper for WorkArena ServiceNow tasks.

    reset() instantiates the WorkArena task class named by ``metadata.task_class_path``
    and runs its setup() on the browser tool's page. evaluate() and finished() both
    delegate to the WorkArena task's validate(), sharing one cached result per step.
    """

    metadata: WorkArenaTaskMetadata  # type: ignore[assignment]
    seed: int
    # Seconds slept in reset() after setup(), before the first page observation is taken.
    wait_first_page_time: float = 10.0
    # NOTE(review): not read anywhere inside this class — confirm where it is consumed.
    validate_per_step: bool = True

    _workarena_task: AbstractServiceNowTask | None = PrivateAttr(default=None)
    # Result of the last validate() call; reset to None by reset() and step().
    _validate_cache: tuple[Any, ...] | None = PrivateAttr(default=None)

    @property
    def _browser_tool(self) -> WorkArenaBrowserTool:
        """Resolve the browser tool whether it is the task's tool itself or lives inside a Toolbox.

        Raises:
            RuntimeError: if the Toolbox holds no BrowserTool, or the resolved tool is not
                a WorkArenaBrowserTool.
        """
        if isinstance(self.tool, Toolbox):
            tool = self.tool.find_tool(BrowserTool)
            if tool is None:
                raise RuntimeError("No BrowserTool found in Toolbox")
        else:
            tool = self.tool
        if not isinstance(tool, WorkArenaBrowserTool):
            raise RuntimeError(
                f"The browser tool must satisfy the WorkArenaBrowserTool protocol (e.g., BrowsergymTool or SyncPlaywrightTool), got {type(tool).__name__}"
            )
        return tool

    @property
    def _chat_tool(self) -> ChatTool | None:
        """Return the ChatTool if present in a Toolbox, else None."""
        if isinstance(self.tool, Toolbox):
            return self.tool.find_tool(ChatTool)  # type: ignore
        return None

    @property
    def _infeasible_tool(self) -> WorkArenaInfeasibleTool | None:
        """Return the WorkArenaInfeasibleTool if present in a Toolbox, else None."""
        if isinstance(self.tool, Toolbox):
            tool = self.tool.find_tool(WorkArenaInfeasibleTool)
            return tool if isinstance(tool, WorkArenaInfeasibleTool) else None
        return None

    def reset(self) -> tuple[Observation, dict[str, Any]]:
        """Instantiate and set up the WorkArena task, returning the initial observation.

        Returns:
            ``(obs, info)`` where ``obs`` is the goal (or the chat transcript when a
            ChatTool is available) followed by the page observation, and ``info``
            holds the task id, task class name, seed and goal merged with the info
            dict returned by the WorkArena task's setup().
        """
        task_class = _load_task_class(self.metadata.task_class_path)
        # Keep a local reference: it is never None, unlike the Optional attribute.
        workarena_task = task_class(seed=self.seed)
        self._workarena_task = workarena_task
        # Task-level browser preferences must be applied before the tool is reset.
        _apply_task_runtime_preferences(self._browser_tool, workarena_task)
        self.tool.reset()
        self._validate_cache = None
        if isinstance(self._browser_tool, WorkArenaCheatTool):
            # The cheat tool needs the task instance to run its cheat action.
            self._browser_tool._workarena_task = workarena_task
        page = self._browser_tool.page
        goal, task_info = workarena_task.setup(page)

        logger.info(f"WorkArena page URL after setup: {page.url}")
        logger.info(f"WorkArena page title: {page.title()}")
        logger.info(f"WorkArena task class: {workarena_task.__class__.__name__}")

        self._browser_tool.noop()
        time.sleep(self.wait_first_page_time)
        logger.info(f"WorkArena task goal: {goal}")

        page_obs = self._browser_tool.page_obs()
        chat = self._chat_tool
        if chat is not None:
            # With a chat tool, the goal is delivered as the first user message.
            chat.add_message("user", goal)
            obs = Observation.from_text(chat.chat_obs()) + page_obs
        else:
            obs = Observation.from_text(goal) + page_obs
        info = {
            "task_id": self.id,
            "task_class": task_class.__name__,
            "seed": self.seed,
            "goal": goal,
            **task_info,
        }
        return obs, info

    @property
    def _chat_messages(self) -> list[dict]:
        """Return the messages passed to WorkArena's validate().

        A new list is built on every access:

        * Cheat path (no ChatTool and the browser tool is a WorkArenaCheatTool):
          the entries of the cheat tool's ``_chat_messages_ref``.
        * Normal path (ChatTool present): the entries of ``chat.messages``.
        * In both cases the WorkArenaInfeasibleTool's messages, if that tool is
          present, are appended last.

        Because the entries are copied into a fresh list, appending to the returned
        list does not modify the lists owned by the tools.
        """
        messages: list[dict] = []
        if self._chat_tool is None and isinstance(self._browser_tool, WorkArenaCheatTool):
            messages.extend(self._browser_tool._chat_messages_ref)
        elif (chat := self._chat_tool) is not None:
            messages.extend(chat.messages)
        if (infeasible := self._infeasible_tool) is not None:
            messages.extend(infeasible.messages)
        return messages

    def _validate(self) -> tuple[float, bool, str, dict]:
        """Call WorkArena's validate(), caching the result for the current step.

        evaluate() and finished() both go through this method; the cache makes them
        share a single validate() call. The cache is invalidated by step() (before
        the action is executed) and by reset(), so each step gets a fresh call.

        Raises:
            RuntimeError: if reset() has not been called yet.
        """
        if self._workarena_task is None:
            raise RuntimeError("WorkArena task is not initialized. Call reset() first.")
        if self._validate_cache is None:
            page = self._browser_tool.page
            self._validate_cache = self._workarena_task.validate(page, self._chat_messages)  # type: ignore : Workarena validators expect list[dict] despite the protocol specifying list[str].
        return self._validate_cache  # type: ignore[return-value]

    @override
    def step(self, action: Action | List[Action]) -> EnvironmentOutput:
        """Drop the cached validation result, then delegate to the base step()."""
        self._validate_cache = None
        return super().step(action)

    def evaluate(self, obs: Observation | None = None) -> tuple[float, dict[str, Any]]:
        """Score the current task state via WorkArena's validate()."""
        reward, done, _user_message, task_info = self._validate()
        return reward, {"done": done, **task_info}

    def finished(self, obs: Observation | None = None) -> bool:
        """Check if the task is done via WorkArena's validate(); False before reset()."""
        if self._workarena_task is None:
            return False
        _reward, done, _user_message, _task_info = self._validate()
        return done

    def filter_actions(self, actions: list[ActionSchema]) -> list[ActionSchema]:
        """Hide ``send_message`` / ``report_infeasible`` when the matching tool is absent."""
        if self._chat_tool is None:
            actions = [a for a in actions if a.name != "send_message"]
        if self._infeasible_tool is None:
            actions = [a for a in actions if a.name != "report_infeasible"]
        return actions

    def close(self) -> None:
        """Teardown the WorkArena task (best effort) and close the tool."""
        if self._workarena_task is not None:
            try:
                self._workarena_task.teardown()
            except Exception as e:
                # Best effort: a failing teardown must not prevent the tool from closing,
                # but keep the traceback so the failure can be diagnosed.
                logger.warning(f"Error during WorkArena task teardown: {e}", exc_info=True)
            finally:
                self._workarena_task = None
        super().close()
190
+
191
+
192
class WorkArenaTaskConfig(TaskConfig):
    """Serializable configuration for a single WorkArena task."""

    def make(
        self,
        runtime_context: RuntimeContext | None = None,
        container_backend: ContainerBackend | None = None,
    ) -> WorkArenaTask:
        """Build the WorkArenaTask described by this config.

        Args:
            runtime_context: Accepted for interface compatibility; unused.
            container_backend: Accepted for interface compatibility; unused.

        Returns:
            A WorkArenaTask for ``self.task_id``, seeded with ``self.seed`` or 42 when
            no seed is set.

        Raises:
            KeyError: if ``self.task_id`` is not in WorkArenaBenchmark.task_metadata.
            ValueError: if no tool_config is set.
        """
        # Import here to avoid circular import (benchmark imports task)
        from workarena_cube.benchmark import WorkArenaBenchmark

        _ = runtime_context, container_backend
        meta = WorkArenaBenchmark.task_metadata[self.task_id]
        # Explicit raise instead of assert so the check also runs under ``python -O``.
        if not self.tool_config:
            raise ValueError(f"WorkArenaTaskConfig requires a tool_config, got {self.tool_config}")
        return WorkArenaTask(
            metadata=meta,
            tool_config=self.tool_config,
            seed=self.seed if self.seed is not None else 42,
        )
211
+
212
+
213
+ def _load_task_class(class_path: str) -> type:
214
+ """Reconstruct a task class from its dotted module-qualified name."""
215
+ module_name, class_name = class_path.rsplit(".", 1)
216
+ module = importlib.import_module(module_name)
217
+ return getattr(module, class_name)
218
+
219
+
220
def _apply_task_runtime_preferences(tool: WorkArenaBrowserTool, workarena_task: AbstractServiceNowTask) -> None:
    """Copy the task's browser preferences onto the tool config where the user set nothing.

    For each of ``slow_mo``, ``timeout``, ``locale`` and ``timezone_id``, a non-None
    value on the task is used when the field is not in the browser config's
    ``model_fields_set``. ``viewport`` is handled the same way, accepting either a
    dict (converted to Viewport) or a Viewport instance. The browser config is
    replaced with an updated copy only when at least one override was collected.
    """
    current = tool.config.browser
    user_set = current.model_fields_set
    overrides: dict[str, Any] = {}

    for name in ("slow_mo", "timeout", "locale", "timezone_id"):
        if name in user_set:
            continue  # an explicit user choice always wins
        preferred = getattr(workarena_task, name, None)
        if preferred is None:
            continue
        overrides[name] = preferred

    if "viewport" not in user_set:
        task_viewport = getattr(workarena_task, "viewport", None)
        if isinstance(task_viewport, dict):
            overrides["viewport"] = Viewport(**task_viewport)
        elif isinstance(task_viewport, Viewport):
            overrides["viewport"] = task_viewport

    if not overrides:
        return
    tool.config.browser = current.model_copy(update=overrides)