openadapt-ml 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openadapt_ml/baselines/__init__.py +121 -0
- openadapt_ml/baselines/adapter.py +185 -0
- openadapt_ml/baselines/cli.py +314 -0
- openadapt_ml/baselines/config.py +448 -0
- openadapt_ml/baselines/parser.py +922 -0
- openadapt_ml/baselines/prompts.py +787 -0
- openadapt_ml/benchmarks/__init__.py +13 -115
- openadapt_ml/benchmarks/agent.py +265 -421
- openadapt_ml/benchmarks/azure.py +28 -19
- openadapt_ml/benchmarks/azure_ops_tracker.py +521 -0
- openadapt_ml/benchmarks/cli.py +1722 -4847
- openadapt_ml/benchmarks/trace_export.py +631 -0
- openadapt_ml/benchmarks/viewer.py +22 -5
- openadapt_ml/benchmarks/vm_monitor.py +530 -29
- openadapt_ml/benchmarks/waa_deploy/Dockerfile +47 -53
- openadapt_ml/benchmarks/waa_deploy/api_agent.py +21 -20
- openadapt_ml/cloud/azure_inference.py +3 -5
- openadapt_ml/cloud/lambda_labs.py +722 -307
- openadapt_ml/cloud/local.py +2038 -487
- openadapt_ml/cloud/ssh_tunnel.py +68 -26
- openadapt_ml/datasets/next_action.py +40 -30
- openadapt_ml/evals/grounding.py +8 -3
- openadapt_ml/evals/plot_eval_metrics.py +15 -13
- openadapt_ml/evals/trajectory_matching.py +41 -26
- openadapt_ml/experiments/demo_prompt/format_demo.py +16 -6
- openadapt_ml/experiments/demo_prompt/run_experiment.py +26 -16
- openadapt_ml/experiments/representation_shootout/__init__.py +70 -0
- openadapt_ml/experiments/representation_shootout/conditions.py +708 -0
- openadapt_ml/experiments/representation_shootout/config.py +390 -0
- openadapt_ml/experiments/representation_shootout/evaluator.py +659 -0
- openadapt_ml/experiments/representation_shootout/runner.py +687 -0
- openadapt_ml/experiments/waa_demo/runner.py +29 -14
- openadapt_ml/export/parquet.py +36 -24
- openadapt_ml/grounding/detector.py +18 -14
- openadapt_ml/ingest/__init__.py +8 -6
- openadapt_ml/ingest/capture.py +25 -22
- openadapt_ml/ingest/loader.py +7 -4
- openadapt_ml/ingest/synthetic.py +189 -100
- openadapt_ml/models/api_adapter.py +14 -4
- openadapt_ml/models/base_adapter.py +10 -2
- openadapt_ml/models/providers/__init__.py +288 -0
- openadapt_ml/models/providers/anthropic.py +266 -0
- openadapt_ml/models/providers/base.py +299 -0
- openadapt_ml/models/providers/google.py +376 -0
- openadapt_ml/models/providers/openai.py +342 -0
- openadapt_ml/models/qwen_vl.py +46 -19
- openadapt_ml/perception/__init__.py +35 -0
- openadapt_ml/perception/integration.py +399 -0
- openadapt_ml/retrieval/demo_retriever.py +50 -24
- openadapt_ml/retrieval/embeddings.py +9 -8
- openadapt_ml/retrieval/retriever.py +3 -1
- openadapt_ml/runtime/__init__.py +50 -0
- openadapt_ml/runtime/policy.py +18 -5
- openadapt_ml/runtime/safety_gate.py +471 -0
- openadapt_ml/schema/__init__.py +9 -0
- openadapt_ml/schema/converters.py +74 -27
- openadapt_ml/schema/episode.py +31 -18
- openadapt_ml/scripts/capture_screenshots.py +530 -0
- openadapt_ml/scripts/compare.py +85 -54
- openadapt_ml/scripts/demo_policy.py +4 -1
- openadapt_ml/scripts/eval_policy.py +15 -9
- openadapt_ml/scripts/make_gif.py +1 -1
- openadapt_ml/scripts/prepare_synthetic.py +3 -1
- openadapt_ml/scripts/train.py +21 -9
- openadapt_ml/segmentation/README.md +920 -0
- openadapt_ml/segmentation/__init__.py +97 -0
- openadapt_ml/segmentation/adapters/__init__.py +5 -0
- openadapt_ml/segmentation/adapters/capture_adapter.py +420 -0
- openadapt_ml/segmentation/annotator.py +610 -0
- openadapt_ml/segmentation/cache.py +290 -0
- openadapt_ml/segmentation/cli.py +674 -0
- openadapt_ml/segmentation/deduplicator.py +656 -0
- openadapt_ml/segmentation/frame_describer.py +788 -0
- openadapt_ml/segmentation/pipeline.py +340 -0
- openadapt_ml/segmentation/schemas.py +622 -0
- openadapt_ml/segmentation/segment_extractor.py +634 -0
- openadapt_ml/training/azure_ops_viewer.py +1097 -0
- openadapt_ml/training/benchmark_viewer.py +52 -41
- openadapt_ml/training/shared_ui.py +7 -7
- openadapt_ml/training/stub_provider.py +57 -35
- openadapt_ml/training/trainer.py +143 -86
- openadapt_ml/training/trl_trainer.py +70 -21
- openadapt_ml/training/viewer.py +323 -108
- openadapt_ml/training/viewer_components.py +180 -0
- {openadapt_ml-0.2.0.dist-info → openadapt_ml-0.2.2.dist-info}/METADATA +215 -14
- openadapt_ml-0.2.2.dist-info/RECORD +116 -0
- openadapt_ml/benchmarks/base.py +0 -366
- openadapt_ml/benchmarks/data_collection.py +0 -432
- openadapt_ml/benchmarks/live_tracker.py +0 -180
- openadapt_ml/benchmarks/runner.py +0 -418
- openadapt_ml/benchmarks/waa.py +0 -761
- openadapt_ml/benchmarks/waa_live.py +0 -619
- openadapt_ml-0.2.0.dist-info/RECORD +0 -86
- {openadapt_ml-0.2.0.dist-info → openadapt_ml-0.2.2.dist-info}/WHEEL +0 -0
- {openadapt_ml-0.2.0.dist-info → openadapt_ml-0.2.2.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,619 +0,0 @@
|
|
|
1
|
-
"""Windows Agent Arena Live adapter.
|
|
2
|
-
|
|
3
|
-
This module provides a live HTTP-based adapter for WAA that connects to the
|
|
4
|
-
WAA Flask server running inside a Windows VM. Unlike WAAAdapter which imports
|
|
5
|
-
WAA's DesktopEnv locally, this adapter talks to the server remotely.
|
|
6
|
-
|
|
7
|
-
Architecture:
|
|
8
|
-
The adapter uses WAA's element-based execution model:
|
|
9
|
-
1. Fetch accessibility tree from /accessibility endpoint
|
|
10
|
-
2. Extract element bboxes and POST to /update_computer as rects dict
|
|
11
|
-
3. Agent outputs actions with target_node_id (element-based grounding)
|
|
12
|
-
4. Execute via /execute_windows using computer.mouse.move_id(id) commands
|
|
13
|
-
|
|
14
|
-
This keeps grounding authority on WAA side - we send element IDs,
|
|
15
|
-
not pixel coordinates. WAA's Computer class handles the grounding.
|
|
16
|
-
|
|
17
|
-
Example:
|
|
18
|
-
from openadapt_ml.benchmarks.waa_live import WAALiveAdapter, WAALiveConfig
|
|
19
|
-
|
|
20
|
-
adapter = WAALiveAdapter(WAALiveConfig(server_url="http://vm-ip:5000"))
|
|
21
|
-
agent = DemoConditionedAgent(base_agent, retriever)
|
|
22
|
-
results = evaluate_agent_on_benchmark(agent, adapter, max_steps=15)
|
|
23
|
-
"""
|
|
24
|
-
|
|
25
|
-
from __future__ import annotations
|
|
26
|
-
|
|
27
|
-
import base64
|
|
28
|
-
import io
|
|
29
|
-
import logging
|
|
30
|
-
import time
|
|
31
|
-
from dataclasses import dataclass, field
|
|
32
|
-
from typing import Any
|
|
33
|
-
|
|
34
|
-
import requests
|
|
35
|
-
|
|
36
|
-
from openadapt_ml.benchmarks.base import (
|
|
37
|
-
BenchmarkAction,
|
|
38
|
-
BenchmarkAdapter,
|
|
39
|
-
BenchmarkObservation,
|
|
40
|
-
BenchmarkResult,
|
|
41
|
-
BenchmarkTask,
|
|
42
|
-
)
|
|
43
|
-
|
|
44
|
-
logger = logging.getLogger(__name__)
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
@dataclass
class WAALiveConfig:
    """Settings consumed by WAALiveAdapter.

    Every field has a default, so ``WAALiveConfig()`` describes a WAA server
    on localhost. Field order is part of the positional constructor
    signature and must not be changed.
    """

    # Base URL of the WAA Flask server (e.g., "http://172.171.112.41:5000").
    server_url: str = "http://localhost:5000"

    # Accessibility backend requested from the server ("uia" or "win32").
    a11y_backend: str = "uia"

    # Screen size in pixels; used for the reported viewport and for scaling
    # normalized drag coordinates.
    screen_width: int = 1920
    screen_height: int = 1200

    # Default maximum number of steps per task.
    max_steps: int = 15

    # Seconds to wait after each action so the UI can settle.
    action_delay: float = 0.5

    # Timeout in seconds for action-execution requests.
    timeout: float = 90.0
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
class WAALiveAdapter(BenchmarkAdapter):
|
|
71
|
-
"""Live WAA adapter that connects to WAA Flask server over HTTP.
|
|
72
|
-
|
|
73
|
-
Unlike WAAAdapter which imports WAA's DesktopEnv locally, this adapter
|
|
74
|
-
talks to the WAA server remotely via HTTP. This enables:
|
|
75
|
-
- Running DemoConditionedAgent from local machine
|
|
76
|
-
- Using our own VLM (Claude/GPT) instead of WAA's built-in navi agent
|
|
77
|
-
- Injecting demos into prompts before each action
|
|
78
|
-
|
|
79
|
-
Args:
|
|
80
|
-
config: WAALiveConfig with server URL and settings.
|
|
81
|
-
"""
|
|
82
|
-
|
|
83
|
-
def __init__(self, config: WAALiveConfig | None = None):
    """Create an adapter with empty per-episode state.

    Args:
        config: Adapter settings. When a falsy value (normally None) is
            given, a default ``WAALiveConfig()`` is used instead.
    """
    self.config = config if config else WAALiveConfig()

    # Episode bookkeeping.
    self._current_task: BenchmarkTask | None = None
    self._actions: list[BenchmarkAction] = []
    self._step_count = 0

    # Cache of the most recent observation fetched from the server.
    self._current_screenshot: bytes | None = None
    self._current_a11y: dict | None = None
    # element_id -> [l, t, r, b]
    self._current_rects: dict[str, list[int]] = {}
|
|
91
|
-
|
|
92
|
-
@property
def name(self) -> str:
    """Identifier under which this benchmark adapter is reported."""
    benchmark_name = "waa-live"
    return benchmark_name
|
|
96
|
-
|
|
97
|
-
@property
def benchmark_type(self) -> str:
    """Kind of benchmark this adapter drives; always "interactive"."""
    kind = "interactive"
    return kind
|
|
101
|
-
|
|
102
|
-
@property
def supports_parallel(self) -> bool:
    """Report whether tasks may run concurrently.

    Always False: the adapter currently targets a single VM.
    """
    parallel_ok = False
    return parallel_ok
|
|
106
|
-
|
|
107
|
-
def check_connection(self) -> bool:
    """Probe the WAA server.

    Issues a GET to ``<server_url>/probe`` with a 5 second timeout.

    Returns:
        True only when the request completes with HTTP 200; False for any
        other status or when the request raises a requests error.
    """
    probe_url = f"{self.config.server_url}/probe"
    try:
        response = requests.get(probe_url, timeout=5.0)
    except requests.RequestException:
        return False
    return response.status_code == 200
|
|
121
|
-
|
|
122
|
-
def list_tasks(self, domain: str | None = None) -> list[BenchmarkTask]:
    """Return the (empty) catalogue of tasks.

    The live adapter does not enumerate tasks; callers are expected to use
    load_task() directly. ``domain`` is accepted for interface
    compatibility and ignored.

    Returns:
        A new empty list on every call.
    """
    no_tasks: list[BenchmarkTask] = []
    return no_tasks
|
|
129
|
-
|
|
130
|
-
def load_task(self, task_id: str) -> BenchmarkTask:
    """Build a minimal task record for ``task_id``.

    No task configuration is fetched: the instruction is a placeholder and
    the domain is guessed from the text before the first underscore.

    Args:
        task_id: Task identifier.

    Returns:
        BenchmarkTask whose step limit is the configured ``max_steps``.
    """
    placeholder_instruction = f"Task {task_id}"

    # "chrome_123" -> "chrome"; identifiers without "_" have no known domain.
    if "_" in task_id:
        domain = task_id.split("_", 1)[0]
    else:
        domain = "unknown"

    return BenchmarkTask(
        task_id=task_id,
        instruction=placeholder_instruction,
        domain=domain,
        time_limit_steps=self.config.max_steps,
    )
|
|
147
|
-
|
|
148
|
-
def reset(self, task: BenchmarkTask) -> BenchmarkObservation:
    """Reset environment to task's initial state.

    Clears all per-episode state (including the cached observation from any
    previous task), asks the server to close open windows, runs the task's
    setup commands if it has a raw_config, waits one second for the UI to
    settle, then fetches a fresh observation.

    Args:
        task: Task to initialize.

    Returns:
        Initial observation (screenshot + accessibility tree).

    Raises:
        RuntimeError: If server is not reachable.
    """
    if not self.check_connection():
        raise RuntimeError(
            f"Cannot connect to WAA server at {self.config.server_url}. "
            "Ensure Windows VM is running and server is started."
        )

    self._current_task = task
    self._step_count = 0
    self._actions = []
    # Drop the previous episode's observation cache so stale element rects
    # can never be used to ground the first action of the new task.
    self._current_a11y = None
    self._current_rects = {}
    self._current_screenshot = None

    # Best effort: a failure here is logged but does not abort the reset.
    try:
        resp = requests.post(
            f"{self.config.server_url}/setup/close_all",
            timeout=30.0,
        )
    except requests.RequestException as e:
        logger.warning(f"Failed to close windows: {e}")
    else:
        # Only claim success when the server did not answer with an error.
        if resp.ok:
            logger.info("Closed all windows for clean state")
        else:
            logger.warning(f"Failed to close windows: HTTP {resp.status_code}")

    # If task has setup commands in raw_config, execute them
    if task.raw_config:
        self._run_task_setup(task.raw_config)

    # Small delay for UI to settle
    time.sleep(1.0)

    return self._get_observation()
|
|
188
|
-
|
|
189
|
-
def step(
    self, action: BenchmarkAction
) -> tuple[BenchmarkObservation, bool, dict[str, Any]]:
    """Execute action and return new observation.

    The action is translated to a Python command string and POSTed to the
    server's /execute_windows endpoint. Execution problems (HTTP errors,
    transport errors, unparseable responses) are logged and do not raise;
    the step still counts and a fresh observation is still returned.

    Args:
        action: Action to execute.

    Returns:
        Tuple of (observation, done, info). ``done`` is True when the action
        type is "done" or the configured max_steps has been reached. ``info``
        carries the step number and the command that was sent (None when the
        action needed no execution).
    """
    self._step_count += 1
    self._actions.append(action)

    # Translate action to element-based command for WAA's Computer
    command = self._translate_action(action)

    # Execute command via /execute_windows (has access to computer object)
    if command:
        try:
            resp = requests.post(
                f"{self.config.server_url}/execute_windows",
                json={"command": command},
                timeout=self.config.timeout,
            )
            if resp.status_code != 200:
                logger.error(f"Execute failed ({resp.status_code}): {resp.text}")
            else:
                result = resp.json()
                # The payload is only inspected when it is a JSON object.
                stderr = result.get("stderr") if isinstance(result, dict) else None
                if stderr:
                    logger.warning(f"Command stderr: {stderr}")
                logger.debug(f"Executed: {command}")
        except requests.RequestException as e:
            logger.error(f"Execute request failed: {e}")
        except ValueError as e:
            # Older requests releases raise a plain ValueError from .json()
            # when a 200 response body is not valid JSON.
            logger.error(f"Execute response was not valid JSON: {e}")

    # Wait for UI to settle
    time.sleep(self.config.action_delay)

    # Check if done
    done = action.type == "done" or self._step_count >= self.config.max_steps

    obs = self._get_observation()
    info = {
        "step": self._step_count,
        "command": command,
    }

    return obs, done, info
|
|
244
|
-
|
|
245
|
-
def evaluate(self, task: BenchmarkTask) -> BenchmarkResult:
    """Produce a placeholder result for ``task``.

    Real evaluation needs WAA's evaluators, which are not wired in yet, so
    ``success`` is always False. The score is 0.5 when at least one action
    was recorded and one of them was a "done" action, otherwise 0.0.

    Args:
        task: Task to evaluate.

    Returns:
        BenchmarkResult with the placeholder success/score and step count.
    """
    # TODO: Implement proper evaluation by calling WAA evaluators
    recorded = self._actions
    declared_done = False
    for past_action in recorded:
        if past_action.type == "done":
            declared_done = True
            break

    if len(recorded) > 0 and declared_done:
        score = 0.5
    else:
        score = 0.0

    return BenchmarkResult(
        task_id=task.task_id,
        success=False,  # Can't determine without evaluator
        score=score,
        num_steps=self._step_count,
        reason="Evaluation requires WAA evaluators (not yet implemented)",
    )
|
|
269
|
-
|
|
270
|
-
def close(self) -> None:
    """Clean up resources.

    Drops the current task, the recorded actions and the whole cached
    observation (accessibility tree, element rects and screenshot bytes) so
    nothing from the finished episode stays referenced. The step counter is
    left untouched.
    """
    self._current_task = None
    self._actions = []
    self._current_a11y = None
    # Also release the element rects and raw screenshot bytes; without this
    # they outlive the episode they were captured in.
    self._current_rects = {}
    self._current_screenshot = None
|
|
275
|
-
|
|
276
|
-
def _get_observation(self) -> BenchmarkObservation:
    """Fetch current observation from WAA server.

    Requests /screenshot and /accessibility. Each request is best effort:
    failures are logged and the corresponding observation field is None.
    When element rects are available they are pushed to the server through
    _update_waa_computer() so element-based grounding works for subsequent
    actions.

    Returns:
        BenchmarkObservation with screenshot and accessibility tree.
    """
    screenshot = None
    a11y_tree = None

    # Get screenshot
    try:
        resp = requests.get(
            f"{self.config.server_url}/screenshot",
            timeout=30.0,
        )
        if resp.status_code == 200:
            screenshot = resp.content
            self._current_screenshot = screenshot
            logger.debug(f"Got screenshot: {len(screenshot)} bytes")
        else:
            logger.warning(f"Screenshot request failed: {resp.status_code}")
    except requests.RequestException as e:
        logger.error(f"Screenshot request error: {e}")

    # Get accessibility tree
    try:
        resp = requests.get(
            f"{self.config.server_url}/accessibility",
            params={"backend": self.config.a11y_backend},
            timeout=30.0,
        )
        if resp.status_code == 200:
            result = resp.json()
            if isinstance(result, dict):
                a11y_tree = result.get("AT", {})
                self._current_a11y = a11y_tree
                # Extract rects for element-based grounding
                self._current_rects = self._extract_rects_from_a11y(a11y_tree)
                logger.debug(
                    "Got accessibility tree with %d elements",
                    len(self._current_rects),
                )
            else:
                logger.warning(
                    "A11y response was not a JSON object: %s",
                    type(result).__name__,
                )
        else:
            logger.warning(f"A11y request failed: {resp.status_code}")
    except requests.RequestException as e:
        logger.error(f"A11y request error: {e}")
    except ValueError as e:
        # Older requests releases raise a plain ValueError from .json()
        # when the body is not valid JSON.
        logger.error(f"A11y response was not valid JSON: {e}")

    # NOTE(review): when the a11y fetch fails, the rects cached from the
    # previous observation are kept and re-sent below - confirm whether
    # that is intended or whether they should be cleared.
    if self._current_rects:
        self._update_waa_computer()

    return BenchmarkObservation(
        screenshot=screenshot,
        viewport=(self.config.screen_width, self.config.screen_height),
        accessibility_tree=a11y_tree,
        window_title=self._extract_window_title(a11y_tree),
    )
|
|
332
|
-
|
|
333
|
-
def _extract_window_title(self, a11y_tree: dict | str | None) -> str | None:
    """Look up a window title on the root of the accessibility tree.

    Args:
        a11y_tree: Root node as a dict, the raw XML string, or None.

    Returns:
        The value of the first of "Name", "name", "title", "Title" present
        on the root node. None when the tree is empty/missing, is an XML
        string (not parsed here), or has none of those keys.
    """
    if not a11y_tree or isinstance(a11y_tree, str):
        return None

    candidate_keys = ("Name", "name", "title", "Title")
    return next(
        (a11y_tree[field_name] for field_name in candidate_keys if field_name in a11y_tree),
        None,
    )
|
|
345
|
-
|
|
346
|
-
def _extract_rects_from_a11y(self, a11y_tree: dict | None) -> dict[str, list[int]]:
    """Extract element ID -> bounding box mapping from accessibility tree.

    Walks the tree depth-first (node before its children) and records every
    node that has both an ID and a usable bounding box. Nodes with a
    malformed bounding box are skipped instead of aborting the extraction.

    Recognised fields:
        * ID: "id", "Id", "ID", "AutomationId" (first non-empty one; the
          integer 0 counts as a valid ID).
        * Box: "bbox", "BoundingRectangle", "Rect", "rect" (first present).
          Accepted shapes are a 4-item list/tuple, a dict with
          x/y/width/height, or a "l,t,r,b" comma-separated string.
        * Children: "children" and "Children".

    Args:
        a11y_tree: Accessibility tree from /accessibility endpoint. None,
            an XML string or any non-dict value yields an empty mapping.

    Returns:
        Dict mapping element IDs to [left, top, right, bottom] bounding boxes.
    """
    rects: dict[str, list[int]] = {}

    def element_id(node: dict) -> str | None:
        """Return the node's ID as a string, or None when it has none."""
        for id_field in ("id", "Id", "ID", "AutomationId"):
            value = node.get(id_field)
            is_zero = (
                isinstance(value, int)
                and not isinstance(value, bool)
                and value == 0
            )
            if value or is_zero:
                return str(value)
        return None

    def to_ltrb(bbox: object) -> list[int] | None:
        """Normalize a raw box to four ints, or None when it is unusable."""
        try:
            if isinstance(bbox, (list, tuple)):
                # Could be [l, t, r, b] or [l, t, w, h] - assume [l, t, r, b]
                coords = [int(v) for v in bbox]
            elif isinstance(bbox, dict):
                x = int(bbox.get("x", 0))
                y = int(bbox.get("y", 0))
                w = int(bbox.get("width", 0))
                h = int(bbox.get("height", 0))
                coords = [x, y, x + w, y + h]
            elif isinstance(bbox, str):
                coords = [int(part) for part in bbox.split(",")]
            else:
                return None
        except (TypeError, ValueError):
            return None
        return coords if len(coords) == 4 else None

    def visit(node: dict) -> None:
        elem_id = element_id(node)

        bbox = None
        for bbox_field in ("bbox", "BoundingRectangle", "Rect", "rect"):
            if bbox_field in node:
                bbox = node[bbox_field]
                break

        if elem_id is not None and bbox is not None:
            box = to_ltrb(bbox)
            if box is not None:
                rects[elem_id] = box

        for child_field in ("children", "Children"):
            children = node.get(child_field, [])
            if isinstance(children, list):
                for child in children:
                    if isinstance(child, dict):
                        visit(child)

    if a11y_tree:
        # Handle case where a11y_tree is XML string (WAA returns XML)
        if isinstance(a11y_tree, str):
            # TODO: Parse XML to dict if needed for element grounding
            logger.debug("A11y tree is XML string, skipping rect extraction")
            return rects
        if isinstance(a11y_tree, dict):
            visit(a11y_tree)

    logger.debug(f"Extracted {len(rects)} element rects from a11y tree")
    return rects
|
|
410
|
-
|
|
411
|
-
def _update_waa_computer(self) -> None:
    """Push the cached element rects and screenshot to /update_computer.

    The payload contains the rects, a full-screen window rect built from the
    configured screen size, the base64-encoded screenshot (empty string when
    none is cached) and a fixed scale of [1.0, 1.0]. Nothing is sent when no
    rects are cached. Failures are logged, never raised.
    """
    rects = self._current_rects
    if not rects:
        logger.warning("No rects to update - skipping /update_computer")
        return

    raw_image = self._current_screenshot
    encoded_image = base64.b64encode(raw_image).decode("utf-8") if raw_image else ""

    # Window rect (full screen for now)
    full_screen = [0, 0, self.config.screen_width, self.config.screen_height]

    body = {
        "rects": rects,
        "window_rect": full_screen,
        "screenshot": encoded_image,
        "scale": [1.0, 1.0],
    }
    endpoint = f"{self.config.server_url}/update_computer"

    try:
        reply = requests.post(endpoint, json=body, timeout=30.0)
        if reply.status_code != 200:
            logger.warning(f"update_computer failed: {reply.status_code} - {reply.text}")
        else:
            logger.debug("Updated WAA computer with %d rects", len(self._current_rects))
    except requests.RequestException as e:
        logger.error(f"update_computer request error: {e}")
|
|
448
|
-
|
|
449
|
-
def _run_task_setup(self, raw_config: dict) -> None:
    """Run task setup commands from raw_config.

    Reads the "setup" entry of ``raw_config`` (falling back to "init").
    When it is a dict, optionally launches an application ("app" or
    "application") via /setup/launch and then runs each entry of
    "commands" through /execute_windows with the powershell shell.
    Every request is best effort: failures are logged as warnings.

    Args:
        raw_config: Task configuration with setup commands.
    """
    # Handle different setup command formats
    setup = raw_config.get("setup", raw_config.get("init", {}))
    if not isinstance(setup, dict):
        return

    # Launch application if specified (an empty/None value means "none").
    app = setup.get("app") or setup.get("application")
    if app:
        try:
            resp = requests.post(
                f"{self.config.server_url}/setup/launch",
                json={"app": app},
                timeout=30.0,
            )
        except requests.RequestException as e:
            logger.warning(f"Failed to launch app: {e}")
        else:
            # Only report success when the server did not return an error.
            if resp.ok:
                logger.info(f"Launched app: {app}")
            else:
                logger.warning(f"Failed to launch app: HTTP {resp.status_code}")

    # Run shell commands if specified
    commands = setup.get("commands") or []
    if isinstance(commands, str):
        # A bare string is one command, not a sequence of characters.
        commands = [commands]

    for cmd in commands:
        try:
            resp = requests.post(
                f"{self.config.server_url}/execute_windows",
                json={"command": cmd, "shell": "powershell"},
                timeout=60.0,
            )
        except requests.RequestException as e:
            logger.warning(f"Setup command failed: {e}")
        else:
            if resp.ok:
                logger.info(f"Ran setup command: {str(cmd)[:50]}...")
            else:
                logger.warning(f"Setup command failed: HTTP {resp.status_code}")
|
|
484
|
-
|
|
485
|
-
def _translate_action(self, action: BenchmarkAction) -> str | None:
    """Translate BenchmarkAction to element-based command for WAA's Computer.

    The result is Python source text meant to be executed by the server's
    /execute_windows endpoint. Every value interpolated into a string
    literal goes through ``repr`` so quotes, backslashes, newlines and
    other control characters always produce valid source.

    Supported action types:
        * "done": no command.
        * "wait": sleep one second on the server.
        * "click" / "double_click" / "right_click": delegated to
          _translate_click_action().
        * "type": pyautogui.write of action.text.
        * "key": delegated to _translate_key_action().
        * "scroll": computer.mouse.scroll in action.scroll_direction
          (default "down").
        * "drag": move to the element named by target_node_id, then drag to
          (end_x, end_y); float coordinates in [0, 1] are scaled by the
          configured screen size.

    Args:
        action: The action to translate.

    Returns:
        Python command string to execute via /execute_windows endpoint,
        or None for actions that don't need execution (including unknown
        types and drags without a known target element).
    """
    kind = action.type

    if kind == "done":
        return None

    if kind == "wait":
        return "import time; time.sleep(1)"

    click_methods = {
        "click": "single_click",
        "double_click": "double_click",
        "right_click": "right_click",
    }
    if kind in click_methods:
        return self._translate_click_action(action, click_methods[kind])

    if kind == "type":
        text = str(action.text or "")
        # Use pyautogui for typing (no grounding needed)
        return f"import pyautogui; pyautogui.write({text!r}, interval=0.02)"

    if kind == "key":
        return self._translate_key_action(action)

    if kind == "scroll":
        direction = str(action.scroll_direction or "down")
        return f"computer.mouse.scroll({direction!r})"

    if kind == "drag":
        # Drag starts at an element and ends at explicit coordinates.
        if action.target_node_id is not None:
            elem_id = str(action.target_node_id)
            if elem_id in self._current_rects:

                def to_pixels(value, extent):
                    """Scale a normalized float in [0, 1]; else truncate."""
                    value = value or 0
                    if isinstance(value, float) and 0 <= value <= 1:
                        return int(value * extent)
                    return int(value)

                end_x = to_pixels(action.end_x, self.config.screen_width)
                end_y = to_pixels(action.end_y, self.config.screen_height)
                return (
                    f"computer.mouse.move_id({elem_id!r}); "
                    f"computer.mouse.drag({end_x}, {end_y})"
                )
        logger.warning("Drag requires target_node_id with valid element")
        return None

    logger.warning(f"Unknown action type: {action.type}")
    return None
|
|
550
|
-
|
|
551
|
-
def _translate_click_action(self, action: BenchmarkAction, click_method: str) -> str:
    """Translate click-type action to element-based command.

    When the action names an element that is present in the cached rects,
    the pointer is moved with computer.mouse.move_id(). Otherwise the
    action's x/y are passed unchanged to computer.mouse.move_abs(); missing
    coordinates default to 0.

    Args:
        action: The click action.
        click_method: "single_click", "double_click", or "right_click".

    Returns:
        Python command string that moves the pointer and then clicks.
    """
    # Prefer element ID for grounding (SoM mode)
    if action.target_node_id is not None:
        elem_id = str(action.target_node_id)
        if elem_id in self._current_rects:
            # repr() keeps the literal valid even if the ID contains quotes
            # or backslashes.
            return f"computer.mouse.move_id({elem_id!r}); computer.mouse.{click_method}()"
        logger.warning(f"Element ID '{elem_id}' not found in rects, falling back to coordinates")

    # Fallback: use coordinates if provided (less precise). Values are sent
    # as given; normalized floats are left for move_abs to interpret.
    # NOTE(review): with neither element nor coordinates this clicks at
    # (0, 0) - confirm that is acceptable.
    x = action.x if action.x is not None else 0
    y = action.y if action.y is not None else 0

    return f"computer.mouse.move_abs({x}, {y}); computer.mouse.{click_method}()"
|
|
580
|
-
|
|
581
|
-
def _translate_key_action(self, action: BenchmarkAction) -> str:
    """Translate key press action using pyautogui (no grounding needed).

    The key name is mapped to its pyautogui spelling (unknown names are
    lower-cased). With modifiers the command is a pyautogui.hotkey() call
    listing the modifiers first; without, a pyautogui.press() call. All key
    names are emitted with ``repr`` so keys such as ``'`` or ``\\`` yield
    valid source.

    Args:
        action: Action whose ``key`` and optional ``modifiers`` are read.

    Returns:
        Python command string for the /execute_windows endpoint.
    """
    key = action.key or ""

    # Map common key names to pyautogui names
    key_map = {
        "Enter": "enter",
        "Return": "enter",
        "Tab": "tab",
        "Escape": "escape",
        "Esc": "escape",
        "Backspace": "backspace",
        "Delete": "delete",
        "Del": "delete",
        "Space": "space",
        "Up": "up",
        "Down": "down",
        "Left": "left",
        "Right": "right",
        "Home": "home",
        "End": "end",
        "PageUp": "pageup",
        "PageDown": "pagedown",
    }
    key_map.update({f"F{n}": f"f{n}" for n in range(1, 13)})
    key = key_map.get(key, key.lower())

    # Handle modifiers with hotkey
    if action.modifiers:
        mod_map = {"control": "ctrl", "command": "win", "meta": "win"}
        mods = [mod_map.get(m.lower(), m.lower()) for m in action.modifiers]
        keys_str = ", ".join(repr(k) for k in [*mods, key])
        return f"import pyautogui; pyautogui.hotkey({keys_str})"

    return f"import pyautogui; pyautogui.press({key!r})"
|