openadapt-ml 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. openadapt_ml/benchmarks/__init__.py +8 -0
  2. openadapt_ml/benchmarks/agent.py +90 -11
  3. openadapt_ml/benchmarks/azure.py +35 -6
  4. openadapt_ml/benchmarks/cli.py +4449 -201
  5. openadapt_ml/benchmarks/live_tracker.py +180 -0
  6. openadapt_ml/benchmarks/runner.py +41 -4
  7. openadapt_ml/benchmarks/viewer.py +1219 -0
  8. openadapt_ml/benchmarks/vm_monitor.py +610 -0
  9. openadapt_ml/benchmarks/waa.py +61 -4
  10. openadapt_ml/benchmarks/waa_deploy/Dockerfile +222 -0
  11. openadapt_ml/benchmarks/waa_deploy/__init__.py +10 -0
  12. openadapt_ml/benchmarks/waa_deploy/api_agent.py +539 -0
  13. openadapt_ml/benchmarks/waa_deploy/start_waa_server.bat +53 -0
  14. openadapt_ml/benchmarks/waa_live.py +619 -0
  15. openadapt_ml/cloud/local.py +1555 -1
  16. openadapt_ml/cloud/ssh_tunnel.py +553 -0
  17. openadapt_ml/datasets/next_action.py +87 -68
  18. openadapt_ml/evals/grounding.py +26 -8
  19. openadapt_ml/evals/trajectory_matching.py +84 -36
  20. openadapt_ml/experiments/demo_prompt/__init__.py +19 -0
  21. openadapt_ml/experiments/demo_prompt/format_demo.py +226 -0
  22. openadapt_ml/experiments/demo_prompt/results/experiment_20251231_002125.json +83 -0
  23. openadapt_ml/experiments/demo_prompt/results/experiment_n30_20251231_165958.json +1100 -0
  24. openadapt_ml/experiments/demo_prompt/results/multistep_20251231_025051.json +182 -0
  25. openadapt_ml/experiments/demo_prompt/run_experiment.py +531 -0
  26. openadapt_ml/experiments/waa_demo/__init__.py +10 -0
  27. openadapt_ml/experiments/waa_demo/demos.py +357 -0
  28. openadapt_ml/experiments/waa_demo/runner.py +717 -0
  29. openadapt_ml/experiments/waa_demo/tasks.py +151 -0
  30. openadapt_ml/export/__init__.py +9 -0
  31. openadapt_ml/export/__main__.py +6 -0
  32. openadapt_ml/export/cli.py +89 -0
  33. openadapt_ml/export/parquet.py +265 -0
  34. openadapt_ml/ingest/__init__.py +3 -4
  35. openadapt_ml/ingest/capture.py +89 -81
  36. openadapt_ml/ingest/loader.py +116 -68
  37. openadapt_ml/ingest/synthetic.py +221 -159
  38. openadapt_ml/retrieval/README.md +226 -0
  39. openadapt_ml/retrieval/USAGE.md +391 -0
  40. openadapt_ml/retrieval/__init__.py +91 -0
  41. openadapt_ml/retrieval/demo_retriever.py +817 -0
  42. openadapt_ml/retrieval/embeddings.py +629 -0
  43. openadapt_ml/retrieval/index.py +194 -0
  44. openadapt_ml/retrieval/retriever.py +160 -0
  45. openadapt_ml/runtime/policy.py +10 -10
  46. openadapt_ml/schema/__init__.py +104 -0
  47. openadapt_ml/schema/converters.py +541 -0
  48. openadapt_ml/schema/episode.py +457 -0
  49. openadapt_ml/scripts/compare.py +26 -16
  50. openadapt_ml/scripts/eval_policy.py +4 -5
  51. openadapt_ml/scripts/prepare_synthetic.py +14 -17
  52. openadapt_ml/scripts/train.py +81 -70
  53. openadapt_ml/training/benchmark_viewer.py +3225 -0
  54. openadapt_ml/training/trainer.py +120 -363
  55. openadapt_ml/training/trl_trainer.py +354 -0
  56. {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.0.dist-info}/METADATA +102 -60
  57. openadapt_ml-0.2.0.dist-info/RECORD +86 -0
  58. openadapt_ml/schemas/__init__.py +0 -53
  59. openadapt_ml/schemas/sessions.py +0 -122
  60. openadapt_ml/schemas/validation.py +0 -252
  61. openadapt_ml-0.1.0.dist-info/RECORD +0 -55
  62. {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.0.dist-info}/WHEEL +0 -0
  63. {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,619 @@
1
+ """Windows Agent Arena Live adapter.
2
+
3
+ This module provides a live HTTP-based adapter for WAA that connects to the
4
+ WAA Flask server running inside a Windows VM. Unlike WAAAdapter which imports
5
+ WAA's DesktopEnv locally, this adapter talks to the server remotely.
6
+
7
+ Architecture:
8
+ The adapter uses WAA's element-based execution model:
9
+ 1. Fetch accessibility tree from /accessibility endpoint
10
+ 2. Extract element bboxes and POST to /update_computer as rects dict
11
+ 3. Agent outputs actions with target_node_id (element-based grounding)
12
+ 4. Execute via /execute_windows using computer.mouse.move_id(id) commands
13
+
14
+ This keeps grounding authority on WAA side - we send element IDs,
15
+ not pixel coordinates. WAA's Computer class handles the grounding.
16
+
17
+ Example:
18
+ from openadapt_ml.benchmarks.waa_live import WAALiveAdapter, WAALiveConfig
19
+
20
+ adapter = WAALiveAdapter(WAALiveConfig(server_url="http://vm-ip:5000"))
21
+ agent = DemoConditionedAgent(base_agent, retriever)
22
+ results = evaluate_agent_on_benchmark(agent, adapter, max_steps=15)
23
+ """
24
+
25
+ from __future__ import annotations
26
+
27
+ import base64
28
+ import io
29
+ import logging
30
+ import time
31
+ from dataclasses import dataclass, field
32
+ from typing import Any
33
+
34
+ import requests
35
+
36
+ from openadapt_ml.benchmarks.base import (
37
+ BenchmarkAction,
38
+ BenchmarkAdapter,
39
+ BenchmarkObservation,
40
+ BenchmarkResult,
41
+ BenchmarkTask,
42
+ )
43
+
44
+ logger = logging.getLogger(__name__)
45
+
46
+
47
@dataclass
class WAALiveConfig:
    """Connection and timing settings consumed by WAALiveAdapter.

    Every field has a default, so ``WAALiveConfig()`` targets a WAA Flask
    server listening on localhost.

    Attributes:
        server_url: Base URL of the WAA Flask server
            (e.g., "http://172.171.112.41:5000"). Endpoint paths are
            appended to it directly.
        a11y_backend: Accessibility backend requested from the server
            ("uia" or "win32").
        screen_width: Width of the VM screen, in pixels.
        screen_height: Height of the VM screen, in pixels.
        max_steps: Default step budget for a task.
        action_delay: Seconds to pause after an action so the UI can settle.
        timeout: Request timeout, in seconds.
    """

    # Field order is part of the positional constructor signature.
    server_url: str = "http://localhost:5000"
    a11y_backend: str = "uia"
    screen_width: int = 1920
    screen_height: int = 1200
    max_steps: int = 15
    action_delay: float = 0.5
    timeout: float = 90.0
68
+
69
+
70
class WAALiveAdapter(BenchmarkAdapter):
    """Benchmark adapter that drives a remote WAA Flask server over HTTP.

    WAAAdapter imports WAA's DesktopEnv locally; this adapter instead talks
    to the WAA server remotely. That makes it possible to:
    - Run DemoConditionedAgent from the local machine
    - Use our own VLM (Claude/GPT) instead of WAA's built-in navi agent
    - Inject demos into prompts before each action

    Args:
        config: WAALiveConfig with server URL and settings. A default
            configuration is created when omitted.
    """

    def __init__(self, config: WAALiveConfig | None = None):
        """Store the configuration and start with empty per-episode state."""
        self.config = config if config else WAALiveConfig()

        # Episode bookkeeping.
        self._current_task: BenchmarkTask | None = None
        self._step_count = 0
        self._actions: list[BenchmarkAction] = []

        # Most recent observation data fetched from the server.
        self._current_a11y: dict | None = None
        # element_id -> [l, t, r, b]
        self._current_rects: dict[str, list[int]] = {}
        self._current_screenshot: bytes | None = None
91
+
92
@property
def name(self) -> str:
    """Identifier of this benchmark adapter."""
    return "waa-live"
96
+
97
@property
def benchmark_type(self) -> str:
    """Kind of benchmark this adapter exposes (always "interactive")."""
    return "interactive"
101
+
102
@property
def supports_parallel(self) -> bool:
    """Report that tasks cannot be run in parallel."""
    # Single VM for now
    return False
106
+
107
def check_connection(self) -> bool:
    """Probe the WAA server to see whether it is reachable.

    Issues a GET to the server's /probe endpoint with a 5 second timeout.

    Returns:
        True only when the probe answers with HTTP 200; False for any other
        status or when the request raises ``requests.RequestException``.
    """
    probe_url = f"{self.config.server_url}/probe"
    try:
        response = requests.get(probe_url, timeout=5.0)
    except requests.RequestException:
        return False
    return response.status_code == 200
121
+
122
def list_tasks(self, domain: str | None = None) -> list[BenchmarkTask]:
    """Enumerate WAA tasks (none are enumerated by the live adapter).

    Tasks are typically loaded on demand for the live adapter, so this
    always yields a fresh empty list; call load_task() directly instead.

    Args:
        domain: Accepted for interface compatibility; not used here.

    Returns:
        An empty list.
    """
    no_tasks: list[BenchmarkTask] = []
    return no_tasks
129
+
130
def load_task(self, task_id: str) -> BenchmarkTask:
    """Build a minimal BenchmarkTask for the given identifier.

    No task configuration is read here: the instruction is the placeholder
    "Task <task_id>" and the step limit comes from the adapter config.
    Actual task configs should be loaded from the WAA repo if needed.

    Args:
        task_id: Task identifier. The text before its first underscore is
            used as the domain; without an underscore the domain is
            "unknown".

    Returns:
        BenchmarkTask object.
    """
    prefix, separator, _remainder = task_id.partition("_")
    domain = prefix if separator else "unknown"
    placeholder_instruction = f"Task {task_id}"
    return BenchmarkTask(
        task_id=task_id,
        instruction=placeholder_instruction,
        domain=domain,
        time_limit_steps=self.config.max_steps,
    )
147
+
148
def reset(self, task: BenchmarkTask) -> BenchmarkObservation:
    """Reset environment to task's initial state.

    Verifies the server is reachable, clears per-episode state, asks the
    server to close all windows (best effort), runs any setup found in
    ``task.raw_config`` and returns a fresh observation.

    Args:
        task: Task to initialize.

    Returns:
        Initial observation (screenshot + accessibility tree).

    Raises:
        RuntimeError: If server is not reachable.
    """
    if not self.check_connection():
        raise RuntimeError(
            f"Cannot connect to WAA server at {self.config.server_url}. "
            f"Ensure Windows VM is running and server is started."
        )

    self._current_task = task
    self._step_count = 0
    self._actions = []
    # Drop observation data from any previous episode so that stale element
    # rects can never be used for grounding in the new one.
    self._current_a11y = None
    self._current_rects = {}
    self._current_screenshot = None

    # Try to close all windows for clean state. This is best effort: a
    # failure is logged but does not abort the reset.
    try:
        resp = requests.post(
            f"{self.config.server_url}/setup/close_all",
            timeout=30.0,
        )
    except requests.RequestException as e:
        logger.warning(f"Failed to close windows: {e}")
    else:
        # Only claim success when the server actually reported it.
        if resp.status_code == 200:
            logger.info("Closed all windows for clean state")
        else:
            logger.warning(f"Failed to close windows: HTTP {resp.status_code}")

    # If task has setup commands in raw_config, execute them
    if task.raw_config:
        self._run_task_setup(task.raw_config)

    # Small delay for UI to settle
    time.sleep(1.0)

    return self._get_observation()
188
+
189
def step(
    self, action: BenchmarkAction
) -> tuple[BenchmarkObservation, bool, dict[str, Any]]:
    """Execute action and return new observation.

    Uses element-based grounding via WAA's Computer class. Click actions
    are translated to computer.mouse.move_id(id) commands that WAA executes
    using the rects we POSTed to /update_computer.

    Execution problems (transport errors, non-200 responses, unreadable
    response bodies) are logged and never raised, so an episode continues
    with a fresh observation even when a command fails.

    Args:
        action: Action to execute.

    Returns:
        Tuple of (observation, done, info). ``done`` is True when the action
        type is "done" or the configured ``max_steps`` has been reached.
        ``info`` carries the step number and the command string (or None).
    """
    self._step_count += 1
    self._actions.append(action)

    # Translate action to element-based command for WAA's Computer
    command = self._translate_action(action)

    # Execute command via /execute_windows (has access to computer object)
    if command:
        try:
            resp = requests.post(
                f"{self.config.server_url}/execute_windows",
                json={"command": command},
                timeout=self.config.timeout,
            )
        except requests.RequestException as e:
            logger.error(f"Execute request failed: {e}")
        else:
            if resp.status_code != 200:
                logger.error(f"Execute failed ({resp.status_code}): {resp.text}")
            else:
                # The body is only inspected for diagnostics; a malformed
                # body must not abort the step.
                try:
                    result = resp.json()
                except ValueError:
                    result = None
                    logger.warning("Execute response was not valid JSON")
                if isinstance(result, dict) and result.get("stderr"):
                    logger.warning(f"Command stderr: {result['stderr']}")
                logger.debug(f"Executed: {command}")

    # Wait for UI to settle
    time.sleep(self.config.action_delay)

    # Check if done
    done = (
        action.type == "done"
        or self._step_count >= self.config.max_steps
    )

    obs = self._get_observation()
    info = {
        "step": self._step_count,
        "command": command,
    }

    return obs, done, info
244
+
245
def evaluate(self, task: BenchmarkTask) -> BenchmarkResult:
    """Produce a placeholder result for the finished episode.

    Full evaluation requires running WAA's evaluators, which is not done
    here. ``success`` is therefore always False; the score is 0.5 when the
    recorded actions include a "done" action and 0.0 otherwise.

    Args:
        task: Task to evaluate.

    Returns:
        BenchmarkResult with success/score.
    """
    # TODO: Implement proper evaluation by calling WAA evaluators
    # A "done" action can only be present if at least one action was taken,
    # so a single scan covers both conditions.
    signalled_done = False
    for recorded in self._actions:
        if recorded.type == "done":
            signalled_done = True
            break

    placeholder_score = 0.5 if signalled_done else 0.0

    return BenchmarkResult(
        task_id=task.task_id,
        success=False,  # Can't determine without evaluator
        score=placeholder_score,
        num_steps=self._step_count,
        reason="Evaluation requires WAA evaluators (not yet implemented)",
    )
269
+
270
def close(self) -> None:
    """Clean up resources.

    Drops the current task, the recorded actions and all cached observation
    data (accessibility tree, element rects, screenshot bytes) so nothing
    from a finished episode is kept alive or reused. The step counter is
    left untouched.
    """
    self._current_task = None
    self._current_a11y = None
    self._actions = []
    # Cached grounding data and screenshot bytes are per-observation state;
    # release them too so a closed adapter holds no stale element rects.
    self._current_rects = {}
    self._current_screenshot = None
275
+
276
def _get_observation(self) -> BenchmarkObservation:
    """Fetch current observation from WAA server.

    Requests the screenshot and the accessibility tree, extracts element
    rects from the tree and, when any were found, pushes them to WAA's
    Computer so element-based grounding works for subsequent actions.

    The cached screenshot, tree and rects always mirror this fetch: when a
    request fails, the corresponding cache is cleared rather than keeping
    data from an earlier step, so outdated rects are never paired with a
    newer screen. Fetch failures are logged, not raised.

    Returns:
        BenchmarkObservation with screenshot and accessibility tree (either
        may be None when its request failed).
    """
    base_url = self.config.server_url

    # Get screenshot
    screenshot = None
    try:
        resp = requests.get(f"{base_url}/screenshot", timeout=30.0)
        if resp.status_code == 200:
            screenshot = resp.content
            logger.debug(f"Got screenshot: {len(screenshot)} bytes")
        else:
            logger.warning(f"Screenshot request failed: {resp.status_code}")
    except requests.RequestException as e:
        logger.error(f"Screenshot request error: {e}")
    self._current_screenshot = screenshot

    # Get accessibility tree
    a11y_tree = None
    rects: dict[str, list[int]] = {}
    try:
        resp = requests.get(
            f"{base_url}/accessibility",
            params={"backend": self.config.a11y_backend},
            timeout=30.0,
        )
        if resp.status_code == 200:
            result = resp.json()
            if isinstance(result, dict):
                a11y_tree = result.get("AT", {})
                # Extract rects for element-based grounding
                rects = self._extract_rects_from_a11y(a11y_tree)
                logger.debug("Got accessibility tree with %d elements", len(rects))
            else:
                logger.warning(
                    "A11y response was not a JSON object: %s",
                    type(result).__name__,
                )
        else:
            logger.warning(f"A11y request failed: {resp.status_code}")
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a body that is not valid JSON as well as
        # unparseable values met while extracting rects.
        logger.error(f"A11y request error: {e}")
    self._current_a11y = a11y_tree
    self._current_rects = rects

    # Update WAA's Computer with current rects for element grounding
    if rects:
        self._update_waa_computer()

    return BenchmarkObservation(
        screenshot=screenshot,
        viewport=(self.config.screen_width, self.config.screen_height),
        accessibility_tree=a11y_tree,
        window_title=self._extract_window_title(a11y_tree),
    )
332
+
333
def _extract_window_title(self, a11y_tree: dict | str | None) -> str | None:
    """Look up a window title on the root of the accessibility tree.

    Args:
        a11y_tree: Accessibility tree as a dict, an XML string, or None.

    Returns:
        The value stored under the first of "Name", "name", "title" or
        "Title" that is present on the root, or None when the tree is
        empty, is an XML string (not parsed here), or has none of those
        keys.
    """
    # Empty trees and raw XML strings carry no directly readable title.
    if not a11y_tree or isinstance(a11y_tree, str):
        return None

    candidate_keys = ("Name", "name", "title", "Title")
    return next(
        (a11y_tree[field_name] for field_name in candidate_keys if field_name in a11y_tree),
        None,
    )
345
+
346
def _extract_rects_from_a11y(self, a11y_tree: dict | None) -> dict[str, list[int]]:
    """Extract element ID -> bounding box mapping from accessibility tree.

    This produces the `rects` dict that WAA's Computer class expects.
    The rects are then POSTed to /update_computer so WAA can handle grounding.

    The tree is walked iteratively in pre-order (so arbitrarily deep trees
    cannot exhaust the recursion limit). For each node the first non-empty
    value among "id", "Id", "ID", "AutomationId" is the element ID, and the
    first present key among "bbox", "BoundingRectangle", "Rect", "rect" is
    the bounding box. Accepted box shapes:

    - a 4-item list/tuple, assumed to already be [l, t, r, b];
    - a dict with left/top/right/bottom, or with x/y/width/height
      (missing x/y/width/height entries default to 0);
    - a comma-separated string of four numbers.

    Nodes whose box cannot be interpreted are skipped instead of raising.
    When several nodes share an ID, the one visited last wins.

    Args:
        a11y_tree: Accessibility tree from /accessibility endpoint.

    Returns:
        Dict mapping element IDs to [left, top, right, bottom] bounding boxes.
        Empty when the tree is missing, is an XML string, or is not a dict.
    """
    rects: dict[str, list[int]] = {}

    def to_ltrb(raw: Any) -> list[int] | None:
        """Normalize one raw bbox value to integer [l, t, r, b], or None."""
        try:
            if isinstance(raw, (list, tuple)):
                # Could be [l, t, r, b] or [l, t, w, h] - assume [l, t, r, b]
                return [int(v) for v in raw] if len(raw) == 4 else None
            if isinstance(raw, dict):
                edges = ("left", "top", "right", "bottom")
                if all(edge in raw for edge in edges):
                    return [int(raw[edge]) for edge in edges]
                x = raw.get("x", 0)
                y = raw.get("y", 0)
                w = raw.get("width", 0)
                h = raw.get("height", 0)
                return [int(x), int(y), int(x + w), int(y + h)]
            if isinstance(raw, str):
                parts = [int(float(p)) for p in raw.split(",")]
                return parts if len(parts) == 4 else None
        except (TypeError, ValueError, OverflowError):
            # Non-numeric, None or non-finite component: unusable box.
            return None
        return None

    if a11y_tree:
        # Handle case where a11y_tree is XML string (WAA returns XML)
        if isinstance(a11y_tree, str):
            # TODO: Parse XML to dict if needed for element grounding
            logger.debug("A11y tree is XML string, skipping rect extraction")
            return rects

        # Explicit stack instead of recursion; children are pushed reversed
        # so nodes are still processed in document (pre-order) order.
        pending = [a11y_tree] if isinstance(a11y_tree, dict) else []
        while pending:
            node = pending.pop()

            # Get element ID. Only None and "" count as missing, so a
            # numeric ID of 0 is kept.
            elem_id = None
            for id_field in ("id", "Id", "ID", "AutomationId"):
                value = node.get(id_field)
                if value is not None and value != "":
                    elem_id = str(value)
                    break

            # Get bounding box
            bbox = None
            for bbox_field in ("bbox", "BoundingRectangle", "Rect", "rect"):
                if bbox_field in node:
                    bbox = node[bbox_field]
                    break

            # Store if we have both ID and a usable bbox
            if elem_id is not None and bbox is not None:
                box = to_ltrb(bbox)
                if box is not None:
                    rects[elem_id] = box

            # Queue children
            kids = []
            for child_field in ("children", "Children"):
                children = node.get(child_field, [])
                if isinstance(children, list):
                    kids.extend(c for c in children if isinstance(c, dict))
            pending.extend(reversed(kids))

    logger.debug(f"Extracted {len(rects)} element rects from a11y tree")
    return rects
410
+
411
def _update_waa_computer(self) -> None:
    """Send the cached rects and screenshot to WAA's /update_computer.

    Keeps WAA's Computer object in step with the element state held by this
    adapter so that computer.mouse.move_id(id) can work. Nothing is sent
    when there are no cached rects. Failures are logged, never raised.
    """
    rects = self._current_rects
    if not rects:
        logger.warning("No rects to update - skipping /update_computer")
        return

    # The screenshot travels as base64 text; an empty string stands in
    # when no screenshot is cached.
    raw_image = self._current_screenshot
    encoded_image = base64.b64encode(raw_image).decode("utf-8") if raw_image else ""

    # Window rect (full screen for now)
    full_screen = [0, 0, self.config.screen_width, self.config.screen_height]

    body = {
        "rects": rects,
        "window_rect": full_screen,
        "screenshot": encoded_image,
        "scale": [1.0, 1.0],
    }
    endpoint = f"{self.config.server_url}/update_computer"

    try:
        resp = requests.post(endpoint, json=body, timeout=30.0)
        if resp.status_code != 200:
            logger.warning(f"update_computer failed: {resp.status_code} - {resp.text}")
        else:
            logger.debug("Updated WAA computer with %d rects", len(self._current_rects))
    except requests.RequestException as e:
        logger.error(f"update_computer request error: {e}")
448
+
449
def _run_task_setup(self, raw_config: dict) -> None:
    """Run task setup commands from raw_config.

    Reads the "setup" entry of the config (falling back to "init"). Only a
    dict-shaped setup is acted on; any other shape is ignored. Supported
    keys:

    - "app" / "application": posted to /setup/launch when non-empty.
    - "commands": each command is posted to /execute_windows with
      ``shell="powershell"``. A single string is treated as one command.

    All requests are best effort: failures and non-200 responses are logged
    as warnings and the remaining setup still runs.

    Args:
        raw_config: Task configuration with setup commands.
    """
    # Handle different setup command formats
    setup = raw_config.get("setup", raw_config.get("init", {}))
    if not isinstance(setup, dict):
        return

    # Launch application if specified (an empty/None value means "none").
    app = setup.get("app") or setup.get("application")
    if app:
        try:
            resp = requests.post(
                f"{self.config.server_url}/setup/launch",
                json={"app": app},
                timeout=30.0,
            )
        except requests.RequestException as e:
            logger.warning(f"Failed to launch app: {e}")
        else:
            # Only report success when the server confirmed it.
            if resp.status_code == 200:
                logger.info(f"Launched app: {app}")
            else:
                logger.warning(f"Failed to launch app: HTTP {resp.status_code}")

    # Run shell commands if specified
    commands = setup.get("commands") or []
    if isinstance(commands, str):
        # Iterating a bare string would send it one character at a time.
        commands = [commands]

    for cmd in commands:
        preview = str(cmd)[:50]
        try:
            resp = requests.post(
                f"{self.config.server_url}/execute_windows",
                json={"command": cmd, "shell": "powershell"},
                timeout=60.0,
            )
        except requests.RequestException as e:
            logger.warning(f"Setup command failed: {e}")
        else:
            if resp.status_code == 200:
                logger.info(f"Ran setup command: {preview}...")
            else:
                logger.warning(f"Setup command failed: HTTP {resp.status_code}")
484
+
485
def _translate_action(self, action: BenchmarkAction) -> str | None:
    """Translate BenchmarkAction to element-based command for WAA's Computer.

    Uses WAA's Computer class via /execute_windows endpoint. Click actions
    use computer.mouse.move_id(id) for element-based grounding - the actual
    coordinates are resolved by WAA's Computer class using the rects we
    POSTed to /update_computer.

    Every value taken from the action (typed text, scroll direction,
    element ID) is embedded with ``repr()`` so the generated command is
    always a syntactically valid Python snippet, whatever characters the
    value contains (quotes, backslashes, newlines, ...).

    Args:
        action: The action to translate.

    Returns:
        Python command string to execute via /execute_windows endpoint,
        or None for actions that don't need execution ("done"), cannot be
        translated (drag without a known element) or have an unknown type.
    """
    kind = action.type

    if kind == "done":
        return None

    if kind == "wait":
        return "import time; time.sleep(1)"

    # Action type -> mouse method for the three click flavours.
    click_methods = {
        "click": "single_click",
        "double_click": "double_click",
        "right_click": "right_click",
    }
    if kind in click_methods:
        return self._translate_click_action(action, click_methods[kind])

    if kind == "type":
        text = action.text or ""
        # Use pyautogui for typing (no grounding needed). repr() yields a
        # complete, correctly escaped string literal.
        return f"import pyautogui; pyautogui.write({text!r}, interval=0.02)"

    if kind == "key":
        return self._translate_key_action(action)

    if kind == "scroll":
        direction = action.scroll_direction or "down"
        return f"computer.mouse.scroll({direction!r})"

    if kind == "drag":
        # Drag requires start and end - the start must be a known element.
        if action.target_node_id is not None:
            elem_id = str(action.target_node_id)
            if elem_id in self._current_rects:
                # Start at element, drag to end coords
                end_x = action.end_x or 0
                end_y = action.end_y or 0
                # Floats in [0, 1] are treated as normalized and scaled
                # to pixels using the configured screen size.
                if isinstance(end_x, float) and 0 <= end_x <= 1:
                    end_x = int(end_x * self.config.screen_width)
                if isinstance(end_y, float) and 0 <= end_y <= 1:
                    end_y = int(end_y * self.config.screen_height)
                return (
                    f"computer.mouse.move_id({elem_id!r}); "
                    f"computer.mouse.drag({int(end_x)}, {int(end_y)})"
                )
        logger.warning("Drag requires target_node_id with valid element")
        return None

    logger.warning(f"Unknown action type: {action.type}")
    return None
550
+
551
def _translate_click_action(self, action: BenchmarkAction, click_method: str) -> str:
    """Translate click-type action to element-based command.

    When the action names an element that is present in the cached rects,
    the command moves to that element by ID. Otherwise the action's x/y are
    passed to computer.mouse.move_abs unchanged; a missing coordinate falls
    back to 0 and a warning is logged.

    Args:
        action: The click action.
        click_method: "single_click", "double_click", or "right_click".

    Returns:
        Python command string using computer.mouse.move_id() for grounding,
        or computer.mouse.move_abs() for the coordinate fallback.
    """
    # Prefer element ID for grounding (SoM mode)
    if action.target_node_id is not None:
        elem_id = str(action.target_node_id)
        if elem_id in self._current_rects:
            # repr() keeps the command valid even if the ID contains quotes.
            return f"computer.mouse.move_id({elem_id!r}); computer.mouse.{click_method}()"
        logger.warning(f"Element ID '{elem_id}' not found in rects, falling back to coordinates")

    # Fallback: use coordinates if provided (less precise)
    if action.x is None or action.y is None:
        logger.warning("Click action is missing coordinates; using 0 for the missing axis")
    x = action.x if action.x is not None else 0
    y = action.y if action.y is not None else 0

    # Coordinates are forwarded as given, including normalized floats in
    # [0, 1] (move_abs is expected to handle those - TODO confirm on WAA side).
    return f"computer.mouse.move_abs({x}, {y}); computer.mouse.{click_method}()"
580
+
581
def _translate_key_action(self, action: BenchmarkAction) -> str:
    """Translate key press action using pyautogui (no grounding needed).

    The key name is mapped to its pyautogui spelling (unknown names are
    lower-cased). Modifiers come from ``action.modifiers`` and, in addition,
    from a combined key such as "Ctrl+S": when the key consists of two or
    more non-empty parts joined by "+", all parts but the last are treated
    as modifiers. Names are embedded with ``repr()`` so the command stays
    valid Python for any key text.

    Args:
        action: Key action; reads ``action.key`` and ``action.modifiers``.

    Returns:
        A ``pyautogui.hotkey(...)`` command when any modifier applies,
        otherwise a ``pyautogui.press(...)`` command.
    """
    # Map common key names to pyautogui names
    key_map = {
        "Enter": "enter",
        "Return": "enter",
        "Tab": "tab",
        "Escape": "escape",
        "Esc": "escape",
        "Backspace": "backspace",
        "Delete": "delete",
        "Del": "delete",
        "Space": "space",
        "Up": "up",
        "Down": "down",
        "Left": "left",
        "Right": "right",
        "Home": "home",
        "End": "end",
        "PageUp": "pageup",
        "PageDown": "pagedown",
        "F1": "f1", "F2": "f2", "F3": "f3", "F4": "f4",
        "F5": "f5", "F6": "f6", "F7": "f7", "F8": "f8",
        "F9": "f9", "F10": "f10", "F11": "f11", "F12": "f12",
    }
    # Modifier aliases (matched in lower case); the target is Windows, so
    # the platform key is always spelled "win".
    mod_map = {
        "control": "ctrl",
        "command": "win",
        "cmd": "win",
        "meta": "win",
        "super": "win",
        "windows": "win",
    }

    raw_key = action.key or ""
    raw_mods = list(action.modifiers or [])

    # Split "mod+mod+key" combos. Keys such as "+" or "ctrl++" produce an
    # empty part and are deliberately left untouched.
    parts = [part.strip() for part in raw_key.split("+")]
    if len(parts) > 1 and all(parts):
        raw_mods.extend(parts[:-1])
        raw_key = parts[-1]

    key = key_map.get(raw_key, raw_key.lower())
    mods = [mod_map.get(m.lower(), m.lower()) for m in raw_mods]

    # Handle modifiers with hotkey
    if mods:
        keys_str = ", ".join(repr(k) for k in [*mods, key])
        return f"import pyautogui; pyautogui.hotkey({keys_str})"

    return f"import pyautogui; pyautogui.press({key!r})"