openadapt-ml 0.1.0__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff compares the contents of two publicly released versions of this package, as they appear in their public registry. It is provided for informational purposes only.
Files changed (112)
  1. openadapt_ml/baselines/__init__.py +121 -0
  2. openadapt_ml/baselines/adapter.py +185 -0
  3. openadapt_ml/baselines/cli.py +314 -0
  4. openadapt_ml/baselines/config.py +448 -0
  5. openadapt_ml/baselines/parser.py +922 -0
  6. openadapt_ml/baselines/prompts.py +787 -0
  7. openadapt_ml/benchmarks/__init__.py +13 -107
  8. openadapt_ml/benchmarks/agent.py +297 -374
  9. openadapt_ml/benchmarks/azure.py +62 -24
  10. openadapt_ml/benchmarks/azure_ops_tracker.py +521 -0
  11. openadapt_ml/benchmarks/cli.py +1874 -751
  12. openadapt_ml/benchmarks/trace_export.py +631 -0
  13. openadapt_ml/benchmarks/viewer.py +1236 -0
  14. openadapt_ml/benchmarks/vm_monitor.py +1111 -0
  15. openadapt_ml/benchmarks/waa_deploy/Dockerfile +216 -0
  16. openadapt_ml/benchmarks/waa_deploy/__init__.py +10 -0
  17. openadapt_ml/benchmarks/waa_deploy/api_agent.py +540 -0
  18. openadapt_ml/benchmarks/waa_deploy/start_waa_server.bat +53 -0
  19. openadapt_ml/cloud/azure_inference.py +3 -5
  20. openadapt_ml/cloud/lambda_labs.py +722 -307
  21. openadapt_ml/cloud/local.py +3194 -89
  22. openadapt_ml/cloud/ssh_tunnel.py +595 -0
  23. openadapt_ml/datasets/next_action.py +125 -96
  24. openadapt_ml/evals/grounding.py +32 -9
  25. openadapt_ml/evals/plot_eval_metrics.py +15 -13
  26. openadapt_ml/evals/trajectory_matching.py +120 -57
  27. openadapt_ml/experiments/demo_prompt/__init__.py +19 -0
  28. openadapt_ml/experiments/demo_prompt/format_demo.py +236 -0
  29. openadapt_ml/experiments/demo_prompt/results/experiment_20251231_002125.json +83 -0
  30. openadapt_ml/experiments/demo_prompt/results/experiment_n30_20251231_165958.json +1100 -0
  31. openadapt_ml/experiments/demo_prompt/results/multistep_20251231_025051.json +182 -0
  32. openadapt_ml/experiments/demo_prompt/run_experiment.py +541 -0
  33. openadapt_ml/experiments/representation_shootout/__init__.py +70 -0
  34. openadapt_ml/experiments/representation_shootout/conditions.py +708 -0
  35. openadapt_ml/experiments/representation_shootout/config.py +390 -0
  36. openadapt_ml/experiments/representation_shootout/evaluator.py +659 -0
  37. openadapt_ml/experiments/representation_shootout/runner.py +687 -0
  38. openadapt_ml/experiments/waa_demo/__init__.py +10 -0
  39. openadapt_ml/experiments/waa_demo/demos.py +357 -0
  40. openadapt_ml/experiments/waa_demo/runner.py +732 -0
  41. openadapt_ml/experiments/waa_demo/tasks.py +151 -0
  42. openadapt_ml/export/__init__.py +9 -0
  43. openadapt_ml/export/__main__.py +6 -0
  44. openadapt_ml/export/cli.py +89 -0
  45. openadapt_ml/export/parquet.py +277 -0
  46. openadapt_ml/grounding/detector.py +18 -14
  47. openadapt_ml/ingest/__init__.py +11 -10
  48. openadapt_ml/ingest/capture.py +97 -86
  49. openadapt_ml/ingest/loader.py +120 -69
  50. openadapt_ml/ingest/synthetic.py +344 -193
  51. openadapt_ml/models/api_adapter.py +14 -4
  52. openadapt_ml/models/base_adapter.py +10 -2
  53. openadapt_ml/models/providers/__init__.py +288 -0
  54. openadapt_ml/models/providers/anthropic.py +266 -0
  55. openadapt_ml/models/providers/base.py +299 -0
  56. openadapt_ml/models/providers/google.py +376 -0
  57. openadapt_ml/models/providers/openai.py +342 -0
  58. openadapt_ml/models/qwen_vl.py +46 -19
  59. openadapt_ml/perception/__init__.py +35 -0
  60. openadapt_ml/perception/integration.py +399 -0
  61. openadapt_ml/retrieval/README.md +226 -0
  62. openadapt_ml/retrieval/USAGE.md +391 -0
  63. openadapt_ml/retrieval/__init__.py +91 -0
  64. openadapt_ml/retrieval/demo_retriever.py +843 -0
  65. openadapt_ml/retrieval/embeddings.py +630 -0
  66. openadapt_ml/retrieval/index.py +194 -0
  67. openadapt_ml/retrieval/retriever.py +162 -0
  68. openadapt_ml/runtime/__init__.py +50 -0
  69. openadapt_ml/runtime/policy.py +27 -14
  70. openadapt_ml/runtime/safety_gate.py +471 -0
  71. openadapt_ml/schema/__init__.py +113 -0
  72. openadapt_ml/schema/converters.py +588 -0
  73. openadapt_ml/schema/episode.py +470 -0
  74. openadapt_ml/scripts/capture_screenshots.py +530 -0
  75. openadapt_ml/scripts/compare.py +102 -61
  76. openadapt_ml/scripts/demo_policy.py +4 -1
  77. openadapt_ml/scripts/eval_policy.py +19 -14
  78. openadapt_ml/scripts/make_gif.py +1 -1
  79. openadapt_ml/scripts/prepare_synthetic.py +16 -17
  80. openadapt_ml/scripts/train.py +98 -75
  81. openadapt_ml/segmentation/README.md +920 -0
  82. openadapt_ml/segmentation/__init__.py +97 -0
  83. openadapt_ml/segmentation/adapters/__init__.py +5 -0
  84. openadapt_ml/segmentation/adapters/capture_adapter.py +420 -0
  85. openadapt_ml/segmentation/annotator.py +610 -0
  86. openadapt_ml/segmentation/cache.py +290 -0
  87. openadapt_ml/segmentation/cli.py +674 -0
  88. openadapt_ml/segmentation/deduplicator.py +656 -0
  89. openadapt_ml/segmentation/frame_describer.py +788 -0
  90. openadapt_ml/segmentation/pipeline.py +340 -0
  91. openadapt_ml/segmentation/schemas.py +622 -0
  92. openadapt_ml/segmentation/segment_extractor.py +634 -0
  93. openadapt_ml/training/azure_ops_viewer.py +1097 -0
  94. openadapt_ml/training/benchmark_viewer.py +3255 -19
  95. openadapt_ml/training/shared_ui.py +7 -7
  96. openadapt_ml/training/stub_provider.py +57 -35
  97. openadapt_ml/training/trainer.py +255 -441
  98. openadapt_ml/training/trl_trainer.py +403 -0
  99. openadapt_ml/training/viewer.py +323 -108
  100. openadapt_ml/training/viewer_components.py +180 -0
  101. {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.1.dist-info}/METADATA +312 -69
  102. openadapt_ml-0.2.1.dist-info/RECORD +116 -0
  103. openadapt_ml/benchmarks/base.py +0 -366
  104. openadapt_ml/benchmarks/data_collection.py +0 -432
  105. openadapt_ml/benchmarks/runner.py +0 -381
  106. openadapt_ml/benchmarks/waa.py +0 -704
  107. openadapt_ml/schemas/__init__.py +0 -53
  108. openadapt_ml/schemas/sessions.py +0 -122
  109. openadapt_ml/schemas/validation.py +0 -252
  110. openadapt_ml-0.1.0.dist-info/RECORD +0 -55
  111. {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.1.dist-info}/WHEEL +0 -0
  112. {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.1.dist-info}/licenses/LICENSE +0 -0
openadapt_ml/baselines/prompts.py
@@ -0,0 +1,787 @@
+ """Prompt templates for baseline adapters.
+
+ Provides track-specific system prompts and user content builders.
+ Based on SOTA patterns from:
+ - Claude Computer Use (Anthropic)
+ - UFO/UFO2 (Microsoft)
+ - OSWorld benchmark
+ - Agent-S/Agent-S2 (Simular AI)
+
+ Key design principles:
+ 1. Structured observation -> thought -> action flow (ReAct)
+ 2. Clear action format specification with examples
+ 3. Explicit coordinate system definition
+ 4. Screen verification after action (Claude best practice)
+ 5. Error handling guidance
+ """
+
+ from __future__ import annotations
+
+ import textwrap
+ from typing import TYPE_CHECKING, Any
+
+ from openadapt_ml.baselines.config import (
+     ActionOutputFormat,
+     TrackConfig,
+     TrackType,
+ )
+
+ if TYPE_CHECKING:
+     from PIL import Image
+
+
+ # =============================================================================
+ # TRACK A: Direct Coordinate Prediction
+ # =============================================================================
+
+ SYSTEM_PROMPT_TRACK_A = """You are a GUI automation agent that controls computer interfaces by analyzing screenshots.
+
+ ## YOUR CAPABILITIES
+
+ You can perform these actions:
+ - **CLICK**: Click at specific screen coordinates
+ - **TYPE**: Enter text at the current cursor position
+ - **KEY**: Press keyboard keys or key combinations
+ - **SCROLL**: Scroll in a direction
+ - **DONE**: Mark task as complete when the goal is achieved
+
+ ## COORDINATE SYSTEM
+
+ - Coordinates are **normalized** between 0.0 and 1.0
+ - (0.0, 0.0) is the **top-left** corner of the screen
+ - (1.0, 1.0) is the **bottom-right** corner
+ - For example, the center of the screen is (0.5, 0.5)
+
+ ## OUTPUT FORMAT
+
+ Respond with a single JSON object containing your action:
+
+ ```json
+ {"action": "CLICK", "x": 0.5, "y": 0.3}
+ ```
+
+ ```json
+ {"action": "TYPE", "text": "hello world"}
+ ```
+
+ ```json
+ {"action": "KEY", "key": "enter"}
+ ```
+
+ ```json
+ {"action": "SCROLL", "direction": "down", "amount": 3}
+ ```
+
+ ```json
+ {"action": "DONE"}
+ ```
+
+ ## RULES
+
+ 1. **Analyze carefully**: Study the screenshot to identify UI elements
+ 2. **Be precise**: Aim for the center of clickable elements
+ 3. **One action at a time**: Return exactly one action per response
+ 4. **Validate coordinates**: Ensure x and y are between 0.0 and 1.0
+ 5. **Complete the task**: Use DONE only when the goal is fully achieved
+ 6. **Handle errors**: If an action fails, try an alternative approach
+
+ ## IMPORTANT
+
+ - Return ONLY the JSON object, no additional text
+ - If you cannot determine the correct action, explain in a "reason" field and still provide your best guess"""
+
+
+ # =============================================================================
+ # TRACK B: ReAct-style Reasoning with Coordinates
+ # =============================================================================
+
+ SYSTEM_PROMPT_TRACK_B = """You are a GUI automation agent using ReAct (Reasoning + Acting) to complete tasks.
+
+ ## YOUR CAPABILITIES
+
+ You can perform these actions:
+ - **CLICK**: Click at specific screen coordinates
+ - **TYPE**: Enter text at the current cursor position
+ - **KEY**: Press keyboard keys or key combinations
+ - **SCROLL**: Scroll in a direction
+ - **DONE**: Mark task as complete
+
+ ## COORDINATE SYSTEM
+
+ - Coordinates are **normalized** between 0.0 and 1.0
+ - (0.0, 0.0) is the **top-left** corner
+ - (1.0, 1.0) is the **bottom-right** corner
+
+ ## ReAct PROCESS
+
+ For each step, follow this process:
+
+ 1. **OBSERVE**: Describe what you see in the screenshot
+    - What application/window is visible?
+    - What UI elements are present?
+    - What is the current state?
+
+ 2. **THINK**: Reason about the next action
+    - What is the goal?
+    - What progress has been made?
+    - What is the logical next step?
+    - Where exactly should I click?
+
+ 3. **ACT**: Execute the action
+
+ ## OUTPUT FORMAT
+
+ Respond with a JSON object containing observation, thought, and action:
+
+ ```json
+ {
+   "observation": "I see a login form with username and password fields. The username field is empty and appears to be focused.",
+   "thought": "To log in, I first need to enter the username. The username field is positioned at approximately x=0.5, y=0.35.",
+   "action": "CLICK",
+   "x": 0.5,
+   "y": 0.35
+ }
+ ```
+
+ ```json
+ {
+   "observation": "The username field is now active with a cursor blinking.",
+   "thought": "I should type the username now.",
+   "action": "TYPE",
+   "text": "user@example.com"
+ }
+ ```
+
+ ```json
+ {
+   "observation": "I can see the confirmation page showing 'Success! You are logged in.'",
+   "thought": "The task is complete - the login was successful.",
+   "action": "DONE"
+ }
+ ```
+
+ ## RULES
+
+ 1. **Always explain your reasoning** before acting
+ 2. **Be specific** in observations - describe what you actually see
+ 3. **Justify coordinates** - explain why you chose those coordinates
+ 4. **Track progress** - consider previous actions when planning
+ 5. **Verify completion** - ensure the goal is fully achieved before DONE
+
+ ## TIPS
+
+ - If an element is hard to click, try using keyboard navigation
+ - After clicking, verify the expected result occurred
+ - For text fields, click to focus before typing"""
+
+
+ # =============================================================================
+ # TRACK C: Set-of-Mark Element Selection
+ # =============================================================================
+
+ SYSTEM_PROMPT_TRACK_C = """You are a GUI automation agent. UI elements in the screenshot are labeled with numbered markers like [1], [2], [3], etc.
+
+ ## YOUR CAPABILITIES
+
+ You can perform these actions:
+ - **CLICK**: Click an element by its label number
+ - **TYPE**: Enter text at the current cursor position
+ - **KEY**: Press keyboard keys or key combinations
+ - **SCROLL**: Scroll in a direction
+ - **DONE**: Mark task as complete
+
+ ## ELEMENT LABELS
+
+ - Each interactive UI element is marked with a number in brackets: [1], [2], [3], etc.
+ - The accessibility tree below lists all labeled elements with their roles and names
+ - Use the element ID (the number) to specify which element to click
+
+ ## OUTPUT FORMAT
+
+ Respond with a JSON object:
+
+ ```json
+ {"action": "CLICK", "element_id": 17}
+ ```
+
+ ```json
+ {"action": "TYPE", "text": "hello world"}
+ ```
+
+ ```json
+ {"action": "KEY", "key": "enter"}
+ ```
+
+ ```json
+ {"action": "SCROLL", "direction": "down"}
+ ```
+
+ ```json
+ {"action": "DONE"}
+ ```
+
+ ## RULES
+
+ 1. **Use element IDs** - Click by element number, NOT coordinates
+ 2. **Match carefully** - Find the element that matches your intent
+ 3. **Check roles** - Consider element type (button, textfield, checkbox)
+ 4. **Read labels** - Use element names to identify correct targets
+ 5. **One action** - Return exactly one action per response
+
+ ## ELEMENT SELECTION TIPS
+
+ - Look for buttons with matching text labels
+ - Text fields are often named by their placeholder or label
+ - If multiple similar elements exist, choose based on position
+ - Some elements may be nested - prefer the most specific match
+
+ ## IMPORTANT
+
+ - Return ONLY the JSON object
+ - element_id must be an integer from the labeled elements"""
+
+
+ # =============================================================================
+ # OSWORLD-COMPATIBLE PROMPTS (PyAutoGUI format)
+ # =============================================================================
+
+ SYSTEM_PROMPT_OSWORLD = """You are a GUI automation agent controlling a computer through PyAutoGUI.
+
+ ## ENVIRONMENT
+
+ You are interacting with a desktop environment (Ubuntu/Windows/macOS).
+ Execute tasks by generating Python code using the PyAutoGUI library.
+
+ ## AVAILABLE ACTIONS
+
+ ```python
+ # Mouse actions
+ pyautogui.click(x, y)          # Click at pixel coordinates
+ pyautogui.doubleClick(x, y)    # Double-click
+ pyautogui.rightClick(x, y)     # Right-click
+ pyautogui.moveTo(x, y)         # Move mouse
+ pyautogui.drag(dx, dy)         # Drag relative
+
+ # Keyboard actions
+ pyautogui.write('text')        # Type text
+ pyautogui.press('key')         # Press single key
+ pyautogui.hotkey('ctrl', 'c')  # Key combination
+
+ # Scrolling
+ pyautogui.scroll(clicks)       # Scroll (positive=up, negative=down)
+
+ # Special
+ WAIT  # Agent should wait
+ FAIL  # Task is infeasible
+ DONE  # Task is complete
+ ```
+
+ ## COORDINATE SYSTEM
+
+ - Coordinates are in **pixels** from the screen's top-left corner
+ - Screen dimensions are provided in the observation
+
+ ## OUTPUT FORMAT
+
+ Output a single line of Python code or special command:
+
+ ```
+ pyautogui.click(960, 540)
+ ```
+
+ ```
+ pyautogui.write('Hello, World!')
+ ```
+
+ ```
+ pyautogui.hotkey('ctrl', 's')
+ ```
+
+ ```
+ DONE
+ ```
+
+ ## RULES
+
+ 1. **One action per response** - Output exactly one line
+ 2. **Use pixel coordinates** - Not normalized
+ 3. **Be precise** - Aim for the center of elements
+ 4. **Handle failures** - Output FAIL if task is impossible
+ 5. **Wait when needed** - Output WAIT if UI is loading
+
+ ## TIPS
+
+ - Click in the center of buttons and links
+ - For text fields, click to focus before typing
+ - Use hotkeys when available (faster, more reliable)
+ - Scroll to reveal off-screen elements"""
+
+
+ # =============================================================================
+ # UFO-COMPATIBLE PROMPTS
+ # =============================================================================
+
+ SYSTEM_PROMPT_UFO = """You are an AppAgent in the UFO framework, controlling Windows applications.
+
+ ## YOUR ROLE
+
+ You interact with application UI by selecting controls and executing functions.
+ Each control is labeled with a number that you reference in your response.
+
+ ## PROCESS
+
+ For each step:
+ 1. **Observe** the current application state
+ 2. **Think** about what action achieves the goal
+ 3. **Select** the appropriate control and function
+ 4. **Plan** subsequent steps
+
+ ## OUTPUT FORMAT
+
+ Respond with a JSON object:
+
+ ```json
+ {
+   "Observation": "The Notepad application is open with an empty document.",
+   "Thought": "To save the file, I need to use File > Save or Ctrl+S. I'll click the File menu first.",
+   "ControlLabel": 3,
+   "ControlText": "File",
+   "Function": "click",
+   "Args": [],
+   "Status": "CONTINUE",
+   "Plan": ["Click Save in the menu", "Enter filename", "Click Save button"],
+   "Comment": "Starting the save workflow"
+ }
+ ```
+
+ ## AVAILABLE FUNCTIONS
+
+ - **click**: Click the control
+ - **input_text**: Type text (Args: ["text to type"])
+ - **select**: Select option from dropdown (Args: ["option"])
+ - **scroll**: Scroll control (Args: ["up"] or ["down"])
+ - **hotkey**: Press key combination (Args: ["ctrl", "s"])
+ - **wait**: Wait for UI update (Args: [seconds])
+
+ ## STATUS VALUES
+
+ - **CONTINUE**: More actions needed
+ - **FINISH**: Task completed successfully
+ - **ERROR**: Something went wrong
+ - **PENDING**: Waiting for user input
+
+ ## RULES
+
+ 1. **Always provide Observation and Thought**
+ 2. **ControlLabel must match a labeled element**
+ 3. **Plan should list remaining steps**
+ 4. **Use FINISH only when goal is achieved**"""
+
+
+ # =============================================================================
+ # System Prompt Registry
+ # =============================================================================
+
+ SYSTEM_PROMPTS = {
+     TrackType.TRACK_A: SYSTEM_PROMPT_TRACK_A,
+     TrackType.TRACK_B: SYSTEM_PROMPT_TRACK_B,
+     TrackType.TRACK_C: SYSTEM_PROMPT_TRACK_C,
+ }
+
+ # Additional format-specific prompts
+ FORMAT_PROMPTS = {
+     ActionOutputFormat.PYAUTOGUI: SYSTEM_PROMPT_OSWORLD,
+ }
+
+
+ # =============================================================================
+ # PromptBuilder Class
+ # =============================================================================
+
+
+ class PromptBuilder:
+     """Builds prompts for baseline API calls.
+
+     Constructs system prompts and user content based on track configuration.
+     Supports multiple output formats and benchmark compatibility.
+
+     Example:
+         builder = PromptBuilder(track_config)
+         system = builder.get_system_prompt()
+         content = builder.build_user_content(
+             goal="Log into the application",
+             screenshot=img,
+             a11y_tree=tree,
+             history=history,
+         )
+     """
+
+     def __init__(self, track: TrackConfig):
+         """Initialize prompt builder.
+
+         Args:
+             track: Track configuration.
+         """
+         self.track = track
+
+     def get_system_prompt(
+         self,
+         demo: str | None = None,
+         custom_instructions: str | None = None,
+     ) -> str:
+         """Get the system prompt for this track.
+
+         Args:
+             demo: Optional demo text to include as an example.
+             custom_instructions: Optional custom instructions to append.
+
+         Returns:
+             System prompt string.
+         """
+         # Select base prompt based on format or track
+         if self.track.action_format == ActionOutputFormat.PYAUTOGUI:
+             base_prompt = SYSTEM_PROMPT_OSWORLD
+         else:
+             base_prompt = SYSTEM_PROMPTS.get(
+                 self.track.track_type, SYSTEM_PROMPT_TRACK_A
+             )
+
+         parts = [base_prompt]
+
+         # Add demo example if provided
+         if demo:
+             parts.append(self._format_demo_section(demo))
+
+         # Add screen verification instruction if enabled
+         if self.track.verify_after_action:
+             parts.append(self._get_verification_instruction())
+
+         # Add custom instructions
+         if custom_instructions:
+             parts.append(f"\n## ADDITIONAL INSTRUCTIONS\n\n{custom_instructions}")
+
+         return "\n\n".join(parts)
+
+     def _format_demo_section(self, demo: str) -> str:
+         """Format demonstration example section."""
+         return textwrap.dedent(f"""
+             ## EXAMPLE DEMONSTRATION
+
+             Here is an example of successfully completing a similar task:
+
+             {demo}
+
+             Follow a similar pattern for your task.
+         """).strip()
+
+     def _get_verification_instruction(self) -> str:
+         """Get instruction for post-action verification.
+
+         Based on Claude Computer Use best practices.
+         """
+         return textwrap.dedent("""
+             ## VERIFICATION
+
+             After each action, a new screenshot will be provided. Verify that:
+             1. The action was executed correctly
+             2. The UI state changed as expected
+             3. You are making progress toward the goal
+
+             If something unexpected happened, explain what went wrong and try again.
+         """).strip()
+
+     def build_user_content(
+         self,
+         goal: str,
+         screenshot: Image.Image | None = None,
+         a11y_tree: str | dict[str, Any] | None = None,
+         history: list[dict[str, Any]] | None = None,
+         encode_image_fn: Any = None,
+         screen_info: dict[str, Any] | None = None,
+         window_info: dict[str, Any] | None = None,
+     ) -> list[dict[str, Any]]:
+         """Build user message content for API call.
+
+         Args:
+             goal: Task goal/instruction.
+             screenshot: Screenshot image (PIL Image).
+             a11y_tree: Accessibility tree (string or dict).
+             history: List of previous actions.
+             encode_image_fn: Function to encode image for API.
+             screen_info: Screen dimensions and other info.
+             window_info: Active window information.
+
+         Returns:
+             List of content blocks for API message.
+         """
+         content: list[dict[str, Any]] = []
+
+         # Build text prompt
+         text_parts = [self._format_goal(goal)]
+
+         # Add screen info if provided
+         if screen_info:
+             text_parts.append(self._format_screen_info(screen_info))
+
+         # Add window info if provided
+         if window_info:
+             text_parts.append(self._format_window_info(window_info))
+
+         # Add accessibility tree if configured
+         if self.track.use_a11y_tree and a11y_tree:
+             tree_text = self._format_a11y_tree(a11y_tree)
+             if tree_text:
+                 text_parts.append(self._format_a11y_section(tree_text))
+
+         # Add action history if configured
+         if self.track.include_history and history:
+             history_text = self._format_history(history)
+             if history_text:
+                 text_parts.append(self._format_history_section(history_text))
+
+         # Add instruction based on track
+         text_parts.append(self._get_action_instruction())
+
+         # Combine text parts
+         content.append({"type": "text", "text": "\n\n".join(text_parts)})
+
+         # Add screenshot if provided
+         if screenshot is not None and encode_image_fn is not None:
+             content.append(encode_image_fn(screenshot))
+
+         return content
+
+     def _format_goal(self, goal: str) -> str:
+         """Format the task goal."""
+         return f"## TASK\n\n{goal}"
+
+     def _format_screen_info(self, screen_info: dict[str, Any]) -> str:
+         """Format screen information."""
+         width = screen_info.get("width", "unknown")
+         height = screen_info.get("height", "unknown")
+         return f"## SCREEN\n\nResolution: {width} x {height} pixels"
+
+     def _format_window_info(self, window_info: dict[str, Any]) -> str:
+         """Format active window information."""
+         parts = ["## ACTIVE WINDOW"]
+
+         if "title" in window_info:
+             parts.append(f"Title: {window_info['title']}")
+         if "app" in window_info:
+             parts.append(f"Application: {window_info['app']}")
+         if "url" in window_info:
+             parts.append(f"URL: {window_info['url']}")
+
+         return "\n".join(parts)
+
+     def _format_a11y_section(self, tree_text: str) -> str:
+         """Format accessibility tree section with header."""
+         header = "## UI ELEMENTS" if self.track.use_som else "## ACCESSIBILITY TREE"
+         return f"{header}\n\n{tree_text}"
+
+     def _format_history_section(self, history_text: str) -> str:
+         """Format history section with header."""
+         return f"## PREVIOUS ACTIONS\n\n{history_text}"
+
+     def _get_action_instruction(self) -> str:
+         """Get instruction for action output based on track."""
+         if self.track.track_type == TrackType.TRACK_B:
+             return "## YOUR TURN\n\nAnalyze the screenshot, explain your reasoning, and provide the next action."
+         elif self.track.track_type == TrackType.TRACK_C:
+             return "## YOUR TURN\n\nAnalyze the screenshot and select the appropriate element to interact with."
+         else:
+             return "## YOUR TURN\n\nAnalyze the screenshot and provide the next action."
+
+     def _format_a11y_tree(self, tree: str | dict[str, Any]) -> str:
+         """Format accessibility tree for prompt.
+
+         Args:
+             tree: Accessibility tree as string or dict.
+
+         Returns:
+             Formatted string (possibly truncated).
+         """
+         if isinstance(tree, str):
+             text = tree
+         elif isinstance(tree, dict):
+             text = self._dict_to_tree_string(tree)
+         else:
+             return ""
+
+         # Truncate if needed
+         max_lines = self.track.max_a11y_elements
+         lines = text.split("\n")
+         if len(lines) > max_lines:
+             original_count = len(lines)
+             lines = lines[:max_lines]
+             lines.append(f"... (showing {max_lines} of {original_count} elements)")
+
+         return "\n".join(lines)
+
+     def _dict_to_tree_string(
+         self,
+         tree: dict[str, Any],
+         indent: int = 0,
+         max_depth: int = 5,
+     ) -> str:
+         """Convert dict tree to formatted string.
+
+         Args:
+             tree: Dictionary representing accessibility tree.
+             indent: Current indentation level.
+             max_depth: Maximum recursion depth.
+
+         Returns:
+             Formatted tree string.
+         """
+         if indent > max_depth:
+             return ""
+
+         lines = []
+         prefix = " " * indent
+
+         role = tree.get("role", "unknown")
+         name = tree.get("name", "")
+         node_id = tree.get("id", tree.get("node_id", ""))
+
+         # Format node based on track
+         if self.track.use_som and node_id:
+             # SoM format: [id] role "name"
+             line = f"{prefix}[{node_id}] {role}"
+         elif node_id:
+             # Non-SoM with ID
+             line = f"{prefix}({node_id}) {role}"
+         else:
+             line = f"{prefix}{role}"
+
+         if name:
+             # Truncate long names
+             if len(name) > 50:
+                 name = name[:47] + "..."
+             line += f': "{name}"'
+
+         # Add bounding box if available (useful for debugging)
+         bbox = tree.get("bbox", tree.get("bounds"))
+         if bbox and isinstance(bbox, (list, tuple)) and len(bbox) >= 4:
+             # Show center point for SoM
+             if self.track.use_som:
+                 cx = (bbox[0] + bbox[2]) / 2
+                 cy = (bbox[1] + bbox[3]) / 2
+                 line += f" @ ({cx:.2f}, {cy:.2f})"
+
+         lines.append(line)
+
+         # Process children
+         children = tree.get("children", [])
+         for child in children:
+             if isinstance(child, dict):
+                 child_text = self._dict_to_tree_string(child, indent + 1, max_depth)
+                 if child_text:
+                     lines.append(child_text)
+
+         return "\n".join(lines)
+
+     def _format_history(self, history: list[dict[str, Any]]) -> str:
+         """Format action history for prompt.
+
+         Args:
+             history: List of action dictionaries.
+
+         Returns:
+             Formatted history string.
+         """
+         if not history:
+             return ""
+
+         lines = []
+         max_steps = self.track.max_history_steps
+         recent = history[-max_steps:] if len(history) > max_steps else history
+
+         for i, action in enumerate(recent, 1):
+             action_type = action.get("type", action.get("action", "unknown")).upper()
+             line = self._format_single_action(i, action_type, action)
+             lines.append(line)
+
+         return "\n".join(lines)
+
+     def _format_single_action(
+         self, step: int, action_type: str, action: dict[str, Any]
+     ) -> str:
+         """Format a single action for history display."""
+         if action_type == "CLICK":
+             if "element_id" in action:
+                 return f"{step}. CLICK([{action['element_id']}])"
+             elif "x" in action and "y" in action:
+                 return f"{step}. CLICK({action['x']:.3f}, {action['y']:.3f})"
+             else:
+                 return f"{step}. CLICK()"
+         elif action_type == "TYPE":
+             text = action.get("text", "")
+             # Truncate long text
+             if len(text) > 30:
+                 text = text[:27] + "..."
+             return f'{step}. TYPE("{text}")'
+         elif action_type == "KEY":
+             key = action.get("key", "")
+             return f"{step}. KEY({key})"
+         elif action_type == "SCROLL":
+             direction = action.get("direction", "down")
+             amount = action.get("amount", 1)
+             return f"{step}. SCROLL({direction}, {amount})"
+         elif action_type == "DONE":
+             return f"{step}. DONE()"
+         elif action_type == "WAIT":
+             return f"{step}. WAIT()"
+         else:
+             return f"{step}. {action_type}()"
+
+     def build_verification_prompt(
+         self,
+         goal: str,
+         previous_action: dict[str, Any],
+         screenshot: Image.Image | None = None,
+         encode_image_fn: Any = None,
+     ) -> list[dict[str, Any]]:
+         """Build a verification prompt after an action.
+
+         Used to verify action results and decide next steps.
+         Based on Claude Computer Use best practices.
+
+         Args:
+             goal: Original task goal.
+             previous_action: The action that was just executed.
+             screenshot: Screenshot after action execution.
+             encode_image_fn: Function to encode image.
+
+         Returns:
+             List of content blocks.
+         """
+         content: list[dict[str, Any]] = []
+
+         action_str = self._format_single_action(
+             0, previous_action.get("type", ""), previous_action
+         )
+         action_str = action_str[3:]  # Remove "0. " prefix
+
+         text = textwrap.dedent(f"""
+             ## VERIFICATION CHECK
+
+             **Goal**: {goal}
+
+             **Previous Action**: {action_str}
+
+             Analyze the screenshot and verify:
+             1. Did the action execute correctly?
+             2. Is the UI state as expected?
+             3. Are we making progress toward the goal?
+
+             If the goal is achieved, respond with {{"action": "DONE"}}.
+             Otherwise, provide the next action.
+         """).strip()
+
+         content.append({"type": "text", "text": text})
+
+         if screenshot is not None and encode_image_fn is not None:
+             content.append(encode_image_fn(screenshot))
+
+         return content
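
For orientation, here is a minimal sketch of driving `PromptBuilder` by hand. It is illustrative only: `TrackConfig`'s real constructor lives in `openadapt_ml/baselines/config.py` (also added in this release but not shown in this hunk), so the `TrackConfig(...)` keyword arguments and their defaults are assumptions inferred from the attributes this file reads (`track_type`, `action_format`, `use_a11y_tree`, `use_som`, `include_history`, `max_history_steps`, `max_a11y_elements`, `verify_after_action`). The `encode_image` helper is likewise a hypothetical stand-in for whatever `encode_image_fn` the caller supplies.

```python
# Illustrative sketch only; TrackConfig's constructor and defaults are
# assumptions, not the confirmed API from baselines/config.py.
import base64
import io

from PIL import Image

from openadapt_ml.baselines.config import TrackConfig, TrackType
from openadapt_ml.baselines.prompts import PromptBuilder


def encode_image(img: Image.Image) -> dict:
    """Encode a screenshot as an Anthropic-style base64 image block."""
    buf = io.BytesIO()
    img.save(buf, format="PNG")
    return {
        "type": "image",
        "source": {
            "type": "base64",
            "media_type": "image/png",
            "data": base64.b64encode(buf.getvalue()).decode(),
        },
    }


track = TrackConfig(track_type=TrackType.TRACK_B)  # assumed signature
builder = PromptBuilder(track)

system = builder.get_system_prompt(custom_instructions="Never submit forms.")
content = builder.build_user_content(
    goal="Log into the application",
    screenshot=Image.new("RGB", (1280, 800)),
    a11y_tree={
        "role": "window",
        "name": "Login",
        "children": [
            {"role": "button", "name": "Sign in", "id": 1,
             "bbox": [0.4, 0.6, 0.6, 0.7]},
        ],
    },
    history=[{"type": "click", "x": 0.512, "y": 0.348}],
    encode_image_fn=encode_image,
)
# content[0] is a text block: "## TASK", then (if the track enables
# use_a11y_tree / include_history) "## ACCESSIBILITY TREE" or
# "## UI ELEMENTS" and "## PREVIOUS ACTIONS" entries such as
# "1. CLICK(0.512, 0.348)", then the "## YOUR TURN" instruction.
# content[1] is the encoded screenshot block from encode_image_fn.
```

The same builder also produces the post-action check: `build_verification_prompt` reuses `_format_single_action` to echo the previous step back to the model alongside the fresh screenshot.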