openadapt-ml 0.2.0__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- openadapt_ml/baselines/__init__.py +121 -0
- openadapt_ml/baselines/adapter.py +185 -0
- openadapt_ml/baselines/cli.py +314 -0
- openadapt_ml/baselines/config.py +448 -0
- openadapt_ml/baselines/parser.py +922 -0
- openadapt_ml/baselines/prompts.py +787 -0
- openadapt_ml/benchmarks/__init__.py +13 -115
- openadapt_ml/benchmarks/agent.py +265 -421
- openadapt_ml/benchmarks/azure.py +28 -19
- openadapt_ml/benchmarks/azure_ops_tracker.py +521 -0
- openadapt_ml/benchmarks/cli.py +1722 -4847
- openadapt_ml/benchmarks/trace_export.py +631 -0
- openadapt_ml/benchmarks/viewer.py +22 -5
- openadapt_ml/benchmarks/vm_monitor.py +530 -29
- openadapt_ml/benchmarks/waa_deploy/Dockerfile +47 -53
- openadapt_ml/benchmarks/waa_deploy/api_agent.py +21 -20
- openadapt_ml/cloud/azure_inference.py +3 -5
- openadapt_ml/cloud/lambda_labs.py +722 -307
- openadapt_ml/cloud/local.py +2038 -487
- openadapt_ml/cloud/ssh_tunnel.py +68 -26
- openadapt_ml/datasets/next_action.py +40 -30
- openadapt_ml/evals/grounding.py +8 -3
- openadapt_ml/evals/plot_eval_metrics.py +15 -13
- openadapt_ml/evals/trajectory_matching.py +41 -26
- openadapt_ml/experiments/demo_prompt/format_demo.py +16 -6
- openadapt_ml/experiments/demo_prompt/run_experiment.py +26 -16
- openadapt_ml/experiments/representation_shootout/__init__.py +70 -0
- openadapt_ml/experiments/representation_shootout/conditions.py +708 -0
- openadapt_ml/experiments/representation_shootout/config.py +390 -0
- openadapt_ml/experiments/representation_shootout/evaluator.py +659 -0
- openadapt_ml/experiments/representation_shootout/runner.py +687 -0
- openadapt_ml/experiments/waa_demo/runner.py +29 -14
- openadapt_ml/export/parquet.py +36 -24
- openadapt_ml/grounding/detector.py +18 -14
- openadapt_ml/ingest/__init__.py +8 -6
- openadapt_ml/ingest/capture.py +25 -22
- openadapt_ml/ingest/loader.py +7 -4
- openadapt_ml/ingest/synthetic.py +189 -100
- openadapt_ml/models/api_adapter.py +14 -4
- openadapt_ml/models/base_adapter.py +10 -2
- openadapt_ml/models/providers/__init__.py +288 -0
- openadapt_ml/models/providers/anthropic.py +266 -0
- openadapt_ml/models/providers/base.py +299 -0
- openadapt_ml/models/providers/google.py +376 -0
- openadapt_ml/models/providers/openai.py +342 -0
- openadapt_ml/models/qwen_vl.py +46 -19
- openadapt_ml/perception/__init__.py +35 -0
- openadapt_ml/perception/integration.py +399 -0
- openadapt_ml/retrieval/demo_retriever.py +50 -24
- openadapt_ml/retrieval/embeddings.py +9 -8
- openadapt_ml/retrieval/retriever.py +3 -1
- openadapt_ml/runtime/__init__.py +50 -0
- openadapt_ml/runtime/policy.py +18 -5
- openadapt_ml/runtime/safety_gate.py +471 -0
- openadapt_ml/schema/__init__.py +9 -0
- openadapt_ml/schema/converters.py +74 -27
- openadapt_ml/schema/episode.py +31 -18
- openadapt_ml/scripts/capture_screenshots.py +530 -0
- openadapt_ml/scripts/compare.py +85 -54
- openadapt_ml/scripts/demo_policy.py +4 -1
- openadapt_ml/scripts/eval_policy.py +15 -9
- openadapt_ml/scripts/make_gif.py +1 -1
- openadapt_ml/scripts/prepare_synthetic.py +3 -1
- openadapt_ml/scripts/train.py +21 -9
- openadapt_ml/segmentation/README.md +920 -0
- openadapt_ml/segmentation/__init__.py +97 -0
- openadapt_ml/segmentation/adapters/__init__.py +5 -0
- openadapt_ml/segmentation/adapters/capture_adapter.py +420 -0
- openadapt_ml/segmentation/annotator.py +610 -0
- openadapt_ml/segmentation/cache.py +290 -0
- openadapt_ml/segmentation/cli.py +674 -0
- openadapt_ml/segmentation/deduplicator.py +656 -0
- openadapt_ml/segmentation/frame_describer.py +788 -0
- openadapt_ml/segmentation/pipeline.py +340 -0
- openadapt_ml/segmentation/schemas.py +622 -0
- openadapt_ml/segmentation/segment_extractor.py +634 -0
- openadapt_ml/training/azure_ops_viewer.py +1097 -0
- openadapt_ml/training/benchmark_viewer.py +52 -41
- openadapt_ml/training/shared_ui.py +7 -7
- openadapt_ml/training/stub_provider.py +57 -35
- openadapt_ml/training/trainer.py +143 -86
- openadapt_ml/training/trl_trainer.py +70 -21
- openadapt_ml/training/viewer.py +323 -108
- openadapt_ml/training/viewer_components.py +180 -0
- {openadapt_ml-0.2.0.dist-info → openadapt_ml-0.2.1.dist-info}/METADATA +215 -14
- openadapt_ml-0.2.1.dist-info/RECORD +116 -0
- openadapt_ml/benchmarks/base.py +0 -366
- openadapt_ml/benchmarks/data_collection.py +0 -432
- openadapt_ml/benchmarks/live_tracker.py +0 -180
- openadapt_ml/benchmarks/runner.py +0 -418
- openadapt_ml/benchmarks/waa.py +0 -761
- openadapt_ml/benchmarks/waa_live.py +0 -619
- openadapt_ml-0.2.0.dist-info/RECORD +0 -86
- {openadapt_ml-0.2.0.dist-info → openadapt_ml-0.2.1.dist-info}/WHEEL +0 -0
- {openadapt_ml-0.2.0.dist-info → openadapt_ml-0.2.1.dist-info}/licenses/LICENSE +0 -0
openadapt_ml/baselines/prompts.py (new file)
@@ -0,0 +1,787 @@
+"""Prompt templates for baseline adapters.
+
+Provides track-specific system prompts and user content builders.
+Based on SOTA patterns from:
+- Claude Computer Use (Anthropic)
+- UFO/UFO2 (Microsoft)
+- OSWorld benchmark
+- Agent-S/Agent-S2 (Simular AI)
+
+Key design principles:
+1. Structured observation -> thought -> action flow (ReAct)
+2. Clear action format specification with examples
+3. Explicit coordinate system definition
+4. Screen verification after action (Claude best practice)
+5. Error handling guidance
+"""
+
+from __future__ import annotations
+
+import textwrap
+from typing import TYPE_CHECKING, Any
+
+from openadapt_ml.baselines.config import (
+    ActionOutputFormat,
+    TrackConfig,
+    TrackType,
+)
+
+if TYPE_CHECKING:
+    from PIL import Image
+
+
+# =============================================================================
+# TRACK A: Direct Coordinate Prediction
+# =============================================================================
+
+SYSTEM_PROMPT_TRACK_A = """You are a GUI automation agent that controls computer interfaces by analyzing screenshots.
+
+## YOUR CAPABILITIES
+
+You can perform these actions:
+- **CLICK**: Click at specific screen coordinates
+- **TYPE**: Enter text at the current cursor position
+- **KEY**: Press keyboard keys or key combinations
+- **SCROLL**: Scroll in a direction
+- **DONE**: Mark task as complete when the goal is achieved
+
+## COORDINATE SYSTEM
+
+- Coordinates are **normalized** between 0.0 and 1.0
+- (0.0, 0.0) is the **top-left** corner of the screen
+- (1.0, 1.0) is the **bottom-right** corner
+- For example, the center of the screen is (0.5, 0.5)
+
+## OUTPUT FORMAT
+
+Respond with a single JSON object containing your action:
+
+```json
+{"action": "CLICK", "x": 0.5, "y": 0.3}
+```
+
+```json
+{"action": "TYPE", "text": "hello world"}
+```
+
+```json
+{"action": "KEY", "key": "enter"}
+```
+
+```json
+{"action": "SCROLL", "direction": "down", "amount": 3}
+```
+
+```json
+{"action": "DONE"}
+```
+
+## RULES
+
+1. **Analyze carefully**: Study the screenshot to identify UI elements
+2. **Be precise**: Aim for the center of clickable elements
+3. **One action at a time**: Return exactly one action per response
+4. **Validate coordinates**: Ensure x and y are between 0.0 and 1.0
+5. **Complete the task**: Use DONE only when the goal is fully achieved
+6. **Handle errors**: If an action fails, try an alternative approach
+
+## IMPORTANT
+
+- Return ONLY the JSON object, no additional text
+- If you cannot determine the correct action, explain in a "reason" field and still provide your best guess"""
+
+
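The Track A contract is deliberately machine-checkable. For orientation, a minimal parser sketch for this format, with hypothetical names (the package ships its own parsing in `openadapt_ml/baselines/parser.py`, also added in this release, which may differ):

```python
import json
import re

def parse_track_a_response(response: str, screen_w: int, screen_h: int) -> dict:
    """Extract the action JSON and map normalized coords to pixels."""
    # Models occasionally wrap the object in a ```json fence; grab the
    # outermost {...} span instead of parsing the raw string directly.
    match = re.search(r"\{.*\}", response, re.DOTALL)
    if match is None:
        raise ValueError(f"no JSON object in response: {response!r}")
    action = json.loads(match.group(0))
    if action.get("action") == "CLICK":
        # Prompt contract: x and y are normalized to [0.0, 1.0].
        x = min(max(float(action["x"]), 0.0), 1.0)
        y = min(max(float(action["y"]), 0.0), 1.0)
        action["pixel"] = (round(x * screen_w), round(y * screen_h))
    return action

print(parse_track_a_response('{"action": "CLICK", "x": 0.5, "y": 0.3}', 1920, 1080))
# {'action': 'CLICK', 'x': 0.5, 'y': 0.3, 'pixel': (960, 324)}
```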
+# =============================================================================
+# TRACK B: ReAct-style Reasoning with Coordinates
+# =============================================================================
+
+SYSTEM_PROMPT_TRACK_B = """You are a GUI automation agent using ReAct (Reasoning + Acting) to complete tasks.
+
+## YOUR CAPABILITIES
+
+You can perform these actions:
+- **CLICK**: Click at specific screen coordinates
+- **TYPE**: Enter text at the current cursor position
+- **KEY**: Press keyboard keys or key combinations
+- **SCROLL**: Scroll in a direction
+- **DONE**: Mark task as complete
+
+## COORDINATE SYSTEM
+
+- Coordinates are **normalized** between 0.0 and 1.0
+- (0.0, 0.0) is the **top-left** corner
+- (1.0, 1.0) is the **bottom-right** corner
+
+## ReAct PROCESS
+
+For each step, follow this process:
+
+1. **OBSERVE**: Describe what you see in the screenshot
+   - What application/window is visible?
+   - What UI elements are present?
+   - What is the current state?
+
+2. **THINK**: Reason about the next action
+   - What is the goal?
+   - What progress has been made?
+   - What is the logical next step?
+   - Where exactly should I click?
+
+3. **ACT**: Execute the action
+
+## OUTPUT FORMAT
+
+Respond with a JSON object containing observation, thought, and action:
+
+```json
+{
+  "observation": "I see a login form with username and password fields. The username field is empty and appears to be focused.",
+  "thought": "To log in, I first need to enter the username. The username field is positioned at approximately x=0.5, y=0.35.",
+  "action": "CLICK",
+  "x": 0.5,
+  "y": 0.35
+}
+```
+
+```json
+{
+  "observation": "The username field is now active with a cursor blinking.",
+  "thought": "I should type the username now.",
+  "action": "TYPE",
+  "text": "user@example.com"
+}
+```
+
+```json
+{
+  "observation": "I can see the confirmation page showing 'Success! You are logged in.'",
+  "thought": "The task is complete - the login was successful.",
+  "action": "DONE"
+}
+```
+
+## RULES
+
+1. **Always explain your reasoning** before acting
+2. **Be specific** in observations - describe what you actually see
+3. **Justify coordinates** - explain why you chose those coordinates
+4. **Track progress** - consider previous actions when planning
+5. **Verify completion** - ensure the goal is fully achieved before DONE
+
+## TIPS
+
+- If an element is hard to click, try using keyboard navigation
+- After clicking, verify the expected result occurred
+- For text fields, click to focus before typing"""
+
+
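Track B responses reuse the Track A action schema and add observation/thought fields, so a caller can peel off the reasoning for logging and dispatch the remainder unchanged. A small illustrative sketch (hypothetical helper, not from this diff):

```python
def split_react_fields(response: dict) -> tuple[str, str, dict]:
    """Separate ReAct reasoning from the executable action payload."""
    payload = dict(response)  # avoid mutating the caller's dict
    observation = payload.pop("observation", "")
    thought = payload.pop("thought", "")
    return observation, thought, payload  # payload matches the Track A schema

obs, thought, action = split_react_fields({
    "observation": "Login form visible.",
    "thought": "Click the username field first.",
    "action": "CLICK", "x": 0.5, "y": 0.35,
})
assert action == {"action": "CLICK", "x": 0.5, "y": 0.35}
```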
+# =============================================================================
+# TRACK C: Set-of-Mark Element Selection
+# =============================================================================
+
+SYSTEM_PROMPT_TRACK_C = """You are a GUI automation agent. UI elements in the screenshot are labeled with numbered markers like [1], [2], [3], etc.
+
+## YOUR CAPABILITIES
+
+You can perform these actions:
+- **CLICK**: Click an element by its label number
+- **TYPE**: Enter text at the current cursor position
+- **KEY**: Press keyboard keys or key combinations
+- **SCROLL**: Scroll in a direction
+- **DONE**: Mark task as complete
+
+## ELEMENT LABELS
+
+- Each interactive UI element is marked with a number in brackets: [1], [2], [3], etc.
+- The accessibility tree below lists all labeled elements with their roles and names
+- Use the element ID (the number) to specify which element to click
+
+## OUTPUT FORMAT
+
+Respond with a JSON object:
+
+```json
+{"action": "CLICK", "element_id": 17}
+```
+
+```json
+{"action": "TYPE", "text": "hello world"}
+```
+
+```json
+{"action": "KEY", "key": "enter"}
+```
+
+```json
+{"action": "SCROLL", "direction": "down"}
+```
+
+```json
+{"action": "DONE"}
+```
+
+## RULES
+
+1. **Use element IDs** - Click by element number, NOT coordinates
+2. **Match carefully** - Find the element that matches your intent
+3. **Check roles** - Consider element type (button, textfield, checkbox)
+4. **Read labels** - Use element names to identify correct targets
+5. **One action** - Return exactly one action per response
+
+## ELEMENT SELECTION TIPS
+
+- Look for buttons with matching text labels
+- Text fields are often named by their placeholder or label
+- If multiple similar elements exist, choose based on position
+- Some elements may be nested - prefer the most specific match
+
+## IMPORTANT
+
+- Return ONLY the JSON object
+- element_id must be an integer from the labeled elements"""
+
+
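Executing a Track C action means mapping `element_id` back to geometry from whatever produced the marks. A sketch under assumed data structures (the `Mark` type here is illustrative, not a type from this package):

```python
from dataclasses import dataclass

@dataclass
class Mark:
    element_id: int
    role: str
    name: str
    bbox: tuple[float, float, float, float]  # x0, y0, x1, y1, normalized

def resolve_click(action: dict, marks: dict[int, Mark]) -> tuple[float, float]:
    """Map a Set-of-Mark CLICK onto the center of the labeled element."""
    mark = marks[action["element_id"]]
    x0, y0, x1, y1 = mark.bbox
    return (x0 + x1) / 2, (y0 + y1) / 2

marks = {17: Mark(17, "button", "Save", (0.40, 0.70, 0.60, 0.76))}
print(resolve_click({"action": "CLICK", "element_id": 17}, marks))  # (0.5, 0.73)
```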
+# =============================================================================
+# OSWORLD-COMPATIBLE PROMPTS (PyAutoGUI format)
+# =============================================================================
+
+SYSTEM_PROMPT_OSWORLD = """You are a GUI automation agent controlling a computer through PyAutoGUI.
+
+## ENVIRONMENT
+
+You are interacting with a desktop environment (Ubuntu/Windows/macOS).
+Execute tasks by generating Python code using the PyAutoGUI library.
+
+## AVAILABLE ACTIONS
+
+```python
+# Mouse actions
+pyautogui.click(x, y)          # Click at pixel coordinates
+pyautogui.doubleClick(x, y)    # Double-click
+pyautogui.rightClick(x, y)     # Right-click
+pyautogui.moveTo(x, y)         # Move mouse
+pyautogui.drag(dx, dy)         # Drag relative
+
+# Keyboard actions
+pyautogui.write('text')        # Type text
+pyautogui.press('key')         # Press single key
+pyautogui.hotkey('ctrl', 'c')  # Key combination
+
+# Scrolling
+pyautogui.scroll(clicks)       # Scroll (positive=up, negative=down)
+
+# Special
+WAIT   # Agent should wait
+FAIL   # Task is infeasible
+DONE   # Task is complete
+```
+
+## COORDINATE SYSTEM
+
+- Coordinates are in **pixels** from the screen's top-left corner
+- Screen dimensions are provided in the observation
+
+## OUTPUT FORMAT
+
+Output a single line of Python code or special command:
+
+```
+pyautogui.click(960, 540)
+```
+
+```
+pyautogui.write('Hello, World!')
+```
+
+```
+pyautogui.hotkey('ctrl', 's')
+```
+
+```
+DONE
+```
+
+## RULES
+
+1. **One action per response** - Output exactly one line
+2. **Use pixel coordinates** - Not normalized
+3. **Be precise** - Aim for the center of elements
+4. **Handle failures** - Output FAIL if task is impossible
+5. **Wait when needed** - Output WAIT if UI is loading
+
+## TIPS
+
+- Click in the center of buttons and links
+- For text fields, click to focus before typing
+- Use hotkeys when available (faster, more reliable)
+- Scroll to reveal off-screen elements"""
+
+
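A harness consuming this format has to treat WAIT/FAIL/DONE as control flow and only execute actual PyAutoGUI lines. One hedged way to route responses (hypothetical harness code; executing model output should only ever happen inside the sandboxed VM these benchmarks run in):

```python
def dispatch_osworld_action(line: str) -> str:
    """Route one response line: control token or a single PyAutoGUI call."""
    line = line.strip().strip("`").strip()
    if line in {"WAIT", "FAIL", "DONE"}:
        return line  # let the episode loop decide how to react
    if not line.startswith("pyautogui."):
        raise ValueError(f"unexpected action line: {line!r}")
    import pyautogui  # deferred: only needed when actually executing
    exec(line, {"pyautogui": pyautogui})  # one vetted line, sandboxed VM only
    return "CONTINUE"
```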
+# =============================================================================
+# UFO-COMPATIBLE PROMPTS
+# =============================================================================
+
+SYSTEM_PROMPT_UFO = """You are an AppAgent in the UFO framework, controlling Windows applications.
+
+## YOUR ROLE
+
+You interact with application UI by selecting controls and executing functions.
+Each control is labeled with a number that you reference in your response.
+
+## PROCESS
+
+For each step:
+1. **Observe** the current application state
+2. **Think** about what action achieves the goal
+3. **Select** the appropriate control and function
+4. **Plan** subsequent steps
+
+## OUTPUT FORMAT
+
+Respond with a JSON object:
+
+```json
+{
+  "Observation": "The Notepad application is open with an empty document.",
+  "Thought": "To save the file, I need to use File > Save or Ctrl+S. I'll click the File menu first.",
+  "ControlLabel": 3,
+  "ControlText": "File",
+  "Function": "click",
+  "Args": [],
+  "Status": "CONTINUE",
+  "Plan": ["Click Save in the menu", "Enter filename", "Click Save button"],
+  "Comment": "Starting the save workflow"
+}
+```
+
+## AVAILABLE FUNCTIONS
+
+- **click**: Click the control
+- **input_text**: Type text (Args: ["text to type"])
+- **select**: Select option from dropdown (Args: ["option"])
+- **scroll**: Scroll control (Args: ["up"] or ["down"])
+- **hotkey**: Press key combination (Args: ["ctrl", "s"])
+- **wait**: Wait for UI update (Args: [seconds])
+
+## STATUS VALUES
+
+- **CONTINUE**: More actions needed
+- **FINISH**: Task completed successfully
+- **ERROR**: Something went wrong
+- **PENDING**: Waiting for user input
+
+## RULES
+
+1. **Always provide Observation and Thought**
+2. **ControlLabel must match a labeled element**
+3. **Plan should list remaining steps**
+4. **Use FINISH only when goal is achieved**"""
+
+
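The UFO schema carries more required structure than the track prompts, so it pays to validate before acting. A minimal schema check, sketched from the fields listed above (function and constant names are assumptions):

```python
REQUIRED_UFO_FIELDS = {"Observation", "Thought", "Function", "Status"}
VALID_STATUS = {"CONTINUE", "FINISH", "ERROR", "PENDING"}
VALID_FUNCTIONS = {"click", "input_text", "select", "scroll", "hotkey", "wait"}

def validate_ufo(response: dict) -> list[str]:
    """Return a list of schema problems; empty means the response is usable."""
    problems = [f"missing field: {f}" for f in REQUIRED_UFO_FIELDS - response.keys()]
    if response.get("Status") not in VALID_STATUS:
        problems.append(f"bad Status: {response.get('Status')!r}")
    if response.get("Function") not in VALID_FUNCTIONS:
        problems.append(f"bad Function: {response.get('Function')!r}")
    return problems
```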
+# =============================================================================
+# System Prompt Registry
+# =============================================================================
+
+SYSTEM_PROMPTS = {
+    TrackType.TRACK_A: SYSTEM_PROMPT_TRACK_A,
+    TrackType.TRACK_B: SYSTEM_PROMPT_TRACK_B,
+    TrackType.TRACK_C: SYSTEM_PROMPT_TRACK_C,
+}
+
+# Additional format-specific prompts
+FORMAT_PROMPTS = {
+    ActionOutputFormat.PYAUTOGUI: SYSTEM_PROMPT_OSWORLD,
+}
+
+
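These two dicts encode a precedence rule that `get_system_prompt` (below) implements: a format-specific prompt wins over the track prompt, and Track A is the fallback. Shown in isolation (illustrative; equivalent to the explicit PYAUTOGUI check while `FORMAT_PROMPTS` has a single entry):

```python
def select_base_prompt(track_type: TrackType,
                       action_format: ActionOutputFormat) -> str:
    # Format-specific prompts (PyAutoGUI/OSWorld) take precedence over tracks.
    if action_format in FORMAT_PROMPTS:
        return FORMAT_PROMPTS[action_format]
    return SYSTEM_PROMPTS.get(track_type, SYSTEM_PROMPT_TRACK_A)
```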
+# =============================================================================
+# PromptBuilder Class
+# =============================================================================
+
+
+class PromptBuilder:
+    """Builds prompts for baseline API calls.
+
+    Constructs system prompts and user content based on track configuration.
+    Supports multiple output formats and benchmark compatibility.
+
+    Example:
+        builder = PromptBuilder(track_config)
+        system = builder.get_system_prompt()
+        content = builder.build_user_content(
+            goal="Log into the application",
+            screenshot=img,
+            a11y_tree=tree,
+            history=history,
+        )
+    """
+
+    def __init__(self, track: TrackConfig):
+        """Initialize prompt builder.
+
+        Args:
+            track: Track configuration.
+        """
+        self.track = track
+
+    def get_system_prompt(
+        self,
+        demo: str | None = None,
+        custom_instructions: str | None = None,
+    ) -> str:
+        """Get the system prompt for this track.
+
+        Args:
+            demo: Optional demo text to include as an example.
+            custom_instructions: Optional custom instructions to append.
+
+        Returns:
+            System prompt string.
+        """
+        # Select base prompt based on format or track
+        if self.track.action_format == ActionOutputFormat.PYAUTOGUI:
+            base_prompt = SYSTEM_PROMPT_OSWORLD
+        else:
+            base_prompt = SYSTEM_PROMPTS.get(
+                self.track.track_type, SYSTEM_PROMPT_TRACK_A
+            )
+
+        parts = [base_prompt]
+
+        # Add demo example if provided
+        if demo:
+            parts.append(self._format_demo_section(demo))
+
+        # Add screen verification instruction if enabled
+        if self.track.verify_after_action:
+            parts.append(self._get_verification_instruction())
+
+        # Add custom instructions
+        if custom_instructions:
+            parts.append(f"\n## ADDITIONAL INSTRUCTIONS\n\n{custom_instructions}")
+
+        return "\n\n".join(parts)
+
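The assembly order is fixed: base prompt, optional demo, optional verification note, optional custom instructions, joined by blank lines. For example (`track_config` is a hypothetical `TrackConfig` with `verify_after_action=True`):

```python
builder = PromptBuilder(track_config)
system = builder.get_system_prompt(
    demo='1. CLICK(0.500, 0.350)\n2. TYPE("user@example.com")\n3. DONE()',
    custom_instructions="Never click buttons labeled Delete.",
)
# system == <track prompt>
#   + "\n\n## EXAMPLE DEMONSTRATION\n..."
#   + "\n\n## VERIFICATION\n..."
#   + "\n\n## ADDITIONAL INSTRUCTIONS\n\nNever click buttons labeled Delete."
```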
+    def _format_demo_section(self, demo: str) -> str:
+        """Format demonstration example section."""
+        return textwrap.dedent(f"""
+            ## EXAMPLE DEMONSTRATION
+
+            Here is an example of successfully completing a similar task:
+
+            {demo}
+
+            Follow a similar pattern for your task.
+        """).strip()
+
+    def _get_verification_instruction(self) -> str:
+        """Get instruction for post-action verification.
+
+        Based on Claude Computer Use best practices.
+        """
+        return textwrap.dedent("""
+            ## VERIFICATION
+
+            After each action, a new screenshot will be provided. Verify that:
+            1. The action was executed correctly
+            2. The UI state changed as expected
+            3. You are making progress toward the goal
+
+            If something unexpected happened, explain what went wrong and try again.
+        """).strip()
+
+    def build_user_content(
+        self,
+        goal: str,
+        screenshot: "Image" | None = None,
+        a11y_tree: str | dict[str, Any] | None = None,
+        history: list[dict[str, Any]] | None = None,
+        encode_image_fn: Any = None,
+        screen_info: dict[str, Any] | None = None,
+        window_info: dict[str, Any] | None = None,
+    ) -> list[dict[str, Any]]:
+        """Build user message content for API call.
+
+        Args:
+            goal: Task goal/instruction.
+            screenshot: Screenshot image (PIL Image).
+            a11y_tree: Accessibility tree (string or dict).
+            history: List of previous actions.
+            encode_image_fn: Function to encode image for API.
+            screen_info: Screen dimensions and other info.
+            window_info: Active window information.
+
+        Returns:
+            List of content blocks for API message.
+        """
+        content: list[dict[str, Any]] = []
+
+        # Build text prompt
+        text_parts = [self._format_goal(goal)]
+
+        # Add screen info if provided
+        if screen_info:
+            text_parts.append(self._format_screen_info(screen_info))
+
+        # Add window info if provided
+        if window_info:
+            text_parts.append(self._format_window_info(window_info))
+
+        # Add accessibility tree if configured
+        if self.track.use_a11y_tree and a11y_tree:
+            tree_text = self._format_a11y_tree(a11y_tree)
+            if tree_text:
+                text_parts.append(self._format_a11y_section(tree_text))
+
+        # Add action history if configured
+        if self.track.include_history and history:
+            history_text = self._format_history(history)
+            if history_text:
+                text_parts.append(self._format_history_section(history_text))
+
+        # Add instruction based on track
+        text_parts.append(self._get_action_instruction())
+
+        # Combine text parts
+        content.append({"type": "text", "text": "\n\n".join(text_parts)})
+
+        # Add screenshot if provided
+        if screenshot is not None and encode_image_fn is not None:
+            content.append(encode_image_fn(screenshot))
+
+        return content
+
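`build_user_content` returns provider-agnostic content blocks; the caller supplies `encode_image_fn` so each provider can attach images in its own shape. A sketch reusing the `builder` from the previous example, with a stub encoder (real encoders live in the provider adapters; this also assumes `include_history=True` on the track):

```python
from PIL import Image

def stub_encode(img: Image.Image) -> dict:
    # Real encoders base64-encode per provider; a stub shows the block layout.
    return {"type": "image", "size": img.size}

content = builder.build_user_content(
    goal="Log into the application",
    screenshot=Image.new("RGB", (1920, 1080)),
    history=[{"type": "click", "x": 0.5, "y": 0.35}],
    encode_image_fn=stub_encode,
    screen_info={"width": 1920, "height": 1080},
)
# content[0]: one text block combining "## TASK", "## SCREEN", the history,
#             and the track's "## YOUR TURN" instruction
# content[1]: the stub-encoded screenshot block
```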
+    def _format_goal(self, goal: str) -> str:
+        """Format the task goal."""
+        return f"## TASK\n\n{goal}"
+
+    def _format_screen_info(self, screen_info: dict[str, Any]) -> str:
+        """Format screen information."""
+        width = screen_info.get("width", "unknown")
+        height = screen_info.get("height", "unknown")
+        return f"## SCREEN\n\nResolution: {width} x {height} pixels"
+
+    def _format_window_info(self, window_info: dict[str, Any]) -> str:
+        """Format active window information."""
+        parts = ["## ACTIVE WINDOW"]
+
+        if "title" in window_info:
+            parts.append(f"Title: {window_info['title']}")
+        if "app" in window_info:
+            parts.append(f"Application: {window_info['app']}")
+        if "url" in window_info:
+            parts.append(f"URL: {window_info['url']}")
+
+        return "\n".join(parts)
+
+    def _format_a11y_section(self, tree_text: str) -> str:
+        """Format accessibility tree section with header."""
+        header = "## UI ELEMENTS" if self.track.use_som else "## ACCESSIBILITY TREE"
+        return f"{header}\n\n{tree_text}"
+
+    def _format_history_section(self, history_text: str) -> str:
+        """Format history section with header."""
+        return f"## PREVIOUS ACTIONS\n\n{history_text}"
+
+    def _get_action_instruction(self) -> str:
+        """Get instruction for action output based on track."""
+        if self.track.track_type == TrackType.TRACK_B:
+            return "## YOUR TURN\n\nAnalyze the screenshot, explain your reasoning, and provide the next action."
+        elif self.track.track_type == TrackType.TRACK_C:
+            return "## YOUR TURN\n\nAnalyze the screenshot and select the appropriate element to interact with."
+        else:
+            return "## YOUR TURN\n\nAnalyze the screenshot and provide the next action."
+
+    def _format_a11y_tree(self, tree: str | dict[str, Any]) -> str:
+        """Format accessibility tree for prompt.
+
+        Args:
+            tree: Accessibility tree as string or dict.
+
+        Returns:
+            Formatted string (possibly truncated).
+        """
+        if isinstance(tree, str):
+            text = tree
+        elif isinstance(tree, dict):
+            text = self._dict_to_tree_string(tree)
+        else:
+            return ""
+
+        # Truncate if needed
+        max_lines = self.track.max_a11y_elements
+        lines = text.split("\n")
+        if len(lines) > max_lines:
+            original_count = len(lines)
+            lines = lines[:max_lines]
+            lines.append(f"... (showing {max_lines} of {original_count} elements)")
+
+        return "\n".join(lines)
+
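Truncation keeps the first `max_a11y_elements` lines and appends an explicit elision marker, so the model knows the tree was cut. With `max_a11y_elements=3` (illustrative input):

```python
tree_text = "\n".join(f'[{i}] button: "Item {i}"' for i in range(1, 6))
# _format_a11y_tree(tree_text) yields:
# [1] button: "Item 1"
# [2] button: "Item 2"
# [3] button: "Item 3"
# ... (showing 3 of 5 elements)
```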
+    def _dict_to_tree_string(
+        self,
+        tree: dict[str, Any],
+        indent: int = 0,
+        max_depth: int = 5,
+    ) -> str:
+        """Convert dict tree to formatted string.
+
+        Args:
+            tree: Dictionary representing accessibility tree.
+            indent: Current indentation level.
+            max_depth: Maximum recursion depth.
+
+        Returns:
+            Formatted tree string.
+        """
+        if indent > max_depth:
+            return ""
+
+        lines = []
+        prefix = "  " * indent
+
+        role = tree.get("role", "unknown")
+        name = tree.get("name", "")
+        node_id = tree.get("id", tree.get("node_id", ""))
+
+        # Format node based on track
+        if self.track.use_som and node_id:
+            # SoM format: [id] role "name"
+            line = f"{prefix}[{node_id}] {role}"
+        elif node_id:
+            # Non-SoM with ID
+            line = f"{prefix}({node_id}) {role}"
+        else:
+            line = f"{prefix}{role}"
+
+        if name:
+            # Truncate long names
+            if len(name) > 50:
+                name = name[:47] + "..."
+            line += f': "{name}"'
+
+        # Add bounding box if available (useful for debugging)
+        bbox = tree.get("bbox", tree.get("bounds"))
+        if bbox and isinstance(bbox, (list, tuple)) and len(bbox) >= 4:
+            # Show center point for SoM
+            if self.track.use_som:
+                cx = (bbox[0] + bbox[2]) / 2
+                cy = (bbox[1] + bbox[3]) / 2
+                line += f" @ ({cx:.2f}, {cy:.2f})"
+
+        lines.append(line)
+
+        # Process children
+        children = tree.get("children", [])
+        for child in children:
+            if isinstance(child, dict):
+                child_text = self._dict_to_tree_string(child, indent + 1, max_depth)
+                if child_text:
+                    lines.append(child_text)
+
+        return "\n".join(lines)
+
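A worked example of the renderer on a small dict tree, assuming a SoM track (`use_som=True`):

```python
tree = {
    "role": "window", "name": "Login",
    "children": [
        {"id": 1, "role": "textfield", "name": "Email",
         "bbox": [0.30, 0.30, 0.70, 0.36]},
        {"id": 2, "role": "button", "name": "Sign in",
         "bbox": [0.45, 0.50, 0.55, 0.56]},
    ],
}
# _dict_to_tree_string(tree) renders:
# window: "Login"
#   [1] textfield: "Email" @ (0.50, 0.33)
#   [2] button: "Sign in" @ (0.50, 0.53)
```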
+    def _format_history(self, history: list[dict[str, Any]]) -> str:
+        """Format action history for prompt.
+
+        Args:
+            history: List of action dictionaries.
+
+        Returns:
+            Formatted history string.
+        """
+        if not history:
+            return ""
+
+        lines = []
+        max_steps = self.track.max_history_steps
+        recent = history[-max_steps:] if len(history) > max_steps else history
+
+        for i, action in enumerate(recent, 1):
+            action_type = action.get("type", action.get("action", "unknown")).upper()
+            line = self._format_single_action(i, action_type, action)
+            lines.append(line)
+
+        return "\n".join(lines)
+
+    def _format_single_action(
+        self, step: int, action_type: str, action: dict[str, Any]
+    ) -> str:
+        """Format a single action for history display."""
+        if action_type == "CLICK":
+            if "element_id" in action:
+                return f"{step}. CLICK([{action['element_id']}])"
+            elif "x" in action and "y" in action:
+                return f"{step}. CLICK({action['x']:.3f}, {action['y']:.3f})"
+            else:
+                return f"{step}. CLICK()"
+        elif action_type == "TYPE":
+            text = action.get("text", "")
+            # Truncate long text
+            if len(text) > 30:
+                text = text[:27] + "..."
+            return f'{step}. TYPE("{text}")'
+        elif action_type == "KEY":
+            key = action.get("key", "")
+            return f"{step}. KEY({key})"
+        elif action_type == "SCROLL":
+            direction = action.get("direction", "down")
+            amount = action.get("amount", 1)
+            return f"{step}. SCROLL({direction}, {amount})"
+        elif action_type == "DONE":
+            return f"{step}. DONE()"
+        elif action_type == "WAIT":
+            return f"{step}. WAIT()"
+        else:
+            return f"{step}. {action_type}()"
+
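Each past step is compressed to a single line, with coordinates at three decimals and typed text clipped to 30 characters. For instance:

```python
history = [
    {"type": "click", "x": 0.5, "y": 0.35},
    {"type": "type", "text": "user@example.com"},
    {"type": "key", "key": "enter"},
]
# _format_history(history) ->
# 1. CLICK(0.500, 0.350)
# 2. TYPE("user@example.com")
# 3. KEY(enter)
```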
+    def build_verification_prompt(
+        self,
+        goal: str,
+        previous_action: dict[str, Any],
+        screenshot: "Image" | None = None,
+        encode_image_fn: Any = None,
+    ) -> list[dict[str, Any]]:
+        """Build a verification prompt after an action.
+
+        Used to verify action results and decide next steps.
+        Based on Claude Computer Use best practices.
+
+        Args:
+            goal: Original task goal.
+            previous_action: The action that was just executed.
+            screenshot: Screenshot after action execution.
+            encode_image_fn: Function to encode image.
+
+        Returns:
+            List of content blocks.
+        """
+        content: list[dict[str, Any]] = []
+
+        action_str = self._format_single_action(
+            0, previous_action.get("type", ""), previous_action
+        )
+        action_str = action_str[3:]  # Remove "0. " prefix
+
+        text = textwrap.dedent(f"""
+            ## VERIFICATION CHECK
+
+            **Goal**: {goal}
+
+            **Previous Action**: {action_str}
+
+            Analyze the screenshot and verify:
+            1. Did the action execute correctly?
+            2. Is the UI state as expected?
+            3. Are we making progress toward the goal?
+
+            If the goal is achieved, respond with {{"action": "DONE"}}.
+            Otherwise, provide the next action.
+        """).strip()
+
+        content.append({"type": "text", "text": text})
+
+        if screenshot is not None and encode_image_fn is not None:
+            content.append(encode_image_fn(screenshot))
+
+        return content
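Taken together, `build_user_content` and `build_verification_prompt` support the observe-act-verify loop the module docstring describes. A compressed, hypothetical episode loop (`call_model`, `execute`, and `observe` are stand-ins, not APIs from this package; `stub_encode` is the encoder sketched earlier):

```python
def run_episode(builder: PromptBuilder, goal: str,
                call_model, execute, observe, max_steps: int = 20) -> bool:
    """Hedged sketch: act, then re-prompt against the fresh screenshot."""
    system = builder.get_system_prompt()
    content = builder.build_user_content(goal=goal, screenshot=observe(),
                                         encode_image_fn=stub_encode)
    for _ in range(max_steps):
        action = call_model(system, content)  # returns a parsed action dict
        if action.get("action") == "DONE":
            return True
        execute(action)  # map the action onto real input events
        content = builder.build_verification_prompt(
            goal=goal, previous_action=action,
            screenshot=observe(), encode_image_fn=stub_encode,
        )
    return False
```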