openadapt_ml-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openadapt_ml/__init__.py +0 -0
- openadapt_ml/benchmarks/__init__.py +125 -0
- openadapt_ml/benchmarks/agent.py +825 -0
- openadapt_ml/benchmarks/azure.py +761 -0
- openadapt_ml/benchmarks/base.py +366 -0
- openadapt_ml/benchmarks/cli.py +884 -0
- openadapt_ml/benchmarks/data_collection.py +432 -0
- openadapt_ml/benchmarks/runner.py +381 -0
- openadapt_ml/benchmarks/waa.py +704 -0
- openadapt_ml/cloud/__init__.py +5 -0
- openadapt_ml/cloud/azure_inference.py +441 -0
- openadapt_ml/cloud/lambda_labs.py +2445 -0
- openadapt_ml/cloud/local.py +790 -0
- openadapt_ml/config.py +56 -0
- openadapt_ml/datasets/__init__.py +0 -0
- openadapt_ml/datasets/next_action.py +507 -0
- openadapt_ml/evals/__init__.py +23 -0
- openadapt_ml/evals/grounding.py +241 -0
- openadapt_ml/evals/plot_eval_metrics.py +174 -0
- openadapt_ml/evals/trajectory_matching.py +486 -0
- openadapt_ml/grounding/__init__.py +45 -0
- openadapt_ml/grounding/base.py +236 -0
- openadapt_ml/grounding/detector.py +570 -0
- openadapt_ml/ingest/__init__.py +43 -0
- openadapt_ml/ingest/capture.py +312 -0
- openadapt_ml/ingest/loader.py +232 -0
- openadapt_ml/ingest/synthetic.py +1102 -0
- openadapt_ml/models/__init__.py +0 -0
- openadapt_ml/models/api_adapter.py +171 -0
- openadapt_ml/models/base_adapter.py +59 -0
- openadapt_ml/models/dummy_adapter.py +42 -0
- openadapt_ml/models/qwen_vl.py +426 -0
- openadapt_ml/runtime/__init__.py +0 -0
- openadapt_ml/runtime/policy.py +182 -0
- openadapt_ml/schemas/__init__.py +53 -0
- openadapt_ml/schemas/sessions.py +122 -0
- openadapt_ml/schemas/validation.py +252 -0
- openadapt_ml/scripts/__init__.py +0 -0
- openadapt_ml/scripts/compare.py +1490 -0
- openadapt_ml/scripts/demo_policy.py +62 -0
- openadapt_ml/scripts/eval_policy.py +287 -0
- openadapt_ml/scripts/make_gif.py +153 -0
- openadapt_ml/scripts/prepare_synthetic.py +43 -0
- openadapt_ml/scripts/run_qwen_login_benchmark.py +192 -0
- openadapt_ml/scripts/train.py +174 -0
- openadapt_ml/training/__init__.py +0 -0
- openadapt_ml/training/benchmark_viewer.py +1538 -0
- openadapt_ml/training/shared_ui.py +157 -0
- openadapt_ml/training/stub_provider.py +276 -0
- openadapt_ml/training/trainer.py +2446 -0
- openadapt_ml/training/viewer.py +2970 -0
- openadapt_ml-0.1.0.dist-info/METADATA +818 -0
- openadapt_ml-0.1.0.dist-info/RECORD +55 -0
- openadapt_ml-0.1.0.dist-info/WHEEL +4 -0
- openadapt_ml-0.1.0.dist-info/licenses/LICENSE +21 -0
openadapt_ml/config.py
ADDED
@@ -0,0 +1,56 @@
from __future__ import annotations

from pydantic_settings import BaseSettings


class Settings(BaseSettings):
    """Application settings loaded from environment variables or .env file.

    Priority order for configuration values:
    1. Environment variables
    2. .env file
    3. Default values (None for API keys)
    """

    # VLM API Keys
    anthropic_api_key: str | None = None
    openai_api_key: str | None = None
    google_api_key: str | None = None

    # Azure credentials (for WAA benchmark on Azure)
    # These are used by DefaultAzureCredential for Service Principal auth
    azure_client_id: str | None = None
    azure_client_secret: str | None = None
    azure_tenant_id: str | None = None

    # Azure ML workspace config
    azure_subscription_id: str | None = None
    azure_ml_resource_group: str | None = None
    azure_ml_workspace_name: str | None = None

    # Azure VM settings (optional overrides)
    # D2_v3 = 2 vCPUs, 8GB RAM (fits free trial with existing usage)
    # D4_v3 = 4 vCPUs, 16GB RAM (needs 4 free vCPUs)
    # D8_v3 = 8 vCPUs, 32GB RAM (requires quota increase)
    azure_vm_size: str = "Standard_D2_v3"
    # Docker image for WAA agent container
    # Default is Docker Hub; setup_azure.py will set this to ACR image
    azure_docker_image: str = "docker.io/windowsarena/winarena:latest"

    # Azure Storage for async inference queue (Phase 2)
    azure_storage_connection_string: str | None = None
    azure_inference_queue_name: str = "inference-jobs"
    azure_checkpoints_container: str = "checkpoints"
    azure_comparisons_container: str = "comparisons"

    # Lambda Labs (cloud GPU for training)
    lambda_api_key: str | None = None

    model_config = {
        "env_file": ".env",
        "env_file_encoding": "utf-8",
        "extra": "ignore",  # ignore extra env vars
    }


settings = Settings()
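A minimal usage sketch (not part of the packaged file), assuming standard pydantic-settings behavior: values resolve from environment variables first, then the .env file, then the defaults declared above. The environment variable value below is purely illustrative.

import os

# Assumption: pydantic-settings maps environment variables to field names
# case-insensitively, so OPENAI_API_KEY populates Settings.openai_api_key.
os.environ["OPENAI_API_KEY"] = "sk-example"  # illustrative placeholder, not a real key

from openadapt_ml.config import settings  # module-level singleton defined above

print(settings.openai_api_key)  # -> "sk-example" (taken from the environment)
print(settings.azure_vm_size)   # -> "Standard_D2_v3" (falls back to the class default)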
openadapt_ml/datasets/__init__.py
File without changes

openadapt_ml/datasets/next_action.py
ADDED
@@ -0,0 +1,507 @@
from __future__ import annotations

from dataclasses import dataclass
from typing import Any, Dict, List

import torch
from torch.utils.data import Dataset

from openadapt_ml.schemas.sessions import Action, Episode, Step


# Coordinate-based DSL system prompt (original)
SYSTEM_PROMPT = (
    "You are a GUI automation agent. Given a screenshot and a user goal, "
    "predict the single next action.\n\n"
    "COORDINATE SYSTEM:\n"
    "- x=0.0 is the LEFT edge, x=1.0 is the RIGHT edge\n"
    "- y=0.0 is the TOP edge, y=1.0 is the BOTTOM edge\n"
    "- To click the CENTER of an element, estimate its center position as a fraction of screen width/height\n"
    "- Example: An element in the middle of the screen would be approximately x=0.5, y=0.5\n\n"
    "ALLOWED ACTIONS (use exactly this format):\n"
    "- CLICK(x=0.XX, y=0.XX) → click at normalized coordinates\n"
    "- TYPE(text=\"...\") → type text into the currently focused field\n"
    "- WAIT() → wait for UI to update\n"
    "- DONE() → task is complete\n\n"
    "RESPONSE FORMAT (required):\n"
    "Thought: [Brief reasoning: what element to interact with and why]\n"
    "Action: [Exactly one action, e.g., CLICK(x=0.35, y=0.42)]\n\n"
    "IMPORTANT: Output coordinates with 2 decimal places. Estimate the center of target elements."
)

# Set-of-Marks (SoM) DSL system prompt - uses element indices instead of coordinates
SYSTEM_PROMPT_SOM = (
    "You are a GUI automation agent. Given a screenshot and a user goal, "
    "predict the single next action.\n\n"
    "INTERACTIVE ELEMENTS:\n"
    "The screenshot shows numbered labels [1], [2], [3], etc. on interactive UI elements.\n"
    "These labels indicate clickable elements like buttons, text fields, links, etc.\n\n"
    "ELEMENT LABELS ON THIS LOGIN SCREEN:\n"
    "[1] = Username text field\n"
    "[2] = Password text field\n"
    "[3] = Login button\n\n"
    "ALLOWED ACTIONS (use exactly this format):\n"
    "- CLICK([N]) → click element with number N to focus/activate it\n"
    "- TYPE([N], \"text\") → type text into element N (e.g., TYPE([2], \"hello\"))\n"
    "- WAIT() → wait for UI to update\n"
    "- DONE() → task is complete\n\n"
    "ACTION SEQUENCE FOR LOGIN:\n"
    "1. CLICK([1]) to focus username field\n"
    "2. TYPE([1], \"username\") to enter username\n"
    "3. CLICK([2]) to focus password field\n"
    "4. TYPE([2], \"password\") to enter password\n"
    "5. CLICK([3]) to submit login\n"
    "6. DONE() when login is complete\n\n"
    "RESPONSE FORMAT (required):\n"
    "Thought: [Brief reasoning: which numbered element to interact with and why]\n"
    "Action: [Exactly one action from the sequence above]\n\n"
    "IMPORTANT: Follow the action sequence step by step. Each step must be done separately."
)

# SoM prompt for registration scenario
SYSTEM_PROMPT_SOM_REGISTRATION = (
    "You are a GUI automation agent. Given a screenshot and a user goal, "
    "predict the single next action.\n\n"
    "INTERACTIVE ELEMENTS:\n"
    "The screenshot shows numbered labels [1], [2], [3], etc. on interactive UI elements.\n"
    "These labels indicate clickable elements like buttons, text fields, links, etc.\n\n"
    "ELEMENT LABELS ON THIS REGISTRATION SCREEN:\n"
    "[1] = First Name text field\n"
    "[2] = Last Name text field\n"
    "[3] = Email text field\n"
    "[4] = Password text field\n"
    "[5] = Confirm Password text field\n"
    "[6] = Register button\n\n"
    "ALLOWED ACTIONS (use exactly this format):\n"
    "- CLICK([N]) → click element with number N to focus/activate it\n"
    "- TYPE([N], \"text\") → type text into element N (e.g., TYPE([2], \"hello\"))\n"
    "- WAIT() → wait for UI to update\n"
    "- DONE() → task is complete\n\n"
    "ACTION SEQUENCE FOR REGISTRATION:\n"
    "1. CLICK([1]) to focus first name field\n"
    "2. TYPE([1], \"name\") to enter first name\n"
    "3. CLICK([2]) to focus last name field\n"
    "4. TYPE([2], \"name\") to enter last name\n"
    "5. CLICK([3]) to focus email field\n"
    "6. TYPE([3], \"email\") to enter email\n"
    "7. CLICK([4]) to focus password field\n"
    "8. TYPE([4], \"pass\") to enter password\n"
    "9. CLICK([5]) to focus confirm password field\n"
    "10. TYPE([5], \"pass\") to enter confirmation\n"
    "11. CLICK([6]) to submit registration\n"
    "12. DONE() when registration is complete\n\n"
    "RESPONSE FORMAT (required):\n"
    "Thought: [Brief reasoning: which numbered element to interact with and why]\n"
    "Action: [Exactly one action from the sequence above]\n\n"
    "IMPORTANT: Follow the action sequence step by step. Each step must be done separately."
)


def format_action(action: Action, use_som: bool = False) -> str:
    """Serialize an Action into a simple textual command.

    For v1 we support a small subset:
    - click: CLICK(x=0.42, y=0.73) or CLICK([1]) in SoM mode
    - type: TYPE(text="hello") or TYPE([1], "hello") in SoM mode
    - wait: WAIT()
    - done: DONE()
    Other types fall back to a generic representation.

    Args:
        action: The action to format.
        use_som: If True, use Set-of-Marks (SoM) index-based format instead of
            coordinate-based format. Requires element_index to be set.
    """

    t = action.type
    if use_som:
        # SoM mode: use element indices instead of coordinates
        if t == "click" and action.element_index is not None:
            return f"CLICK([{action.element_index}])"
        if t == "type" and action.text is not None:
            escaped = action.text.replace("\\", "\\\\").replace("\"", "\\\"")
            if action.element_index is not None:
                return f"TYPE([{action.element_index}], \"{escaped}\")"
            else:
                # Fallback: TYPE without element reference (for focused field)
                return f"TYPE(\"{escaped}\")"
        if t == "wait":
            return "WAIT()"
        if t == "done":
            return "DONE()"
        # Fallback
        return f"ACTION(type={t})"
    else:
        # Coordinate mode (original)
        if t == "click" and action.x is not None and action.y is not None:
            return f"CLICK(x={action.x:.2f}, y={action.y:.2f})"
        if t == "type" and action.text is not None:
            escaped = action.text.replace("\\", "\\\\").replace("\"", "\\\"")
            return f"TYPE(text=\"{escaped}\")"
        if t == "wait":
            return "WAIT()"
        if t == "done":
            return "DONE()"
        # Fallback
        return f"ACTION(type={t})"


def parse_action_som(text: str) -> Action:
    """Parse a SoM-style action string into an Action object.

    Supported formats:
    - CLICK([N]) → click element N
    - TYPE([N], "text") → type text into element N
    - TYPE("text") → type text into focused field
    - WAIT() → wait
    - DONE() → done

    Returns Action with element_index set for click/type actions.
    """
    import re

    text = text.strip()

    # CLICK([N])
    match = re.match(r"CLICK\(\[(\d+)\]\)", text)
    if match:
        idx = int(match.group(1))
        return Action(type="click", element_index=idx)

    # TYPE([N], "text") or TYPE([N], 'text')
    match = re.match(r'TYPE\(\[(\d+)\],\s*["\'](.*)["\']\)', text, re.DOTALL)
    if match:
        idx = int(match.group(1))
        content = match.group(2).replace("\\\"", "\"").replace("\\\\", "\\")
        return Action(type="type", text=content, element_index=idx)

    # TYPE("text") - no element index
    match = re.match(r'TYPE\(["\'](.*)["\']\)', text, re.DOTALL)
    if match:
        content = match.group(1).replace("\\\"", "\"").replace("\\\\", "\\")
        return Action(type="type", text=content)

    # WAIT()
    if text.upper().startswith("WAIT"):
        return Action(type="wait")

    # DONE()
    if text.upper().startswith("DONE"):
        return Action(type="done")

    # Failed to parse
    return Action(type="failed", raw={"text": text})


def _generate_generic_thought(step_index: int, step: Step, goal: str, total_steps: int) -> str:
    """Generate a thought for real captures (non-synthetic scenarios).

    This creates action-appropriate thoughts that teach the model to output
    the correct DSL format while connecting actions to the goal.
    """
    action = step.action
    t = action.type

    # Progress context
    progress = f"Step {step_index + 1} of {total_steps}."

    if t == "click":
        if action.x is not None and action.y is not None:
            # Describe the click location relative to screen regions
            x, y = action.x, action.y
            h_pos = "left" if x < 0.33 else ("center" if x < 0.66 else "right")
            v_pos = "top" if y < 0.33 else ("middle" if y < 0.66 else "bottom")
            return (
                f"{progress} To progress toward '{goal}', I need to click on an element "
                f"in the {v_pos}-{h_pos} area of the screen."
            )
        return f"{progress} I need to click on the relevant UI element to continue toward '{goal}'."

    if t == "double_click":
        return f"{progress} I need to double-click to select or activate this element for '{goal}'."

    if t == "type":
        if action.text:
            # Don't reveal the actual text, just indicate typing is needed
            return f"{progress} I need to type text into the focused input field to continue toward '{goal}'."
        return f"{progress} I need to enter text in the current field."

    if t == "scroll":
        return f"{progress} I need to scroll to reveal more content or reach the target element for '{goal}'."

    if t == "drag":
        return f"{progress} I need to drag an element to complete this part of '{goal}'."

    if t == "key_press":
        return f"{progress} I need to press a key to continue the workflow."

    if t == "wait":
        return f"{progress} I should wait for the UI to update before the next action."

    if t == "done":
        return f"The goal '{goal}' has been achieved. The workflow is complete."

    # Fallback
    return f"{progress} Taking the next action to progress toward '{goal}'."


def _generate_thought_for_step(
    step_index: int,
    step: Step,
    goal: str,
    scenario: str = "login",
    total_steps: int = 6,
) -> str:
    """Generate a simple but semantically meaningful Thought for a step.

    This handles both login (6 steps) and registration (12 steps) workflows,
    as well as generic real-world captures.
    The goal text is included where helpful so the model can learn to connect
    actions back to the stated objective.
    """

    action = step.action
    t = action.type

    if scenario == "registration":
        return _generate_registration_thought(step_index, step, goal, total_steps)
    elif scenario == "login" and total_steps <= 7:
        # Only use login-specific thoughts for actual login scenarios (6-7 steps)
        return _generate_login_thought(step_index, step, goal, total_steps)
    else:
        # Use generic thoughts for real captures and other scenarios
        return _generate_generic_thought(step_index, step, goal, total_steps)


def _generate_login_thought(step_index: int, step: Step, goal: str, total_steps: int) -> str:
    """Generate thought for login scenario (6 steps)."""
    action = step.action
    t = action.type

    # Step 0: click username field
    if step_index == 0 and t == "click":
        return (
            "I see a login screen with empty username and password fields and a Login button. "
            f"To start logging in, I need to click on the username field to focus it ({goal})."
        )

    # Step 1: type username
    if step_index == 1 and t == "type":
        return (
            "The username field is focused. To move toward the login goal, I should type the "
            "username into this field."
        )

    # Step 2: click password field
    if step_index == 2 and t == "click":
        return (
            "The username has been entered. Next, I need to focus the password field so that I can "
            "enter the password for this login. I will click on the password input box."
        )

    # Step 3: type password
    if step_index == 3 and t == "type":
        return (
            "The password field is focused. To continue the login process, I should type the "
            "password (which will appear as masked characters on the screen)."
        )

    # Step 4: click Login button
    if step_index == 4 and t == "click":
        return (
            "Both the username and password have been entered. To submit the form and attempt the "
            "login, I should click the Login button."
        )

    # Step 5: DONE on logged-in screen
    if step_index == 5 and t == "done":
        return (
            "I now see a logged-in confirmation screen indicating the goal has been satisfied. "
            "The task is complete, so I should emit DONE()."
        )

    # Fallback for any unexpected cases
    return (
        "Based on the current screen and the login goal, I will take the next action that moves "
        "the workflow forward."
    )


def _generate_registration_thought(step_index: int, step: Step, goal: str, total_steps: int) -> str:
    """Generate thought for registration scenario (12 steps)."""
    action = step.action
    t = action.type

    # Registration step mapping (pairs of click + type for 5 fields, then submit + done)
    thoughts = {
        (0, "click"): (
            "I see a registration form with empty fields for name, email, and password. "
            f"To start registration, I need to click on the First Name field ({goal})."
        ),
        (1, "type"): (
            "The First Name field is focused. I should type the first name."
        ),
        (2, "click"): (
            "First name entered. Now I need to focus the Last Name field to enter it."
        ),
        (3, "type"): (
            "The Last Name field is focused. I should type the last name."
        ),
        (4, "click"): (
            "Last name entered. Now I need to focus the Email field to enter the email address."
        ),
        (5, "type"): (
            "The Email field is focused. I should type the email address."
        ),
        (6, "click"): (
            "Email entered. Now I need to focus the Password field to create a password."
        ),
        (7, "type"): (
            "The Password field is focused. I should type the password."
        ),
        (8, "click"): (
            "Password entered. Now I need to focus the Confirm Password field to verify the password."
        ),
        (9, "type"): (
            "The Confirm Password field is focused. I should type the same password again."
        ),
        (10, "click"): (
            "All form fields are filled. I should click the Register button to submit the form."
        ),
        (11, "done"): (
            "Registration is complete - I see a success screen. The task is finished."
        ),
    }

    key = (step_index, t)
    if key in thoughts:
        return thoughts[key]

    # Fallback
    return (
        "Based on the current screen and the registration goal, I will take the next action "
        "that moves the workflow forward."
    )


def _detect_scenario(episode: Episode) -> str:
    """Detect scenario from episode workflow_id."""
    workflow_id = episode.workflow_id or ""
    if "registration" in workflow_id.lower():
        return "registration"
    return "login"


def build_next_action_sft_samples(
    episodes: List[Episode],
    use_som: bool = False,
) -> List[Dict[str, Any]]:
    """Convert Episodes into goal-conditioned next-action SFT samples.

    One sample per step (including terminal DONE), with structure:
    {
        "images": [image_path],
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_content},
            {"role": "assistant", "content": action_text},
        ],
    }

    Args:
        episodes: List of episodes to convert.
        use_som: If True, use Set-of-Marks (SoM) DSL with element indices
            instead of coordinate-based DSL.
    """

    samples: List[Dict[str, Any]] = []

    for episode in episodes:
        goal = episode.goal
        total_steps = len(episode.steps)
        scenario = _detect_scenario(episode)

        # Select appropriate system prompt based on mode and scenario
        if use_som:
            if scenario == "registration":
                system_prompt = SYSTEM_PROMPT_SOM_REGISTRATION
            else:
                system_prompt = SYSTEM_PROMPT_SOM
        else:
            system_prompt = SYSTEM_PROMPT

        for step_index, step in enumerate(episode.steps):
            image_path = step.observation.image_path
            if not image_path:
                # Skip steps without an associated image
                continue

            # Build action history from previous steps
            action_history = []
            for prev_idx in range(step_index):
                prev_step = episode.steps[prev_idx]
                prev_action_text = format_action(prev_step.action, use_som=use_som)
                action_history.append(prev_action_text)

            # Build history section for both modes - use actual step count
            if action_history:
                history_text = "ACTIONS COMPLETED SO FAR:\n"
                for i, action_text in enumerate(action_history, 1):
                    history_text += f" {i}. {action_text}\n"
                history_text += f"\nThis is step {step_index + 1} of {total_steps}. "
            else:
                history_text = f"This is step 1 of {total_steps} (no actions completed yet). "

            if use_som:
                user_content = (
                    f"Goal: {goal}\n\n"
                    f"{history_text}"
                    "Look at the screenshot and determine the NEXT action.\n\n"
                    "Thought: [which numbered element to interact with and why]\n"
                    "Action: [CLICK([N]) or TYPE([N], \"text\") or WAIT() or DONE()]"
                )
            else:
                user_content = (
                    f"Goal: {goal}\n\n"
                    f"{history_text}"
                    "Look at the screenshot and determine the NEXT action.\n\n"
                    "Thought: [what element to interact with and why]\n"
                    "Action: [CLICK(x=..., y=...) or TYPE(text=\"...\") or WAIT() or DONE()]"
                )

            # Provide a deterministic, semantically meaningful Thought while supervising
            # the exact DSL Action.
            action_text = format_action(step.action, use_som=use_som)
            thought_text = _generate_thought_for_step(step_index, step, goal, scenario, total_steps)
            assistant_content = f"Thought: {thought_text}\nAction: {action_text}"

            sample = {
                "images": [image_path],
                "messages": [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_content},
                    {"role": "assistant", "content": assistant_content},
                ],
            }
            samples.append(sample)

    return samples


@dataclass
class NextActionSample:
    images: List[str]
    messages: List[Dict[str, str]]


class NextActionDataset(Dataset):
    """Thin PyTorch Dataset wrapper around pre-built SFT samples."""

    def __init__(self, samples: List[Dict[str, Any]]):
        self._samples = samples

    def __len__(self) -> int:  # type: ignore[override]
        return len(self._samples)

    def __getitem__(self, idx: int) -> Dict[str, Any]:  # type: ignore[override]
        return self._samples[idx]
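A short round-trip sketch (not part of the wheel) illustrating the two DSL serializations and the thin dataset wrapper defined above. It assumes Action accepts the optional keyword fields the module reads (x, y, text, element_index); the image path and message contents are placeholders.

from openadapt_ml.datasets.next_action import (
    NextActionDataset,
    format_action,
    parse_action_som,
)
from openadapt_ml.schemas.sessions import Action

# Coordinate-based DSL vs. Set-of-Marks DSL for the serialized action text.
print(format_action(Action(type="click", x=0.35, y=0.42)))                              # CLICK(x=0.35, y=0.42)
print(format_action(Action(type="type", text="alice", element_index=1), use_som=True))  # TYPE([1], "alice")

# parse_action_som inverts the SoM serialization back into an Action.
parsed = parse_action_som('TYPE([2], "hello")')
assert parsed.type == "type" and parsed.element_index == 2 and parsed.text == "hello"

# Samples built by build_next_action_sft_samples are plain dicts;
# NextActionDataset simply indexes into the prebuilt list.
sample = {
    "images": ["step_000.png"],  # placeholder screenshot path
    "messages": [
        {"role": "system", "content": "..."},
        {"role": "user", "content": "Goal: log into the app\n\n..."},
        {"role": "assistant", "content": "Thought: ...\nAction: CLICK([1])"},
    ],
}
dataset = NextActionDataset([sample])
assert len(dataset) == 1 and dataset[0]["images"] == ["step_000.png"]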
openadapt_ml/evals/__init__.py
ADDED
@@ -0,0 +1,23 @@
"""Evaluation modules for openadapt-ml.
|
|
2
|
+
|
|
3
|
+
This package provides evaluation metrics and utilities for measuring
|
|
4
|
+
model performance on GUI automation tasks.
|
|
5
|
+
|
|
6
|
+
Modules:
|
|
7
|
+
- grounding: Grounding-specific metrics (IoU, hit rate, latency)
|
|
8
|
+
- trajectory_matching: Trajectory comparison metrics (existing)
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from openadapt_ml.evals.grounding import (
|
|
12
|
+
GroundingMetrics,
|
|
13
|
+
GroundingResult,
|
|
14
|
+
evaluate_grounder,
|
|
15
|
+
evaluate_grounder_on_episode,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
__all__ = [
|
|
19
|
+
"GroundingMetrics",
|
|
20
|
+
"GroundingResult",
|
|
21
|
+
"evaluate_grounder",
|
|
22
|
+
"evaluate_grounder_on_episode",
|
|
23
|
+
]
|