openadapt-ml 0.1.0 (openadapt_ml-0.1.0-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. openadapt_ml/__init__.py +0 -0
  2. openadapt_ml/benchmarks/__init__.py +125 -0
  3. openadapt_ml/benchmarks/agent.py +825 -0
  4. openadapt_ml/benchmarks/azure.py +761 -0
  5. openadapt_ml/benchmarks/base.py +366 -0
  6. openadapt_ml/benchmarks/cli.py +884 -0
  7. openadapt_ml/benchmarks/data_collection.py +432 -0
  8. openadapt_ml/benchmarks/runner.py +381 -0
  9. openadapt_ml/benchmarks/waa.py +704 -0
  10. openadapt_ml/cloud/__init__.py +5 -0
  11. openadapt_ml/cloud/azure_inference.py +441 -0
  12. openadapt_ml/cloud/lambda_labs.py +2445 -0
  13. openadapt_ml/cloud/local.py +790 -0
  14. openadapt_ml/config.py +56 -0
  15. openadapt_ml/datasets/__init__.py +0 -0
  16. openadapt_ml/datasets/next_action.py +507 -0
  17. openadapt_ml/evals/__init__.py +23 -0
  18. openadapt_ml/evals/grounding.py +241 -0
  19. openadapt_ml/evals/plot_eval_metrics.py +174 -0
  20. openadapt_ml/evals/trajectory_matching.py +486 -0
  21. openadapt_ml/grounding/__init__.py +45 -0
  22. openadapt_ml/grounding/base.py +236 -0
  23. openadapt_ml/grounding/detector.py +570 -0
  24. openadapt_ml/ingest/__init__.py +43 -0
  25. openadapt_ml/ingest/capture.py +312 -0
  26. openadapt_ml/ingest/loader.py +232 -0
  27. openadapt_ml/ingest/synthetic.py +1102 -0
  28. openadapt_ml/models/__init__.py +0 -0
  29. openadapt_ml/models/api_adapter.py +171 -0
  30. openadapt_ml/models/base_adapter.py +59 -0
  31. openadapt_ml/models/dummy_adapter.py +42 -0
  32. openadapt_ml/models/qwen_vl.py +426 -0
  33. openadapt_ml/runtime/__init__.py +0 -0
  34. openadapt_ml/runtime/policy.py +182 -0
  35. openadapt_ml/schemas/__init__.py +53 -0
  36. openadapt_ml/schemas/sessions.py +122 -0
  37. openadapt_ml/schemas/validation.py +252 -0
  38. openadapt_ml/scripts/__init__.py +0 -0
  39. openadapt_ml/scripts/compare.py +1490 -0
  40. openadapt_ml/scripts/demo_policy.py +62 -0
  41. openadapt_ml/scripts/eval_policy.py +287 -0
  42. openadapt_ml/scripts/make_gif.py +153 -0
  43. openadapt_ml/scripts/prepare_synthetic.py +43 -0
  44. openadapt_ml/scripts/run_qwen_login_benchmark.py +192 -0
  45. openadapt_ml/scripts/train.py +174 -0
  46. openadapt_ml/training/__init__.py +0 -0
  47. openadapt_ml/training/benchmark_viewer.py +1538 -0
  48. openadapt_ml/training/shared_ui.py +157 -0
  49. openadapt_ml/training/stub_provider.py +276 -0
  50. openadapt_ml/training/trainer.py +2446 -0
  51. openadapt_ml/training/viewer.py +2970 -0
  52. openadapt_ml-0.1.0.dist-info/METADATA +818 -0
  53. openadapt_ml-0.1.0.dist-info/RECORD +55 -0
  54. openadapt_ml-0.1.0.dist-info/WHEEL +4 -0
  55. openadapt_ml-0.1.0.dist-info/licenses/LICENSE +21 -0
openadapt_ml/config.py ADDED
@@ -0,0 +1,56 @@
+ from __future__ import annotations
+
+ from pydantic_settings import BaseSettings
+
+
+ class Settings(BaseSettings):
+     """Application settings loaded from environment variables or .env file.
+
+     Priority order for configuration values:
+     1. Environment variables
+     2. .env file
+     3. Default values (None for API keys)
+     """
+
+     # VLM API Keys
+     anthropic_api_key: str | None = None
+     openai_api_key: str | None = None
+     google_api_key: str | None = None
+
+     # Azure credentials (for WAA benchmark on Azure)
+     # These are used by DefaultAzureCredential for Service Principal auth
+     azure_client_id: str | None = None
+     azure_client_secret: str | None = None
+     azure_tenant_id: str | None = None
+
+     # Azure ML workspace config
+     azure_subscription_id: str | None = None
+     azure_ml_resource_group: str | None = None
+     azure_ml_workspace_name: str | None = None
+
+     # Azure VM settings (optional overrides)
+     # D2_v3 = 2 vCPUs, 8GB RAM (fits free trial with existing usage)
+     # D4_v3 = 4 vCPUs, 16GB RAM (needs 4 free vCPUs)
+     # D8_v3 = 8 vCPUs, 32GB RAM (requires quota increase)
+     azure_vm_size: str = "Standard_D2_v3"
+     # Docker image for WAA agent container
+     # Default is Docker Hub; setup_azure.py will set this to ACR image
+     azure_docker_image: str = "docker.io/windowsarena/winarena:latest"
+
+     # Azure Storage for async inference queue (Phase 2)
+     azure_storage_connection_string: str | None = None
+     azure_inference_queue_name: str = "inference-jobs"
+     azure_checkpoints_container: str = "checkpoints"
+     azure_comparisons_container: str = "comparisons"
+
+     # Lambda Labs (cloud GPU for training)
+     lambda_api_key: str | None = None
+
+     model_config = {
+         "env_file": ".env",
+         "env_file_encoding": "utf-8",
+         "extra": "ignore",  # ignore extra env vars
+     }
+
+
+ settings = Settings()
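Note: the module above exposes configuration as a module-level singleton. A minimal usage sketch follows, assuming only what the diff shows (the settings singleton and pydantic-settings' default case-insensitive mapping from field names to environment variables); the .env values are illustrative placeholders, not real credentials.

# Hypothetical .env read by Settings via pydantic-settings (placeholder values):
#   ANTHROPIC_API_KEY=sk-ant-...
#   AZURE_SUBSCRIPTION_ID=00000000-0000-0000-0000-000000000000
#   AZURE_VM_SIZE=Standard_D4_v3

from openadapt_ml.config import settings

# AZURE_VM_SIZE in the environment or .env overrides the "Standard_D2_v3" default.
print(settings.azure_vm_size)

# API keys default to None when neither the environment nor .env provides them.
if settings.anthropic_api_key is None:
    print("No Anthropic key configured; API-backed adapters would be unavailable.")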
openadapt_ml/datasets/next_action.py ADDED
@@ -0,0 +1,507 @@
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+ from typing import Any, Dict, List
+
+ import torch
+ from torch.utils.data import Dataset
+
+ from openadapt_ml.schemas.sessions import Action, Episode, Step
+
+
+ # Coordinate-based DSL system prompt (original)
+ SYSTEM_PROMPT = (
+     "You are a GUI automation agent. Given a screenshot and a user goal, "
+     "predict the single next action.\n\n"
+     "COORDINATE SYSTEM:\n"
+     "- x=0.0 is the LEFT edge, x=1.0 is the RIGHT edge\n"
+     "- y=0.0 is the TOP edge, y=1.0 is the BOTTOM edge\n"
+     "- To click the CENTER of an element, estimate its center position as a fraction of screen width/height\n"
+     "- Example: An element in the middle of the screen would be approximately x=0.5, y=0.5\n\n"
+     "ALLOWED ACTIONS (use exactly this format):\n"
+     "- CLICK(x=0.XX, y=0.XX) → click at normalized coordinates\n"
+     "- TYPE(text=\"...\") → type text into the currently focused field\n"
+     "- WAIT() → wait for UI to update\n"
+     "- DONE() → task is complete\n\n"
+     "RESPONSE FORMAT (required):\n"
+     "Thought: [Brief reasoning: what element to interact with and why]\n"
+     "Action: [Exactly one action, e.g., CLICK(x=0.35, y=0.42)]\n\n"
+     "IMPORTANT: Output coordinates with 2 decimal places. Estimate the center of target elements."
+ )
+
+ # Set-of-Marks (SoM) DSL system prompt - uses element indices instead of coordinates
+ SYSTEM_PROMPT_SOM = (
+     "You are a GUI automation agent. Given a screenshot and a user goal, "
+     "predict the single next action.\n\n"
+     "INTERACTIVE ELEMENTS:\n"
+     "The screenshot shows numbered labels [1], [2], [3], etc. on interactive UI elements.\n"
+     "These labels indicate clickable elements like buttons, text fields, links, etc.\n\n"
+     "ELEMENT LABELS ON THIS LOGIN SCREEN:\n"
+     "[1] = Username text field\n"
+     "[2] = Password text field\n"
+     "[3] = Login button\n\n"
+     "ALLOWED ACTIONS (use exactly this format):\n"
+     "- CLICK([N]) → click element with number N to focus/activate it\n"
+     "- TYPE([N], \"text\") → type text into element N (e.g., TYPE([2], \"hello\"))\n"
+     "- WAIT() → wait for UI to update\n"
+     "- DONE() → task is complete\n\n"
+     "ACTION SEQUENCE FOR LOGIN:\n"
+     "1. CLICK([1]) to focus username field\n"
+     "2. TYPE([1], \"username\") to enter username\n"
+     "3. CLICK([2]) to focus password field\n"
+     "4. TYPE([2], \"password\") to enter password\n"
+     "5. CLICK([3]) to submit login\n"
+     "6. DONE() when login is complete\n\n"
+     "RESPONSE FORMAT (required):\n"
+     "Thought: [Brief reasoning: which numbered element to interact with and why]\n"
+     "Action: [Exactly one action from the sequence above]\n\n"
+     "IMPORTANT: Follow the action sequence step by step. Each step must be done separately."
+ )
+
+ # SoM prompt for registration scenario
+ SYSTEM_PROMPT_SOM_REGISTRATION = (
+     "You are a GUI automation agent. Given a screenshot and a user goal, "
+     "predict the single next action.\n\n"
+     "INTERACTIVE ELEMENTS:\n"
+     "The screenshot shows numbered labels [1], [2], [3], etc. on interactive UI elements.\n"
+     "These labels indicate clickable elements like buttons, text fields, links, etc.\n\n"
+     "ELEMENT LABELS ON THIS REGISTRATION SCREEN:\n"
+     "[1] = First Name text field\n"
+     "[2] = Last Name text field\n"
+     "[3] = Email text field\n"
+     "[4] = Password text field\n"
+     "[5] = Confirm Password text field\n"
+     "[6] = Register button\n\n"
+     "ALLOWED ACTIONS (use exactly this format):\n"
+     "- CLICK([N]) → click element with number N to focus/activate it\n"
+     "- TYPE([N], \"text\") → type text into element N (e.g., TYPE([2], \"hello\"))\n"
+     "- WAIT() → wait for UI to update\n"
+     "- DONE() → task is complete\n\n"
+     "ACTION SEQUENCE FOR REGISTRATION:\n"
+     "1. CLICK([1]) to focus first name field\n"
+     "2. TYPE([1], \"name\") to enter first name\n"
+     "3. CLICK([2]) to focus last name field\n"
+     "4. TYPE([2], \"name\") to enter last name\n"
+     "5. CLICK([3]) to focus email field\n"
+     "6. TYPE([3], \"email\") to enter email\n"
+     "7. CLICK([4]) to focus password field\n"
+     "8. TYPE([4], \"pass\") to enter password\n"
+     "9. CLICK([5]) to focus confirm password field\n"
+     "10. TYPE([5], \"pass\") to enter confirmation\n"
+     "11. CLICK([6]) to submit registration\n"
+     "12. DONE() when registration is complete\n\n"
+     "RESPONSE FORMAT (required):\n"
+     "Thought: [Brief reasoning: which numbered element to interact with and why]\n"
+     "Action: [Exactly one action from the sequence above]\n\n"
+     "IMPORTANT: Follow the action sequence step by step. Each step must be done separately."
+ )
+
+
+ def format_action(action: Action, use_som: bool = False) -> str:
+     """Serialize an Action into a simple textual command.
+
+     For v1 we support a small subset:
+     - click: CLICK(x=0.42, y=0.73) or CLICK([1]) in SoM mode
+     - type: TYPE(text="hello") or TYPE([1], "hello") in SoM mode
+     - wait: WAIT()
+     - done: DONE()
+     Other types fall back to a generic representation.
+
+     Args:
+         action: The action to format.
+         use_som: If True, use Set-of-Marks (SoM) index-based format instead of
+             coordinate-based format. Requires element_index to be set.
+     """
+
+     t = action.type
+     if use_som:
+         # SoM mode: use element indices instead of coordinates
+         if t == "click" and action.element_index is not None:
+             return f"CLICK([{action.element_index}])"
+         if t == "type" and action.text is not None:
+             escaped = action.text.replace("\\", "\\\\").replace("\"", "\\\"")
+             if action.element_index is not None:
+                 return f"TYPE([{action.element_index}], \"{escaped}\")"
+             else:
+                 # Fallback: TYPE without element reference (for focused field)
+                 return f"TYPE(\"{escaped}\")"
+         if t == "wait":
+             return "WAIT()"
+         if t == "done":
+             return "DONE()"
+         # Fallback
+         return f"ACTION(type={t})"
+     else:
+         # Coordinate mode (original)
+         if t == "click" and action.x is not None and action.y is not None:
+             return f"CLICK(x={action.x:.2f}, y={action.y:.2f})"
+         if t == "type" and action.text is not None:
+             escaped = action.text.replace("\\", "\\\\").replace("\"", "\\\"")
+             return f"TYPE(text=\"{escaped}\")"
+         if t == "wait":
+             return "WAIT()"
+         if t == "done":
+             return "DONE()"
+         # Fallback
+         return f"ACTION(type={t})"
+
+
+ def parse_action_som(text: str) -> Action:
+     """Parse a SoM-style action string into an Action object.
+
+     Supported formats:
+     - CLICK([N]) → click element N
+     - TYPE([N], "text") → type text into element N
+     - TYPE("text") → type text into focused field
+     - WAIT() → wait
+     - DONE() → done
+
+     Returns Action with element_index set for click/type actions.
+     """
+     import re
+
+     text = text.strip()
+
+     # CLICK([N])
+     match = re.match(r"CLICK\(\[(\d+)\]\)", text)
+     if match:
+         idx = int(match.group(1))
+         return Action(type="click", element_index=idx)
+
+     # TYPE([N], "text") or TYPE([N], 'text')
+     match = re.match(r'TYPE\(\[(\d+)\],\s*["\'](.*)["\']\)', text, re.DOTALL)
+     if match:
+         idx = int(match.group(1))
+         content = match.group(2).replace("\\\"", "\"").replace("\\\\", "\\")
+         return Action(type="type", text=content, element_index=idx)
+
+     # TYPE("text") - no element index
+     match = re.match(r'TYPE\(["\'](.*)["\']\)', text, re.DOTALL)
+     if match:
+         content = match.group(1).replace("\\\"", "\"").replace("\\\\", "\\")
+         return Action(type="type", text=content)
+
+     # WAIT()
+     if text.upper().startswith("WAIT"):
+         return Action(type="wait")
+
+     # DONE()
+     if text.upper().startswith("DONE"):
+         return Action(type="done")
+
+     # Failed to parse
+     return Action(type="failed", raw={"text": text})
+
+
+ def _generate_generic_thought(step_index: int, step: Step, goal: str, total_steps: int) -> str:
+     """Generate a thought for real captures (non-synthetic scenarios).
+
+     This creates action-appropriate thoughts that teach the model to output
+     the correct DSL format while connecting actions to the goal.
+     """
+     action = step.action
+     t = action.type
+
+     # Progress context
+     progress = f"Step {step_index + 1} of {total_steps}."
+
+     if t == "click":
+         if action.x is not None and action.y is not None:
+             # Describe the click location relative to screen regions
+             x, y = action.x, action.y
+             h_pos = "left" if x < 0.33 else ("center" if x < 0.66 else "right")
+             v_pos = "top" if y < 0.33 else ("middle" if y < 0.66 else "bottom")
+             return (
+                 f"{progress} To progress toward '{goal}', I need to click on an element "
+                 f"in the {v_pos}-{h_pos} area of the screen."
+             )
+         return f"{progress} I need to click on the relevant UI element to continue toward '{goal}'."
+
+     if t == "double_click":
+         return f"{progress} I need to double-click to select or activate this element for '{goal}'."
+
+     if t == "type":
+         if action.text:
+             # Don't reveal the actual text, just indicate typing is needed
+             return f"{progress} I need to type text into the focused input field to continue toward '{goal}'."
+         return f"{progress} I need to enter text in the current field."
+
+     if t == "scroll":
+         return f"{progress} I need to scroll to reveal more content or reach the target element for '{goal}'."
+
+     if t == "drag":
+         return f"{progress} I need to drag an element to complete this part of '{goal}'."
+
+     if t == "key_press":
+         return f"{progress} I need to press a key to continue the workflow."
+
+     if t == "wait":
+         return f"{progress} I should wait for the UI to update before the next action."
+
+     if t == "done":
+         return f"The goal '{goal}' has been achieved. The workflow is complete."
+
+     # Fallback
+     return f"{progress} Taking the next action to progress toward '{goal}'."
+
+
+ def _generate_thought_for_step(
+     step_index: int,
+     step: Step,
+     goal: str,
+     scenario: str = "login",
+     total_steps: int = 6,
+ ) -> str:
+     """Generate a simple but semantically meaningful Thought for a step.
+
+     This handles both login (6 steps) and registration (12 steps) workflows,
+     as well as generic real-world captures.
+     The goal text is included where helpful so the model can learn to connect
+     actions back to the stated objective.
+     """
+
+     action = step.action
+     t = action.type
+
+     if scenario == "registration":
+         return _generate_registration_thought(step_index, step, goal, total_steps)
+     elif scenario == "login" and total_steps <= 7:
+         # Only use login-specific thoughts for actual login scenarios (6-7 steps)
+         return _generate_login_thought(step_index, step, goal, total_steps)
+     else:
+         # Use generic thoughts for real captures and other scenarios
+         return _generate_generic_thought(step_index, step, goal, total_steps)
+
+
+ def _generate_login_thought(step_index: int, step: Step, goal: str, total_steps: int) -> str:
+     """Generate thought for login scenario (6 steps)."""
+     action = step.action
+     t = action.type
+
+     # Step 0: click username field
+     if step_index == 0 and t == "click":
+         return (
+             "I see a login screen with empty username and password fields and a Login button. "
+             f"To start logging in, I need to click on the username field to focus it ({goal})."
+         )
+
+     # Step 1: type username
+     if step_index == 1 and t == "type":
+         return (
+             "The username field is focused. To move toward the login goal, I should type the "
+             "username into this field."
+         )
+
+     # Step 2: click password field
+     if step_index == 2 and t == "click":
+         return (
+             "The username has been entered. Next, I need to focus the password field so that I can "
+             "enter the password for this login. I will click on the password input box."
+         )
+
+     # Step 3: type password
+     if step_index == 3 and t == "type":
+         return (
+             "The password field is focused. To continue the login process, I should type the "
+             "password (which will appear as masked characters on the screen)."
+         )
+
+     # Step 4: click Login button
+     if step_index == 4 and t == "click":
+         return (
+             "Both the username and password have been entered. To submit the form and attempt the "
+             "login, I should click the Login button."
+         )
+
+     # Step 5: DONE on logged-in screen
+     if step_index == 5 and t == "done":
+         return (
+             "I now see a logged-in confirmation screen indicating the goal has been satisfied. "
+             "The task is complete, so I should emit DONE()."
+         )
+
+     # Fallback for any unexpected cases
+     return (
+         "Based on the current screen and the login goal, I will take the next action that moves "
+         "the workflow forward."
+     )
+
+
+ def _generate_registration_thought(step_index: int, step: Step, goal: str, total_steps: int) -> str:
+     """Generate thought for registration scenario (12 steps)."""
+     action = step.action
+     t = action.type
+
+     # Registration step mapping (pairs of click + type for 5 fields, then submit + done)
+     thoughts = {
+         (0, "click"): (
+             "I see a registration form with empty fields for name, email, and password. "
+             f"To start registration, I need to click on the First Name field ({goal})."
+         ),
+         (1, "type"): (
+             "The First Name field is focused. I should type the first name."
+         ),
+         (2, "click"): (
+             "First name entered. Now I need to focus the Last Name field to enter it."
+         ),
+         (3, "type"): (
+             "The Last Name field is focused. I should type the last name."
+         ),
+         (4, "click"): (
+             "Last name entered. Now I need to focus the Email field to enter the email address."
+         ),
+         (5, "type"): (
+             "The Email field is focused. I should type the email address."
+         ),
+         (6, "click"): (
+             "Email entered. Now I need to focus the Password field to create a password."
+         ),
+         (7, "type"): (
+             "The Password field is focused. I should type the password."
+         ),
+         (8, "click"): (
+             "Password entered. Now I need to focus the Confirm Password field to verify the password."
+         ),
+         (9, "type"): (
+             "The Confirm Password field is focused. I should type the same password again."
+         ),
+         (10, "click"): (
+             "All form fields are filled. I should click the Register button to submit the form."
+         ),
+         (11, "done"): (
+             "Registration is complete - I see a success screen. The task is finished."
+         ),
+     }
+
+     key = (step_index, t)
+     if key in thoughts:
+         return thoughts[key]
+
+     # Fallback
+     return (
+         "Based on the current screen and the registration goal, I will take the next action "
+         "that moves the workflow forward."
+     )
+
+
+ def _detect_scenario(episode: Episode) -> str:
+     """Detect scenario from episode workflow_id."""
+     workflow_id = episode.workflow_id or ""
+     if "registration" in workflow_id.lower():
+         return "registration"
+     return "login"
+
+
+ def build_next_action_sft_samples(
+     episodes: List[Episode],
+     use_som: bool = False,
+ ) -> List[Dict[str, Any]]:
+     """Convert Episodes into goal-conditioned next-action SFT samples.
+
+     One sample per step (including terminal DONE), with structure:
+         {
+             "images": [image_path],
+             "messages": [
+                 {"role": "system", "content": SYSTEM_PROMPT},
+                 {"role": "user", "content": user_content},
+                 {"role": "assistant", "content": action_text},
+             ],
+         }
+
+     Args:
+         episodes: List of episodes to convert.
+         use_som: If True, use Set-of-Marks (SoM) DSL with element indices
+             instead of coordinate-based DSL.
+     """
+
+     samples: List[Dict[str, Any]] = []
+
+     for episode in episodes:
+         goal = episode.goal
+         total_steps = len(episode.steps)
+         scenario = _detect_scenario(episode)
+
+         # Select appropriate system prompt based on mode and scenario
+         if use_som:
+             if scenario == "registration":
+                 system_prompt = SYSTEM_PROMPT_SOM_REGISTRATION
+             else:
+                 system_prompt = SYSTEM_PROMPT_SOM
+         else:
+             system_prompt = SYSTEM_PROMPT
+
+         for step_index, step in enumerate(episode.steps):
+             image_path = step.observation.image_path
+             if not image_path:
+                 # Skip steps without an associated image
+                 continue
+
+             # Build action history from previous steps
+             action_history = []
+             for prev_idx in range(step_index):
+                 prev_step = episode.steps[prev_idx]
+                 prev_action_text = format_action(prev_step.action, use_som=use_som)
+                 action_history.append(prev_action_text)
+
+             # Build history section for both modes - use actual step count
+             if action_history:
+                 history_text = "ACTIONS COMPLETED SO FAR:\n"
+                 for i, action_text in enumerate(action_history, 1):
+                     history_text += f" {i}. {action_text}\n"
+                 history_text += f"\nThis is step {step_index + 1} of {total_steps}. "
+             else:
+                 history_text = f"This is step 1 of {total_steps} (no actions completed yet). "
+
+             if use_som:
+                 user_content = (
+                     f"Goal: {goal}\n\n"
+                     f"{history_text}"
+                     "Look at the screenshot and determine the NEXT action.\n\n"
+                     "Thought: [which numbered element to interact with and why]\n"
+                     "Action: [CLICK([N]) or TYPE([N], \"text\") or WAIT() or DONE()]"
+                 )
+             else:
+                 user_content = (
+                     f"Goal: {goal}\n\n"
+                     f"{history_text}"
+                     "Look at the screenshot and determine the NEXT action.\n\n"
+                     "Thought: [what element to interact with and why]\n"
+                     "Action: [CLICK(x=..., y=...) or TYPE(text=\"...\") or WAIT() or DONE()]"
+                 )
+
+             # Provide a deterministic, semantically meaningful Thought while supervising
+             # the exact DSL Action.
+             action_text = format_action(step.action, use_som=use_som)
+             thought_text = _generate_thought_for_step(step_index, step, goal, scenario, total_steps)
+             assistant_content = f"Thought: {thought_text}\nAction: {action_text}"
+
+             sample = {
+                 "images": [image_path],
+                 "messages": [
+                     {"role": "system", "content": system_prompt},
+                     {"role": "user", "content": user_content},
+                     {"role": "assistant", "content": assistant_content},
+                 ],
+             }
+             samples.append(sample)
+
+     return samples
+
+
+ @dataclass
+ class NextActionSample:
+     images: List[str]
+     messages: List[Dict[str, str]]
+
+
+ class NextActionDataset(Dataset):
+     """Thin PyTorch Dataset wrapper around pre-built SFT samples."""
+
+     def __init__(self, samples: List[Dict[str, Any]]):
+         self._samples = samples
+
+     def __len__(self) -> int:  # type: ignore[override]
+         return len(self._samples)
+
+     def __getitem__(self, idx: int) -> Dict[str, Any]:  # type: ignore[override]
+         return self._samples[idx]
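Note: to make the data contract above concrete, here is a small, hedged sketch that round-trips the SoM DSL through format_action/parse_action_som and builds SFT samples from a toy episode. The Episode/Step/Observation constructors are assumed to accept the attribute names this module reads (goal, workflow_id, steps, observation.image_path, action); the real schemas live in openadapt_ml/schemas/sessions.py, which is not reproduced in this diff.

from openadapt_ml.datasets.next_action import (
    NextActionDataset,
    build_next_action_sft_samples,
    format_action,
    parse_action_som,
)
from openadapt_ml.schemas.sessions import Action, Episode, Observation, Step

# Round-trip the SoM DSL for the supported subset (click/type/wait/done).
click = Action(type="click", element_index=1)
assert format_action(click, use_som=True) == "CLICK([1])"
assert parse_action_som('TYPE([2], "alice")').text == "alice"

# A two-step toy episode; constructor keyword arguments beyond the attributes
# used above are assumptions about the schema classes.
episode = Episode(
    goal="Log in as alice",
    workflow_id="synthetic_login",
    steps=[
        Step(observation=Observation(image_path="step_0.png"),
             action=Action(type="click", x=0.42, y=0.31, element_index=1)),
        Step(observation=Observation(image_path="step_1.png"),
             action=Action(type="type", text="alice", element_index=1)),
    ],
)

samples = build_next_action_sft_samples([episode], use_som=True)
dataset = NextActionDataset(samples)
# messages = [system, user, assistant]; the assistant turn is "Thought: ...\nAction: ..."
print(len(dataset), dataset[0]["messages"][2]["content"])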
openadapt_ml/evals/__init__.py ADDED
@@ -0,0 +1,23 @@
+ """Evaluation modules for openadapt-ml.
+
+ This package provides evaluation metrics and utilities for measuring
+ model performance on GUI automation tasks.
+
+ Modules:
+     - grounding: Grounding-specific metrics (IoU, hit rate, latency)
+     - trajectory_matching: Trajectory comparison metrics (existing)
+ """
+
+ from openadapt_ml.evals.grounding import (
+     GroundingMetrics,
+     GroundingResult,
+     evaluate_grounder,
+     evaluate_grounder_on_episode,
+ )
+
+ __all__ = [
+     "GroundingMetrics",
+     "GroundingResult",
+     "evaluate_grounder",
+     "evaluate_grounder_on_episode",
+ ]
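Note: the docstring above names IoU, hit rate, and latency as the grounding metrics. As a self-contained illustration of the IoU term only (this is not the package's implementation; openadapt_ml/evals/grounding.py is not reproduced in this diff), box IoU can be computed as follows.

from typing import Tuple

Box = Tuple[float, float, float, float]  # (x0, y0, x1, y1), normalized or pixel coords


def iou(a: Box, b: Box) -> float:
    """Intersection-over-union of two axis-aligned boxes; 0.0 when disjoint."""
    ix0, iy0 = max(a[0], b[0]), max(a[1], b[1])
    ix1, iy1 = min(a[2], b[2]), min(a[3], b[3])
    inter = max(0.0, ix1 - ix0) * max(0.0, iy1 - iy0)
    area_a = (a[2] - a[0]) * (a[3] - a[1])
    area_b = (b[2] - b[0]) * (b[3] - b[1])
    union = area_a + area_b - inter
    return inter / union if union > 0 else 0.0


# A "hit" is commonly defined as IoU above a threshold (e.g. 0.5), or as the
# predicted click point falling inside the ground-truth box.
print(iou((0.10, 0.10, 0.30, 0.20), (0.15, 0.10, 0.30, 0.22)))  # ≈ 0.65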