openadapt-ml 0.1.0__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112)
  1. openadapt_ml/baselines/__init__.py +121 -0
  2. openadapt_ml/baselines/adapter.py +185 -0
  3. openadapt_ml/baselines/cli.py +314 -0
  4. openadapt_ml/baselines/config.py +448 -0
  5. openadapt_ml/baselines/parser.py +922 -0
  6. openadapt_ml/baselines/prompts.py +787 -0
  7. openadapt_ml/benchmarks/__init__.py +13 -107
  8. openadapt_ml/benchmarks/agent.py +297 -374
  9. openadapt_ml/benchmarks/azure.py +62 -24
  10. openadapt_ml/benchmarks/azure_ops_tracker.py +521 -0
  11. openadapt_ml/benchmarks/cli.py +1874 -751
  12. openadapt_ml/benchmarks/trace_export.py +631 -0
  13. openadapt_ml/benchmarks/viewer.py +1236 -0
  14. openadapt_ml/benchmarks/vm_monitor.py +1111 -0
  15. openadapt_ml/benchmarks/waa_deploy/Dockerfile +216 -0
  16. openadapt_ml/benchmarks/waa_deploy/__init__.py +10 -0
  17. openadapt_ml/benchmarks/waa_deploy/api_agent.py +540 -0
  18. openadapt_ml/benchmarks/waa_deploy/start_waa_server.bat +53 -0
  19. openadapt_ml/cloud/azure_inference.py +3 -5
  20. openadapt_ml/cloud/lambda_labs.py +722 -307
  21. openadapt_ml/cloud/local.py +3194 -89
  22. openadapt_ml/cloud/ssh_tunnel.py +595 -0
  23. openadapt_ml/datasets/next_action.py +125 -96
  24. openadapt_ml/evals/grounding.py +32 -9
  25. openadapt_ml/evals/plot_eval_metrics.py +15 -13
  26. openadapt_ml/evals/trajectory_matching.py +120 -57
  27. openadapt_ml/experiments/demo_prompt/__init__.py +19 -0
  28. openadapt_ml/experiments/demo_prompt/format_demo.py +236 -0
  29. openadapt_ml/experiments/demo_prompt/results/experiment_20251231_002125.json +83 -0
  30. openadapt_ml/experiments/demo_prompt/results/experiment_n30_20251231_165958.json +1100 -0
  31. openadapt_ml/experiments/demo_prompt/results/multistep_20251231_025051.json +182 -0
  32. openadapt_ml/experiments/demo_prompt/run_experiment.py +541 -0
  33. openadapt_ml/experiments/representation_shootout/__init__.py +70 -0
  34. openadapt_ml/experiments/representation_shootout/conditions.py +708 -0
  35. openadapt_ml/experiments/representation_shootout/config.py +390 -0
  36. openadapt_ml/experiments/representation_shootout/evaluator.py +659 -0
  37. openadapt_ml/experiments/representation_shootout/runner.py +687 -0
  38. openadapt_ml/experiments/waa_demo/__init__.py +10 -0
  39. openadapt_ml/experiments/waa_demo/demos.py +357 -0
  40. openadapt_ml/experiments/waa_demo/runner.py +732 -0
  41. openadapt_ml/experiments/waa_demo/tasks.py +151 -0
  42. openadapt_ml/export/__init__.py +9 -0
  43. openadapt_ml/export/__main__.py +6 -0
  44. openadapt_ml/export/cli.py +89 -0
  45. openadapt_ml/export/parquet.py +277 -0
  46. openadapt_ml/grounding/detector.py +18 -14
  47. openadapt_ml/ingest/__init__.py +11 -10
  48. openadapt_ml/ingest/capture.py +97 -86
  49. openadapt_ml/ingest/loader.py +120 -69
  50. openadapt_ml/ingest/synthetic.py +344 -193
  51. openadapt_ml/models/api_adapter.py +14 -4
  52. openadapt_ml/models/base_adapter.py +10 -2
  53. openadapt_ml/models/providers/__init__.py +288 -0
  54. openadapt_ml/models/providers/anthropic.py +266 -0
  55. openadapt_ml/models/providers/base.py +299 -0
  56. openadapt_ml/models/providers/google.py +376 -0
  57. openadapt_ml/models/providers/openai.py +342 -0
  58. openadapt_ml/models/qwen_vl.py +46 -19
  59. openadapt_ml/perception/__init__.py +35 -0
  60. openadapt_ml/perception/integration.py +399 -0
  61. openadapt_ml/retrieval/README.md +226 -0
  62. openadapt_ml/retrieval/USAGE.md +391 -0
  63. openadapt_ml/retrieval/__init__.py +91 -0
  64. openadapt_ml/retrieval/demo_retriever.py +843 -0
  65. openadapt_ml/retrieval/embeddings.py +630 -0
  66. openadapt_ml/retrieval/index.py +194 -0
  67. openadapt_ml/retrieval/retriever.py +162 -0
  68. openadapt_ml/runtime/__init__.py +50 -0
  69. openadapt_ml/runtime/policy.py +27 -14
  70. openadapt_ml/runtime/safety_gate.py +471 -0
  71. openadapt_ml/schema/__init__.py +113 -0
  72. openadapt_ml/schema/converters.py +588 -0
  73. openadapt_ml/schema/episode.py +470 -0
  74. openadapt_ml/scripts/capture_screenshots.py +530 -0
  75. openadapt_ml/scripts/compare.py +102 -61
  76. openadapt_ml/scripts/demo_policy.py +4 -1
  77. openadapt_ml/scripts/eval_policy.py +19 -14
  78. openadapt_ml/scripts/make_gif.py +1 -1
  79. openadapt_ml/scripts/prepare_synthetic.py +16 -17
  80. openadapt_ml/scripts/train.py +98 -75
  81. openadapt_ml/segmentation/README.md +920 -0
  82. openadapt_ml/segmentation/__init__.py +97 -0
  83. openadapt_ml/segmentation/adapters/__init__.py +5 -0
  84. openadapt_ml/segmentation/adapters/capture_adapter.py +420 -0
  85. openadapt_ml/segmentation/annotator.py +610 -0
  86. openadapt_ml/segmentation/cache.py +290 -0
  87. openadapt_ml/segmentation/cli.py +674 -0
  88. openadapt_ml/segmentation/deduplicator.py +656 -0
  89. openadapt_ml/segmentation/frame_describer.py +788 -0
  90. openadapt_ml/segmentation/pipeline.py +340 -0
  91. openadapt_ml/segmentation/schemas.py +622 -0
  92. openadapt_ml/segmentation/segment_extractor.py +634 -0
  93. openadapt_ml/training/azure_ops_viewer.py +1097 -0
  94. openadapt_ml/training/benchmark_viewer.py +3255 -19
  95. openadapt_ml/training/shared_ui.py +7 -7
  96. openadapt_ml/training/stub_provider.py +57 -35
  97. openadapt_ml/training/trainer.py +255 -441
  98. openadapt_ml/training/trl_trainer.py +403 -0
  99. openadapt_ml/training/viewer.py +323 -108
  100. openadapt_ml/training/viewer_components.py +180 -0
  101. {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.1.dist-info}/METADATA +312 -69
  102. openadapt_ml-0.2.1.dist-info/RECORD +116 -0
  103. openadapt_ml/benchmarks/base.py +0 -366
  104. openadapt_ml/benchmarks/data_collection.py +0 -432
  105. openadapt_ml/benchmarks/runner.py +0 -381
  106. openadapt_ml/benchmarks/waa.py +0 -704
  107. openadapt_ml/schemas/__init__.py +0 -53
  108. openadapt_ml/schemas/sessions.py +0 -122
  109. openadapt_ml/schemas/validation.py +0 -252
  110. openadapt_ml-0.1.0.dist-info/RECORD +0 -55
  111. {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.1.dist-info}/WHEEL +0 -0
  112. {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.1.dist-info}/licenses/LICENSE +0 -0
openadapt_ml/evals/trajectory_matching.py
@@ -5,7 +5,7 @@ from dataclasses import dataclass, field
 from typing import Any, Callable, Dict, List, Optional
 
 from openadapt_ml.runtime.policy import AgentPolicy
-from openadapt_ml.schemas.sessions import Action, Episode
+from openadapt_ml.schema import Action, Episode, ActionType
 
 
 @dataclass
@@ -15,10 +15,15 @@ class MilestoneSpec:
     A milestone is achieved when, at a specific step, the predicted action
     matches certain criteria (type match + optional coord threshold).
     """
+
     name: str
     step_index: int  # Which step in the episode (0-indexed)
-    expected_type: str  # Expected ground truth action type ("click", "type", "done", etc.)
-    coord_threshold: Optional[float] = None  # If set, coord error must be < this for clicks
+    expected_type: (
+        str  # Expected ground truth action type ("click", "type", "done", etc.)
+    )
+    coord_threshold: Optional[float] = (
+        None  # If set, coord error must be < this for clicks
+    )
 
 
 # Predefined milestone specs per scenario
@@ -28,7 +33,9 @@ class MilestoneSpec:
 LOGIN_MILESTONES = [
     MilestoneSpec("typed_username", step_index=1, expected_type="type"),
     MilestoneSpec("typed_password", step_index=3, expected_type="type"),
-    MilestoneSpec("clicked_login", step_index=4, expected_type="click", coord_threshold=0.10),
+    MilestoneSpec(
+        "clicked_login", step_index=4, expected_type="click", coord_threshold=0.10
+    ),
     MilestoneSpec("emitted_done", step_index=5, expected_type="done"),
 ]
 
@@ -81,33 +88,60 @@ class AggregateMetrics:
     action_type_accuracy: float
     mean_coord_error: Optional[float]
     coord_error_count: int
-    episode_success_rate: Optional[float]  # Strict: all steps must match (renamed from success_pred)
+    episode_success_rate: Optional[
+        float
+    ]  # Strict: all steps must match (renamed from success_pred)
     click_hit_rate: Optional[float]  # Point-based: within 5% of center
-    mean_episode_progress: Optional[float]  # Partial credit: avg(step_matches/step_total)
+    mean_episode_progress: Optional[
+        float
+    ]  # Partial credit: avg(step_matches/step_total)
     # New partial-credit metrics
-    mean_episode_step_score: Optional[float]  # Strict partial: avg(full_step_correct/step_total)
+    mean_episode_step_score: Optional[
+        float
+    ]  # Strict partial: avg(full_step_correct/step_total)
     weak_episode_success_rate: Optional[float]  # Semantic milestones all achieved
     state_success_rate: Optional[float] = None  # From model's State: {"success": true}
-    bbox_hit_rate: Optional[float] = None  # Bbox-based: click anywhere in element bounds
+    bbox_hit_rate: Optional[float] = (
+        None  # Bbox-based: click anywhere in element bounds
+    )
     element_accuracy: Optional[float] = None  # SoM element index accuracy
 
 
+def _get_action_type_str(action: Action) -> str:
+    """Get action type as string, handling both enum and string types."""
+    return action.type.value if isinstance(action.type, ActionType) else action.type
+
+
+def _get_normalized_coords(action: Action) -> tuple[Optional[float], Optional[float]]:
+    """Extract normalized coordinates from action."""
+    if action.normalized_coordinates:
+        return action.normalized_coordinates
+    return None, None
+
+
+def _get_bbox(action: Action) -> Optional[tuple[float, float, float, float]]:
+    """Extract bounding box from action, checking element.bounds or raw."""
+    if action.element and action.element.bounds:
+        b = action.element.bounds
+        return (b.x, b.y, b.x + b.width, b.y + b.height)
+    elif action.raw and "bbox" in action.raw:
+        return action.raw["bbox"]
+    return None
+
+
 def compute_coordinate_error(pred_action: Action, gt_action: Action) -> Optional[float]:
     """Compute normalized L2 distance between predicted and ground-truth coords.
 
     Returns None if either action is missing coordinates.
     """
+    pred_x, pred_y = _get_normalized_coords(pred_action)
+    gt_x, gt_y = _get_normalized_coords(gt_action)
 
-    if (
-        pred_action.x is None
-        or pred_action.y is None
-        or gt_action.x is None
-        or gt_action.y is None
-    ):
+    if pred_x is None or pred_y is None or gt_x is None or gt_y is None:
         return None
 
-    dx = pred_action.x - gt_action.x
-    dy = pred_action.y - gt_action.y
+    dx = pred_x - gt_x
+    dy = pred_y - gt_y
     return math.sqrt(dx * dx + dy * dy)
 
 
@@ -119,14 +153,16 @@ def is_click_in_bbox(pred_action: Action, gt_action: Action) -> Optional[bool]:
     - False if prediction is outside bbox
     - None if no bbox is available (fall back to coord distance)
     """
-    if gt_action.bbox is None:
+    gt_bbox = _get_bbox(gt_action)
+    if gt_bbox is None:
         return None
 
-    if pred_action.x is None or pred_action.y is None:
+    pred_x, pred_y = _get_normalized_coords(pred_action)
+    if pred_x is None or pred_y is None:
         return False
 
-    x_min, y_min, x_max, y_max = gt_action.bbox
-    return (x_min <= pred_action.x <= x_max) and (y_min <= pred_action.y <= y_max)
+    x_min, y_min, x_max, y_max = gt_bbox
+    return (x_min <= pred_x <= x_max) and (y_min <= pred_y <= y_max)
 
 
 def evaluate_episode(
@@ -177,7 +213,7 @@ def evaluate_episode(
 
     for step_idx, step in enumerate(episode.steps):
        # Skip steps without an image; the dataset builder does the same.
-        if not step.observation.image_path:
+        if not step.observation.screenshot_path:
            continue
 
        if sample_idx >= len(samples):
@@ -186,16 +222,22 @@
        sample = samples[sample_idx]
        sample_idx += 1
 
-        pred_action, _thought, pred_state, raw_text = policy.predict_action_from_sample(sample)
+        pred_action, _thought, pred_state, raw_text = policy.predict_action_from_sample(
+            sample
+        )
        gt_action = step.action
 
+        # Get action types as strings for comparison
+        pred_type_str = _get_action_type_str(pred_action)
+        gt_type_str = _get_action_type_str(gt_action)
+
        # Track state-based success from final step
        if pred_state and isinstance(pred_state, dict):
            success_val = pred_state.get("success")
            if isinstance(success_val, bool):
                last_state_success = success_val
 
-        type_match = pred_action.type == gt_action.type
+        type_match = pred_type_str == gt_type_str
        if type_match:
            step_matches += 1
        else:
@@ -203,17 +245,30 @@
 
        coord_error: Optional[float] = None
        click_hit = False
-        bbox_hit = False
        element_hit = False
 
+        # Helper to get element index - check element.element_id or raw field
+        def _get_element_index(action: Action) -> Optional[int]:
+            if action.element and action.element.element_id:
+                try:
+                    return int(action.element.element_id)
+                except (ValueError, TypeError):
+                    pass
+            if action.raw and "element_index" in action.raw:
+                return action.raw["element_index"]
+            return None
+
+        gt_element_index = _get_element_index(gt_action)
+        pred_element_index = _get_element_index(pred_action)
+
        # SoM mode: evaluate by element index for click/drag/type actions
-        if use_som and gt_action.type in {"click", "drag", "type"}:
-            if gt_action.element_index is not None:
+        if use_som and gt_type_str in {"click", "drag", "type"}:
+            if gt_element_index is not None:
                element_total += 1
-                if pred_action.element_index == gt_action.element_index:
+                if pred_element_index == gt_element_index:
                    element_hits += 1
                    element_hit = True
-        elif gt_action.type in {"click", "drag"}:
+        elif gt_type_str in {"click", "drag"}:
            # Coordinate mode: evaluate by coordinate distance
            coord_error = compute_coordinate_error(pred_action, gt_action)
            if coord_error is not None:
@@ -229,15 +284,14 @@
                bbox_total += 1
                if in_bbox:
                    bbox_hits += 1
-                    bbox_hit = True
 
        # Full step correctness: type matches AND element/coord match for relevant actions
        if type_match:
-            if use_som and gt_action.type in {"click", "drag", "type"}:
+            if use_som and gt_type_str in {"click", "drag", "type"}:
                # SoM mode: require element index match
                if element_hit:
                    full_step_correct += 1
-            elif gt_action.type in {"click", "drag"}:
+            elif gt_type_str in {"click", "drag"}:
                # Coordinate mode: require click hit
                if click_hit:
                    full_step_correct += 1
@@ -247,20 +301,30 @@
 
        # Track semantic milestones using the milestone spec
        for milestone in milestones:
-            if step_idx == milestone.step_index and gt_action.type == milestone.expected_type:
-                if pred_action.type == milestone.expected_type:
+            if (
+                step_idx == milestone.step_index
+                and gt_type_str == milestone.expected_type
+            ):
+                if pred_type_str == milestone.expected_type:
                    # Check coord threshold if specified (for click actions)
                    if milestone.coord_threshold is not None:
-                        if coord_error is not None and coord_error < milestone.coord_threshold:
+                        if (
+                            coord_error is not None
+                            and coord_error < milestone.coord_threshold
+                        ):
                            milestones_achieved[milestone.name] = True
                    else:
                        # No coord threshold - type match is sufficient
                        milestones_achieved[milestone.name] = True
 
        # Ensure DONE is correct at the DONE step.
-        if gt_action.type == "done" and pred_action.type != "done":
+        if gt_type_str == "done" and pred_type_str != "done":
            success_pred = False
 
+        # Get normalized coordinates for logging
+        pred_x, pred_y = _get_normalized_coords(pred_action)
+        gt_x, gt_y = _get_normalized_coords(gt_action)
+
        # Optional logging of this step.
        if log_fn is not None and (log_limit is None or logged_count < log_limit):
            messages = sample.get("messages", [])
@@ -273,30 +337,30 @@
                    user_prompt = m.get("content")
 
            record: Dict[str, Any] = {
-                "episode_id": episode.id,
+                "episode_id": episode.episode_id,
                "step_index": step_idx,
-                "goal": episode.goal,
+                "goal": episode.instruction,
                "system_prompt": system_prompt,
                "user_prompt": user_prompt,
                "model_output_raw": raw_text,
                "pred_action": {
-                    "type": pred_action.type,
-                    "x": pred_action.x,
-                    "y": pred_action.y,
+                    "type": pred_type_str,
+                    "x": pred_x,
+                    "y": pred_y,
                    "text": pred_action.text,
-                    "element_index": pred_action.element_index,
+                    "element_index": pred_element_index,
                },
                "ground_truth_action": {
-                    "type": gt_action.type,
-                    "x": gt_action.x,
-                    "y": gt_action.y,
+                    "type": gt_type_str,
+                    "x": gt_x,
+                    "y": gt_y,
                    "text": gt_action.text,
-                    "element_index": gt_action.element_index,
+                    "element_index": gt_element_index,
                },
-                "correct_type": pred_action.type == gt_action.type,
+                "correct_type": pred_type_str == gt_type_str,
                "coord_error_norm": coord_error,
-                "element_match": pred_action.element_index == gt_action.element_index
-                if gt_action.element_index is not None
+                "element_match": pred_element_index == gt_element_index
+                if gt_element_index is not None
                else None,
            }
 
@@ -306,7 +370,7 @@
        step_total += 1
 
    metrics = EpisodeMetrics(
-        episode_id=episode.id,
+        episode_id=episode.episode_id,
        step_matches=step_matches,
        step_total=step_total,
        coord_errors=coord_errors,
@@ -380,18 +444,16 @@ def aggregate_metrics(episodes_metrics: List[EpisodeMetrics]) -> AggregateMetrics:
 
    # Partial credit: average episode progress (step_matches / step_total per episode)
    if eval_episodes:
-        episode_progress_scores = [
-            m.step_matches / m.step_total for m in eval_episodes
-        ]
-        mean_episode_progress = sum(episode_progress_scores) / len(episode_progress_scores)
+        episode_progress_scores = [m.step_matches / m.step_total for m in eval_episodes]
+        mean_episode_progress = sum(episode_progress_scores) / len(
+            episode_progress_scores
+        )
    else:
        mean_episode_progress = None
 
    # Strict partial: avg(full_step_correct / step_total) - requires type match + click hit
    if eval_episodes:
-        step_scores = [
-            m.full_step_correct / m.step_total for m in eval_episodes
-        ]
+        step_scores = [m.full_step_correct / m.step_total for m in eval_episodes]
        mean_episode_step_score = sum(step_scores) / len(step_scores)
    else:
        mean_episode_step_score = None
@@ -399,7 +461,8 @@ def aggregate_metrics(episodes_metrics: List[EpisodeMetrics]) -> AggregateMetrics:
    # Weak episode success: all milestones achieved
    if eval_episodes:
        weak_success_count = sum(
-            1 for m in eval_episodes
+            1
+            for m in eval_episodes
            if m.milestones_achieved and all(m.milestones_achieved.values())
        )
        weak_episode_success_rate = weak_success_count / len(eval_episodes)
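
The schema migration in this file (openadapt_ml.schemas.sessions to openadapt_ml.schema) changes Action.type from a plain string to an ActionType enum, which is why every comparison above is routed through _get_action_type_str. A minimal standalone sketch of the pattern; the ActionType stub below is illustrative only, not the package's actual definition:

from enum import Enum
from typing import Union

class ActionType(Enum):  # stand-in for openadapt_ml.schema.ActionType (assumed shape)
    CLICK = "click"
    TYPE = "type"
    DONE = "done"

def action_type_str(t: Union[ActionType, str]) -> str:
    # Old-schema episodes carry plain strings; new-schema ones carry enum members.
    return t.value if isinstance(t, ActionType) else t

assert action_type_str(ActionType.CLICK) == action_type_str("click") == "click"

This keeps the evaluation code agnostic to which schema produced a given Episode.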
openadapt_ml/experiments/demo_prompt/__init__.py
@@ -0,0 +1,19 @@
+"""Demo-conditioned prompt experiment.
+
+Tests whether including a human demonstration in the prompt
+improves VLM agent performance on similar tasks.
+"""
+
+from openadapt_ml.experiments.demo_prompt.format_demo import (
+    format_episode_as_demo,
+    format_action,
+)
+from openadapt_ml.experiments.demo_prompt.run_experiment import (
+    DemoPromptExperiment,
+)
+
+__all__ = [
+    "format_episode_as_demo",
+    "format_action",
+    "DemoPromptExperiment",
+]
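
Given this __init__, the experiment's public surface imports directly from the subpackage; a usage sketch assuming openadapt-ml 0.2.1 is installed:

from openadapt_ml.experiments.demo_prompt import (
    DemoPromptExperiment,
    format_action,
    format_episode_as_demo,
)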
openadapt_ml/experiments/demo_prompt/format_demo.py
@@ -0,0 +1,236 @@
+"""Demo formatting utilities for few-shot prompting."""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from openadapt_ml.schema import Action, Episode, Step
+
+
+def format_action(action: "Action") -> str:
+    """Format an Action as a string for the prompt.
+
+    Args:
+        action: Action to format.
+
+    Returns:
+        String representation like "CLICK(0.5, 0.3)" or "TYPE('hello')".
+    """
+    # Get action type value (handle both enum and string)
+    action_type = action.type.value if hasattr(action.type, "value") else action.type
+
+    if action_type == "click":
+        if action.normalized_coordinates is not None:
+            x, y = action.normalized_coordinates
+            return f"CLICK({x:.3f}, {y:.3f})"
+        return "CLICK()"
+
+    elif action_type == "double_click":
+        if action.normalized_coordinates is not None:
+            x, y = action.normalized_coordinates
+            return f"DOUBLE_CLICK({x:.3f}, {y:.3f})"
+        return "DOUBLE_CLICK()"
+
+    elif action_type == "type":
+        text = action.text or ""
+        # Escape quotes and truncate if very long
+        text = text.replace('"', '\\"')
+        if len(text) > 50:
+            text = text[:47] + "..."
+        return f'TYPE("{text}")'
+
+    elif action_type == "key":
+        key = action.key or "unknown"
+        if action.modifiers:
+            mods = "+".join(action.modifiers)
+            return f"KEY({mods}+{key})"
+        return f"KEY({key})"
+
+    elif action_type == "scroll":
+        direction = action.scroll_direction or "down"
+        return f"SCROLL({direction})"
+
+    elif action_type == "drag":
+        if (
+            action.normalized_coordinates is not None
+            and action.normalized_end is not None
+        ):
+            x, y = action.normalized_coordinates
+            end_x, end_y = action.normalized_end
+            return f"DRAG({x:.3f}, {y:.3f}, {end_x:.3f}, {end_y:.3f})"
+        return "DRAG()"
+
+    else:
+        return f"{action_type.upper()}()"
+
+
+def format_step(step: "Step", step_num: int) -> str:
+    """Format a single step for the demo.
+
+    Args:
+        step: Step to format.
+        step_num: Step number (1-indexed).
+
+    Returns:
+        Formatted step string.
+    """
+    lines = [f"Step {step_num}:"]
+
+    # Add window context if available
+    if step.observation and step.observation.window_title:
+        lines.append(f"  Window: {step.observation.window_title}")
+
+    # Add action
+    if step.action:
+        action_str = format_action(step.action)
+        lines.append(f"  Action: {action_str}")
+
+    return "\n".join(lines)
+
+
+def format_episode_as_demo(
+    episode: "Episode",
+    max_steps: int = 10,
+    include_screenshots: bool = False,
+) -> str:
+    """Convert an Episode to a few-shot demo format.
+
+    Args:
+        episode: Episode containing the demonstration.
+        max_steps: Maximum number of steps to include.
+        include_screenshots: Whether to include screenshot paths (for multi-image).
+
+    Returns:
+        Formatted demo string for prompt injection.
+    """
+    lines = [
+        "DEMONSTRATION:",
+        f"Task: {episode.instruction}",
+        "",
+    ]
+
+    for i, step in enumerate(episode.steps[:max_steps], 1):
+        lines.append(format_step(step, i))
+
+        # Optionally include screenshot reference
+        if (
+            include_screenshots
+            and step.observation
+            and step.observation.screenshot_path
+        ):
+            lines.append(f"  [Screenshot: {step.observation.screenshot_path}]")
+
+        lines.append("")
+
+    lines.append("---")
+    return "\n".join(lines)
+
+
+def format_episode_verbose(
+    episode: "Episode",
+    max_steps: int = 10,
+) -> str:
+    """Format episode with more context per step.
+
+    Includes:
+    - Screen summary
+    - User intent (inferred)
+    - Action taken
+    - Observed result
+
+    Args:
+        episode: Episode to format.
+        max_steps: Maximum steps to include.
+
+    Returns:
+        Verbose demo string.
+    """
+    lines = [
+        "DEMONSTRATION:",
+        f"Goal: {episode.instruction}",
+        "",
+        "The following shows the step-by-step procedure:",
+        "",
+    ]
+
+    for i, step in enumerate(episode.steps[:max_steps], 1):
+        lines.append(f"Step {i}:")
+
+        # Screen summary
+        if step.observation:
+            if step.observation.window_title:
+                lines.append(f"  [Screen: {step.observation.window_title}]")
+
+        # Action taken
+        if step.action:
+            action_str = format_action(step.action)
+            lines.append(f"  [Action: {action_str}]")
+
+        # Observed result (inferred from next step's observation)
+        if i < len(episode.steps):
+            next_step = episode.steps[i]
+            if next_step.observation and next_step.observation.window_title:
+                if (
+                    not step.observation
+                    or next_step.observation.window_title
+                    != step.observation.window_title
+                ):
+                    lines.append(
+                        f"  [Result: Window changed to {next_step.observation.window_title}]"
+                    )
+
+        lines.append("")
+
+    lines.append("---")
+    return "\n".join(lines)
+
+
+def get_demo_screenshot_paths(
+    episode: "Episode",
+    max_steps: int = 10,
+) -> list[str]:
+    """Get screenshot paths from episode for multi-image prompting.
+
+    Args:
+        episode: Episode to extract screenshots from.
+        max_steps: Maximum steps to include.
+
+    Returns:
+        List of screenshot paths.
+    """
+    paths = []
+    for step in episode.steps[:max_steps]:
+        if step.observation and step.observation.screenshot_path:
+            path = step.observation.screenshot_path
+            if Path(path).exists():
+                paths.append(path)
+    return paths
+
+
+def generate_length_matched_control(demo: str) -> str:
+    """Generate a control prompt with the same token count but no trajectory info.
+
+    Used to control for prompt length effects.
+
+    Args:
+        demo: The demo string to match length of.
+
+    Returns:
+        Control string of similar length with irrelevant content.
+    """
+    # Use generic placeholder text
+    placeholder = (
+        "This is placeholder text that serves as a control condition. "
+        "It contains no relevant information about the task or demonstration. "
+        "The purpose is to match the token count of the demonstration prompt. "
+    )
+
+    # Repeat to match approximate length
+    target_len = len(demo)
+    control = ""
+    while len(control) < target_len:
+        control += placeholder
+
+    return control[:target_len]
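
A short sketch of what format_action produces, using a duck-typed stand-in for the schema's Action class (the stub mimics only the attributes format_action reads; it is not the real openadapt_ml.schema.Action):

from dataclasses import dataclass
from typing import Optional

from openadapt_ml.experiments.demo_prompt.format_demo import format_action

@dataclass
class StubAction:  # hypothetical stand-in, not openadapt_ml.schema.Action
    type: str = "click"
    normalized_coordinates: Optional[tuple] = (0.512, 0.303)
    text: Optional[str] = None
    key: Optional[str] = None
    modifiers: Optional[list] = None
    scroll_direction: Optional[str] = None
    normalized_end: Optional[tuple] = None

print(format_action(StubAction()))                                        # CLICK(0.512, 0.303)
print(format_action(StubAction(type="type", text="hello")))               # TYPE("hello")
print(format_action(StubAction(type="key", key="q", modifiers=["cmd"])))  # KEY(cmd+q)

Note that generate_length_matched_control pads with neutral text to the same character count as the demo, so any with_demo vs. control gap in the results cannot be attributed to prompt length alone.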
openadapt_ml/experiments/demo_prompt/results/experiment_20251231_002125.json
@@ -0,0 +1,83 @@
+{
+  "timestamp": "20251231_002125",
+  "provider": "anthropic",
+  "screenshot": "/Users/abrichr/oa/src/openadapt-capture/turn-off-nightshift/screenshots/capture_31807990_step_0.png",
+  "demo": "DEMONSTRATION:\nGoal: Turn off Night Shift in macOS System Settings\n\nThe following shows the step-by-step procedure:\n\nStep 1:\n [Screen: Desktop with Terminal window visible]\n [Action: CLICK(0.01, 0.01) - Click Apple menu icon in top-left]\n [Result: Apple menu dropdown opened]\n\nStep 2:\n [Screen: Apple menu visible with options]\n [Action: CLICK on \"System Settings...\" menu item]\n [Result: System Settings application opened]\n\nStep 3:\n [Screen: System Settings window with sidebar]\n [Action: CLICK on \"Displays\" in the sidebar]\n [Result: Displays panel shown in main area]\n\nStep 4:\n [Screen: Displays panel showing display settings]\n [Action: CLICK on \"Night Shift...\" button]\n [Result: Night Shift popup/sheet appeared]\n\nStep 5:\n [Screen: Night Shift popup with Schedule dropdown]\n [Action: CLICK on Schedule dropdown, select \"Off\"]\n [Result: Night Shift schedule set to Off, Night Shift disabled]\n\n---",
+  "test_cases": [
+    {
+      "test_case": {
+        "name": "near_toggle",
+        "task": "Turn ON Night Shift in macOS System Settings",
+        "similarity": "near",
+        "notes": "Same procedure, just toggle to opposite state"
+      },
+      "results": {
+        "zero_shot": {
+          "action": "CLICK(20, 8)",
+          "response": "I need to turn ON Night Shift in macOS. Based on the terminal window showing an attempt to control Night Shift, I should access System Settings to properly enable it. Let me click on the Apple menu.\n\nACTION: CLICK(20, 8)",
+          "error": null
+        },
+        "with_demo": {
+          "action": "CLICK(20, 8)",
+          "response": "I need to turn ON Night Shift in macOS System Settings. Let me start by clicking on the Apple menu icon to access System Settings.\n\nACTION: CLICK(20, 8)",
+          "error": null
+        },
+        "control": {
+          "action": "CLICK(1243, 8)",
+          "response": "I need to analyze the screenshot to understand how to turn on Night Shift in macOS System Settings.\n\nThe screenshot shows a Terminal window with some command output. To access Night Shift settings in macOS, I need to open System Settings. I can see various system icons in the menu bar at the top of the screen.\n\nACTION: CLICK(1243, 8)",
+          "error": null
+        }
+      }
+    },
+    {
+      "test_case": {
+        "name": "medium_same_panel",
+        "task": "Adjust Night Shift color temperature to warmer setting",
+        "similarity": "medium",
+        "notes": "Same navigation path, different final action"
+      },
+      "results": {
+        "zero_shot": {
+          "action": "CLICK(1218, 8)",
+          "response": "The task requires adjusting the Night Shift color temperature to a warmer setting. Looking at the screenshot, I can see a macOS Terminal window. The system menu bar at the top right contains various control icons. I need to find the Night Shift or display settings controls.\n\nACTION: CLICK(1218, 8)",
+          "error": null
+        },
+        "with_demo": {
+          "action": "CLICK(19, 8)",
+          "response": "I need to adjust the Night Shift color temperature to make it warmer. First, I should access the System Settings to find the Night Shift controls.\n\nACTION: CLICK(19, 8)",
+          "error": null
+        },
+        "control": {
+          "action": "CLICK(1114, 8)",
+          "response": "I need to analyze the screenshot and the goal to determine the next action.\n\nThe screenshot shows a Terminal window on macOS. The goal is to \"Adjust Night Shift color temperature to warmer setting.\"\n\nLooking at the top menu bar, I can see various system icons. To adjust Night Shift settings, I would need to access System Preferences or use the Control Center. I can see what appears to be system control icons in the top-right corner of the screen.\n\nThe most direct way to adjust Night Shift would be to click on the Control Center icon or the display settings in the menu bar.\n\nACTION: CLICK(1114, 8)",
+          "error": null
+        }
+      }
+    },
+    {
+      "test_case": {
+        "name": "far_different_setting",
+        "task": "Turn on True Tone display in macOS System Settings",
+        "similarity": "far",
+        "notes": "Same app but different panel (True Tone is in Displays)"
+      },
+      "results": {
+        "zero_shot": {
+          "action": "CLICK(1217, 8)",
+          "response": "I need to access System Settings to turn on True Tone display. Let me analyze the screen to find how to access it.\n\nACTION: CLICK(1217, 8)",
+          "error": null
+        },
+        "with_demo": {
+          "action": "CLICK(20, 8)",
+          "response": "I need to turn on True Tone display in macOS System Settings. Let me start by clicking on the Apple menu in the top-left corner.\n\nACTION: CLICK(20, 8)",
+          "error": null
+        },
+        "control": {
+          "action": "CLICK(20, 8)",
+          "response": "I need to open System Settings to enable True Tone display. I'll click on the Apple menu in the top-left corner to access System Settings.\n\nACTION: CLICK(20, 8)",
+          "error": null
+        }
+      }
+    }
+  ]
+}