openadapt-ml 0.1.0__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff shows the content changes between two publicly released versions of this package, as published to a supported registry. It is provided for informational purposes only and reflects the versions as they appear in the public registry.
- openadapt_ml/baselines/__init__.py +121 -0
- openadapt_ml/baselines/adapter.py +185 -0
- openadapt_ml/baselines/cli.py +314 -0
- openadapt_ml/baselines/config.py +448 -0
- openadapt_ml/baselines/parser.py +922 -0
- openadapt_ml/baselines/prompts.py +787 -0
- openadapt_ml/benchmarks/__init__.py +13 -107
- openadapt_ml/benchmarks/agent.py +297 -374
- openadapt_ml/benchmarks/azure.py +62 -24
- openadapt_ml/benchmarks/azure_ops_tracker.py +521 -0
- openadapt_ml/benchmarks/cli.py +1874 -751
- openadapt_ml/benchmarks/trace_export.py +631 -0
- openadapt_ml/benchmarks/viewer.py +1236 -0
- openadapt_ml/benchmarks/vm_monitor.py +1111 -0
- openadapt_ml/benchmarks/waa_deploy/Dockerfile +216 -0
- openadapt_ml/benchmarks/waa_deploy/__init__.py +10 -0
- openadapt_ml/benchmarks/waa_deploy/api_agent.py +540 -0
- openadapt_ml/benchmarks/waa_deploy/start_waa_server.bat +53 -0
- openadapt_ml/cloud/azure_inference.py +3 -5
- openadapt_ml/cloud/lambda_labs.py +722 -307
- openadapt_ml/cloud/local.py +3194 -89
- openadapt_ml/cloud/ssh_tunnel.py +595 -0
- openadapt_ml/datasets/next_action.py +125 -96
- openadapt_ml/evals/grounding.py +32 -9
- openadapt_ml/evals/plot_eval_metrics.py +15 -13
- openadapt_ml/evals/trajectory_matching.py +120 -57
- openadapt_ml/experiments/demo_prompt/__init__.py +19 -0
- openadapt_ml/experiments/demo_prompt/format_demo.py +236 -0
- openadapt_ml/experiments/demo_prompt/results/experiment_20251231_002125.json +83 -0
- openadapt_ml/experiments/demo_prompt/results/experiment_n30_20251231_165958.json +1100 -0
- openadapt_ml/experiments/demo_prompt/results/multistep_20251231_025051.json +182 -0
- openadapt_ml/experiments/demo_prompt/run_experiment.py +541 -0
- openadapt_ml/experiments/representation_shootout/__init__.py +70 -0
- openadapt_ml/experiments/representation_shootout/conditions.py +708 -0
- openadapt_ml/experiments/representation_shootout/config.py +390 -0
- openadapt_ml/experiments/representation_shootout/evaluator.py +659 -0
- openadapt_ml/experiments/representation_shootout/runner.py +687 -0
- openadapt_ml/experiments/waa_demo/__init__.py +10 -0
- openadapt_ml/experiments/waa_demo/demos.py +357 -0
- openadapt_ml/experiments/waa_demo/runner.py +732 -0
- openadapt_ml/experiments/waa_demo/tasks.py +151 -0
- openadapt_ml/export/__init__.py +9 -0
- openadapt_ml/export/__main__.py +6 -0
- openadapt_ml/export/cli.py +89 -0
- openadapt_ml/export/parquet.py +277 -0
- openadapt_ml/grounding/detector.py +18 -14
- openadapt_ml/ingest/__init__.py +11 -10
- openadapt_ml/ingest/capture.py +97 -86
- openadapt_ml/ingest/loader.py +120 -69
- openadapt_ml/ingest/synthetic.py +344 -193
- openadapt_ml/models/api_adapter.py +14 -4
- openadapt_ml/models/base_adapter.py +10 -2
- openadapt_ml/models/providers/__init__.py +288 -0
- openadapt_ml/models/providers/anthropic.py +266 -0
- openadapt_ml/models/providers/base.py +299 -0
- openadapt_ml/models/providers/google.py +376 -0
- openadapt_ml/models/providers/openai.py +342 -0
- openadapt_ml/models/qwen_vl.py +46 -19
- openadapt_ml/perception/__init__.py +35 -0
- openadapt_ml/perception/integration.py +399 -0
- openadapt_ml/retrieval/README.md +226 -0
- openadapt_ml/retrieval/USAGE.md +391 -0
- openadapt_ml/retrieval/__init__.py +91 -0
- openadapt_ml/retrieval/demo_retriever.py +843 -0
- openadapt_ml/retrieval/embeddings.py +630 -0
- openadapt_ml/retrieval/index.py +194 -0
- openadapt_ml/retrieval/retriever.py +162 -0
- openadapt_ml/runtime/__init__.py +50 -0
- openadapt_ml/runtime/policy.py +27 -14
- openadapt_ml/runtime/safety_gate.py +471 -0
- openadapt_ml/schema/__init__.py +113 -0
- openadapt_ml/schema/converters.py +588 -0
- openadapt_ml/schema/episode.py +470 -0
- openadapt_ml/scripts/capture_screenshots.py +530 -0
- openadapt_ml/scripts/compare.py +102 -61
- openadapt_ml/scripts/demo_policy.py +4 -1
- openadapt_ml/scripts/eval_policy.py +19 -14
- openadapt_ml/scripts/make_gif.py +1 -1
- openadapt_ml/scripts/prepare_synthetic.py +16 -17
- openadapt_ml/scripts/train.py +98 -75
- openadapt_ml/segmentation/README.md +920 -0
- openadapt_ml/segmentation/__init__.py +97 -0
- openadapt_ml/segmentation/adapters/__init__.py +5 -0
- openadapt_ml/segmentation/adapters/capture_adapter.py +420 -0
- openadapt_ml/segmentation/annotator.py +610 -0
- openadapt_ml/segmentation/cache.py +290 -0
- openadapt_ml/segmentation/cli.py +674 -0
- openadapt_ml/segmentation/deduplicator.py +656 -0
- openadapt_ml/segmentation/frame_describer.py +788 -0
- openadapt_ml/segmentation/pipeline.py +340 -0
- openadapt_ml/segmentation/schemas.py +622 -0
- openadapt_ml/segmentation/segment_extractor.py +634 -0
- openadapt_ml/training/azure_ops_viewer.py +1097 -0
- openadapt_ml/training/benchmark_viewer.py +3255 -19
- openadapt_ml/training/shared_ui.py +7 -7
- openadapt_ml/training/stub_provider.py +57 -35
- openadapt_ml/training/trainer.py +255 -441
- openadapt_ml/training/trl_trainer.py +403 -0
- openadapt_ml/training/viewer.py +323 -108
- openadapt_ml/training/viewer_components.py +180 -0
- {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.1.dist-info}/METADATA +312 -69
- openadapt_ml-0.2.1.dist-info/RECORD +116 -0
- openadapt_ml/benchmarks/base.py +0 -366
- openadapt_ml/benchmarks/data_collection.py +0 -432
- openadapt_ml/benchmarks/runner.py +0 -381
- openadapt_ml/benchmarks/waa.py +0 -704
- openadapt_ml/schemas/__init__.py +0 -53
- openadapt_ml/schemas/sessions.py +0 -122
- openadapt_ml/schemas/validation.py +0 -252
- openadapt_ml-0.1.0.dist-info/RECORD +0 -55
- {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.1.dist-info}/WHEEL +0 -0
- {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.1.dist-info}/licenses/LICENSE +0 -0
openadapt_ml/evals/trajectory_matching.py

@@ -5,7 +5,7 @@ from dataclasses import dataclass, field
 from typing import Any, Callable, Dict, List, Optional

 from openadapt_ml.runtime.policy import AgentPolicy
-from openadapt_ml.
+from openadapt_ml.schema import Action, Episode, ActionType


 @dataclass
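For downstream code, the user-visible change in this hunk is the schema migration: the removed openadapt_ml.schemas package (see the deleted schemas/ files in the list above) is replaced by the new openadapt_ml.schema module. The old import line is truncated in this diff, so its exact names are unknown; a hedged sketch of the consumer-side update:

# Before (0.1.0), assuming the symbols lived in the removed openadapt_ml.schemas
# package (exact old names are truncated in this diff):
# from openadapt_ml.schemas import ...        # removed in 0.2.1
# After (0.2.1), per this hunk:
from openadapt_ml.schema import Action, Episode, ActionType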
@@ -15,10 +15,15 @@ class MilestoneSpec:
     A milestone is achieved when, at a specific step, the predicted action
     matches certain criteria (type match + optional coord threshold).
     """
+
     name: str
     step_index: int  # Which step in the episode (0-indexed)
-    expected_type: str  # Expected ground truth action type ("click", "type", "done", etc.)
-    coord_threshold: Optional[float] = None  # If set, coord error must be < this for clicks
+    expected_type: (
+        str  # Expected ground truth action type ("click", "type", "done", etc.)
+    )
+    coord_threshold: Optional[float] = (
+        None  # If set, coord error must be < this for clicks
+    )


 # Predefined milestone specs per scenario
@@ -28,7 +33,9 @@ class MilestoneSpec:
 LOGIN_MILESTONES = [
     MilestoneSpec("typed_username", step_index=1, expected_type="type"),
     MilestoneSpec("typed_password", step_index=3, expected_type="type"),
-    MilestoneSpec("clicked_login", step_index=4, expected_type="click", coord_threshold=0.10),
+    MilestoneSpec(
+        "clicked_login", step_index=4, expected_type="click", coord_threshold=0.10
+    ),
     MilestoneSpec("emitted_done", step_index=5, expected_type="done"),
 ]

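A minimal sketch of how one of these specs is satisfied, mirroring the per-step milestone check in the @@ -247,20 +301,30 @@ hunk of this file further down (the Spec class and achieved function here are illustrative stand-ins, not the package's API):

from dataclasses import dataclass
from typing import Optional

@dataclass
class Spec:
    name: str
    step_index: int
    expected_type: str
    coord_threshold: Optional[float] = None

def achieved(spec: Spec, step_idx: int, pred_type: str, gt_type: str,
             coord_error: Optional[float]) -> bool:
    # Milestone only applies at its step, and only if the ground truth agrees.
    if step_idx != spec.step_index or gt_type != spec.expected_type:
        return False
    if pred_type != spec.expected_type:
        return False
    if spec.coord_threshold is None:
        return True  # type match alone is sufficient
    return coord_error is not None and coord_error < spec.coord_threshold

login_click = Spec("clicked_login", step_index=4, expected_type="click", coord_threshold=0.10)
print(achieved(login_click, 4, "click", "click", coord_error=0.06))  # True: within 10%
print(achieved(login_click, 4, "click", "click", coord_error=0.25))  # False: too far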
@@ -81,33 +88,60 @@ class AggregateMetrics:
     action_type_accuracy: float
     mean_coord_error: Optional[float]
     coord_error_count: int
-    episode_success_rate: Optional[float]  # Strict: all steps must match (renamed from success_pred)
+    episode_success_rate: Optional[
+        float
+    ]  # Strict: all steps must match (renamed from success_pred)
     click_hit_rate: Optional[float]  # Point-based: within 5% of center
-    mean_episode_progress: Optional[float]  # Partial credit: avg(step_matches/step_total)
+    mean_episode_progress: Optional[
+        float
+    ]  # Partial credit: avg(step_matches/step_total)
     # New partial-credit metrics
-    mean_episode_step_score: Optional[float]  # Strict partial: avg(full_step_correct/step_total)
+    mean_episode_step_score: Optional[
+        float
+    ]  # Strict partial: avg(full_step_correct/step_total)
     weak_episode_success_rate: Optional[float]  # Semantic milestones all achieved
     state_success_rate: Optional[float] = None  # From model's State: {"success": true}
-    bbox_hit_rate: Optional[float] = None  # Bbox-based: click anywhere in element bounds
+    bbox_hit_rate: Optional[float] = (
+        None  # Bbox-based: click anywhere in element bounds
+    )
     element_accuracy: Optional[float] = None  # SoM element index accuracy


+def _get_action_type_str(action: Action) -> str:
+    """Get action type as string, handling both enum and string types."""
+    return action.type.value if isinstance(action.type, ActionType) else action.type
+
+
+def _get_normalized_coords(action: Action) -> tuple[Optional[float], Optional[float]]:
+    """Extract normalized coordinates from action."""
+    if action.normalized_coordinates:
+        return action.normalized_coordinates
+    return None, None
+
+
+def _get_bbox(action: Action) -> Optional[tuple[float, float, float, float]]:
+    """Extract bounding box from action, checking element.bounds or raw."""
+    if action.element and action.element.bounds:
+        b = action.element.bounds
+        return (b.x, b.y, b.x + b.width, b.y + b.height)
+    elif action.raw and "bbox" in action.raw:
+        return action.raw["bbox"]
+    return None
+
+
 def compute_coordinate_error(pred_action: Action, gt_action: Action) -> Optional[float]:
     """Compute normalized L2 distance between predicted and ground-truth coords.

     Returns None if either action is missing coordinates.
     """
+    pred_x, pred_y = _get_normalized_coords(pred_action)
+    gt_x, gt_y = _get_normalized_coords(gt_action)

-    if (
-        pred_action.x is None
-        or pred_action.y is None
-        or gt_action.x is None
-        or gt_action.y is None
-    ):
+    if pred_x is None or pred_y is None or gt_x is None or gt_y is None:
         return None

-    dx = pred_action.x - gt_action.x
-    dy = pred_action.y - gt_action.y
+    dx = pred_x - gt_x
+    dy = pred_y - gt_y
     return math.sqrt(dx * dx + dy * dy)

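The new _get_* helpers centralize schema access that was previously inlined as direct attribute reads (pred_action.x and friends). A self-contained check of the distance metric's behavior under the new accessors; StubAction is a hypothetical stand-in exposing only the field the helper reads:

import math
from dataclasses import dataclass
from typing import Optional, Tuple


@dataclass
class StubAction:
    # Hypothetical stand-in: only the field _get_normalized_coords reads.
    normalized_coordinates: Optional[Tuple[float, float]] = None


def get_normalized_coords(action: StubAction):
    # Mirrors _get_normalized_coords from this hunk.
    if action.normalized_coordinates:
        return action.normalized_coordinates
    return None, None


def coordinate_error(pred: StubAction, gt: StubAction) -> Optional[float]:
    # Mirrors compute_coordinate_error: normalized L2 distance, or None.
    pred_x, pred_y = get_normalized_coords(pred)
    gt_x, gt_y = get_normalized_coords(gt)
    if pred_x is None or pred_y is None or gt_x is None or gt_y is None:
        return None
    dx = pred_x - gt_x
    dy = pred_y - gt_y
    return math.sqrt(dx * dx + dy * dy)


print(coordinate_error(StubAction((0.52, 0.31)), StubAction((0.50, 0.30))))  # ~0.0224
print(coordinate_error(StubAction((0.52, 0.31)), StubAction()))              # None: missing coords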
@@ -119,14 +153,16 @@ def is_click_in_bbox(pred_action: Action, gt_action: Action) -> Optional[bool]:
     - False if prediction is outside bbox
     - None if no bbox is available (fall back to coord distance)
     """
-
+    gt_bbox = _get_bbox(gt_action)
+    if gt_bbox is None:
         return None

-
+    pred_x, pred_y = _get_normalized_coords(pred_action)
+    if pred_x is None or pred_y is None:
         return False

-    x_min, y_min, x_max, y_max =
-    return (x_min <=
+    x_min, y_min, x_max, y_max = gt_bbox
+    return (x_min <= pred_x <= x_max) and (y_min <= pred_y <= y_max)


 def evaluate_episode(
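The tri-state return is the contract callers rely on: None means "no ground-truth bbox available, fall back to coordinate distance". A runnable sketch of that contract (function and names here are illustrative, not the module's API; the 0.05 fallback threshold echoes the "within 5% of center" comment on click_hit_rate above):

from typing import Optional

def click_in_bbox_sketch(pred_xy, gt_bbox) -> Optional[bool]:
    if gt_bbox is None:
        return None                      # no bbox: caller falls back to coord error
    if pred_xy is None:
        return False
    x, y = pred_xy
    x_min, y_min, x_max, y_max = gt_bbox
    return (x_min <= x <= x_max) and (y_min <= y <= y_max)

print(click_in_bbox_sketch((0.5, 0.5), (0.4, 0.4, 0.6, 0.6)))  # True
print(click_in_bbox_sketch((0.9, 0.9), (0.4, 0.4, 0.6, 0.6)))  # False
print(click_in_bbox_sketch((0.5, 0.5), None))                  # None -> score by distance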
@@ -177,7 +213,7 @@ def evaluate_episode(

     for step_idx, step in enumerate(episode.steps):
         # Skip steps without an image; the dataset builder does the same.
-        if not step.observation.
+        if not step.observation.screenshot_path:
             continue

         if sample_idx >= len(samples):
@@ -186,16 +222,22 @@
         sample = samples[sample_idx]
         sample_idx += 1

-        pred_action, _thought, pred_state, raw_text = policy.predict_action_from_sample(sample)
+        pred_action, _thought, pred_state, raw_text = policy.predict_action_from_sample(
+            sample
+        )
         gt_action = step.action

+        # Get action types as strings for comparison
+        pred_type_str = _get_action_type_str(pred_action)
+        gt_type_str = _get_action_type_str(gt_action)
+
         # Track state-based success from final step
         if pred_state and isinstance(pred_state, dict):
             success_val = pred_state.get("success")
             if isinstance(success_val, bool):
                 last_state_success = success_val

-        type_match =
+        type_match = pred_type_str == gt_type_str
         if type_match:
             step_matches += 1
         else:
@@ -203,17 +245,30 @@

         coord_error: Optional[float] = None
         click_hit = False
-        bbox_hit = False
         element_hit = False

+        # Helper to get element index - check element.element_id or raw field
+        def _get_element_index(action: Action) -> Optional[int]:
+            if action.element and action.element.element_id:
+                try:
+                    return int(action.element.element_id)
+                except (ValueError, TypeError):
+                    pass
+            if action.raw and "element_index" in action.raw:
+                return action.raw["element_index"]
+            return None
+
+        gt_element_index = _get_element_index(gt_action)
+        pred_element_index = _get_element_index(pred_action)
+
         # SoM mode: evaluate by element index for click/drag/type actions
-        if use_som and
-            if
+        if use_som and gt_type_str in {"click", "drag", "type"}:
+            if gt_element_index is not None:
                 element_total += 1
-                if
+                if pred_element_index == gt_element_index:
                     element_hits += 1
                     element_hit = True
-        elif
+        elif gt_type_str in {"click", "drag"}:
             # Coordinate mode: evaluate by coordinate distance
             coord_error = compute_coordinate_error(pred_action, gt_action)
             if coord_error is not None:
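In SoM (Set-of-Marks) mode the evaluator compares element indices rather than pixels. The nested helper prefers a numeric element.element_id and falls back to raw["element_index"]. A runnable sketch of that resolution order, using duck-typed stand-ins for the schema objects:

from types import SimpleNamespace

def get_element_index(action):
    # Mirrors the nested helper above: element.element_id first,
    # then the raw-dict fallback, else None.
    if action.element and action.element.element_id:
        try:
            return int(action.element.element_id)
        except (ValueError, TypeError):
            pass
    if action.raw and "element_index" in action.raw:
        return action.raw["element_index"]
    return None

a = SimpleNamespace(element=SimpleNamespace(element_id="7"), raw=None)
b = SimpleNamespace(element=None, raw={"element_index": 3})
c = SimpleNamespace(element=SimpleNamespace(element_id="btn-ok"), raw={"element_index": 3})
print(get_element_index(a), get_element_index(b), get_element_index(c))  # 7 3 3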
@@ -229,15 +284,14 @@
                 bbox_total += 1
                 if in_bbox:
                     bbox_hits += 1
-                    bbox_hit = True

         # Full step correctness: type matches AND element/coord match for relevant actions
         if type_match:
-            if use_som and
+            if use_som and gt_type_str in {"click", "drag", "type"}:
                 # SoM mode: require element index match
                 if element_hit:
                     full_step_correct += 1
-            elif
+            elif gt_type_str in {"click", "drag"}:
                 # Coordinate mode: require click hit
                 if click_hit:
                     full_step_correct += 1
@@ -247,20 +301,30 @@

         # Track semantic milestones using the milestone spec
         for milestone in milestones:
-            if
-
+            if (
+                step_idx == milestone.step_index
+                and gt_type_str == milestone.expected_type
+            ):
+                if pred_type_str == milestone.expected_type:
                     # Check coord threshold if specified (for click actions)
                     if milestone.coord_threshold is not None:
-                        if
+                        if (
+                            coord_error is not None
+                            and coord_error < milestone.coord_threshold
+                        ):
                             milestones_achieved[milestone.name] = True
                     else:
                         # No coord threshold - type match is sufficient
                         milestones_achieved[milestone.name] = True

         # Ensure DONE is correct at the DONE step.
-        if
+        if gt_type_str == "done" and pred_type_str != "done":
             success_pred = False

+        # Get normalized coordinates for logging
+        pred_x, pred_y = _get_normalized_coords(pred_action)
+        gt_x, gt_y = _get_normalized_coords(gt_action)
+
         # Optional logging of this step.
         if log_fn is not None and (log_limit is None or logged_count < log_limit):
             messages = sample.get("messages", [])
@@ -273,30 +337,30 @@
                     user_prompt = m.get("content")

             record: Dict[str, Any] = {
-                "episode_id": episode.
+                "episode_id": episode.episode_id,
                 "step_index": step_idx,
-                "goal": episode.
+                "goal": episode.instruction,
                 "system_prompt": system_prompt,
                 "user_prompt": user_prompt,
                 "model_output_raw": raw_text,
                 "pred_action": {
-                    "type":
-                    "x":
-                    "y":
+                    "type": pred_type_str,
+                    "x": pred_x,
+                    "y": pred_y,
                     "text": pred_action.text,
-                    "element_index":
+                    "element_index": pred_element_index,
                 },
                 "ground_truth_action": {
-                    "type":
-                    "x":
-                    "y":
+                    "type": gt_type_str,
+                    "x": gt_x,
+                    "y": gt_y,
                     "text": gt_action.text,
-                    "element_index":
+                    "element_index": gt_element_index,
                 },
-                "correct_type":
+                "correct_type": pred_type_str == gt_type_str,
                 "coord_error_norm": coord_error,
-                "element_match":
-                if
+                "element_match": pred_element_index == gt_element_index
+                if gt_element_index is not None
                 else None,
             }

@@ -306,7 +370,7 @@
             step_total += 1

     metrics = EpisodeMetrics(
-        episode_id=episode.
+        episode_id=episode.episode_id,
         step_matches=step_matches,
         step_total=step_total,
         coord_errors=coord_errors,
@@ -380,18 +444,16 @@ def aggregate_metrics(episodes_metrics: List[EpisodeMetrics]) -> AggregateMetric

     # Partial credit: average episode progress (step_matches / step_total per episode)
     if eval_episodes:
-        episode_progress_scores = [
-            m.step_matches / m.step_total for m in eval_episodes
-        ]
-        mean_episode_progress = sum(episode_progress_scores) / len(episode_progress_scores)
+        episode_progress_scores = [m.step_matches / m.step_total for m in eval_episodes]
+        mean_episode_progress = sum(episode_progress_scores) / len(
+            episode_progress_scores
+        )
     else:
         mean_episode_progress = None

     # Strict partial: avg(full_step_correct / step_total) - requires type match + click hit
     if eval_episodes:
-        step_scores = [
-            m.full_step_correct / m.step_total for m in eval_episodes
-        ]
+        step_scores = [m.full_step_correct / m.step_total for m in eval_episodes]
         mean_episode_step_score = sum(step_scores) / len(step_scores)
     else:
         mean_episode_step_score = None
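The two aggregate scores differ only in the numerator: mean_episode_progress counts type-only matches, while mean_episode_step_score counts fully correct steps (type plus coord/element hit). A toy computation with invented numbers, purely for illustration:

# Toy EpisodeMetrics-like rows: (step_matches, full_step_correct, step_total)
episodes = [(4, 3, 5), (5, 5, 5), (1, 0, 4)]

mean_episode_progress = sum(m / t for m, _, t in episodes) / len(episodes)
mean_episode_step_score = sum(f / t for _, f, t in episodes) / len(episodes)

print(round(mean_episode_progress, 3))    # 0.683: type matches only
print(round(mean_episode_step_score, 3))  # 0.533: type + coord/element must match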
@@ -399,7 +461,8 @@ def aggregate_metrics(episodes_metrics: List[EpisodeMetrics]) -> AggregateMetric
     # Weak episode success: all milestones achieved
     if eval_episodes:
         weak_success_count = sum(
-            1 for m in eval_episodes
+            1
+            for m in eval_episodes
             if m.milestones_achieved and all(m.milestones_achieved.values())
         )
         weak_episode_success_rate = weak_success_count / len(eval_episodes)
openadapt_ml/experiments/demo_prompt/__init__.py (new file)

@@ -0,0 +1,19 @@
+"""Demo-conditioned prompt experiment.
+
+Tests whether including a human demonstration in the prompt
+improves VLM agent performance on similar tasks.
+"""
+
+from openadapt_ml.experiments.demo_prompt.format_demo import (
+    format_episode_as_demo,
+    format_action,
+)
+from openadapt_ml.experiments.demo_prompt.run_experiment import (
+    DemoPromptExperiment,
+)
+
+__all__ = [
+    "format_episode_as_demo",
+    "format_action",
+    "DemoPromptExperiment",
+]
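Typical usage of the re-exported API; the loader that produces the episode value is not part of this diff, so the call shape here is illustrative:

# Illustrative only: how the re-exports are meant to be combined.
from openadapt_ml.experiments.demo_prompt import format_episode_as_demo

def build_prompt(episode, task: str) -> str:
    # Prepend a formatted human demonstration to the task instruction.
    demo_text = format_episode_as_demo(episode, max_steps=10, include_screenshots=False)
    return f"{demo_text}\n\nNow complete this task: {task}"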
openadapt_ml/experiments/demo_prompt/format_demo.py (new file)

@@ -0,0 +1,236 @@
+"""Demo formatting utilities for few-shot prompting."""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from openadapt_ml.schema import Action, Episode, Step
+
+
+def format_action(action: "Action") -> str:
+    """Format an Action as a string for the prompt.
+
+    Args:
+        action: Action to format.
+
+    Returns:
+        String representation like "CLICK(0.5, 0.3)" or "TYPE('hello')".
+    """
+    # Get action type value (handle both enum and string)
+    action_type = action.type.value if hasattr(action.type, "value") else action.type
+
+    if action_type == "click":
+        if action.normalized_coordinates is not None:
+            x, y = action.normalized_coordinates
+            return f"CLICK({x:.3f}, {y:.3f})"
+        return "CLICK()"
+
+    elif action_type == "double_click":
+        if action.normalized_coordinates is not None:
+            x, y = action.normalized_coordinates
+            return f"DOUBLE_CLICK({x:.3f}, {y:.3f})"
+        return "DOUBLE_CLICK()"
+
+    elif action_type == "type":
+        text = action.text or ""
+        # Escape quotes and truncate if very long
+        text = text.replace('"', '\\"')
+        if len(text) > 50:
+            text = text[:47] + "..."
+        return f'TYPE("{text}")'
+
+    elif action_type == "key":
+        key = action.key or "unknown"
+        if action.modifiers:
+            mods = "+".join(action.modifiers)
+            return f"KEY({mods}+{key})"
+        return f"KEY({key})"
+
+    elif action_type == "scroll":
+        direction = action.scroll_direction or "down"
+        return f"SCROLL({direction})"
+
+    elif action_type == "drag":
+        if (
+            action.normalized_coordinates is not None
+            and action.normalized_end is not None
+        ):
+            x, y = action.normalized_coordinates
+            end_x, end_y = action.normalized_end
+            return f"DRAG({x:.3f}, {y:.3f}, {end_x:.3f}, {end_y:.3f})"
+        return "DRAG()"
+
+    else:
+        return f"{action_type.upper()}()"
+
+
+def format_step(step: "Step", step_num: int) -> str:
+    """Format a single step for the demo.
+
+    Args:
+        step: Step to format.
+        step_num: Step number (1-indexed).
+
+    Returns:
+        Formatted step string.
+    """
+    lines = [f"Step {step_num}:"]
+
+    # Add window context if available
+    if step.observation and step.observation.window_title:
+        lines.append(f" Window: {step.observation.window_title}")
+
+    # Add action
+    if step.action:
+        action_str = format_action(step.action)
+        lines.append(f" Action: {action_str}")
+
+    return "\n".join(lines)
+
+
+def format_episode_as_demo(
+    episode: "Episode",
+    max_steps: int = 10,
+    include_screenshots: bool = False,
+) -> str:
+    """Convert an Episode to a few-shot demo format.
+
+    Args:
+        episode: Episode containing the demonstration.
+        max_steps: Maximum number of steps to include.
+        include_screenshots: Whether to include screenshot paths (for multi-image).
+
+    Returns:
+        Formatted demo string for prompt injection.
+    """
+    lines = [
+        "DEMONSTRATION:",
+        f"Task: {episode.instruction}",
+        "",
+    ]
+
+    for i, step in enumerate(episode.steps[:max_steps], 1):
+        lines.append(format_step(step, i))
+
+        # Optionally include screenshot reference
+        if (
+            include_screenshots
+            and step.observation
+            and step.observation.screenshot_path
+        ):
+            lines.append(f" [Screenshot: {step.observation.screenshot_path}]")
+
+        lines.append("")
+
+    lines.append("---")
+    return "\n".join(lines)
+
+
+def format_episode_verbose(
+    episode: "Episode",
+    max_steps: int = 10,
+) -> str:
+    """Format episode with more context per step.
+
+    Includes:
+    - Screen summary
+    - User intent (inferred)
+    - Action taken
+    - Observed result
+
+    Args:
+        episode: Episode to format.
+        max_steps: Maximum steps to include.
+
+    Returns:
+        Verbose demo string.
+    """
+    lines = [
+        "DEMONSTRATION:",
+        f"Goal: {episode.instruction}",
+        "",
+        "The following shows the step-by-step procedure:",
+        "",
+    ]
+
+    for i, step in enumerate(episode.steps[:max_steps], 1):
+        lines.append(f"Step {i}:")
+
+        # Screen summary
+        if step.observation:
+            if step.observation.window_title:
+                lines.append(f" [Screen: {step.observation.window_title}]")
+
+        # Action taken
+        if step.action:
+            action_str = format_action(step.action)
+            lines.append(f" [Action: {action_str}]")
+
+        # Observed result (inferred from next step's observation)
+        if i < len(episode.steps):
+            next_step = episode.steps[i]
+            if next_step.observation and next_step.observation.window_title:
+                if (
+                    not step.observation
+                    or next_step.observation.window_title
+                    != step.observation.window_title
+                ):
+                    lines.append(
+                        f" [Result: Window changed to {next_step.observation.window_title}]"
+                    )
+
+        lines.append("")
+
+    lines.append("---")
+    return "\n".join(lines)
+
+
+def get_demo_screenshot_paths(
+    episode: "Episode",
+    max_steps: int = 10,
+) -> list[str]:
+    """Get screenshot paths from episode for multi-image prompting.
+
+    Args:
+        episode: Episode to extract screenshots from.
+        max_steps: Maximum steps to include.
+
+    Returns:
+        List of screenshot paths.
+    """
+    paths = []
+    for step in episode.steps[:max_steps]:
+        if step.observation and step.observation.screenshot_path:
+            path = step.observation.screenshot_path
+            if Path(path).exists():
+                paths.append(path)
+    return paths
+
+
+def generate_length_matched_control(demo: str) -> str:
+    """Generate a control prompt with the same token count but no trajectory info.
+
+    Used to control for prompt length effects.
+
+    Args:
+        demo: The demo string to match length of.
+
+    Returns:
+        Control string of similar length with irrelevant content.
+    """
+    # Use generic placeholder text
+    placeholder = (
+        "This is placeholder text that serves as a control condition. "
+        "It contains no relevant information about the task or demonstration. "
+        "The purpose is to match the token count of the demonstration prompt. "
+    )
+
+    # Repeat to match approximate length
+    target_len = len(demo)
+    control = ""
+    while len(control) < target_len:
+        control += placeholder
+
+    return control[:target_len]
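A quick demonstration of the formatter output and the length-matched control, using a hypothetical one-step episode; the SimpleNamespace objects are duck-typed stand-ins for the real Episode/Step/Action classes in openadapt_ml.schema:

from types import SimpleNamespace
from openadapt_ml.experiments.demo_prompt.format_demo import (
    format_episode_as_demo,
    generate_length_matched_control,
)

# Duck-typed stand-ins exposing only the fields the formatters read.
step = SimpleNamespace(
    observation=SimpleNamespace(window_title="System Settings", screenshot_path=None),
    action=SimpleNamespace(type="click", normalized_coordinates=(0.031, 0.012), text=None),
)
episode = SimpleNamespace(instruction="Turn off Night Shift", steps=[step])

demo = format_episode_as_demo(episode, max_steps=10)
print(demo)
# DEMONSTRATION:
# Task: Turn off Night Shift
#
# Step 1:
#  Window: System Settings
#  Action: CLICK(0.031, 0.012)
#
# ---

control = generate_length_matched_control(demo)
assert len(control) == len(demo)  # same length, no trajectory information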
openadapt_ml/experiments/demo_prompt/results/experiment_20251231_002125.json (new file)

@@ -0,0 +1,83 @@
+{
+  "timestamp": "20251231_002125",
+  "provider": "anthropic",
+  "screenshot": "/Users/abrichr/oa/src/openadapt-capture/turn-off-nightshift/screenshots/capture_31807990_step_0.png",
+  "demo": "DEMONSTRATION:\nGoal: Turn off Night Shift in macOS System Settings\n\nThe following shows the step-by-step procedure:\n\nStep 1:\n [Screen: Desktop with Terminal window visible]\n [Action: CLICK(0.01, 0.01) - Click Apple menu icon in top-left]\n [Result: Apple menu dropdown opened]\n\nStep 2:\n [Screen: Apple menu visible with options]\n [Action: CLICK on \"System Settings...\" menu item]\n [Result: System Settings application opened]\n\nStep 3:\n [Screen: System Settings window with sidebar]\n [Action: CLICK on \"Displays\" in the sidebar]\n [Result: Displays panel shown in main area]\n\nStep 4:\n [Screen: Displays panel showing display settings]\n [Action: CLICK on \"Night Shift...\" button]\n [Result: Night Shift popup/sheet appeared]\n\nStep 5:\n [Screen: Night Shift popup with Schedule dropdown]\n [Action: CLICK on Schedule dropdown, select \"Off\"]\n [Result: Night Shift schedule set to Off, Night Shift disabled]\n\n---",
+  "test_cases": [
+    {
+      "test_case": {
+        "name": "near_toggle",
+        "task": "Turn ON Night Shift in macOS System Settings",
+        "similarity": "near",
+        "notes": "Same procedure, just toggle to opposite state"
+      },
+      "results": {
+        "zero_shot": {
+          "action": "CLICK(20, 8)",
+          "response": "I need to turn ON Night Shift in macOS. Based on the terminal window showing an attempt to control Night Shift, I should access System Settings to properly enable it. Let me click on the Apple menu.\n\nACTION: CLICK(20, 8)",
+          "error": null
+        },
+        "with_demo": {
+          "action": "CLICK(20, 8)",
+          "response": "I need to turn ON Night Shift in macOS System Settings. Let me start by clicking on the Apple menu icon to access System Settings.\n\nACTION: CLICK(20, 8)",
+          "error": null
+        },
+        "control": {
+          "action": "CLICK(1243, 8)",
+          "response": "I need to analyze the screenshot to understand how to turn on Night Shift in macOS System Settings.\n\nThe screenshot shows a Terminal window with some command output. To access Night Shift settings in macOS, I need to open System Settings. I can see various system icons in the menu bar at the top of the screen.\n\nACTION: CLICK(1243, 8)",
          "error": null
+        }
+      }
+    },
+    {
+      "test_case": {
+        "name": "medium_same_panel",
+        "task": "Adjust Night Shift color temperature to warmer setting",
+        "similarity": "medium",
+        "notes": "Same navigation path, different final action"
+      },
+      "results": {
+        "zero_shot": {
+          "action": "CLICK(1218, 8)",
+          "response": "The task requires adjusting the Night Shift color temperature to a warmer setting. Looking at the screenshot, I can see a macOS Terminal window. The system menu bar at the top right contains various control icons. I need to find the Night Shift or display settings controls.\n\nACTION: CLICK(1218, 8)",
+          "error": null
+        },
+        "with_demo": {
+          "action": "CLICK(19, 8)",
+          "response": "I need to adjust the Night Shift color temperature to make it warmer. First, I should access the System Settings to find the Night Shift controls.\n\nACTION: CLICK(19, 8)",
+          "error": null
+        },
+        "control": {
+          "action": "CLICK(1114, 8)",
+          "response": "I need to analyze the screenshot and the goal to determine the next action.\n\nThe screenshot shows a Terminal window on macOS. The goal is to \"Adjust Night Shift color temperature to warmer setting.\"\n\nLooking at the top menu bar, I can see various system icons. To adjust Night Shift settings, I would need to access System Preferences or use the Control Center. I can see what appears to be system control icons in the top-right corner of the screen.\n\nThe most direct way to adjust Night Shift would be to click on the Control Center icon or the display settings in the menu bar.\n\nACTION: CLICK(1114, 8)",
+          "error": null
+        }
+      }
+    },
+    {
+      "test_case": {
+        "name": "far_different_setting",
+        "task": "Turn on True Tone display in macOS System Settings",
+        "similarity": "far",
+        "notes": "Same app but different panel (True Tone is in Displays)"
+      },
+      "results": {
+        "zero_shot": {
+          "action": "CLICK(1217, 8)",
+          "response": "I need to access System Settings to turn on True Tone display. Let me analyze the screen to find how to access it.\n\nACTION: CLICK(1217, 8)",
+          "error": null
+        },
+        "with_demo": {
+          "action": "CLICK(20, 8)",
+          "response": "I need to turn on True Tone display in macOS System Settings. Let me start by clicking on the Apple menu in the top-left corner.\n\nACTION: CLICK(20, 8)",
+          "error": null
+        },
+        "control": {
+          "action": "CLICK(20, 8)",
+          "response": "I need to open System Settings to enable True Tone display. I'll click on the Apple menu in the top-left corner to access System Settings.\n\nACTION: CLICK(20, 8)",
+          "error": null
+        }
+      }
+    }
+  ]
+}