openadapt-ml 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openadapt_ml/benchmarks/__init__.py +8 -0
- openadapt_ml/benchmarks/agent.py +90 -11
- openadapt_ml/benchmarks/azure.py +35 -6
- openadapt_ml/benchmarks/cli.py +4449 -201
- openadapt_ml/benchmarks/live_tracker.py +180 -0
- openadapt_ml/benchmarks/runner.py +41 -4
- openadapt_ml/benchmarks/viewer.py +1219 -0
- openadapt_ml/benchmarks/vm_monitor.py +610 -0
- openadapt_ml/benchmarks/waa.py +61 -4
- openadapt_ml/benchmarks/waa_deploy/Dockerfile +222 -0
- openadapt_ml/benchmarks/waa_deploy/__init__.py +10 -0
- openadapt_ml/benchmarks/waa_deploy/api_agent.py +539 -0
- openadapt_ml/benchmarks/waa_deploy/start_waa_server.bat +53 -0
- openadapt_ml/benchmarks/waa_live.py +619 -0
- openadapt_ml/cloud/local.py +1555 -1
- openadapt_ml/cloud/ssh_tunnel.py +553 -0
- openadapt_ml/datasets/next_action.py +87 -68
- openadapt_ml/evals/grounding.py +26 -8
- openadapt_ml/evals/trajectory_matching.py +84 -36
- openadapt_ml/experiments/demo_prompt/__init__.py +19 -0
- openadapt_ml/experiments/demo_prompt/format_demo.py +226 -0
- openadapt_ml/experiments/demo_prompt/results/experiment_20251231_002125.json +83 -0
- openadapt_ml/experiments/demo_prompt/results/experiment_n30_20251231_165958.json +1100 -0
- openadapt_ml/experiments/demo_prompt/results/multistep_20251231_025051.json +182 -0
- openadapt_ml/experiments/demo_prompt/run_experiment.py +531 -0
- openadapt_ml/experiments/waa_demo/__init__.py +10 -0
- openadapt_ml/experiments/waa_demo/demos.py +357 -0
- openadapt_ml/experiments/waa_demo/runner.py +717 -0
- openadapt_ml/experiments/waa_demo/tasks.py +151 -0
- openadapt_ml/export/__init__.py +9 -0
- openadapt_ml/export/__main__.py +6 -0
- openadapt_ml/export/cli.py +89 -0
- openadapt_ml/export/parquet.py +265 -0
- openadapt_ml/ingest/__init__.py +3 -4
- openadapt_ml/ingest/capture.py +89 -81
- openadapt_ml/ingest/loader.py +116 -68
- openadapt_ml/ingest/synthetic.py +221 -159
- openadapt_ml/retrieval/README.md +226 -0
- openadapt_ml/retrieval/USAGE.md +391 -0
- openadapt_ml/retrieval/__init__.py +91 -0
- openadapt_ml/retrieval/demo_retriever.py +817 -0
- openadapt_ml/retrieval/embeddings.py +629 -0
- openadapt_ml/retrieval/index.py +194 -0
- openadapt_ml/retrieval/retriever.py +160 -0
- openadapt_ml/runtime/policy.py +10 -10
- openadapt_ml/schema/__init__.py +104 -0
- openadapt_ml/schema/converters.py +541 -0
- openadapt_ml/schema/episode.py +457 -0
- openadapt_ml/scripts/compare.py +26 -16
- openadapt_ml/scripts/eval_policy.py +4 -5
- openadapt_ml/scripts/prepare_synthetic.py +14 -17
- openadapt_ml/scripts/train.py +81 -70
- openadapt_ml/training/benchmark_viewer.py +3225 -0
- openadapt_ml/training/trainer.py +120 -363
- openadapt_ml/training/trl_trainer.py +354 -0
- {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.0.dist-info}/METADATA +102 -60
- openadapt_ml-0.2.0.dist-info/RECORD +86 -0
- openadapt_ml/schemas/__init__.py +0 -53
- openadapt_ml/schemas/sessions.py +0 -122
- openadapt_ml/schemas/validation.py +0 -252
- openadapt_ml-0.1.0.dist-info/RECORD +0 -55
- {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.0.dist-info}/WHEEL +0 -0
- {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,531 @@
|
|
|
1
|
+
"""Demo-conditioned prompt experiment runner.
|
|
2
|
+
|
|
3
|
+
Tests whether including a human demonstration improves VLM performance.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import argparse
import base64
import json
import re
import sys
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Any
|
|
16
|
+
|
|
17
|
+
from openadapt_ml.experiments.demo_prompt.format_demo import (
|
|
18
|
+
format_episode_as_demo,
|
|
19
|
+
format_episode_verbose,
|
|
20
|
+
generate_length_matched_control,
|
|
21
|
+
get_demo_screenshot_paths,
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# System prompt for GUI automation.
# Defines the closed action vocabulary the model may emit and the
# "ACTION:" output convention that _parse_action relies on when
# extracting the chosen action from the response.
SYSTEM_PROMPT = """You are a GUI automation agent. Given a screenshot and task instruction, determine the next action to take.

Available actions:
- CLICK(x, y) - Click at normalized coordinates (0.0-1.0)
- TYPE("text") - Type the given text
- KEY(key) - Press a key (e.g., Enter, Tab, Escape)
- KEY(modifier+key) - Press key combination (e.g., Cmd+c, Ctrl+v)
- SCROLL(direction) - Scroll up or down
- DONE() - Task is complete

Respond with exactly ONE action.
Think step by step, then output the action on a new line starting with "ACTION:"
"""
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
@dataclass
class ExperimentResult:
    """Result of a single experiment run.

    Captures the raw model response and the parsed action for one
    (task, condition) invocation; ``success`` is left as None until a
    later manual evaluation fills it in.
    """

    # Task instruction that was sent to the model.
    task: str
    condition: str  # "zero_shot", "with_demo", "control"
    # Raw model response text ("" when the API call failed).
    response: str
    # Action string extracted from the response, or None if none found.
    action_parsed: str | None
    success: bool | None  # None if not evaluated
    # Exception message when the API call raised; None otherwise.
    error: str | None = None
    # ISO-8601 creation time, recorded when the result is constructed.
    timestamp: str = field(default_factory=lambda: datetime.now().isoformat())
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
@dataclass
class TaskPair:
    """A demo-test task pair.

    Pairs a demonstration task with a test task at a given similarity
    level, plus optional paths to the underlying capture artifacts.
    """

    # Instruction performed in the recorded demonstration.
    demo_task: str
    # Instruction the model is asked to perform at test time.
    test_task: str
    similarity: str  # "near", "medium", "far"
    # Directory of the recorded demo capture, if available.
    demo_capture_path: str | None = None
    # Screenshot shown to the model for the test task, if available.
    test_screenshot_path: str | None = None
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
class DemoPromptExperiment:
    """Run demo-conditioned prompt experiments against a hosted VLM.

    Each test task can be run under three conditions: zero-shot (no
    demo), with-demo (formatted demonstration prepended, optionally with
    demo screenshots), and a length-matched control (same prompt length,
    no demo content).
    """

    # Maximum number of images attached to a single API call.
    _MAX_IMAGES = 5

    # Pattern for an explicit "ACTION: ..." line; checked first.
    _ACTION_PREFIX = r"ACTION:\s*(.+)"
    # Fallback patterns for bare action expressions anywhere in the text.
    # Order matters: the first pattern that matches wins.
    _ACTION_PATTERNS = (
        r"(CLICK\s*\([^)]+\))",
        r"(TYPE\s*\([^)]+\))",
        r"(KEY\s*\([^)]+\))",
        r"(SCROLL\s*\([^)]+\))",
        r"(DONE\s*\(\s*\))",
    )

    def __init__(
        self,
        provider: str = "anthropic",
        max_tokens: int = 512,
        verbose: bool = True,
    ):
        """Initialize experiment.

        Args:
            provider: API provider ("anthropic" or "openai").
            max_tokens: Maximum tokens for response.
            verbose: Whether to print progress.
        """
        self.provider = provider
        self.max_tokens = max_tokens
        self.verbose = verbose
        self._client = None  # created lazily by _get_client()

    def _get_client(self) -> Any:
        """Lazily initialize and cache the API client.

        Returns:
            The provider-specific client instance.

        Raises:
            RuntimeError: If the provider's API key is not configured.
            ValueError: If the provider name is not recognized.
        """
        if self._client is not None:
            return self._client

        if self.provider == "anthropic":
            import os

            from anthropic import Anthropic
            from openadapt_ml.config import settings

            key = settings.anthropic_api_key or os.getenv("ANTHROPIC_API_KEY")
            if not key:
                raise RuntimeError("ANTHROPIC_API_KEY not set")
            self._client = Anthropic(api_key=key)

        elif self.provider == "openai":
            import os

            from openai import OpenAI
            from openadapt_ml.config import settings

            key = settings.openai_api_key or os.getenv("OPENAI_API_KEY")
            if not key:
                raise RuntimeError("OPENAI_API_KEY not set")
            self._client = OpenAI(api_key=key)

        else:
            raise ValueError(f"Unknown provider: {self.provider}")

        return self._client

    @staticmethod
    def _encode_images(
        image_paths: list[str] | None,
        limit: int = 5,
    ) -> list[str]:
        """Base64-encode up to *limit* existing images.

        Shared by both provider branches of _call_api so the encoding
        logic lives in one place.

        Args:
            image_paths: Candidate image paths (may be None).
            limit: Maximum number of paths considered (taken before the
                existence check, matching the original behavior).

        Returns:
            List of base64-encoded PNG payloads for paths that exist.
        """
        encoded: list[str] = []
        for path in (image_paths or [])[:limit]:
            if Path(path).exists():
                with open(path, "rb") as f:
                    encoded.append(base64.b64encode(f.read()).decode("utf-8"))
        return encoded

    def _call_api(
        self,
        user_content: str,
        image_paths: list[str] | None = None,
    ) -> str:
        """Call the API with text and optional images.

        Args:
            user_content: User message text.
            image_paths: Optional list of image paths to include (the
                first 5 existing files are attached).

        Returns:
            Model response text.

        Raises:
            ValueError: If the provider is not recognized.
        """
        client = self._get_client()
        images_b64 = self._encode_images(image_paths, self._MAX_IMAGES)

        if self.provider == "anthropic":
            # Images first, then the text, per the Messages API content
            # block format.
            content: list[dict[str, Any]] = [
                {
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": "image/png",
                        "data": b64,
                    },
                }
                for b64 in images_b64
            ]
            content.append({"type": "text", "text": user_content})

            response = client.messages.create(
                model="claude-sonnet-4-5-20250929",
                max_tokens=self.max_tokens,
                system=SYSTEM_PROMPT,
                messages=[{"role": "user", "content": content}],
            )

            # Join all text parts; non-text parts are ignored.
            parts = getattr(response, "content", [])
            texts = [
                getattr(p, "text", "")
                for p in parts
                if getattr(p, "type", "") == "text"
            ]
            return "\n".join(t for t in texts if t).strip()

        if self.provider == "openai":
            user_content_parts: list[dict[str, Any]] = [
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/png;base64,{b64}"},
                }
                for b64 in images_b64
            ]
            user_content_parts.append({"type": "text", "text": user_content})

            response = client.chat.completions.create(
                model="gpt-4o",
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": user_content_parts},
                ],
                max_tokens=self.max_tokens,
            )
            return response.choices[0].message.content or ""

        raise ValueError(f"Unknown provider: {self.provider}")

    def _parse_action(self, response: str) -> str | None:
        """Extract action from response.

        Prefers an explicit "ACTION:" line; otherwise falls back to the
        first recognizable action expression anywhere in the text.

        Args:
            response: Model response text.

        Returns:
            Extracted action string or None.
        """
        match = re.search(self._ACTION_PREFIX, response, re.IGNORECASE)
        if match:
            return match.group(1).strip()

        for pattern in self._ACTION_PATTERNS:
            match = re.search(pattern, response, re.IGNORECASE)
            if match:
                return match.group(1).strip()

        return None

    def _run_condition(
        self,
        task: str,
        condition: str,
        label: str,
        prompt: str,
        image_paths: list[str],
    ) -> ExperimentResult:
        """Run one condition and wrap the outcome in an ExperimentResult.

        Shared implementation for run_zero_shot / run_with_demo /
        run_control, which previously duplicated this logic.

        Args:
            task: Task instruction.
            condition: Condition name stored on the result.
            label: Human-readable label for progress output.
            prompt: Full user prompt to send.
            image_paths: Screenshot paths to attach.

        Returns:
            ExperimentResult; on API failure, success is False and error
            holds the exception message.
        """
        if self.verbose:
            print(f" Running {label}: {task[:50]}...")

        try:
            response = self._call_api(prompt, image_paths)
            return ExperimentResult(
                task=task,
                condition=condition,
                response=response,
                action_parsed=self._parse_action(response),
                success=None,  # Manual evaluation needed
            )
        except Exception as e:
            return ExperimentResult(
                task=task,
                condition=condition,
                response="",
                action_parsed=None,
                success=False,
                error=str(e),
            )

    def run_zero_shot(
        self,
        task: str,
        screenshot_path: str,
    ) -> ExperimentResult:
        """Run zero-shot (no demo) condition.

        Args:
            task: Task instruction.
            screenshot_path: Path to current screenshot.

        Returns:
            ExperimentResult.
        """
        prompt = f"Goal: {task}\n\nWhat is the next action?"
        return self._run_condition(
            task, "zero_shot", "zero-shot", prompt, [screenshot_path]
        )

    def run_with_demo(
        self,
        task: str,
        screenshot_path: str,
        demo_text: str,
        demo_screenshots: list[str] | None = None,
    ) -> ExperimentResult:
        """Run with-demo condition.

        Args:
            task: Task instruction.
            screenshot_path: Path to current screenshot.
            demo_text: Formatted demo text.
            demo_screenshots: Optional demo screenshot paths.

        Returns:
            ExperimentResult.
        """
        prompt = (
            f"{demo_text}\n\nNOW PERFORM THIS TASK:\nGoal: {task}"
            f"\n\nWhat is the next action?"
        )
        # Demo screenshots precede the current screenshot in the prompt.
        all_images = (demo_screenshots or []) + [screenshot_path]
        return self._run_condition(
            task, "with_demo", "with-demo", prompt, all_images
        )

    def run_control(
        self,
        task: str,
        screenshot_path: str,
        control_text: str,
    ) -> ExperimentResult:
        """Run length-matched control condition.

        Args:
            task: Task instruction.
            screenshot_path: Path to current screenshot.
            control_text: Length-matched control text.

        Returns:
            ExperimentResult.
        """
        prompt = f"{control_text}\n\nGoal: {task}\n\nWhat is the next action?"
        return self._run_condition(
            task, "control", "control", prompt, [screenshot_path]
        )

    def run_task_pair(
        self,
        demo_episode: Any,  # Episode
        test_task: str,
        test_screenshot: str,
        include_demo_images: bool = False,
    ) -> dict[str, ExperimentResult]:
        """Run all conditions for a task pair.

        Args:
            demo_episode: Episode containing the demonstration.
            test_task: Test task instruction.
            test_screenshot: Path to test screenshot.
            include_demo_images: Whether to include demo screenshots.

        Returns:
            Dict mapping condition name to result.
        """
        # Format demo as text (at most 10 steps).
        demo_text = format_episode_verbose(demo_episode, max_steps=10)

        # Get demo screenshots if requested (at most 5).
        demo_screenshots = None
        if include_demo_images:
            demo_screenshots = get_demo_screenshot_paths(demo_episode, max_steps=5)

        # Generate length-matched control text from the demo.
        control_text = generate_length_matched_control(demo_text)

        results = {}
        results["zero_shot"] = self.run_zero_shot(test_task, test_screenshot)
        results["with_demo"] = self.run_with_demo(
            test_task, test_screenshot, demo_text, demo_screenshots
        )
        results["control"] = self.run_control(test_task, test_screenshot, control_text)

        return results
|
|
390
|
+
|
|
391
|
+
|
|
392
|
+
def run_experiment(
    demo_capture_path: str,
    test_task: str,
    test_screenshot: str,
    provider: str = "anthropic",
    output_dir: str | None = None,
    include_demo_images: bool = False,
    goal: str | None = None,
) -> dict[str, Any]:
    """Run the full experiment.

    Loads a recorded demonstration, runs all three prompt conditions on
    the test task, prints a summary, and optionally writes the results
    to a timestamped JSON file.

    Args:
        demo_capture_path: Path to demo capture directory.
        test_task: Test task instruction.
        test_screenshot: Path to test screenshot.
        provider: API provider.
        output_dir: Optional output directory for results.
        include_demo_images: Whether to include demo screenshots.
        goal: Optional goal for demo episode (overrides capture's).

    Returns:
        Dict with results.
    """
    from openadapt_ml.ingest.capture import capture_to_episode

    print(f"Loading demo from: {demo_capture_path}")
    episode = capture_to_episode(demo_capture_path, goal=goal)
    print(f" Loaded {len(episode.steps)} steps, goal: {episode.goal}")

    print(f"\nTest task: {test_task}")
    print(f"Test screenshot: {test_screenshot}")

    results = DemoPromptExperiment(provider=provider).run_task_pair(
        demo_episode=episode,
        test_task=test_task,
        test_screenshot=test_screenshot,
        include_demo_images=include_demo_images,
    )

    # Summarize each condition on stdout.
    banner = "=" * 60
    print("\n" + banner)
    print("RESULTS")
    print(banner)

    for name, outcome in results.items():
        print(f"\n{name.upper()}:")
        print(f" Action: {outcome.action_parsed}")
        if outcome.error:
            print(f" Error: {outcome.error}")
        print(f" Response preview: {outcome.response[:200]}...")

    # Persist results when an output directory was given.
    if output_dir:
        dest = Path(output_dir)
        dest.mkdir(parents=True, exist_ok=True)

        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        results_file = dest / f"results_{stamp}.json"

        serialized = {
            name: {
                "task": outcome.task,
                "condition": outcome.condition,
                "action_parsed": outcome.action_parsed,
                "response": outcome.response,
                "error": outcome.error,
                "timestamp": outcome.timestamp,
            }
            for name, outcome in results.items()
        }
        payload = {
            "demo_capture": demo_capture_path,
            "test_task": test_task,
            "test_screenshot": test_screenshot,
            "provider": provider,
            "results": serialized,
        }
        with open(results_file, "w") as f:
            json.dump(payload, f, indent=2)
        print(f"\nResults saved to: {results_file}")

    return {"results": results, "episode": episode}
|
|
475
|
+
|
|
476
|
+
|
|
477
|
+
def main():
    """CLI entry point: parse arguments and launch the experiment."""
    parser = argparse.ArgumentParser(
        description="Run demo-conditioned prompt experiment"
    )
    # Required inputs.
    parser.add_argument(
        "--demo-capture", required=True, help="Path to demo capture directory"
    )
    parser.add_argument(
        "--test-task", required=True, help="Test task instruction"
    )
    parser.add_argument(
        "--test-screenshot", required=True, help="Path to test screenshot"
    )
    # Optional configuration.
    parser.add_argument(
        "--provider",
        default="anthropic",
        choices=["anthropic", "openai"],
        help="API provider (default: anthropic)",
    )
    parser.add_argument("--output", help="Output directory for results")
    parser.add_argument(
        "--include-demo-images",
        action="store_true",
        help="Include demo screenshots in prompt",
    )
    parser.add_argument("--goal", help="Override goal for demo episode")

    args = parser.parse_args()

    run_experiment(
        demo_capture_path=args.demo_capture,
        test_task=args.test_task,
        test_screenshot=args.test_screenshot,
        provider=args.provider,
        output_dir=args.output,
        include_demo_images=args.include_demo_images,
        goal=args.goal,
    )
|
|
528
|
+
|
|
529
|
+
|
|
530
|
+
# Script entry point: delegate to the CLI handler.
if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
"""WAA demo-conditioned experiment module.
|
|
2
|
+
|
|
3
|
+
This module contains demonstrations and task definitions for the
|
|
4
|
+
Windows Agent Arena demo-conditioned prompting experiment.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from openadapt_ml.experiments.waa_demo.demos import DEMOS, get_demo
|
|
8
|
+
from openadapt_ml.experiments.waa_demo.tasks import TASKS, get_task
|
|
9
|
+
|
|
10
|
+
__all__ = ["DEMOS", "TASKS", "get_demo", "get_task"]
|