openadapt-ml 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. openadapt_ml/benchmarks/__init__.py +8 -0
  2. openadapt_ml/benchmarks/agent.py +90 -11
  3. openadapt_ml/benchmarks/azure.py +35 -6
  4. openadapt_ml/benchmarks/cli.py +4449 -201
  5. openadapt_ml/benchmarks/live_tracker.py +180 -0
  6. openadapt_ml/benchmarks/runner.py +41 -4
  7. openadapt_ml/benchmarks/viewer.py +1219 -0
  8. openadapt_ml/benchmarks/vm_monitor.py +610 -0
  9. openadapt_ml/benchmarks/waa.py +61 -4
  10. openadapt_ml/benchmarks/waa_deploy/Dockerfile +222 -0
  11. openadapt_ml/benchmarks/waa_deploy/__init__.py +10 -0
  12. openadapt_ml/benchmarks/waa_deploy/api_agent.py +539 -0
  13. openadapt_ml/benchmarks/waa_deploy/start_waa_server.bat +53 -0
  14. openadapt_ml/benchmarks/waa_live.py +619 -0
  15. openadapt_ml/cloud/local.py +1555 -1
  16. openadapt_ml/cloud/ssh_tunnel.py +553 -0
  17. openadapt_ml/datasets/next_action.py +87 -68
  18. openadapt_ml/evals/grounding.py +26 -8
  19. openadapt_ml/evals/trajectory_matching.py +84 -36
  20. openadapt_ml/experiments/demo_prompt/__init__.py +19 -0
  21. openadapt_ml/experiments/demo_prompt/format_demo.py +226 -0
  22. openadapt_ml/experiments/demo_prompt/results/experiment_20251231_002125.json +83 -0
  23. openadapt_ml/experiments/demo_prompt/results/experiment_n30_20251231_165958.json +1100 -0
  24. openadapt_ml/experiments/demo_prompt/results/multistep_20251231_025051.json +182 -0
  25. openadapt_ml/experiments/demo_prompt/run_experiment.py +531 -0
  26. openadapt_ml/experiments/waa_demo/__init__.py +10 -0
  27. openadapt_ml/experiments/waa_demo/demos.py +357 -0
  28. openadapt_ml/experiments/waa_demo/runner.py +717 -0
  29. openadapt_ml/experiments/waa_demo/tasks.py +151 -0
  30. openadapt_ml/export/__init__.py +9 -0
  31. openadapt_ml/export/__main__.py +6 -0
  32. openadapt_ml/export/cli.py +89 -0
  33. openadapt_ml/export/parquet.py +265 -0
  34. openadapt_ml/ingest/__init__.py +3 -4
  35. openadapt_ml/ingest/capture.py +89 -81
  36. openadapt_ml/ingest/loader.py +116 -68
  37. openadapt_ml/ingest/synthetic.py +221 -159
  38. openadapt_ml/retrieval/README.md +226 -0
  39. openadapt_ml/retrieval/USAGE.md +391 -0
  40. openadapt_ml/retrieval/__init__.py +91 -0
  41. openadapt_ml/retrieval/demo_retriever.py +817 -0
  42. openadapt_ml/retrieval/embeddings.py +629 -0
  43. openadapt_ml/retrieval/index.py +194 -0
  44. openadapt_ml/retrieval/retriever.py +160 -0
  45. openadapt_ml/runtime/policy.py +10 -10
  46. openadapt_ml/schema/__init__.py +104 -0
  47. openadapt_ml/schema/converters.py +541 -0
  48. openadapt_ml/schema/episode.py +457 -0
  49. openadapt_ml/scripts/compare.py +26 -16
  50. openadapt_ml/scripts/eval_policy.py +4 -5
  51. openadapt_ml/scripts/prepare_synthetic.py +14 -17
  52. openadapt_ml/scripts/train.py +81 -70
  53. openadapt_ml/training/benchmark_viewer.py +3225 -0
  54. openadapt_ml/training/trainer.py +120 -363
  55. openadapt_ml/training/trl_trainer.py +354 -0
  56. {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.0.dist-info}/METADATA +102 -60
  57. openadapt_ml-0.2.0.dist-info/RECORD +86 -0
  58. openadapt_ml/schemas/__init__.py +0 -53
  59. openadapt_ml/schemas/sessions.py +0 -122
  60. openadapt_ml/schemas/validation.py +0 -252
  61. openadapt_ml-0.1.0.dist-info/RECORD +0 -55
  62. {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.0.dist-info}/WHEEL +0 -0
  63. {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.0.dist-info}/licenses/LICENSE +0 -0
openadapt_ml/experiments/demo_prompt/run_experiment.py (new file)
@@ -0,0 +1,531 @@
+ """Demo-conditioned prompt experiment runner.
+
+ Tests whether including a human demonstration improves VLM performance.
+ """
+
+ from __future__ import annotations
+
+ import argparse
+ import base64
+ import json
+ import sys
+ from dataclasses import dataclass, field
+ from datetime import datetime
+ from pathlib import Path
+ from typing import Any
+
+ from openadapt_ml.experiments.demo_prompt.format_demo import (
+     format_episode_as_demo,
+     format_episode_verbose,
+     generate_length_matched_control,
+     get_demo_screenshot_paths,
+ )
+
+
+ # System prompt for GUI automation
+ SYSTEM_PROMPT = """You are a GUI automation agent. Given a screenshot and task instruction, determine the next action to take.
+
+ Available actions:
+ - CLICK(x, y) - Click at normalized coordinates (0.0-1.0)
+ - TYPE("text") - Type the given text
+ - KEY(key) - Press a key (e.g., Enter, Tab, Escape)
+ - KEY(modifier+key) - Press key combination (e.g., Cmd+c, Ctrl+v)
+ - SCROLL(direction) - Scroll up or down
+ - DONE() - Task is complete
+
+ Respond with exactly ONE action.
+ Think step by step, then output the action on a new line starting with "ACTION:"
+ """
+
+
+ @dataclass
+ class ExperimentResult:
+     """Result of a single experiment run."""
+
+     task: str
+     condition: str  # "zero_shot", "with_demo", "control"
+     response: str
+     action_parsed: str | None
+     success: bool | None  # None if not evaluated
+     error: str | None = None
+     timestamp: str = field(default_factory=lambda: datetime.now().isoformat())
+
+
+ @dataclass
+ class TaskPair:
+     """A demo-test task pair."""
+
+     demo_task: str
+     test_task: str
+     similarity: str  # "near", "medium", "far"
+     demo_capture_path: str | None = None
+     test_screenshot_path: str | None = None
+
+
+ class DemoPromptExperiment:
+     """Run demo-conditioned prompt experiments."""
+
+     def __init__(
+         self,
+         provider: str = "anthropic",
+         max_tokens: int = 512,
+         verbose: bool = True,
+     ):
+         """Initialize experiment.
+
+         Args:
+             provider: API provider ("anthropic" or "openai").
+             max_tokens: Maximum tokens for response.
+             verbose: Whether to print progress.
+         """
+         self.provider = provider
+         self.max_tokens = max_tokens
+         self.verbose = verbose
+         self._client = None
+
+     def _get_client(self) -> Any:
+         """Lazily initialize API client."""
+         if self._client is not None:
+             return self._client
+
+         if self.provider == "anthropic":
+             from anthropic import Anthropic
+             from openadapt_ml.config import settings
+             import os
+
+             key = settings.anthropic_api_key or os.getenv("ANTHROPIC_API_KEY")
+             if not key:
+                 raise RuntimeError("ANTHROPIC_API_KEY not set")
+             self._client = Anthropic(api_key=key)
+
+         elif self.provider == "openai":
+             from openai import OpenAI
+             from openadapt_ml.config import settings
+             import os
+
+             key = settings.openai_api_key or os.getenv("OPENAI_API_KEY")
+             if not key:
+                 raise RuntimeError("OPENAI_API_KEY not set")
+             self._client = OpenAI(api_key=key)
+
+         else:
+             raise ValueError(f"Unknown provider: {self.provider}")
+
+         return self._client
+
+     def _call_api(
+         self,
+         user_content: str,
+         image_paths: list[str] | None = None,
+     ) -> str:
+         """Call the API with text and optional images.
+
+         Args:
+             user_content: User message text.
+             image_paths: Optional list of image paths to include.
+
+         Returns:
+             Model response text.
+         """
+         client = self._get_client()
+
+         if self.provider == "anthropic":
+             content: list[dict[str, Any]] = []
+
+             # Add images first
+             if image_paths:
+                 for path in image_paths[:5]:  # Limit to 5 images
+                     if Path(path).exists():
+                         with open(path, "rb") as f:
+                             image_b64 = base64.b64encode(f.read()).decode("utf-8")
+                         content.append({
+                             "type": "image",
+                             "source": {
+                                 "type": "base64",
+                                 "media_type": "image/png",
+                                 "data": image_b64,
+                             },
+                         })
+
+             # Add text
+             content.append({"type": "text", "text": user_content})
+
+             response = client.messages.create(
+                 model="claude-sonnet-4-5-20250929",
+                 max_tokens=self.max_tokens,
+                 system=SYSTEM_PROMPT,
+                 messages=[{"role": "user", "content": content}],
+             )
+
+             parts = getattr(response, "content", [])
+             texts = [getattr(p, "text", "") for p in parts if getattr(p, "type", "") == "text"]
+             return "\n".join([t for t in texts if t]).strip()
+
+         elif self.provider == "openai":
+             user_content_parts: list[dict[str, Any]] = []
+
+             # Add images first
+             if image_paths:
+                 for path in image_paths[:5]:
+                     if Path(path).exists():
+                         with open(path, "rb") as f:
+                             image_b64 = base64.b64encode(f.read()).decode("utf-8")
+                         user_content_parts.append({
+                             "type": "image_url",
+                             "image_url": {"url": f"data:image/png;base64,{image_b64}"},
+                         })
+
+             # Add text
+             user_content_parts.append({"type": "text", "text": user_content})
+
+             response = client.chat.completions.create(
+                 model="gpt-4o",
+                 messages=[
+                     {"role": "system", "content": SYSTEM_PROMPT},
+                     {"role": "user", "content": user_content_parts},
+                 ],
+                 max_tokens=self.max_tokens,
+             )
+
+             return response.choices[0].message.content or ""
+
+         raise ValueError(f"Unknown provider: {self.provider}")
+
+     def _parse_action(self, response: str) -> str | None:
+         """Extract action from response.
+
+         Args:
+             response: Model response text.
+
+         Returns:
+             Extracted action string or None.
+         """
+         import re
+
+         # Look for ACTION: prefix
+         match = re.search(r"ACTION:\s*(.+)", response, re.IGNORECASE)
+         if match:
+             return match.group(1).strip()
+
+         # Look for action patterns
+         patterns = [
+             r"(CLICK\s*\([^)]+\))",
+             r"(TYPE\s*\([^)]+\))",
+             r"(KEY\s*\([^)]+\))",
+             r"(SCROLL\s*\([^)]+\))",
+             r"(DONE\s*\(\s*\))",
+         ]
+         for pattern in patterns:
+             match = re.search(pattern, response, re.IGNORECASE)
+             if match:
+                 return match.group(1).strip()
+
+         return None
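[Editor's note: the parser has two passes, an explicit "ACTION:" prefix first and bare action patterns as a fallback. A minimal sanity check of both passes, with made-up model replies (the parser itself needs no API key):

from openadapt_ml.experiments.demo_prompt.run_experiment import DemoPromptExperiment

exp = DemoPromptExperiment(provider="anthropic")

# Pass 1: an explicit ACTION: prefix wins.
assert exp._parse_action("I will click it.\nACTION: CLICK(0.5, 0.2)") == "CLICK(0.5, 0.2)"

# Pass 2: a bare action embedded in prose is still recovered.
assert exp._parse_action('The next step is TYPE("hello") here.') == 'TYPE("hello")'

# No recognizable action yields None.
assert exp._parse_action("I am not sure what to do.") is None
]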
+
+     def run_zero_shot(
+         self,
+         task: str,
+         screenshot_path: str,
+     ) -> ExperimentResult:
+         """Run zero-shot (no demo) condition.
+
+         Args:
+             task: Task instruction.
+             screenshot_path: Path to current screenshot.
+
+         Returns:
+             ExperimentResult.
+         """
+         if self.verbose:
+             print(f" Running zero-shot: {task[:50]}...")
+
+         prompt = f"Goal: {task}\n\nWhat is the next action?"
+
+         try:
+             response = self._call_api(prompt, [screenshot_path])
+             action = self._parse_action(response)
+             return ExperimentResult(
+                 task=task,
+                 condition="zero_shot",
+                 response=response,
+                 action_parsed=action,
+                 success=None,  # Manual evaluation needed
+             )
+         except Exception as e:
+             return ExperimentResult(
+                 task=task,
+                 condition="zero_shot",
+                 response="",
+                 action_parsed=None,
+                 success=False,
+                 error=str(e),
+             )
+
+     def run_with_demo(
+         self,
+         task: str,
+         screenshot_path: str,
+         demo_text: str,
+         demo_screenshots: list[str] | None = None,
+     ) -> ExperimentResult:
+         """Run with-demo condition.
+
+         Args:
+             task: Task instruction.
+             screenshot_path: Path to current screenshot.
+             demo_text: Formatted demo text.
+             demo_screenshots: Optional demo screenshot paths.
+
+         Returns:
+             ExperimentResult.
+         """
+         if self.verbose:
+             print(f" Running with-demo: {task[:50]}...")
+
+         prompt = f"{demo_text}\n\nNOW PERFORM THIS TASK:\nGoal: {task}\n\nWhat is the next action?"
+
+         # Combine demo screenshots with current screenshot
+         all_images = (demo_screenshots or []) + [screenshot_path]
+
+         try:
+             response = self._call_api(prompt, all_images)
+             action = self._parse_action(response)
+             return ExperimentResult(
+                 task=task,
+                 condition="with_demo",
+                 response=response,
+                 action_parsed=action,
+                 success=None,
+             )
+         except Exception as e:
+             return ExperimentResult(
+                 task=task,
+                 condition="with_demo",
+                 response="",
+                 action_parsed=None,
+                 success=False,
+                 error=str(e),
+             )
+
+     def run_control(
+         self,
+         task: str,
+         screenshot_path: str,
+         control_text: str,
+     ) -> ExperimentResult:
+         """Run length-matched control condition.
+
+         Args:
+             task: Task instruction.
+             screenshot_path: Path to current screenshot.
+             control_text: Length-matched control text.
+
+         Returns:
+             ExperimentResult.
+         """
+         if self.verbose:
+             print(f" Running control: {task[:50]}...")
+
+         prompt = f"{control_text}\n\nGoal: {task}\n\nWhat is the next action?"
+
+         try:
+             response = self._call_api(prompt, [screenshot_path])
+             action = self._parse_action(response)
+             return ExperimentResult(
+                 task=task,
+                 condition="control",
+                 response=response,
+                 action_parsed=action,
+                 success=None,
+             )
+         except Exception as e:
+             return ExperimentResult(
+                 task=task,
+                 condition="control",
+                 response="",
+                 action_parsed=None,
+                 success=False,
+                 error=str(e),
+             )
+
+     def run_task_pair(
+         self,
+         demo_episode: Any,  # Episode
+         test_task: str,
+         test_screenshot: str,
+         include_demo_images: bool = False,
+     ) -> dict[str, ExperimentResult]:
+         """Run all conditions for a task pair.
+
+         Args:
+             demo_episode: Episode containing the demonstration.
+             test_task: Test task instruction.
+             test_screenshot: Path to test screenshot.
+             include_demo_images: Whether to include demo screenshots.
+
+         Returns:
+             Dict mapping condition name to result.
+         """
+         # Format demo
+         demo_text = format_episode_verbose(demo_episode, max_steps=10)
+
+         # Get demo screenshots if requested
+         demo_screenshots = None
+         if include_demo_images:
+             demo_screenshots = get_demo_screenshot_paths(demo_episode, max_steps=5)
+
+         # Generate control
+         control_text = generate_length_matched_control(demo_text)
+
+         results = {}
+
+         # Run all conditions
+         results["zero_shot"] = self.run_zero_shot(test_task, test_screenshot)
+         results["with_demo"] = self.run_with_demo(
+             test_task, test_screenshot, demo_text, demo_screenshots
+         )
+         results["control"] = self.run_control(test_task, test_screenshot, control_text)
+
+         return results
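[Editor's note: a sketch of driving the three conditions directly, assuming a demo Episode has already been built via capture_to_episode (as run_experiment below does); the capture and screenshot paths and the task text are hypothetical:

from openadapt_ml.ingest.capture import capture_to_episode
from openadapt_ml.experiments.demo_prompt.run_experiment import DemoPromptExperiment

episode = capture_to_episode("./captures/demo_session", goal=None)  # hypothetical path
exp = DemoPromptExperiment(provider="anthropic", verbose=False)
results = exp.run_task_pair(
    demo_episode=episode,
    test_task="Rename the file to report.txt",   # hypothetical task
    test_screenshot="./screens/test.png",        # hypothetical path
    include_demo_images=True,
)
for condition, result in results.items():
    print(condition, "->", result.action_parsed)
]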
+
+
+ def run_experiment(
+     demo_capture_path: str,
+     test_task: str,
+     test_screenshot: str,
+     provider: str = "anthropic",
+     output_dir: str | None = None,
+     include_demo_images: bool = False,
+     goal: str | None = None,
+ ) -> dict[str, Any]:
+     """Run the full experiment.
+
+     Args:
+         demo_capture_path: Path to demo capture directory.
+         test_task: Test task instruction.
+         test_screenshot: Path to test screenshot.
+         provider: API provider.
+         output_dir: Optional output directory for results.
+         include_demo_images: Whether to include demo screenshots.
+         goal: Optional goal for demo episode (overrides capture's).
+
+     Returns:
+         Dict with results.
+     """
+     from openadapt_ml.ingest.capture import capture_to_episode
+
+     print(f"Loading demo from: {demo_capture_path}")
+     episode = capture_to_episode(demo_capture_path, goal=goal)
+     print(f" Loaded {len(episode.steps)} steps, goal: {episode.goal}")
+
+     print(f"\nTest task: {test_task}")
+     print(f"Test screenshot: {test_screenshot}")
+
+     experiment = DemoPromptExperiment(provider=provider)
+     results = experiment.run_task_pair(
+         demo_episode=episode,
+         test_task=test_task,
+         test_screenshot=test_screenshot,
+         include_demo_images=include_demo_images,
+     )
+
+     # Print results
+     print("\n" + "=" * 60)
+     print("RESULTS")
+     print("=" * 60)
+
+     for condition, result in results.items():
+         print(f"\n{condition.upper()}:")
+         print(f" Action: {result.action_parsed}")
+         if result.error:
+             print(f" Error: {result.error}")
+         print(f" Response preview: {result.response[:200]}...")
+
+     # Save results if output dir specified
+     if output_dir:
+         output_path = Path(output_dir)
+         output_path.mkdir(parents=True, exist_ok=True)
+
+         results_file = output_path / f"results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
+         with open(results_file, "w") as f:
+             json.dump(
+                 {
+                     "demo_capture": demo_capture_path,
+                     "test_task": test_task,
+                     "test_screenshot": test_screenshot,
+                     "provider": provider,
+                     "results": {
+                         k: {
+                             "task": v.task,
+                             "condition": v.condition,
+                             "action_parsed": v.action_parsed,
+                             "response": v.response,
+                             "error": v.error,
+                             "timestamp": v.timestamp,
+                         }
+                         for k, v in results.items()
+                     },
+                 },
+                 f,
+                 indent=2,
+             )
+         print(f"\nResults saved to: {results_file}")
+
+     return {"results": results, "episode": episode}
+
+
+ def main():
+     """CLI entry point."""
+     parser = argparse.ArgumentParser(
+         description="Run demo-conditioned prompt experiment"
+     )
+     parser.add_argument(
+         "--demo-capture",
+         required=True,
+         help="Path to demo capture directory",
+     )
+     parser.add_argument(
+         "--test-task",
+         required=True,
+         help="Test task instruction",
+     )
+     parser.add_argument(
+         "--test-screenshot",
+         required=True,
+         help="Path to test screenshot",
+     )
+     parser.add_argument(
+         "--provider",
+         default="anthropic",
+         choices=["anthropic", "openai"],
+         help="API provider (default: anthropic)",
+     )
+     parser.add_argument(
+         "--output",
+         help="Output directory for results",
+     )
+     parser.add_argument(
+         "--include-demo-images",
+         action="store_true",
+         help="Include demo screenshots in prompt",
+     )
+     parser.add_argument(
+         "--goal",
+         help="Override goal for demo episode",
+     )
+
+     args = parser.parse_args()
+
+     run_experiment(
+         demo_capture_path=args.demo_capture,
+         test_task=args.test_task,
+         test_screenshot=args.test_screenshot,
+         provider=args.provider,
+         output_dir=args.output,
+         include_demo_images=args.include_demo_images,
+         goal=args.goal,
+     )
+
+
+ if __name__ == "__main__":
+     main()
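[Editor's note: a typical end-to-end run, programmatic or via the CLI wired up above; all paths and the task text are hypothetical, and an ANTHROPIC_API_KEY must be set for the default provider:

from openadapt_ml.experiments.demo_prompt.run_experiment import run_experiment

# Equivalent CLI invocation (only --demo-capture, --test-task and
# --test-screenshot are required):
#   python -m openadapt_ml.experiments.demo_prompt.run_experiment \
#       --demo-capture ./captures/demo_session \
#       --test-task "Create a new folder named Reports" \
#       --test-screenshot ./screens/desktop.png \
#       --output ./results --include-demo-images
out = run_experiment(
    demo_capture_path="./captures/demo_session",   # hypothetical path
    test_task="Create a new folder named Reports",  # hypothetical task
    test_screenshot="./screens/desktop.png",        # hypothetical path
    output_dir="./results",
    include_demo_images=True,
)
print(out["results"]["with_demo"].action_parsed)
]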
openadapt_ml/experiments/waa_demo/__init__.py (new file)
@@ -0,0 +1,10 @@
+ """WAA demo-conditioned experiment module.
+
+ This module contains demonstrations and task definitions for the
+ Windows Agent Arena demo-conditioned prompting experiment.
+ """
+
+ from openadapt_ml.experiments.waa_demo.demos import DEMOS, get_demo
+ from openadapt_ml.experiments.waa_demo.tasks import TASKS, get_task
+
+ __all__ = ["DEMOS", "TASKS", "get_demo", "get_task"]
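[Editor's note: these re-exports make the demo and task registries importable at package level. A minimal sketch; demos.py and tasks.py are not shown in this diff, so the lookup-key type of get_demo()/get_task() is an assumption:

from openadapt_ml.experiments.waa_demo import DEMOS, TASKS, get_demo, get_task

# DEMOS and TASKS are module-level registries; what key get_demo() and
# get_task() expect (e.g. a task id string) is not visible in this hunk.
print(type(DEMOS), type(TASKS))
]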