openadapt-ml 0.1.0__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the package versions as they appear in the public registry.
Files changed (112)
  1. openadapt_ml/baselines/__init__.py +121 -0
  2. openadapt_ml/baselines/adapter.py +185 -0
  3. openadapt_ml/baselines/cli.py +314 -0
  4. openadapt_ml/baselines/config.py +448 -0
  5. openadapt_ml/baselines/parser.py +922 -0
  6. openadapt_ml/baselines/prompts.py +787 -0
  7. openadapt_ml/benchmarks/__init__.py +13 -107
  8. openadapt_ml/benchmarks/agent.py +297 -374
  9. openadapt_ml/benchmarks/azure.py +62 -24
  10. openadapt_ml/benchmarks/azure_ops_tracker.py +521 -0
  11. openadapt_ml/benchmarks/cli.py +1874 -751
  12. openadapt_ml/benchmarks/trace_export.py +631 -0
  13. openadapt_ml/benchmarks/viewer.py +1236 -0
  14. openadapt_ml/benchmarks/vm_monitor.py +1111 -0
  15. openadapt_ml/benchmarks/waa_deploy/Dockerfile +216 -0
  16. openadapt_ml/benchmarks/waa_deploy/__init__.py +10 -0
  17. openadapt_ml/benchmarks/waa_deploy/api_agent.py +540 -0
  18. openadapt_ml/benchmarks/waa_deploy/start_waa_server.bat +53 -0
  19. openadapt_ml/cloud/azure_inference.py +3 -5
  20. openadapt_ml/cloud/lambda_labs.py +722 -307
  21. openadapt_ml/cloud/local.py +3194 -89
  22. openadapt_ml/cloud/ssh_tunnel.py +595 -0
  23. openadapt_ml/datasets/next_action.py +125 -96
  24. openadapt_ml/evals/grounding.py +32 -9
  25. openadapt_ml/evals/plot_eval_metrics.py +15 -13
  26. openadapt_ml/evals/trajectory_matching.py +120 -57
  27. openadapt_ml/experiments/demo_prompt/__init__.py +19 -0
  28. openadapt_ml/experiments/demo_prompt/format_demo.py +236 -0
  29. openadapt_ml/experiments/demo_prompt/results/experiment_20251231_002125.json +83 -0
  30. openadapt_ml/experiments/demo_prompt/results/experiment_n30_20251231_165958.json +1100 -0
  31. openadapt_ml/experiments/demo_prompt/results/multistep_20251231_025051.json +182 -0
  32. openadapt_ml/experiments/demo_prompt/run_experiment.py +541 -0
  33. openadapt_ml/experiments/representation_shootout/__init__.py +70 -0
  34. openadapt_ml/experiments/representation_shootout/conditions.py +708 -0
  35. openadapt_ml/experiments/representation_shootout/config.py +390 -0
  36. openadapt_ml/experiments/representation_shootout/evaluator.py +659 -0
  37. openadapt_ml/experiments/representation_shootout/runner.py +687 -0
  38. openadapt_ml/experiments/waa_demo/__init__.py +10 -0
  39. openadapt_ml/experiments/waa_demo/demos.py +357 -0
  40. openadapt_ml/experiments/waa_demo/runner.py +732 -0
  41. openadapt_ml/experiments/waa_demo/tasks.py +151 -0
  42. openadapt_ml/export/__init__.py +9 -0
  43. openadapt_ml/export/__main__.py +6 -0
  44. openadapt_ml/export/cli.py +89 -0
  45. openadapt_ml/export/parquet.py +277 -0
  46. openadapt_ml/grounding/detector.py +18 -14
  47. openadapt_ml/ingest/__init__.py +11 -10
  48. openadapt_ml/ingest/capture.py +97 -86
  49. openadapt_ml/ingest/loader.py +120 -69
  50. openadapt_ml/ingest/synthetic.py +344 -193
  51. openadapt_ml/models/api_adapter.py +14 -4
  52. openadapt_ml/models/base_adapter.py +10 -2
  53. openadapt_ml/models/providers/__init__.py +288 -0
  54. openadapt_ml/models/providers/anthropic.py +266 -0
  55. openadapt_ml/models/providers/base.py +299 -0
  56. openadapt_ml/models/providers/google.py +376 -0
  57. openadapt_ml/models/providers/openai.py +342 -0
  58. openadapt_ml/models/qwen_vl.py +46 -19
  59. openadapt_ml/perception/__init__.py +35 -0
  60. openadapt_ml/perception/integration.py +399 -0
  61. openadapt_ml/retrieval/README.md +226 -0
  62. openadapt_ml/retrieval/USAGE.md +391 -0
  63. openadapt_ml/retrieval/__init__.py +91 -0
  64. openadapt_ml/retrieval/demo_retriever.py +843 -0
  65. openadapt_ml/retrieval/embeddings.py +630 -0
  66. openadapt_ml/retrieval/index.py +194 -0
  67. openadapt_ml/retrieval/retriever.py +162 -0
  68. openadapt_ml/runtime/__init__.py +50 -0
  69. openadapt_ml/runtime/policy.py +27 -14
  70. openadapt_ml/runtime/safety_gate.py +471 -0
  71. openadapt_ml/schema/__init__.py +113 -0
  72. openadapt_ml/schema/converters.py +588 -0
  73. openadapt_ml/schema/episode.py +470 -0
  74. openadapt_ml/scripts/capture_screenshots.py +530 -0
  75. openadapt_ml/scripts/compare.py +102 -61
  76. openadapt_ml/scripts/demo_policy.py +4 -1
  77. openadapt_ml/scripts/eval_policy.py +19 -14
  78. openadapt_ml/scripts/make_gif.py +1 -1
  79. openadapt_ml/scripts/prepare_synthetic.py +16 -17
  80. openadapt_ml/scripts/train.py +98 -75
  81. openadapt_ml/segmentation/README.md +920 -0
  82. openadapt_ml/segmentation/__init__.py +97 -0
  83. openadapt_ml/segmentation/adapters/__init__.py +5 -0
  84. openadapt_ml/segmentation/adapters/capture_adapter.py +420 -0
  85. openadapt_ml/segmentation/annotator.py +610 -0
  86. openadapt_ml/segmentation/cache.py +290 -0
  87. openadapt_ml/segmentation/cli.py +674 -0
  88. openadapt_ml/segmentation/deduplicator.py +656 -0
  89. openadapt_ml/segmentation/frame_describer.py +788 -0
  90. openadapt_ml/segmentation/pipeline.py +340 -0
  91. openadapt_ml/segmentation/schemas.py +622 -0
  92. openadapt_ml/segmentation/segment_extractor.py +634 -0
  93. openadapt_ml/training/azure_ops_viewer.py +1097 -0
  94. openadapt_ml/training/benchmark_viewer.py +3255 -19
  95. openadapt_ml/training/shared_ui.py +7 -7
  96. openadapt_ml/training/stub_provider.py +57 -35
  97. openadapt_ml/training/trainer.py +255 -441
  98. openadapt_ml/training/trl_trainer.py +403 -0
  99. openadapt_ml/training/viewer.py +323 -108
  100. openadapt_ml/training/viewer_components.py +180 -0
  101. {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.1.dist-info}/METADATA +312 -69
  102. openadapt_ml-0.2.1.dist-info/RECORD +116 -0
  103. openadapt_ml/benchmarks/base.py +0 -366
  104. openadapt_ml/benchmarks/data_collection.py +0 -432
  105. openadapt_ml/benchmarks/runner.py +0 -381
  106. openadapt_ml/benchmarks/waa.py +0 -704
  107. openadapt_ml/schemas/__init__.py +0 -53
  108. openadapt_ml/schemas/sessions.py +0 -122
  109. openadapt_ml/schemas/validation.py +0 -252
  110. openadapt_ml-0.1.0.dist-info/RECORD +0 -55
  111. {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.1.dist-info}/WHEEL +0 -0
  112. {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.1.dist-info}/licenses/LICENSE +0 -0
openadapt_ml/experiments/demo_prompt/run_experiment.py (new file)
@@ -0,0 +1,541 @@
+ """Demo-conditioned prompt experiment runner.
+
+ Tests whether including a human demonstration improves VLM performance.
+ """
+
+ from __future__ import annotations
+
+ import argparse
+ import base64
+ import json
+ from dataclasses import dataclass, field
+ from datetime import datetime
+ from pathlib import Path
+ from typing import Any
+
+ from openadapt_ml.experiments.demo_prompt.format_demo import (
+     format_episode_verbose,
+     generate_length_matched_control,
+     get_demo_screenshot_paths,
+ )
+
+
+ # System prompt for GUI automation
+ SYSTEM_PROMPT = """You are a GUI automation agent. Given a screenshot and task instruction, determine the next action to take.
+
+ Available actions:
+ - CLICK(x, y) - Click at normalized coordinates (0.0-1.0)
+ - TYPE("text") - Type the given text
+ - KEY(key) - Press a key (e.g., Enter, Tab, Escape)
+ - KEY(modifier+key) - Press key combination (e.g., Cmd+c, Ctrl+v)
+ - SCROLL(direction) - Scroll up or down
+ - DONE() - Task is complete
+
+ Respond with exactly ONE action.
+ Think step by step, then output the action on a new line starting with "ACTION:"
+ """
+
+
+ @dataclass
+ class ExperimentResult:
+     """Result of a single experiment run."""
+
+     task: str
+     condition: str  # "zero_shot", "with_demo", "control"
+     response: str
+     action_parsed: str | None
+     success: bool | None  # None if not evaluated
+     error: str | None = None
+     timestamp: str = field(default_factory=lambda: datetime.now().isoformat())
+
+
+ @dataclass
+ class TaskPair:
+     """A demo-test task pair."""
+
+     demo_task: str
+     test_task: str
+     similarity: str  # "near", "medium", "far"
+     demo_capture_path: str | None = None
+     test_screenshot_path: str | None = None
+
+
+ class DemoPromptExperiment:
+     """Run demo-conditioned prompt experiments."""
+
+     def __init__(
+         self,
+         provider: str = "anthropic",
+         max_tokens: int = 512,
+         verbose: bool = True,
+     ):
+         """Initialize experiment.
+
+         Args:
+             provider: API provider ("anthropic" or "openai").
+             max_tokens: Maximum tokens for response.
+             verbose: Whether to print progress.
+         """
+         self.provider = provider
+         self.max_tokens = max_tokens
+         self.verbose = verbose
+         self._client = None
+
+     def _get_client(self) -> Any:
+         """Lazily initialize API client."""
+         if self._client is not None:
+             return self._client
+
+         if self.provider == "anthropic":
+             from anthropic import Anthropic
+             from openadapt_ml.config import settings
+             import os
+
+             key = settings.anthropic_api_key or os.getenv("ANTHROPIC_API_KEY")
+             if not key:
+                 raise RuntimeError("ANTHROPIC_API_KEY not set")
+             self._client = Anthropic(api_key=key)
+
+         elif self.provider == "openai":
+             from openai import OpenAI
+             from openadapt_ml.config import settings
+             import os
+
+             key = settings.openai_api_key or os.getenv("OPENAI_API_KEY")
+             if not key:
+                 raise RuntimeError("OPENAI_API_KEY not set")
+             self._client = OpenAI(api_key=key)
+
+         else:
+             raise ValueError(f"Unknown provider: {self.provider}")
+
+         return self._client
+
+     def _call_api(
+         self,
+         user_content: str,
+         image_paths: list[str] | None = None,
+     ) -> str:
+         """Call the API with text and optional images.
+
+         Args:
+             user_content: User message text.
+             image_paths: Optional list of image paths to include.
+
+         Returns:
+             Model response text.
+         """
+         client = self._get_client()
+
+         if self.provider == "anthropic":
+             content: list[dict[str, Any]] = []
+
+             # Add images first
+             if image_paths:
+                 for path in image_paths[:5]:  # Limit to 5 images
+                     if Path(path).exists():
+                         with open(path, "rb") as f:
+                             image_b64 = base64.b64encode(f.read()).decode("utf-8")
+                         content.append(
+                             {
+                                 "type": "image",
+                                 "source": {
+                                     "type": "base64",
+                                     "media_type": "image/png",
+                                     "data": image_b64,
+                                 },
+                             }
+                         )
+
+             # Add text
+             content.append({"type": "text", "text": user_content})
+
+             response = client.messages.create(
+                 model="claude-sonnet-4-5-20250929",
+                 max_tokens=self.max_tokens,
+                 system=SYSTEM_PROMPT,
+                 messages=[{"role": "user", "content": content}],
+             )
+
+             parts = getattr(response, "content", [])
+             texts = [
+                 getattr(p, "text", "")
+                 for p in parts
+                 if getattr(p, "type", "") == "text"
+             ]
+             return "\n".join([t for t in texts if t]).strip()
+
+         elif self.provider == "openai":
+             user_content_parts: list[dict[str, Any]] = []
+
+             # Add images first
+             if image_paths:
+                 for path in image_paths[:5]:
+                     if Path(path).exists():
+                         with open(path, "rb") as f:
+                             image_b64 = base64.b64encode(f.read()).decode("utf-8")
+                         user_content_parts.append(
+                             {
+                                 "type": "image_url",
+                                 "image_url": {
+                                     "url": f"data:image/png;base64,{image_b64}"
+                                 },
+                             }
+                         )
+
+             # Add text
+             user_content_parts.append({"type": "text", "text": user_content})
+
+             response = client.chat.completions.create(
+                 model="gpt-4o",
+                 messages=[
+                     {"role": "system", "content": SYSTEM_PROMPT},
+                     {"role": "user", "content": user_content_parts},
+                 ],
+                 max_tokens=self.max_tokens,
+             )
+
+             return response.choices[0].message.content or ""
+
+         raise ValueError(f"Unknown provider: {self.provider}")
+
+     def _parse_action(self, response: str) -> str | None:
+         """Extract action from response.
+
+         Args:
+             response: Model response text.
+
+         Returns:
+             Extracted action string or None.
+         """
+         import re
+
+         # Look for ACTION: prefix
+         match = re.search(r"ACTION:\s*(.+)", response, re.IGNORECASE)
+         if match:
+             return match.group(1).strip()
+
+         # Look for action patterns
+         patterns = [
+             r"(CLICK\s*\([^)]+\))",
+             r"(TYPE\s*\([^)]+\))",
+             r"(KEY\s*\([^)]+\))",
+             r"(SCROLL\s*\([^)]+\))",
+             r"(DONE\s*\(\s*\))",
+         ]
+         for pattern in patterns:
+             match = re.search(pattern, response, re.IGNORECASE)
+             if match:
+                 return match.group(1).strip()
+
+         return None
+
+     def run_zero_shot(
+         self,
+         task: str,
+         screenshot_path: str,
+     ) -> ExperimentResult:
+         """Run zero-shot (no demo) condition.
+
+         Args:
+             task: Task instruction.
+             screenshot_path: Path to current screenshot.
+
+         Returns:
+             ExperimentResult.
+         """
+         if self.verbose:
+             print(f" Running zero-shot: {task[:50]}...")
+
+         prompt = f"Goal: {task}\n\nWhat is the next action?"
+
+         try:
+             response = self._call_api(prompt, [screenshot_path])
+             action = self._parse_action(response)
+             return ExperimentResult(
+                 task=task,
+                 condition="zero_shot",
+                 response=response,
+                 action_parsed=action,
+                 success=None,  # Manual evaluation needed
+             )
+         except Exception as e:
+             return ExperimentResult(
+                 task=task,
+                 condition="zero_shot",
+                 response="",
+                 action_parsed=None,
+                 success=False,
+                 error=str(e),
+             )
+
+     def run_with_demo(
+         self,
+         task: str,
+         screenshot_path: str,
+         demo_text: str,
+         demo_screenshots: list[str] | None = None,
+     ) -> ExperimentResult:
+         """Run with-demo condition.
+
+         Args:
+             task: Task instruction.
+             screenshot_path: Path to current screenshot.
+             demo_text: Formatted demo text.
+             demo_screenshots: Optional demo screenshot paths.
+
+         Returns:
+             ExperimentResult.
+         """
+         if self.verbose:
+             print(f" Running with-demo: {task[:50]}...")
+
+         prompt = f"{demo_text}\n\nNOW PERFORM THIS TASK:\nGoal: {task}\n\nWhat is the next action?"
+
+         # Combine demo screenshots with current screenshot
+         all_images = (demo_screenshots or []) + [screenshot_path]
+
+         try:
+             response = self._call_api(prompt, all_images)
+             action = self._parse_action(response)
+             return ExperimentResult(
+                 task=task,
+                 condition="with_demo",
+                 response=response,
+                 action_parsed=action,
+                 success=None,
+             )
+         except Exception as e:
+             return ExperimentResult(
+                 task=task,
+                 condition="with_demo",
+                 response="",
+                 action_parsed=None,
+                 success=False,
+                 error=str(e),
+             )
+
+     def run_control(
+         self,
+         task: str,
+         screenshot_path: str,
+         control_text: str,
+     ) -> ExperimentResult:
+         """Run length-matched control condition.
+
+         Args:
+             task: Task instruction.
+             screenshot_path: Path to current screenshot.
+             control_text: Length-matched control text.
+
+         Returns:
+             ExperimentResult.
+         """
+         if self.verbose:
+             print(f" Running control: {task[:50]}...")
+
+         prompt = f"{control_text}\n\nGoal: {task}\n\nWhat is the next action?"
+
+         try:
+             response = self._call_api(prompt, [screenshot_path])
+             action = self._parse_action(response)
+             return ExperimentResult(
+                 task=task,
+                 condition="control",
+                 response=response,
+                 action_parsed=action,
+                 success=None,
+             )
+         except Exception as e:
+             return ExperimentResult(
+                 task=task,
+                 condition="control",
+                 response="",
+                 action_parsed=None,
+                 success=False,
+                 error=str(e),
+             )
+
+     def run_task_pair(
+         self,
+         demo_episode: Any,  # Episode
+         test_task: str,
+         test_screenshot: str,
+         include_demo_images: bool = False,
+     ) -> dict[str, ExperimentResult]:
+         """Run all conditions for a task pair.
+
+         Args:
+             demo_episode: Episode containing the demonstration.
+             test_task: Test task instruction.
+             test_screenshot: Path to test screenshot.
+             include_demo_images: Whether to include demo screenshots.
+
+         Returns:
+             Dict mapping condition name to result.
+         """
+         # Format demo
+         demo_text = format_episode_verbose(demo_episode, max_steps=10)
+
+         # Get demo screenshots if requested
+         demo_screenshots = None
+         if include_demo_images:
+             demo_screenshots = get_demo_screenshot_paths(demo_episode, max_steps=5)
+
+         # Generate control
+         control_text = generate_length_matched_control(demo_text)
+
+         results = {}
+
+         # Run all conditions
+         results["zero_shot"] = self.run_zero_shot(test_task, test_screenshot)
+         results["with_demo"] = self.run_with_demo(
+             test_task, test_screenshot, demo_text, demo_screenshots
+         )
+         results["control"] = self.run_control(test_task, test_screenshot, control_text)
+
+         return results
+
+
+ def run_experiment(
+     demo_capture_path: str,
+     test_task: str,
+     test_screenshot: str,
+     provider: str = "anthropic",
+     output_dir: str | None = None,
+     include_demo_images: bool = False,
+     goal: str | None = None,
+ ) -> dict[str, Any]:
+     """Run the full experiment.
+
+     Args:
+         demo_capture_path: Path to demo capture directory.
+         test_task: Test task instruction.
+         test_screenshot: Path to test screenshot.
+         provider: API provider.
+         output_dir: Optional output directory for results.
+         include_demo_images: Whether to include demo screenshots.
+         goal: Optional goal for demo episode (overrides capture's).
+
+     Returns:
+         Dict with results.
+     """
+     from openadapt_ml.ingest.capture import capture_to_episode
+
+     print(f"Loading demo from: {demo_capture_path}")
+     episode = capture_to_episode(demo_capture_path, goal=goal)
+     print(f" Loaded {len(episode.steps)} steps, goal: {episode.goal}")
+
+     print(f"\nTest task: {test_task}")
+     print(f"Test screenshot: {test_screenshot}")
+
+     experiment = DemoPromptExperiment(provider=provider)
+     results = experiment.run_task_pair(
+         demo_episode=episode,
+         test_task=test_task,
+         test_screenshot=test_screenshot,
+         include_demo_images=include_demo_images,
+     )
+
+     # Print results
+     print("\n" + "=" * 60)
+     print("RESULTS")
+     print("=" * 60)
+
+     for condition, result in results.items():
+         print(f"\n{condition.upper()}:")
+         print(f" Action: {result.action_parsed}")
+         if result.error:
+             print(f" Error: {result.error}")
+         print(f" Response preview: {result.response[:200]}...")
+
+     # Save results if output dir specified
+     if output_dir:
+         output_path = Path(output_dir)
+         output_path.mkdir(parents=True, exist_ok=True)
+
+         results_file = (
+             output_path / f"results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
+         )
+         with open(results_file, "w") as f:
+             json.dump(
+                 {
+                     "demo_capture": demo_capture_path,
+                     "test_task": test_task,
+                     "test_screenshot": test_screenshot,
+                     "provider": provider,
+                     "results": {
+                         k: {
+                             "task": v.task,
+                             "condition": v.condition,
+                             "action_parsed": v.action_parsed,
+                             "response": v.response,
+                             "error": v.error,
+                             "timestamp": v.timestamp,
+                         }
+                         for k, v in results.items()
+                     },
+                 },
+                 f,
+                 indent=2,
+             )
+         print(f"\nResults saved to: {results_file}")
+
+     return {"results": results, "episode": episode}
+
+
+ def main():
+     """CLI entry point."""
+     parser = argparse.ArgumentParser(
+         description="Run demo-conditioned prompt experiment"
+     )
+     parser.add_argument(
+         "--demo-capture",
+         required=True,
+         help="Path to demo capture directory",
+     )
+     parser.add_argument(
+         "--test-task",
+         required=True,
+         help="Test task instruction",
+     )
+     parser.add_argument(
+         "--test-screenshot",
+         required=True,
+         help="Path to test screenshot",
+     )
+     parser.add_argument(
+         "--provider",
+         default="anthropic",
+         choices=["anthropic", "openai"],
+         help="API provider (default: anthropic)",
+     )
+     parser.add_argument(
+         "--output",
+         help="Output directory for results",
+     )
+     parser.add_argument(
+         "--include-demo-images",
+         action="store_true",
+         help="Include demo screenshots in prompt",
+     )
+     parser.add_argument(
+         "--goal",
+         help="Override goal for demo episode",
+     )
+
+     args = parser.parse_args()
+
+     run_experiment(
+         demo_capture_path=args.demo_capture,
+         test_task=args.test_task,
+         test_screenshot=args.test_screenshot,
+         provider=args.provider,
+         output_dir=args.output,
+         include_demo_images=args.include_demo_images,
+         goal=args.goal,
+     )
+
+
+ if __name__ == "__main__":
+     main()
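For orientation, a minimal sketch of driving the new runner from Python instead of the CLI; the capture directory, task text, and screenshot path below are hypothetical placeholders, and the call simply mirrors the run_experiment signature added above.

# Requires ANTHROPIC_API_KEY (or OPENAI_API_KEY with provider="openai") to be set,
# as enforced by DemoPromptExperiment._get_client above.
from openadapt_ml.experiments.demo_prompt.run_experiment import run_experiment

# Placeholder paths; point these at a real capture directory and screenshot.
outcome = run_experiment(
    demo_capture_path="captures/demo_example",
    test_task="Open the settings dialog",
    test_screenshot="screenshots/current_state.png",
    provider="anthropic",
    output_dir="results",
    include_demo_images=True,
)
# outcome["results"] maps "zero_shot", "with_demo", and "control" to ExperimentResult objects.

The same run is available from the shell via the argparse entry point (python -m openadapt_ml.experiments.demo_prompt.run_experiment with the --demo-capture, --test-task, and --test-screenshot flags defined in main above).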
openadapt_ml/experiments/representation_shootout/__init__.py (new file)
@@ -0,0 +1,70 @@
+ """Representation Shootout Experiment.
+
+ Compares three approaches for GUI action prediction under distribution drift:
+
+ - Condition A: Raw Coordinates - Direct coordinate regression
+ - Condition B: Coordinates + Visual Cues - Enhanced with markers and zoom
+ - Condition C: Marks (Element IDs) - Element classification using SoM
+
+ Usage:
+     # Run full experiment
+     python -m openadapt_ml.experiments.representation_shootout.runner run
+
+     # Run specific condition
+     python -m openadapt_ml.experiments.representation_shootout.runner run --condition marks
+
+     # Evaluate under specific drift
+     python -m openadapt_ml.experiments.representation_shootout.runner eval --drift resolution
+
+ See docs/experiments/representation_shootout_design.md for full documentation.
+ """
+
+ from openadapt_ml.experiments.representation_shootout.config import (
+     ConditionConfig,
+     ConditionName,
+     DriftConfig,
+     DriftType,
+     ExperimentConfig,
+     MetricName,
+ )
+ from openadapt_ml.experiments.representation_shootout.conditions import (
+     ConditionBase,
+     CoordsCuesCondition,
+     MarksCondition,
+     RawCoordsCondition,
+     create_condition,
+ )
+ from openadapt_ml.experiments.representation_shootout.evaluator import (
+     DriftEvaluator,
+     EvaluationResult,
+     compute_metrics,
+     make_recommendation,
+ )
+ from openadapt_ml.experiments.representation_shootout.runner import (
+     ExperimentRunner,
+     run_experiment,
+ )
+
+ __all__ = [
+     # Config
+     "ExperimentConfig",
+     "ConditionConfig",
+     "ConditionName",
+     "DriftConfig",
+     "DriftType",
+     "MetricName",
+     # Conditions
+     "ConditionBase",
+     "RawCoordsCondition",
+     "CoordsCuesCondition",
+     "MarksCondition",
+     "create_condition",
+     # Evaluator
+     "DriftEvaluator",
+     "EvaluationResult",
+     "compute_metrics",
+     "make_recommendation",
+     # Runner
+     "ExperimentRunner",
+     "run_experiment",
+ ]