openadapt-ml 0.1.0__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112)
  1. openadapt_ml/baselines/__init__.py +121 -0
  2. openadapt_ml/baselines/adapter.py +185 -0
  3. openadapt_ml/baselines/cli.py +314 -0
  4. openadapt_ml/baselines/config.py +448 -0
  5. openadapt_ml/baselines/parser.py +922 -0
  6. openadapt_ml/baselines/prompts.py +787 -0
  7. openadapt_ml/benchmarks/__init__.py +13 -107
  8. openadapt_ml/benchmarks/agent.py +297 -374
  9. openadapt_ml/benchmarks/azure.py +62 -24
  10. openadapt_ml/benchmarks/azure_ops_tracker.py +521 -0
  11. openadapt_ml/benchmarks/cli.py +1874 -751
  12. openadapt_ml/benchmarks/trace_export.py +631 -0
  13. openadapt_ml/benchmarks/viewer.py +1236 -0
  14. openadapt_ml/benchmarks/vm_monitor.py +1111 -0
  15. openadapt_ml/benchmarks/waa_deploy/Dockerfile +216 -0
  16. openadapt_ml/benchmarks/waa_deploy/__init__.py +10 -0
  17. openadapt_ml/benchmarks/waa_deploy/api_agent.py +540 -0
  18. openadapt_ml/benchmarks/waa_deploy/start_waa_server.bat +53 -0
  19. openadapt_ml/cloud/azure_inference.py +3 -5
  20. openadapt_ml/cloud/lambda_labs.py +722 -307
  21. openadapt_ml/cloud/local.py +3194 -89
  22. openadapt_ml/cloud/ssh_tunnel.py +595 -0
  23. openadapt_ml/datasets/next_action.py +125 -96
  24. openadapt_ml/evals/grounding.py +32 -9
  25. openadapt_ml/evals/plot_eval_metrics.py +15 -13
  26. openadapt_ml/evals/trajectory_matching.py +120 -57
  27. openadapt_ml/experiments/demo_prompt/__init__.py +19 -0
  28. openadapt_ml/experiments/demo_prompt/format_demo.py +236 -0
  29. openadapt_ml/experiments/demo_prompt/results/experiment_20251231_002125.json +83 -0
  30. openadapt_ml/experiments/demo_prompt/results/experiment_n30_20251231_165958.json +1100 -0
  31. openadapt_ml/experiments/demo_prompt/results/multistep_20251231_025051.json +182 -0
  32. openadapt_ml/experiments/demo_prompt/run_experiment.py +541 -0
  33. openadapt_ml/experiments/representation_shootout/__init__.py +70 -0
  34. openadapt_ml/experiments/representation_shootout/conditions.py +708 -0
  35. openadapt_ml/experiments/representation_shootout/config.py +390 -0
  36. openadapt_ml/experiments/representation_shootout/evaluator.py +659 -0
  37. openadapt_ml/experiments/representation_shootout/runner.py +687 -0
  38. openadapt_ml/experiments/waa_demo/__init__.py +10 -0
  39. openadapt_ml/experiments/waa_demo/demos.py +357 -0
  40. openadapt_ml/experiments/waa_demo/runner.py +732 -0
  41. openadapt_ml/experiments/waa_demo/tasks.py +151 -0
  42. openadapt_ml/export/__init__.py +9 -0
  43. openadapt_ml/export/__main__.py +6 -0
  44. openadapt_ml/export/cli.py +89 -0
  45. openadapt_ml/export/parquet.py +277 -0
  46. openadapt_ml/grounding/detector.py +18 -14
  47. openadapt_ml/ingest/__init__.py +11 -10
  48. openadapt_ml/ingest/capture.py +97 -86
  49. openadapt_ml/ingest/loader.py +120 -69
  50. openadapt_ml/ingest/synthetic.py +344 -193
  51. openadapt_ml/models/api_adapter.py +14 -4
  52. openadapt_ml/models/base_adapter.py +10 -2
  53. openadapt_ml/models/providers/__init__.py +288 -0
  54. openadapt_ml/models/providers/anthropic.py +266 -0
  55. openadapt_ml/models/providers/base.py +299 -0
  56. openadapt_ml/models/providers/google.py +376 -0
  57. openadapt_ml/models/providers/openai.py +342 -0
  58. openadapt_ml/models/qwen_vl.py +46 -19
  59. openadapt_ml/perception/__init__.py +35 -0
  60. openadapt_ml/perception/integration.py +399 -0
  61. openadapt_ml/retrieval/README.md +226 -0
  62. openadapt_ml/retrieval/USAGE.md +391 -0
  63. openadapt_ml/retrieval/__init__.py +91 -0
  64. openadapt_ml/retrieval/demo_retriever.py +843 -0
  65. openadapt_ml/retrieval/embeddings.py +630 -0
  66. openadapt_ml/retrieval/index.py +194 -0
  67. openadapt_ml/retrieval/retriever.py +162 -0
  68. openadapt_ml/runtime/__init__.py +50 -0
  69. openadapt_ml/runtime/policy.py +27 -14
  70. openadapt_ml/runtime/safety_gate.py +471 -0
  71. openadapt_ml/schema/__init__.py +113 -0
  72. openadapt_ml/schema/converters.py +588 -0
  73. openadapt_ml/schema/episode.py +470 -0
  74. openadapt_ml/scripts/capture_screenshots.py +530 -0
  75. openadapt_ml/scripts/compare.py +102 -61
  76. openadapt_ml/scripts/demo_policy.py +4 -1
  77. openadapt_ml/scripts/eval_policy.py +19 -14
  78. openadapt_ml/scripts/make_gif.py +1 -1
  79. openadapt_ml/scripts/prepare_synthetic.py +16 -17
  80. openadapt_ml/scripts/train.py +98 -75
  81. openadapt_ml/segmentation/README.md +920 -0
  82. openadapt_ml/segmentation/__init__.py +97 -0
  83. openadapt_ml/segmentation/adapters/__init__.py +5 -0
  84. openadapt_ml/segmentation/adapters/capture_adapter.py +420 -0
  85. openadapt_ml/segmentation/annotator.py +610 -0
  86. openadapt_ml/segmentation/cache.py +290 -0
  87. openadapt_ml/segmentation/cli.py +674 -0
  88. openadapt_ml/segmentation/deduplicator.py +656 -0
  89. openadapt_ml/segmentation/frame_describer.py +788 -0
  90. openadapt_ml/segmentation/pipeline.py +340 -0
  91. openadapt_ml/segmentation/schemas.py +622 -0
  92. openadapt_ml/segmentation/segment_extractor.py +634 -0
  93. openadapt_ml/training/azure_ops_viewer.py +1097 -0
  94. openadapt_ml/training/benchmark_viewer.py +3255 -19
  95. openadapt_ml/training/shared_ui.py +7 -7
  96. openadapt_ml/training/stub_provider.py +57 -35
  97. openadapt_ml/training/trainer.py +255 -441
  98. openadapt_ml/training/trl_trainer.py +403 -0
  99. openadapt_ml/training/viewer.py +323 -108
  100. openadapt_ml/training/viewer_components.py +180 -0
  101. {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.1.dist-info}/METADATA +312 -69
  102. openadapt_ml-0.2.1.dist-info/RECORD +116 -0
  103. openadapt_ml/benchmarks/base.py +0 -366
  104. openadapt_ml/benchmarks/data_collection.py +0 -432
  105. openadapt_ml/benchmarks/runner.py +0 -381
  106. openadapt_ml/benchmarks/waa.py +0 -704
  107. openadapt_ml/schemas/__init__.py +0 -53
  108. openadapt_ml/schemas/sessions.py +0 -122
  109. openadapt_ml/schemas/validation.py +0 -252
  110. openadapt_ml-0.1.0.dist-info/RECORD +0 -55
  111. {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.1.dist-info}/WHEEL +0 -0
  112. {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,121 @@
1
+ """Unified baseline adapters for VLM comparison.
2
+
3
+ This module provides tools for comparing different VLM providers
4
+ (Claude, GPT, Gemini) across multiple evaluation tracks:
5
+
6
+ - Track A: Direct coordinate prediction
7
+ - Track B: ReAct-style reasoning with coordinates
8
+ - Track C: Set-of-Mark element selection
9
+
10
+ Based on SOTA patterns from:
11
+ - Claude Computer Use (Anthropic)
12
+ - Microsoft UFO/UFO2
13
+ - OSWorld benchmark
14
+ - Agent-S/Agent-S2 (Simular AI)
15
+
16
+ Usage:
17
+ from openadapt_ml.baselines import UnifiedBaselineAdapter, BaselineConfig, TrackConfig
18
+
19
+ # Quick start with model alias
20
+ adapter = UnifiedBaselineAdapter.from_alias("claude-opus-4.5")
21
+ action = adapter.predict(screenshot, "Click the submit button")
22
+
23
+ # With explicit configuration
24
+ config = BaselineConfig(
25
+ provider="anthropic",
26
+ model="claude-opus-4-5-20251101",
27
+ track=TrackConfig.track_c(),
28
+ )
29
+ adapter = UnifiedBaselineAdapter(config)
30
+
31
+ # OSWorld-compatible configuration
32
+ config = BaselineConfig(
33
+ provider="openai",
34
+ model="gpt-5.2",
35
+ track=TrackConfig.osworld_compatible(),
36
+ )
37
+
38
+ # Parse responses directly
39
+ from openadapt_ml.baselines import UnifiedResponseParser, ElementRegistry
40
+
41
+ parser = UnifiedResponseParser()
42
+ action = parser.parse('{"action": "CLICK", "x": 0.5, "y": 0.3}')
43
+
44
+ # With element ID to coordinate conversion
45
+ registry = ElementRegistry.from_a11y_tree(tree)
46
+ parser = UnifiedResponseParser(element_registry=registry)
47
+ action = parser.parse_and_resolve('{"action": "CLICK", "element_id": 17}')
48
+ """
49
+
50
+ from openadapt_ml.baselines.adapter import UnifiedBaselineAdapter
51
+ from openadapt_ml.baselines.config import (
52
+ # Enums
53
+ ActionOutputFormat,
54
+ CoordinateSystem,
55
+ TrackType,
56
+ # Config dataclasses
57
+ BaselineConfig,
58
+ ModelSpec,
59
+ ReActConfig,
60
+ ScreenConfig,
61
+ SoMConfig,
62
+ TrackConfig,
63
+ # Registry
64
+ MODELS,
65
+ # Helper functions
66
+ get_default_model,
67
+ get_model_spec,
68
+ )
69
+ from openadapt_ml.baselines.parser import (
70
+ ElementRegistry,
71
+ ParsedAction,
72
+ UIElement,
73
+ UnifiedResponseParser,
74
+ )
75
+ from openadapt_ml.baselines.prompts import (
76
+ # System prompts
77
+ FORMAT_PROMPTS,
78
+ SYSTEM_PROMPT_OSWORLD,
79
+ SYSTEM_PROMPT_TRACK_A,
80
+ SYSTEM_PROMPT_TRACK_B,
81
+ SYSTEM_PROMPT_TRACK_C,
82
+ SYSTEM_PROMPT_UFO,
83
+ SYSTEM_PROMPTS,
84
+ # Builder class
85
+ PromptBuilder,
86
+ )
87
+
88
+ __all__ = [
89
+ # Main adapter
90
+ "UnifiedBaselineAdapter",
91
+ # Configuration - Enums
92
+ "ActionOutputFormat",
93
+ "CoordinateSystem",
94
+ "TrackType",
95
+ # Configuration - Dataclasses
96
+ "BaselineConfig",
97
+ "ModelSpec",
98
+ "ReActConfig",
99
+ "ScreenConfig",
100
+ "SoMConfig",
101
+ "TrackConfig",
102
+ # Configuration - Registry
103
+ "MODELS",
104
+ # Configuration - Functions
105
+ "get_default_model",
106
+ "get_model_spec",
107
+ # Parsing
108
+ "ElementRegistry",
109
+ "ParsedAction",
110
+ "UIElement",
111
+ "UnifiedResponseParser",
112
+ # Prompts
113
+ "FORMAT_PROMPTS",
114
+ "PromptBuilder",
115
+ "SYSTEM_PROMPT_OSWORLD",
116
+ "SYSTEM_PROMPT_TRACK_A",
117
+ "SYSTEM_PROMPT_TRACK_B",
118
+ "SYSTEM_PROMPT_TRACK_C",
119
+ "SYSTEM_PROMPT_UFO",
120
+ "SYSTEM_PROMPTS",
121
+ ]
@@ -0,0 +1,185 @@
1
+ """Unified baseline adapter for comparing VLMs across tracks.
2
+
3
+ Main entry point for baseline evaluations.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import os
9
+ from typing import TYPE_CHECKING, Any
10
+
11
+ from openadapt_ml.baselines.config import BaselineConfig, TrackConfig
12
+ from openadapt_ml.baselines.parser import ParsedAction, UnifiedResponseParser
13
+ from openadapt_ml.baselines.prompts import PromptBuilder
14
+ from openadapt_ml.config import settings
15
+ from openadapt_ml.models.providers import get_provider
16
+
17
+ if TYPE_CHECKING:
18
+ from PIL import Image
19
+
20
+
21
class UnifiedBaselineAdapter:
    """Adapter for running baseline evaluations across VLM providers.

    Provides a unified interface for Claude, GPT, and Gemini models
    across multiple evaluation tracks (coordinates, ReAct, SoM).

    Example:
        adapter = UnifiedBaselineAdapter(BaselineConfig.from_alias("claude-opus-4.5"))
        result = adapter.predict(screenshot, "Click the submit button")
        print(result.x, result.y)

        # With Track C (SoM)
        adapter = UnifiedBaselineAdapter(BaselineConfig.from_alias(
            "gemini-3-pro",
            track=TrackConfig.track_c(),
        ))
        result = adapter.predict(screenshot, "Click the login button", a11y_tree=tree)
        print(result.element_id)
    """

    # provider name -> (attribute on `settings`, environment variable).
    # Kept as data (instead of an if/elif chain) so adding a provider is a
    # one-line change and key resolution stays uniform across providers.
    _API_KEY_SOURCES: dict[str, tuple[str, str]] = {
        "anthropic": ("anthropic_api_key", "ANTHROPIC_API_KEY"),
        "openai": ("openai_api_key", "OPENAI_API_KEY"),
        "google": ("google_api_key", "GOOGLE_API_KEY"),
    }

    def __init__(self, config: BaselineConfig):
        """Initialize the baseline adapter.

        Args:
            config: Baseline configuration including model, track, etc.
        """
        self.config = config
        self._provider = get_provider(config.provider)
        self._client = None  # created lazily on first `client` access
        self._prompt_builder = PromptBuilder(config.track)
        self._parser = UnifiedResponseParser()

    @property
    def client(self) -> Any:
        """Lazy-load API client (created on first access, then cached)."""
        if self._client is None:
            api_key = self._resolve_api_key()
            self._client = self._provider.create_client(api_key)
        return self._client

    def _resolve_api_key(self) -> str:
        """Resolve API key from config, settings, or environment.

        Resolution order:
            1. ``config.api_key`` (explicit override).
            2. The provider's attribute on ``settings``.
            3. The provider's environment variable.

        Returns:
            The resolved API key.

        Raises:
            RuntimeError: If no key can be found for the configured provider.
        """
        if self.config.api_key:
            return self.config.api_key

        source = self._API_KEY_SOURCES.get(self.config.provider)
        key = None
        if source is not None:
            settings_attr, env_var = source
            # getattr with a default keeps this safe if `settings` ever
            # drops an attribute for a provider.
            key = getattr(settings, settings_attr, None) or os.getenv(env_var)

        if not key:
            raise RuntimeError(
                f"API key for {self.config.provider} not found. "
                "Set in .env file or environment variable."
            )
        return key

    def predict(
        self,
        screenshot: "Image",
        goal: str,
        a11y_tree: str | dict[str, Any] | None = None,
        history: list[dict[str, Any]] | None = None,
    ) -> ParsedAction:
        """Predict the next action given current state.

        Args:
            screenshot: Current screenshot as PIL Image.
            goal: Task goal/instruction.
            a11y_tree: Optional accessibility tree (string or dict).
            history: Optional list of previous actions.

        Returns:
            ParsedAction with predicted action.
        """
        # Build system prompt
        system_prompt = self._prompt_builder.get_system_prompt(self.config.demo)

        # Build user content
        content = self._prompt_builder.build_user_content(
            goal=goal,
            screenshot=screenshot,
            a11y_tree=a11y_tree,
            history=history,
            encode_image_fn=self._provider.encode_image,
        )

        # Call API
        response = self._provider.send_message(
            client=self.client,
            model=self.config.model,
            system=system_prompt,
            content=content,
            max_tokens=self.config.max_tokens,
            temperature=self.config.temperature,
        )

        if self.config.verbose:
            # Bounded preview; only append an ellipsis when the response was
            # actually truncated (previously "..." was always appended).
            preview = response[:200] + ("..." if len(response) > 200 else "")
            print(f"[{self.config.provider}] Response: {preview}")

        # Parse response
        return self._parser.parse(response)

    def predict_batch(
        self,
        samples: list[dict[str, Any]],
    ) -> list[ParsedAction]:
        """Predict actions for multiple samples.

        Note: Currently runs sequentially. Future optimization could
        use async/parallel calls for providers that support it.

        Args:
            samples: List of dicts with keys: screenshot, goal, a11y_tree, history.

        Returns:
            List of ParsedActions, in the same order as `samples`.
        """
        results = []
        for sample in samples:
            action = self.predict(
                screenshot=sample.get("screenshot"),
                goal=sample.get("goal", ""),
                a11y_tree=sample.get("a11y_tree"),
                history=sample.get("history"),
            )
            results.append(action)
        return results

    @classmethod
    def from_alias(
        cls,
        model_alias: str,
        track: TrackConfig | None = None,
        **kwargs: Any,
    ) -> "UnifiedBaselineAdapter":
        """Create adapter from model alias.

        Convenience constructor that resolves model aliases.

        Args:
            model_alias: Model alias (e.g., 'claude-opus-4.5', 'gpt-5.2').
            track: Track config (defaults to Track A).
            **kwargs: Additional config options.

        Returns:
            UnifiedBaselineAdapter instance.
        """
        config = BaselineConfig.from_alias(model_alias, track=track, **kwargs)
        return cls(config)

    def __repr__(self) -> str:
        return (
            f"UnifiedBaselineAdapter("
            f"provider={self.config.provider}, "
            f"model={self.config.model}, "
            f"track={self.config.track.track_type.value})"
        )
@@ -0,0 +1,314 @@
1
+ """CLI for baseline adapter operations.
2
+
3
+ Provides commands for comparing VLMs across tracks.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import json
9
+ import sys
10
+ from pathlib import Path
11
+
12
+ import click
13
+
14
+ from openadapt_ml.baselines.config import MODELS
15
+
16
+
17
@click.group()
def baselines():
    """Baseline adapter commands for VLM comparison."""
    # No shared setup: the group exists purely to attach subcommands.
21
+
22
+
23
@baselines.command()
@click.option(
    "--model",
    "-m",
    required=True,
    type=click.Choice(list(MODELS.keys())),
    help="Model alias to use",
)
@click.option(
    "--track",
    "-t",
    type=click.Choice(["A", "B", "C"]),
    default="A",
    help="Evaluation track (A=coords, B=ReAct, C=SoM)",
)
@click.option(
    "--image",
    "-i",
    type=click.Path(exists=True),
    required=True,
    help="Screenshot image path",
)
@click.option(
    "--goal",
    "-g",
    required=True,
    help="Task goal/instruction",
)
@click.option(
    "--output",
    "-o",
    type=click.Path(),
    help="Output JSON file path",
)
@click.option(
    "--verbose",
    "-v",
    is_flag=True,
    help="Enable verbose output",
)
def run(
    model: str,
    track: str,
    image: str,
    goal: str,
    output: str | None,
    verbose: bool,
):
    """Run a single baseline prediction.

    Example:
        uv run python -m openadapt_ml.baselines.cli run \\
            --model claude-opus-4.5 \\
            --track A \\
            --image screenshot.png \\
            --goal "Click the submit button"
    """
    # Deferred imports: keep `--help` fast and avoid importing PIL / the
    # adapter stack unless a prediction is actually run.
    from PIL import Image

    from openadapt_ml.baselines import UnifiedBaselineAdapter, TrackConfig

    # Select track config
    track_configs = {
        "A": TrackConfig.track_a(),
        "B": TrackConfig.track_b(),
        "C": TrackConfig.track_c(),
    }
    track_config = track_configs[track]

    click.echo(f"Model: {model}")
    click.echo(f"Track: {track} ({track_config.track_type.value})")
    click.echo(f"Image: {image}")
    click.echo(f"Goal: {goal}")
    click.echo()

    # Load image
    screenshot = Image.open(image)

    # Create adapter
    adapter = UnifiedBaselineAdapter.from_alias(
        model,
        track=track_config,
        verbose=verbose,
    )

    # Run prediction
    click.echo("Running prediction...")
    action = adapter.predict(screenshot, goal)

    # Display result
    click.echo()
    click.echo("=" * 50)
    click.echo("RESULT")
    click.echo("=" * 50)

    if action.is_valid:
        click.echo(f"Action: {action.action_type.upper()}")
        if action.x is not None and action.y is not None:
            click.echo(f"Coordinates: ({action.x:.4f}, {action.y:.4f})")
        if action.element_id is not None:
            click.echo(f"Element ID: {action.element_id}")
        if action.text is not None:
            click.echo(f"Text: {action.text}")
        if action.thought is not None:
            click.echo(f"Thought: {action.thought}")
    else:
        click.echo(f"Parse Error: {action.parse_error}")
        # Bounded preview of the raw response. Previously "..." was appended
        # unconditionally, producing "None..." when there was no response and
        # a false truncation marker for short responses.
        raw = action.raw_response
        if raw is None:
            preview = "None"
        else:
            preview = raw[:200] + ("..." if len(raw) > 200 else "")
        click.echo(f"Raw Response: {preview}")

    # Save output if requested
    if output:
        result = {
            "model": model,
            "track": track,
            "goal": goal,
            "action": action.to_dict(),
            "raw_response": action.raw_response,
            "parse_error": action.parse_error,
        }
        Path(output).write_text(json.dumps(result, indent=2))
        click.echo(f"\nSaved to: {output}")
146
+
147
+
148
@baselines.command()
@click.option(
    "--models",
    "-m",
    required=True,
    help="Comma-separated model aliases",
)
@click.option(
    "--track",
    "-t",
    type=click.Choice(["A", "B", "C"]),
    default="A",
    help="Evaluation track",
)
@click.option(
    "--image",
    "-i",
    type=click.Path(exists=True),
    required=True,
    help="Screenshot image path",
)
@click.option(
    "--goal",
    "-g",
    required=True,
    help="Task goal/instruction",
)
@click.option(
    "--output",
    "-o",
    type=click.Path(),
    help="Output JSON file path",
)
def compare(
    models: str,
    track: str,
    image: str,
    goal: str,
    output: str | None,
):
    """Compare multiple models on the same task.

    Example:
        uv run python -m openadapt_ml.baselines.cli compare \\
            --models claude-opus-4.5,gpt-5.2,gemini-3-pro \\
            --track C \\
            --image screenshot.png \\
            --goal "Click the login button"
    """
    # Deferred imports: keep `--help` fast and avoid importing PIL / the
    # adapter stack unless a comparison is actually run.
    from PIL import Image

    from openadapt_ml.baselines import UnifiedBaselineAdapter, TrackConfig

    model_list = [m.strip() for m in models.split(",")]

    # Validate all aliases up front so we fail before any API calls.
    for m in model_list:
        if m not in MODELS:
            click.echo(f"Error: Unknown model '{m}'", err=True)
            click.echo(f"Available: {', '.join(MODELS.keys())}", err=True)
            sys.exit(1)

    # Select track config
    track_configs = {
        "A": TrackConfig.track_a(),
        "B": TrackConfig.track_b(),
        "C": TrackConfig.track_c(),
    }
    track_config = track_configs[track]

    click.echo(f"Comparing {len(model_list)} models on Track {track}")
    click.echo(f"Image: {image}")
    click.echo(f"Goal: {goal}")
    click.echo()

    # Load image once; the same screenshot is reused for every model.
    screenshot = Image.open(image)

    results = []

    # Run each model; a failure in one model must not abort the comparison.
    for model in model_list:
        click.echo(f"Running {model}...")

        try:
            adapter = UnifiedBaselineAdapter.from_alias(model, track=track_config)
            action = adapter.predict(screenshot, goal)

            results.append(
                {
                    "model": model,
                    "success": action.is_valid,
                    "action": action.to_dict(),
                    "error": action.parse_error,
                }
            )

            status = "OK" if action.is_valid else "FAILED"
            click.echo(f"  {status}: {action.action_type}")

        except Exception as e:
            results.append(
                {
                    "model": model,
                    "success": False,
                    "action": None,
                    "error": str(e),
                }
            )
            click.echo(f"  ERROR: {e}")

    # Summary table
    click.echo()
    click.echo("=" * 60)
    click.echo("COMPARISON SUMMARY")
    click.echo("=" * 60)
    click.echo(f"{'Model':<25} {'Status':<10} {'Action':<25}")
    click.echo("-" * 60)

    for r in results:
        model = r["model"]
        status = "OK" if r["success"] else "FAILED"
        action = r["action"]
        if action:
            # Use the actual action type in the label. The original always
            # printed "CLICK(...)" for any action carrying coordinates, which
            # mislabeled non-click actions (e.g. a typed action with coords).
            atype = (action.get("type") or "unknown").upper()
            if action.get("x") is not None and action.get("y") is not None:
                action_str = f"{atype}({action['x']:.3f}, {action['y']:.3f})"
            elif action.get("element_id") is not None:
                action_str = f"{atype}([{action['element_id']}])"
            else:
                action_str = atype
        else:
            # "error" may be present but None; guard before slicing.
            action_str = (r.get("error") or "Unknown error")[:25]

        click.echo(f"{model:<25} {status:<10} {action_str:<25}")

    # Save output if requested
    if output:
        full_results = {
            "models": model_list,
            "track": track,
            "goal": goal,
            "results": results,
        }
        Path(output).write_text(json.dumps(full_results, indent=2))
        click.echo(f"\nSaved to: {output}")
293
+
294
+
295
@baselines.command()
def list_models():
    """List available models and their providers."""
    # Header, then one row per registered model; defaults are starred.
    click.echo("Available models:")
    click.echo()
    header = "{:<20} {:<12} {:<35} {}".format("Alias", "Provider", "Model ID", "Default")
    click.echo(header)
    click.echo("-" * 75)

    for alias, spec in MODELS.items():
        marker = "*" if spec.is_default else ""
        row = "{:<20} {:<12} {:<35} {}".format(alias, spec.provider, spec.model_id, marker)
        click.echo(row)
306
+
307
+
308
# Entry point for direct execution
def main():
    """Invoke the ``baselines`` click group (console-script entry point)."""
    baselines()


if __name__ == "__main__":
    main()