openadapt-ml 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openadapt_ml/baselines/__init__.py +121 -0
- openadapt_ml/baselines/adapter.py +185 -0
- openadapt_ml/baselines/cli.py +314 -0
- openadapt_ml/baselines/config.py +448 -0
- openadapt_ml/baselines/parser.py +922 -0
- openadapt_ml/baselines/prompts.py +787 -0
- openadapt_ml/benchmarks/__init__.py +13 -115
- openadapt_ml/benchmarks/agent.py +265 -421
- openadapt_ml/benchmarks/azure.py +28 -19
- openadapt_ml/benchmarks/azure_ops_tracker.py +521 -0
- openadapt_ml/benchmarks/cli.py +1722 -4847
- openadapt_ml/benchmarks/trace_export.py +631 -0
- openadapt_ml/benchmarks/viewer.py +22 -5
- openadapt_ml/benchmarks/vm_monitor.py +530 -29
- openadapt_ml/benchmarks/waa_deploy/Dockerfile +47 -53
- openadapt_ml/benchmarks/waa_deploy/api_agent.py +21 -20
- openadapt_ml/cloud/azure_inference.py +3 -5
- openadapt_ml/cloud/lambda_labs.py +722 -307
- openadapt_ml/cloud/local.py +2038 -487
- openadapt_ml/cloud/ssh_tunnel.py +68 -26
- openadapt_ml/datasets/next_action.py +40 -30
- openadapt_ml/evals/grounding.py +8 -3
- openadapt_ml/evals/plot_eval_metrics.py +15 -13
- openadapt_ml/evals/trajectory_matching.py +41 -26
- openadapt_ml/experiments/demo_prompt/format_demo.py +16 -6
- openadapt_ml/experiments/demo_prompt/run_experiment.py +26 -16
- openadapt_ml/experiments/representation_shootout/__init__.py +70 -0
- openadapt_ml/experiments/representation_shootout/conditions.py +708 -0
- openadapt_ml/experiments/representation_shootout/config.py +390 -0
- openadapt_ml/experiments/representation_shootout/evaluator.py +659 -0
- openadapt_ml/experiments/representation_shootout/runner.py +687 -0
- openadapt_ml/experiments/waa_demo/runner.py +29 -14
- openadapt_ml/export/parquet.py +36 -24
- openadapt_ml/grounding/detector.py +18 -14
- openadapt_ml/ingest/__init__.py +8 -6
- openadapt_ml/ingest/capture.py +25 -22
- openadapt_ml/ingest/loader.py +7 -4
- openadapt_ml/ingest/synthetic.py +189 -100
- openadapt_ml/models/api_adapter.py +14 -4
- openadapt_ml/models/base_adapter.py +10 -2
- openadapt_ml/models/providers/__init__.py +288 -0
- openadapt_ml/models/providers/anthropic.py +266 -0
- openadapt_ml/models/providers/base.py +299 -0
- openadapt_ml/models/providers/google.py +376 -0
- openadapt_ml/models/providers/openai.py +342 -0
- openadapt_ml/models/qwen_vl.py +46 -19
- openadapt_ml/perception/__init__.py +35 -0
- openadapt_ml/perception/integration.py +399 -0
- openadapt_ml/retrieval/demo_retriever.py +50 -24
- openadapt_ml/retrieval/embeddings.py +9 -8
- openadapt_ml/retrieval/retriever.py +3 -1
- openadapt_ml/runtime/__init__.py +50 -0
- openadapt_ml/runtime/policy.py +18 -5
- openadapt_ml/runtime/safety_gate.py +471 -0
- openadapt_ml/schema/__init__.py +9 -0
- openadapt_ml/schema/converters.py +74 -27
- openadapt_ml/schema/episode.py +31 -18
- openadapt_ml/scripts/capture_screenshots.py +530 -0
- openadapt_ml/scripts/compare.py +85 -54
- openadapt_ml/scripts/demo_policy.py +4 -1
- openadapt_ml/scripts/eval_policy.py +15 -9
- openadapt_ml/scripts/make_gif.py +1 -1
- openadapt_ml/scripts/prepare_synthetic.py +3 -1
- openadapt_ml/scripts/train.py +21 -9
- openadapt_ml/segmentation/README.md +920 -0
- openadapt_ml/segmentation/__init__.py +97 -0
- openadapt_ml/segmentation/adapters/__init__.py +5 -0
- openadapt_ml/segmentation/adapters/capture_adapter.py +420 -0
- openadapt_ml/segmentation/annotator.py +610 -0
- openadapt_ml/segmentation/cache.py +290 -0
- openadapt_ml/segmentation/cli.py +674 -0
- openadapt_ml/segmentation/deduplicator.py +656 -0
- openadapt_ml/segmentation/frame_describer.py +788 -0
- openadapt_ml/segmentation/pipeline.py +340 -0
- openadapt_ml/segmentation/schemas.py +622 -0
- openadapt_ml/segmentation/segment_extractor.py +634 -0
- openadapt_ml/training/azure_ops_viewer.py +1097 -0
- openadapt_ml/training/benchmark_viewer.py +52 -41
- openadapt_ml/training/shared_ui.py +7 -7
- openadapt_ml/training/stub_provider.py +57 -35
- openadapt_ml/training/trainer.py +143 -86
- openadapt_ml/training/trl_trainer.py +70 -21
- openadapt_ml/training/viewer.py +323 -108
- openadapt_ml/training/viewer_components.py +180 -0
- {openadapt_ml-0.2.0.dist-info → openadapt_ml-0.2.2.dist-info}/METADATA +215 -14
- openadapt_ml-0.2.2.dist-info/RECORD +116 -0
- openadapt_ml/benchmarks/base.py +0 -366
- openadapt_ml/benchmarks/data_collection.py +0 -432
- openadapt_ml/benchmarks/live_tracker.py +0 -180
- openadapt_ml/benchmarks/runner.py +0 -418
- openadapt_ml/benchmarks/waa.py +0 -761
- openadapt_ml/benchmarks/waa_live.py +0 -619
- openadapt_ml-0.2.0.dist-info/RECORD +0 -86
- {openadapt_ml-0.2.0.dist-info → openadapt_ml-0.2.2.dist-info}/WHEEL +0 -0
- {openadapt_ml-0.2.0.dist-info → openadapt_ml-0.2.2.dist-info}/licenses/LICENSE +0 -0
|
"""Unified baseline adapters for VLM comparison.

This module provides tools for comparing different VLM providers
(Claude, GPT, Gemini) across multiple evaluation tracks:

- Track A: Direct coordinate prediction
- Track B: ReAct-style reasoning with coordinates
- Track C: Set-of-Mark element selection

Based on SOTA patterns from:
- Claude Computer Use (Anthropic)
- Microsoft UFO/UFO2
- OSWorld benchmark
- Agent-S/Agent-S2 (Simular AI)

Usage:
    from openadapt_ml.baselines import UnifiedBaselineAdapter, BaselineConfig, TrackConfig

    # Quick start with model alias
    adapter = UnifiedBaselineAdapter.from_alias("claude-opus-4.5")
    action = adapter.predict(screenshot, "Click the submit button")

    # With explicit configuration
    config = BaselineConfig(
        provider="anthropic",
        model="claude-opus-4-5-20251101",
        track=TrackConfig.track_c(),
    )
    adapter = UnifiedBaselineAdapter(config)

    # OSWorld-compatible configuration
    config = BaselineConfig(
        provider="openai",
        model="gpt-5.2",
        track=TrackConfig.osworld_compatible(),
    )

    # Parse responses directly
    from openadapt_ml.baselines import UnifiedResponseParser, ElementRegistry

    parser = UnifiedResponseParser()
    action = parser.parse('{"action": "CLICK", "x": 0.5, "y": 0.3}')

    # With element ID to coordinate conversion
    registry = ElementRegistry.from_a11y_tree(tree)
    parser = UnifiedResponseParser(element_registry=registry)
    action = parser.parse_and_resolve('{"action": "CLICK", "element_id": 17}')
"""

from openadapt_ml.baselines.adapter import UnifiedBaselineAdapter
from openadapt_ml.baselines.config import (
    # Enums
    ActionOutputFormat,
    CoordinateSystem,
    TrackType,
    # Config dataclasses
    BaselineConfig,
    ModelSpec,
    ReActConfig,
    ScreenConfig,
    SoMConfig,
    TrackConfig,
    # Registry
    MODELS,
    # Helper functions
    get_default_model,
    get_model_spec,
)
from openadapt_ml.baselines.parser import (
    ElementRegistry,
    ParsedAction,
    UIElement,
    UnifiedResponseParser,
)
from openadapt_ml.baselines.prompts import (
    # System prompts
    FORMAT_PROMPTS,
    SYSTEM_PROMPT_OSWORLD,
    SYSTEM_PROMPT_TRACK_A,
    SYSTEM_PROMPT_TRACK_B,
    SYSTEM_PROMPT_TRACK_C,
    SYSTEM_PROMPT_UFO,
    SYSTEM_PROMPTS,
    # Builder class
    PromptBuilder,
)

# Public API: mirrors the imports above, grouped the same way so a reader
# can diff the two lists at a glance.
__all__ = [
    # Main adapter
    "UnifiedBaselineAdapter",
    # Configuration - Enums
    "ActionOutputFormat",
    "CoordinateSystem",
    "TrackType",
    # Configuration - Dataclasses
    "BaselineConfig",
    "ModelSpec",
    "ReActConfig",
    "ScreenConfig",
    "SoMConfig",
    "TrackConfig",
    # Configuration - Registry
    "MODELS",
    # Configuration - Functions
    "get_default_model",
    "get_model_spec",
    # Parsing
    "ElementRegistry",
    "ParsedAction",
    "UIElement",
    "UnifiedResponseParser",
    # Prompts
    "FORMAT_PROMPTS",
    "PromptBuilder",
    "SYSTEM_PROMPT_OSWORLD",
    "SYSTEM_PROMPT_TRACK_A",
    "SYSTEM_PROMPT_TRACK_B",
    "SYSTEM_PROMPT_TRACK_C",
    "SYSTEM_PROMPT_UFO",
    "SYSTEM_PROMPTS",
]
@@ -0,0 +1,185 @@
|
|
|
1
|
+
"""Unified baseline adapter for comparing VLMs across tracks.
|
|
2
|
+
|
|
3
|
+
Main entry point for baseline evaluations.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import os
|
|
9
|
+
from typing import TYPE_CHECKING, Any
|
|
10
|
+
|
|
11
|
+
from openadapt_ml.baselines.config import BaselineConfig, TrackConfig
|
|
12
|
+
from openadapt_ml.baselines.parser import ParsedAction, UnifiedResponseParser
|
|
13
|
+
from openadapt_ml.baselines.prompts import PromptBuilder
|
|
14
|
+
from openadapt_ml.config import settings
|
|
15
|
+
from openadapt_ml.models.providers import get_provider
|
|
16
|
+
|
|
17
|
+
if TYPE_CHECKING:
|
|
18
|
+
from PIL import Image
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class UnifiedBaselineAdapter:
|
|
22
|
+
"""Adapter for running baseline evaluations across VLM providers.
|
|
23
|
+
|
|
24
|
+
Provides a unified interface for Claude, GPT, and Gemini models
|
|
25
|
+
across multiple evaluation tracks (coordinates, ReAct, SoM).
|
|
26
|
+
|
|
27
|
+
Example:
|
|
28
|
+
adapter = UnifiedBaselineAdapter(BaselineConfig.from_alias("claude-opus-4.5"))
|
|
29
|
+
result = adapter.predict(screenshot, "Click the submit button")
|
|
30
|
+
print(result.x, result.y)
|
|
31
|
+
|
|
32
|
+
# With Track C (SoM)
|
|
33
|
+
adapter = UnifiedBaselineAdapter(BaselineConfig.from_alias(
|
|
34
|
+
"gemini-3-pro",
|
|
35
|
+
track=TrackConfig.track_c(),
|
|
36
|
+
))
|
|
37
|
+
result = adapter.predict(screenshot, "Click the login button", a11y_tree=tree)
|
|
38
|
+
print(result.element_id)
|
|
39
|
+
"""
|
|
40
|
+
|
|
41
|
+
def __init__(self, config: BaselineConfig):
|
|
42
|
+
"""Initialize the baseline adapter.
|
|
43
|
+
|
|
44
|
+
Args:
|
|
45
|
+
config: Baseline configuration including model, track, etc.
|
|
46
|
+
"""
|
|
47
|
+
self.config = config
|
|
48
|
+
self._provider = get_provider(config.provider)
|
|
49
|
+
self._client = None
|
|
50
|
+
self._prompt_builder = PromptBuilder(config.track)
|
|
51
|
+
self._parser = UnifiedResponseParser()
|
|
52
|
+
|
|
53
|
+
@property
|
|
54
|
+
def client(self) -> Any:
|
|
55
|
+
"""Lazy-load API client."""
|
|
56
|
+
if self._client is None:
|
|
57
|
+
api_key = self._resolve_api_key()
|
|
58
|
+
self._client = self._provider.create_client(api_key)
|
|
59
|
+
return self._client
|
|
60
|
+
|
|
61
|
+
def _resolve_api_key(self) -> str:
|
|
62
|
+
"""Resolve API key from config, settings, or environment."""
|
|
63
|
+
if self.config.api_key:
|
|
64
|
+
return self.config.api_key
|
|
65
|
+
|
|
66
|
+
# Try settings first
|
|
67
|
+
if self.config.provider == "anthropic":
|
|
68
|
+
key = settings.anthropic_api_key or os.getenv("ANTHROPIC_API_KEY")
|
|
69
|
+
elif self.config.provider == "openai":
|
|
70
|
+
key = settings.openai_api_key or os.getenv("OPENAI_API_KEY")
|
|
71
|
+
elif self.config.provider == "google":
|
|
72
|
+
key = settings.google_api_key or os.getenv("GOOGLE_API_KEY")
|
|
73
|
+
else:
|
|
74
|
+
key = None
|
|
75
|
+
|
|
76
|
+
if not key:
|
|
77
|
+
raise RuntimeError(
|
|
78
|
+
f"API key for {self.config.provider} not found. "
|
|
79
|
+
"Set in .env file or environment variable."
|
|
80
|
+
)
|
|
81
|
+
return key
|
|
82
|
+
|
|
83
|
+
def predict(
|
|
84
|
+
self,
|
|
85
|
+
screenshot: "Image",
|
|
86
|
+
goal: str,
|
|
87
|
+
a11y_tree: str | dict[str, Any] | None = None,
|
|
88
|
+
history: list[dict[str, Any]] | None = None,
|
|
89
|
+
) -> ParsedAction:
|
|
90
|
+
"""Predict the next action given current state.
|
|
91
|
+
|
|
92
|
+
Args:
|
|
93
|
+
screenshot: Current screenshot as PIL Image.
|
|
94
|
+
goal: Task goal/instruction.
|
|
95
|
+
a11y_tree: Optional accessibility tree (string or dict).
|
|
96
|
+
history: Optional list of previous actions.
|
|
97
|
+
|
|
98
|
+
Returns:
|
|
99
|
+
ParsedAction with predicted action.
|
|
100
|
+
"""
|
|
101
|
+
# Build system prompt
|
|
102
|
+
system_prompt = self._prompt_builder.get_system_prompt(self.config.demo)
|
|
103
|
+
|
|
104
|
+
# Build user content
|
|
105
|
+
content = self._prompt_builder.build_user_content(
|
|
106
|
+
goal=goal,
|
|
107
|
+
screenshot=screenshot,
|
|
108
|
+
a11y_tree=a11y_tree,
|
|
109
|
+
history=history,
|
|
110
|
+
encode_image_fn=self._provider.encode_image,
|
|
111
|
+
)
|
|
112
|
+
|
|
113
|
+
# Call API
|
|
114
|
+
response = self._provider.send_message(
|
|
115
|
+
client=self.client,
|
|
116
|
+
model=self.config.model,
|
|
117
|
+
system=system_prompt,
|
|
118
|
+
content=content,
|
|
119
|
+
max_tokens=self.config.max_tokens,
|
|
120
|
+
temperature=self.config.temperature,
|
|
121
|
+
)
|
|
122
|
+
|
|
123
|
+
if self.config.verbose:
|
|
124
|
+
print(f"[{self.config.provider}] Response: {response[:200]}...")
|
|
125
|
+
|
|
126
|
+
# Parse response
|
|
127
|
+
action = self._parser.parse(response)
|
|
128
|
+
|
|
129
|
+
return action
|
|
130
|
+
|
|
131
|
+
def predict_batch(
|
|
132
|
+
self,
|
|
133
|
+
samples: list[dict[str, Any]],
|
|
134
|
+
) -> list[ParsedAction]:
|
|
135
|
+
"""Predict actions for multiple samples.
|
|
136
|
+
|
|
137
|
+
Note: Currently runs sequentially. Future optimization could
|
|
138
|
+
use async/parallel calls for providers that support it.
|
|
139
|
+
|
|
140
|
+
Args:
|
|
141
|
+
samples: List of dicts with keys: screenshot, goal, a11y_tree, history.
|
|
142
|
+
|
|
143
|
+
Returns:
|
|
144
|
+
List of ParsedActions.
|
|
145
|
+
"""
|
|
146
|
+
results = []
|
|
147
|
+
for sample in samples:
|
|
148
|
+
action = self.predict(
|
|
149
|
+
screenshot=sample.get("screenshot"),
|
|
150
|
+
goal=sample.get("goal", ""),
|
|
151
|
+
a11y_tree=sample.get("a11y_tree"),
|
|
152
|
+
history=sample.get("history"),
|
|
153
|
+
)
|
|
154
|
+
results.append(action)
|
|
155
|
+
return results
|
|
156
|
+
|
|
157
|
+
@classmethod
|
|
158
|
+
def from_alias(
|
|
159
|
+
cls,
|
|
160
|
+
model_alias: str,
|
|
161
|
+
track: TrackConfig | None = None,
|
|
162
|
+
**kwargs: Any,
|
|
163
|
+
) -> "UnifiedBaselineAdapter":
|
|
164
|
+
"""Create adapter from model alias.
|
|
165
|
+
|
|
166
|
+
Convenience constructor that resolves model aliases.
|
|
167
|
+
|
|
168
|
+
Args:
|
|
169
|
+
model_alias: Model alias (e.g., 'claude-opus-4.5', 'gpt-5.2').
|
|
170
|
+
track: Track config (defaults to Track A).
|
|
171
|
+
**kwargs: Additional config options.
|
|
172
|
+
|
|
173
|
+
Returns:
|
|
174
|
+
UnifiedBaselineAdapter instance.
|
|
175
|
+
"""
|
|
176
|
+
config = BaselineConfig.from_alias(model_alias, track=track, **kwargs)
|
|
177
|
+
return cls(config)
|
|
178
|
+
|
|
179
|
+
def __repr__(self) -> str:
|
|
180
|
+
return (
|
|
181
|
+
f"UnifiedBaselineAdapter("
|
|
182
|
+
f"provider={self.config.provider}, "
|
|
183
|
+
f"model={self.config.model}, "
|
|
184
|
+
f"track={self.config.track.track_type.value})"
|
|
185
|
+
)
|
|
@@ -0,0 +1,314 @@
|
|
|
1
|
+
"""CLI for baseline adapter operations.
|
|
2
|
+
|
|
3
|
+
Provides commands for comparing VLMs across tracks.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import json
|
|
9
|
+
import sys
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
import click
|
|
13
|
+
|
|
14
|
+
from openadapt_ml.baselines.config import MODELS
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@click.group()
|
|
18
|
+
def baselines():
|
|
19
|
+
"""Baseline adapter commands for VLM comparison."""
|
|
20
|
+
pass
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@baselines.command()
@click.option(
    "--model",
    "-m",
    required=True,
    type=click.Choice(list(MODELS.keys())),
    help="Model alias to use",
)
@click.option(
    "--track",
    "-t",
    type=click.Choice(["A", "B", "C"]),
    default="A",
    help="Evaluation track (A=coords, B=ReAct, C=SoM)",
)
@click.option(
    "--image",
    "-i",
    type=click.Path(exists=True),
    required=True,
    help="Screenshot image path",
)
@click.option(
    "--goal",
    "-g",
    required=True,
    help="Task goal/instruction",
)
@click.option(
    "--output",
    "-o",
    type=click.Path(),
    help="Output JSON file path",
)
@click.option(
    "--verbose",
    "-v",
    is_flag=True,
    help="Enable verbose output",
)
def run(
    model: str,
    track: str,
    image: str,
    goal: str,
    output: str | None,
    verbose: bool,
):
    """Run a single baseline prediction.

    Example:
        uv run python -m openadapt_ml.baselines.cli run \\
            --model claude-opus-4.5 \\
            --track A \\
            --image screenshot.png \\
            --goal "Click the submit button"
    """
    # Deferred imports keep CLI startup (e.g. --help) fast.
    from PIL import Image

    from openadapt_ml.baselines import UnifiedBaselineAdapter, TrackConfig

    # Map the track letter onto its configuration factory.
    track_config = {
        "A": TrackConfig.track_a(),
        "B": TrackConfig.track_b(),
        "C": TrackConfig.track_c(),
    }[track]

    # Echo the run parameters up front for reproducibility.
    click.echo(f"Model: {model}")
    click.echo(f"Track: {track} ({track_config.track_type.value})")
    click.echo(f"Image: {image}")
    click.echo(f"Goal: {goal}")
    click.echo()

    screenshot = Image.open(image)

    adapter = UnifiedBaselineAdapter.from_alias(
        model,
        track=track_config,
        verbose=verbose,
    )

    click.echo("Running prediction...")
    action = adapter.predict(screenshot, goal)

    # Render the parsed action (or the parse failure) to the console.
    click.echo()
    click.echo("=" * 50)
    click.echo("RESULT")
    click.echo("=" * 50)

    if action.is_valid:
        click.echo(f"Action: {action.action_type.upper()}")
        if action.x is not None and action.y is not None:
            click.echo(f"Coordinates: ({action.x:.4f}, {action.y:.4f})")
        if action.element_id is not None:
            click.echo(f"Element ID: {action.element_id}")
        if action.text is not None:
            click.echo(f"Text: {action.text}")
        if action.thought is not None:
            click.echo(f"Thought: {action.thought}")
    else:
        click.echo(f"Parse Error: {action.parse_error}")
        click.echo(
            f"Raw Response: {action.raw_response[:200] if action.raw_response else 'None'}..."
        )

    # Optionally persist the full result as JSON.
    if output:
        payload = {
            "model": model,
            "track": track,
            "goal": goal,
            "action": action.to_dict(),
            "raw_response": action.raw_response,
            "parse_error": action.parse_error,
        }
        Path(output).write_text(json.dumps(payload, indent=2))
        click.echo(f"\nSaved to: {output}")
@baselines.command()
@click.option(
    "--models",
    "-m",
    required=True,
    help="Comma-separated model aliases",
)
@click.option(
    "--track",
    "-t",
    type=click.Choice(["A", "B", "C"]),
    default="A",
    help="Evaluation track",
)
@click.option(
    "--image",
    "-i",
    type=click.Path(exists=True),
    required=True,
    help="Screenshot image path",
)
@click.option(
    "--goal",
    "-g",
    required=True,
    help="Task goal/instruction",
)
@click.option(
    "--output",
    "-o",
    type=click.Path(),
    help="Output JSON file path",
)
def compare(
    models: str,
    track: str,
    image: str,
    goal: str,
    output: str | None,
):
    """Compare multiple models on the same task.

    Example:
        uv run python -m openadapt_ml.baselines.cli compare \\
            --models claude-opus-4.5,gpt-5.2,gemini-3-pro \\
            --track C \\
            --image screenshot.png \\
            --goal "Click the login button"
    """
    # Deferred imports keep CLI startup (e.g. --help) fast.
    from PIL import Image

    from openadapt_ml.baselines import UnifiedBaselineAdapter, TrackConfig

    model_list = [m.strip() for m in models.split(",")]

    # Validate all aliases before doing any (slow, billable) API work.
    for m in model_list:
        if m not in MODELS:
            click.echo(f"Error: Unknown model '{m}'", err=True)
            click.echo(f"Available: {', '.join(MODELS.keys())}", err=True)
            sys.exit(1)

    # Map the track letter onto its configuration factory.
    track_configs = {
        "A": TrackConfig.track_a(),
        "B": TrackConfig.track_b(),
        "C": TrackConfig.track_c(),
    }
    track_config = track_configs[track]

    click.echo(f"Comparing {len(model_list)} models on Track {track}")
    click.echo(f"Image: {image}")
    click.echo(f"Goal: {goal}")
    click.echo()

    screenshot = Image.open(image)

    results = []

    # Run every model on the identical (screenshot, goal) pair; a failure in
    # one model must not abort the comparison, so each run is isolated.
    for model in model_list:
        click.echo(f"Running {model}...")

        try:
            adapter = UnifiedBaselineAdapter.from_alias(model, track=track_config)
            action = adapter.predict(screenshot, goal)

            results.append(
                {
                    "model": model,
                    "success": action.is_valid,
                    "action": action.to_dict(),
                    "error": action.parse_error,
                }
            )

            status = "OK" if action.is_valid else "FAILED"
            click.echo(f"  {status}: {action.action_type}")

        except Exception as e:
            results.append(
                {
                    "model": model,
                    "success": False,
                    "action": None,
                    "error": str(e),
                }
            )
            click.echo(f"  ERROR: {e}")

    # Summary table
    click.echo()
    click.echo("=" * 60)
    click.echo("COMPARISON SUMMARY")
    click.echo("=" * 60)
    click.echo(f"{'Model':<25} {'Status':<10} {'Action':<25}")
    click.echo("-" * 60)

    for r in results:
        model = r["model"]
        status = "OK" if r["success"] else "FAILED"
        action = r["action"]
        if action:
            # FIX: require BOTH coordinates before formatting them; a response
            # that carried x without y previously raised here and killed the
            # summary for every remaining row.
            if action.get("x") is not None and action.get("y") is not None:
                action_str = f"CLICK({action['x']:.3f}, {action['y']:.3f})"
            elif action.get("element_id") is not None:
                action_str = f"CLICK([{action['element_id']}])"
            else:
                action_str = action.get("type", "unknown").upper()
        else:
            # Guard against an explicit None error value before slicing.
            action_str = (r.get("error") or "Unknown error")[:25]

        click.echo(f"{model:<25} {status:<10} {action_str:<25}")

    # Optionally persist the full comparison as JSON.
    if output:
        full_results = {
            "models": model_list,
            "track": track,
            "goal": goal,
            "results": results,
        }
        Path(output).write_text(json.dumps(full_results, indent=2))
        click.echo(f"\nSaved to: {output}")
@baselines.command()
def list_models():
    """List available models and their providers."""
    click.echo("Available models:")
    click.echo()
    # Fixed-width columns so the table lines up in a terminal.
    click.echo(f"{'Alias':<20} {'Provider':<12} {'Model ID':<35} {'Default'}")
    click.echo("-" * 75)

    for alias, spec in MODELS.items():
        marker = "*" if spec.is_default else ""
        click.echo(f"{alias:<20} {spec.provider:<12} {spec.model_id:<35} {marker}")
# Entry point for direct execution
def main() -> None:
    """Invoke the click command group."""
    baselines()


if __name__ == "__main__":
    main()