openadapt-ml 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
Files changed (95)
  1. openadapt_ml/baselines/__init__.py +121 -0
  2. openadapt_ml/baselines/adapter.py +185 -0
  3. openadapt_ml/baselines/cli.py +314 -0
  4. openadapt_ml/baselines/config.py +448 -0
  5. openadapt_ml/baselines/parser.py +922 -0
  6. openadapt_ml/baselines/prompts.py +787 -0
  7. openadapt_ml/benchmarks/__init__.py +13 -115
  8. openadapt_ml/benchmarks/agent.py +265 -421
  9. openadapt_ml/benchmarks/azure.py +28 -19
  10. openadapt_ml/benchmarks/azure_ops_tracker.py +521 -0
  11. openadapt_ml/benchmarks/cli.py +1722 -4847
  12. openadapt_ml/benchmarks/trace_export.py +631 -0
  13. openadapt_ml/benchmarks/viewer.py +22 -5
  14. openadapt_ml/benchmarks/vm_monitor.py +530 -29
  15. openadapt_ml/benchmarks/waa_deploy/Dockerfile +47 -53
  16. openadapt_ml/benchmarks/waa_deploy/api_agent.py +21 -20
  17. openadapt_ml/cloud/azure_inference.py +3 -5
  18. openadapt_ml/cloud/lambda_labs.py +722 -307
  19. openadapt_ml/cloud/local.py +2038 -487
  20. openadapt_ml/cloud/ssh_tunnel.py +68 -26
  21. openadapt_ml/datasets/next_action.py +40 -30
  22. openadapt_ml/evals/grounding.py +8 -3
  23. openadapt_ml/evals/plot_eval_metrics.py +15 -13
  24. openadapt_ml/evals/trajectory_matching.py +41 -26
  25. openadapt_ml/experiments/demo_prompt/format_demo.py +16 -6
  26. openadapt_ml/experiments/demo_prompt/run_experiment.py +26 -16
  27. openadapt_ml/experiments/representation_shootout/__init__.py +70 -0
  28. openadapt_ml/experiments/representation_shootout/conditions.py +708 -0
  29. openadapt_ml/experiments/representation_shootout/config.py +390 -0
  30. openadapt_ml/experiments/representation_shootout/evaluator.py +659 -0
  31. openadapt_ml/experiments/representation_shootout/runner.py +687 -0
  32. openadapt_ml/experiments/waa_demo/runner.py +29 -14
  33. openadapt_ml/export/parquet.py +36 -24
  34. openadapt_ml/grounding/detector.py +18 -14
  35. openadapt_ml/ingest/__init__.py +8 -6
  36. openadapt_ml/ingest/capture.py +25 -22
  37. openadapt_ml/ingest/loader.py +7 -4
  38. openadapt_ml/ingest/synthetic.py +189 -100
  39. openadapt_ml/models/api_adapter.py +14 -4
  40. openadapt_ml/models/base_adapter.py +10 -2
  41. openadapt_ml/models/providers/__init__.py +288 -0
  42. openadapt_ml/models/providers/anthropic.py +266 -0
  43. openadapt_ml/models/providers/base.py +299 -0
  44. openadapt_ml/models/providers/google.py +376 -0
  45. openadapt_ml/models/providers/openai.py +342 -0
  46. openadapt_ml/models/qwen_vl.py +46 -19
  47. openadapt_ml/perception/__init__.py +35 -0
  48. openadapt_ml/perception/integration.py +399 -0
  49. openadapt_ml/retrieval/demo_retriever.py +50 -24
  50. openadapt_ml/retrieval/embeddings.py +9 -8
  51. openadapt_ml/retrieval/retriever.py +3 -1
  52. openadapt_ml/runtime/__init__.py +50 -0
  53. openadapt_ml/runtime/policy.py +18 -5
  54. openadapt_ml/runtime/safety_gate.py +471 -0
  55. openadapt_ml/schema/__init__.py +9 -0
  56. openadapt_ml/schema/converters.py +74 -27
  57. openadapt_ml/schema/episode.py +31 -18
  58. openadapt_ml/scripts/capture_screenshots.py +530 -0
  59. openadapt_ml/scripts/compare.py +85 -54
  60. openadapt_ml/scripts/demo_policy.py +4 -1
  61. openadapt_ml/scripts/eval_policy.py +15 -9
  62. openadapt_ml/scripts/make_gif.py +1 -1
  63. openadapt_ml/scripts/prepare_synthetic.py +3 -1
  64. openadapt_ml/scripts/train.py +21 -9
  65. openadapt_ml/segmentation/README.md +920 -0
  66. openadapt_ml/segmentation/__init__.py +97 -0
  67. openadapt_ml/segmentation/adapters/__init__.py +5 -0
  68. openadapt_ml/segmentation/adapters/capture_adapter.py +420 -0
  69. openadapt_ml/segmentation/annotator.py +610 -0
  70. openadapt_ml/segmentation/cache.py +290 -0
  71. openadapt_ml/segmentation/cli.py +674 -0
  72. openadapt_ml/segmentation/deduplicator.py +656 -0
  73. openadapt_ml/segmentation/frame_describer.py +788 -0
  74. openadapt_ml/segmentation/pipeline.py +340 -0
  75. openadapt_ml/segmentation/schemas.py +622 -0
  76. openadapt_ml/segmentation/segment_extractor.py +634 -0
  77. openadapt_ml/training/azure_ops_viewer.py +1097 -0
  78. openadapt_ml/training/benchmark_viewer.py +52 -41
  79. openadapt_ml/training/shared_ui.py +7 -7
  80. openadapt_ml/training/stub_provider.py +57 -35
  81. openadapt_ml/training/trainer.py +143 -86
  82. openadapt_ml/training/trl_trainer.py +70 -21
  83. openadapt_ml/training/viewer.py +323 -108
  84. openadapt_ml/training/viewer_components.py +180 -0
  85. {openadapt_ml-0.2.0.dist-info → openadapt_ml-0.2.2.dist-info}/METADATA +215 -14
  86. openadapt_ml-0.2.2.dist-info/RECORD +116 -0
  87. openadapt_ml/benchmarks/base.py +0 -366
  88. openadapt_ml/benchmarks/data_collection.py +0 -432
  89. openadapt_ml/benchmarks/live_tracker.py +0 -180
  90. openadapt_ml/benchmarks/runner.py +0 -418
  91. openadapt_ml/benchmarks/waa.py +0 -761
  92. openadapt_ml/benchmarks/waa_live.py +0 -619
  93. openadapt_ml-0.2.0.dist-info/RECORD +0 -86
  94. {openadapt_ml-0.2.0.dist-info → openadapt_ml-0.2.2.dist-info}/WHEEL +0 -0
  95. {openadapt_ml-0.2.0.dist-info → openadapt_ml-0.2.2.dist-info}/licenses/LICENSE +0 -0
@@ -1,8 +1,15 @@
-"""Agent interface for benchmark evaluation.
+"""ML-specific agents for benchmark evaluation.
 
-This module provides the BenchmarkAgent interface that agents must implement
-to be evaluated on benchmarks, plus adapters to wrap existing openadapt-ml
-components.
+This module provides agents that wrap openadapt-ml components (VLM adapters,
+policies, baselines) for benchmark evaluation.
+
+For standalone agents without ML dependencies, use openadapt_evals:
+    from openadapt_evals import ApiAgent, ScriptedAgent, RandomAgent
+
+ML-specific agents in this module:
+- PolicyAgent: Wraps openadapt_ml.runtime.policy.AgentPolicy
+- APIBenchmarkAgent: Uses openadapt_ml.models.api_adapter.ApiVLMAdapter
+- UnifiedBaselineAgent: Uses openadapt_ml.baselines adapters
 
 Example:
     from openadapt_ml.benchmarks import PolicyAgent
@@ -12,7 +19,7 @@ Example:
     agent = PolicyAgent(policy)
     results = evaluate_agent_on_benchmark(agent, benchmark_adapter)
 
-    # API-backed agents (GPT-5.1, Claude)
+    # API-backed agents (GPT-5.1, Claude) using openadapt-ml adapters
     from openadapt_ml.benchmarks import APIBenchmarkAgent
 
     agent = APIBenchmarkAgent(provider="anthropic")  # Uses Claude
@@ -22,13 +29,13 @@ Example:
 
 from __future__ import annotations
 
-import json
 import re
-from abc import ABC, abstractmethod
 from typing import TYPE_CHECKING, Any
 
-from openadapt_ml.benchmarks.base import (
+# Import base classes from openadapt-evals (canonical location)
+from openadapt_evals import (
     BenchmarkAction,
+    BenchmarkAgent,
     BenchmarkObservation,
     BenchmarkTask,
 )
@@ -36,43 +43,7 @@ from openadapt_ml.benchmarks.base import (
 if TYPE_CHECKING:
     from openadapt_ml.models.api_adapter import ApiVLMAdapter
     from openadapt_ml.runtime.policy import AgentPolicy
-    from openadapt_ml.schema import Action, ActionType
-
-
-class BenchmarkAgent(ABC):
-    """Abstract interface for agents evaluated on benchmarks.
-
-    Agents must implement the `act` method to receive observations
-    and return actions. The agent can maintain internal state across
-    steps within an episode.
-    """
-
-    @abstractmethod
-    def act(
-        self,
-        observation: BenchmarkObservation,
-        task: BenchmarkTask,
-        history: list[tuple[BenchmarkObservation, BenchmarkAction]] | None = None,
-    ) -> BenchmarkAction:
-        """Given observation and task, return next action.
-
-        Args:
-            observation: Current observation from the environment.
-            task: Task being performed.
-            history: Optional list of previous (observation, action) pairs.
-
-        Returns:
-            Action to execute.
-        """
-        pass
-
-    def reset(self) -> None:
-        """Reset agent state between episodes.
-
-        Called before starting a new task. Override to clear any
-        internal state.
-        """
-        pass
+    from openadapt_ml.schema import Action
 
 
 class PolicyAgent(BenchmarkAgent):
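
Note: the BenchmarkAgent ABC now ships in openadapt_evals, so downstream subclasses only change their import. A minimal sketch of a custom agent against the relocated interface, assuming openadapt_evals re-exports the same act/reset contract shown in the removed code (AlwaysDoneAgent is a hypothetical name):

    from openadapt_evals import (
        BenchmarkAction,
        BenchmarkAgent,
        BenchmarkObservation,
        BenchmarkTask,
    )

    class AlwaysDoneAgent(BenchmarkAgent):
        """Trivial agent that immediately reports the task complete."""

        def act(
            self,
            observation: BenchmarkObservation,
            task: BenchmarkTask,
            history: list[tuple[BenchmarkObservation, BenchmarkAction]] | None = None,
        ) -> BenchmarkAction:
            # The interface is unchanged by the move; only the import path differs.
            return BenchmarkAction(type="done")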
@@ -128,61 +99,37 @@ class PolicyAgent(BenchmarkAgent):
         task: BenchmarkTask,
         history: list[tuple[BenchmarkObservation, BenchmarkAction]] | None,
     ) -> dict:
-        """Build SFT-style sample from benchmark observation.
-
-        Args:
-            observation: Current observation.
-            task: Current task.
-            history: Action history.
-
-        Returns:
-            Sample dict with 'images' and 'messages'.
-        """
-        # Build user message content
+        """Build SFT-style sample from benchmark observation."""
         content_parts = [f"Goal: {task.instruction}"]
 
-        # Add accessibility tree if available and enabled
         if self.use_accessibility_tree and observation.accessibility_tree:
             tree_str = self._format_accessibility_tree(observation.accessibility_tree)
             content_parts.append(f"UI Elements:\n{tree_str}")
 
-        # Add context
         if observation.url:
             content_parts.append(f"URL: {observation.url}")
         if observation.window_title:
             content_parts.append(f"Window: {observation.window_title}")
 
-        # Add history if enabled
         if self.use_history and history:
             history_str = self._format_history(history)
             content_parts.append(f"Previous actions:\n{history_str}")
 
         content_parts.append("What action should be taken next?")
 
-        # Build sample
         sample = {
             "messages": [
                 {"role": "user", "content": "\n\n".join(content_parts)},
             ],
         }
 
-        # Add image if available
         if observation.screenshot_path:
             sample["images"] = [observation.screenshot_path]
 
         return sample
 
     def _format_accessibility_tree(self, tree: dict, indent: int = 0) -> str:
-        """Format accessibility tree for prompt.
-
-        Args:
-            tree: Accessibility tree dict.
-            indent: Current indentation level.
-
-        Returns:
-            Formatted string representation.
-        """
-        # Simple formatting - can be overridden for platform-specific formatting
+        """Format accessibility tree for prompt."""
        lines = []
        prefix = " " * indent
 
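
Note: _build_sample assembles a chat-style sample whose content parts are joined with blank lines, in the order goal, UI elements, URL, window, history, question. An illustrative result (all values hypothetical):

    sample = {
        "messages": [
            {
                "role": "user",
                "content": (
                    "Goal: Open the settings dialog\n\n"
                    "URL: https://example.com\n\n"
                    "What action should be taken next?"
                ),
            },
        ],
        "images": ["/tmp/step_003.png"],
    }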
@@ -203,29 +150,15 @@ class PolicyAgent(BenchmarkAgent):
     def _format_history(
         self, history: list[tuple[BenchmarkObservation, BenchmarkAction]]
     ) -> str:
-        """Format action history for prompt.
-
-        Args:
-            history: List of (observation, action) pairs.
-
-        Returns:
-            Formatted string.
-        """
+        """Format action history for prompt."""
         lines = []
-        for i, (obs, action) in enumerate(history[-5:], 1):  # Last 5 actions
+        for i, (obs, action) in enumerate(history[-5:], 1):
             action_str = self._action_to_string(action)
             lines.append(f"{i}. {action_str}")
         return "\n".join(lines)
 
     def _action_to_string(self, action: BenchmarkAction) -> str:
-        """Convert BenchmarkAction to string representation.
-
-        Args:
-            action: Action to convert.
-
-        Returns:
-            String representation.
-        """
+        """Convert BenchmarkAction to string representation."""
         if action.type == "click":
             if action.target_name:
                 return f"CLICK({action.target_name})"
@@ -250,29 +183,19 @@ class PolicyAgent(BenchmarkAgent):
     def _to_benchmark_action(
         self, action: Action, thought: str | None
     ) -> BenchmarkAction:
-        """Convert openadapt-ml Action to BenchmarkAction.
-
-        Args:
-            action: Action from policy.
-            thought: Optional thought/reasoning.
-
-        Returns:
-            BenchmarkAction.
-        """
-        # Extract normalized coordinates
+        """Convert openadapt-ml Action to BenchmarkAction."""
         x, y = None, None
         if action.normalized_coordinates is not None:
             x, y = action.normalized_coordinates
 
-        # Extract end coordinates for drag
         end_x, end_y = None, None
         if action.normalized_end is not None:
             end_x, end_y = action.normalized_end
 
-        # Extract action type value (enum -> string)
-        action_type = action.type.value if hasattr(action.type, 'value') else action.type
+        action_type = (
+            action.type.value if hasattr(action.type, "value") else action.type
+        )
 
-        # Extract element info if available
         target_node_id = None
         target_role = None
         target_name = None
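
The hasattr guard above tolerates both enum and plain-string action types. A standalone sketch of the idiom (this ActionType is a hypothetical stand-in for openadapt_ml.schema.ActionType):

    from enum import Enum

    class ActionType(Enum):
        CLICK = "click"

    def type_to_str(t: object) -> object:
        # Same idiom as _to_benchmark_action: accept enum or plain string.
        return t.value if hasattr(t, "value") else t

    assert type_to_str(ActionType.CLICK) == "click"
    assert type_to_str("click") == "click"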
@@ -310,192 +233,28 @@ class PolicyAgent(BenchmarkAgent):
 
     def reset(self) -> None:
         """Reset agent state."""
-        # PolicyAgent is stateless, nothing to reset
-        pass
-
-
-class ScriptedAgent(BenchmarkAgent):
-    """Agent that follows a predefined script of actions.
-
-    Useful for testing benchmark adapters or replaying trajectories.
-
-    Args:
-        actions: List of actions to execute in order.
-    """
-
-    def __init__(self, actions: list[BenchmarkAction]):
-        self.actions = actions
-        self._step = 0
-
-    def act(
-        self,
-        observation: BenchmarkObservation,
-        task: BenchmarkTask,
-        history: list[tuple[BenchmarkObservation, BenchmarkAction]] | None = None,
-    ) -> BenchmarkAction:
-        """Return the next scripted action.
-
-        Args:
-            observation: Ignored.
-            task: Ignored.
-            history: Ignored.
-
-        Returns:
-            Next action from script, or DONE if script exhausted.
-        """
-        if self._step < len(self.actions):
-            action = self.actions[self._step]
-            self._step += 1
-            return action
-        return BenchmarkAction(type="done")
-
-    def reset(self) -> None:
-        """Reset step counter."""
-        self._step = 0
-
-
-class RandomAgent(BenchmarkAgent):
-    """Agent that takes random actions.
-
-    Useful for baseline comparisons.
-
-    Args:
-        action_types: List of action types to randomly select from.
-        seed: Random seed for reproducibility.
-    """
-
-    def __init__(
-        self,
-        action_types: list[str] | None = None,
-        seed: int | None = None,
-    ):
-        import random
-
-        self.action_types = action_types or ["click", "type", "scroll", "done"]
-        self.rng = random.Random(seed)
-
-    def act(
-        self,
-        observation: BenchmarkObservation,
-        task: BenchmarkTask,
-        history: list[tuple[BenchmarkObservation, BenchmarkAction]] | None = None,
-    ) -> BenchmarkAction:
-        """Return a random action.
-
-        Args:
-            observation: Used to get viewport bounds.
-            task: Ignored.
-            history: Used to decide when to stop.
-
-        Returns:
-            Random action.
-        """
-        # Stop after many actions
-        if history and len(history) > 20:
-            return BenchmarkAction(type="done")
-
-        action_type = self.rng.choice(self.action_types)
-
-        if action_type == "click":
-            return BenchmarkAction(
-                type="click",
-                x=self.rng.random(),
-                y=self.rng.random(),
-            )
-        elif action_type == "type":
-            return BenchmarkAction(
-                type="type",
-                text="test",
-            )
-        elif action_type == "scroll":
-            return BenchmarkAction(
-                type="scroll",
-                scroll_direction=self.rng.choice(["up", "down"]),
-            )
-        else:
-            return BenchmarkAction(type="done")
-
-    def reset(self) -> None:
-        """Nothing to reset."""
         pass
 
 
-class SmartMockAgent(BenchmarkAgent):
-    """Agent designed to pass WAAMockAdapter evaluation.
-
-    Performs a fixed sequence of actions that satisfy the mock adapter's
-    success criteria. Use for validating the benchmark pipeline locally.
-
-    The mock adapter evaluates success based on:
-    - Clicking Submit (ID 4) - primary success path
-    - Typing something AND clicking OK (ID 1) - form submission path
-    - Calling DONE after at least 2 actions - reasonable completion
-
-    This agent clicks Submit (ID 4) which is the simplest success path.
-    """
-
-    def __init__(self):
-        """Initialize the agent."""
-        self._step = 0
-        # Simple action sequence: click Submit button (ID 4), then done
-        self._actions = [
-            BenchmarkAction(type="click", target_node_id="4"),  # Click Submit
-            BenchmarkAction(type="done"),
-        ]
-
-    def act(
-        self,
-        observation: BenchmarkObservation,
-        task: BenchmarkTask,
-        history: list[tuple[BenchmarkObservation, BenchmarkAction]] | None = None,
-    ) -> BenchmarkAction:
-        """Return the next scripted action.
-
-        Args:
-            observation: Ignored.
-            task: Ignored.
-            history: Ignored.
-
-        Returns:
-            Next action from script, or DONE if script exhausted.
-        """
-        if self._step < len(self._actions):
-            action = self._actions[self._step]
-            self._step += 1
-            return action
-        return BenchmarkAction(type="done")
-
-    def reset(self) -> None:
-        """Reset step counter."""
-        self._step = 0
-
-
 class APIBenchmarkAgent(BenchmarkAgent):
-    """Agent that uses hosted VLM APIs (Claude, GPT-5.1) for benchmark evaluation.
+    """Agent that uses hosted VLM APIs via openadapt-ml ApiVLMAdapter.
 
     This agent wraps ApiVLMAdapter to provide Claude or GPT-5.1 baselines
     for benchmark evaluation. It converts BenchmarkObservation to the
     API format and parses VLM responses into BenchmarkActions.
 
+    Note: For standalone API evaluation without openadapt-ml, use
+    openadapt_evals.ApiAgent instead (has P0 demo persistence fix).
+
     Args:
         provider: API provider - "anthropic" (Claude) or "openai" (GPT-5.1).
         api_key: Optional API key override. If not provided, uses env vars.
-        model: Optional model name override. Defaults to provider's best VLM.
+        model: Optional model name override.
         max_tokens: Maximum tokens for VLM response.
         use_accessibility_tree: Whether to include accessibility tree in prompt.
         use_history: Whether to include action history in prompt.
-
-    Example:
-        # Claude baseline
-        agent = APIBenchmarkAgent(provider="anthropic")
-        results = evaluate_agent_on_benchmark(agent, waa_adapter)
-
-        # GPT-5.1 baseline
-        agent = APIBenchmarkAgent(provider="openai")
-        results = evaluate_agent_on_benchmark(agent, waa_adapter)
     """
 
-    # System prompt for GUI automation
     SYSTEM_PROMPT = """You are a GUI automation agent. Given a screenshot and task instruction, determine the next action to take.
 
 Available actions:
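
Note: ScriptedAgent, RandomAgent, and SmartMockAgent are deleted here rather than rewritten; per the new module docstring they now live in openadapt_evals. A migration sketch, assuming the relocated classes keep the constructor signatures of the removed 0.2.0 code:

    from openadapt_evals import BenchmarkAction, RandomAgent, ScriptedAgent

    # Replay a fixed two-step trajectory (mirrors the removed SmartMockAgent).
    smoke_agent = ScriptedAgent(
        actions=[
            BenchmarkAction(type="click", target_node_id="4"),
            BenchmarkAction(type="done"),
        ]
    )

    # Seeded random baseline for comparison runs.
    random_agent = RandomAgent(seed=0)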
@@ -505,7 +264,7 @@ Available actions:
 - KEY(key) - Press a key (e.g., Enter, Tab, Escape)
 - KEY(modifier+key) - Press key combination (e.g., Ctrl+c, Alt+Tab)
 - SCROLL(direction) - Scroll up or down
-- DRAG(x1, y1, x2, y2) - Drag from (x1,y1) to (x2,y2) (pixel or normalized)
+- DRAG(x1, y1, x2, y2) - Drag from (x1,y1) to (x2,y2)
 - DONE() - Task is complete
 - ANSWER("response") - For QA tasks, provide the answer
@@ -554,32 +313,15 @@ Then output the action on a new line starting with "ACTION:"
         task: BenchmarkTask,
         history: list[tuple[BenchmarkObservation, BenchmarkAction]] | None = None,
     ) -> BenchmarkAction:
-        """Use VLM API to determine next action.
-
-        Args:
-            observation: Current observation with screenshot.
-            task: Task being performed.
-            history: Previous observations and actions.
-
-        Returns:
-            BenchmarkAction parsed from VLM response.
-        """
+        """Use VLM API to determine next action."""
         adapter = self._get_adapter()
-
-        # Build the sample for the API
         sample = self._build_sample(observation, task, history)
 
-        # Call the VLM API
         try:
             response = adapter.generate(sample, max_new_tokens=self.max_tokens)
         except Exception as e:
-            # On API error, return done to avoid infinite loops
-            return BenchmarkAction(
-                type="done",
-                raw_action={"error": str(e)},
-            )
+            return BenchmarkAction(type="done", raw_action={"error": str(e)})
 
-        # Parse the response into a BenchmarkAction
         return self._parse_response(response, observation)
 
     def _build_sample(
@@ -588,41 +330,26 @@ Then output the action on a new line starting with "ACTION:"
         task: BenchmarkTask,
         history: list[tuple[BenchmarkObservation, BenchmarkAction]] | None,
     ) -> dict[str, Any]:
-        """Build API sample from benchmark observation.
-
-        Args:
-            observation: Current observation.
-            task: Current task.
-            history: Action history.
-
-        Returns:
-            Sample dict with 'images' and 'messages'.
-        """
-        # Build user message content
+        """Build API sample from benchmark observation."""
         content_parts = [f"GOAL: {task.instruction}"]
 
-        # Add context
         if observation.url:
             content_parts.append(f"URL: {observation.url}")
         if observation.window_title:
             content_parts.append(f"Window: {observation.window_title}")
 
-        # Add accessibility tree if available and enabled
         if self.use_accessibility_tree and observation.accessibility_tree:
             tree_str = self._format_accessibility_tree(observation.accessibility_tree)
-            # Truncate if too long
             if len(tree_str) > 4000:
                 tree_str = tree_str[:4000] + "\n... (truncated)"
             content_parts.append(f"UI Elements:\n{tree_str}")
 
-        # Add history if enabled
         if self.use_history and history:
             history_str = self._format_history(history)
             content_parts.append(f"Previous actions:\n{history_str}")
 
         content_parts.append("\nWhat is the next action?")
 
-        # Build sample
         sample: dict[str, Any] = {
             "messages": [
                 {"role": "system", "content": self.SYSTEM_PROMPT},
@@ -630,22 +357,13 @@ Then output the action on a new line starting with "ACTION:"
             ],
         }
 
-        # Add image if available
         if observation.screenshot_path:
             sample["images"] = [observation.screenshot_path]
 
         return sample
 
     def _format_accessibility_tree(self, tree: dict, indent: int = 0) -> str:
-        """Format accessibility tree for prompt.
-
-        Args:
-            tree: Accessibility tree dict.
-            indent: Current indentation level.
-
-        Returns:
-            Formatted string representation.
-        """
+        """Format accessibility tree for prompt."""
        lines = []
        prefix = " " * indent
 
@@ -666,29 +384,15 @@ Then output the action on a new line starting with "ACTION:"
     def _format_history(
         self, history: list[tuple[BenchmarkObservation, BenchmarkAction]]
     ) -> str:
-        """Format action history for prompt.
-
-        Args:
-            history: List of (observation, action) pairs.
-
-        Returns:
-            Formatted string.
-        """
+        """Format action history for prompt."""
         lines = []
-        for i, (obs, action) in enumerate(history[-5:], 1):  # Last 5 actions
+        for i, (obs, action) in enumerate(history[-5:], 1):
             action_str = self._action_to_string(action)
             lines.append(f"{i}. {action_str}")
         return "\n".join(lines)
 
     def _action_to_string(self, action: BenchmarkAction) -> str:
-        """Convert BenchmarkAction to string representation.
-
-        Args:
-            action: Action to convert.
-
-        Returns:
-            String representation.
-        """
+        """Convert BenchmarkAction to string representation."""
         if action.type == "click":
             if action.target_node_id:
                 return f"CLICK([{action.target_node_id}])"
@@ -717,32 +421,14 @@ Then output the action on a new line starting with "ACTION:"
     def _parse_response(
         self, response: str, observation: BenchmarkObservation | None = None
     ) -> BenchmarkAction:
-        """Parse VLM response into BenchmarkAction.
-
-        Handles various response formats:
-        - ACTION: CLICK(0.5, 0.3)
-        - CLICK(0.5, 0.3)
-        - I'll click at coordinates (0.5, 0.3) -> CLICK(0.5, 0.3)
-
-        Args:
-            response: Raw VLM response text.
-            observation: Current observation (used for coordinate normalization).
-
-        Returns:
-            Parsed BenchmarkAction.
-        """
-        # Store raw response for debugging
+        """Parse VLM response into BenchmarkAction."""
         raw_action = {"response": response}
 
-        # Extract action line (look for ACTION: prefix or action pattern)
         action_line = None
-
-        # Try to find ACTION: prefix
         action_match = re.search(r"ACTION:\s*(.+)", response, re.IGNORECASE)
         if action_match:
             action_line = action_match.group(1).strip()
         else:
-            # Look for action pattern anywhere in response
             patterns = [
                 r"(CLICK\s*\([^)]+\))",
@@ -759,146 +445,304 @@ Then output the action on a new line starting with "ACTION:"
                 break
 
         if not action_line:
-            # Could not parse action, return done
             raw_action["parse_error"] = "No action pattern found"
             return BenchmarkAction(type="done", raw_action=raw_action)
 
-        # Parse CLICK action
+        # Parse CLICK([id])
         click_match = re.match(
             r"CLICK\s*\(\s*\[?(\d+)\]?\s*\)", action_line, re.IGNORECASE
         )
         if click_match:
-            # CLICK([id]) - element ID
             node_id = click_match.group(1)
             return BenchmarkAction(
-                type="click",
-                target_node_id=node_id,
-                raw_action=raw_action,
+                type="click", target_node_id=node_id, raw_action=raw_action
             )
 
+        # Parse CLICK(x, y)
         click_coords = re.match(
             r"CLICK\s*\(\s*([\d.]+)\s*,\s*([\d.]+)\s*\)", action_line, re.IGNORECASE
         )
         if click_coords:
-            # CLICK(x, y) - coordinates
             x = float(click_coords.group(1))
             y = float(click_coords.group(2))
-
-            # Normalize coordinates if they appear to be pixel values
-            # If x or y > 1.0, assume pixel coordinates and normalize using viewport
             if observation and observation.viewport and (x > 1.0 or y > 1.0):
                 width, height = observation.viewport
-                x_norm = x / width
-                y_norm = y / height
                 raw_action["original_coords"] = {"x": x, "y": y}
                 raw_action["normalized"] = True
-                x = x_norm
-                y = y_norm
-
-            return BenchmarkAction(
-                type="click",
-                x=x,
-                y=y,
-                raw_action=raw_action,
-            )
+                x, y = x / width, y / height
+            return BenchmarkAction(type="click", x=x, y=y, raw_action=raw_action)
 
-        # Parse TYPE action
+        # Parse TYPE
         type_match = re.match(
             r"TYPE\s*\(\s*[\"'](.+?)[\"']\s*\)", action_line, re.IGNORECASE
         )
         if type_match:
-            text = type_match.group(1)
             return BenchmarkAction(
-                type="type",
-                text=text,
-                raw_action=raw_action,
+                type="type", text=type_match.group(1), raw_action=raw_action
             )
 
-        # Parse KEY action
+        # Parse KEY
         key_match = re.match(r"KEY\s*\(\s*(.+?)\s*\)", action_line, re.IGNORECASE)
         if key_match:
             key_str = key_match.group(1)
-            # Handle modifier+key format
             if "+" in key_str:
                 parts = key_str.split("+")
-                key = parts[-1]
-                modifiers = parts[:-1]
                 return BenchmarkAction(
                     type="key",
-                    key=key,
-                    modifiers=modifiers,
+                    key=parts[-1],
+                    modifiers=parts[:-1],
                     raw_action=raw_action,
                 )
-            return BenchmarkAction(
-                type="key",
-                key=key_str,
-                raw_action=raw_action,
-            )
+            return BenchmarkAction(type="key", key=key_str, raw_action=raw_action)
 
-        # Parse SCROLL action
+        # Parse SCROLL
         scroll_match = re.match(
             r"SCROLL\s*\(\s*(up|down)\s*\)", action_line, re.IGNORECASE
         )
         if scroll_match:
-            direction = scroll_match.group(1).lower()
             return BenchmarkAction(
                 type="scroll",
-                scroll_direction=direction,
+                scroll_direction=scroll_match.group(1).lower(),
                 raw_action=raw_action,
             )
 
-        # Parse DRAG action
+        # Parse DRAG
         drag_match = re.match(
             r"DRAG\s*\(\s*([\d.]+)\s*,\s*([\d.]+)\s*,\s*([\d.]+)\s*,\s*([\d.]+)\s*\)",
             action_line,
             re.IGNORECASE,
         )
         if drag_match:
-            x = float(drag_match.group(1))
-            y = float(drag_match.group(2))
-            end_x = float(drag_match.group(3))
-            end_y = float(drag_match.group(4))
-
-            # Normalize coordinates if they appear to be pixel values
-            if observation and observation.viewport and (x > 1.0 or y > 1.0 or end_x > 1.0 or end_y > 1.0):
+            x, y = float(drag_match.group(1)), float(drag_match.group(2))
+            end_x, end_y = float(drag_match.group(3)), float(drag_match.group(4))
+            if (
+                observation
+                and observation.viewport
+                and (x > 1.0 or y > 1.0 or end_x > 1.0 or end_y > 1.0)
+            ):
                 width, height = observation.viewport
-                raw_action["original_coords"] = {"x": x, "y": y, "end_x": end_x, "end_y": end_y}
+                raw_action["original_coords"] = {
+                    "x": x,
+                    "y": y,
+                    "end_x": end_x,
+                    "end_y": end_y,
+                }
                 raw_action["normalized"] = True
-                x = x / width
-                y = y / height
-                end_x = end_x / width
-                end_y = end_y / height
-
+                x, y, end_x, end_y = (
+                    x / width,
+                    y / height,
+                    end_x / width,
+                    end_y / height,
+                )
             return BenchmarkAction(
-                type="drag",
-                x=x,
-                y=y,
-                end_x=end_x,
-                end_y=end_y,
-                raw_action=raw_action,
+                type="drag", x=x, y=y, end_x=end_x, end_y=end_y, raw_action=raw_action
             )
 
-        # Parse DONE action
+        # Parse DONE
         if re.match(r"DONE\s*\(\s*\)", action_line, re.IGNORECASE):
             return BenchmarkAction(type="done", raw_action=raw_action)
 
-        # Parse ANSWER action
+        # Parse ANSWER
         answer_match = re.match(
             r"ANSWER\s*\(\s*[\"'](.+?)[\"']\s*\)", action_line, re.IGNORECASE
         )
         if answer_match:
-            answer = answer_match.group(1)
             return BenchmarkAction(
-                type="answer",
-                answer=answer,
-                raw_action=raw_action,
+                type="answer", answer=answer_match.group(1), raw_action=raw_action
             )
 
-        # Unknown action format
         raw_action["parse_error"] = f"Unknown action format: {action_line}"
         return BenchmarkAction(type="done", raw_action=raw_action)
 
     def reset(self) -> None:
         """Reset agent state."""
-        # APIBenchmarkAgent is stateless, nothing to reset
         pass
+
+
+class UnifiedBaselineAgent(BenchmarkAgent):
+    """Agent that uses UnifiedBaselineAdapter for benchmark evaluation.
+
+    Provides unified interface for Claude, GPT, and Gemini baselines
+    across multiple tracks (A: coordinates, B: ReAct, C: SoM).
+
+    Args:
+        model_alias: Model alias (e.g., 'claude-opus-4.5', 'gpt-5.2').
+        track: Track type ('A', 'B', or 'C'). Defaults to 'A'.
+        api_key: Optional API key override.
+        temperature: Sampling temperature.
+        max_tokens: Maximum tokens for response.
+        demo: Optional demo text for prompts.
+        verbose: Whether to print debug output.
+    """
+
+    def __init__(
+        self,
+        model_alias: str = "claude-opus-4.5",
+        track: str = "A",
+        api_key: str | None = None,
+        temperature: float = 0.1,
+        max_tokens: int = 1024,
+        demo: str | None = None,
+        verbose: bool = False,
+    ):
+        self.model_alias = model_alias
+        self.track = track.upper()
+        self.api_key = api_key
+        self.temperature = temperature
+        self.max_tokens = max_tokens
+        self.demo = demo
+        self.verbose = verbose
+        self._adapter = None
+
+    def _get_adapter(self):
+        """Lazily initialize the UnifiedBaselineAdapter."""
+        if self._adapter is None:
+            from openadapt_ml.baselines import TrackConfig, UnifiedBaselineAdapter
+
+            track_configs = {
+                "A": TrackConfig.track_a(),
+                "B": TrackConfig.track_b(),
+                "C": TrackConfig.track_c(),
+            }
+            track_config = track_configs.get(self.track, TrackConfig.track_a())
+
+            self._adapter = UnifiedBaselineAdapter.from_alias(
+                self.model_alias,
+                track=track_config,
+                api_key=self.api_key,
+                temperature=self.temperature,
+                max_tokens=self.max_tokens,
+                demo=self.demo,
+                verbose=self.verbose,
+            )
+        return self._adapter
+
+    def act(
+        self,
+        observation: BenchmarkObservation,
+        task: BenchmarkTask,
+        history: list[tuple[BenchmarkObservation, BenchmarkAction]] | None = None,
+    ) -> BenchmarkAction:
+        """Use UnifiedBaselineAdapter to determine next action."""
+        from PIL import Image
+
+        adapter = self._get_adapter()
+
+        screenshot = None
+        if observation.screenshot_path:
+            try:
+                screenshot = Image.open(observation.screenshot_path)
+            except Exception as e:
+                if self.verbose:
+                    print(f"[UnifiedBaselineAgent] Failed to load screenshot: {e}")
+
+        a11y_tree = (
+            observation.accessibility_tree if observation.accessibility_tree else None
+        )
+
+        adapter_history = None
+        if history:
+            adapter_history = [
+                self._benchmark_action_to_dict(a) for _, a in history[-5:]
+            ]
+
+        try:
+            parsed_action = adapter.predict(
+                screenshot=screenshot,
+                goal=task.instruction,
+                a11y_tree=a11y_tree,
+                history=adapter_history,
+            )
+        except Exception as e:
+            if self.verbose:
+                print(f"[UnifiedBaselineAgent] Adapter error: {e}")
+            return BenchmarkAction(type="done", raw_action={"error": str(e)})
+
+        return self._parsed_to_benchmark_action(parsed_action, observation)
+
+    def _benchmark_action_to_dict(self, action: BenchmarkAction) -> dict[str, Any]:
+        """Convert BenchmarkAction to dict for history."""
+        result = {"type": action.type}
+        if action.x is not None:
+            result["x"] = action.x
+        if action.y is not None:
+            result["y"] = action.y
+        if action.text:
+            result["text"] = action.text
+        if action.key:
+            result["key"] = action.key
+        if action.target_node_id:
+            result["element_id"] = action.target_node_id
+        if action.scroll_direction:
+            result["direction"] = action.scroll_direction
+        return result
+
+    def _parsed_to_benchmark_action(
+        self, parsed_action, observation: BenchmarkObservation | None = None
+    ) -> BenchmarkAction:
+        """Convert ParsedAction to BenchmarkAction."""
+        raw_action = {
+            "raw_response": parsed_action.raw_response,
+            "thought": parsed_action.thought,
+        }
+
+        if not parsed_action.is_valid:
+            raw_action["parse_error"] = parsed_action.parse_error
+            return BenchmarkAction(type="done", raw_action=raw_action)
+
+        action_type = parsed_action.action_type
+
+        if action_type == "click":
+            if parsed_action.element_id is not None:
+                return BenchmarkAction(
+                    type="click",
+                    target_node_id=str(parsed_action.element_id),
+                    raw_action=raw_action,
+                )
+            elif parsed_action.x is not None and parsed_action.y is not None:
+                x, y = parsed_action.x, parsed_action.y
+                if observation and observation.viewport and (x > 1.0 or y > 1.0):
+                    width, height = observation.viewport
+                    raw_action["original_coords"] = {"x": x, "y": y}
+                    x, y = x / width, y / height
+                return BenchmarkAction(type="click", x=x, y=y, raw_action=raw_action)
+
+        elif action_type == "type":
+            return BenchmarkAction(
+                type="type", text=parsed_action.text, raw_action=raw_action
+            )
+
+        elif action_type == "key":
+            return BenchmarkAction(
+                type="key", key=parsed_action.key, raw_action=raw_action
+            )
+
+        elif action_type == "scroll":
+            return BenchmarkAction(
+                type="scroll",
+                scroll_direction=parsed_action.direction,
+                raw_action=raw_action,
+            )
+
+        elif action_type == "done":
+            return BenchmarkAction(type="done", raw_action=raw_action)
+
+        elif action_type == "drag":
+            return BenchmarkAction(
+                type="drag",
+                x=parsed_action.x,
+                y=parsed_action.y,
+                end_x=getattr(parsed_action, "end_x", None),
+                end_y=getattr(parsed_action, "end_y", None),
+                raw_action=raw_action,
+            )
+
+        raw_action["unknown_action"] = action_type
+        return BenchmarkAction(type="done", raw_action=raw_action)
+
+    def reset(self) -> None:
+        """Reset agent state."""
+        pass
+
+    def __repr__(self) -> str:
+        return f"UnifiedBaselineAgent(model={self.model_alias}, track={self.track})"