openadapt-ml 0.1.0-py3-none-any.whl → 0.2.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openadapt_ml/benchmarks/__init__.py +8 -0
- openadapt_ml/benchmarks/agent.py +90 -11
- openadapt_ml/benchmarks/azure.py +35 -6
- openadapt_ml/benchmarks/cli.py +4449 -201
- openadapt_ml/benchmarks/live_tracker.py +180 -0
- openadapt_ml/benchmarks/runner.py +41 -4
- openadapt_ml/benchmarks/viewer.py +1219 -0
- openadapt_ml/benchmarks/vm_monitor.py +610 -0
- openadapt_ml/benchmarks/waa.py +61 -4
- openadapt_ml/benchmarks/waa_deploy/Dockerfile +222 -0
- openadapt_ml/benchmarks/waa_deploy/__init__.py +10 -0
- openadapt_ml/benchmarks/waa_deploy/api_agent.py +539 -0
- openadapt_ml/benchmarks/waa_deploy/start_waa_server.bat +53 -0
- openadapt_ml/benchmarks/waa_live.py +619 -0
- openadapt_ml/cloud/local.py +1555 -1
- openadapt_ml/cloud/ssh_tunnel.py +553 -0
- openadapt_ml/datasets/next_action.py +87 -68
- openadapt_ml/evals/grounding.py +26 -8
- openadapt_ml/evals/trajectory_matching.py +84 -36
- openadapt_ml/experiments/demo_prompt/__init__.py +19 -0
- openadapt_ml/experiments/demo_prompt/format_demo.py +226 -0
- openadapt_ml/experiments/demo_prompt/results/experiment_20251231_002125.json +83 -0
- openadapt_ml/experiments/demo_prompt/results/experiment_n30_20251231_165958.json +1100 -0
- openadapt_ml/experiments/demo_prompt/results/multistep_20251231_025051.json +182 -0
- openadapt_ml/experiments/demo_prompt/run_experiment.py +531 -0
- openadapt_ml/experiments/waa_demo/__init__.py +10 -0
- openadapt_ml/experiments/waa_demo/demos.py +357 -0
- openadapt_ml/experiments/waa_demo/runner.py +717 -0
- openadapt_ml/experiments/waa_demo/tasks.py +151 -0
- openadapt_ml/export/__init__.py +9 -0
- openadapt_ml/export/__main__.py +6 -0
- openadapt_ml/export/cli.py +89 -0
- openadapt_ml/export/parquet.py +265 -0
- openadapt_ml/ingest/__init__.py +3 -4
- openadapt_ml/ingest/capture.py +89 -81
- openadapt_ml/ingest/loader.py +116 -68
- openadapt_ml/ingest/synthetic.py +221 -159
- openadapt_ml/retrieval/README.md +226 -0
- openadapt_ml/retrieval/USAGE.md +391 -0
- openadapt_ml/retrieval/__init__.py +91 -0
- openadapt_ml/retrieval/demo_retriever.py +817 -0
- openadapt_ml/retrieval/embeddings.py +629 -0
- openadapt_ml/retrieval/index.py +194 -0
- openadapt_ml/retrieval/retriever.py +160 -0
- openadapt_ml/runtime/policy.py +10 -10
- openadapt_ml/schema/__init__.py +104 -0
- openadapt_ml/schema/converters.py +541 -0
- openadapt_ml/schema/episode.py +457 -0
- openadapt_ml/scripts/compare.py +26 -16
- openadapt_ml/scripts/eval_policy.py +4 -5
- openadapt_ml/scripts/prepare_synthetic.py +14 -17
- openadapt_ml/scripts/train.py +81 -70
- openadapt_ml/training/benchmark_viewer.py +3225 -0
- openadapt_ml/training/trainer.py +120 -363
- openadapt_ml/training/trl_trainer.py +354 -0
- {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.0.dist-info}/METADATA +102 -60
- openadapt_ml-0.2.0.dist-info/RECORD +86 -0
- openadapt_ml/schemas/__init__.py +0 -53
- openadapt_ml/schemas/sessions.py +0 -122
- openadapt_ml/schemas/validation.py +0 -252
- openadapt_ml-0.1.0.dist-info/RECORD +0 -55
- {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.0.dist-info}/WHEEL +0 -0
- {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.0.dist-info}/licenses/LICENSE +0 -0
openadapt_ml/benchmarks/live_tracker.py
@@ -0,0 +1,180 @@
+"""Live evaluation progress tracker for benchmark viewer.
+
+This module provides a tracker that writes real-time evaluation progress
+to a JSON file that the viewer can poll via /api/benchmark-live.
+"""
+
+from __future__ import annotations
+
+import json
+from dataclasses import asdict, dataclass
+from pathlib import Path
+from typing import Any
+
+from openadapt_ml.benchmarks.base import (
+    BenchmarkAction,
+    BenchmarkObservation,
+    BenchmarkResult,
+    BenchmarkTask,
+)
+
+
+@dataclass
+class LiveStepData:
+    """Data for a single step in live evaluation."""
+
+    step_idx: int
+    action: dict[str, Any]
+    reasoning: str | None = None
+    screenshot_url: str | None = None
+
+
+@dataclass
+class LiveTaskData:
+    """Data for current task being evaluated."""
+
+    task_id: str
+    instruction: str
+    domain: str
+    steps: list[LiveStepData]
+    result: dict[str, Any] | None = None
+
+
+class LiveEvaluationTracker:
+    """Tracks live evaluation progress and writes to benchmark_live.json.
+
+    This class is designed to be used alongside ExecutionTraceCollector
+    to provide real-time progress updates to the viewer.
+
+    Args:
+        output_file: Path to output JSON file (default: benchmark_live.json).
+        total_tasks: Total number of tasks to evaluate.
+    """
+
+    def __init__(
+        self,
+        output_file: str | Path = "benchmark_live.json",
+        total_tasks: int = 0,
+    ):
+        self.output_file = Path(output_file)
+        self.total_tasks = total_tasks
+        self.tasks_completed = 0
+        self.current_task: LiveTaskData | None = None
+
+        # Initialize with idle state
+        self._write_state({"status": "idle"})
+
+    def start_task(self, task: BenchmarkTask) -> None:
+        """Start tracking a new task.
+
+        Args:
+            task: The benchmark task being evaluated.
+        """
+        self.current_task = LiveTaskData(
+            task_id=task.task_id,
+            instruction=task.instruction,
+            domain=task.domain or "unknown",
+            steps=[],
+            result=None,
+        )
+
+        self._write_state({
+            "status": "running",
+            "total_tasks": self.total_tasks,
+            "tasks_completed": self.tasks_completed,
+            "current_task": asdict(self.current_task),
+        })
+
+    def record_step(
+        self,
+        step_idx: int,
+        observation: BenchmarkObservation,
+        action: BenchmarkAction,
+        reasoning: str | None = None,
+    ) -> None:
+        """Record a step in the current task.
+
+        Args:
+            step_idx: Index of this step.
+            observation: Observation at this step.
+            action: Action taken at this step.
+            reasoning: Optional reasoning/thought from agent.
+        """
+        if self.current_task is None:
+            raise RuntimeError("No task started. Call start_task() first.")
+
+        # Serialize action
+        action_data = {
+            "type": action.type,
+            "x": action.x,
+            "y": action.y,
+            "target_node_id": action.target_node_id,
+            "text": action.text,
+            "key": action.key,
+        }
+
+        # Create step data
+        step = LiveStepData(
+            step_idx=step_idx,
+            action=action_data,
+            reasoning=reasoning,
+            screenshot_url=None,  # Could be populated if we serve screenshots
+        )
+
+        self.current_task.steps.append(step)
+
+        # Write updated state
+        self._write_state({
+            "status": "running",
+            "total_tasks": self.total_tasks,
+            "tasks_completed": self.tasks_completed,
+            "current_task": asdict(self.current_task),
+        })
+
+    def finish_task(self, result: BenchmarkResult) -> None:
+        """Finish tracking the current task.
+
+        Args:
+            result: The evaluation result for the task.
+        """
+        if self.current_task is None:
+            raise RuntimeError("No task started. Call start_task() first.")
+
+        # Add result to current task
+        self.current_task.result = {
+            "success": result.success,
+            "score": result.score,
+            "num_steps": result.num_steps,
+            "total_time_seconds": result.total_time_seconds,
+        }
+
+        # Increment completed count
+        self.tasks_completed += 1
+
+        # Write updated state
+        self._write_state({
+            "status": "running",
+            "total_tasks": self.total_tasks,
+            "tasks_completed": self.tasks_completed,
+            "current_task": asdict(self.current_task),
+        })
+
+        # Clear current task
+        self.current_task = None
+
+    def finish(self) -> None:
+        """Mark evaluation as complete."""
+        self._write_state({
+            "status": "complete",
+            "total_tasks": self.total_tasks,
+            "tasks_completed": self.tasks_completed,
+        })
+
+    def _write_state(self, state: dict[str, Any]) -> None:
+        """Write current state to JSON file.
+
+        Args:
+            state: State dictionary to write.
+        """
+        with open(self.output_file, "w") as f:
+            json.dump(state, f, indent=2)
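
The tracker exposes a small lifecycle: start_task(), one record_step() per agent action, finish_task(), and a final finish(). Below is a minimal usage sketch of driving it directly, using SimpleNamespace stand-ins for the BenchmarkTask, BenchmarkAction, and BenchmarkResult objects (the tracker only reads the attributes shown; all values are illustrative, and in normal use the runner wires the tracker in automatically, as in the next diff).

# Usage sketch for LiveEvaluationTracker (illustrative values, not from the package).
from types import SimpleNamespace

from openadapt_ml.benchmarks.live_tracker import LiveEvaluationTracker

tracker = LiveEvaluationTracker(output_file="benchmark_live.json", total_tasks=1)

# Stand-ins for the real dataclasses; only these attributes are read by the tracker.
task = SimpleNamespace(task_id="demo-task", instruction="Save the open document", domain="office")
action = SimpleNamespace(type="click", x=120, y=340, target_node_id=None, text=None, key=None)
result = SimpleNamespace(success=True, score=1.0, num_steps=1, total_time_seconds=4.2)

tracker.start_task(task)       # writes status "running" with an empty step list
tracker.record_step(0, observation=None, action=action, reasoning="Click Save")  # observation is accepted but not serialized
tracker.finish_task(result)    # attaches the result and increments tasks_completed
tracker.finish()               # final write: status "complete"
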
openadapt_ml/benchmarks/runner.py
@@ -29,6 +29,7 @@ from openadapt_ml.benchmarks.base import (
     BenchmarkTask,
 )
 from openadapt_ml.benchmarks.data_collection import ExecutionTraceCollector
+from openadapt_ml.benchmarks.live_tracker import LiveEvaluationTracker
 
 logger = logging.getLogger(__name__)
 
@@ -48,6 +49,8 @@ class EvaluationConfig:
         model_id: Model identifier for execution traces.
         output_dir: Output directory for benchmark results.
         run_name: Name for this evaluation run.
+        enable_live_tracking: Whether to enable live evaluation progress tracking.
+        live_tracking_file: Path to live tracking JSON file.
     """
 
     max_steps: int = 50
@@ -60,6 +63,8 @@ class EvaluationConfig:
     model_id: str = "unknown"
     output_dir: str = "benchmark_results"
     run_name: str | None = None
+    enable_live_tracking: bool = True
+    live_tracking_file: str = "benchmark_live.json"
 
 
 def evaluate_agent_on_benchmark(
@@ -107,16 +112,30 @@ def evaluate_agent_on_benchmark(
     if config.verbose:
         logger.info(f"Saving execution traces to: {trace_collector.run_dir}")
 
+    # Initialize live evaluation tracker if enabled
+    live_tracker = None
+    if config.enable_live_tracking:
+        live_tracker = LiveEvaluationTracker(
+            output_file=config.live_tracking_file,
+            total_tasks=len(tasks),
+        )
+        if config.verbose:
+            logger.info(f"Live tracking enabled: {config.live_tracking_file}")
+
     # Run evaluation
     if config.parallel > 1 and adapter.supports_parallel:
-        results = _evaluate_parallel(agent, adapter, tasks, config, trace_collector)
+        results = _evaluate_parallel(agent, adapter, tasks, config, trace_collector, live_tracker)
     else:
-        results = _evaluate_sequential(agent, adapter, tasks, config, trace_collector)
+        results = _evaluate_sequential(agent, adapter, tasks, config, trace_collector, live_tracker)
 
     # Save summary if trace collection is enabled
     if trace_collector is not None:
         trace_collector.save_summary(results)
 
+    # Mark live tracking as complete
+    if live_tracker is not None:
+        live_tracker.finish()
+
     # Log summary
     if config.verbose:
         success_count = sum(1 for r in results if r.success)
@@ -136,6 +155,7 @@ def _evaluate_sequential(
     tasks: list[BenchmarkTask],
     config: EvaluationConfig,
     trace_collector: ExecutionTraceCollector | None = None,
+    live_tracker: LiveEvaluationTracker | None = None,
 ) -> list[BenchmarkResult]:
     """Run evaluation sequentially.
 
@@ -145,6 +165,7 @@ def _evaluate_sequential(
         tasks: Tasks to evaluate.
         config: Evaluation configuration.
         trace_collector: Optional trace collector for saving execution data.
+        live_tracker: Optional live evaluation tracker.
 
     Returns:
         List of results.
@@ -154,7 +175,7 @@ def _evaluate_sequential(
         if config.verbose:
             logger.info(f"Task {i + 1}/{len(tasks)}: {task.task_id}")
 
-        result = _run_single_task(agent, adapter, task, config, trace_collector)
+        result = _run_single_task(agent, adapter, task, config, trace_collector, live_tracker)
         results.append(result)
 
         if config.on_task_complete:
@@ -169,6 +190,7 @@ def _evaluate_parallel(
     tasks: list[BenchmarkTask],
     config: EvaluationConfig,
     trace_collector: ExecutionTraceCollector | None = None,
+    live_tracker: LiveEvaluationTracker | None = None,
 ) -> list[BenchmarkResult]:
     """Run evaluation in parallel.
 
@@ -181,6 +203,7 @@ def _evaluate_parallel(
         tasks: Tasks to evaluate.
        config: Evaluation configuration.
         trace_collector: Optional trace collector for saving execution data.
+        live_tracker: Optional live evaluation tracker.
 
     Returns:
         List of results.
@@ -190,7 +213,7 @@ def _evaluate_parallel(
     with ThreadPoolExecutor(max_workers=config.parallel) as executor:
         # Submit all tasks
         future_to_task = {
-            executor.submit(_run_single_task, agent, adapter, task, config, trace_collector): task
+            executor.submit(_run_single_task, agent, adapter, task, config, trace_collector, live_tracker): task
             for task in tasks
         }
 
@@ -228,6 +251,7 @@ def _run_single_task(
     task: BenchmarkTask,
     config: EvaluationConfig,
     trace_collector: ExecutionTraceCollector | None = None,
+    live_tracker: LiveEvaluationTracker | None = None,
 ) -> BenchmarkResult:
     """Run a single task and return result.
 
@@ -237,6 +261,7 @@ def _run_single_task(
         task: Task to run.
         config: Evaluation configuration.
         trace_collector: Optional trace collector for saving execution data.
+        live_tracker: Optional live evaluation tracker.
 
     Returns:
         BenchmarkResult.
@@ -248,6 +273,10 @@ def _run_single_task(
     if trace_collector is not None:
         trace_collector.start_task(task)
 
+    # Start live tracking if enabled
+    if live_tracker is not None:
+        live_tracker.start_task(task)
+
     try:
         # Reset agent and environment
        agent.reset()
@@ -270,6 +299,10 @@ def _run_single_task(
             if trace_collector is not None:
                 trace_collector.record_step(steps, obs, action, reasoning)
 
+            # Record step in live tracker
+            if live_tracker is not None:
+                live_tracker.record_step(steps, obs, action, reasoning)
+
             # Record step in history
             if config.save_trajectories:
                 history.append((obs, action))
@@ -298,6 +331,10 @@ def _run_single_task(
         if trace_collector is not None:
             trace_collector.finish_task(result)
 
+        # Finish live tracking if enabled
+        if live_tracker is not None:
+            live_tracker.finish_task(result)
+
         return result
 
     except Exception as e: