PyPI - openadapt-ml - Versions diffs - 0.2.0__py3-none-any.whl → 0.2.1__py3-none-any.whl - Mend

openadapt-ml 0.2.0__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (95) hide show

openadapt_ml/baselines/__init__.py +121 -0
openadapt_ml/baselines/adapter.py +185 -0
openadapt_ml/baselines/cli.py +314 -0
openadapt_ml/baselines/config.py +448 -0
openadapt_ml/baselines/parser.py +922 -0
openadapt_ml/baselines/prompts.py +787 -0
openadapt_ml/benchmarks/__init__.py +13 -115
openadapt_ml/benchmarks/agent.py +265 -421
openadapt_ml/benchmarks/azure.py +28 -19
openadapt_ml/benchmarks/azure_ops_tracker.py +521 -0
openadapt_ml/benchmarks/cli.py +1722 -4847
openadapt_ml/benchmarks/trace_export.py +631 -0
openadapt_ml/benchmarks/viewer.py +22 -5
openadapt_ml/benchmarks/vm_monitor.py +530 -29
openadapt_ml/benchmarks/waa_deploy/Dockerfile +47 -53
openadapt_ml/benchmarks/waa_deploy/api_agent.py +21 -20
openadapt_ml/cloud/azure_inference.py +3 -5
openadapt_ml/cloud/lambda_labs.py +722 -307
openadapt_ml/cloud/local.py +2038 -487
openadapt_ml/cloud/ssh_tunnel.py +68 -26
openadapt_ml/datasets/next_action.py +40 -30
openadapt_ml/evals/grounding.py +8 -3
openadapt_ml/evals/plot_eval_metrics.py +15 -13
openadapt_ml/evals/trajectory_matching.py +41 -26
openadapt_ml/experiments/demo_prompt/format_demo.py +16 -6
openadapt_ml/experiments/demo_prompt/run_experiment.py +26 -16
openadapt_ml/experiments/representation_shootout/__init__.py +70 -0
openadapt_ml/experiments/representation_shootout/conditions.py +708 -0
openadapt_ml/experiments/representation_shootout/config.py +390 -0
openadapt_ml/experiments/representation_shootout/evaluator.py +659 -0
openadapt_ml/experiments/representation_shootout/runner.py +687 -0
openadapt_ml/experiments/waa_demo/runner.py +29 -14
openadapt_ml/export/parquet.py +36 -24
openadapt_ml/grounding/detector.py +18 -14
openadapt_ml/ingest/__init__.py +8 -6
openadapt_ml/ingest/capture.py +25 -22
openadapt_ml/ingest/loader.py +7 -4
openadapt_ml/ingest/synthetic.py +189 -100
openadapt_ml/models/api_adapter.py +14 -4
openadapt_ml/models/base_adapter.py +10 -2
openadapt_ml/models/providers/__init__.py +288 -0
openadapt_ml/models/providers/anthropic.py +266 -0
openadapt_ml/models/providers/base.py +299 -0
openadapt_ml/models/providers/google.py +376 -0
openadapt_ml/models/providers/openai.py +342 -0
openadapt_ml/models/qwen_vl.py +46 -19
openadapt_ml/perception/__init__.py +35 -0
openadapt_ml/perception/integration.py +399 -0
openadapt_ml/retrieval/demo_retriever.py +50 -24
openadapt_ml/retrieval/embeddings.py +9 -8
openadapt_ml/retrieval/retriever.py +3 -1
openadapt_ml/runtime/__init__.py +50 -0
openadapt_ml/runtime/policy.py +18 -5
openadapt_ml/runtime/safety_gate.py +471 -0
openadapt_ml/schema/__init__.py +9 -0
openadapt_ml/schema/converters.py +74 -27
openadapt_ml/schema/episode.py +31 -18
openadapt_ml/scripts/capture_screenshots.py +530 -0
openadapt_ml/scripts/compare.py +85 -54
openadapt_ml/scripts/demo_policy.py +4 -1
openadapt_ml/scripts/eval_policy.py +15 -9
openadapt_ml/scripts/make_gif.py +1 -1
openadapt_ml/scripts/prepare_synthetic.py +3 -1
openadapt_ml/scripts/train.py +21 -9
openadapt_ml/segmentation/README.md +920 -0
openadapt_ml/segmentation/__init__.py +97 -0
openadapt_ml/segmentation/adapters/__init__.py +5 -0
openadapt_ml/segmentation/adapters/capture_adapter.py +420 -0
openadapt_ml/segmentation/annotator.py +610 -0
openadapt_ml/segmentation/cache.py +290 -0
openadapt_ml/segmentation/cli.py +674 -0
openadapt_ml/segmentation/deduplicator.py +656 -0
openadapt_ml/segmentation/frame_describer.py +788 -0
openadapt_ml/segmentation/pipeline.py +340 -0
openadapt_ml/segmentation/schemas.py +622 -0
openadapt_ml/segmentation/segment_extractor.py +634 -0
openadapt_ml/training/azure_ops_viewer.py +1097 -0
openadapt_ml/training/benchmark_viewer.py +52 -41
openadapt_ml/training/shared_ui.py +7 -7
openadapt_ml/training/stub_provider.py +57 -35
openadapt_ml/training/trainer.py +143 -86
openadapt_ml/training/trl_trainer.py +70 -21
openadapt_ml/training/viewer.py +323 -108
openadapt_ml/training/viewer_components.py +180 -0
{openadapt_ml-0.2.0.dist-info → openadapt_ml-0.2.1.dist-info}/METADATA +215 -14
openadapt_ml-0.2.1.dist-info/RECORD +116 -0
openadapt_ml/benchmarks/base.py +0 -366
openadapt_ml/benchmarks/data_collection.py +0 -432
openadapt_ml/benchmarks/live_tracker.py +0 -180
openadapt_ml/benchmarks/runner.py +0 -418
openadapt_ml/benchmarks/waa.py +0 -761
openadapt_ml/benchmarks/waa_live.py +0 -619
openadapt_ml-0.2.0.dist-info/RECORD +0 -86
{openadapt_ml-0.2.0.dist-info → openadapt_ml-0.2.1.dist-info}/WHEEL +0 -0
{openadapt_ml-0.2.0.dist-info → openadapt_ml-0.2.1.dist-info}/licenses/LICENSE +0 -0

openadapt_ml/benchmarks/__init__.py CHANGED Viewed

@@ -1,133 +1,31 @@
 """Benchmark integration for openadapt-ml.
-This module provides interfaces and utilities for evaluating GUI agents
-on standardized benchmarks like Windows Agent Arena (WAA), OSWorld,
-WebArena, and others.
+This module provides ML-specific agents for benchmark evaluation.
+These agents wrap openadapt-ml internals (trained policies, API adapters).
-Core classes:
-    - BenchmarkAdapter: Abstract interface for benchmark integration
-    - BenchmarkAgent: Abstract interface for agents to be evaluated
-    - BenchmarkTask, BenchmarkObservation, BenchmarkAction: Data classes
-Agent implementations:
-    - PolicyAgent: Wraps openadapt-ml AgentPolicy
-    - APIBenchmarkAgent: Uses hosted VLM APIs (Claude, GPT-5.1)
-    - ScriptedAgent: Follows predefined action sequence
-    - RandomAgent: Takes random actions (baseline)
-Evaluation:
-    - evaluate_agent_on_benchmark: Run agent on benchmark tasks
-    - compute_metrics: Compute aggregate metrics from results
-Example:
+For benchmark infrastructure (adapters, runners, viewers), use openadapt-evals:
     ```python
-    from openadapt_ml.benchmarks import (
-        BenchmarkAdapter,
-        PolicyAgent,
-        APIBenchmarkAgent,
+    from openadapt_evals import (
+        WAAMockAdapter,
+        WAALiveAdapter,
         evaluate_agent_on_benchmark,
-        compute_metrics,
     )
-    # Create adapter for specific benchmark
-    adapter = WAAAdapter(waa_repo_path="/path/to/WAA")
-    # Wrap policy as benchmark agent
-    agent = PolicyAgent(policy)
-    # Or use API-backed agent for baselines
-    agent = APIBenchmarkAgent(provider="anthropic")  # Claude
-    agent = APIBenchmarkAgent(provider="openai")     # GPT-5.1
-    # Run evaluation
-    results = evaluate_agent_on_benchmark(agent, adapter, max_steps=50)
-    # Compute metrics
-    metrics = compute_metrics(results)
-    print(f"Success rate: {metrics['success_rate']:.1%}")
     ```
+ML-specific agents (only available in openadapt-ml):
+    - PolicyAgent: Wraps openadapt_ml.runtime.policy.AgentPolicy
+    - APIBenchmarkAgent: Uses openadapt_ml.models.api_adapter.ApiVLMAdapter
+    - UnifiedBaselineAgent: Uses openadapt_ml.baselines adapters
 """
 from openadapt_ml.benchmarks.agent import (
     APIBenchmarkAgent,
-    BenchmarkAgent,
     PolicyAgent,
-    RandomAgent,
-    ScriptedAgent,
-    SmartMockAgent,
-)
-from openadapt_ml.benchmarks.base import (
-    BenchmarkAction,
-    BenchmarkAdapter,
-    BenchmarkObservation,
-    BenchmarkResult,
-    BenchmarkTask,
-    StaticDatasetAdapter,
-    UIElement,
+    UnifiedBaselineAgent,
 )
-from openadapt_ml.benchmarks.runner import (
-    EvaluationConfig,
-    compute_domain_metrics,
-    compute_metrics,
-    evaluate_agent_on_benchmark,
-)
-from openadapt_ml.benchmarks.waa import WAAAdapter, WAAConfig, WAAMockAdapter
-from openadapt_ml.benchmarks.waa_live import WAALiveAdapter, WAALiveConfig
-from openadapt_ml.benchmarks.viewer import generate_benchmark_viewer
-# Azure orchestration (lazy import to avoid requiring azure-ai-ml)
-def _get_azure_classes():
-    from openadapt_ml.benchmarks.azure import (
-        AzureConfig,
-        AzureWAAOrchestrator,
-        estimate_cost,
-    )
-    return AzureConfig, AzureWAAOrchestrator, estimate_cost
 __all__ = [
-    # Base classes
-    "BenchmarkAdapter",
-    "BenchmarkTask",
-    "BenchmarkObservation",
-    "BenchmarkAction",
-    "BenchmarkResult",
-    "StaticDatasetAdapter",
-    "UIElement",
-    # Agents
-    "BenchmarkAgent",
     "PolicyAgent",
     "APIBenchmarkAgent",
-    "ScriptedAgent",
-    "RandomAgent",
-    "SmartMockAgent",
-    # Evaluation
-    "EvaluationConfig",
-    "evaluate_agent_on_benchmark",
-    "compute_metrics",
-    "compute_domain_metrics",
-    # WAA
-    "WAAAdapter",
-    "WAAConfig",
-    "WAAMockAdapter",
-    "WAALiveAdapter",
-    "WAALiveConfig",
-    # Viewer
-    "generate_benchmark_viewer",
-    # Azure (lazy-loaded)
-    "AzureConfig",
-    "AzureWAAOrchestrator",
-    "estimate_cost",
+    "UnifiedBaselineAgent",
 ]
-# Lazy loading for Azure classes (avoids requiring azure-ai-ml for basic usage)
-def __getattr__(name: str):
-    if name in ("AzureConfig", "AzureWAAOrchestrator", "estimate_cost"):
-        from openadapt_ml.benchmarks.azure import (
-            AzureConfig,
-            AzureWAAOrchestrator,
-            estimate_cost,
-        )
-        return {"AzureConfig": AzureConfig, "AzureWAAOrchestrator": AzureWAAOrchestrator, "estimate_cost": estimate_cost}[name]
-    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

openadapt-ml 0.2.0__py3-none-any.whl → 0.2.1__py3-none-any.whl

openadapt-ml 0.2.0__py3-none-any.whl → 0.2.1__py3-none-any.whl