openadapt-ml 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openadapt_ml/baselines/__init__.py +121 -0
- openadapt_ml/baselines/adapter.py +185 -0
- openadapt_ml/baselines/cli.py +314 -0
- openadapt_ml/baselines/config.py +448 -0
- openadapt_ml/baselines/parser.py +922 -0
- openadapt_ml/baselines/prompts.py +787 -0
- openadapt_ml/benchmarks/__init__.py +13 -115
- openadapt_ml/benchmarks/agent.py +265 -421
- openadapt_ml/benchmarks/azure.py +28 -19
- openadapt_ml/benchmarks/azure_ops_tracker.py +521 -0
- openadapt_ml/benchmarks/cli.py +1722 -4847
- openadapt_ml/benchmarks/trace_export.py +631 -0
- openadapt_ml/benchmarks/viewer.py +22 -5
- openadapt_ml/benchmarks/vm_monitor.py +530 -29
- openadapt_ml/benchmarks/waa_deploy/Dockerfile +47 -53
- openadapt_ml/benchmarks/waa_deploy/api_agent.py +21 -20
- openadapt_ml/cloud/azure_inference.py +3 -5
- openadapt_ml/cloud/lambda_labs.py +722 -307
- openadapt_ml/cloud/local.py +2038 -487
- openadapt_ml/cloud/ssh_tunnel.py +68 -26
- openadapt_ml/datasets/next_action.py +40 -30
- openadapt_ml/evals/grounding.py +8 -3
- openadapt_ml/evals/plot_eval_metrics.py +15 -13
- openadapt_ml/evals/trajectory_matching.py +41 -26
- openadapt_ml/experiments/demo_prompt/format_demo.py +16 -6
- openadapt_ml/experiments/demo_prompt/run_experiment.py +26 -16
- openadapt_ml/experiments/representation_shootout/__init__.py +70 -0
- openadapt_ml/experiments/representation_shootout/conditions.py +708 -0
- openadapt_ml/experiments/representation_shootout/config.py +390 -0
- openadapt_ml/experiments/representation_shootout/evaluator.py +659 -0
- openadapt_ml/experiments/representation_shootout/runner.py +687 -0
- openadapt_ml/experiments/waa_demo/runner.py +29 -14
- openadapt_ml/export/parquet.py +36 -24
- openadapt_ml/grounding/detector.py +18 -14
- openadapt_ml/ingest/__init__.py +8 -6
- openadapt_ml/ingest/capture.py +25 -22
- openadapt_ml/ingest/loader.py +7 -4
- openadapt_ml/ingest/synthetic.py +189 -100
- openadapt_ml/models/api_adapter.py +14 -4
- openadapt_ml/models/base_adapter.py +10 -2
- openadapt_ml/models/providers/__init__.py +288 -0
- openadapt_ml/models/providers/anthropic.py +266 -0
- openadapt_ml/models/providers/base.py +299 -0
- openadapt_ml/models/providers/google.py +376 -0
- openadapt_ml/models/providers/openai.py +342 -0
- openadapt_ml/models/qwen_vl.py +46 -19
- openadapt_ml/perception/__init__.py +35 -0
- openadapt_ml/perception/integration.py +399 -0
- openadapt_ml/retrieval/demo_retriever.py +50 -24
- openadapt_ml/retrieval/embeddings.py +9 -8
- openadapt_ml/retrieval/retriever.py +3 -1
- openadapt_ml/runtime/__init__.py +50 -0
- openadapt_ml/runtime/policy.py +18 -5
- openadapt_ml/runtime/safety_gate.py +471 -0
- openadapt_ml/schema/__init__.py +9 -0
- openadapt_ml/schema/converters.py +74 -27
- openadapt_ml/schema/episode.py +31 -18
- openadapt_ml/scripts/capture_screenshots.py +530 -0
- openadapt_ml/scripts/compare.py +85 -54
- openadapt_ml/scripts/demo_policy.py +4 -1
- openadapt_ml/scripts/eval_policy.py +15 -9
- openadapt_ml/scripts/make_gif.py +1 -1
- openadapt_ml/scripts/prepare_synthetic.py +3 -1
- openadapt_ml/scripts/train.py +21 -9
- openadapt_ml/segmentation/README.md +920 -0
- openadapt_ml/segmentation/__init__.py +97 -0
- openadapt_ml/segmentation/adapters/__init__.py +5 -0
- openadapt_ml/segmentation/adapters/capture_adapter.py +420 -0
- openadapt_ml/segmentation/annotator.py +610 -0
- openadapt_ml/segmentation/cache.py +290 -0
- openadapt_ml/segmentation/cli.py +674 -0
- openadapt_ml/segmentation/deduplicator.py +656 -0
- openadapt_ml/segmentation/frame_describer.py +788 -0
- openadapt_ml/segmentation/pipeline.py +340 -0
- openadapt_ml/segmentation/schemas.py +622 -0
- openadapt_ml/segmentation/segment_extractor.py +634 -0
- openadapt_ml/training/azure_ops_viewer.py +1097 -0
- openadapt_ml/training/benchmark_viewer.py +52 -41
- openadapt_ml/training/shared_ui.py +7 -7
- openadapt_ml/training/stub_provider.py +57 -35
- openadapt_ml/training/trainer.py +143 -86
- openadapt_ml/training/trl_trainer.py +70 -21
- openadapt_ml/training/viewer.py +323 -108
- openadapt_ml/training/viewer_components.py +180 -0
- {openadapt_ml-0.2.0.dist-info → openadapt_ml-0.2.2.dist-info}/METADATA +215 -14
- openadapt_ml-0.2.2.dist-info/RECORD +116 -0
- openadapt_ml/benchmarks/base.py +0 -366
- openadapt_ml/benchmarks/data_collection.py +0 -432
- openadapt_ml/benchmarks/live_tracker.py +0 -180
- openadapt_ml/benchmarks/runner.py +0 -418
- openadapt_ml/benchmarks/waa.py +0 -761
- openadapt_ml/benchmarks/waa_live.py +0 -619
- openadapt_ml-0.2.0.dist-info/RECORD +0 -86
- {openadapt_ml-0.2.0.dist-info → openadapt_ml-0.2.2.dist-info}/WHEEL +0 -0
- {openadapt_ml-0.2.0.dist-info → openadapt_ml-0.2.2.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,133 +1,31 @@
|
|
|
1
1
|
"""Benchmark integration for openadapt-ml.
|
|
2
2
|
|
|
3
|
-
This module provides
|
|
4
|
-
|
|
5
|
-
WebArena, and others.
|
|
3
|
+
This module provides ML-specific agents for benchmark evaluation.
|
|
4
|
+
These agents wrap openadapt-ml internals (trained policies, API adapters).
|
|
6
5
|
|
|
7
|
-
|
|
8
|
-
- BenchmarkAdapter: Abstract interface for benchmark integration
|
|
9
|
-
- BenchmarkAgent: Abstract interface for agents to be evaluated
|
|
10
|
-
- BenchmarkTask, BenchmarkObservation, BenchmarkAction: Data classes
|
|
11
|
-
|
|
12
|
-
Agent implementations:
|
|
13
|
-
- PolicyAgent: Wraps openadapt-ml AgentPolicy
|
|
14
|
-
- APIBenchmarkAgent: Uses hosted VLM APIs (Claude, GPT-5.1)
|
|
15
|
-
- ScriptedAgent: Follows predefined action sequence
|
|
16
|
-
- RandomAgent: Takes random actions (baseline)
|
|
17
|
-
|
|
18
|
-
Evaluation:
|
|
19
|
-
- evaluate_agent_on_benchmark: Run agent on benchmark tasks
|
|
20
|
-
- compute_metrics: Compute aggregate metrics from results
|
|
21
|
-
|
|
22
|
-
Example:
|
|
6
|
+
For benchmark infrastructure (adapters, runners, viewers), use openadapt-evals:
|
|
23
7
|
```python
|
|
24
|
-
from openadapt_ml.benchmarks import (
|
|
25
|
-
WAAAdapter,
|
26
|
-
|
|
27
|
-
APIBenchmarkAgent,
|
|
8
|
+
from openadapt_evals import (
|
|
9
|
+
WAAMockAdapter,
|
|
10
|
+
WAALiveAdapter,
|
|
28
11
|
evaluate_agent_on_benchmark,
|
|
29
|
-
compute_metrics,
|
|
30
12
|
)
|
|
31
|
-
|
|
32
|
-
# Create adapter for specific benchmark
|
|
33
|
-
adapter = WAAAdapter(waa_repo_path="/path/to/WAA")
|
|
34
|
-
|
|
35
|
-
# Wrap policy as benchmark agent
|
|
36
|
-
agent = PolicyAgent(policy)
|
|
37
|
-
|
|
38
|
-
# Or use API-backed agent for baselines
|
|
39
|
-
agent = APIBenchmarkAgent(provider="anthropic") # Claude
|
|
40
|
-
agent = APIBenchmarkAgent(provider="openai") # GPT-5.1
|
|
41
|
-
|
|
42
|
-
# Run evaluation
|
|
43
|
-
results = evaluate_agent_on_benchmark(agent, adapter, max_steps=50)
|
|
44
|
-
|
|
45
|
-
# Compute metrics
|
|
46
|
-
metrics = compute_metrics(results)
|
|
47
|
-
print(f"Success rate: {metrics['success_rate']:.1%}")
|
|
48
13
|
```
|
|
14
|
+
|
|
15
|
+
ML-specific agents (only available in openadapt-ml):
|
|
16
|
+
- PolicyAgent: Wraps openadapt_ml.runtime.policy.AgentPolicy
|
|
17
|
+
- APIBenchmarkAgent: Uses openadapt_ml.models.api_adapter.ApiVLMAdapter
|
|
18
|
+
- UnifiedBaselineAgent: Uses openadapt_ml.baselines adapters
|
|
49
19
|
"""
|
|
50
20
|
|
|
51
21
|
from openadapt_ml.benchmarks.agent import (
|
|
52
22
|
APIBenchmarkAgent,
|
|
53
|
-
BenchmarkAgent,
|
|
54
23
|
PolicyAgent,
|
|
55
|
-
RandomAgent,
|
56
|
-
ScriptedAgent,
|
|
57
|
-
SmartMockAgent,
|
|
58
|
-
)
|
|
59
|
-
from openadapt_ml.benchmarks.base import (
|
|
60
|
-
BenchmarkAction,
|
|
61
|
-
BenchmarkAdapter,
|
|
62
|
-
BenchmarkObservation,
|
|
63
|
-
BenchmarkResult,
|
|
64
|
-
BenchmarkTask,
|
|
65
|
-
StaticDatasetAdapter,
|
|
66
|
-
UIElement,
|
|
24
|
+
UnifiedBaselineAgent,
|
|
67
25
|
)
|
|
68
|
-
from openadapt_ml.benchmarks.runner import (
|
|
69
|
-
EvaluationConfig,
|
|
70
|
-
compute_domain_metrics,
|
|
71
|
-
compute_metrics,
|
|
72
|
-
evaluate_agent_on_benchmark,
|
|
73
|
-
)
|
|
74
|
-
from openadapt_ml.benchmarks.waa import WAAAdapter, WAAConfig, WAAMockAdapter
|
|
75
|
-
from openadapt_ml.benchmarks.waa_live import WAALiveAdapter, WAALiveConfig
|
|
76
|
-
from openadapt_ml.benchmarks.viewer import generate_benchmark_viewer
|
|
77
|
-
|
|
78
|
-
# Azure orchestration (lazy import to avoid requiring azure-ai-ml)
|
|
79
|
-
def _get_azure_classes():
|
|
80
|
-
from openadapt_ml.benchmarks.azure import (
|
|
81
|
-
AzureConfig,
|
|
82
|
-
AzureWAAOrchestrator,
|
|
83
|
-
estimate_cost,
|
|
84
|
-
)
|
|
85
|
-
return AzureConfig, AzureWAAOrchestrator, estimate_cost
|
|
86
|
-
|
|
87
26
|
|
|
88
27
|
__all__ = [
|
|
89
|
-
# Base classes
|
|
90
|
-
"BenchmarkAdapter",
|
|
91
|
-
"BenchmarkTask",
|
|
92
|
-
"BenchmarkObservation",
|
|
93
|
-
"BenchmarkAction",
|
|
94
|
-
"BenchmarkResult",
|
|
95
|
-
"StaticDatasetAdapter",
|
|
96
|
-
"UIElement",
|
|
97
|
-
# Agents
|
|
98
|
-
"BenchmarkAgent",
|
|
99
28
|
"PolicyAgent",
|
|
100
29
|
"APIBenchmarkAgent",
|
|
101
|
-
"ScriptedAgent",
|
|
102
|
-
"RandomAgent",
|
|
103
|
-
"SmartMockAgent",
|
|
104
|
-
# Evaluation
|
|
105
|
-
"EvaluationConfig",
|
|
106
|
-
"evaluate_agent_on_benchmark",
|
|
107
|
-
"compute_metrics",
|
|
108
|
-
"compute_domain_metrics",
|
|
109
|
-
# WAA
|
|
110
|
-
"WAAAdapter",
|
|
111
|
-
"WAAConfig",
|
|
112
|
-
"WAAMockAdapter",
|
|
113
|
-
"WAALiveAdapter",
|
|
114
|
-
"WAALiveConfig",
|
|
115
|
-
# Viewer
|
|
116
|
-
"generate_benchmark_viewer",
|
|
117
|
-
# Azure (lazy-loaded)
|
|
118
|
-
"AzureConfig",
|
|
119
|
-
"AzureWAAOrchestrator",
|
|
120
|
-
"estimate_cost",
|
|
30
|
+
"UnifiedBaselineAgent",
|
|
121
31
|
]
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
# Lazy loading for Azure classes (avoids requiring azure-ai-ml for basic usage)
|
|
125
|
-
def __getattr__(name: str):
|
|
126
|
-
if name in ("AzureConfig", "AzureWAAOrchestrator", "estimate_cost"):
|
|
127
|
-
from openadapt_ml.benchmarks.azure import (
|
|
128
|
-
AzureConfig,
|
|
129
|
-
AzureWAAOrchestrator,
|
|
130
|
-
estimate_cost,
|
|
131
|
-
)
|
|
132
|
-
return {"AzureConfig": AzureConfig, "AzureWAAOrchestrator": AzureWAAOrchestrator, "estimate_cost": estimate_cost}[name]
|
|
133
|
-
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
|