openadapt-ml 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openadapt_ml/__init__.py +0 -0
- openadapt_ml/benchmarks/__init__.py +125 -0
- openadapt_ml/benchmarks/agent.py +825 -0
- openadapt_ml/benchmarks/azure.py +761 -0
- openadapt_ml/benchmarks/base.py +366 -0
- openadapt_ml/benchmarks/cli.py +884 -0
- openadapt_ml/benchmarks/data_collection.py +432 -0
- openadapt_ml/benchmarks/runner.py +381 -0
- openadapt_ml/benchmarks/waa.py +704 -0
- openadapt_ml/cloud/__init__.py +5 -0
- openadapt_ml/cloud/azure_inference.py +441 -0
- openadapt_ml/cloud/lambda_labs.py +2445 -0
- openadapt_ml/cloud/local.py +790 -0
- openadapt_ml/config.py +56 -0
- openadapt_ml/datasets/__init__.py +0 -0
- openadapt_ml/datasets/next_action.py +507 -0
- openadapt_ml/evals/__init__.py +23 -0
- openadapt_ml/evals/grounding.py +241 -0
- openadapt_ml/evals/plot_eval_metrics.py +174 -0
- openadapt_ml/evals/trajectory_matching.py +486 -0
- openadapt_ml/grounding/__init__.py +45 -0
- openadapt_ml/grounding/base.py +236 -0
- openadapt_ml/grounding/detector.py +570 -0
- openadapt_ml/ingest/__init__.py +43 -0
- openadapt_ml/ingest/capture.py +312 -0
- openadapt_ml/ingest/loader.py +232 -0
- openadapt_ml/ingest/synthetic.py +1102 -0
- openadapt_ml/models/__init__.py +0 -0
- openadapt_ml/models/api_adapter.py +171 -0
- openadapt_ml/models/base_adapter.py +59 -0
- openadapt_ml/models/dummy_adapter.py +42 -0
- openadapt_ml/models/qwen_vl.py +426 -0
- openadapt_ml/runtime/__init__.py +0 -0
- openadapt_ml/runtime/policy.py +182 -0
- openadapt_ml/schemas/__init__.py +53 -0
- openadapt_ml/schemas/sessions.py +122 -0
- openadapt_ml/schemas/validation.py +252 -0
- openadapt_ml/scripts/__init__.py +0 -0
- openadapt_ml/scripts/compare.py +1490 -0
- openadapt_ml/scripts/demo_policy.py +62 -0
- openadapt_ml/scripts/eval_policy.py +287 -0
- openadapt_ml/scripts/make_gif.py +153 -0
- openadapt_ml/scripts/prepare_synthetic.py +43 -0
- openadapt_ml/scripts/run_qwen_login_benchmark.py +192 -0
- openadapt_ml/scripts/train.py +174 -0
- openadapt_ml/training/__init__.py +0 -0
- openadapt_ml/training/benchmark_viewer.py +1538 -0
- openadapt_ml/training/shared_ui.py +157 -0
- openadapt_ml/training/stub_provider.py +276 -0
- openadapt_ml/training/trainer.py +2446 -0
- openadapt_ml/training/viewer.py +2970 -0
- openadapt_ml-0.1.0.dist-info/METADATA +818 -0
- openadapt_ml-0.1.0.dist-info/RECORD +55 -0
- openadapt_ml-0.1.0.dist-info/WHEEL +4 -0
- openadapt_ml-0.1.0.dist-info/licenses/LICENSE +21 -0
openadapt_ml/__init__.py
ADDED
|
File without changes
|
|
"""Benchmark integration for openadapt-ml.

This module provides interfaces and utilities for evaluating GUI agents
on standardized benchmarks like Windows Agent Arena (WAA), OSWorld,
WebArena, and others.

Core classes:
- BenchmarkAdapter: Abstract interface for benchmark integration
- BenchmarkAgent: Abstract interface for agents to be evaluated
- BenchmarkTask, BenchmarkObservation, BenchmarkAction: Data classes

Agent implementations:
- PolicyAgent: Wraps openadapt-ml AgentPolicy
- APIBenchmarkAgent: Uses hosted VLM APIs (Claude, GPT-5.1)
- ScriptedAgent: Follows predefined action sequence
- RandomAgent: Takes random actions (baseline)

Evaluation:
- evaluate_agent_on_benchmark: Run agent on benchmark tasks
- compute_metrics: Compute aggregate metrics from results

Example:
    ```python
    from openadapt_ml.benchmarks import (
        BenchmarkAdapter,
        PolicyAgent,
        APIBenchmarkAgent,
        evaluate_agent_on_benchmark,
        compute_metrics,
    )

    # Create adapter for specific benchmark
    adapter = WAAAdapter(waa_repo_path="/path/to/WAA")

    # Wrap policy as benchmark agent
    agent = PolicyAgent(policy)

    # Or use API-backed agent for baselines
    agent = APIBenchmarkAgent(provider="anthropic")  # Claude
    agent = APIBenchmarkAgent(provider="openai")  # GPT-5.1

    # Run evaluation
    results = evaluate_agent_on_benchmark(agent, adapter, max_steps=50)

    # Compute metrics
    metrics = compute_metrics(results)
    print(f"Success rate: {metrics['success_rate']:.1%}")
    ```
"""
# Public re-exports: agents, base data classes, evaluation runner, and the
# WAA adapter. Azure classes are intentionally NOT imported here (see the
# lazy-loading hooks at the bottom of this module).
from openadapt_ml.benchmarks.agent import (
    APIBenchmarkAgent,
    BenchmarkAgent,
    PolicyAgent,
    RandomAgent,
    ScriptedAgent,
)
from openadapt_ml.benchmarks.base import (
    BenchmarkAction,
    BenchmarkAdapter,
    BenchmarkObservation,
    BenchmarkResult,
    BenchmarkTask,
    StaticDatasetAdapter,
    UIElement,
)
from openadapt_ml.benchmarks.runner import (
    EvaluationConfig,
    compute_domain_metrics,
    compute_metrics,
    evaluate_agent_on_benchmark,
)
from openadapt_ml.benchmarks.waa import WAAAdapter, WAAConfig, WAAMockAdapter
# Azure orchestration (lazy import to avoid requiring azure-ai-ml)
def _get_azure_classes():
    """Import and return the Azure orchestration symbols on demand.

    Deferring this import keeps ``azure-ai-ml`` an optional dependency:
    users who never touch the Azure integration pay no import cost.

    Returns:
        Tuple of ``(AzureConfig, AzureWAAOrchestrator, estimate_cost)``.

    Raises:
        ImportError: If the optional Azure dependencies are not installed.
    """
    from openadapt_ml.benchmarks.azure import (
        AzureConfig,
        AzureWAAOrchestrator,
        estimate_cost,
    )
    return AzureConfig, AzureWAAOrchestrator, estimate_cost
# Public API of openadapt_ml.benchmarks. The Azure entries are resolved
# lazily via the module-level __getattr__ below rather than imported above.
__all__ = [
    # Base classes
    "BenchmarkAdapter",
    "BenchmarkTask",
    "BenchmarkObservation",
    "BenchmarkAction",
    "BenchmarkResult",
    "StaticDatasetAdapter",
    "UIElement",
    # Agents
    "BenchmarkAgent",
    "PolicyAgent",
    "APIBenchmarkAgent",
    "ScriptedAgent",
    "RandomAgent",
    # Evaluation
    "EvaluationConfig",
    "evaluate_agent_on_benchmark",
    "compute_metrics",
    "compute_domain_metrics",
    # WAA
    "WAAAdapter",
    "WAAConfig",
    "WAAMockAdapter",
    # Azure (lazy-loaded)
    "AzureConfig",
    "AzureWAAOrchestrator",
    "estimate_cost",
]
# Lazy loading for Azure classes (avoids requiring azure-ai-ml for basic usage)
def __getattr__(name: str):
    """Resolve Azure orchestration attributes lazily (PEP 562 module hook).

    Importing ``openadapt_ml.benchmarks.azure`` pulls in the optional
    ``azure-ai-ml`` dependency, so the Azure symbols advertised in
    ``__all__`` are imported only on first attribute access.

    Args:
        name: Attribute being looked up on this module.

    Returns:
        The requested Azure class or function.

    Raises:
        AttributeError: If *name* is not one of the lazily-exported symbols.
    """
    lazy_azure = ("AzureConfig", "AzureWAAOrchestrator", "estimate_cost")
    if name in lazy_azure:
        # Delegate to the single lazy-import helper so the Azure symbol list
        # is maintained in exactly one place (it was previously duplicated here).
        return dict(zip(lazy_azure, _get_azure_classes()))[name]
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")