openadapt_ml-0.1.0-py3-none-any.whl
This diff shows the content of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- openadapt_ml/__init__.py +0 -0
- openadapt_ml/benchmarks/__init__.py +125 -0
- openadapt_ml/benchmarks/agent.py +825 -0
- openadapt_ml/benchmarks/azure.py +761 -0
- openadapt_ml/benchmarks/base.py +366 -0
- openadapt_ml/benchmarks/cli.py +884 -0
- openadapt_ml/benchmarks/data_collection.py +432 -0
- openadapt_ml/benchmarks/runner.py +381 -0
- openadapt_ml/benchmarks/waa.py +704 -0
- openadapt_ml/cloud/__init__.py +5 -0
- openadapt_ml/cloud/azure_inference.py +441 -0
- openadapt_ml/cloud/lambda_labs.py +2445 -0
- openadapt_ml/cloud/local.py +790 -0
- openadapt_ml/config.py +56 -0
- openadapt_ml/datasets/__init__.py +0 -0
- openadapt_ml/datasets/next_action.py +507 -0
- openadapt_ml/evals/__init__.py +23 -0
- openadapt_ml/evals/grounding.py +241 -0
- openadapt_ml/evals/plot_eval_metrics.py +174 -0
- openadapt_ml/evals/trajectory_matching.py +486 -0
- openadapt_ml/grounding/__init__.py +45 -0
- openadapt_ml/grounding/base.py +236 -0
- openadapt_ml/grounding/detector.py +570 -0
- openadapt_ml/ingest/__init__.py +43 -0
- openadapt_ml/ingest/capture.py +312 -0
- openadapt_ml/ingest/loader.py +232 -0
- openadapt_ml/ingest/synthetic.py +1102 -0
- openadapt_ml/models/__init__.py +0 -0
- openadapt_ml/models/api_adapter.py +171 -0
- openadapt_ml/models/base_adapter.py +59 -0
- openadapt_ml/models/dummy_adapter.py +42 -0
- openadapt_ml/models/qwen_vl.py +426 -0
- openadapt_ml/runtime/__init__.py +0 -0
- openadapt_ml/runtime/policy.py +182 -0
- openadapt_ml/schemas/__init__.py +53 -0
- openadapt_ml/schemas/sessions.py +122 -0
- openadapt_ml/schemas/validation.py +252 -0
- openadapt_ml/scripts/__init__.py +0 -0
- openadapt_ml/scripts/compare.py +1490 -0
- openadapt_ml/scripts/demo_policy.py +62 -0
- openadapt_ml/scripts/eval_policy.py +287 -0
- openadapt_ml/scripts/make_gif.py +153 -0
- openadapt_ml/scripts/prepare_synthetic.py +43 -0
- openadapt_ml/scripts/run_qwen_login_benchmark.py +192 -0
- openadapt_ml/scripts/train.py +174 -0
- openadapt_ml/training/__init__.py +0 -0
- openadapt_ml/training/benchmark_viewer.py +1538 -0
- openadapt_ml/training/shared_ui.py +157 -0
- openadapt_ml/training/stub_provider.py +276 -0
- openadapt_ml/training/trainer.py +2446 -0
- openadapt_ml/training/viewer.py +2970 -0
- openadapt_ml-0.1.0.dist-info/METADATA +818 -0
- openadapt_ml-0.1.0.dist-info/RECORD +55 -0
- openadapt_ml-0.1.0.dist-info/WHEEL +4 -0
- openadapt_ml-0.1.0.dist-info/licenses/LICENSE +21 -0

openadapt_ml/benchmarks/cli.py
@@ -0,0 +1,884 @@
"""CLI for WAA benchmark evaluation.

Usage:
    # Estimate costs
    python -m openadapt_ml.benchmarks.cli estimate --workers 40

    # Run local evaluation (Windows only)
    python -m openadapt_ml.benchmarks.cli run-local --waa-path /path/to/WAA --tasks notepad_1,notepad_2

    # Run Azure evaluation
    python -m openadapt_ml.benchmarks.cli run-azure --config azure_config.json --workers 40

    # Run API-backed evaluation (Claude/GPT-5.1 baseline)
    python -m openadapt_ml.benchmarks.cli run-api --provider anthropic --tasks 5
    python -m openadapt_ml.benchmarks.cli run-api --provider openai --tasks 5

    # Test with mock adapter
    python -m openadapt_ml.benchmarks.cli test-mock --tasks 20

    # Test data collection (with screenshots and execution traces)
    python -m openadapt_ml.benchmarks.cli test-collection --tasks 5
"""

from __future__ import annotations

import argparse
import json
import logging
import sys
from pathlib import Path

logger = logging.getLogger(__name__)

# Pre-configure loggers to be quiet by default (before any Azure imports)
logging.getLogger("azure").setLevel(logging.WARNING)
logging.getLogger("azure.core.pipeline.policies.http_logging_policy").setLevel(logging.WARNING)
logging.getLogger("azure.ai.ml").setLevel(logging.WARNING)
logging.getLogger("urllib3").setLevel(logging.WARNING)
logging.getLogger("msrest").setLevel(logging.WARNING)
logging.getLogger("openadapt_ml.benchmarks.azure").setLevel(logging.WARNING)

# Suppress Azure SDK experimental class warnings
import warnings
warnings.filterwarnings("ignore", message=".*experimental class.*")


def setup_logging(verbose: bool = False) -> None:
    """Configure logging with appropriate verbosity.

    Args:
        verbose: If True, show all logs. If False, suppress Azure SDK noise.
    """
    level = logging.DEBUG if verbose else logging.INFO
    logging.basicConfig(
        level=level,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    )

    # Suppress noisy Azure SDK logs unless verbose
    if not verbose:
        logging.getLogger("azure").setLevel(logging.WARNING)
        logging.getLogger("azure.core.pipeline.policies.http_logging_policy").setLevel(logging.WARNING)
        logging.getLogger("urllib3").setLevel(logging.WARNING)
        logging.getLogger("msrest").setLevel(logging.WARNING)


def find_waa_path() -> Path | None:
    """Auto-detect Windows Agent Arena repository path.

    Searches in order:
    1. vendor/WindowsAgentArena (git submodule)
    2. ../WindowsAgentArena (sibling directory)
    3. ~/WindowsAgentArena (home directory)

    Returns:
        Path to WAA repo, or None if not found.
    """
    # Get the project root (where this package is installed)
    project_root = Path(__file__).parent.parent.parent

    candidates = [
        project_root / "vendor" / "WindowsAgentArena",
        project_root.parent / "WindowsAgentArena",
        Path.home() / "WindowsAgentArena",
    ]

    for path in candidates:
        if path.exists() and (path / "src").exists():
            return path

    return None


def get_waa_path(args_path: str | None) -> Path:
    """Get WAA path from args or auto-detect.

    Args:
        args_path: Path from command line args, or None.

    Returns:
        Resolved WAA path.

    Raises:
        SystemExit: If WAA cannot be found.
    """
    if args_path:
        path = Path(args_path)
        if not path.exists():
            print(f"ERROR: WAA path does not exist: {path}")
            sys.exit(1)
        return path

    path = find_waa_path()
    if path:
        print(f"  Using WAA from: {path}")
        return path

    print("ERROR: Windows Agent Arena not found!")
    print("\nTo fix, run:")
    print("  git submodule update --init --recursive")
    print("\nOr specify path manually:")
    print("  --waa-path /path/to/WindowsAgentArena")
    sys.exit(1)


def cmd_estimate(args: argparse.Namespace) -> None:
    """Estimate Azure costs."""
    from openadapt_ml.benchmarks.azure import estimate_cost

    estimate = estimate_cost(
        num_tasks=args.tasks,
        num_workers=args.workers,
        avg_task_duration_minutes=args.duration,
        vm_hourly_cost=args.vm_cost,
    )

    print("\n=== WAA Azure Cost Estimate ===")
    print(f"Tasks: {estimate['num_tasks']}")
    print(f"Workers: {estimate['num_workers']}")
    print(f"Tasks per worker: {estimate['tasks_per_worker']:.1f}")
    print(f"Estimated duration: {estimate['estimated_duration_minutes']:.1f} minutes")
    print(f"Total VM hours: {estimate['total_vm_hours']:.2f}")
    print(f"Estimated cost: ${estimate['estimated_cost_usd']:.2f}")
    print(f"Cost per task: ${estimate['cost_per_task_usd']:.4f}")
    print()


def cmd_run_local(args: argparse.Namespace) -> None:
    """Run evaluation locally on Windows."""
    from openadapt_ml.benchmarks import (
        RandomAgent,
        WAAAdapter,
        compute_metrics,
        evaluate_agent_on_benchmark,
    )

    # Check platform
    if sys.platform != "win32" and not args.force:
        print("ERROR: WAA requires Windows. Use --force to override.")
        sys.exit(1)

    # Parse task IDs
    task_ids = None
    if args.tasks:
        task_ids = [t.strip() for t in args.tasks.split(",")]

    # Get WAA path (auto-detect if not specified)
    waa_path = get_waa_path(args.waa_path)

    # Create adapter
    adapter = WAAAdapter(waa_repo_path=waa_path)

    # Create agent (for now, just random - in practice, would load a model)
    if args.agent == "random":
        agent = RandomAgent(seed=args.seed)
    else:
        print(f"ERROR: Unknown agent type: {args.agent}")
        sys.exit(1)

    # Run evaluation
    print(f"\nRunning WAA evaluation...")
    print(f"  WAA path: {waa_path}")
    print(f"  Tasks: {len(task_ids) if task_ids else 'all (154)'}")
    print(f"  Max steps: {args.max_steps}")
    print()

    results = evaluate_agent_on_benchmark(
        agent=agent,
        adapter=adapter,
        task_ids=task_ids,
        max_steps=args.max_steps,
    )

    # Print results
    metrics = compute_metrics(results)
    print("\n=== Results ===")
    print(f"Tasks: {metrics['num_tasks']}")
    print(f"Success rate: {metrics['success_rate']:.1%}")
    print(f"Avg score: {metrics['avg_score']:.3f}")
    print(f"Avg steps: {metrics['avg_steps']:.1f}")
    print()

    # Save results
    if args.output:
        output_path = Path(args.output)
        with open(output_path, "w") as f:
            json.dump(
                {
                    "metrics": metrics,
                    "results": [
                        {
                            "task_id": r.task_id,
                            "success": r.success,
                            "score": r.score,
                            "num_steps": r.num_steps,
                            "error": r.error,
                        }
                        for r in results
                    ],
                },
                f,
                indent=2,
            )
        print(f"Results saved to: {output_path}")


def cmd_run_azure(args: argparse.Namespace) -> None:
    """Run evaluation on Azure."""
    from openadapt_ml.benchmarks import RandomAgent
    from openadapt_ml.benchmarks.azure import AzureConfig, AzureWAAOrchestrator

    # Load config
    if args.config:
        config = AzureConfig.from_json(args.config)
    else:
        config = AzureConfig.from_env()

    # Get WAA path (auto-detect if not specified)
    waa_path = get_waa_path(args.waa_path)

    # Parse task IDs
    task_ids = None
    if args.tasks:
        task_ids = [t.strip() for t in args.tasks.split(",")]

    # Create orchestrator
    orchestrator = AzureWAAOrchestrator(
        config=config,
        waa_repo_path=waa_path,
        experiment_name=args.experiment,
    )

    # Create agent
    if args.agent == "random":
        agent = RandomAgent(seed=args.seed)
    else:
        print(f"ERROR: Unknown agent type: {args.agent}")
        sys.exit(1)

    # Estimate costs first
    from openadapt_ml.benchmarks.azure import estimate_cost

    num_tasks = len(task_ids) if task_ids else 154
    estimate = estimate_cost(num_tasks=num_tasks, num_workers=args.workers)

    print(f"\n=== Azure WAA Evaluation ===")
    print(f"  Workers: {args.workers}")
    print(f"  Tasks: {num_tasks}")
    print(f"  Estimated cost: ${estimate['estimated_cost_usd']:.2f}")
    print(f"  Estimated time: {estimate['estimated_duration_minutes']:.1f} minutes")
    print()

    if not args.yes:
        response = input("Proceed? [y/N] ")
        if response.lower() != "y":
            print("Aborted.")
            sys.exit(0)

    # Run evaluation
    print("\nStarting Azure evaluation...")
    print("  (VM provisioning takes 3-5 minutes - monitor at https://ml.azure.com)")
    print()
    results = orchestrator.run_evaluation(
        agent=agent,
        num_workers=args.workers,
        task_ids=task_ids,
        max_steps_per_task=args.max_steps,
        cleanup_on_complete=not args.no_cleanup,
    )

    # Print results
    from openadapt_ml.benchmarks import compute_metrics

    metrics = compute_metrics(results)
    print("\n=== Results ===")
    print(f"Tasks: {metrics['num_tasks']}")
    print(f"Success rate: {metrics['success_rate']:.1%}")
    print(f"Avg score: {metrics['avg_score']:.3f}")
    print()

    # Save results
    if args.output:
        output_path = Path(args.output)
        with open(output_path, "w") as f:
            json.dump(
                {
                    "metrics": metrics,
                    "run_status": orchestrator.get_run_status(),
                    "results": [
                        {
                            "task_id": r.task_id,
                            "success": r.success,
                            "score": r.score,
                            "num_steps": r.num_steps,
                        }
                        for r in results
                    ],
                },
                f,
                indent=2,
            )
        print(f"Results saved to: {output_path}")


def cmd_test_mock(args: argparse.Namespace) -> None:
    """Test with mock adapter (no Windows required)."""
    from openadapt_ml.benchmarks import (
        RandomAgent,
        WAAMockAdapter,
        compute_domain_metrics,
        compute_metrics,
        evaluate_agent_on_benchmark,
    )

    print(f"\n=== Testing with Mock Adapter ===")
    print(f"  Tasks: {args.tasks}")
    print(f"  Max steps: {args.max_steps}")
    print()

    # Create mock adapter
    adapter = WAAMockAdapter(num_tasks=args.tasks)
    agent = RandomAgent(seed=args.seed)

    # Run evaluation
    results = evaluate_agent_on_benchmark(
        agent=agent,
        adapter=adapter,
        max_steps=args.max_steps,
    )

    # Print results
    metrics = compute_metrics(results)
    print("=== Results ===")
    print(f"Tasks: {metrics['num_tasks']}")
    print(f"Success rate: {metrics['success_rate']:.1%}")
    print(f"Successes: {metrics['success_count']}")
    print(f"Failures: {metrics['fail_count']}")
    print(f"Avg steps: {metrics['avg_steps']:.1f}")
    print()

    # Domain breakdown
    tasks = adapter.list_tasks()
    domain_metrics = compute_domain_metrics(results, tasks)
    if domain_metrics:
        print("=== By Domain ===")
        for domain, dm in domain_metrics.items():
            print(f"  {domain}: {dm['success_rate']:.1%} ({dm['success_count']}/{dm['num_tasks']})")
        print()


def cmd_test_collection(args: argparse.Namespace) -> None:
    """Test benchmark data collection with mock adapter.

    This command runs a benchmark evaluation with data collection enabled,
    creating a full directory structure with screenshots, execution traces,
    and metadata suitable for the benchmark viewer.
    """
    import json
    from pathlib import Path

    from openadapt_ml.benchmarks import RandomAgent, WAAMockAdapter
    from openadapt_ml.benchmarks.runner import EvaluationConfig, evaluate_agent_on_benchmark

    print(f"\n=== Testing Benchmark Data Collection ===")
    print(f"  Tasks: {args.tasks}")
    print(f"  Max steps: {args.max_steps}")
    print(f"  Output dir: {args.output}")
    print(f"  Run name: {args.run_name or '(auto-generated)'}")
    print()

    # Create mock adapter
    adapter = WAAMockAdapter(num_tasks=args.tasks, domains=["browser", "office"])
    agent = RandomAgent(action_types=["click", "type", "scroll", "done"], seed=args.seed)

    # Configure evaluation with data collection
    config = EvaluationConfig(
        max_steps=args.max_steps,
        parallel=1,
        save_trajectories=True,
        save_execution_traces=True,
        model_id=args.model_id,
        output_dir=args.output,
        run_name=args.run_name,
        verbose=True,
    )

    # Run evaluation
    results = evaluate_agent_on_benchmark(
        agent=agent,
        adapter=adapter,
        config=config,
    )

    # Print results
    success_count = sum(1 for r in results if r.success)
    success_rate = success_count / len(results) if results else 0.0
    avg_steps = sum(r.num_steps for r in results) / len(results) if results else 0.0

    print(f"\n=== Results ===")
    print(f"Total tasks: {len(results)}")
    print(f"Success: {success_count} ({success_rate:.1%})")
    print(f"Failure: {len(results) - success_count}")
    print(f"Avg steps: {avg_steps:.1f}")

    # Find the actual output directory by reading metadata
    output_dir = Path(args.output)
    run_dirs = sorted(output_dir.glob("*/metadata.json"), key=lambda p: p.stat().st_mtime, reverse=True)
    if run_dirs:
        run_dir = run_dirs[0].parent
        with open(run_dirs[0]) as f:
            metadata = json.load(f)
        run_name = metadata.get("run_name", run_dir.name)
    else:
        run_dir = output_dir
        run_name = "unknown"

    print(f"\n=== Output Directory ===")
    print(f"Location: {run_dir.absolute()}")
    print(f"\nDirectory structure:")
    print(f"  {run_dir.name}/")
    print(f"  ├── metadata.json")
    print(f"  ├── summary.json")
    print(f"  └── tasks/")
    print(f"      ├── task_001/")
    print(f"      │   ├── task.json")
    print(f"      │   ├── execution.json")
    print(f"      │   └── screenshots/")
    print(f"      │       ├── step_000.png")
    print(f"      │       ├── step_001.png")
    print(f"      │       └── ...")
    print(f"      └── ...")
    print(f"\nYou can inspect the results at: {run_dir.absolute()}")
    print()


def cmd_run_api(args: argparse.Namespace) -> None:
    """Run evaluation using API-backed VLM (Claude/GPT-5.1).

    This provides baselines for comparing against fine-tuned models.
    """
    from openadapt_ml.benchmarks import (
        APIBenchmarkAgent,
        WAAMockAdapter,
        compute_domain_metrics,
        compute_metrics,
    )
    from openadapt_ml.benchmarks.runner import EvaluationConfig, evaluate_agent_on_benchmark

    provider_names = {
        "anthropic": "Claude",
        "openai": "GPT-5.1",
    }

    print(f"\n=== API-Backed Benchmark Evaluation ===")
    print(f"  Provider: {args.provider} ({provider_names.get(args.provider, 'Unknown')})")
    print(f"  Tasks: {args.tasks}")
    print(f"  Max steps: {args.max_steps}")
    print(f"  Output dir: {args.output}")
    print()

    # Check for API key
    import os
    key_name = "ANTHROPIC_API_KEY" if args.provider == "anthropic" else "OPENAI_API_KEY"
    if not os.getenv(key_name):
        print(f"WARNING: {key_name} environment variable not set!")
        print(f"  Set it in your .env file or export it before running.")
        print()

    # Create mock adapter for testing (real WAA would require Windows)
    # In a real scenario, this would be WAAAdapter on Windows
    if args.use_real_waa:
        if sys.platform != "win32" and not args.force:
            print("ERROR: WAA requires Windows. Use --force to override.")
            sys.exit(1)
        from openadapt_ml.benchmarks import WAAAdapter
        waa_path = get_waa_path(args.waa_path)
        adapter = WAAAdapter(waa_repo_path=waa_path)
        task_ids = None
        if args.task_ids:
            task_ids = [t.strip() for t in args.task_ids.split(",")]
    else:
        adapter = WAAMockAdapter(num_tasks=args.tasks, domains=["browser", "office"])
        task_ids = None

    # Create API-backed agent
    agent = APIBenchmarkAgent(
        provider=args.provider,
        max_tokens=args.max_tokens,
        use_accessibility_tree=not args.no_a11y,
        use_history=not args.no_history,
    )

    # Configure evaluation
    model_id = args.model_id if args.model_id else f"{args.provider}-api"
    config = EvaluationConfig(
        max_steps=args.max_steps,
        parallel=1,  # API calls should be sequential to avoid rate limits
        save_trajectories=True,
        save_execution_traces=True,
        model_id=model_id,
        output_dir=args.output,
        run_name=args.run_name,
        verbose=args.verbose,
    )

    # Run evaluation
    print("Starting evaluation...")
    print("  (Each step calls the API - this may take a while)")
    print()

    try:
        results = evaluate_agent_on_benchmark(
            agent=agent,
            adapter=adapter,
            task_ids=task_ids,
            config=config,
        )
    except Exception as e:
        print(f"\nERROR: {e}")
        if "API key" in str(e) or "api_key" in str(e).lower():
            print(f"\nMake sure {key_name} is set in your environment.")
        sys.exit(1)

    # Print results
    metrics = compute_metrics(results)
    print("\n=== Results ===")
    print(f"Tasks: {metrics['num_tasks']}")
    print(f"Success rate: {metrics['success_rate']:.1%}")
    print(f"Successes: {metrics['success_count']}")
    print(f"Failures: {metrics['fail_count']}")
    print(f"Avg score: {metrics['avg_score']:.3f}")
    print(f"Avg steps: {metrics['avg_steps']:.1f}")
    print()

    # Domain breakdown
    tasks = adapter.list_tasks()
    domain_metrics = compute_domain_metrics(results, tasks)
    if domain_metrics:
        print("=== By Domain ===")
        for domain, dm in domain_metrics.items():
            print(f"  {domain}: {dm['success_rate']:.1%} ({dm['success_count']}/{dm['num_tasks']})")
        print()

    # Find output directory
    output_dir = Path(args.output)
    run_dirs = sorted(output_dir.glob("*/metadata.json"), key=lambda p: p.stat().st_mtime, reverse=True)
    if run_dirs:
        run_dir = run_dirs[0].parent
        print(f"Results saved to: {run_dir.absolute()}")
        print(f"View with: uv run python -m openadapt_ml.cloud.local serve --open")
        print()


def cmd_create_config(args: argparse.Namespace) -> None:
    """Create a sample Azure config file."""
    from openadapt_ml.benchmarks.azure import AzureConfig

    config = AzureConfig(
        subscription_id="<your-subscription-id>",
        resource_group="agents",
        workspace_name="agents_ml",
        vm_size="Standard_D4_v3",
    )

    output_path = Path(args.output)
    config.to_json(output_path)
    print(f"Sample config saved to: {output_path}")
    print("\nEdit this file with your Azure credentials before using.")


def cmd_status(args: argparse.Namespace) -> None:
    """Check Azure workspace and compute status."""
    setup_logging(args.verbose)

    # Import after logging setup to suppress Azure SDK noise
    from openadapt_ml.benchmarks.azure import AzureConfig, AzureMLClient  # noqa: E402

    print("\n=== Azure WAA Status ===\n")

    # Check config
    try:
        config = AzureConfig.from_env()
        print(f"Subscription: {config.subscription_id[:8]}...")
        print(f"Resource Group: {config.resource_group}")
        print(f"Workspace: {config.workspace_name}")
        print(f"VM Size: {config.vm_size}")
    except ValueError as e:
        print(f"Config Error: {e}")
        print("\nRun 'python scripts/setup_azure.py' to configure.")
        return

    # Check WAA
    waa_path = find_waa_path()
    if waa_path:
        print(f"WAA Path: {waa_path}")
    else:
        print("WAA Path: NOT FOUND")
        print("  Run: git submodule update --init --recursive")

    # Check Azure connection
    print("\nConnecting to Azure...")
    try:
        client = AzureMLClient(config)
        computes = client.list_compute_instances(prefix="w")
        print(f"Connection: OK")

        if computes:
            print(f"\nActive Compute Instances ({len(computes)}):")
            for name in computes:
                try:
                    status = client.get_compute_status(name)
                    print(f"  - {name}: {status}")
                except Exception:
                    print(f"  - {name}: (status unknown)")
        else:
            print("\nNo active compute instances.")

    except Exception as e:
        print(f"Connection: FAILED")
        print(f"  Error: {e}")

    print()


def cmd_cleanup(args: argparse.Namespace) -> None:
    """Clean up all Azure compute resources."""
    setup_logging(args.verbose)

    from openadapt_ml.benchmarks.azure import AzureConfig, AzureMLClient

    print("\n=== Azure WAA Cleanup ===\n")

    try:
        config = AzureConfig.from_env()
    except ValueError as e:
        print(f"Config Error: {e}")
        return

    print(f"Workspace: {config.workspace_name}")
    print(f"Resource Group: {config.resource_group}")
    print()

    client = AzureMLClient(config)

    # List ALL compute instances (no prefix filter)
    print("Finding all compute instances...")
    computes = client.list_compute_instances()  # No prefix = get all

    if not computes:
        print("  No compute instances found")
    else:
        print(f"  Found {len(computes)} compute instance(s):")
        for name in computes:
            try:
                status = client.get_compute_status(name)
            except Exception:
                status = "unknown"
            print(f"    - {name} ({status})")

        print()
        for name in computes:
            if not args.yes:
                confirm = input(f"  Delete '{name}'? [y/N]: ").strip().lower()
                if confirm != "y":
                    print(f"  Skipped {name}")
                    continue
            print(f"  Deleting {name}...", end="", flush=True)
            try:
                client.delete_compute_instance(name)
                print(" done")
            except Exception as e:
                print(f" FAILED: {e}")

    print("\nCleanup complete.")
    print("Note: Resource deletion may take a few minutes to free quota.")
    print()


def cmd_setup(args: argparse.Namespace) -> None:
    """Run full setup (Azure + WAA submodule)."""
    import subprocess

    print("\n=== OpenAdapt-ML WAA Setup ===\n")

    # Step 1: Git submodule
    print("[1/2] Checking WAA submodule...")
    waa_path = find_waa_path()
    if waa_path:
        print(f"  WAA already available at: {waa_path}")
    else:
        print("  Initializing WAA submodule...")
        try:
            subprocess.run(
                ["git", "submodule", "update", "--init", "--recursive"],
                check=True,
                capture_output=not args.verbose,
            )
            print("  WAA submodule initialized")
        except subprocess.CalledProcessError as e:
            print(f"  Failed: {e}")
            if not args.force:
                sys.exit(1)

    # Step 2: Azure setup
    print("\n[2/2] Azure setup...")
    setup_script = Path(__file__).parent.parent.parent / "scripts" / "setup_azure.py"
    if setup_script.exists():
        cmd = ["python", str(setup_script)]
        if args.yes:
            cmd.append("--yes")
        try:
            subprocess.run(cmd, check=True)
        except subprocess.CalledProcessError:
            print("  Azure setup failed or was cancelled")
            if not args.force:
                sys.exit(1)
    else:
        print(f"  Setup script not found: {setup_script}")
        print("  Run manually: python scripts/setup_azure.py")

    print("\n=== Setup Complete ===")
    print("\nNext steps:")
    print("  1. Check status: python -m openadapt_ml.benchmarks.cli status")
    print("  2. Test locally: python -m openadapt_ml.benchmarks.cli test-mock")
    print("  3. Run on Azure: python -m openadapt_ml.benchmarks.cli run-azure")
    print()


def main() -> None:
    parser = argparse.ArgumentParser(
        description="WAA Benchmark CLI - Windows Agent Arena evaluation toolkit",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Quick Start:
    # First time setup (Azure + WAA submodule)
    python -m openadapt_ml.benchmarks.cli setup

    # Check everything is configured
    python -m openadapt_ml.benchmarks.cli status

    # Test locally with mock adapter
    python -m openadapt_ml.benchmarks.cli test-mock

    # Run on Azure
    python -m openadapt_ml.benchmarks.cli run-azure
""",
    )
    subparsers = parser.add_subparsers(dest="command", help="Command to run")

    # Setup (new!)
    p_setup = subparsers.add_parser("setup", help="One-command setup (Azure + WAA)")
    p_setup.add_argument("--yes", "-y", action="store_true", help="Skip confirmation prompts")
    p_setup.add_argument("--force", action="store_true", help="Continue on errors")
    p_setup.add_argument("--verbose", "-v", action="store_true", help="Verbose output")

    # Status
    p_status = subparsers.add_parser("status", help="Check Azure and WAA status")
    p_status.add_argument("--verbose", "-v", action="store_true", help="Verbose output")

    # Cleanup
    p_cleanup = subparsers.add_parser("cleanup", help="Delete all Azure compute instances")
    p_cleanup.add_argument("--yes", "-y", action="store_true", help="Skip confirmation")
    p_cleanup.add_argument("--verbose", "-v", action="store_true", help="Verbose output")

    # Estimate costs
    p_estimate = subparsers.add_parser("estimate", help="Estimate Azure costs")
    p_estimate.add_argument("--tasks", type=int, default=154, help="Number of tasks")
    p_estimate.add_argument("--workers", type=int, default=1, help="Number of workers (default: 1 for free trial)")
    p_estimate.add_argument("--duration", type=float, default=1.0, help="Avg task duration (minutes)")
    p_estimate.add_argument("--vm-cost", type=float, default=0.19, help="VM hourly cost ($ for D4_v3)")

    # Run local
    p_local = subparsers.add_parser("run-local", help="Run evaluation locally (Windows)")
    p_local.add_argument("--waa-path", help="Path to WAA repository (auto-detected if not specified)")
    p_local.add_argument("--tasks", help="Comma-separated task IDs (default: all)")
    p_local.add_argument("--max-steps", type=int, default=15, help="Max steps per task")
    p_local.add_argument("--agent", default="random", help="Agent type")
    p_local.add_argument("--seed", type=int, default=42, help="Random seed")
    p_local.add_argument("--output", help="Output JSON path")
    p_local.add_argument("--force", action="store_true", help="Force run on non-Windows")
    p_local.add_argument("--verbose", "-v", action="store_true", help="Verbose output")

    # Run Azure
    p_azure = subparsers.add_parser("run-azure", help="Run evaluation on Azure")
    p_azure.add_argument("--config", help="Azure config JSON path")
    p_azure.add_argument("--waa-path", help="Path to WAA repository (auto-detected if not specified)")
    p_azure.add_argument("--workers", type=int, default=1, help="Number of workers (default: 1 for free trial)")
    p_azure.add_argument("--tasks", help="Comma-separated task IDs (default: all)")
    p_azure.add_argument("--max-steps", type=int, default=15, help="Max steps per task")
    p_azure.add_argument("--agent", default="random", help="Agent type")
    p_azure.add_argument("--seed", type=int, default=42, help="Random seed")
    p_azure.add_argument("--experiment", default="waa-eval", help="Experiment name")
    p_azure.add_argument("--output", help="Output JSON path")
    p_azure.add_argument("--yes", "-y", action="store_true", help="Skip confirmation")
    p_azure.add_argument("--no-cleanup", action="store_true", help="Don't delete VMs after")
    p_azure.add_argument("--verbose", "-v", action="store_true", help="Verbose output")

    # Test mock
    p_mock = subparsers.add_parser("test-mock", help="Test with mock adapter")
    p_mock.add_argument("--tasks", type=int, default=20, help="Number of mock tasks")
    p_mock.add_argument("--max-steps", type=int, default=10, help="Max steps per task")
    p_mock.add_argument("--seed", type=int, default=42, help="Random seed")

    # Test collection
    p_collection = subparsers.add_parser("test-collection", help="Test benchmark data collection")
    p_collection.add_argument("--tasks", type=int, default=5, help="Number of mock tasks (default: 5)")
    p_collection.add_argument("--max-steps", type=int, default=10, help="Max steps per task (default: 10)")
    p_collection.add_argument("--seed", type=int, default=42, help="Random seed")
    p_collection.add_argument("--model-id", default="random-agent-test", help="Model identifier")
    p_collection.add_argument("--output", default="benchmark_results", help="Output directory")
    p_collection.add_argument("--run-name", help="Run name (default: auto-generated)")

    # Run API-backed evaluation
    p_api = subparsers.add_parser("run-api", help="Run evaluation with API-backed VLM (Claude/GPT-5.1)")
    p_api.add_argument("--provider", choices=["anthropic", "openai"], default="anthropic",
                       help="API provider (anthropic=Claude, openai=GPT-5.1)")
    p_api.add_argument("--tasks", type=int, default=5, help="Number of mock tasks (default: 5)")
    p_api.add_argument("--max-steps", type=int, default=10, help="Max steps per task (default: 10)")
    p_api.add_argument("--max-tokens", type=int, default=512, help="Max tokens for API response")
    p_api.add_argument("--no-a11y", action="store_true", help="Disable accessibility tree in prompt")
    p_api.add_argument("--no-history", action="store_true", help="Disable action history in prompt")
    p_api.add_argument("--output", default="benchmark_results", help="Output directory")
    p_api.add_argument("--run-name", help="Run name (default: auto-generated)")
    p_api.add_argument("--model-id", help="Model identifier (default: {provider}-api)")
    p_api.add_argument("--use-real-waa", action="store_true", help="Use real WAA adapter (Windows only)")
    p_api.add_argument("--waa-path", help="Path to WAA repository")
    p_api.add_argument("--task-ids", help="Comma-separated task IDs for real WAA")
    p_api.add_argument("--force", action="store_true", help="Force run on non-Windows")
    p_api.add_argument("--verbose", "-v", action="store_true", help="Verbose output")

    # Create config
    p_config = subparsers.add_parser("create-config", help="Create sample Azure config")
    p_config.add_argument("--output", default="azure_config.json", help="Output path")

    args = parser.parse_args()

    if args.command == "setup":
        cmd_setup(args)
    elif args.command == "status":
        cmd_status(args)
    elif args.command == "cleanup":
        cmd_cleanup(args)
    elif args.command == "estimate":
        cmd_estimate(args)
    elif args.command == "run-local":
        setup_logging(getattr(args, 'verbose', False))
        cmd_run_local(args)
    elif args.command == "run-azure":
        setup_logging(getattr(args, 'verbose', False))
        cmd_run_azure(args)
    elif args.command == "test-mock":
        cmd_test_mock(args)
    elif args.command == "test-collection":
        cmd_test_collection(args)
    elif args.command == "run-api":
        cmd_run_api(args)
    elif args.command == "create-config":
        cmd_create_config(args)
    else:
        parser.print_help()


if __name__ == "__main__":
    main()
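
The `test-mock` command above is the quickest way to exercise this module without Windows or Azure. For readers who prefer to drive the same flow from Python instead of the CLI, a minimal sketch follows; it only uses the names that `cmd_test_mock` itself imports from `openadapt_ml.benchmarks` and mirrors its call pattern, so treat it as illustrative rather than additional package API.

```python
# Minimal sketch of the flow that `cmd_test_mock` wires up via the CLI.
# Assumes openadapt-ml 0.1.0 is installed; all names below are the ones
# the CLI imports from openadapt_ml.benchmarks.
from openadapt_ml.benchmarks import (
    RandomAgent,
    WAAMockAdapter,
    compute_metrics,
    evaluate_agent_on_benchmark,
)

adapter = WAAMockAdapter(num_tasks=5)   # synthetic tasks, no Windows VM required
agent = RandomAgent(seed=42)            # stand-in for a real policy or model
results = evaluate_agent_on_benchmark(agent=agent, adapter=adapter, max_steps=10)

metrics = compute_metrics(results)
print(f"Success rate: {metrics['success_rate']:.1%} over {metrics['num_tasks']} tasks")
```

On a Windows host with the Windows Agent Arena checkout available, `cmd_run_local` shows the corresponding real-environment variant: swap `WAAMockAdapter` for `WAAAdapter(waa_repo_path=...)` and pass explicit `task_ids`.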