openadapt_ml-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. openadapt_ml/__init__.py +0 -0
  2. openadapt_ml/benchmarks/__init__.py +125 -0
  3. openadapt_ml/benchmarks/agent.py +825 -0
  4. openadapt_ml/benchmarks/azure.py +761 -0
  5. openadapt_ml/benchmarks/base.py +366 -0
  6. openadapt_ml/benchmarks/cli.py +884 -0
  7. openadapt_ml/benchmarks/data_collection.py +432 -0
  8. openadapt_ml/benchmarks/runner.py +381 -0
  9. openadapt_ml/benchmarks/waa.py +704 -0
  10. openadapt_ml/cloud/__init__.py +5 -0
  11. openadapt_ml/cloud/azure_inference.py +441 -0
  12. openadapt_ml/cloud/lambda_labs.py +2445 -0
  13. openadapt_ml/cloud/local.py +790 -0
  14. openadapt_ml/config.py +56 -0
  15. openadapt_ml/datasets/__init__.py +0 -0
  16. openadapt_ml/datasets/next_action.py +507 -0
  17. openadapt_ml/evals/__init__.py +23 -0
  18. openadapt_ml/evals/grounding.py +241 -0
  19. openadapt_ml/evals/plot_eval_metrics.py +174 -0
  20. openadapt_ml/evals/trajectory_matching.py +486 -0
  21. openadapt_ml/grounding/__init__.py +45 -0
  22. openadapt_ml/grounding/base.py +236 -0
  23. openadapt_ml/grounding/detector.py +570 -0
  24. openadapt_ml/ingest/__init__.py +43 -0
  25. openadapt_ml/ingest/capture.py +312 -0
  26. openadapt_ml/ingest/loader.py +232 -0
  27. openadapt_ml/ingest/synthetic.py +1102 -0
  28. openadapt_ml/models/__init__.py +0 -0
  29. openadapt_ml/models/api_adapter.py +171 -0
  30. openadapt_ml/models/base_adapter.py +59 -0
  31. openadapt_ml/models/dummy_adapter.py +42 -0
  32. openadapt_ml/models/qwen_vl.py +426 -0
  33. openadapt_ml/runtime/__init__.py +0 -0
  34. openadapt_ml/runtime/policy.py +182 -0
  35. openadapt_ml/schemas/__init__.py +53 -0
  36. openadapt_ml/schemas/sessions.py +122 -0
  37. openadapt_ml/schemas/validation.py +252 -0
  38. openadapt_ml/scripts/__init__.py +0 -0
  39. openadapt_ml/scripts/compare.py +1490 -0
  40. openadapt_ml/scripts/demo_policy.py +62 -0
  41. openadapt_ml/scripts/eval_policy.py +287 -0
  42. openadapt_ml/scripts/make_gif.py +153 -0
  43. openadapt_ml/scripts/prepare_synthetic.py +43 -0
  44. openadapt_ml/scripts/run_qwen_login_benchmark.py +192 -0
  45. openadapt_ml/scripts/train.py +174 -0
  46. openadapt_ml/training/__init__.py +0 -0
  47. openadapt_ml/training/benchmark_viewer.py +1538 -0
  48. openadapt_ml/training/shared_ui.py +157 -0
  49. openadapt_ml/training/stub_provider.py +276 -0
  50. openadapt_ml/training/trainer.py +2446 -0
  51. openadapt_ml/training/viewer.py +2970 -0
  52. openadapt_ml-0.1.0.dist-info/METADATA +818 -0
  53. openadapt_ml-0.1.0.dist-info/RECORD +55 -0
  54. openadapt_ml-0.1.0.dist-info/WHEEL +4 -0
  55. openadapt_ml-0.1.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,884 @@
+ """CLI for WAA benchmark evaluation.
+
+ Usage:
+     # Estimate costs
+     python -m openadapt_ml.benchmarks.cli estimate --workers 40
+
+     # Run local evaluation (Windows only)
+     python -m openadapt_ml.benchmarks.cli run-local --waa-path /path/to/WAA --tasks notepad_1,notepad_2
+
+     # Run Azure evaluation
+     python -m openadapt_ml.benchmarks.cli run-azure --config azure_config.json --workers 40
+
+     # Run API-backed evaluation (Claude/GPT-5.1 baseline)
+     python -m openadapt_ml.benchmarks.cli run-api --provider anthropic --tasks 5
+     python -m openadapt_ml.benchmarks.cli run-api --provider openai --tasks 5
+
+     # Test with mock adapter
+     python -m openadapt_ml.benchmarks.cli test-mock --tasks 20
+
+     # Test data collection (with screenshots and execution traces)
+     python -m openadapt_ml.benchmarks.cli test-collection --tasks 5
+ """
+
+ from __future__ import annotations
+
+ import argparse
+ import json
+ import logging
+ import sys
+ from pathlib import Path
+
+ logger = logging.getLogger(__name__)
+
+ # Pre-configure loggers to be quiet by default (before any Azure imports)
+ logging.getLogger("azure").setLevel(logging.WARNING)
+ logging.getLogger("azure.core.pipeline.policies.http_logging_policy").setLevel(logging.WARNING)
+ logging.getLogger("azure.ai.ml").setLevel(logging.WARNING)
+ logging.getLogger("urllib3").setLevel(logging.WARNING)
+ logging.getLogger("msrest").setLevel(logging.WARNING)
+ logging.getLogger("openadapt_ml.benchmarks.azure").setLevel(logging.WARNING)
+
+ # Suppress Azure SDK experimental class warnings
+ import warnings
+ warnings.filterwarnings("ignore", message=".*experimental class.*")
+
+
+ def setup_logging(verbose: bool = False) -> None:
+     """Configure logging with appropriate verbosity.
+
+     Args:
+         verbose: If True, show all logs. If False, suppress Azure SDK noise.
+     """
+     level = logging.DEBUG if verbose else logging.INFO
+     logging.basicConfig(
+         level=level,
+         format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+     )
+
+     # Suppress noisy Azure SDK logs unless verbose
+     if not verbose:
+         logging.getLogger("azure").setLevel(logging.WARNING)
+         logging.getLogger("azure.core.pipeline.policies.http_logging_policy").setLevel(logging.WARNING)
+         logging.getLogger("urllib3").setLevel(logging.WARNING)
+         logging.getLogger("msrest").setLevel(logging.WARNING)
+
+
+ def find_waa_path() -> Path | None:
+     """Auto-detect Windows Agent Arena repository path.
+
+     Searches in order:
+     1. vendor/WindowsAgentArena (git submodule)
+     2. ../WindowsAgentArena (sibling directory)
+     3. ~/WindowsAgentArena (home directory)
+
+     Returns:
+         Path to WAA repo, or None if not found.
+     """
+     # Get the project root (where this package is installed)
+     project_root = Path(__file__).parent.parent.parent
+
+     candidates = [
+         project_root / "vendor" / "WindowsAgentArena",
+         project_root.parent / "WindowsAgentArena",
+         Path.home() / "WindowsAgentArena",
+     ]
+
+     for path in candidates:
+         if path.exists() and (path / "src").exists():
+             return path
+
+     return None
+
+
+ def get_waa_path(args_path: str | None) -> Path:
+     """Get WAA path from args or auto-detect.
+
+     Args:
+         args_path: Path from command line args, or None.
+
+     Returns:
+         Resolved WAA path.
+
+     Raises:
+         SystemExit: If WAA cannot be found.
+     """
+     if args_path:
+         path = Path(args_path)
+         if not path.exists():
+             print(f"ERROR: WAA path does not exist: {path}")
+             sys.exit(1)
+         return path
+
+     path = find_waa_path()
+     if path:
+         print(f" Using WAA from: {path}")
+         return path
+
+     print("ERROR: Windows Agent Arena not found!")
+     print("\nTo fix, run:")
+     print(" git submodule update --init --recursive")
+     print("\nOr specify path manually:")
+     print(" --waa-path /path/to/WindowsAgentArena")
+     sys.exit(1)
+
+
+ def cmd_estimate(args: argparse.Namespace) -> None:
+     """Estimate Azure costs."""
+     from openadapt_ml.benchmarks.azure import estimate_cost
+
+     estimate = estimate_cost(
+         num_tasks=args.tasks,
+         num_workers=args.workers,
+         avg_task_duration_minutes=args.duration,
+         vm_hourly_cost=args.vm_cost,
+     )
+
+     print("\n=== WAA Azure Cost Estimate ===")
+     print(f"Tasks: {estimate['num_tasks']}")
+     print(f"Workers: {estimate['num_workers']}")
+     print(f"Tasks per worker: {estimate['tasks_per_worker']:.1f}")
+     print(f"Estimated duration: {estimate['estimated_duration_minutes']:.1f} minutes")
+     print(f"Total VM hours: {estimate['total_vm_hours']:.2f}")
+     print(f"Estimated cost: ${estimate['estimated_cost_usd']:.2f}")
+     print(f"Cost per task: ${estimate['cost_per_task_usd']:.4f}")
+     print()
+
+
+ def cmd_run_local(args: argparse.Namespace) -> None:
+     """Run evaluation locally on Windows."""
+     from openadapt_ml.benchmarks import (
+         RandomAgent,
+         WAAAdapter,
+         compute_metrics,
+         evaluate_agent_on_benchmark,
+     )
+
+     # Check platform
+     if sys.platform != "win32" and not args.force:
+         print("ERROR: WAA requires Windows. Use --force to override.")
+         sys.exit(1)
+
+     # Parse task IDs
+     task_ids = None
+     if args.tasks:
+         task_ids = [t.strip() for t in args.tasks.split(",")]
+
+     # Get WAA path (auto-detect if not specified)
+     waa_path = get_waa_path(args.waa_path)
+
+     # Create adapter
+     adapter = WAAAdapter(waa_repo_path=waa_path)
+
+     # Create agent (for now, just random - in practice, would load a model)
+     if args.agent == "random":
+         agent = RandomAgent(seed=args.seed)
+     else:
+         print(f"ERROR: Unknown agent type: {args.agent}")
+         sys.exit(1)
+
+     # Run evaluation
+     print(f"\nRunning WAA evaluation...")
+     print(f" WAA path: {waa_path}")
+     print(f" Tasks: {len(task_ids) if task_ids else 'all (154)'}")
+     print(f" Max steps: {args.max_steps}")
+     print()
+
+     results = evaluate_agent_on_benchmark(
+         agent=agent,
+         adapter=adapter,
+         task_ids=task_ids,
+         max_steps=args.max_steps,
+     )
+
+     # Print results
+     metrics = compute_metrics(results)
+     print("\n=== Results ===")
+     print(f"Tasks: {metrics['num_tasks']}")
+     print(f"Success rate: {metrics['success_rate']:.1%}")
+     print(f"Avg score: {metrics['avg_score']:.3f}")
+     print(f"Avg steps: {metrics['avg_steps']:.1f}")
+     print()
+
+     # Save results
+     if args.output:
+         output_path = Path(args.output)
+         with open(output_path, "w") as f:
+             json.dump(
+                 {
+                     "metrics": metrics,
+                     "results": [
+                         {
+                             "task_id": r.task_id,
+                             "success": r.success,
+                             "score": r.score,
+                             "num_steps": r.num_steps,
+                             "error": r.error,
+                         }
+                         for r in results
+                     ],
+                 },
+                 f,
+                 indent=2,
+             )
+         print(f"Results saved to: {output_path}")
+
+
+ def cmd_run_azure(args: argparse.Namespace) -> None:
+     """Run evaluation on Azure."""
+     from openadapt_ml.benchmarks import RandomAgent
+     from openadapt_ml.benchmarks.azure import AzureConfig, AzureWAAOrchestrator
+
+     # Load config
+     if args.config:
+         config = AzureConfig.from_json(args.config)
+     else:
+         config = AzureConfig.from_env()
+
+     # Get WAA path (auto-detect if not specified)
+     waa_path = get_waa_path(args.waa_path)
+
+     # Parse task IDs
+     task_ids = None
+     if args.tasks:
+         task_ids = [t.strip() for t in args.tasks.split(",")]
+
+     # Create orchestrator
+     orchestrator = AzureWAAOrchestrator(
+         config=config,
+         waa_repo_path=waa_path,
+         experiment_name=args.experiment,
+     )
+
+     # Create agent
+     if args.agent == "random":
+         agent = RandomAgent(seed=args.seed)
+     else:
+         print(f"ERROR: Unknown agent type: {args.agent}")
+         sys.exit(1)
+
+     # Estimate costs first
+     from openadapt_ml.benchmarks.azure import estimate_cost
+
+     num_tasks = len(task_ids) if task_ids else 154
+     estimate = estimate_cost(num_tasks=num_tasks, num_workers=args.workers)
+
+     print(f"\n=== Azure WAA Evaluation ===")
+     print(f" Workers: {args.workers}")
+     print(f" Tasks: {num_tasks}")
+     print(f" Estimated cost: ${estimate['estimated_cost_usd']:.2f}")
+     print(f" Estimated time: {estimate['estimated_duration_minutes']:.1f} minutes")
+     print()
+
+     if not args.yes:
+         response = input("Proceed? [y/N] ")
+         if response.lower() != "y":
+             print("Aborted.")
+             sys.exit(0)
+
+     # Run evaluation
+     print("\nStarting Azure evaluation...")
+     print(" (VM provisioning takes 3-5 minutes - monitor at https://ml.azure.com)")
+     print()
+     results = orchestrator.run_evaluation(
+         agent=agent,
+         num_workers=args.workers,
+         task_ids=task_ids,
+         max_steps_per_task=args.max_steps,
+         cleanup_on_complete=not args.no_cleanup,
+     )
+
+     # Print results
+     from openadapt_ml.benchmarks import compute_metrics
+
+     metrics = compute_metrics(results)
+     print("\n=== Results ===")
+     print(f"Tasks: {metrics['num_tasks']}")
+     print(f"Success rate: {metrics['success_rate']:.1%}")
+     print(f"Avg score: {metrics['avg_score']:.3f}")
+     print()
+
+     # Save results
+     if args.output:
+         output_path = Path(args.output)
+         with open(output_path, "w") as f:
+             json.dump(
+                 {
+                     "metrics": metrics,
+                     "run_status": orchestrator.get_run_status(),
+                     "results": [
+                         {
+                             "task_id": r.task_id,
+                             "success": r.success,
+                             "score": r.score,
+                             "num_steps": r.num_steps,
+                         }
+                         for r in results
+                     ],
+                 },
+                 f,
+                 indent=2,
+             )
+         print(f"Results saved to: {output_path}")
+
+
+ def cmd_test_mock(args: argparse.Namespace) -> None:
+     """Test with mock adapter (no Windows required)."""
+     from openadapt_ml.benchmarks import (
+         RandomAgent,
+         WAAMockAdapter,
+         compute_domain_metrics,
+         compute_metrics,
+         evaluate_agent_on_benchmark,
+     )
+
+     print(f"\n=== Testing with Mock Adapter ===")
+     print(f" Tasks: {args.tasks}")
+     print(f" Max steps: {args.max_steps}")
+     print()
+
+     # Create mock adapter
+     adapter = WAAMockAdapter(num_tasks=args.tasks)
+     agent = RandomAgent(seed=args.seed)
+
+     # Run evaluation
+     results = evaluate_agent_on_benchmark(
+         agent=agent,
+         adapter=adapter,
+         max_steps=args.max_steps,
+     )
+
+     # Print results
+     metrics = compute_metrics(results)
+     print("=== Results ===")
+     print(f"Tasks: {metrics['num_tasks']}")
+     print(f"Success rate: {metrics['success_rate']:.1%}")
+     print(f"Successes: {metrics['success_count']}")
+     print(f"Failures: {metrics['fail_count']}")
+     print(f"Avg steps: {metrics['avg_steps']:.1f}")
+     print()
+
+     # Domain breakdown
+     tasks = adapter.list_tasks()
+     domain_metrics = compute_domain_metrics(results, tasks)
+     if domain_metrics:
+         print("=== By Domain ===")
+         for domain, dm in domain_metrics.items():
+             print(f" {domain}: {dm['success_rate']:.1%} ({dm['success_count']}/{dm['num_tasks']})")
+         print()
+
+
+ def cmd_test_collection(args: argparse.Namespace) -> None:
+     """Test benchmark data collection with mock adapter.
+
+     This command runs a benchmark evaluation with data collection enabled,
+     creating a full directory structure with screenshots, execution traces,
+     and metadata suitable for the benchmark viewer.
+     """
+     import json
+     from pathlib import Path
+
+     from openadapt_ml.benchmarks import RandomAgent, WAAMockAdapter
+     from openadapt_ml.benchmarks.runner import EvaluationConfig, evaluate_agent_on_benchmark
+
+     print(f"\n=== Testing Benchmark Data Collection ===")
+     print(f" Tasks: {args.tasks}")
+     print(f" Max steps: {args.max_steps}")
+     print(f" Output dir: {args.output}")
+     print(f" Run name: {args.run_name or '(auto-generated)'}")
+     print()
+
+     # Create mock adapter
+     adapter = WAAMockAdapter(num_tasks=args.tasks, domains=["browser", "office"])
+     agent = RandomAgent(action_types=["click", "type", "scroll", "done"], seed=args.seed)
+
+     # Configure evaluation with data collection
+     config = EvaluationConfig(
+         max_steps=args.max_steps,
+         parallel=1,
+         save_trajectories=True,
+         save_execution_traces=True,
+         model_id=args.model_id,
+         output_dir=args.output,
+         run_name=args.run_name,
+         verbose=True,
+     )
+
+     # Run evaluation
+     results = evaluate_agent_on_benchmark(
+         agent=agent,
+         adapter=adapter,
+         config=config,
+     )
+
+     # Print results
+     success_count = sum(1 for r in results if r.success)
+     success_rate = success_count / len(results) if results else 0.0
+     avg_steps = sum(r.num_steps for r in results) / len(results) if results else 0.0
+
+     print(f"\n=== Results ===")
+     print(f"Total tasks: {len(results)}")
+     print(f"Success: {success_count} ({success_rate:.1%})")
+     print(f"Failure: {len(results) - success_count}")
+     print(f"Avg steps: {avg_steps:.1f}")
+
+     # Find the actual output directory by reading metadata
+     output_dir = Path(args.output)
+     run_dirs = sorted(output_dir.glob("*/metadata.json"), key=lambda p: p.stat().st_mtime, reverse=True)
+     if run_dirs:
+         run_dir = run_dirs[0].parent
+         with open(run_dirs[0]) as f:
+             metadata = json.load(f)
+         run_name = metadata.get("run_name", run_dir.name)
+     else:
+         run_dir = output_dir
+         run_name = "unknown"
+
+     print(f"\n=== Output Directory ===")
+     print(f"Location: {run_dir.absolute()}")
+     print(f"\nDirectory structure:")
+     print(f" {run_dir.name}/")
+     print(f" ├── metadata.json")
+     print(f" ├── summary.json")
+     print(f" └── tasks/")
+     print(f" ├── task_001/")
+     print(f" │ ├── task.json")
+     print(f" │ ├── execution.json")
+     print(f" │ └── screenshots/")
+     print(f" │ ├── step_000.png")
+     print(f" │ ├── step_001.png")
+     print(f" │ └── ...")
+     print(f" └── ...")
+     print(f"\nYou can inspect the results at: {run_dir.absolute()}")
+     print()
+
+
+ def cmd_run_api(args: argparse.Namespace) -> None:
+     """Run evaluation using API-backed VLM (Claude/GPT-5.1).
+
+     This provides baselines for comparing against fine-tuned models.
+     """
+     from openadapt_ml.benchmarks import (
+         APIBenchmarkAgent,
+         WAAMockAdapter,
+         compute_domain_metrics,
+         compute_metrics,
+     )
+     from openadapt_ml.benchmarks.runner import EvaluationConfig, evaluate_agent_on_benchmark
+
+     provider_names = {
+         "anthropic": "Claude",
+         "openai": "GPT-5.1",
+     }
+
+     print(f"\n=== API-Backed Benchmark Evaluation ===")
+     print(f" Provider: {args.provider} ({provider_names.get(args.provider, 'Unknown')})")
+     print(f" Tasks: {args.tasks}")
+     print(f" Max steps: {args.max_steps}")
+     print(f" Output dir: {args.output}")
+     print()
+
+     # Check for API key
+     import os
+     key_name = "ANTHROPIC_API_KEY" if args.provider == "anthropic" else "OPENAI_API_KEY"
+     if not os.getenv(key_name):
+         print(f"WARNING: {key_name} environment variable not set!")
+         print(f" Set it in your .env file or export it before running.")
+         print()
+
+     # Create mock adapter for testing (real WAA would require Windows)
+     # In a real scenario, this would be WAAAdapter on Windows
+     if args.use_real_waa:
+         if sys.platform != "win32" and not args.force:
+             print("ERROR: WAA requires Windows. Use --force to override.")
+             sys.exit(1)
+         from openadapt_ml.benchmarks import WAAAdapter
+         waa_path = get_waa_path(args.waa_path)
+         adapter = WAAAdapter(waa_repo_path=waa_path)
+         task_ids = None
+         if args.task_ids:
+             task_ids = [t.strip() for t in args.task_ids.split(",")]
+     else:
+         adapter = WAAMockAdapter(num_tasks=args.tasks, domains=["browser", "office"])
+         task_ids = None
+
+     # Create API-backed agent
+     agent = APIBenchmarkAgent(
+         provider=args.provider,
+         max_tokens=args.max_tokens,
+         use_accessibility_tree=not args.no_a11y,
+         use_history=not args.no_history,
+     )
+
+     # Configure evaluation
+     model_id = args.model_id if args.model_id else f"{args.provider}-api"
+     config = EvaluationConfig(
+         max_steps=args.max_steps,
+         parallel=1,  # API calls should be sequential to avoid rate limits
+         save_trajectories=True,
+         save_execution_traces=True,
+         model_id=model_id,
+         output_dir=args.output,
+         run_name=args.run_name,
+         verbose=args.verbose,
+     )
+
+     # Run evaluation
+     print("Starting evaluation...")
+     print(" (Each step calls the API - this may take a while)")
+     print()
+
+     try:
+         results = evaluate_agent_on_benchmark(
+             agent=agent,
+             adapter=adapter,
+             task_ids=task_ids,
+             config=config,
+         )
+     except Exception as e:
+         print(f"\nERROR: {e}")
+         if "API key" in str(e) or "api_key" in str(e).lower():
+             print(f"\nMake sure {key_name} is set in your environment.")
+         sys.exit(1)
+
+     # Print results
+     metrics = compute_metrics(results)
+     print("\n=== Results ===")
+     print(f"Tasks: {metrics['num_tasks']}")
+     print(f"Success rate: {metrics['success_rate']:.1%}")
+     print(f"Successes: {metrics['success_count']}")
+     print(f"Failures: {metrics['fail_count']}")
+     print(f"Avg score: {metrics['avg_score']:.3f}")
+     print(f"Avg steps: {metrics['avg_steps']:.1f}")
+     print()
+
+     # Domain breakdown
+     tasks = adapter.list_tasks()
+     domain_metrics = compute_domain_metrics(results, tasks)
+     if domain_metrics:
+         print("=== By Domain ===")
+         for domain, dm in domain_metrics.items():
+             print(f" {domain}: {dm['success_rate']:.1%} ({dm['success_count']}/{dm['num_tasks']})")
+         print()
+
+     # Find output directory
+     output_dir = Path(args.output)
+     run_dirs = sorted(output_dir.glob("*/metadata.json"), key=lambda p: p.stat().st_mtime, reverse=True)
+     if run_dirs:
+         run_dir = run_dirs[0].parent
+         print(f"Results saved to: {run_dir.absolute()}")
+         print(f"View with: uv run python -m openadapt_ml.cloud.local serve --open")
+         print()
+
+
+ def cmd_create_config(args: argparse.Namespace) -> None:
+     """Create a sample Azure config file."""
+     from openadapt_ml.benchmarks.azure import AzureConfig
+
+     config = AzureConfig(
+         subscription_id="<your-subscription-id>",
+         resource_group="agents",
+         workspace_name="agents_ml",
+         vm_size="Standard_D4_v3",
+     )
+
+     output_path = Path(args.output)
+     config.to_json(output_path)
+     print(f"Sample config saved to: {output_path}")
+     print("\nEdit this file with your Azure credentials before using.")
+
+
+ def cmd_status(args: argparse.Namespace) -> None:
+     """Check Azure workspace and compute status."""
+     setup_logging(args.verbose)
+
+     # Import after logging setup to suppress Azure SDK noise
+     from openadapt_ml.benchmarks.azure import AzureConfig, AzureMLClient  # noqa: E402
+
+     print("\n=== Azure WAA Status ===\n")
+
+     # Check config
+     try:
+         config = AzureConfig.from_env()
+         print(f"Subscription: {config.subscription_id[:8]}...")
+         print(f"Resource Group: {config.resource_group}")
+         print(f"Workspace: {config.workspace_name}")
+         print(f"VM Size: {config.vm_size}")
+     except ValueError as e:
+         print(f"Config Error: {e}")
+         print("\nRun 'python scripts/setup_azure.py' to configure.")
+         return
+
+     # Check WAA
+     waa_path = find_waa_path()
+     if waa_path:
+         print(f"WAA Path: {waa_path}")
+     else:
+         print("WAA Path: NOT FOUND")
+         print(" Run: git submodule update --init --recursive")
+
+     # Check Azure connection
+     print("\nConnecting to Azure...")
+     try:
+         client = AzureMLClient(config)
+         computes = client.list_compute_instances(prefix="w")
+         print(f"Connection: OK")
+
+         if computes:
+             print(f"\nActive Compute Instances ({len(computes)}):")
+             for name in computes:
+                 try:
+                     status = client.get_compute_status(name)
+                     print(f" - {name}: {status}")
+                 except Exception:
+                     print(f" - {name}: (status unknown)")
+         else:
+             print("\nNo active compute instances.")
+
+     except Exception as e:
+         print(f"Connection: FAILED")
+         print(f" Error: {e}")
+
+     print()
+
+
+ def cmd_cleanup(args: argparse.Namespace) -> None:
+     """Clean up all Azure compute resources."""
+     setup_logging(args.verbose)
+
+     from openadapt_ml.benchmarks.azure import AzureConfig, AzureMLClient
+
+     print("\n=== Azure WAA Cleanup ===\n")
+
+     try:
+         config = AzureConfig.from_env()
+     except ValueError as e:
+         print(f"Config Error: {e}")
+         return
+
+     print(f"Workspace: {config.workspace_name}")
+     print(f"Resource Group: {config.resource_group}")
+     print()
+
+     client = AzureMLClient(config)
+
+     # List ALL compute instances (no prefix filter)
+     print("Finding all compute instances...")
+     computes = client.list_compute_instances()  # No prefix = get all
+
+     if not computes:
+         print(" No compute instances found")
+     else:
+         print(f" Found {len(computes)} compute instance(s):")
+         for name in computes:
+             try:
+                 status = client.get_compute_status(name)
+             except Exception:
+                 status = "unknown"
+             print(f" - {name} ({status})")
+
+         print()
+         for name in computes:
+             if not args.yes:
+                 confirm = input(f" Delete '{name}'? [y/N]: ").strip().lower()
+                 if confirm != "y":
+                     print(f" Skipped {name}")
+                     continue
+             print(f" Deleting {name}...", end="", flush=True)
+             try:
+                 client.delete_compute_instance(name)
+                 print(" done")
+             except Exception as e:
+                 print(f" FAILED: {e}")
+
+     print("\nCleanup complete.")
+     print("Note: Resource deletion may take a few minutes to free quota.")
+     print()
+
+
+ def cmd_setup(args: argparse.Namespace) -> None:
+     """Run full setup (Azure + WAA submodule)."""
+     import subprocess
+
+     print("\n=== OpenAdapt-ML WAA Setup ===\n")
+
+     # Step 1: Git submodule
+     print("[1/2] Checking WAA submodule...")
+     waa_path = find_waa_path()
+     if waa_path:
+         print(f" WAA already available at: {waa_path}")
+     else:
+         print(" Initializing WAA submodule...")
+         try:
+             subprocess.run(
+                 ["git", "submodule", "update", "--init", "--recursive"],
+                 check=True,
+                 capture_output=not args.verbose,
+             )
+             print(" WAA submodule initialized")
+         except subprocess.CalledProcessError as e:
+             print(f" Failed: {e}")
+             if not args.force:
+                 sys.exit(1)
+
+     # Step 2: Azure setup
+     print("\n[2/2] Azure setup...")
+     setup_script = Path(__file__).parent.parent.parent / "scripts" / "setup_azure.py"
+     if setup_script.exists():
+         cmd = ["python", str(setup_script)]
+         if args.yes:
+             cmd.append("--yes")
+         try:
+             subprocess.run(cmd, check=True)
+         except subprocess.CalledProcessError:
+             print(" Azure setup failed or was cancelled")
+             if not args.force:
+                 sys.exit(1)
+     else:
+         print(f" Setup script not found: {setup_script}")
+         print(" Run manually: python scripts/setup_azure.py")
+
+     print("\n=== Setup Complete ===")
+     print("\nNext steps:")
+     print(" 1. Check status: python -m openadapt_ml.benchmarks.cli status")
+     print(" 2. Test locally: python -m openadapt_ml.benchmarks.cli test-mock")
+     print(" 3. Run on Azure: python -m openadapt_ml.benchmarks.cli run-azure")
+     print()
+
+
+ def main() -> None:
+     parser = argparse.ArgumentParser(
+         description="WAA Benchmark CLI - Windows Agent Arena evaluation toolkit",
+         formatter_class=argparse.RawDescriptionHelpFormatter,
+         epilog="""
+ Quick Start:
+     # First time setup (Azure + WAA submodule)
+     python -m openadapt_ml.benchmarks.cli setup
+
+     # Check everything is configured
+     python -m openadapt_ml.benchmarks.cli status
+
+     # Test locally with mock adapter
+     python -m openadapt_ml.benchmarks.cli test-mock
+
+     # Run on Azure
+     python -m openadapt_ml.benchmarks.cli run-azure
+ """,
+     )
+     subparsers = parser.add_subparsers(dest="command", help="Command to run")
+
+     # Setup (new!)
+     p_setup = subparsers.add_parser("setup", help="One-command setup (Azure + WAA)")
+     p_setup.add_argument("--yes", "-y", action="store_true", help="Skip confirmation prompts")
+     p_setup.add_argument("--force", action="store_true", help="Continue on errors")
+     p_setup.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
+
+     # Status
+     p_status = subparsers.add_parser("status", help="Check Azure and WAA status")
+     p_status.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
+
+     # Cleanup
+     p_cleanup = subparsers.add_parser("cleanup", help="Delete all Azure compute instances")
+     p_cleanup.add_argument("--yes", "-y", action="store_true", help="Skip confirmation")
+     p_cleanup.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
+
+     # Estimate costs
+     p_estimate = subparsers.add_parser("estimate", help="Estimate Azure costs")
+     p_estimate.add_argument("--tasks", type=int, default=154, help="Number of tasks")
+     p_estimate.add_argument("--workers", type=int, default=1, help="Number of workers (default: 1 for free trial)")
+     p_estimate.add_argument("--duration", type=float, default=1.0, help="Avg task duration (minutes)")
+     p_estimate.add_argument("--vm-cost", type=float, default=0.19, help="VM hourly cost ($ for D4_v3)")
+
+     # Run local
+     p_local = subparsers.add_parser("run-local", help="Run evaluation locally (Windows)")
+     p_local.add_argument("--waa-path", help="Path to WAA repository (auto-detected if not specified)")
+     p_local.add_argument("--tasks", help="Comma-separated task IDs (default: all)")
+     p_local.add_argument("--max-steps", type=int, default=15, help="Max steps per task")
+     p_local.add_argument("--agent", default="random", help="Agent type")
+     p_local.add_argument("--seed", type=int, default=42, help="Random seed")
+     p_local.add_argument("--output", help="Output JSON path")
+     p_local.add_argument("--force", action="store_true", help="Force run on non-Windows")
+     p_local.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
+
+     # Run Azure
+     p_azure = subparsers.add_parser("run-azure", help="Run evaluation on Azure")
+     p_azure.add_argument("--config", help="Azure config JSON path")
+     p_azure.add_argument("--waa-path", help="Path to WAA repository (auto-detected if not specified)")
+     p_azure.add_argument("--workers", type=int, default=1, help="Number of workers (default: 1 for free trial)")
+     p_azure.add_argument("--tasks", help="Comma-separated task IDs (default: all)")
+     p_azure.add_argument("--max-steps", type=int, default=15, help="Max steps per task")
+     p_azure.add_argument("--agent", default="random", help="Agent type")
+     p_azure.add_argument("--seed", type=int, default=42, help="Random seed")
+     p_azure.add_argument("--experiment", default="waa-eval", help="Experiment name")
+     p_azure.add_argument("--output", help="Output JSON path")
+     p_azure.add_argument("--yes", "-y", action="store_true", help="Skip confirmation")
+     p_azure.add_argument("--no-cleanup", action="store_true", help="Don't delete VMs after")
+     p_azure.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
+
+     # Test mock
+     p_mock = subparsers.add_parser("test-mock", help="Test with mock adapter")
+     p_mock.add_argument("--tasks", type=int, default=20, help="Number of mock tasks")
+     p_mock.add_argument("--max-steps", type=int, default=10, help="Max steps per task")
+     p_mock.add_argument("--seed", type=int, default=42, help="Random seed")
+
+     # Test collection
+     p_collection = subparsers.add_parser("test-collection", help="Test benchmark data collection")
+     p_collection.add_argument("--tasks", type=int, default=5, help="Number of mock tasks (default: 5)")
+     p_collection.add_argument("--max-steps", type=int, default=10, help="Max steps per task (default: 10)")
+     p_collection.add_argument("--seed", type=int, default=42, help="Random seed")
+     p_collection.add_argument("--model-id", default="random-agent-test", help="Model identifier")
+     p_collection.add_argument("--output", default="benchmark_results", help="Output directory")
+     p_collection.add_argument("--run-name", help="Run name (default: auto-generated)")
+
+     # Run API-backed evaluation
+     p_api = subparsers.add_parser("run-api", help="Run evaluation with API-backed VLM (Claude/GPT-5.1)")
+     p_api.add_argument("--provider", choices=["anthropic", "openai"], default="anthropic",
+                        help="API provider (anthropic=Claude, openai=GPT-5.1)")
+     p_api.add_argument("--tasks", type=int, default=5, help="Number of mock tasks (default: 5)")
+     p_api.add_argument("--max-steps", type=int, default=10, help="Max steps per task (default: 10)")
+     p_api.add_argument("--max-tokens", type=int, default=512, help="Max tokens for API response")
+     p_api.add_argument("--no-a11y", action="store_true", help="Disable accessibility tree in prompt")
+     p_api.add_argument("--no-history", action="store_true", help="Disable action history in prompt")
+     p_api.add_argument("--output", default="benchmark_results", help="Output directory")
+     p_api.add_argument("--run-name", help="Run name (default: auto-generated)")
+     p_api.add_argument("--model-id", help="Model identifier (default: {provider}-api)")
+     p_api.add_argument("--use-real-waa", action="store_true", help="Use real WAA adapter (Windows only)")
+     p_api.add_argument("--waa-path", help="Path to WAA repository")
+     p_api.add_argument("--task-ids", help="Comma-separated task IDs for real WAA")
+     p_api.add_argument("--force", action="store_true", help="Force run on non-Windows")
+     p_api.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
+
+     # Create config
+     p_config = subparsers.add_parser("create-config", help="Create sample Azure config")
+     p_config.add_argument("--output", default="azure_config.json", help="Output path")
+
+     args = parser.parse_args()
+
+     if args.command == "setup":
+         cmd_setup(args)
+     elif args.command == "status":
+         cmd_status(args)
+     elif args.command == "cleanup":
+         cmd_cleanup(args)
+     elif args.command == "estimate":
+         cmd_estimate(args)
+     elif args.command == "run-local":
+         setup_logging(getattr(args, 'verbose', False))
+         cmd_run_local(args)
+     elif args.command == "run-azure":
+         setup_logging(getattr(args, 'verbose', False))
+         cmd_run_azure(args)
+     elif args.command == "test-mock":
+         cmd_test_mock(args)
+     elif args.command == "test-collection":
+         cmd_test_collection(args)
+     elif args.command == "run-api":
+         cmd_run_api(args)
+     elif args.command == "create-config":
+         cmd_create_config(args)
+     else:
+         parser.print_help()
+
+
+ if __name__ == "__main__":
+     main()
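
For reference, the mock evaluation path that the test-mock subcommand drives can also be invoked programmatically. The sketch below mirrors cmd_test_mock from the diff above and uses only names the CLI itself imports (WAAMockAdapter, RandomAgent, evaluate_agent_on_benchmark, compute_metrics); keyword arguments beyond those shown in the CLI code are not guaranteed by this diff.

# Minimal sketch: programmatic equivalent of
# `python -m openadapt_ml.benchmarks.cli test-mock --tasks 5`.
# Assumes openadapt-ml 0.1.0 is installed; names and arguments are taken from the CLI above.
from openadapt_ml.benchmarks import (
    RandomAgent,
    WAAMockAdapter,
    compute_metrics,
    evaluate_agent_on_benchmark,
)

adapter = WAAMockAdapter(num_tasks=5)   # synthetic tasks, no Windows VM required
agent = RandomAgent(seed=42)            # same placeholder agent the CLI uses
results = evaluate_agent_on_benchmark(agent=agent, adapter=adapter, max_steps=10)

metrics = compute_metrics(results)
print(f"Success rate: {metrics['success_rate']:.1%} over {metrics['num_tasks']} tasks")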