openadapt-ml 0.1.0__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112)
  1. openadapt_ml/baselines/__init__.py +121 -0
  2. openadapt_ml/baselines/adapter.py +185 -0
  3. openadapt_ml/baselines/cli.py +314 -0
  4. openadapt_ml/baselines/config.py +448 -0
  5. openadapt_ml/baselines/parser.py +922 -0
  6. openadapt_ml/baselines/prompts.py +787 -0
  7. openadapt_ml/benchmarks/__init__.py +13 -107
  8. openadapt_ml/benchmarks/agent.py +297 -374
  9. openadapt_ml/benchmarks/azure.py +62 -24
  10. openadapt_ml/benchmarks/azure_ops_tracker.py +521 -0
  11. openadapt_ml/benchmarks/cli.py +1874 -751
  12. openadapt_ml/benchmarks/trace_export.py +631 -0
  13. openadapt_ml/benchmarks/viewer.py +1236 -0
  14. openadapt_ml/benchmarks/vm_monitor.py +1111 -0
  15. openadapt_ml/benchmarks/waa_deploy/Dockerfile +216 -0
  16. openadapt_ml/benchmarks/waa_deploy/__init__.py +10 -0
  17. openadapt_ml/benchmarks/waa_deploy/api_agent.py +540 -0
  18. openadapt_ml/benchmarks/waa_deploy/start_waa_server.bat +53 -0
  19. openadapt_ml/cloud/azure_inference.py +3 -5
  20. openadapt_ml/cloud/lambda_labs.py +722 -307
  21. openadapt_ml/cloud/local.py +3194 -89
  22. openadapt_ml/cloud/ssh_tunnel.py +595 -0
  23. openadapt_ml/datasets/next_action.py +125 -96
  24. openadapt_ml/evals/grounding.py +32 -9
  25. openadapt_ml/evals/plot_eval_metrics.py +15 -13
  26. openadapt_ml/evals/trajectory_matching.py +120 -57
  27. openadapt_ml/experiments/demo_prompt/__init__.py +19 -0
  28. openadapt_ml/experiments/demo_prompt/format_demo.py +236 -0
  29. openadapt_ml/experiments/demo_prompt/results/experiment_20251231_002125.json +83 -0
  30. openadapt_ml/experiments/demo_prompt/results/experiment_n30_20251231_165958.json +1100 -0
  31. openadapt_ml/experiments/demo_prompt/results/multistep_20251231_025051.json +182 -0
  32. openadapt_ml/experiments/demo_prompt/run_experiment.py +541 -0
  33. openadapt_ml/experiments/representation_shootout/__init__.py +70 -0
  34. openadapt_ml/experiments/representation_shootout/conditions.py +708 -0
  35. openadapt_ml/experiments/representation_shootout/config.py +390 -0
  36. openadapt_ml/experiments/representation_shootout/evaluator.py +659 -0
  37. openadapt_ml/experiments/representation_shootout/runner.py +687 -0
  38. openadapt_ml/experiments/waa_demo/__init__.py +10 -0
  39. openadapt_ml/experiments/waa_demo/demos.py +357 -0
  40. openadapt_ml/experiments/waa_demo/runner.py +732 -0
  41. openadapt_ml/experiments/waa_demo/tasks.py +151 -0
  42. openadapt_ml/export/__init__.py +9 -0
  43. openadapt_ml/export/__main__.py +6 -0
  44. openadapt_ml/export/cli.py +89 -0
  45. openadapt_ml/export/parquet.py +277 -0
  46. openadapt_ml/grounding/detector.py +18 -14
  47. openadapt_ml/ingest/__init__.py +11 -10
  48. openadapt_ml/ingest/capture.py +97 -86
  49. openadapt_ml/ingest/loader.py +120 -69
  50. openadapt_ml/ingest/synthetic.py +344 -193
  51. openadapt_ml/models/api_adapter.py +14 -4
  52. openadapt_ml/models/base_adapter.py +10 -2
  53. openadapt_ml/models/providers/__init__.py +288 -0
  54. openadapt_ml/models/providers/anthropic.py +266 -0
  55. openadapt_ml/models/providers/base.py +299 -0
  56. openadapt_ml/models/providers/google.py +376 -0
  57. openadapt_ml/models/providers/openai.py +342 -0
  58. openadapt_ml/models/qwen_vl.py +46 -19
  59. openadapt_ml/perception/__init__.py +35 -0
  60. openadapt_ml/perception/integration.py +399 -0
  61. openadapt_ml/retrieval/README.md +226 -0
  62. openadapt_ml/retrieval/USAGE.md +391 -0
  63. openadapt_ml/retrieval/__init__.py +91 -0
  64. openadapt_ml/retrieval/demo_retriever.py +843 -0
  65. openadapt_ml/retrieval/embeddings.py +630 -0
  66. openadapt_ml/retrieval/index.py +194 -0
  67. openadapt_ml/retrieval/retriever.py +162 -0
  68. openadapt_ml/runtime/__init__.py +50 -0
  69. openadapt_ml/runtime/policy.py +27 -14
  70. openadapt_ml/runtime/safety_gate.py +471 -0
  71. openadapt_ml/schema/__init__.py +113 -0
  72. openadapt_ml/schema/converters.py +588 -0
  73. openadapt_ml/schema/episode.py +470 -0
  74. openadapt_ml/scripts/capture_screenshots.py +530 -0
  75. openadapt_ml/scripts/compare.py +102 -61
  76. openadapt_ml/scripts/demo_policy.py +4 -1
  77. openadapt_ml/scripts/eval_policy.py +19 -14
  78. openadapt_ml/scripts/make_gif.py +1 -1
  79. openadapt_ml/scripts/prepare_synthetic.py +16 -17
  80. openadapt_ml/scripts/train.py +98 -75
  81. openadapt_ml/segmentation/README.md +920 -0
  82. openadapt_ml/segmentation/__init__.py +97 -0
  83. openadapt_ml/segmentation/adapters/__init__.py +5 -0
  84. openadapt_ml/segmentation/adapters/capture_adapter.py +420 -0
  85. openadapt_ml/segmentation/annotator.py +610 -0
  86. openadapt_ml/segmentation/cache.py +290 -0
  87. openadapt_ml/segmentation/cli.py +674 -0
  88. openadapt_ml/segmentation/deduplicator.py +656 -0
  89. openadapt_ml/segmentation/frame_describer.py +788 -0
  90. openadapt_ml/segmentation/pipeline.py +340 -0
  91. openadapt_ml/segmentation/schemas.py +622 -0
  92. openadapt_ml/segmentation/segment_extractor.py +634 -0
  93. openadapt_ml/training/azure_ops_viewer.py +1097 -0
  94. openadapt_ml/training/benchmark_viewer.py +3255 -19
  95. openadapt_ml/training/shared_ui.py +7 -7
  96. openadapt_ml/training/stub_provider.py +57 -35
  97. openadapt_ml/training/trainer.py +255 -441
  98. openadapt_ml/training/trl_trainer.py +403 -0
  99. openadapt_ml/training/viewer.py +323 -108
  100. openadapt_ml/training/viewer_components.py +180 -0
  101. {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.1.dist-info}/METADATA +312 -69
  102. openadapt_ml-0.2.1.dist-info/RECORD +116 -0
  103. openadapt_ml/benchmarks/base.py +0 -366
  104. openadapt_ml/benchmarks/data_collection.py +0 -432
  105. openadapt_ml/benchmarks/runner.py +0 -381
  106. openadapt_ml/benchmarks/waa.py +0 -704
  107. openadapt_ml/schemas/__init__.py +0 -53
  108. openadapt_ml/schemas/sessions.py +0 -122
  109. openadapt_ml/schemas/validation.py +0 -252
  110. openadapt_ml-0.1.0.dist-info/RECORD +0 -55
  111. {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.1.dist-info}/WHEEL +0 -0
  112. {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.1.dist-info}/licenses/LICENSE +0 -0
@@ -1,883 +1,2006 @@
- """CLI for WAA benchmark evaluation.
-
- Usage:
-     # Estimate costs
-     python -m openadapt_ml.benchmarks.cli estimate --workers 40
-
-     # Run local evaluation (Windows only)
-     python -m openadapt_ml.benchmarks.cli run-local --waa-path /path/to/WAA --tasks notepad_1,notepad_2
-
-     # Run Azure evaluation
-     python -m openadapt_ml.benchmarks.cli run-azure --config azure_config.json --workers 40
+ #!/usr/bin/env python3
+ """
+ WAA Benchmark CLI - Windows Agent Arena evaluation toolkit
 
-     # Run API-backed evaluation (Claude/GPT-5.1 baseline)
-     python -m openadapt_ml.benchmarks.cli run-api --provider anthropic --tasks 5
-     python -m openadapt_ml.benchmarks.cli run-api --provider openai --tasks 5
+ Uses custom waa_deploy/Dockerfile with dockurr/windows:latest base and
+ Python 3.9 from vanilla windowsarena/winarena for GroundingDINO compatibility.
 
-     # Test with mock adapter
-     python -m openadapt_ml.benchmarks.cli test-mock --tasks 20
+ See waa_deploy/Dockerfile for details.
 
-     # Test data collection (with screenshots and execution traces)
-     python -m openadapt_ml.benchmarks.cli test-collection --tasks 5
+ Usage:
+     uv run python -m openadapt_ml.benchmarks.cli <command> [options]
+
+ Commands:
+     create        Create Azure VM with nested virtualization
+     delete        Delete VM and ALL associated resources
+     status        Show VM state and IP
+     build         Build WAA image from waa_deploy/Dockerfile
+     start         Start WAA container (Windows boots + WAA server)
+     probe         Check if WAA server is ready
+     run           Run benchmark tasks
+     deallocate    Stop VM (preserves disk, stops billing)
+     logs          Show WAA status and logs
+
+ Workflow:
+     1. create       - Create Azure VM (~5 min)
+     2. build        - Build custom WAA image (~10 min)
+     3. start        - Start container, Windows downloads+boots (~15-20 min first time)
+     4. probe --wait - Wait for WAA server
+     5. run          - Run benchmark
+     6. deallocate   - Stop billing
  """
 
- from __future__ import annotations
-
  import argparse
  import json
- import logging
+ import subprocess
  import sys
+ import time
+ import webbrowser
+ from datetime import datetime
  from pathlib import Path
+ from typing import Optional
+
+ # =============================================================================
+ # Constants (single source of truth)
+ # =============================================================================
+
+ # VM sizes with nested virtualization support
+ # Standard: $0.19/hr, 4 vCPU, 16GB RAM - baseline
+ # Fast: $0.38/hr, 8 vCPU, 32GB RAM - ~30% faster install, ~40% faster eval
+ VM_SIZE_STANDARD = "Standard_D4ds_v4"
+ VM_SIZE_FAST = "Standard_D8ds_v5"
+ VM_SIZE = VM_SIZE_STANDARD  # Default, can be overridden by --fast flag
+
+ # Fallback sizes for --fast mode (in order of preference)
+ # D8ds_v5: First choice (v5 with local SSD)
+ # D8s_v5: v5 without local SSD
+ # D8ds_v4: v4 with local SSD
+ # D8as_v5: AMD version
+ VM_SIZE_FAST_FALLBACKS = [
+     ("Standard_D8ds_v5", 0.38),
+     ("Standard_D8s_v5", 0.36),
+     ("Standard_D8ds_v4", 0.38),
+     ("Standard_D8as_v5", 0.34),
+ ]
+ VM_REGIONS = ["centralus", "eastus", "westus2", "eastus2"]
+ VM_NAME = "waa-eval-vm"
+ RESOURCE_GROUP = "openadapt-agents"
+ # Custom image built from waa_deploy/Dockerfile
+ # Uses dockurr/windows:latest (proper ISO download) + WAA components
+ DOCKER_IMAGE = "waa-auto:latest"
+ LOG_DIR = Path.home() / ".openadapt" / "waa"
+ SSH_OPTS = [
+     "-o",
+     "StrictHostKeyChecking=no",
+     "-o",
+     "UserKnownHostsFile=/dev/null",
+     "-o",
+     "LogLevel=ERROR",
+     "-o",
+     "ConnectTimeout=10",
+ ]
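
[For reference, SSH_OPTS makes every remote call non-interactive. A command issued through these options expands to roughly the following invocation; the IP and command are placeholders:]

    # ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
    #     -o LogLevel=ERROR -o ConnectTimeout=10 azureuser@<ip> '<cmd>'
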
+
+
+ def setup_vnc_tunnel_and_browser(ip: str) -> Optional[subprocess.Popen]:
+     """Set up SSH tunnel for VNC and open browser.
+
+     Returns the tunnel process on success, None on failure.
+     """
+     # Kill any existing tunnel on port 8006
+     subprocess.run(["pkill", "-f", "ssh.*8006:localhost:8006"], capture_output=True)
+
+     # Start SSH tunnel in background
+     tunnel_proc = subprocess.Popen(
+         ["ssh", *SSH_OPTS, "-N", "-L", "8006:localhost:8006", f"azureuser@{ip}"],
+         stdout=subprocess.DEVNULL,
+         stderr=subprocess.DEVNULL,
+     )
 
- logger = logging.getLogger(__name__)
+     # Wait for tunnel to establish
+     time.sleep(2)
 
- # Pre-configure loggers to be quiet by default (before any Azure imports)
- logging.getLogger("azure").setLevel(logging.WARNING)
- logging.getLogger("azure.core.pipeline.policies.http_logging_policy").setLevel(logging.WARNING)
- logging.getLogger("azure.ai.ml").setLevel(logging.WARNING)
- logging.getLogger("urllib3").setLevel(logging.WARNING)
- logging.getLogger("msrest").setLevel(logging.WARNING)
- logging.getLogger("openadapt_ml.benchmarks.azure").setLevel(logging.WARNING)
+     # Check if tunnel is running
+     if tunnel_proc.poll() is not None:
+         return None
 
- # Suppress Azure SDK experimental class warnings
- import warnings
- warnings.filterwarnings("ignore", message=".*experimental class.*")
+     # Open browser
+     vnc_url = "http://localhost:8006"
+     webbrowser.open(vnc_url)
 
+     return tunnel_proc
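
[A possible call site for this helper, assuming a reachable VM IP; the address and the explicit cleanup step are illustrative, not part of this module:]

    tunnel = setup_vnc_tunnel_and_browser("203.0.113.7")  # placeholder IP
    if tunnel is None:
        print("Tunnel failed; fall back to: ssh -L 8006:localhost:8006 azureuser@<ip>")
    else:
        input("Press Enter to close the VNC tunnel...")
        tunnel.terminate()  # ends the background `ssh -N` process, freeing port 8006
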
 
- def setup_logging(verbose: bool = False) -> None:
-     """Configure logging with appropriate verbosity.
-
-     Args:
-         verbose: If True, show all logs. If False, suppress Azure SDK noise.
-     """
-     level = logging.DEBUG if verbose else logging.INFO
-     logging.basicConfig(
-         level=level,
-         format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+ # Dockerfile location (relative to this file)
+ DOCKERFILE_PATH = Path(__file__).parent / "waa_deploy" / "Dockerfile"
+
+ # =============================================================================
+ # Logging
+ # =============================================================================
+
+ _log_file: Optional[Path] = None
+ _session_id: Optional[str] = None
+
+
+ def init_logging() -> Path:
+     """Initialize logging for this session."""
+     global _log_file, _session_id
+
+     LOG_DIR.mkdir(parents=True, exist_ok=True)
+
+     # Create session ID
+     _session_id = datetime.now().strftime("%Y-%m-%d_%H%M%S")
+     session_dir = LOG_DIR / "sessions" / _session_id
+     session_dir.mkdir(parents=True, exist_ok=True)
+
+     # Session log file
+     _log_file = session_dir / "full.log"
+
+     # Update current session pointer
+     (LOG_DIR / "session_id.txt").write_text(_session_id)
+
+     # Symlink for easy access
+     current_link = LOG_DIR / "current"
+     if current_link.exists() or current_link.is_symlink():
+         current_link.unlink()
+     current_link.symlink_to(session_dir)
+
+     return _log_file
+
+
+ def log(step: str, message: str, end: str = "\n"):
+     """Log message to file and stdout."""
+     timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+     formatted = f"[{timestamp}] [{step}] {message}"
+
+     # Print to stdout
+     print(formatted, end=end, flush=True)
+
+     # Write to log file
+     if _log_file:
+         with open(_log_file, "a") as f:
+             f.write(formatted + end)
+
+
+ def log_stream(step: str, process: subprocess.Popen):
+     """Stream process output to log and stdout."""
+     if process.stdout:
+         for line in iter(process.stdout.readline, ""):
+             if line:
+                 log(step, line.rstrip())
+
+
+ # =============================================================================
+ # Azure Helpers
+ # =============================================================================
+
+
+ def get_vm_ip() -> Optional[str]:
+     """Get VM public IP if it exists."""
+     result = subprocess.run(
+         [
+             "az",
+             "vm",
+             "show",
+             "-d",
+             "-g",
+             RESOURCE_GROUP,
+             "-n",
+             VM_NAME,
+             "--query",
+             "publicIps",
+             "-o",
+             "tsv",
+         ],
+         capture_output=True,
+         text=True,
      )
+     if result.returncode == 0 and result.stdout.strip():
+         return result.stdout.strip()
+     return None
 
-     # Suppress noisy Azure SDK logs unless verbose
-     if not verbose:
-         logging.getLogger("azure").setLevel(logging.WARNING)
-         logging.getLogger("azure.core.pipeline.policies.http_logging_policy").setLevel(logging.WARNING)
-         logging.getLogger("urllib3").setLevel(logging.WARNING)
-         logging.getLogger("msrest").setLevel(logging.WARNING)
 
+ def get_vm_state() -> Optional[str]:
+     """Get VM power state."""
+     result = subprocess.run(
+         [
+             "az",
+             "vm",
+             "get-instance-view",
+             "-g",
+             RESOURCE_GROUP,
+             "-n",
+             VM_NAME,
+             "--query",
+             "instanceView.statuses[1].displayStatus",
+             "-o",
+             "tsv",
+         ],
+         capture_output=True,
+         text=True,
+     )
+     if result.returncode == 0 and result.stdout.strip():
+         return result.stdout.strip()
+     return None
 
- def find_waa_path() -> Path | None:
-     """Auto-detect Windows Agent Arena repository path.
-
-     Searches in order:
-     1. vendor/WindowsAgentArena (git submodule)
-     2. ../WindowsAgentArena (sibling directory)
-     3. ~/WindowsAgentArena (home directory)
+ def ssh_run(
+     ip: str, cmd: str, stream: bool = False, step: str = "SSH"
+ ) -> subprocess.CompletedProcess:
+     """Run command on VM via SSH.
 
-     Returns:
-         Path to WAA repo, or None if not found.
+     When stream=True:
+     1. Runs command on VM with output redirected to a persistent log file
+     2. Streams that log file locally in real-time
+     3. Log file persists on VM even if connection breaks
+
+     Remote logs are stored at: /home/azureuser/cli_logs/{step}.log
      """
-     # Get the project root (where this package is installed)
-     project_root = Path(__file__).parent.parent.parent
+     if stream:
+         # Remote log directory and file (persistent across sessions)
+         remote_log_dir = "/home/azureuser/cli_logs"
+         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+         remote_log = f"{remote_log_dir}/{step.lower()}_{timestamp}.log"
+
+         # Ensure log directory exists
+         subprocess.run(
+             ["ssh", *SSH_OPTS, f"azureuser@{ip}", f"mkdir -p {remote_log_dir}"],
+             capture_output=True,
+         )
 
-     candidates = [
-         project_root / "vendor" / "WindowsAgentArena",
-         project_root.parent / "WindowsAgentArena",
-         Path.home() / "WindowsAgentArena",
-     ]
+         log(step, f"Remote log: {remote_log}")
+
+         # Run command with output to log file, capturing exit code
+         # Using script to capture terminal output including \r progress updates
+         # The command runs in foreground but output goes to file AND stdout
+         wrapped_cmd = f"""
+             set -o pipefail
+             {{
+                 {cmd}
+                 echo $? > {remote_log}.exit
+             }} 2>&1 | tee {remote_log}
+         """
+         full_cmd = ["ssh", *SSH_OPTS, f"azureuser@{ip}", wrapped_cmd]
+
+         process = subprocess.Popen(
+             full_cmd,
+             stdout=subprocess.PIPE,
+             stderr=subprocess.STDOUT,
+             text=True,
+             bufsize=1,
+         )
 
-     for path in candidates:
-         if path.exists() and (path / "src").exists():
-             return path
+         # Stream output to local log
+         try:
+             for line in iter(process.stdout.readline, ""):
+                 if line:
+                     # Handle carriage returns (Docker progress)
+                     clean_line = line.rstrip()
+                     if "\r" in clean_line:
+                         # Take the last part after \r
+                         parts = clean_line.split("\r")
+                         clean_line = parts[-1].strip()
+                     if clean_line:
+                         log(step, clean_line)
+             process.wait()
+         except KeyboardInterrupt:
+             log(step, "Interrupted - command continues on VM")
+             log(step, f"View full log: ssh azureuser@{ip} 'cat {remote_log}'")
+             process.terminate()
+             return subprocess.CompletedProcess(cmd, 130, "", "")
+
+         # Get exit code
+         result = subprocess.run(
+             [
+                 "ssh",
+                 *SSH_OPTS,
+                 f"azureuser@{ip}",
+                 f"cat {remote_log}.exit 2>/dev/null || echo 1",
+             ],
+             capture_output=True,
+             text=True,
+         )
+         exit_code = int(result.stdout.strip()) if result.stdout.strip().isdigit() else 1
 
-     return None
+         if exit_code != 0:
+             log(step, f"Command failed (exit {exit_code})")
+             log(step, f"Full log: ssh azureuser@{ip} 'cat {remote_log}'")
 
+         return subprocess.CompletedProcess(cmd, exit_code, "", "")
+     else:
+         full_cmd = ["ssh", *SSH_OPTS, f"azureuser@{ip}", cmd]
+         return subprocess.run(full_cmd, capture_output=True, text=True)
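[The streaming branch is what lets long remote builds survive a dropped local connection: output is tee'd to a log on the VM and the exit code is read back from a sidecar file. A minimal usage sketch; the command shown is an arbitrary example:]

    result = ssh_run(ip, "docker system df", stream=True, step="DISK")
    if result.returncode == 130:
        # Ctrl-C path: the remote command keeps running, and its output
        # remains under /home/azureuser/cli_logs/ on the VM.
        print("Interrupted locally; check the remote log later.")
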
+
+
+ def wait_for_ssh(ip: str, timeout: int = 120) -> bool:
+     """Wait for SSH to become available."""
+     start = time.time()
+     while time.time() - start < timeout:
+         result = subprocess.run(
+             ["ssh", *SSH_OPTS, f"azureuser@{ip}", "echo ok"],
+             capture_output=True,
+             text=True,
+             timeout=15,
+         )
+         if result.returncode == 0:
+             return True
+         time.sleep(5)
+     return False
+
+
+ # =============================================================================
+ # Commands
+ # =============================================================================
+
+
+ def cmd_create(args):
+     """Create Azure VM with nested virtualization."""
+     init_logging()
+
+     # Check if VM already exists
+     ip = get_vm_ip()
+     if ip:
+         log("CREATE", f"VM already exists: {ip}")
+         log("CREATE", "Use 'delete' first if you want to recreate")
+         return 0
+
+     # Determine which sizes to try
+     use_fast = getattr(args, "fast", False)
+     if use_fast:
+         # Try multiple fast sizes with fallbacks
+         sizes_to_try = VM_SIZE_FAST_FALLBACKS
+         log(
+             "CREATE",
+             f"Creating VM '{VM_NAME}' with --fast (trying multiple D8 sizes)...",
+         )
+     else:
+         # Standard mode: single size
+         sizes_to_try = [(VM_SIZE_STANDARD, 0.19)]
+         log("CREATE", f"Creating VM '{VM_NAME}' ({VM_SIZE_STANDARD}, $0.19/hr)...")
+
+     # Try size+region combinations until one works
+     vm_created = False
+     successful_size = None
+     successful_cost = None
+
+     for vm_size, cost_per_hour in sizes_to_try:
+         log("CREATE", f"Trying size {vm_size} (${cost_per_hour:.2f}/hr)...")
+
+         for region in VM_REGIONS:
+             log("CREATE", f" {region}...", end=" ")
+
+             result = subprocess.run(
+                 [
+                     "az",
+                     "vm",
+                     "create",
+                     "--resource-group",
+                     RESOURCE_GROUP,
+                     "--name",
+                     VM_NAME,
+                     "--location",
+                     region,
+                     "--image",
+                     "Ubuntu2204",
+                     "--size",
+                     vm_size,
+                     "--admin-username",
+                     "azureuser",
+                     "--generate-ssh-keys",
+                     "--public-ip-sku",
+                     "Standard",
+                 ],
+                 capture_output=True,
+                 text=True,
+             )
 
- def get_waa_path(args_path: str | None) -> Path:
-     """Get WAA path from args or auto-detect.
+             if result.returncode == 0:
+                 vm_info = json.loads(result.stdout)
+                 ip = vm_info.get("publicIpAddress", "")
+                 log("CREATE", f"created ({ip})")
+                 vm_created = True
+                 successful_size = vm_size
+                 successful_cost = cost_per_hour
+                 break
+             else:
+                 log("CREATE", "unavailable")
+
+         if vm_created:
+             break
+
+     if not vm_created:
+         log("CREATE", "ERROR: Could not create VM in any region with any size")
+         if use_fast:
+             log("CREATE", "Tried sizes: " + ", ".join(s[0] for s in sizes_to_try))
+         return 1
+
+     log(
+         "CREATE",
+         f"Successfully created {successful_size} (${successful_cost:.2f}/hr) in {region}",
+     )
 
-     Args:
-         args_path: Path from command line args, or None.
+     # Wait for SSH
+     log("CREATE", "Waiting for SSH...")
+     if not wait_for_ssh(ip):
+         log("CREATE", "ERROR: SSH not available after 2 minutes")
+         return 1
+     log("CREATE", "SSH ready")
+
+     # Install Docker with /mnt storage
+     log("CREATE", "Installing Docker with /mnt storage...")
+     docker_setup = """
+         set -e
+         sudo apt-get update -qq
+         sudo apt-get install -y -qq docker.io
+         sudo systemctl start docker
+         sudo systemctl enable docker
+         sudo usermod -aG docker $USER
+
+         # Configure Docker to use /mnt (larger temp disk)
+         sudo systemctl stop docker
+         sudo mkdir -p /mnt/docker
+         sudo bash -c 'echo "{\\"data-root\\": \\"/mnt/docker\\"}" > /etc/docker/daemon.json'
+         sudo systemctl start docker
+
+         # Verify
+         docker --version
+         df -h /mnt
+     """
+     result = ssh_run(ip, docker_setup, stream=True, step="CREATE")
+     if result.returncode != 0:
+         log("CREATE", "ERROR: Docker setup failed")
+         return 1
+
+     log("CREATE", f"VM ready: {ip}")
+     return 0
+
+
+ def cmd_delete(args):
+     """Delete VM and ALL associated resources."""
+     init_logging()
+     log("DELETE", f"Deleting VM '{VM_NAME}' and all associated resources...")
+
+     # Delete VM
+     log("DELETE", "Deleting VM...")
+     result = subprocess.run(
+         [
+             "az",
+             "vm",
+             "delete",
+             "-g",
+             RESOURCE_GROUP,
+             "-n",
+             VM_NAME,
+             "--yes",
+             "--force-deletion",
+             "true",
+         ],
+         capture_output=True,
+         text=True,
+     )
+     if result.returncode == 0:
+         log("DELETE", "VM deleted")
+     else:
+         log("DELETE", "VM not found or already deleted")
+
+     # Delete NICs
+     log("DELETE", "Deleting NICs...")
+     result = subprocess.run(
+         [
+             "az",
+             "network",
+             "nic",
+             "list",
+             "-g",
+             RESOURCE_GROUP,
+             "--query",
+             "[?contains(name, 'waa')].name",
+             "-o",
+             "tsv",
+         ],
+         capture_output=True,
+         text=True,
+     )
+     for nic in result.stdout.strip().split("\n"):
+         if nic:
+             subprocess.run(
+                 ["az", "network", "nic", "delete", "-g", RESOURCE_GROUP, "-n", nic],
+                 capture_output=True,
+             )
+             log("DELETE", f" Deleted NIC: {nic}")
+
+     # Delete public IPs
+     log("DELETE", "Deleting public IPs...")
+     result = subprocess.run(
+         [
+             "az",
+             "network",
+             "public-ip",
+             "list",
+             "-g",
+             RESOURCE_GROUP,
+             "--query",
+             "[?contains(name, 'waa')].name",
+             "-o",
+             "tsv",
+         ],
+         capture_output=True,
+         text=True,
+     )
+     for pip in result.stdout.strip().split("\n"):
+         if pip:
+             subprocess.run(
+                 [
+                     "az",
+                     "network",
+                     "public-ip",
+                     "delete",
+                     "-g",
+                     RESOURCE_GROUP,
+                     "-n",
+                     pip,
+                 ],
+                 capture_output=True,
+             )
+             log("DELETE", f" Deleted IP: {pip}")
+
+     # Delete disks
+     log("DELETE", "Deleting disks...")
+     result = subprocess.run(
+         [
+             "az",
+             "disk",
+             "list",
+             "-g",
+             RESOURCE_GROUP,
+             "--query",
+             "[?contains(name, 'waa')].name",
+             "-o",
+             "tsv",
+         ],
+         capture_output=True,
+         text=True,
+     )
+     for disk in result.stdout.strip().split("\n"):
+         if disk:
+             subprocess.run(
+                 ["az", "disk", "delete", "-g", RESOURCE_GROUP, "-n", disk, "--yes"],
+                 capture_output=True,
+             )
+             log("DELETE", f" Deleted disk: {disk}")
+
+     # Delete NSGs
+     log("DELETE", "Deleting NSGs...")
+     result = subprocess.run(
+         [
+             "az",
+             "network",
+             "nsg",
+             "list",
+             "-g",
+             RESOURCE_GROUP,
+             "--query",
+             "[?contains(name, 'waa')].name",
+             "-o",
+             "tsv",
+         ],
+         capture_output=True,
+         text=True,
+     )
+     for nsg in result.stdout.strip().split("\n"):
+         if nsg:
+             subprocess.run(
+                 ["az", "network", "nsg", "delete", "-g", RESOURCE_GROUP, "-n", nsg],
+                 capture_output=True,
+             )
+             log("DELETE", f" Deleted NSG: {nsg}")
 
-     Returns:
-         Resolved WAA path.
+     log("DELETE", "Cleanup complete")
+     return 0
+
+
+ def cmd_status(args):
+     """Show VM status."""
+     ip = get_vm_ip()
+     state = get_vm_state()
+
+     if not ip:
+         print(f"VM '{VM_NAME}' not found")
+         return 1
 
-     Raises:
-         SystemExit: If WAA cannot be found.
+     print(f"VM: {VM_NAME}")
+     print(f" State: {state or 'unknown'}")
+     print(f" IP: {ip}")
+     print(f" Size: {VM_SIZE}")
+     print(f" SSH: ssh azureuser@{ip}")
+     return 0
+
+
+ def cmd_build(args):
+     """Build WAA image from waa_deploy/Dockerfile.
+
+     This builds our custom image that:
+     - Uses dockurr/windows:latest (has working ISO auto-download)
+     - Copies WAA components from windowsarena/winarena:latest
+     - Patches IP addresses and adds automation
      """
-     if args_path:
-         path = Path(args_path)
-         if not path.exists():
-             print(f"ERROR: WAA path does not exist: {path}")
-             sys.exit(1)
-         return path
-
-     path = find_waa_path()
-     if path:
-         print(f" Using WAA from: {path}")
-         return path
-
-     print("ERROR: Windows Agent Arena not found!")
-     print("\nTo fix, run:")
-     print(" git submodule update --init --recursive")
-     print("\nOr specify path manually:")
-     print(" --waa-path /path/to/WindowsAgentArena")
-     sys.exit(1)
-
-
- def cmd_estimate(args: argparse.Namespace) -> None:
-     """Estimate Azure costs."""
-     from openadapt_ml.benchmarks.azure import estimate_cost
-
-     estimate = estimate_cost(
-         num_tasks=args.tasks,
-         num_workers=args.workers,
-         avg_task_duration_minutes=args.duration,
-         vm_hourly_cost=args.vm_cost,
-     )
-
-     print("\n=== WAA Azure Cost Estimate ===")
-     print(f"Tasks: {estimate['num_tasks']}")
-     print(f"Workers: {estimate['num_workers']}")
-     print(f"Tasks per worker: {estimate['tasks_per_worker']:.1f}")
-     print(f"Estimated duration: {estimate['estimated_duration_minutes']:.1f} minutes")
-     print(f"Total VM hours: {estimate['total_vm_hours']:.2f}")
-     print(f"Estimated cost: ${estimate['estimated_cost_usd']:.2f}")
-     print(f"Cost per task: ${estimate['cost_per_task_usd']:.4f}")
-     print()
+     init_logging()
+
+     ip = get_vm_ip()
+     if not ip:
+         log("BUILD", "ERROR: VM not found. Run 'create' first.")
+         return 1
+
+     log("BUILD", "Building WAA image from waa_deploy/Dockerfile...")
+
+     # Check Dockerfile exists
+     if not DOCKERFILE_PATH.exists():
+         log("BUILD", f"ERROR: Dockerfile not found: {DOCKERFILE_PATH}")
+         return 1
+
+     # Copy Dockerfile and supporting files to VM
+     log("BUILD", "Copying build files to VM...")
+     ssh_run(ip, "mkdir -p ~/build")
+
+     waa_deploy_dir = DOCKERFILE_PATH.parent
+     files_to_copy = ["Dockerfile", "start_waa_server.bat", "api_agent.py"]
+     for filename in files_to_copy:
+         src = waa_deploy_dir / filename
+         if src.exists():
+             result = subprocess.run(
+                 ["scp", *SSH_OPTS, str(src), f"azureuser@{ip}:~/build/"],
+                 capture_output=True,
+                 text=True,
+             )
+             if result.returncode != 0:
+                 log("BUILD", f"ERROR: Failed to copy {filename}: {result.stderr}")
+                 return 1
+
+     # Pre-build cleanup
+     log("BUILD", "Cleaning up dangling images before build...")
+     ssh_run(ip, "docker image prune -f 2>/dev/null")
+
+     # Build image (streams output)
+     log("BUILD", "Running docker build (this takes ~10-15 minutes)...")
+     build_cmd = f"cd ~/build && docker build --pull -t {DOCKER_IMAGE} . 2>&1"
+     result = ssh_run(ip, build_cmd, stream=True, step="BUILD")
+
+     if result.returncode != 0:
+         log("BUILD", "ERROR: Docker build failed")
+         return 1
+
+     # Post-build cleanup
+     log("BUILD", "Cleaning up dangling images after build...")
+     ssh_run(ip, "docker image prune -f 2>/dev/null")
 
+     log("BUILD", f"Image built: {DOCKER_IMAGE}")
+     return 0
 
- def cmd_run_local(args: argparse.Namespace) -> None:
-     """Run evaluation locally on Windows."""
-     from openadapt_ml.benchmarks import (
-         RandomAgent,
-         WAAAdapter,
-         compute_metrics,
-         evaluate_agent_on_benchmark,
+
+ def cmd_start(args):
+     """Start WAA container."""
+     init_logging()
+
+     ip = get_vm_ip()
+     if not ip:
+         log("START", "ERROR: VM not found. Run 'create' first.")
+         return 1
+
+     log("START", "Starting WAA container...")
+
+     # Stop existing container
+     log("START", "Stopping any existing container...")
+     ssh_run(ip, "docker stop winarena 2>/dev/null; docker rm -f winarena 2>/dev/null")
+
+     # Clean storage if --fresh
+     if args.fresh:
+         log("START", "Cleaning storage for fresh Windows install...")
+         ssh_run(ip, "sudo rm -rf /mnt/waa-storage/*")
+
+     # Create storage directory
+     ssh_run(
+         ip,
+         "sudo mkdir -p /mnt/waa-storage && sudo chown azureuser:azureuser /mnt/waa-storage",
      )
 
-     # Check platform
-     if sys.platform != "win32" and not args.force:
-         print("ERROR: WAA requires Windows. Use --force to override.")
-         sys.exit(1)
+     # Start container
+     # Our custom image has ENTRYPOINT that handles everything:
+     # - Downloads Windows 11 Enterprise if not present
+     # - Boots QEMU VM
+     # - Runs WAA server automatically via FirstLogonCommands
+     # QEMU resource allocation (--fast uses more resources on D8ds_v5)
+     if getattr(args, "fast", False):
+         ram_size = "16G"
+         cpu_cores = 6
+         log(
+             "START",
+             "Starting container with VERSION=11e (FAST mode: 6 cores, 16GB RAM)...",
+         )
+     else:
+         ram_size = "8G"
+         cpu_cores = 4
+         log("START", "Starting container with VERSION=11e...")
+
+     docker_cmd = f"""docker run -d \\
+         --name winarena \\
+         --device=/dev/kvm \\
+         --cap-add NET_ADMIN \\
+         -p 8006:8006 \\
+         -p 5000:5000 \\
+         -p 7200:7200 \\
+         -v /mnt/waa-storage:/storage \\
+         -e VERSION=11e \\
+         -e RAM_SIZE={ram_size} \\
+         -e CPU_CORES={cpu_cores} \\
+         -e DISK_SIZE=64G \\
+         {DOCKER_IMAGE}"""
+
+     result = ssh_run(ip, docker_cmd)
+     if result.returncode != 0:
+         log("START", f"ERROR: Failed to start container: {result.stderr}")
+         return 1
+
+     log("START", "Container started")
+     log("START", "Windows will boot and install (15-20 min on first run)")
+
+     # Auto-launch VNC unless --no-vnc specified
+     if not getattr(args, "no_vnc", False):
+         log("START", "Auto-launching VNC viewer...")
+         tunnel_proc = setup_vnc_tunnel_and_browser(ip)
+         if tunnel_proc:
+             log(
+                 "START",
+                 f"VNC auto-launched at http://localhost:8006 (tunnel PID: {tunnel_proc.pid})",
+             )
+         else:
+             log("START", "WARNING: VNC tunnel failed to start")
+             log("START", f"Manual VNC: ssh -L 8006:localhost:8006 azureuser@{ip}")
+     else:
+         log("START", f"VNC (via SSH tunnel): ssh -L 8006:localhost:8006 azureuser@{ip}")
+
+     return 0
 
-     # Parse task IDs
-     task_ids = None
-     if args.tasks:
-         task_ids = [t.strip() for t in args.tasks.split(",")]
 
-     # Get WAA path (auto-detect if not specified)
-     waa_path = get_waa_path(args.waa_path)
+ def cmd_stop(args):
+     """Stop and remove WAA container."""
+     ip = get_vm_ip()
+     if not ip:
+         print("ERROR: VM not found")
+         return 1
 
-     # Create adapter
-     adapter = WAAAdapter(waa_repo_path=waa_path)
+     print(f"Stopping container on VM ({ip})...")
 
-     # Create agent (for now, just random - in practice, would load a model)
-     if args.agent == "random":
-         agent = RandomAgent(seed=args.seed)
+     # Stop container
+     result = ssh_run(
+         ip, "docker stop winarena 2>/dev/null && echo STOPPED || echo NOT_RUNNING"
+     )
+     if "STOPPED" in result.stdout:
+         print(" Container stopped")
      else:
-         print(f"ERROR: Unknown agent type: {args.agent}")
-         sys.exit(1)
-
-     # Run evaluation
-     print(f"\nRunning WAA evaluation...")
-     print(f" WAA path: {waa_path}")
-     print(f" Tasks: {len(task_ids) if task_ids else 'all (154)'}")
-     print(f" Max steps: {args.max_steps}")
-     print()
+         print(" Container was not running")
 
-     results = evaluate_agent_on_benchmark(
-         agent=agent,
-         adapter=adapter,
-         task_ids=task_ids,
-         max_steps=args.max_steps,
+     # Remove container
+     result = ssh_run(
+         ip, "docker rm -f winarena 2>/dev/null && echo REMOVED || echo NOT_FOUND"
      )
+     if "REMOVED" in result.stdout:
+         print(" Container removed")
+     else:
+         print(" Container already removed")
+
+     # Optionally clean storage
+     if hasattr(args, "clean") and args.clean:
+         print(" Cleaning Windows storage...")
+         ssh_run(ip, "sudo rm -rf /mnt/waa-storage/*")
+         print(" Storage cleaned")
+
+     print("Done")
+     return 0
+
+
+ def cmd_probe(args):
+     """Check if WAA server is ready."""
+     ip = get_vm_ip()
+     if not ip:
+         print("ERROR: VM not found")
+         return 1
+
+     timeout = args.timeout
+     start = time.time()
+     last_storage = None
+
+     while True:
+         # Check via SSH - must run curl INSIDE container to reach Docker network
+         result = ssh_run(
+             ip,
+             "docker exec winarena curl -s --max-time 5 http://172.30.0.2:5000/probe 2>/dev/null || echo FAIL",
+         )
 
-     # Print results
-     metrics = compute_metrics(results)
-     print("\n=== Results ===")
-     print(f"Tasks: {metrics['num_tasks']}")
-     print(f"Success rate: {metrics['success_rate']:.1%}")
-     print(f"Avg score: {metrics['avg_score']:.3f}")
-     print(f"Avg steps: {metrics['avg_steps']:.1f}")
-     print()
+         if "FAIL" not in result.stdout and result.stdout.strip():
+             print("\nWAA server is READY")
+             print(f" Response: {result.stdout.strip()[:100]}")
+             return 0
 
-     # Save results
-     if args.output:
-         output_path = Path(args.output)
-         with open(output_path, "w") as f:
-             json.dump(
-                 {
-                     "metrics": metrics,
-                     "results": [
-                         {
-                             "task_id": r.task_id,
-                             "success": r.success,
-                             "score": r.score,
-                             "num_steps": r.num_steps,
-                             "error": r.error,
-                         }
-                         for r in results
-                     ],
-                 },
-                 f,
-                 indent=2,
-             )
-         print(f"Results saved to: {output_path}")
+         if not args.wait:
+             print("WAA server is NOT ready")
+             return 1
+
+         elapsed = time.time() - start
+         if elapsed > timeout:
+             print(f"\nTIMEOUT: WAA server not ready after {timeout}s")
+             return 1
+
+         # Get detailed status for progress display
+         elapsed_min = int(elapsed // 60)
+         elapsed_sec = int(elapsed % 60)
+
+         # Get storage in bytes for detailed view
+         storage_result = ssh_run(
+             ip, "docker exec winarena du -sb /storage/ 2>/dev/null | cut -f1"
+         )
+         storage_bytes = storage_result.stdout.strip()
+         if storage_bytes.isdigit():
+             storage_mb = int(storage_bytes) / (1024 * 1024)
+             storage_str = f"{storage_mb:,.1f} MB"
+             # Show delta if we have previous value
+             if last_storage is not None:
+                 delta = int(storage_bytes) - last_storage
+                 if delta > 0:
+                     delta_mb = delta / (1024 * 1024)
+                     storage_str += f" (+{delta_mb:,.1f} MB)"
+             last_storage = int(storage_bytes)
+         else:
+             storage_str = "unknown"
+
+         # Get QEMU uptime
+         qemu_result = ssh_run(
+             ip,
+             'docker exec winarena sh -c \'QPID=$(pgrep -f qemu-system 2>/dev/null | head -1); [ -n "$QPID" ] && ps -o etime= -p $QPID 2>/dev/null | tr -d " " || echo N/A\'',
+         )
+         qemu_uptime = qemu_result.stdout.strip() or "N/A"
+
+         # Get container uptime
+         container_result = ssh_run(
+             ip, "docker ps --filter name=winarena --format '{{.Status}}' 2>/dev/null"
+         )
+         container_status = container_result.stdout.strip() or "unknown"
+
+         print(
+             f"[{elapsed_min:02d}:{elapsed_sec:02d}] Waiting... | Storage: {storage_str} | QEMU: {qemu_uptime} | Container: {container_status}"
+         )
+         time.sleep(30)
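
[The probe must run curl inside the winarena container because 172.30.0.2:5000 only exists on the container's Docker network. Distilled into a reusable predicate — a sketch that simply reuses ssh_run and the same endpoint:]

    def waa_ready(ip: str) -> bool:
        """True once the WAA server inside the Windows VM answers /probe."""
        result = ssh_run(
            ip,
            "docker exec winarena curl -s --max-time 5 "
            "http://172.30.0.2:5000/probe 2>/dev/null || echo FAIL",
        )
        return "FAIL" not in result.stdout and bool(result.stdout.strip())
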
 
 
- def cmd_run_azure(args: argparse.Namespace) -> None:
-     """Run evaluation on Azure."""
-     from openadapt_ml.benchmarks import RandomAgent
-     from openadapt_ml.benchmarks.azure import AzureConfig, AzureWAAOrchestrator
+ def cmd_run(args):
+     """Run benchmark tasks using vanilla WAA's navi agent.
+
+     Note: For API-based agents (Claude, GPT-4 direct), use openadapt-evals
+     which communicates with WAA's Flask API externally.
+     """
+     init_logging()
+
+     ip = get_vm_ip()
+     if not ip:
+         log("RUN", "ERROR: VM not found")
+         return 1
+
+     # Check WAA is ready
+     log("RUN", "Checking WAA server...")
+     result = ssh_run(
+         ip,
+         "docker exec winarena curl -s --max-time 5 http://172.30.0.2:5000/probe 2>/dev/null || echo FAIL",
+     )
+     if "FAIL" in result.stdout or not result.stdout.strip():
+         log("RUN", "ERROR: WAA server not ready. Run 'probe --wait' first.")
+         return 1
+
+     log("RUN", "WAA server is ready")
 
-     # Load config
-     if args.config:
-         config = AzureConfig.from_json(args.config)
+     # Get API key (navi uses GPT-4o for reasoning)
+     api_key = args.api_key
+     if not api_key:
+         try:
+             from openadapt_ml.config import settings
+
+             api_key = settings.openai_api_key or ""
+         except ImportError:
+             api_key = ""
+
+     if not api_key:
+         log("RUN", "ERROR: OpenAI API key required (navi uses GPT-4o)")
+         log("RUN", " Set OPENAI_API_KEY in .env file or pass --api-key")
+         return 1
+
+     # Build task selection
+     domain = args.domain
+     task = args.task
+     model = args.model
+
+     task_info = []
+     if task:
+         task_info.append(f"task={task}")
+     elif domain != "all":
+         task_info.append(f"domain={domain}")
      else:
-         config = AzureConfig.from_env()
+         task_info.append(f"{args.num_tasks} task(s)")
+
+     log("RUN", f"Starting benchmark: {', '.join(task_info)}, model={model}")
 
-     # Get WAA path (auto-detect if not specified)
-     waa_path = get_waa_path(args.waa_path)
+     # Build run.py arguments
+     run_args = [
+         "--agent_name navi",
+         f"--model {model}",
+         f"--domain {domain}",
+     ]
 
-     # Parse task IDs
-     task_ids = None
-     if args.tasks:
-         task_ids = [t.strip() for t in args.tasks.split(",")]
+     # Add parallelization flags if specified (argparse converts hyphens to underscores)
+     worker_id = getattr(args, "worker_id", 0)
+     num_workers = getattr(args, "num_workers", 1)
+     if num_workers > 1:
+         run_args.append(f"--worker_id {worker_id}")
+         run_args.append(f"--num_workers {num_workers}")
+         log("RUN", f"Parallel mode: worker {worker_id}/{num_workers}")
+
+     # If specific task requested, create custom test config
+     if task:
+         create_custom_test_cmd = f'''
+ cat > /client/evaluation_examples_windows/test_custom.json << 'CUSTOMEOF'
+ ["{task}"]
+ CUSTOMEOF
+ '''
+         run_args.append(
+             "--test_all_meta_path evaluation_examples_windows/test_custom.json"
+         )
+         pre_cmd = create_custom_test_cmd
+     elif args.num_tasks and args.num_tasks < 154:
+         # Limit tasks by creating custom test config with first N tasks
+         num = args.num_tasks
+         # Write a temp Python script then run it (avoids quote escaping hell)
+         # test_all.json is a dict {{domain: [task_ids...]}} - preserve domain structure
+         create_limited_test_cmd = f"""cat > /tmp/limit_tasks.py << LIMITEOF
+ import json
+ d = json.load(open("/client/evaluation_examples_windows/test_all.json"))
+ # Collect (domain, task_id) pairs to preserve domain info
+ all_tasks = []
+ for domain, tasks in d.items():
+     for task in tasks:
+         all_tasks.append((domain, task))
+ # Limit total tasks
+ limited = all_tasks[:{num}]
+ # Rebuild dict preserving original domain structure
+ result = {{}}
+ for domain, task in limited:
+     if domain not in result:
+         result[domain] = []
+     result[domain].append(task)
+ json.dump(result, open("/client/evaluation_examples_windows/test_limited.json", "w"))
+ print("Limited to", len(limited), "tasks from", len(result), "domains")
+ LIMITEOF
+ python /tmp/limit_tasks.py && """
+         run_args.append(
+             "--test_all_meta_path evaluation_examples_windows/test_limited.json"
+         )
+         pre_cmd = create_limited_test_cmd
+     else:
+         pre_cmd = ""
 
-     # Create orchestrator
-     orchestrator = AzureWAAOrchestrator(
-         config=config,
-         waa_repo_path=waa_path,
-         experiment_name=args.experiment,
+     # Run the benchmark inside the container
+     run_cmd = (
+         f'export OPENAI_API_KEY="{api_key}" && '
+         f"docker exec -e OPENAI_API_KEY winarena "
+         f"bash -c '{pre_cmd}cd /client && python run.py {' '.join(run_args)}'"
      )
 
-     # Create agent
-     if args.agent == "random":
-         agent = RandomAgent(seed=args.seed)
+     log("RUN", "Executing benchmark...")
+     log("RUN", f" Model: {model}")
+     log("RUN", f" Tasks: {task_info[0]}")
+     log("RUN", "-" * 60)
+
+     # Run with streaming output
+     result = ssh_run(ip, run_cmd, stream=True, step="RUN")
+
+     if result.returncode != 0:
+         log("RUN", f"Benchmark failed with exit code {result.returncode}")
      else:
-         print(f"ERROR: Unknown agent type: {args.agent}")
-         sys.exit(1)
+         log("RUN", "Benchmark completed!")
 
-     # Estimate costs first
-     from openadapt_ml.benchmarks.azure import estimate_cost
+     # Download results unless --no-download
+     if not args.no_download:
+         log("RUN", "Downloading results...")
+         download_benchmark_results(ip)
 
-     num_tasks = len(task_ids) if task_ids else 154
-     estimate = estimate_cost(num_tasks=num_tasks, num_workers=args.workers)
+     return result.returncode
 
-     print(f"\n=== Azure WAA Evaluation ===")
-     print(f" Workers: {args.workers}")
-     print(f" Tasks: {num_tasks}")
-     print(f" Estimated cost: ${estimate['estimated_cost_usd']:.2f}")
-     print(f" Estimated time: {estimate['estimated_duration_minutes']:.1f} minutes")
-     print()
 
-     if not args.yes:
-         response = input("Proceed? [y/N] ")
-         if response.lower() != "y":
-             print("Aborted.")
-             sys.exit(0)
+ def download_benchmark_results(ip: str) -> str:
+     """Download benchmark results from the container.
 
-     # Run evaluation
-     print("\nStarting Azure evaluation...")
-     print(" (VM provisioning takes 3-5 minutes - monitor at https://ml.azure.com)")
-     print()
-     results = orchestrator.run_evaluation(
-         agent=agent,
-         num_workers=args.workers,
-         task_ids=task_ids,
-         max_steps_per_task=args.max_steps,
-         cleanup_on_complete=not args.no_cleanup,
-     )
-
-     # Print results
-     from openadapt_ml.benchmarks import compute_metrics
-
-     metrics = compute_metrics(results)
-     print("\n=== Results ===")
-     print(f"Tasks: {metrics['num_tasks']}")
-     print(f"Success rate: {metrics['success_rate']:.1%}")
-     print(f"Avg score: {metrics['avg_score']:.3f}")
-     print()
+     Results are saved to benchmark_results/waa_results_TIMESTAMP/
+     Returns the path to the results directory, or None if failed.
+     """
+     from pathlib import Path
 
-     # Save results
-     if args.output:
-         output_path = Path(args.output)
-         with open(output_path, "w") as f:
-             json.dump(
-                 {
-                     "metrics": metrics,
-                     "run_status": orchestrator.get_run_status(),
-                     "results": [
-                         {
-                             "task_id": r.task_id,
-                             "success": r.success,
-                             "score": r.score,
-                             "num_steps": r.num_steps,
-                         }
-                         for r in results
-                     ],
-                 },
-                 f,
-                 indent=2,
+     # Create local results directory with timestamp
+     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+     results_dir = Path("benchmark_results") / f"waa_results_{timestamp}"
+     results_dir.mkdir(parents=True, exist_ok=True)
+
+     log("RUN", f"Saving results to {results_dir}/")
+
+     # Create tarball of results inside container
+     log("RUN", "Creating results archive...")
+     tar_cmd = "docker exec winarena tar -czvf /tmp/results.tar.gz -C /client/results . 2>/dev/null"
+     result = subprocess.run(
+         ["ssh", *SSH_OPTS, f"azureuser@{ip}", tar_cmd], capture_output=True, text=True
+     )
+
+     if result.returncode != 0:
+         log(
+             "RUN",
+             f"Warning: Failed to create archive: {result.stderr[:200] if result.stderr else 'unknown'}",
+         )
+         log("RUN", "Trying direct copy...")
+
+         # Try copying results directory directly
+         copy_cmd = "docker cp winarena:/client/results/. /tmp/waa-results/"
+         subprocess.run(
+             [
+                 "ssh",
+                 *SSH_OPTS,
+                 f"azureuser@{ip}",
+                 f"rm -rf /tmp/waa-results && mkdir -p /tmp/waa-results && {copy_cmd}",
+             ],
+             capture_output=True,
+         )
+
+         # Download via scp
+         scp_result = subprocess.run(
+             [
+                 "scp",
+                 "-r",
+                 *SSH_OPTS,
+                 f"azureuser@{ip}:/tmp/waa-results/*",
+                 str(results_dir),
+             ],
+             capture_output=True,
+             text=True,
+         )
+         if scp_result.returncode == 0:
+             log("RUN", f"Results saved to: {results_dir}")
+             return str(results_dir)
+         else:
+             log(
+                 "RUN",
+                 f"Warning: Failed to download results: {scp_result.stderr[:200] if scp_result.stderr else 'unknown'}",
              )
-         print(f"Results saved to: {output_path}")
+             return None
+
+     # Copy tarball from container to VM host
+     copy_tar_cmd = "docker cp winarena:/tmp/results.tar.gz /tmp/results.tar.gz"
+     subprocess.run(
+         ["ssh", *SSH_OPTS, f"azureuser@{ip}", copy_tar_cmd], capture_output=True
+     )
+
+     # Download tarball
+     local_tar = results_dir / "results.tar.gz"
+     scp_result = subprocess.run(
+         ["scp", *SSH_OPTS, f"azureuser@{ip}:/tmp/results.tar.gz", str(local_tar)],
+         capture_output=True,
+         text=True,
+     )
+
+     if scp_result.returncode != 0:
+         log(
+             "RUN",
+             f"Warning: Failed to download tarball: {scp_result.stderr[:200] if scp_result.stderr else 'unknown'}",
+         )
+         return None
 
+     # Extract tarball
+     log("RUN", "Extracting results...")
+     import tarfile
 
- def cmd_test_mock(args: argparse.Namespace) -> None:
-     """Test with mock adapter (no Windows required)."""
-     from openadapt_ml.benchmarks import (
-         RandomAgent,
-         WAAMockAdapter,
-         compute_domain_metrics,
-         compute_metrics,
-         evaluate_agent_on_benchmark,
+     try:
+         with tarfile.open(local_tar, "r:gz") as tar:
+             tar.extractall(path=results_dir)
+         local_tar.unlink()  # Remove tarball after extraction
+     except Exception as e:
+         log("RUN", f"Warning: Failed to extract: {e}")
+         log("RUN", f"Tarball saved at: {local_tar}")
+
+     # Clean up remote tarball
+     subprocess.run(
+         ["ssh", *SSH_OPTS, f"azureuser@{ip}", "rm -f /tmp/results.tar.gz"],
+         capture_output=True,
      )
 
-     print(f"\n=== Testing with Mock Adapter ===")
-     print(f" Tasks: {args.tasks}")
-     print(f" Max steps: {args.max_steps}")
-     print()
+     # List what we downloaded
+     result_files = list(results_dir.glob("**/*"))
+     log("RUN", f"Downloaded {len(result_files)} files to {results_dir}/")
 
-     # Create mock adapter
-     adapter = WAAMockAdapter(num_tasks=args.tasks)
-     agent = RandomAgent(seed=args.seed)
-
-     # Run evaluation
-     results = evaluate_agent_on_benchmark(
-         agent=agent,
-         adapter=adapter,
-         max_steps=args.max_steps,
-     )
-
-     # Print results
-     metrics = compute_metrics(results)
-     print("=== Results ===")
-     print(f"Tasks: {metrics['num_tasks']}")
-     print(f"Success rate: {metrics['success_rate']:.1%}")
-     print(f"Successes: {metrics['success_count']}")
-     print(f"Failures: {metrics['fail_count']}")
-     print(f"Avg steps: {metrics['avg_steps']:.1f}")
-     print()
+     # Show summary if available
+     summary_file = results_dir / "summary.json"
+     if summary_file.exists():
+         import json
 
-     # Domain breakdown
-     tasks = adapter.list_tasks()
-     domain_metrics = compute_domain_metrics(results, tasks)
-     if domain_metrics:
-         print("=== By Domain ===")
-         for domain, dm in domain_metrics.items():
-             print(f" {domain}: {dm['success_rate']:.1%} ({dm['success_count']}/{dm['num_tasks']})")
-         print()
+         try:
+             with open(summary_file) as f:
+                 summary = json.load(f)
+             log("RUN", f"Summary: {json.dumps(summary, indent=2)[:500]}")
+         except Exception:
+             pass
 
+     return str(results_dir)
 
- def cmd_test_collection(args: argparse.Namespace) -> None:
-     """Test benchmark data collection with mock adapter.
 
-     This command runs a benchmark evaluation with data collection enabled,
-     creating a full directory structure with screenshots, execution traces,
-     and metadata suitable for the benchmark viewer.
-     """
-     import json
-     from pathlib import Path
+ def cmd_download(args):
+     """Download benchmark results from VM."""
+     init_logging()
 
-     from openadapt_ml.benchmarks import RandomAgent, WAAMockAdapter
-     from openadapt_ml.benchmarks.runner import EvaluationConfig, evaluate_agent_on_benchmark
+     ip = get_vm_ip()
+     if not ip:
+         log("DOWNLOAD", "ERROR: VM not found")
+         return 1
 
-     print(f"\n=== Testing Benchmark Data Collection ===")
-     print(f" Tasks: {args.tasks}")
-     print(f" Max steps: {args.max_steps}")
-     print(f" Output dir: {args.output}")
-     print(f" Run name: {args.run_name or '(auto-generated)'}")
-     print()
+     log("DOWNLOAD", "Downloading benchmark results...")
+     result_path = download_benchmark_results(ip)
 
-     # Create mock adapter
-     adapter = WAAMockAdapter(num_tasks=args.tasks, domains=["browser", "office"])
-     agent = RandomAgent(action_types=["click", "type", "scroll", "done"], seed=args.seed)
-
-     # Configure evaluation with data collection
-     config = EvaluationConfig(
-         max_steps=args.max_steps,
-         parallel=1,
-         save_trajectories=True,
-         save_execution_traces=True,
-         model_id=args.model_id,
-         output_dir=args.output,
-         run_name=args.run_name,
-         verbose=True,
-     )
-
-     # Run evaluation
-     results = evaluate_agent_on_benchmark(
-         agent=agent,
-         adapter=adapter,
-         config=config,
-     )
-
-     # Print results
-     success_count = sum(1 for r in results if r.success)
-     success_rate = success_count / len(results) if results else 0.0
-     avg_steps = sum(r.num_steps for r in results) / len(results) if results else 0.0
-
-     print(f"\n=== Results ===")
-     print(f"Total tasks: {len(results)}")
-     print(f"Success: {success_count} ({success_rate:.1%})")
-     print(f"Failure: {len(results) - success_count}")
-     print(f"Avg steps: {avg_steps:.1f}")
-
-     # Find the actual output directory by reading metadata
-     output_dir = Path(args.output)
-     run_dirs = sorted(output_dir.glob("*/metadata.json"), key=lambda p: p.stat().st_mtime, reverse=True)
-     if run_dirs:
-         run_dir = run_dirs[0].parent
-         with open(run_dirs[0]) as f:
-             metadata = json.load(f)
-         run_name = metadata.get("run_name", run_dir.name)
+     if result_path:
+         log("DOWNLOAD", f"Results saved to: {result_path}")
+         return 0
 
-         run_dir = output_dir
-         run_name = "unknown"
-
-     print(f"\n=== Output Directory ===")
-     print(f"Location: {run_dir.absolute()}")
-     print(f"\nDirectory structure:")
-     print(f" {run_dir.name}/")
-     print(f" ├── metadata.json")
-     print(f" ├── summary.json")
-     print(f" └── tasks/")
-     print(f" ├── task_001/")
-     print(f" │ ├── task.json")
-     print(f" │ ├── execution.json")
-     print(f" │ └── screenshots/")
-     print(f" │ ├── step_000.png")
-     print(f" │ ├── step_001.png")
-     print(f" │ └── ...")
-     print(f" └── ...")
-     print(f"\nYou can inspect the results at: {run_dir.absolute()}")
-     print()
+     else:
+         log("DOWNLOAD", "Failed to download results")
+         return 1
 
 
- def cmd_run_api(args: argparse.Namespace) -> None:
-     """Run evaluation using API-backed VLM (Claude/GPT-5.1).
+ def cmd_analyze(args):
+     """Analyze benchmark results from downloaded logs."""
+     import re
+     from collections import defaultdict
 
-     This provides baselines for comparing against fine-tuned models.
-     """
-     from openadapt_ml.benchmarks import (
-         APIBenchmarkAgent,
-         WAAMockAdapter,
-         compute_domain_metrics,
-         compute_metrics,
-     )
-     from openadapt_ml.benchmarks.runner import EvaluationConfig, evaluate_agent_on_benchmark
-
-     provider_names = {
-         "anthropic": "Claude",
-         "openai": "GPT-5.1",
-     }
-
-     print(f"\n=== API-Backed Benchmark Evaluation ===")
-     print(f" Provider: {args.provider} ({provider_names.get(args.provider, 'Unknown')})")
-     print(f" Tasks: {args.tasks}")
-     print(f" Max steps: {args.max_steps}")
-     print(f" Output dir: {args.output}")
-     print()
-
-     # Check for API key
-     import os
-     key_name = "ANTHROPIC_API_KEY" if args.provider == "anthropic" else "OPENAI_API_KEY"
-     if not os.getenv(key_name):
-         print(f"WARNING: {key_name} environment variable not set!")
-         print(f" Set it in your .env file or export it before running.")
-         print()
+     results_dir = (
+         Path(args.results_dir) if args.results_dir else Path("benchmark_results")
+     )
 
-     # Create mock adapter for testing (real WAA would require Windows)
-     # In a real scenario, this would be WAAAdapter on Windows
-     if args.use_real_waa:
-         if sys.platform != "win32" and not args.force:
-             print("ERROR: WAA requires Windows. Use --force to override.")
-             sys.exit(1)
-         from openadapt_ml.benchmarks import WAAAdapter
-         waa_path = get_waa_path(args.waa_path)
-         adapter = WAAAdapter(waa_repo_path=waa_path)
-         task_ids = None
-         if args.task_ids:
-             task_ids = [t.strip() for t in args.task_ids.split(",")]
+     # Find most recent results if no specific dir given
+     if args.results_dir:
+         target_dir = Path(args.results_dir)
      else:
-         adapter = WAAMockAdapter(num_tasks=args.tasks, domains=["browser", "office"])
-         task_ids = None
-
-     # Create API-backed agent
-     agent = APIBenchmarkAgent(
-         provider=args.provider,
-         max_tokens=args.max_tokens,
-         use_accessibility_tree=not args.no_a11y,
-         use_history=not args.no_history,
-     )
-
-     # Configure evaluation
-     model_id = args.model_id if args.model_id else f"{args.provider}-api"
-     config = EvaluationConfig(
-         max_steps=args.max_steps,
-         parallel=1,  # API calls should be sequential to avoid rate limits
-         save_trajectories=True,
-         save_execution_traces=True,
-         model_id=model_id,
-         output_dir=args.output,
-         run_name=args.run_name,
-         verbose=args.verbose,
-     )
-
-     # Run evaluation
-     print("Starting evaluation...")
-     print(" (Each step calls the API - this may take a while)")
-     print()
+         dirs = sorted(results_dir.glob("waa_results_*"), reverse=True)
+         if not dirs:
+             print("No results found in benchmark_results/")
+             print("Run 'cli download' first to get results from VM")
+             return 1
+         target_dir = dirs[0]
+
+     print(f"Analyzing: {target_dir}")
+     print("=" * 60)
+
+     # Find log files
+     log_files = list(target_dir.glob("logs/normal-*.log"))
+     if not log_files:
+         print("No log files found")
+         return 1
+
+     # Parse results
+     tasks = []
+     current_task = None
+     pending_domain = None
+
+     for log_file in sorted(log_files):
+         with open(log_file) as f:
+             for line in f:
+                 # Strip ANSI codes
+                 clean = re.sub(r"\x1b\[[0-9;]*m", "", line)
+
+                 # Domain comes before Example ID
+                 if "[Domain]:" in clean:
+                     match = re.search(r"\[Domain\]: (.+)", clean)
+                     if match:
+                         pending_domain = match.group(1).strip()
+
+                 # Task start (Example ID comes after Domain)
+                 if "[Example ID]:" in clean:
+                     match = re.search(r"\[Example ID\]: (.+)", clean)
+                     if match:
+                         current_task = {
+                             "id": match.group(1).strip(),
+                             "domain": pending_domain,
+                             "reward": None,
+                             "error": None,
+                         }
+                         pending_domain = None
+
+                 # Task result
+                 if "Reward:" in clean and current_task:
+                     match = re.search(r"Reward: ([0-9.]+)", clean)
+                     if match:
+                         current_task["reward"] = float(match.group(1))
+                         tasks.append(current_task)
+                         current_task = None
+
+                 # Task error
+                 if "Exception in" in clean and current_task:
+                     match = re.search(r"Exception in .+: (.+)", clean)
+                     if match:
+                         current_task["error"] = match.group(1).strip()
1226
+ current_task["reward"] = 0.0
1227
+ tasks.append(current_task)
1228
+ current_task = None
1229
+
1230
+ # Summary
1231
+ print(f"\nTotal tasks attempted: {len(tasks)}")
1232
+
1233
+ if not tasks:
1234
+ print("No completed tasks found")
1235
+ return 0
1236
+
1237
+ # Success rate
1238
+ successes = sum(1 for t in tasks if t["reward"] and t["reward"] > 0)
1239
+ print(f"Successful: {successes} ({100 * successes / len(tasks):.1f}%)")
1240
+
1241
+ # By domain
1242
+ by_domain = defaultdict(list)
1243
+ for t in tasks:
1244
+ by_domain[t["domain"] or "unknown"].append(t)
1245
+
1246
+ print("\nBy domain:")
1247
+ for domain in sorted(by_domain.keys()):
1248
+ domain_tasks = by_domain[domain]
1249
+ domain_success = sum(1 for t in domain_tasks if t["reward"] and t["reward"] > 0)
1250
+ print(
1251
+ f" {domain}: {domain_success}/{len(domain_tasks)} ({100 * domain_success / len(domain_tasks):.1f}%)"
1252
+ )
530
1253
 
531
- try:
532
- results = evaluate_agent_on_benchmark(
533
- agent=agent,
534
- adapter=adapter,
535
- task_ids=task_ids,
536
- config=config,
1254
+ # Errors
1255
+ errors = [t for t in tasks if t.get("error")]
1256
+ if errors:
1257
+ print(f"\nErrors ({len(errors)}):")
1258
+ for t in errors[:5]: # Show first 5
1259
+ print(f" {t['id']}: {t['error'][:50]}")
1260
+ if len(errors) > 5:
1261
+ print(f" ... and {len(errors) - 5} more")
1262
+
1263
+ return 0
1264
+
1265
+
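# Worked micro-example of the parsing above (a sketch, not part of the diff;
# the sample line is a hypothetical WAA log record, inferred from the regexes
# used in cmd_analyze):
import re

sample = "\x1b[32m[Example ID]: 366de66e-cbae-4d72-b042-26390db2b145-WOS\x1b[0m"
clean = re.sub(r"\x1b\[[0-9;]*m", "", sample)      # strip ANSI color codes
match = re.search(r"\[Example ID\]: (.+)", clean)  # same pattern as above
assert match.group(1).strip() == "366de66e-cbae-4d72-b042-26390db2b145-WOS"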
1266
+ def cmd_tasks(args):
1267
+ """List available WAA benchmark tasks."""
1268
+ ip = get_vm_ip()
1269
+ if not ip:
1270
+ print("ERROR: VM not found")
1271
+ return 1
1272
+
1273
+ print("Fetching available tasks from WAA container...")
1274
+ print("-" * 60)
1275
+
1276
+ # Get list of domains (subdirectories in examples/)
1277
+ result = subprocess.run(
1278
+ [
1279
+ "ssh",
1280
+ *SSH_OPTS,
1281
+ f"azureuser@{ip}",
1282
+ "docker exec winarena ls /client/evaluation_examples_windows/examples/",
1283
+ ],
1284
+ capture_output=True,
1285
+ text=True,
1286
+ )
1287
+
1288
+ if result.returncode != 0:
1289
+ print("ERROR: Could not fetch domain list")
1290
+ return 1
1291
+
1292
+ domains = result.stdout.strip().split("\n")
1293
+
1294
+ # Count tasks per domain
1295
+ domain_tasks = {}
1296
+ total_tasks = 0
1297
+
1298
+ for domain in domains:
1299
+ if not domain:
1300
+ continue
1301
+ count_result = subprocess.run(
1302
+ [
1303
+ "ssh",
1304
+ *SSH_OPTS,
1305
+ f"azureuser@{ip}",
1306
+ f"docker exec winarena ls /client/evaluation_examples_windows/examples/{domain}/ 2>/dev/null | wc -l",
1307
+ ],
1308
+ capture_output=True,
1309
+ text=True,
537
1310
  )
538
- except Exception as e:
539
- print(f"\nERROR: {e}")
540
- if "API key" in str(e) or "api_key" in str(e).lower():
541
- print(f"\nMake sure {key_name} is set in your environment.")
542
- sys.exit(1)
543
-
544
- # Print results
545
- metrics = compute_metrics(results)
546
- print("\n=== Results ===")
547
- print(f"Tasks: {metrics['num_tasks']}")
548
- print(f"Success rate: {metrics['success_rate']:.1%}")
549
- print(f"Successes: {metrics['success_count']}")
550
- print(f"Failures: {metrics['fail_count']}")
551
- print(f"Avg score: {metrics['avg_score']:.3f}")
552
- print(f"Avg steps: {metrics['avg_steps']:.1f}")
553
- print()
1311
+ count = (
1312
+ int(count_result.stdout.strip())
1313
+ if count_result.stdout.strip().isdigit()
1314
+ else 0
1315
+ )
1316
+ domain_tasks[domain] = count
1317
+ total_tasks += count
554
1318
 
555
- # Domain breakdown
556
- tasks = adapter.list_tasks()
557
- domain_metrics = compute_domain_metrics(results, tasks)
558
- if domain_metrics:
559
- print("=== By Domain ===")
560
- for domain, dm in domain_metrics.items():
561
- print(f" {domain}: {dm['success_rate']:.1%} ({dm['success_count']}/{dm['num_tasks']})")
1319
+ # Print summary
1320
+ print(f"Total tasks: {total_tasks}")
1321
+ print(f"Domains: {len(domains)}")
562
1322
  print()
563
1323
 
564
- # Find output directory
565
- output_dir = Path(args.output)
566
- run_dirs = sorted(output_dir.glob("*/metadata.json"), key=lambda p: p.stat().st_mtime, reverse=True)
567
- if run_dirs:
568
- run_dir = run_dirs[0].parent
569
- print(f"Results saved to: {run_dir.absolute()}")
570
- print(f"View with: uv run python -m openadapt_ml.cloud.local serve --open")
1324
+ # Print by domain
1325
+ for domain in sorted(domain_tasks.keys()):
1326
+ count = domain_tasks[domain]
1327
+ print(f" {domain}: {count} tasks")
1328
+
1329
+ if args.verbose and count > 0:
1330
+ # List actual task IDs
1331
+ tasks_result = subprocess.run(
1332
+ [
1333
+ "ssh",
1334
+ *SSH_OPTS,
1335
+ f"azureuser@{ip}",
1336
+ f"docker exec winarena ls /client/evaluation_examples_windows/examples/{domain}/",
1337
+ ],
1338
+ capture_output=True,
1339
+ text=True,
1340
+ )
1341
+ for task_file in tasks_result.stdout.strip().split("\n")[:5]: # Limit to 5
1342
+ task_id = task_file.replace(".json", "")
1343
+ print(f" - {task_id}")
1344
+ if count > 5:
1345
+ print(f" ... and {count - 5} more")
1346
+
571
1347
  print()
1348
+ print("Usage examples:")
1349
+ print(" Run all notepad tasks: cli_v2 run --domain notepad")
1350
+ print(" Run all chrome tasks: cli_v2 run --domain chrome")
1351
+ print(
1352
+ " Run specific task: cli_v2 run --task 366de66e-cbae-4d72-b042-26390db2b145-WOS"
1353
+ )
572
1354
 
1355
+ return 0
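# A hedged variant of the counting loop above: the per-domain `ls | wc -l`
# costs one ssh round trip per domain, but the same "domain count" pairs can
# come back in a single call (the shell loop assumes the container layout
# matches the paths used above):
counts_cmd = (
    "docker exec winarena sh -c "
    "'for d in /client/evaluation_examples_windows/examples/*/; do "
    "echo \"$(basename \"$d\") $(ls \"$d\" | wc -l)\"; done'"
)
# result = ssh_run(ip, counts_cmd)
# domain_tasks = {
#     name: int(n)
#     for name, n in (line.split() for line in result.stdout.strip().splitlines())
# }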
573
1356
 
574
- def cmd_create_config(args: argparse.Namespace) -> None:
575
- """Create a sample Azure config file."""
576
- from openadapt_ml.benchmarks.azure import AzureConfig
577
1357
 
578
- config = AzureConfig(
579
- subscription_id="<your-subscription-id>",
580
- resource_group="agents",
581
- workspace_name="agents_ml",
582
- vm_size="Standard_D4_v3",
583
- )
1358
+ def cmd_deallocate(args):
1359
+ """Stop VM (preserves disk, stops billing)."""
1360
+ init_logging()
1361
+ log("DEALLOCATE", f"Deallocating VM '{VM_NAME}'...")
584
1362
 
585
- output_path = Path(args.output)
586
- config.to_json(output_path)
587
- print(f"Sample config saved to: {output_path}")
588
- print("\nEdit this file with your Azure credentials before using.")
1363
+ result = subprocess.run(
1364
+ ["az", "vm", "deallocate", "-g", RESOURCE_GROUP, "-n", VM_NAME],
1365
+ capture_output=True,
1366
+ text=True,
1367
+ )
589
1368
 
1369
+ if result.returncode == 0:
1370
+ log("DEALLOCATE", "VM deallocated (billing stopped)")
1371
+ log("DEALLOCATE", "Use 'vm-start' to resume")
1372
+ return 0
1373
+ else:
1374
+ log("DEALLOCATE", f"ERROR: {result.stderr}")
1375
+ return 1
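# Companion sketch: confirm the power state after deallocating. The JMESPath
# query is the usual az instanceView pattern (an assumption, adjust as
# needed), reusing the module-level RESOURCE_GROUP/VM_NAME constants:
def vm_power_state() -> str:
    result = subprocess.run(
        [
            "az", "vm", "get-instance-view",
            "-g", RESOURCE_GROUP, "-n", VM_NAME,
            "--query",
            "instanceView.statuses[?starts_with(code, 'PowerState/')] | [0].displayStatus",
            "-o", "tsv",
        ],
        capture_output=True,
        text=True,
    )
    return result.stdout.strip()  # e.g. "VM deallocated"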
590
1376
 
591
- def cmd_status(args: argparse.Namespace) -> None:
592
- """Check Azure workspace and compute status."""
593
- setup_logging(args.verbose)
594
1377
 
595
- # Import after logging setup to suppress Azure SDK noise
596
- from openadapt_ml.benchmarks.azure import AzureConfig, AzureMLClient # noqa: E402
1378
+ def cmd_vm_start(args):
1379
+ """Start a deallocated VM."""
1380
+ init_logging()
1381
+ log("VM-START", f"Starting VM '{VM_NAME}'...")
597
1382
 
598
- print("\n=== Azure WAA Status ===\n")
1383
+ result = subprocess.run(
1384
+ ["az", "vm", "start", "-g", RESOURCE_GROUP, "-n", VM_NAME],
1385
+ capture_output=True,
1386
+ text=True,
1387
+ )
599
1388
 
600
- # Check config
601
- try:
602
- config = AzureConfig.from_env()
603
- print(f"Subscription: {config.subscription_id[:8]}...")
604
- print(f"Resource Group: {config.resource_group}")
605
- print(f"Workspace: {config.workspace_name}")
606
- print(f"VM Size: {config.vm_size}")
607
- except ValueError as e:
608
- print(f"Config Error: {e}")
609
- print("\nRun 'python scripts/setup_azure.py' to configure.")
610
- return
611
-
612
- # Check WAA
613
- waa_path = find_waa_path()
614
- if waa_path:
615
- print(f"WAA Path: {waa_path}")
1389
+ if result.returncode == 0:
1390
+ ip = get_vm_ip()
1391
+ log("VM-START", f"VM started: {ip}")
1392
+ log("VM-START", "Run 'build' then 'start' to launch WAA container")
1393
+ return 0
616
1394
  else:
617
- print("WAA Path: NOT FOUND")
618
- print(" Run: git submodule update --init --recursive")
1395
+ log("VM-START", f"ERROR: {result.stderr}")
1396
+ return 1
1397
+
1398
+
1399
+ def cmd_exec(args):
1400
+ """Run command on VM host."""
1401
+ ip = get_vm_ip()
1402
+ if not ip:
1403
+ print("ERROR: VM not found or not running")
1404
+ return 1
1405
+
1406
+ cmd = args.cmd
1407
+ if not cmd:
1408
+ print("ERROR: --cmd is required")
1409
+ return 1
1410
+
1411
+ result = ssh_run(ip, cmd, stream=True)
1412
+ return result.returncode
1413
+
1414
+
1415
+ def cmd_docker_exec(args):
1416
+ """Run command inside winarena container."""
1417
+ ip = get_vm_ip()
1418
+ if not ip:
1419
+ print("ERROR: VM not found or not running")
1420
+ return 1
1421
+
1422
+ cmd = args.cmd
1423
+ if not cmd:
1424
+ print("ERROR: --cmd is required")
1425
+ return 1
1426
+
1427
+ docker_cmd = f"docker exec winarena {cmd}"
1428
+ result = ssh_run(ip, docker_cmd, stream=True)
1429
+ return result.returncode
1430
+
1431
+
1432
+ def cmd_vnc(args):
1433
+ """Open VNC to view Windows desktop via SSH tunnel."""
1434
+ ip = get_vm_ip()
1435
+ if not ip:
1436
+ print("ERROR: VM not found or not running")
1437
+ return 1
1438
+
1439
+ print(f"Setting up SSH tunnel to VM ({ip})...")
1440
+ print("VNC will be available at: http://localhost:8006")
1441
+ print("-" * 60)
1442
+
1443
+ # Kill any existing tunnel on port 8006
1444
+ subprocess.run(["pkill", "-f", "ssh.*8006:localhost:8006"], capture_output=True)
1445
+
1446
+ # Start SSH tunnel in background
1447
+ tunnel_proc = subprocess.Popen(
1448
+ ["ssh", *SSH_OPTS, "-N", "-L", "8006:localhost:8006", f"azureuser@{ip}"],
1449
+ stdout=subprocess.DEVNULL,
1450
+ stderr=subprocess.DEVNULL,
1451
+ )
619
1452
 
620
- # Check Azure connection
621
- print("\nConnecting to Azure...")
622
- try:
623
- client = AzureMLClient(config)
624
- computes = client.list_compute_instances(prefix="w")
625
- print(f"Connection: OK")
626
-
627
- if computes:
628
- print(f"\nActive Compute Instances ({len(computes)}):")
629
- for name in computes:
630
- try:
631
- status = client.get_compute_status(name)
632
- print(f" - {name}: {status}")
633
- except Exception:
634
- print(f" - {name}: (status unknown)")
635
- else:
636
- print("\nNo active compute instances.")
1453
+ # Give tunnel a moment to establish
1454
+ time.sleep(2)
637
1455
 
638
- except Exception as e:
639
- print(f"Connection: FAILED")
640
- print(f" Error: {e}")
1456
+ # Check if tunnel is running
1457
+ if tunnel_proc.poll() is not None:
1458
+ print("ERROR: SSH tunnel failed to start")
1459
+ return 1
1460
+
1461
+ print(f"SSH tunnel established (PID: {tunnel_proc.pid})")
1462
+
1463
+ # Open browser
1464
+ import webbrowser
1465
+
1466
+ vnc_url = "http://localhost:8006"
1467
+ print(f"Opening {vnc_url} in browser...")
1468
+ webbrowser.open(vnc_url)
641
1469
 
642
1470
  print()
1471
+ print("VNC is now accessible at: http://localhost:8006")
1472
+ print("Press Ctrl+C to close the tunnel")
1473
+ print("-" * 60)
643
1474
 
1475
+ try:
1476
+ # Keep tunnel alive
1477
+ tunnel_proc.wait()
1478
+ except KeyboardInterrupt:
1479
+ print("\nClosing SSH tunnel...")
1480
+ tunnel_proc.terminate()
644
1481
 
645
- def cmd_cleanup(args: argparse.Namespace) -> None:
646
- """Clean up all Azure compute resources."""
647
- setup_logging(args.verbose)
1482
+ return 0
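# A hedged sketch of the same tunnel as a context manager, so the ssh child
# process cannot leak on error paths (illustrative names, not part of the
# CLI; SSH_OPTS is the module-level option list used throughout this file):
import contextlib

@contextlib.contextmanager
def ssh_tunnel(ip: str, port: int = 8006):
    proc = subprocess.Popen(
        ["ssh", *SSH_OPTS, "-N", "-L", f"{port}:localhost:{port}", f"azureuser@{ip}"],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )
    try:
        time.sleep(2)  # give the forward a moment to establish
        yield proc
    finally:
        proc.terminate()

# Usage: with ssh_tunnel(ip): webbrowser.open("http://localhost:8006")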
648
1483
 
649
- from openadapt_ml.benchmarks.azure import AzureConfig, AzureMLClient
650
1484
 
651
- print("\n=== Azure WAA Cleanup ===\n")
1485
+ def _show_benchmark_progress(ip: str) -> int:
1486
+ """Show benchmark progress with estimated completion time.
652
1487
 
653
- try:
654
- config = AzureConfig.from_env()
655
- except ValueError as e:
656
- print(f"Config Error: {e}")
657
- return
1488
+ Parses the run log to count completed tasks and estimate remaining time.
1489
+ """
1490
+ # Find the most recent run log
1491
+ result = ssh_run(
1492
+ ip, "ls -t /home/azureuser/cli_logs/run_*.log 2>/dev/null | head -1"
1493
+ )
1494
+ log_file = result.stdout.strip()
1495
+
1496
+ if not log_file:
1497
+ print("No benchmark running. Start one with: run --num-tasks N")
1498
+ return 1
1499
+
1500
+ # Get task count and timestamps
1501
+ result = ssh_run(
1502
+ ip,
1503
+ f"""
1504
+ echo "=== WAA Benchmark Progress ==="
1505
+ echo ""
1506
+
1507
+ # Count completed tasks (each "Result:" line = 1 task done)
1508
+ COMPLETED=$(grep -c "Result:" {log_file} 2>/dev/null || echo 0)
1509
+ # Count total tasks from task list (sum of all domain counts)
1510
+ TOTAL=$(grep -A20 "Left tasks:" {log_file} | grep -E "^[a-z_]+: [0-9]+" | awk -F': ' '{{sum+=$2}} END {{print sum}}')
1511
+ [ -z "$TOTAL" ] || [ "$TOTAL" -eq 0 ] && TOTAL=154
1512
+
1513
+ # Get timestamps
1514
+ FIRST_TS=$(grep -oE '\\[2026-[0-9-]+ [0-9:]+' {log_file} | head -1 | tr -d '[')
1515
+ LAST_TS=$(grep -oE '\\[2026-[0-9-]+ [0-9:]+' {log_file} | tail -1 | tr -d '[')
1516
+
1517
+ echo "Log: {log_file}"
1518
+ echo "Started: $FIRST_TS"
1519
+ echo "Latest: $LAST_TS"
1520
+ echo ""
1521
+ echo "Tasks completed: $COMPLETED / $TOTAL"
1522
+
1523
+ # Calculate elapsed minutes
1524
+ if [ -n "$FIRST_TS" ] && [ -n "$LAST_TS" ]; then
1525
+ START_H=$(echo "$FIRST_TS" | awk '{{print $2}}' | cut -d: -f1)
1526
+ START_M=$(echo "$FIRST_TS" | awk '{{print $2}}' | cut -d: -f2)
1527
+ NOW_H=$(echo "$LAST_TS" | awk '{{print $2}}' | cut -d: -f1)
1528
+ NOW_M=$(echo "$LAST_TS" | awk '{{print $2}}' | cut -d: -f2)
1529
+
1530
+ ELAPSED_MIN=$(( (NOW_H - START_H) * 60 + (NOW_M - START_M) ))
1531
+ echo "Elapsed: $ELAPSED_MIN minutes"
1532
+
1533
+ if [ "$COMPLETED" -gt 0 ] && [ "$ELAPSED_MIN" -gt 0 ]; then
1534
+ MIN_PER_TASK=$((ELAPSED_MIN / COMPLETED))
1535
+ REMAINING=$((TOTAL - COMPLETED))
1536
+ EST_MIN=$((REMAINING * MIN_PER_TASK))
1537
+ EST_H=$((EST_MIN / 60))
1538
+ EST_M=$((EST_MIN % 60))
1539
+
1540
+ echo ""
1541
+ echo "Avg time per task: ~$MIN_PER_TASK min"
1542
+ echo "Remaining tasks: $REMAINING"
1543
+ echo "Estimated remaining: ~${{EST_H}}h ${{EST_M}}m"
1544
+
1545
+ # Progress bar
1546
+ PCT=$((COMPLETED * 100 / TOTAL))
1547
+ echo ""
1548
+ echo "Progress: $PCT% [$COMPLETED/$TOTAL]"
1549
+ fi
1550
+ fi
1551
+ """,
1552
+ )
1553
+ print(result.stdout)
1554
+ return 0
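# The shell arithmetic above, restated as a tiny Python sketch (numbers are
# illustrative; integer division mirrors the $((...)) behaviour):
completed, total, elapsed_min = 12, 154, 96
min_per_task = elapsed_min // completed                 # ~8 min per task
remaining_min = (total - completed) * min_per_task
print(f"Estimated remaining: ~{remaining_min // 60}h {remaining_min % 60}m")
print(f"Progress: {100 * completed // total}% [{completed}/{total}]")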
658
1555
 
659
- print(f"Workspace: {config.workspace_name}")
660
- print(f"Resource Group: {config.resource_group}")
661
- print()
662
1556
 
663
- client = AzureMLClient(config)
1557
+ def _show_run_logs(ip: str, follow: bool = False, tail: Optional[int] = None) -> int:
1558
+ """Show the most recent run command log file.
664
1559
 
665
- # List ALL compute instances (no prefix filter)
666
- print("Finding all compute instances...")
667
- computes = client.list_compute_instances() # No prefix = get all
1560
+ Args:
1561
+ ip: VM IP address
1562
+ follow: If True, use tail -f to stream the log
1563
+ tail: Number of lines to show (default: entire file or 100 for follow)
1564
+
1565
+ Returns:
1566
+ Exit code (0 for success, 1 for error)
1567
+ """
1568
+ # Find the most recent run log file
1569
+ result = ssh_run(
1570
+ ip, "ls -t /home/azureuser/cli_logs/run_*.log 2>/dev/null | head -1"
1571
+ )
1572
+ log_file = result.stdout.strip()
668
1573
 
669
- if not computes:
670
- print(" No compute instances found")
1574
+ if not log_file:
1575
+ print("No run logs found at /home/azureuser/cli_logs/run_*.log")
1576
+ print("Run a benchmark first: cli_v2 run --task <task_id>")
1577
+ return 1
1578
+
1579
+ print(f"Run log: {log_file}")
1580
+ print("-" * 60)
1581
+
1582
+ if follow:
1583
+ # Stream the log file
1584
+ print("Streaming log (Ctrl+C to stop)...")
1585
+ subprocess.run(["ssh", *SSH_OPTS, f"azureuser@{ip}", f"tail -f {log_file}"])
671
1586
  else:
672
- print(f" Found {len(computes)} compute instance(s):")
673
- for name in computes:
674
- try:
675
- status = client.get_compute_status(name)
676
- except Exception:
677
- status = "unknown"
678
- print(f" - {name} ({status})")
1587
+ # Show the log file contents
1588
+ if tail:
1589
+ cmd = f"tail -n {tail} {log_file}"
1590
+ else:
1591
+ # Check file size first - if small, cat it; if large, use tail
1592
+ size_result = ssh_run(ip, f"wc -l < {log_file}")
1593
+ line_count = (
1594
+ int(size_result.stdout.strip())
1595
+ if size_result.stdout.strip().isdigit()
1596
+ else 0
1597
+ )
679
1598
 
680
- print()
681
- for name in computes:
682
- if not args.yes:
683
- confirm = input(f" Delete '{name}'? [y/N]: ").strip().lower()
684
- if confirm != "y":
685
- print(f" Skipped {name}")
686
- continue
687
- print(f" Deleting {name}...", end="", flush=True)
688
- try:
689
- client.delete_compute_instance(name)
690
- print(" done")
691
- except Exception as e:
692
- print(f" FAILED: {e}")
693
-
694
- print("\nCleanup complete.")
695
- print("Note: Resource deletion may take a few minutes to free quota.")
696
- print()
1599
+ if line_count <= 200:
1600
+ cmd = f"cat {log_file}"
1601
+ else:
1602
+ print(
1603
+ f"(Showing last 100 of {line_count} lines, use --tail N for more)"
1604
+ )
1605
+ cmd = f"tail -n 100 {log_file}"
1606
+
1607
+ subprocess.run(["ssh", *SSH_OPTS, f"azureuser@{ip}", cmd])
1608
+
1609
+ return 0
697
1610
 
698
1611
 
699
- def cmd_setup(args: argparse.Namespace) -> None:
700
- """Run full setup (Azure + WAA submodule)."""
701
- import subprocess
1612
+ def cmd_logs(args):
1613
+ """Show comprehensive logs from the WAA container.
702
1614
 
703
- print("\n=== OpenAdapt-ML WAA Setup ===\n")
1615
+ Default behavior shows all relevant logs (docker, storage, probe status).
1616
+ Use --follow to stream docker logs continuously.
1617
+ Use --run to show run command output instead of container logs.
1618
+ Use --progress to show benchmark progress and ETA.
1619
+ """
1620
+ ip = get_vm_ip()
1621
+ if not ip:
1622
+ print("ERROR: VM not found")
1623
+ return 1
1624
+
1625
+ # Handle --progress flag: show benchmark progress
1626
+ if getattr(args, "progress", False):
1627
+ return _show_benchmark_progress(ip)
1628
+
1629
+ # Handle --run flag: show run command output
1630
+ if args.run:
1631
+ return _show_run_logs(ip, args.follow, args.tail)
1632
+
1633
+ # Check if container exists
1634
+ result = ssh_run(ip, "docker ps -a --filter name=winarena --format '{{.Status}}'")
1635
+ container_status = result.stdout.strip()
1636
+ container_exists = bool(container_status)
1637
+
1638
+ # If --follow, stream the most relevant logs
1639
+ if args.follow:
1640
+ # Priority 1: If container is running, stream container logs
1641
+ if container_exists and "Up" in container_status:
1642
+ print(f"Streaming container logs from VM ({ip}):")
1643
+ print("Press Ctrl+C to stop")
1644
+ print("-" * 60)
1645
+ subprocess.run(
1646
+ ["ssh", *SSH_OPTS, f"azureuser@{ip}", "docker logs -f winarena 2>&1"]
1647
+ )
1648
+ return 0
704
1649
 
705
- # Step 1: Git submodule
706
- print("[1/2] Checking WAA submodule...")
707
- waa_path = find_waa_path()
708
- if waa_path:
709
- print(f" WAA already available at: {waa_path}")
1650
+ # Priority 2: Check for active docker build
1651
+ result = ssh_run(
1652
+ ip,
1653
+ "pgrep -f 'docker build' >/dev/null && echo BUILD_RUNNING || echo NO_BUILD",
1654
+ )
1655
+ if "BUILD_RUNNING" in result.stdout:
1656
+ print(f"Docker build in progress on VM ({ip})")
1657
+ print("Streaming build logs (Ctrl+C to stop):")
1658
+ print("-" * 60)
1659
+ # Find and tail the most recent build log
1660
+ subprocess.run(
1661
+ [
1662
+ "ssh",
1663
+ *SSH_OPTS,
1664
+ f"azureuser@{ip}",
1665
+ "tail -f $(ls -t ~/cli_logs/build_*.log 2>/dev/null | head -1) 2>/dev/null || "
1666
+ "tail -f ~/build.log 2>/dev/null || "
1667
+ "echo 'No build logs found - build may have just started'",
1668
+ ]
1669
+ )
1670
+ return 0
1671
+
1672
+ # Priority 3: No container, no build - show helpful message
1673
+ print(f"Container 'winarena' not running on VM ({ip})")
1674
+ print()
1675
+ # Check if image exists
1676
+ result = ssh_run(
1677
+ ip, "docker images waa-auto:latest --format '{{.Repository}}:{{.Tag}}'"
1678
+ )
1679
+ if result.stdout.strip():
1680
+ print("Image 'waa-auto:latest' is ready.")
1681
+ print("Run: uv run python -m openadapt_ml.benchmarks.cli_v2 start")
1682
+ else:
1683
+ print("Image not yet built.")
1684
+ print("Run: uv run python -m openadapt_ml.benchmarks.cli_v2 build")
1685
+ return 1
1686
+
1687
+ # Default: show comprehensive status
1688
+ import sys
1689
+
1690
+ print(f"WAA Status ({ip})")
1691
+ print("=" * 60)
1692
+ sys.stdout.flush()
1693
+
1694
+ # Docker images
1695
+ print("\n[Docker Images]", flush=True)
1696
+ subprocess.run(
1697
+ [
1698
+ "ssh",
1699
+ *SSH_OPTS,
1700
+ f"azureuser@{ip}",
1701
+ "docker images --format 'table {{.Repository}}\\t{{.Tag}}\\t{{.Size}}' 2>/dev/null | head -5",
1702
+ ]
1703
+ )
1704
+
1705
+ # Container status
1706
+ print("\n[Container]", flush=True)
1707
+ if container_exists:
1708
+ print(f" Status: {container_status}", flush=True)
710
1709
  else:
711
- print(" Initializing WAA submodule...")
712
- try:
1710
+ print(" Container 'winarena' not created yet", flush=True)
1711
+ # Check for active build
1712
+ result = ssh_run(
1713
+ ip,
1714
+ "pgrep -f 'docker build' >/dev/null && echo BUILD_RUNNING || echo NO_BUILD",
1715
+ )
1716
+ if "BUILD_RUNNING" in result.stdout:
1717
+ print(" Docker build in progress...", flush=True)
1718
+
1719
+ # Only show these sections if container exists
1720
+ if container_exists and "Up" in container_status:
1721
+ # Storage info
1722
+ print("\n[Storage]", flush=True)
1723
+ subprocess.run(
1724
+ [
1725
+ "ssh",
1726
+ *SSH_OPTS,
1727
+ f"azureuser@{ip}",
1728
+ "docker exec winarena sh -c '"
1729
+ 'echo " Total: $(du -sh /storage/ 2>/dev/null | cut -f1)"; '
1730
+ 'ls -lh /storage/*.img 2>/dev/null | awk "{print \\" Disk image: \\" \\$5}" || true'
1731
+ "'",
1732
+ ]
1733
+ )
1734
+
1735
+ # QEMU VM status
1736
+ print("\n[QEMU VM]", flush=True)
1737
+ subprocess.run(
1738
+ [
1739
+ "ssh",
1740
+ *SSH_OPTS,
1741
+ f"azureuser@{ip}",
1742
+ "docker exec winarena sh -c '"
1743
+ "QPID=$(pgrep -f qemu-system 2>/dev/null | head -1); "
1744
+ 'if [ -n "$QPID" ]; then '
1745
+ ' echo " Status: Running (PID $QPID)"; '
1746
+ ' ps -o %cpu,%mem,etime -p $QPID 2>/dev/null | tail -1 | awk "{print \\" CPU: \\" \\$1 \\"%, MEM: \\" \\$2 \\"%, Uptime: \\" \\$3}"; '
1747
+ "else "
1748
+ ' echo " Status: Not running"; '
1749
+ "fi"
1750
+ "'",
1751
+ ]
1752
+ )
1753
+
1754
+ # WAA server probe
1755
+ print("\n[WAA Server]", flush=True)
1756
+ subprocess.run(
1757
+ [
1758
+ "ssh",
1759
+ *SSH_OPTS,
1760
+ f"azureuser@{ip}",
1761
+ "docker exec winarena curl -s --max-time 5 http://172.30.0.2:5000/probe 2>/dev/null && echo ' (READY)' || echo 'Not ready (Windows installing - check VNC for progress)'",
1762
+ ]
1763
+ )
1764
+
1765
+ # Windows install log (written by install.bat to Samba share at Z:\install_log.txt)
1766
+ # The Samba share \\host.lan\Data maps to /tmp/smb inside the container
1767
+ result = ssh_run(
1768
+ ip, "docker exec winarena cat /tmp/smb/install_log.txt 2>/dev/null | wc -l"
1769
+ )
1770
+ install_log_lines = result.stdout.strip()
1771
+ if install_log_lines and install_log_lines != "0":
1772
+ print("\n[Windows Install Log]", flush=True)
1773
+ # Show last 10 lines of the install log (shows current step like [5/14] Installing Git...)
713
1774
  subprocess.run(
714
- ["git", "submodule", "update", "--init", "--recursive"],
715
- check=True,
716
- capture_output=not args.verbose,
1775
+ [
1776
+ "ssh",
1777
+ *SSH_OPTS,
1778
+ f"azureuser@{ip}",
1779
+ "docker exec winarena tail -10 /tmp/smb/install_log.txt 2>/dev/null",
1780
+ ]
717
1781
  )
718
- print(" WAA submodule initialized")
719
- except subprocess.CalledProcessError as e:
720
- print(f" Failed: {e}")
721
- if not args.force:
722
- sys.exit(1)
723
-
724
- # Step 2: Azure setup
725
- print("\n[2/2] Azure setup...")
726
- setup_script = Path(__file__).parent.parent.parent / "scripts" / "setup_azure.py"
727
- if setup_script.exists():
728
- cmd = ["python", str(setup_script)]
729
- if args.yes:
730
- cmd.append("--yes")
731
- try:
732
- subprocess.run(cmd, check=True)
733
- except subprocess.CalledProcessError:
734
- print(" Azure setup failed or was cancelled")
735
- if not args.force:
736
- sys.exit(1)
1782
+
1783
+ # Recent docker logs
1784
+ tail_lines = args.tail if args.tail else 20
1785
+ print(f"\n[Recent Logs (last {tail_lines} lines)]", flush=True)
1786
+ print("-" * 60, flush=True)
1787
+ subprocess.run(
1788
+ [
1789
+ "ssh",
1790
+ *SSH_OPTS,
1791
+ f"azureuser@{ip}",
1792
+ f"docker logs --tail {tail_lines} winarena 2>&1",
1793
+ ]
1794
+ )
1795
+
1796
+ print("\n" + "=" * 60, flush=True)
1797
+ print("VNC: ssh -L 8006:localhost:8006 azureuser@" + ip, flush=True)
1798
+ print(" Then open http://localhost:8006", flush=True)
1799
+ print(" (Windows installation % visible on VNC screen)", flush=True)
737
1800
  else:
738
- print(f" Setup script not found: {setup_script}")
739
- print(" Run manually: python scripts/setup_azure.py")
740
-
741
- print("\n=== Setup Complete ===")
742
- print("\nNext steps:")
743
- print(" 1. Check status: python -m openadapt_ml.benchmarks.cli status")
744
- print(" 2. Test locally: python -m openadapt_ml.benchmarks.cli test-mock")
745
- print(" 3. Run on Azure: python -m openadapt_ml.benchmarks.cli run-azure")
746
- print()
1801
+ # Show next steps
1802
+ print("\n[Next Steps]")
1803
+ result = ssh_run(ip, "docker images waa-auto:latest --format '{{.Repository}}'")
1804
+ if result.stdout.strip():
1805
+ print(" Image ready. Run: cli_v2 start")
1806
+ else:
1807
+ print(" Build image first. Run: cli_v2 build")
1808
+
1809
+ return 0
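# Minimal sketch of the status probe cmd_logs is built around: `docker ps -a`
# with a name filter prints the container's status string, or nothing at all
# if the container was never created (the IP value here is illustrative):
status = ssh_run(
    "203.0.113.7", "docker ps -a --filter name=winarena --format '{{.Status}}'"
).stdout.strip()
if not status:
    print("container not created")
elif status.startswith("Up"):
    print("running:", status)   # e.g. "Up 2 hours"
else:
    print("stopped:", status)   # e.g. "Exited (0) 5 minutes ago"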
1810
+
1811
+
1812
+ # =============================================================================
1813
+ # Main
1814
+ # =============================================================================
747
1815
 
748
1816
 
749
- def main() -> None:
1817
+ def main():
750
1818
  parser = argparse.ArgumentParser(
751
- description="WAA Benchmark CLI - Windows Agent Arena evaluation toolkit",
1819
+ description="WAA Benchmark CLI v2 - Minimal working CLI",
752
1820
  formatter_class=argparse.RawDescriptionHelpFormatter,
753
1821
  epilog="""
754
- Quick Start:
755
- # First time setup (Azure + WAA submodule)
756
- python -m openadapt_ml.benchmarks.cli setup
1822
+ Examples:
1823
+ # Full setup workflow (vanilla WAA)
1824
+ %(prog)s create # Create Azure VM
1825
+ %(prog)s build # Build WAA image (no `pull` subcommand is defined below)
1826
+ %(prog)s start # Start container + Windows
1827
+ %(prog)s probe --wait # Wait for WAA server
1828
+ %(prog)s run --num-tasks 1 # Run benchmark (vanilla WAA navi agent)
1829
+ %(prog)s deallocate # Stop billing
1830
+
1831
+ # Monitor in separate terminal
1832
+ %(prog)s logs --follow # Stream docker container logs
1833
+ %(prog)s vnc # View Windows desktop
1834
+
1835
+ # Cleanup
1836
+ %(prog)s delete
1837
+ """,
1838
+ )
1839
+
1840
+ subparsers = parser.add_subparsers(dest="command", required=True)
757
1841
 
758
- # Check everything is configured
759
- python -m openadapt_ml.benchmarks.cli status
1842
+ # create
1843
+ p_create = subparsers.add_parser("create", help="Create Azure VM")
1844
+ p_create.add_argument(
1845
+ "--fast",
1846
+ action="store_true",
1847
+ help="Use larger VM (D8ds_v5, $0.38/hr) for ~30%% faster install, ~40%% faster eval",
1848
+ )
1849
+ p_create.set_defaults(func=cmd_create)
760
1850
 
761
- # Test locally with mock adapter
762
- python -m openadapt_ml.benchmarks.cli test-mock
1851
+ # delete
1852
+ p_delete = subparsers.add_parser("delete", help="Delete VM and all resources")
1853
+ p_delete.set_defaults(func=cmd_delete)
763
1854
 
764
- # Run on Azure
765
- python -m openadapt_ml.benchmarks.cli run-azure
766
- """,
1855
+ # status
1856
+ p_status = subparsers.add_parser("status", help="Show VM status")
1857
+ p_status.set_defaults(func=cmd_status)
1858
+
1859
+ # build
1860
+ p_build = subparsers.add_parser(
1861
+ "build", help="Build WAA image from waa_deploy/Dockerfile"
767
1862
  )
768
- subparsers = parser.add_subparsers(dest="command", help="Command to run")
769
-
770
- # Setup (new!)
771
- p_setup = subparsers.add_parser("setup", help="One-command setup (Azure + WAA)")
772
- p_setup.add_argument("--yes", "-y", action="store_true", help="Skip confirmation prompts")
773
- p_setup.add_argument("--force", action="store_true", help="Continue on errors")
774
- p_setup.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
775
-
776
- # Status
777
- p_status = subparsers.add_parser("status", help="Check Azure and WAA status")
778
- p_status.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
779
-
780
- # Cleanup
781
- p_cleanup = subparsers.add_parser("cleanup", help="Delete all Azure compute instances")
782
- p_cleanup.add_argument("--yes", "-y", action="store_true", help="Skip confirmation")
783
- p_cleanup.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
784
-
785
- # Estimate costs
786
- p_estimate = subparsers.add_parser("estimate", help="Estimate Azure costs")
787
- p_estimate.add_argument("--tasks", type=int, default=154, help="Number of tasks")
788
- p_estimate.add_argument("--workers", type=int, default=1, help="Number of workers (default: 1 for free trial)")
789
- p_estimate.add_argument("--duration", type=float, default=1.0, help="Avg task duration (minutes)")
790
- p_estimate.add_argument("--vm-cost", type=float, default=0.19, help="VM hourly cost ($ for D4_v3)")
791
-
792
- # Run local
793
- p_local = subparsers.add_parser("run-local", help="Run evaluation locally (Windows)")
794
- p_local.add_argument("--waa-path", help="Path to WAA repository (auto-detected if not specified)")
795
- p_local.add_argument("--tasks", help="Comma-separated task IDs (default: all)")
796
- p_local.add_argument("--max-steps", type=int, default=15, help="Max steps per task")
797
- p_local.add_argument("--agent", default="random", help="Agent type")
798
- p_local.add_argument("--seed", type=int, default=42, help="Random seed")
799
- p_local.add_argument("--output", help="Output JSON path")
800
- p_local.add_argument("--force", action="store_true", help="Force run on non-Windows")
801
- p_local.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
802
-
803
- # Run Azure
804
- p_azure = subparsers.add_parser("run-azure", help="Run evaluation on Azure")
805
- p_azure.add_argument("--config", help="Azure config JSON path")
806
- p_azure.add_argument("--waa-path", help="Path to WAA repository (auto-detected if not specified)")
807
- p_azure.add_argument("--workers", type=int, default=1, help="Number of workers (default: 1 for free trial)")
808
- p_azure.add_argument("--tasks", help="Comma-separated task IDs (default: all)")
809
- p_azure.add_argument("--max-steps", type=int, default=15, help="Max steps per task")
810
- p_azure.add_argument("--agent", default="random", help="Agent type")
811
- p_azure.add_argument("--seed", type=int, default=42, help="Random seed")
812
- p_azure.add_argument("--experiment", default="waa-eval", help="Experiment name")
813
- p_azure.add_argument("--output", help="Output JSON path")
814
- p_azure.add_argument("--yes", "-y", action="store_true", help="Skip confirmation")
815
- p_azure.add_argument("--no-cleanup", action="store_true", help="Don't delete VMs after")
816
- p_azure.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
817
-
818
- # Test mock
819
- p_mock = subparsers.add_parser("test-mock", help="Test with mock adapter")
820
- p_mock.add_argument("--tasks", type=int, default=20, help="Number of mock tasks")
821
- p_mock.add_argument("--max-steps", type=int, default=10, help="Max steps per task")
822
- p_mock.add_argument("--seed", type=int, default=42, help="Random seed")
823
-
824
- # Test collection
825
- p_collection = subparsers.add_parser("test-collection", help="Test benchmark data collection")
826
- p_collection.add_argument("--tasks", type=int, default=5, help="Number of mock tasks (default: 5)")
827
- p_collection.add_argument("--max-steps", type=int, default=10, help="Max steps per task (default: 10)")
828
- p_collection.add_argument("--seed", type=int, default=42, help="Random seed")
829
- p_collection.add_argument("--model-id", default="random-agent-test", help="Model identifier")
830
- p_collection.add_argument("--output", default="benchmark_results", help="Output directory")
831
- p_collection.add_argument("--run-name", help="Run name (default: auto-generated)")
832
-
833
- # Run API-backed evaluation
834
- p_api = subparsers.add_parser("run-api", help="Run evaluation with API-backed VLM (Claude/GPT-5.1)")
835
- p_api.add_argument("--provider", choices=["anthropic", "openai"], default="anthropic",
836
- help="API provider (anthropic=Claude, openai=GPT-5.1)")
837
- p_api.add_argument("--tasks", type=int, default=5, help="Number of mock tasks (default: 5)")
838
- p_api.add_argument("--max-steps", type=int, default=10, help="Max steps per task (default: 10)")
839
- p_api.add_argument("--max-tokens", type=int, default=512, help="Max tokens for API response")
840
- p_api.add_argument("--no-a11y", action="store_true", help="Disable accessibility tree in prompt")
841
- p_api.add_argument("--no-history", action="store_true", help="Disable action history in prompt")
842
- p_api.add_argument("--output", default="benchmark_results", help="Output directory")
843
- p_api.add_argument("--run-name", help="Run name (default: auto-generated)")
844
- p_api.add_argument("--model-id", help="Model identifier (default: {provider}-api)")
845
- p_api.add_argument("--use-real-waa", action="store_true", help="Use real WAA adapter (Windows only)")
846
- p_api.add_argument("--waa-path", help="Path to WAA repository")
847
- p_api.add_argument("--task-ids", help="Comma-separated task IDs for real WAA")
848
- p_api.add_argument("--force", action="store_true", help="Force run on non-Windows")
849
- p_api.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
850
-
851
- # Create config
852
- p_config = subparsers.add_parser("create-config", help="Create sample Azure config")
853
- p_config.add_argument("--output", default="azure_config.json", help="Output path")
1863
+ p_build.set_defaults(func=cmd_build)
854
1864
 
855
- args = parser.parse_args()
1865
+ # start
1866
+ p_start = subparsers.add_parser("start", help="Start WAA container")
1867
+ p_start.add_argument(
1868
+ "--fresh", action="store_true", help="Clean storage for fresh Windows install"
1869
+ )
1870
+ p_start.add_argument(
1871
+ "--no-vnc", action="store_true", help="Don't auto-launch VNC viewer"
1872
+ )
1873
+ p_start.add_argument(
1874
+ "--fast",
1875
+ action="store_true",
1876
+ help="Allocate more CPU/RAM to QEMU (use with D8ds_v5 VM)",
1877
+ )
1878
+ p_start.set_defaults(func=cmd_start)
856
1879
 
857
- if args.command == "setup":
858
- cmd_setup(args)
859
- elif args.command == "status":
860
- cmd_status(args)
861
- elif args.command == "cleanup":
862
- cmd_cleanup(args)
863
- elif args.command == "estimate":
864
- cmd_estimate(args)
865
- elif args.command == "run-local":
866
- setup_logging(getattr(args, 'verbose', False))
867
- cmd_run_local(args)
868
- elif args.command == "run-azure":
869
- setup_logging(getattr(args, 'verbose', False))
870
- cmd_run_azure(args)
871
- elif args.command == "test-mock":
872
- cmd_test_mock(args)
873
- elif args.command == "test-collection":
874
- cmd_test_collection(args)
875
- elif args.command == "run-api":
876
- cmd_run_api(args)
877
- elif args.command == "create-config":
878
- cmd_create_config(args)
879
- else:
880
- parser.print_help()
1880
+ # stop
1881
+ p_stop = subparsers.add_parser("stop", help="Stop and remove WAA container")
1882
+ p_stop.add_argument(
1883
+ "--clean", action="store_true", help="Also clean Windows storage"
1884
+ )
1885
+ p_stop.set_defaults(func=cmd_stop)
1886
+
1887
+ # probe
1888
+ p_probe = subparsers.add_parser("probe", help="Check if WAA server is ready")
1889
+ p_probe.add_argument("--wait", action="store_true", help="Wait until ready")
1890
+ p_probe.add_argument(
1891
+ "--timeout", type=int, default=1200, help="Timeout in seconds (default: 1200)"
1892
+ )
1893
+ p_probe.set_defaults(func=cmd_probe)
1894
+
1895
+ # run
1896
+ p_run = subparsers.add_parser(
1897
+ "run", help="Run benchmark tasks (uses vanilla WAA navi agent)"
1898
+ )
1899
+ p_run.add_argument(
1900
+ "--num-tasks",
1901
+ type=int,
1902
+ default=1,
1903
+ help="Number of tasks to run (ignored if --task specified)",
1904
+ )
1905
+ p_run.add_argument("--task", help="Specific task ID to run")
1906
+ p_run.add_argument(
1907
+ "--domain",
1908
+ default="all",
1909
+ help="Domain filter (e.g., 'notepad', 'chrome', 'all')",
1910
+ )
1911
+ p_run.add_argument(
1912
+ "--model", default="gpt-4o", help="Model for navi agent (default: gpt-4o)"
1913
+ )
1914
+ p_run.add_argument(
1915
+ "--api-key", help="OpenAI API key (or set OPENAI_API_KEY in .env)"
1916
+ )
1917
+ p_run.add_argument(
1918
+ "--no-download", action="store_true", help="Skip downloading results"
1919
+ )
1920
+ p_run.add_argument(
1921
+ "--worker-id",
1922
+ type=int,
1923
+ default=0,
1924
+ help="Worker ID for parallel execution (0-indexed)",
1925
+ )
1926
+ p_run.add_argument(
1927
+ "--num-workers",
1928
+ type=int,
1929
+ default=1,
1930
+ help="Total number of parallel workers",
1931
+ )
1932
+ p_run.set_defaults(func=cmd_run)
1933
+
1934
+ # download
1935
+ p_download = subparsers.add_parser(
1936
+ "download", help="Download benchmark results from VM"
1937
+ )
1938
+ p_download.set_defaults(func=cmd_download)
1939
+
1940
+ # analyze
1941
+ p_analyze = subparsers.add_parser("analyze", help="Analyze benchmark results")
1942
+ p_analyze.add_argument(
1943
+ "--results-dir",
1944
+ help="Results directory (default: most recent in benchmark_results/)",
1945
+ )
1946
+ p_analyze.set_defaults(func=cmd_analyze)
1947
+
1948
+ # tasks
1949
+ p_tasks = subparsers.add_parser("tasks", help="List available WAA benchmark tasks")
1950
+ p_tasks.add_argument(
1951
+ "--verbose", "-v", action="store_true", help="Show all task IDs"
1952
+ )
1953
+ p_tasks.set_defaults(func=cmd_tasks)
1954
+
1955
+ # deallocate
1956
+ p_dealloc = subparsers.add_parser("deallocate", help="Stop VM (preserves disk)")
1957
+ p_dealloc.set_defaults(func=cmd_deallocate)
1958
+
1959
+ # vm-start
1960
+ p_vmstart = subparsers.add_parser("vm-start", help="Start a deallocated VM")
1961
+ p_vmstart.set_defaults(func=cmd_vm_start)
1962
+
1963
+ # logs
1964
+ p_logs = subparsers.add_parser("logs", help="Show WAA status and logs")
1965
+ p_logs.add_argument(
1966
+ "--follow", "-f", action="store_true", help="Stream docker logs continuously"
1967
+ )
1968
+ p_logs.add_argument(
1969
+ "--tail", "-n", type=int, help="Number of log lines to show (default: 20)"
1970
+ )
1971
+ p_logs.add_argument(
1972
+ "--run",
1973
+ action="store_true",
1974
+ help="Show run command output instead of container logs",
1975
+ )
1976
+ p_logs.add_argument(
1977
+ "--progress",
1978
+ "-p",
1979
+ action="store_true",
1980
+ help="Show benchmark progress and estimated completion time",
1981
+ )
1982
+ p_logs.set_defaults(func=cmd_logs)
1983
+
1984
+ # exec
1985
+ p_exec = subparsers.add_parser("exec", help="Run command on VM host")
1986
+ p_exec.add_argument("--cmd", required=True, help="Command to run")
1987
+ p_exec.set_defaults(func=cmd_exec)
1988
+
1989
+ # docker-exec
1990
+ p_dexec = subparsers.add_parser(
1991
+ "docker-exec", help="Run command inside winarena container"
1992
+ )
1993
+ p_dexec.add_argument("--cmd", required=True, help="Command to run")
1994
+ p_dexec.set_defaults(func=cmd_docker_exec)
1995
+
1996
+ # vnc
1997
+ p_vnc = subparsers.add_parser(
1998
+ "vnc", help="Open VNC to view Windows desktop via SSH tunnel"
1999
+ )
2000
+ p_vnc.set_defaults(func=cmd_vnc)
2001
+
2002
+ args = parser.parse_args()
2003
+ sys.exit(args.func(args))
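# The dispatch pattern used above, in miniature: each subparser binds its
# handler via set_defaults(func=...), so main() needs no if/elif chain
# (self-contained sketch, not part of the diff):
import argparse
import sys

def cmd_hello(args):
    print(f"hello {args.name}")
    return 0

parser = argparse.ArgumentParser()
sub = parser.add_subparsers(dest="command", required=True)
p = sub.add_parser("hello")
p.add_argument("--name", default="world")
p.set_defaults(func=cmd_hello)

args = parser.parse_args(["hello", "--name", "WAA"])
sys.exit(args.func(args))  # process exit code is the handler's return value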
881
2004
 
882
2005
 
883
2006
  if __name__ == "__main__":