openadapt-ml 0.1.0__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openadapt_ml/baselines/__init__.py +121 -0
- openadapt_ml/baselines/adapter.py +185 -0
- openadapt_ml/baselines/cli.py +314 -0
- openadapt_ml/baselines/config.py +448 -0
- openadapt_ml/baselines/parser.py +922 -0
- openadapt_ml/baselines/prompts.py +787 -0
- openadapt_ml/benchmarks/__init__.py +13 -107
- openadapt_ml/benchmarks/agent.py +297 -374
- openadapt_ml/benchmarks/azure.py +62 -24
- openadapt_ml/benchmarks/azure_ops_tracker.py +521 -0
- openadapt_ml/benchmarks/cli.py +1874 -751
- openadapt_ml/benchmarks/trace_export.py +631 -0
- openadapt_ml/benchmarks/viewer.py +1236 -0
- openadapt_ml/benchmarks/vm_monitor.py +1111 -0
- openadapt_ml/benchmarks/waa_deploy/Dockerfile +216 -0
- openadapt_ml/benchmarks/waa_deploy/__init__.py +10 -0
- openadapt_ml/benchmarks/waa_deploy/api_agent.py +540 -0
- openadapt_ml/benchmarks/waa_deploy/start_waa_server.bat +53 -0
- openadapt_ml/cloud/azure_inference.py +3 -5
- openadapt_ml/cloud/lambda_labs.py +722 -307
- openadapt_ml/cloud/local.py +3194 -89
- openadapt_ml/cloud/ssh_tunnel.py +595 -0
- openadapt_ml/datasets/next_action.py +125 -96
- openadapt_ml/evals/grounding.py +32 -9
- openadapt_ml/evals/plot_eval_metrics.py +15 -13
- openadapt_ml/evals/trajectory_matching.py +120 -57
- openadapt_ml/experiments/demo_prompt/__init__.py +19 -0
- openadapt_ml/experiments/demo_prompt/format_demo.py +236 -0
- openadapt_ml/experiments/demo_prompt/results/experiment_20251231_002125.json +83 -0
- openadapt_ml/experiments/demo_prompt/results/experiment_n30_20251231_165958.json +1100 -0
- openadapt_ml/experiments/demo_prompt/results/multistep_20251231_025051.json +182 -0
- openadapt_ml/experiments/demo_prompt/run_experiment.py +541 -0
- openadapt_ml/experiments/representation_shootout/__init__.py +70 -0
- openadapt_ml/experiments/representation_shootout/conditions.py +708 -0
- openadapt_ml/experiments/representation_shootout/config.py +390 -0
- openadapt_ml/experiments/representation_shootout/evaluator.py +659 -0
- openadapt_ml/experiments/representation_shootout/runner.py +687 -0
- openadapt_ml/experiments/waa_demo/__init__.py +10 -0
- openadapt_ml/experiments/waa_demo/demos.py +357 -0
- openadapt_ml/experiments/waa_demo/runner.py +732 -0
- openadapt_ml/experiments/waa_demo/tasks.py +151 -0
- openadapt_ml/export/__init__.py +9 -0
- openadapt_ml/export/__main__.py +6 -0
- openadapt_ml/export/cli.py +89 -0
- openadapt_ml/export/parquet.py +277 -0
- openadapt_ml/grounding/detector.py +18 -14
- openadapt_ml/ingest/__init__.py +11 -10
- openadapt_ml/ingest/capture.py +97 -86
- openadapt_ml/ingest/loader.py +120 -69
- openadapt_ml/ingest/synthetic.py +344 -193
- openadapt_ml/models/api_adapter.py +14 -4
- openadapt_ml/models/base_adapter.py +10 -2
- openadapt_ml/models/providers/__init__.py +288 -0
- openadapt_ml/models/providers/anthropic.py +266 -0
- openadapt_ml/models/providers/base.py +299 -0
- openadapt_ml/models/providers/google.py +376 -0
- openadapt_ml/models/providers/openai.py +342 -0
- openadapt_ml/models/qwen_vl.py +46 -19
- openadapt_ml/perception/__init__.py +35 -0
- openadapt_ml/perception/integration.py +399 -0
- openadapt_ml/retrieval/README.md +226 -0
- openadapt_ml/retrieval/USAGE.md +391 -0
- openadapt_ml/retrieval/__init__.py +91 -0
- openadapt_ml/retrieval/demo_retriever.py +843 -0
- openadapt_ml/retrieval/embeddings.py +630 -0
- openadapt_ml/retrieval/index.py +194 -0
- openadapt_ml/retrieval/retriever.py +162 -0
- openadapt_ml/runtime/__init__.py +50 -0
- openadapt_ml/runtime/policy.py +27 -14
- openadapt_ml/runtime/safety_gate.py +471 -0
- openadapt_ml/schema/__init__.py +113 -0
- openadapt_ml/schema/converters.py +588 -0
- openadapt_ml/schema/episode.py +470 -0
- openadapt_ml/scripts/capture_screenshots.py +530 -0
- openadapt_ml/scripts/compare.py +102 -61
- openadapt_ml/scripts/demo_policy.py +4 -1
- openadapt_ml/scripts/eval_policy.py +19 -14
- openadapt_ml/scripts/make_gif.py +1 -1
- openadapt_ml/scripts/prepare_synthetic.py +16 -17
- openadapt_ml/scripts/train.py +98 -75
- openadapt_ml/segmentation/README.md +920 -0
- openadapt_ml/segmentation/__init__.py +97 -0
- openadapt_ml/segmentation/adapters/__init__.py +5 -0
- openadapt_ml/segmentation/adapters/capture_adapter.py +420 -0
- openadapt_ml/segmentation/annotator.py +610 -0
- openadapt_ml/segmentation/cache.py +290 -0
- openadapt_ml/segmentation/cli.py +674 -0
- openadapt_ml/segmentation/deduplicator.py +656 -0
- openadapt_ml/segmentation/frame_describer.py +788 -0
- openadapt_ml/segmentation/pipeline.py +340 -0
- openadapt_ml/segmentation/schemas.py +622 -0
- openadapt_ml/segmentation/segment_extractor.py +634 -0
- openadapt_ml/training/azure_ops_viewer.py +1097 -0
- openadapt_ml/training/benchmark_viewer.py +3255 -19
- openadapt_ml/training/shared_ui.py +7 -7
- openadapt_ml/training/stub_provider.py +57 -35
- openadapt_ml/training/trainer.py +255 -441
- openadapt_ml/training/trl_trainer.py +403 -0
- openadapt_ml/training/viewer.py +323 -108
- openadapt_ml/training/viewer_components.py +180 -0
- {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.1.dist-info}/METADATA +312 -69
- openadapt_ml-0.2.1.dist-info/RECORD +116 -0
- openadapt_ml/benchmarks/base.py +0 -366
- openadapt_ml/benchmarks/data_collection.py +0 -432
- openadapt_ml/benchmarks/runner.py +0 -381
- openadapt_ml/benchmarks/waa.py +0 -704
- openadapt_ml/schemas/__init__.py +0 -53
- openadapt_ml/schemas/sessions.py +0 -122
- openadapt_ml/schemas/validation.py +0 -252
- openadapt_ml-0.1.0.dist-info/RECORD +0 -55
- {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.1.dist-info}/WHEEL +0 -0
- {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.1.dist-info}/licenses/LICENSE +0 -0
openadapt_ml/benchmarks/cli.py
CHANGED
|
@@ -1,883 +1,2006 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
# Estimate costs
|
|
5
|
-
python -m openadapt_ml.benchmarks.cli estimate --workers 40
|
|
6
|
-
|
|
7
|
-
# Run local evaluation (Windows only)
|
|
8
|
-
python -m openadapt_ml.benchmarks.cli run-local --waa-path /path/to/WAA --tasks notepad_1,notepad_2
|
|
9
|
-
|
|
10
|
-
# Run Azure evaluation
|
|
11
|
-
python -m openadapt_ml.benchmarks.cli run-azure --config azure_config.json --workers 40
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
WAA Benchmark CLI - Windows Agent Arena evaluation toolkit
|
|
12
4
|
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
python -m openadapt_ml.benchmarks.cli run-api --provider openai --tasks 5
|
|
5
|
+
Uses custom waa_deploy/Dockerfile with dockurr/windows:latest base and
|
|
6
|
+
Python 3.9 from vanilla windowsarena/winarena for GroundingDINO compatibility.
|
|
16
7
|
|
|
17
|
-
|
|
18
|
-
python -m openadapt_ml.benchmarks.cli test-mock --tasks 20
|
|
8
|
+
See waa_deploy/Dockerfile for details.
|
|
19
9
|
|
|
20
|
-
|
|
21
|
-
python -m openadapt_ml.benchmarks.cli
|
|
10
|
+
Usage:
|
|
11
|
+
uv run python -m openadapt_ml.benchmarks.cli <command> [options]
|
|
12
|
+
|
|
13
|
+
Commands:
|
|
14
|
+
create Create Azure VM with nested virtualization
|
|
15
|
+
delete Delete VM and ALL associated resources
|
|
16
|
+
status Show VM state and IP
|
|
17
|
+
build Build WAA image from waa_deploy/Dockerfile
|
|
18
|
+
start Start WAA container (Windows boots + WAA server)
|
|
19
|
+
probe Check if WAA server is ready
|
|
20
|
+
run Run benchmark tasks
|
|
21
|
+
deallocate Stop VM (preserves disk, stops billing)
|
|
22
|
+
logs Show WAA status and logs
|
|
23
|
+
|
|
24
|
+
Workflow:
|
|
25
|
+
1. create - Create Azure VM (~5 min)
|
|
26
|
+
2. build - Build custom WAA image (~10 min)
|
|
27
|
+
3. start - Start container, Windows downloads+boots (~15-20 min first time)
|
|
28
|
+
4. probe --wait - Wait for WAA server
|
|
29
|
+
5. run - Run benchmark
|
|
30
|
+
6. deallocate - Stop billing
|
|
22
31
|
"""
|
|
23
32
|
|
|
24
|
-
from __future__ import annotations
|
|
25
|
-
|
|
26
33
|
import argparse
|
|
27
34
|
import json
|
|
28
|
-
import
|
|
35
|
+
import subprocess
|
|
29
36
|
import sys
|
|
37
|
+
import time
|
|
38
|
+
import webbrowser
|
|
39
|
+
from datetime import datetime
|
|
30
40
|
from pathlib import Path
|
|
41
|
+
from typing import Optional
|
|
42
|
+
|
|
43
|
+
# =============================================================================
|
|
44
|
+
# Constants (single source of truth)
|
|
45
|
+
# =============================================================================
|
|
46
|
+
|
|
47
|
+
# VM sizes with nested virtualization support
|
|
48
|
+
# Standard: $0.19/hr, 4 vCPU, 16GB RAM - baseline
|
|
49
|
+
# Fast: $0.38/hr, 8 vCPU, 32GB RAM - ~30% faster install, ~40% faster eval
|
|
50
|
+
VM_SIZE_STANDARD = "Standard_D4ds_v4"
|
|
51
|
+
VM_SIZE_FAST = "Standard_D8ds_v5"
|
|
52
|
+
VM_SIZE = VM_SIZE_STANDARD # Default, can be overridden by --fast flag
|
|
53
|
+
|
|
54
|
+
# Fallback sizes for --fast mode (in order of preference)
|
|
55
|
+
# D8ds_v5: First choice (v5 with local SSD)
|
|
56
|
+
# D8s_v5: v5 without local SSD
|
|
57
|
+
# D8ds_v4: v4 with local SSD
|
|
58
|
+
# D8as_v5: AMD version
|
|
59
|
+
VM_SIZE_FAST_FALLBACKS = [
|
|
60
|
+
("Standard_D8ds_v5", 0.38),
|
|
61
|
+
("Standard_D8s_v5", 0.36),
|
|
62
|
+
("Standard_D8ds_v4", 0.38),
|
|
63
|
+
("Standard_D8as_v5", 0.34),
|
|
64
|
+
]
|
|
65
|
+
VM_REGIONS = ["centralus", "eastus", "westus2", "eastus2"]
|
|
66
|
+
VM_NAME = "waa-eval-vm"
|
|
67
|
+
RESOURCE_GROUP = "openadapt-agents"
|
|
68
|
+
# Custom image built from waa_deploy/Dockerfile
|
|
69
|
+
# Uses dockurr/windows:latest (proper ISO download) + WAA components
|
|
70
|
+
DOCKER_IMAGE = "waa-auto:latest"
|
|
71
|
+
LOG_DIR = Path.home() / ".openadapt" / "waa"
|
|
72
|
+
SSH_OPTS = [
|
|
73
|
+
"-o",
|
|
74
|
+
"StrictHostKeyChecking=no",
|
|
75
|
+
"-o",
|
|
76
|
+
"UserKnownHostsFile=/dev/null",
|
|
77
|
+
"-o",
|
|
78
|
+
"LogLevel=ERROR",
|
|
79
|
+
"-o",
|
|
80
|
+
"ConnectTimeout=10",
|
|
81
|
+
]
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def setup_vnc_tunnel_and_browser(ip: str) -> Optional[subprocess.Popen]:
|
|
85
|
+
"""Set up SSH tunnel for VNC and open browser.
|
|
86
|
+
|
|
87
|
+
Returns the tunnel process on success, None on failure.
|
|
88
|
+
"""
|
|
89
|
+
# Kill any existing tunnel on port 8006
|
|
90
|
+
subprocess.run(["pkill", "-f", "ssh.*8006:localhost:8006"], capture_output=True)
|
|
91
|
+
|
|
92
|
+
# Start SSH tunnel in background
|
|
93
|
+
tunnel_proc = subprocess.Popen(
|
|
94
|
+
["ssh", *SSH_OPTS, "-N", "-L", "8006:localhost:8006", f"azureuser@{ip}"],
|
|
95
|
+
stdout=subprocess.DEVNULL,
|
|
96
|
+
stderr=subprocess.DEVNULL,
|
|
97
|
+
)
|
|
31
98
|
|
|
32
|
-
|
|
99
|
+
# Wait for tunnel to establish
|
|
100
|
+
time.sleep(2)
|
|
33
101
|
|
|
34
|
-
#
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
logging.getLogger("azure.ai.ml").setLevel(logging.WARNING)
|
|
38
|
-
logging.getLogger("urllib3").setLevel(logging.WARNING)
|
|
39
|
-
logging.getLogger("msrest").setLevel(logging.WARNING)
|
|
40
|
-
logging.getLogger("openadapt_ml.benchmarks.azure").setLevel(logging.WARNING)
|
|
102
|
+
# Check if tunnel is running
|
|
103
|
+
if tunnel_proc.poll() is not None:
|
|
104
|
+
return None
|
|
41
105
|
|
|
42
|
-
#
|
|
43
|
-
|
|
44
|
-
|
|
106
|
+
# Open browser
|
|
107
|
+
vnc_url = "http://localhost:8006"
|
|
108
|
+
webbrowser.open(vnc_url)
|
|
45
109
|
|
|
110
|
+
return tunnel_proc
|
|
46
111
|
|
|
47
|
-
def setup_logging(verbose: bool = False) -> None:
|
|
48
|
-
"""Configure logging with appropriate verbosity.
|
|
49
112
|
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
113
|
+
# Dockerfile location (relative to this file)
|
|
114
|
+
DOCKERFILE_PATH = Path(__file__).parent / "waa_deploy" / "Dockerfile"
|
|
115
|
+
|
|
116
|
+
# =============================================================================
|
|
117
|
+
# Logging
|
|
118
|
+
# =============================================================================
|
|
119
|
+
|
|
120
|
+
_log_file: Optional[Path] = None
|
|
121
|
+
_session_id: Optional[str] = None
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def init_logging() -> Path:
|
|
125
|
+
"""Initialize logging for this session."""
|
|
126
|
+
global _log_file, _session_id
|
|
127
|
+
|
|
128
|
+
LOG_DIR.mkdir(parents=True, exist_ok=True)
|
|
129
|
+
|
|
130
|
+
# Create session ID
|
|
131
|
+
_session_id = datetime.now().strftime("%Y-%m-%d_%H%M%S")
|
|
132
|
+
session_dir = LOG_DIR / "sessions" / _session_id
|
|
133
|
+
session_dir.mkdir(parents=True, exist_ok=True)
|
|
134
|
+
|
|
135
|
+
# Session log file
|
|
136
|
+
_log_file = session_dir / "full.log"
|
|
137
|
+
|
|
138
|
+
# Update current session pointer
|
|
139
|
+
(LOG_DIR / "session_id.txt").write_text(_session_id)
|
|
140
|
+
|
|
141
|
+
# Symlink for easy access
|
|
142
|
+
current_link = LOG_DIR / "current"
|
|
143
|
+
if current_link.exists() or current_link.is_symlink():
|
|
144
|
+
current_link.unlink()
|
|
145
|
+
current_link.symlink_to(session_dir)
|
|
146
|
+
|
|
147
|
+
return _log_file
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def log(step: str, message: str, end: str = "\n"):
|
|
151
|
+
"""Log message to file and stdout."""
|
|
152
|
+
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
|
153
|
+
formatted = f"[{timestamp}] [{step}] {message}"
|
|
154
|
+
|
|
155
|
+
# Print to stdout
|
|
156
|
+
print(formatted, end=end, flush=True)
|
|
157
|
+
|
|
158
|
+
# Write to log file
|
|
159
|
+
if _log_file:
|
|
160
|
+
with open(_log_file, "a") as f:
|
|
161
|
+
f.write(formatted + end)
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def log_stream(step: str, process: subprocess.Popen):
|
|
165
|
+
"""Stream process output to log and stdout."""
|
|
166
|
+
if process.stdout:
|
|
167
|
+
for line in iter(process.stdout.readline, ""):
|
|
168
|
+
if line:
|
|
169
|
+
log(step, line.rstrip())
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
# =============================================================================
|
|
173
|
+
# Azure Helpers
|
|
174
|
+
# =============================================================================
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def get_vm_ip() -> Optional[str]:
|
|
178
|
+
"""Get VM public IP if it exists."""
|
|
179
|
+
result = subprocess.run(
|
|
180
|
+
[
|
|
181
|
+
"az",
|
|
182
|
+
"vm",
|
|
183
|
+
"show",
|
|
184
|
+
"-d",
|
|
185
|
+
"-g",
|
|
186
|
+
RESOURCE_GROUP,
|
|
187
|
+
"-n",
|
|
188
|
+
VM_NAME,
|
|
189
|
+
"--query",
|
|
190
|
+
"publicIps",
|
|
191
|
+
"-o",
|
|
192
|
+
"tsv",
|
|
193
|
+
],
|
|
194
|
+
capture_output=True,
|
|
195
|
+
text=True,
|
|
57
196
|
)
|
|
197
|
+
if result.returncode == 0 and result.stdout.strip():
|
|
198
|
+
return result.stdout.strip()
|
|
199
|
+
return None
|
|
58
200
|
|
|
59
|
-
# Suppress noisy Azure SDK logs unless verbose
|
|
60
|
-
if not verbose:
|
|
61
|
-
logging.getLogger("azure").setLevel(logging.WARNING)
|
|
62
|
-
logging.getLogger("azure.core.pipeline.policies.http_logging_policy").setLevel(logging.WARNING)
|
|
63
|
-
logging.getLogger("urllib3").setLevel(logging.WARNING)
|
|
64
|
-
logging.getLogger("msrest").setLevel(logging.WARNING)
|
|
65
201
|
|
|
202
|
+
def get_vm_state() -> Optional[str]:
|
|
203
|
+
"""Get VM power state."""
|
|
204
|
+
result = subprocess.run(
|
|
205
|
+
[
|
|
206
|
+
"az",
|
|
207
|
+
"vm",
|
|
208
|
+
"get-instance-view",
|
|
209
|
+
"-g",
|
|
210
|
+
RESOURCE_GROUP,
|
|
211
|
+
"-n",
|
|
212
|
+
VM_NAME,
|
|
213
|
+
"--query",
|
|
214
|
+
"instanceView.statuses[1].displayStatus",
|
|
215
|
+
"-o",
|
|
216
|
+
"tsv",
|
|
217
|
+
],
|
|
218
|
+
capture_output=True,
|
|
219
|
+
text=True,
|
|
220
|
+
)
|
|
221
|
+
if result.returncode == 0 and result.stdout.strip():
|
|
222
|
+
return result.stdout.strip()
|
|
223
|
+
return None
|
|
66
224
|
|
|
67
|
-
def find_waa_path() -> Path | None:
|
|
68
|
-
"""Auto-detect Windows Agent Arena repository path.
|
|
69
225
|
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
226
|
+
def ssh_run(
|
|
227
|
+
ip: str, cmd: str, stream: bool = False, step: str = "SSH"
|
|
228
|
+
) -> subprocess.CompletedProcess:
|
|
229
|
+
"""Run command on VM via SSH.
|
|
74
230
|
|
|
75
|
-
|
|
76
|
-
|
|
231
|
+
When stream=True:
|
|
232
|
+
1. Runs command on VM with output redirected to a persistent log file
|
|
233
|
+
2. Streams that log file locally in real-time
|
|
234
|
+
3. Log file persists on VM even if connection breaks
|
|
235
|
+
|
|
236
|
+
Remote logs are stored at: /home/azureuser/cli_logs/{step}.log
|
|
77
237
|
"""
|
|
78
|
-
|
|
79
|
-
|
|
238
|
+
if stream:
|
|
239
|
+
# Remote log directory and file (persistent across sessions)
|
|
240
|
+
remote_log_dir = "/home/azureuser/cli_logs"
|
|
241
|
+
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
|
242
|
+
remote_log = f"{remote_log_dir}/{step.lower()}_{timestamp}.log"
|
|
243
|
+
|
|
244
|
+
# Ensure log directory exists
|
|
245
|
+
subprocess.run(
|
|
246
|
+
["ssh", *SSH_OPTS, f"azureuser@{ip}", f"mkdir -p {remote_log_dir}"],
|
|
247
|
+
capture_output=True,
|
|
248
|
+
)
|
|
80
249
|
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
250
|
+
log(step, f"Remote log: {remote_log}")
|
|
251
|
+
|
|
252
|
+
# Run command with output to log file, capturing exit code
|
|
253
|
+
# Using script to capture terminal output including \r progress updates
|
|
254
|
+
# The command runs in foreground but output goes to file AND stdout
|
|
255
|
+
wrapped_cmd = f"""
|
|
256
|
+
set -o pipefail
|
|
257
|
+
{{
|
|
258
|
+
{cmd}
|
|
259
|
+
echo $? > {remote_log}.exit
|
|
260
|
+
}} 2>&1 | tee {remote_log}
|
|
261
|
+
"""
|
|
262
|
+
full_cmd = ["ssh", *SSH_OPTS, f"azureuser@{ip}", wrapped_cmd]
|
|
263
|
+
|
|
264
|
+
process = subprocess.Popen(
|
|
265
|
+
full_cmd,
|
|
266
|
+
stdout=subprocess.PIPE,
|
|
267
|
+
stderr=subprocess.STDOUT,
|
|
268
|
+
text=True,
|
|
269
|
+
bufsize=1,
|
|
270
|
+
)
|
|
86
271
|
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
272
|
+
# Stream output to local log
|
|
273
|
+
try:
|
|
274
|
+
for line in iter(process.stdout.readline, ""):
|
|
275
|
+
if line:
|
|
276
|
+
# Handle carriage returns (Docker progress)
|
|
277
|
+
clean_line = line.rstrip()
|
|
278
|
+
if "\r" in clean_line:
|
|
279
|
+
# Take the last part after \r
|
|
280
|
+
parts = clean_line.split("\r")
|
|
281
|
+
clean_line = parts[-1].strip()
|
|
282
|
+
if clean_line:
|
|
283
|
+
log(step, clean_line)
|
|
284
|
+
process.wait()
|
|
285
|
+
except KeyboardInterrupt:
|
|
286
|
+
log(step, "Interrupted - command continues on VM")
|
|
287
|
+
log(step, f"View full log: ssh azureuser@{ip} 'cat {remote_log}'")
|
|
288
|
+
process.terminate()
|
|
289
|
+
return subprocess.CompletedProcess(cmd, 130, "", "")
|
|
290
|
+
|
|
291
|
+
# Get exit code
|
|
292
|
+
result = subprocess.run(
|
|
293
|
+
[
|
|
294
|
+
"ssh",
|
|
295
|
+
*SSH_OPTS,
|
|
296
|
+
f"azureuser@{ip}",
|
|
297
|
+
f"cat {remote_log}.exit 2>/dev/null || echo 1",
|
|
298
|
+
],
|
|
299
|
+
capture_output=True,
|
|
300
|
+
text=True,
|
|
301
|
+
)
|
|
302
|
+
exit_code = int(result.stdout.strip()) if result.stdout.strip().isdigit() else 1
|
|
90
303
|
|
|
91
|
-
|
|
304
|
+
if exit_code != 0:
|
|
305
|
+
log(step, f"Command failed (exit {exit_code})")
|
|
306
|
+
log(step, f"Full log: ssh azureuser@{ip} 'cat {remote_log}'")
|
|
92
307
|
|
|
308
|
+
return subprocess.CompletedProcess(cmd, exit_code, "", "")
|
|
309
|
+
else:
|
|
310
|
+
full_cmd = ["ssh", *SSH_OPTS, f"azureuser@{ip}", cmd]
|
|
311
|
+
return subprocess.run(full_cmd, capture_output=True, text=True)
|
|
312
|
+
|
|
313
|
+
|
|
314
|
+
def wait_for_ssh(ip: str, timeout: int = 120) -> bool:
|
|
315
|
+
"""Wait for SSH to become available."""
|
|
316
|
+
start = time.time()
|
|
317
|
+
while time.time() - start < timeout:
|
|
318
|
+
result = subprocess.run(
|
|
319
|
+
["ssh", *SSH_OPTS, f"azureuser@{ip}", "echo ok"],
|
|
320
|
+
capture_output=True,
|
|
321
|
+
text=True,
|
|
322
|
+
timeout=15,
|
|
323
|
+
)
|
|
324
|
+
if result.returncode == 0:
|
|
325
|
+
return True
|
|
326
|
+
time.sleep(5)
|
|
327
|
+
return False
|
|
328
|
+
|
|
329
|
+
|
|
330
|
+
# =============================================================================
|
|
331
|
+
# Commands
|
|
332
|
+
# =============================================================================
|
|
333
|
+
|
|
334
|
+
|
|
335
|
+
def cmd_create(args):
|
|
336
|
+
"""Create Azure VM with nested virtualization."""
|
|
337
|
+
init_logging()
|
|
338
|
+
|
|
339
|
+
# Check if VM already exists
|
|
340
|
+
ip = get_vm_ip()
|
|
341
|
+
if ip:
|
|
342
|
+
log("CREATE", f"VM already exists: {ip}")
|
|
343
|
+
log("CREATE", "Use 'delete' first if you want to recreate")
|
|
344
|
+
return 0
|
|
345
|
+
|
|
346
|
+
# Determine which sizes to try
|
|
347
|
+
use_fast = getattr(args, "fast", False)
|
|
348
|
+
if use_fast:
|
|
349
|
+
# Try multiple fast sizes with fallbacks
|
|
350
|
+
sizes_to_try = VM_SIZE_FAST_FALLBACKS
|
|
351
|
+
log(
|
|
352
|
+
"CREATE",
|
|
353
|
+
f"Creating VM '{VM_NAME}' with --fast (trying multiple D8 sizes)...",
|
|
354
|
+
)
|
|
355
|
+
else:
|
|
356
|
+
# Standard mode: single size
|
|
357
|
+
sizes_to_try = [(VM_SIZE_STANDARD, 0.19)]
|
|
358
|
+
log("CREATE", f"Creating VM '{VM_NAME}' ({VM_SIZE_STANDARD}, $0.19/hr)...")
|
|
359
|
+
|
|
360
|
+
# Try size+region combinations until one works
|
|
361
|
+
vm_created = False
|
|
362
|
+
successful_size = None
|
|
363
|
+
successful_cost = None
|
|
364
|
+
|
|
365
|
+
for vm_size, cost_per_hour in sizes_to_try:
|
|
366
|
+
log("CREATE", f"Trying size {vm_size} (${cost_per_hour:.2f}/hr)...")
|
|
367
|
+
|
|
368
|
+
for region in VM_REGIONS:
|
|
369
|
+
log("CREATE", f" {region}...", end=" ")
|
|
370
|
+
|
|
371
|
+
result = subprocess.run(
|
|
372
|
+
[
|
|
373
|
+
"az",
|
|
374
|
+
"vm",
|
|
375
|
+
"create",
|
|
376
|
+
"--resource-group",
|
|
377
|
+
RESOURCE_GROUP,
|
|
378
|
+
"--name",
|
|
379
|
+
VM_NAME,
|
|
380
|
+
"--location",
|
|
381
|
+
region,
|
|
382
|
+
"--image",
|
|
383
|
+
"Ubuntu2204",
|
|
384
|
+
"--size",
|
|
385
|
+
vm_size,
|
|
386
|
+
"--admin-username",
|
|
387
|
+
"azureuser",
|
|
388
|
+
"--generate-ssh-keys",
|
|
389
|
+
"--public-ip-sku",
|
|
390
|
+
"Standard",
|
|
391
|
+
],
|
|
392
|
+
capture_output=True,
|
|
393
|
+
text=True,
|
|
394
|
+
)
|
|
93
395
|
|
|
94
|
-
|
|
95
|
-
|
|
396
|
+
if result.returncode == 0:
|
|
397
|
+
vm_info = json.loads(result.stdout)
|
|
398
|
+
ip = vm_info.get("publicIpAddress", "")
|
|
399
|
+
log("CREATE", f"created ({ip})")
|
|
400
|
+
vm_created = True
|
|
401
|
+
successful_size = vm_size
|
|
402
|
+
successful_cost = cost_per_hour
|
|
403
|
+
break
|
|
404
|
+
else:
|
|
405
|
+
log("CREATE", "unavailable")
|
|
406
|
+
|
|
407
|
+
if vm_created:
|
|
408
|
+
break
|
|
409
|
+
|
|
410
|
+
if not vm_created:
|
|
411
|
+
log("CREATE", "ERROR: Could not create VM in any region with any size")
|
|
412
|
+
if use_fast:
|
|
413
|
+
log("CREATE", "Tried sizes: " + ", ".join(s[0] for s in sizes_to_try))
|
|
414
|
+
return 1
|
|
415
|
+
|
|
416
|
+
log(
|
|
417
|
+
"CREATE",
|
|
418
|
+
f"Successfully created {successful_size} (${successful_cost:.2f}/hr) in {region}",
|
|
419
|
+
)
|
|
96
420
|
|
|
97
|
-
|
|
98
|
-
|
|
421
|
+
# Wait for SSH
|
|
422
|
+
log("CREATE", "Waiting for SSH...")
|
|
423
|
+
if not wait_for_ssh(ip):
|
|
424
|
+
log("CREATE", "ERROR: SSH not available after 2 minutes")
|
|
425
|
+
return 1
|
|
426
|
+
log("CREATE", "SSH ready")
|
|
427
|
+
|
|
428
|
+
# Install Docker with /mnt storage
|
|
429
|
+
log("CREATE", "Installing Docker with /mnt storage...")
|
|
430
|
+
docker_setup = """
|
|
431
|
+
set -e
|
|
432
|
+
sudo apt-get update -qq
|
|
433
|
+
sudo apt-get install -y -qq docker.io
|
|
434
|
+
sudo systemctl start docker
|
|
435
|
+
sudo systemctl enable docker
|
|
436
|
+
sudo usermod -aG docker $USER
|
|
437
|
+
|
|
438
|
+
# Configure Docker to use /mnt (larger temp disk)
|
|
439
|
+
sudo systemctl stop docker
|
|
440
|
+
sudo mkdir -p /mnt/docker
|
|
441
|
+
sudo bash -c 'echo "{\\"data-root\\": \\"/mnt/docker\\"}" > /etc/docker/daemon.json'
|
|
442
|
+
sudo systemctl start docker
|
|
443
|
+
|
|
444
|
+
# Verify
|
|
445
|
+
docker --version
|
|
446
|
+
df -h /mnt
|
|
447
|
+
"""
|
|
448
|
+
result = ssh_run(ip, docker_setup, stream=True, step="CREATE")
|
|
449
|
+
if result.returncode != 0:
|
|
450
|
+
log("CREATE", "ERROR: Docker setup failed")
|
|
451
|
+
return 1
|
|
452
|
+
|
|
453
|
+
log("CREATE", f"VM ready: {ip}")
|
|
454
|
+
return 0
|
|
455
|
+
|
|
456
|
+
|
|
457
|
+
def cmd_delete(args):
|
|
458
|
+
"""Delete VM and ALL associated resources."""
|
|
459
|
+
init_logging()
|
|
460
|
+
log("DELETE", f"Deleting VM '{VM_NAME}' and all associated resources...")
|
|
461
|
+
|
|
462
|
+
# Delete VM
|
|
463
|
+
log("DELETE", "Deleting VM...")
|
|
464
|
+
result = subprocess.run(
|
|
465
|
+
[
|
|
466
|
+
"az",
|
|
467
|
+
"vm",
|
|
468
|
+
"delete",
|
|
469
|
+
"-g",
|
|
470
|
+
RESOURCE_GROUP,
|
|
471
|
+
"-n",
|
|
472
|
+
VM_NAME,
|
|
473
|
+
"--yes",
|
|
474
|
+
"--force-deletion",
|
|
475
|
+
"true",
|
|
476
|
+
],
|
|
477
|
+
capture_output=True,
|
|
478
|
+
text=True,
|
|
479
|
+
)
|
|
480
|
+
if result.returncode == 0:
|
|
481
|
+
log("DELETE", "VM deleted")
|
|
482
|
+
else:
|
|
483
|
+
log("DELETE", "VM not found or already deleted")
|
|
484
|
+
|
|
485
|
+
# Delete NICs
|
|
486
|
+
log("DELETE", "Deleting NICs...")
|
|
487
|
+
result = subprocess.run(
|
|
488
|
+
[
|
|
489
|
+
"az",
|
|
490
|
+
"network",
|
|
491
|
+
"nic",
|
|
492
|
+
"list",
|
|
493
|
+
"-g",
|
|
494
|
+
RESOURCE_GROUP,
|
|
495
|
+
"--query",
|
|
496
|
+
"[?contains(name, 'waa')].name",
|
|
497
|
+
"-o",
|
|
498
|
+
"tsv",
|
|
499
|
+
],
|
|
500
|
+
capture_output=True,
|
|
501
|
+
text=True,
|
|
502
|
+
)
|
|
503
|
+
for nic in result.stdout.strip().split("\n"):
|
|
504
|
+
if nic:
|
|
505
|
+
subprocess.run(
|
|
506
|
+
["az", "network", "nic", "delete", "-g", RESOURCE_GROUP, "-n", nic],
|
|
507
|
+
capture_output=True,
|
|
508
|
+
)
|
|
509
|
+
log("DELETE", f" Deleted NIC: {nic}")
|
|
510
|
+
|
|
511
|
+
# Delete public IPs
|
|
512
|
+
log("DELETE", "Deleting public IPs...")
|
|
513
|
+
result = subprocess.run(
|
|
514
|
+
[
|
|
515
|
+
"az",
|
|
516
|
+
"network",
|
|
517
|
+
"public-ip",
|
|
518
|
+
"list",
|
|
519
|
+
"-g",
|
|
520
|
+
RESOURCE_GROUP,
|
|
521
|
+
"--query",
|
|
522
|
+
"[?contains(name, 'waa')].name",
|
|
523
|
+
"-o",
|
|
524
|
+
"tsv",
|
|
525
|
+
],
|
|
526
|
+
capture_output=True,
|
|
527
|
+
text=True,
|
|
528
|
+
)
|
|
529
|
+
for pip in result.stdout.strip().split("\n"):
|
|
530
|
+
if pip:
|
|
531
|
+
subprocess.run(
|
|
532
|
+
[
|
|
533
|
+
"az",
|
|
534
|
+
"network",
|
|
535
|
+
"public-ip",
|
|
536
|
+
"delete",
|
|
537
|
+
"-g",
|
|
538
|
+
RESOURCE_GROUP,
|
|
539
|
+
"-n",
|
|
540
|
+
pip,
|
|
541
|
+
],
|
|
542
|
+
capture_output=True,
|
|
543
|
+
)
|
|
544
|
+
log("DELETE", f" Deleted IP: {pip}")
|
|
545
|
+
|
|
546
|
+
# Delete disks
|
|
547
|
+
log("DELETE", "Deleting disks...")
|
|
548
|
+
result = subprocess.run(
|
|
549
|
+
[
|
|
550
|
+
"az",
|
|
551
|
+
"disk",
|
|
552
|
+
"list",
|
|
553
|
+
"-g",
|
|
554
|
+
RESOURCE_GROUP,
|
|
555
|
+
"--query",
|
|
556
|
+
"[?contains(name, 'waa')].name",
|
|
557
|
+
"-o",
|
|
558
|
+
"tsv",
|
|
559
|
+
],
|
|
560
|
+
capture_output=True,
|
|
561
|
+
text=True,
|
|
562
|
+
)
|
|
563
|
+
for disk in result.stdout.strip().split("\n"):
|
|
564
|
+
if disk:
|
|
565
|
+
subprocess.run(
|
|
566
|
+
["az", "disk", "delete", "-g", RESOURCE_GROUP, "-n", disk, "--yes"],
|
|
567
|
+
capture_output=True,
|
|
568
|
+
)
|
|
569
|
+
log("DELETE", f" Deleted disk: {disk}")
|
|
570
|
+
|
|
571
|
+
# Delete NSGs
|
|
572
|
+
log("DELETE", "Deleting NSGs...")
|
|
573
|
+
result = subprocess.run(
|
|
574
|
+
[
|
|
575
|
+
"az",
|
|
576
|
+
"network",
|
|
577
|
+
"nsg",
|
|
578
|
+
"list",
|
|
579
|
+
"-g",
|
|
580
|
+
RESOURCE_GROUP,
|
|
581
|
+
"--query",
|
|
582
|
+
"[?contains(name, 'waa')].name",
|
|
583
|
+
"-o",
|
|
584
|
+
"tsv",
|
|
585
|
+
],
|
|
586
|
+
capture_output=True,
|
|
587
|
+
text=True,
|
|
588
|
+
)
|
|
589
|
+
for nsg in result.stdout.strip().split("\n"):
|
|
590
|
+
if nsg:
|
|
591
|
+
subprocess.run(
|
|
592
|
+
["az", "network", "nsg", "delete", "-g", RESOURCE_GROUP, "-n", nsg],
|
|
593
|
+
capture_output=True,
|
|
594
|
+
)
|
|
595
|
+
log("DELETE", f" Deleted NSG: {nsg}")
|
|
99
596
|
|
|
100
|
-
|
|
101
|
-
|
|
597
|
+
log("DELETE", "Cleanup complete")
|
|
598
|
+
return 0
|
|
599
|
+
|
|
600
|
+
|
|
601
|
+
def cmd_status(args):
|
|
602
|
+
"""Show VM status."""
|
|
603
|
+
ip = get_vm_ip()
|
|
604
|
+
state = get_vm_state()
|
|
605
|
+
|
|
606
|
+
if not ip:
|
|
607
|
+
print(f"VM '{VM_NAME}' not found")
|
|
608
|
+
return 1
|
|
102
609
|
|
|
103
|
-
|
|
104
|
-
|
|
610
|
+
print(f"VM: {VM_NAME}")
|
|
611
|
+
print(f" State: {state or 'unknown'}")
|
|
612
|
+
print(f" IP: {ip}")
|
|
613
|
+
print(f" Size: {VM_SIZE}")
|
|
614
|
+
print(f" SSH: ssh azureuser@{ip}")
|
|
615
|
+
return 0
|
|
616
|
+
|
|
617
|
+
|
|
618
|
+
def cmd_build(args):
|
|
619
|
+
"""Build WAA image from waa_deploy/Dockerfile.
|
|
620
|
+
|
|
621
|
+
This builds our custom image that:
|
|
622
|
+
- Uses dockurr/windows:latest (has working ISO auto-download)
|
|
623
|
+
- Copies WAA components from windowsarena/winarena:latest
|
|
624
|
+
- Patches IP addresses and adds automation
|
|
105
625
|
"""
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
return
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
626
|
+
init_logging()
|
|
627
|
+
|
|
628
|
+
ip = get_vm_ip()
|
|
629
|
+
if not ip:
|
|
630
|
+
log("BUILD", "ERROR: VM not found. Run 'create' first.")
|
|
631
|
+
return 1
|
|
632
|
+
|
|
633
|
+
log("BUILD", "Building WAA image from waa_deploy/Dockerfile...")
|
|
634
|
+
|
|
635
|
+
# Check Dockerfile exists
|
|
636
|
+
if not DOCKERFILE_PATH.exists():
|
|
637
|
+
log("BUILD", f"ERROR: Dockerfile not found: {DOCKERFILE_PATH}")
|
|
638
|
+
return 1
|
|
639
|
+
|
|
640
|
+
# Copy Dockerfile and supporting files to VM
|
|
641
|
+
log("BUILD", "Copying build files to VM...")
|
|
642
|
+
ssh_run(ip, "mkdir -p ~/build")
|
|
643
|
+
|
|
644
|
+
waa_deploy_dir = DOCKERFILE_PATH.parent
|
|
645
|
+
files_to_copy = ["Dockerfile", "start_waa_server.bat", "api_agent.py"]
|
|
646
|
+
for filename in files_to_copy:
|
|
647
|
+
src = waa_deploy_dir / filename
|
|
648
|
+
if src.exists():
|
|
649
|
+
result = subprocess.run(
|
|
650
|
+
["scp", *SSH_OPTS, str(src), f"azureuser@{ip}:~/build/"],
|
|
651
|
+
capture_output=True,
|
|
652
|
+
text=True,
|
|
653
|
+
)
|
|
654
|
+
if result.returncode != 0:
|
|
655
|
+
log("BUILD", f"ERROR: Failed to copy {filename}: {result.stderr}")
|
|
656
|
+
return 1
|
|
657
|
+
|
|
658
|
+
# Pre-build cleanup
|
|
659
|
+
log("BUILD", "Cleaning up dangling images before build...")
|
|
660
|
+
ssh_run(ip, "docker image prune -f 2>/dev/null")
|
|
661
|
+
|
|
662
|
+
# Build image (streams output)
|
|
663
|
+
log("BUILD", "Running docker build (this takes ~10-15 minutes)...")
|
|
664
|
+
build_cmd = f"cd ~/build && docker build --pull -t {DOCKER_IMAGE} . 2>&1"
|
|
665
|
+
result = ssh_run(ip, build_cmd, stream=True, step="BUILD")
|
|
666
|
+
|
|
667
|
+
if result.returncode != 0:
|
|
668
|
+
log("BUILD", "ERROR: Docker build failed")
|
|
669
|
+
return 1
|
|
670
|
+
|
|
671
|
+
# Post-build cleanup
|
|
672
|
+
log("BUILD", "Cleaning up dangling images after build...")
|
|
673
|
+
ssh_run(ip, "docker image prune -f 2>/dev/null")
|
|
146
674
|
|
|
675
|
+
log("BUILD", f"Image built: {DOCKER_IMAGE}")
|
|
676
|
+
return 0
|
|
147
677
|
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
678
|
+
|
|
679
|
+
def cmd_start(args):
    """Start the WAA container on the VM, replacing any existing one.

    Reads ``args.fresh`` (wipe /mnt/waa-storage before starting) and the
    optional ``args.fast`` and ``args.no_vnc`` flags.

    Returns:
        0 on success, 1 if the VM is not found or ``docker run`` fails.
    """
    init_logging()

    vm_ip = get_vm_ip()
    if not vm_ip:
        log("START", "ERROR: VM not found. Run 'create' first.")
        return 1

    log("START", "Starting WAA container...")

    # Get rid of any previous container named "winarena" first.
    log("START", "Stopping any existing container...")
    ssh_run(vm_ip, "docker stop winarena 2>/dev/null; docker rm -f winarena 2>/dev/null")

    # --fresh: wipe the storage directory for a fresh Windows install.
    if args.fresh:
        log("START", "Cleaning storage for fresh Windows install...")
        ssh_run(vm_ip, "sudo rm -rf /mnt/waa-storage/*")

    # Make sure the storage directory exists and belongs to azureuser.
    ssh_run(
        vm_ip,
        "sudo mkdir -p /mnt/waa-storage && sudo chown azureuser:azureuser /mnt/waa-storage",
    )

    # QEMU resource allocation (--fast uses more resources on D8ds_v5).
    fast_mode = getattr(args, "fast", False)
    ram_size, cpu_cores = ("16G", 6) if fast_mode else ("8G", 4)
    if fast_mode:
        log(
            "START",
            "Starting container with VERSION=11e (FAST mode: 6 cores, 16GB RAM)...",
        )
    else:
        log("START", "Starting container with VERSION=11e...")

    # Per the original notes, the custom image's ENTRYPOINT handles the rest:
    # downloads Windows 11 Enterprise if not present, boots the QEMU VM, and
    # runs the WAA server automatically via FirstLogonCommands.
    docker_cmd = " \\\n".join(
        [
            "docker run -d",
            "--name winarena",
            "--device=/dev/kvm",
            "--cap-add NET_ADMIN",
            "-p 8006:8006",
            "-p 5000:5000",
            "-p 7200:7200",
            "-v /mnt/waa-storage:/storage",
            "-e VERSION=11e",
            f"-e RAM_SIZE={ram_size}",
            f"-e CPU_CORES={cpu_cores}",
            "-e DISK_SIZE=64G",
            f"{DOCKER_IMAGE}",
        ]
    )

    started = ssh_run(vm_ip, docker_cmd)
    if started.returncode != 0:
        log("START", f"ERROR: Failed to start container: {started.stderr}")
        return 1

    log("START", "Container started")
    log("START", "Windows will boot and install (15-20 min on first run)")

    # With --no-vnc only print the manual tunnel command.
    if getattr(args, "no_vnc", False):
        log("START", f"VNC (via SSH tunnel): ssh -L 8006:localhost:8006 azureuser@{vm_ip}")
        return 0

    log("START", "Auto-launching VNC viewer...")
    tunnel_proc = setup_vnc_tunnel_and_browser(vm_ip)
    if not tunnel_proc:
        log("START", "WARNING: VNC tunnel failed to start")
        log("START", f"Manual VNC: ssh -L 8006:localhost:8006 azureuser@{vm_ip}")
        return 0

    log(
        "START",
        f"VNC auto-launched at http://localhost:8006 (tunnel PID: {tunnel_proc.pid})",
    )
    return 0
|
|
161
761
|
|
|
162
|
-
# Parse task IDs
|
|
163
|
-
task_ids = None
|
|
164
|
-
if args.tasks:
|
|
165
|
-
task_ids = [t.strip() for t in args.tasks.split(",")]
|
|
166
762
|
|
|
167
|
-
|
|
168
|
-
|
|
763
|
+
def cmd_stop(args):
    """Stop and remove the WAA container, optionally wiping its storage.

    When ``args.clean`` is present and truthy, /mnt/waa-storage is emptied
    after the container is removed.

    Returns:
        0 on completion, 1 if the VM is not found.
    """
    vm_ip = get_vm_ip()
    if not vm_ip:
        print("ERROR: VM not found")
        return 1

    print(f"Stopping container on VM ({vm_ip})...")

    # Each step: (remote command, success marker echoed by the command,
    # message when the marker is present, message otherwise).
    steps = (
        (
            "docker stop winarena 2>/dev/null && echo STOPPED || echo NOT_RUNNING",
            "STOPPED",
            " Container stopped",
            " Container was not running",
        ),
        (
            "docker rm -f winarena 2>/dev/null && echo REMOVED || echo NOT_FOUND",
            "REMOVED",
            " Container removed",
            " Container already removed",
        ),
    )
    for remote_cmd, marker, hit_msg, miss_msg in steps:
        outcome = ssh_run(vm_ip, remote_cmd)
        print(hit_msg if marker in outcome.stdout else miss_msg)

    # Optional storage wipe.
    if getattr(args, "clean", False):
        print(" Cleaning Windows storage...")
        ssh_run(vm_ip, "sudo rm -rf /mnt/waa-storage/*")
        print(" Storage cleaned")

    print("Done")
    return 0
|
|
798
|
+
|
|
799
|
+
|
|
800
|
+
def cmd_probe(args):
    """Check whether the WAA server answers its /probe endpoint.

    With ``args.wait`` falsy a single probe is made. Otherwise the probe is
    repeated every 30 seconds, printing a progress line each round, until
    the server answers or more than ``args.timeout`` seconds have elapsed.

    Returns:
        0 when the server responded, 1 otherwise (VM missing, not ready,
        or timed out).
    """
    vm_ip = get_vm_ip()
    if not vm_ip:
        print("ERROR: VM not found")
        return 1

    limit_s = args.timeout
    began = time.time()
    prev_bytes = None
    mib = 1024 * 1024

    while True:
        # curl must run INSIDE the container to reach the Docker network.
        probe = ssh_run(
            vm_ip,
            "docker exec winarena curl -s --max-time 5 http://172.30.0.2:5000/probe 2>/dev/null || echo FAIL",
        )
        reply = probe.stdout.strip()
        if reply and "FAIL" not in probe.stdout:
            print("\nWAA server is READY")
            print(f" Response: {reply[:100]}")
            return 0

        if not args.wait:
            print("WAA server is NOT ready")
            return 1

        elapsed = time.time() - began
        if elapsed > limit_s:
            print(f"\nTIMEOUT: WAA server not ready after {limit_s}s")
            return 1

        # Split elapsed time into whole minutes / seconds for the prefix.
        elapsed_min, elapsed_sec = (int(part) for part in divmod(elapsed, 60))

        # Storage usage in bytes, with growth since the previous round.
        raw_size = ssh_run(
            vm_ip, "docker exec winarena du -sb /storage/ 2>/dev/null | cut -f1"
        ).stdout.strip()
        if not raw_size.isdigit():
            storage_str = "unknown"
        else:
            size_now = int(raw_size)
            storage_str = f"{size_now / mib:,.1f} MB"
            if prev_bytes is not None and size_now > prev_bytes:
                storage_str += f" (+{(size_now - prev_bytes) / mib:,.1f} MB)"
            prev_bytes = size_now

        # Elapsed time of the first qemu-system process, if any.
        qemu_uptime = (
            ssh_run(
                vm_ip,
                'docker exec winarena sh -c \'QPID=$(pgrep -f qemu-system 2>/dev/null | head -1); [ -n "$QPID" ] && ps -o etime= -p $QPID 2>/dev/null | tr -d " " || echo N/A\'',
            ).stdout.strip()
            or "N/A"
        )

        # Container status as reported by `docker ps`.
        container_status = (
            ssh_run(
                vm_ip, "docker ps --filter name=winarena --format '{{.Status}}' 2>/dev/null"
            ).stdout.strip()
            or "unknown"
        )

        print(
            f"[{elapsed_min:02d}:{elapsed_sec:02d}] Waiting... | Storage: {storage_str} | QEMU: {qemu_uptime} | Container: {container_status}"
        )
        time.sleep(30)
|
|
225
871
|
|
|
226
872
|
|
|
227
|
-
def
|
|
228
|
-
"""Run
|
|
229
|
-
|
|
230
|
-
|
|
873
|
+
def cmd_run(args):
    """Run benchmark tasks using vanilla WAA's navi agent.

    Note: For API-based agents (Claude, GPT-4 direct), use openadapt-evals
    which communicates with WAA's Flask API externally.

    Reads ``args.api_key``, ``args.domain``, ``args.task``, ``args.model``,
    ``args.num_tasks``, ``args.no_download`` and the optional
    ``args.worker_id`` / ``args.num_workers``.

    Returns:
        1 if the VM is missing, the WAA server is not ready, or no OpenAI
        API key is available; otherwise the exit code of the remote
        benchmark command.
    """
    # Local imports: only this command needs them.
    import json
    import shlex

    init_logging()

    ip = get_vm_ip()
    if not ip:
        log("RUN", "ERROR: VM not found")
        return 1

    # Check WAA is ready
    log("RUN", "Checking WAA server...")
    result = ssh_run(
        ip,
        "docker exec winarena curl -s --max-time 5 http://172.30.0.2:5000/probe 2>/dev/null || echo FAIL",
    )
    if "FAIL" in result.stdout or not result.stdout.strip():
        log("RUN", "ERROR: WAA server not ready. Run 'probe --wait' first.")
        return 1

    log("RUN", "WAA server is ready")

    # Get API key (navi uses GPT-4o for reasoning)
    api_key = args.api_key
    if not api_key:
        try:
            from openadapt_ml.config import settings

            api_key = settings.openai_api_key or ""
        except ImportError:
            api_key = ""

    if not api_key:
        log("RUN", "ERROR: OpenAI API key required (navi uses GPT-4o)")
        log("RUN", " Set OPENAI_API_KEY in .env file or pass --api-key")
        return 1

    # Build task selection
    domain = args.domain
    task = args.task
    model = args.model

    if task:
        task_info = [f"task={task}"]
    elif domain != "all":
        task_info = [f"domain={domain}"]
    else:
        task_info = [f"{args.num_tasks} task(s)"]

    log("RUN", f"Starting benchmark: {', '.join(task_info)}, model={model}")

    # Build run.py arguments. Values are shell-quoted; shlex.quote leaves
    # plain identifiers untouched, so ordinary invocations are unchanged.
    run_args = [
        "--agent_name navi",
        f"--model {shlex.quote(str(model))}",
        f"--domain {shlex.quote(str(domain))}",
    ]

    # Add parallelization flags if specified (argparse converts hyphens to underscores)
    worker_id = getattr(args, "worker_id", 0)
    num_workers = getattr(args, "num_workers", 1)
    if num_workers > 1:
        run_args.append(f"--worker_id {worker_id}")
        run_args.append(f"--num_workers {num_workers}")
        log("RUN", f"Parallel mode: worker {worker_id}/{num_workers}")

    if task:
        # Specific task requested: write a one-element test config.
        # json.dumps yields the same '["<task>"]' text for ordinary ids and
        # correctly escapes quotes/backslashes otherwise.
        pre_cmd = (
            "\ncat > /client/evaluation_examples_windows/test_custom.json << 'CUSTOMEOF'\n"
            f"{json.dumps([task])}\n"
            "CUSTOMEOF\n"
        )
        run_args.append(
            "--test_all_meta_path evaluation_examples_windows/test_custom.json"
        )
    elif args.num_tasks and args.num_tasks < 154:
        # Limit tasks by creating a custom test config with the first N tasks.
        # A temp Python script is written and run inside the container.
        # test_all.json is a dict {domain: [task_ids...]} - the script keeps
        # that domain structure.
        num = int(args.num_tasks)
        limit_script = f"""import json
d = json.load(open("/client/evaluation_examples_windows/test_all.json"))
# Collect (domain, task_id) pairs to preserve domain info
all_tasks = []
for domain, tasks in d.items():
    for task in tasks:
        all_tasks.append((domain, task))
# Limit total tasks
limited = all_tasks[:{num}]
# Rebuild dict preserving original domain structure
result = {{}}
for domain, task in limited:
    if domain not in result:
        result[domain] = []
    result[domain].append(task)
json.dump(result, open("/client/evaluation_examples_windows/test_limited.json", "w"))
print("Limited to", len(limited), "tasks from", len(result), "domains")
"""
        pre_cmd = (
            "cat > /tmp/limit_tasks.py << 'LIMITEOF'\n"
            f"{limit_script}"
            "LIMITEOF\n"
            "python /tmp/limit_tasks.py && "
        )
        run_args.append(
            "--test_all_meta_path evaluation_examples_windows/test_limited.json"
        )
    else:
        pre_cmd = ""

    # Run the benchmark inside the container. The script handed to
    # `bash -c` is quoted as a single shell word, and the API key is quoted
    # so characters such as $, ` or " in it are passed through literally.
    inner_cmd = f"{pre_cmd}cd /client && python run.py {' '.join(run_args)}"
    run_cmd = (
        f"export OPENAI_API_KEY={shlex.quote(api_key)} && "
        "docker exec -e OPENAI_API_KEY winarena "
        f"bash -c {shlex.quote(inner_cmd)}"
    )

    log("RUN", "Executing benchmark...")
    log("RUN", f" Model: {model}")
    log("RUN", f" Tasks: {task_info[0]}")
    log("RUN", "-" * 60)

    # Run with streaming output
    result = ssh_run(ip, run_cmd, stream=True, step="RUN")

    if result.returncode != 0:
        log("RUN", f"Benchmark failed with exit code {result.returncode}")
    else:
        log("RUN", "Benchmark completed!")

    # Download results unless --no-download (done even after a failed run).
    if not args.no_download:
        log("RUN", "Downloading results...")
        download_benchmark_results(ip)

    return result.returncode
|
|
265
1012
|
|
|
266
|
-
print(f"\n=== Azure WAA Evaluation ===")
|
|
267
|
-
print(f" Workers: {args.workers}")
|
|
268
|
-
print(f" Tasks: {num_tasks}")
|
|
269
|
-
print(f" Estimated cost: ${estimate['estimated_cost_usd']:.2f}")
|
|
270
|
-
print(f" Estimated time: {estimate['estimated_duration_minutes']:.1f} minutes")
|
|
271
|
-
print()
|
|
272
1013
|
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
if response.lower() != "y":
|
|
276
|
-
print("Aborted.")
|
|
277
|
-
sys.exit(0)
|
|
1014
|
+
def download_benchmark_results(ip: str) -> "str | None":
    """Download benchmark results from the container.

    Results are saved to benchmark_results/waa_results_TIMESTAMP/.

    The preferred path archives /client/results inside the container,
    copies the tarball to the VM host, downloads it with scp and extracts
    it locally. If the archive cannot be created, the results directory is
    copied out with ``docker cp`` and fetched with ``scp -r`` instead.

    Args:
        ip: Address of the VM (connected to as ``azureuser``).

    Returns:
        The path to the results directory, or None if the download failed.
    """
    import json
    import tarfile
    from pathlib import Path

    remote = f"azureuser@{ip}"

    def _stderr_snippet(proc):
        """Return up to 200 chars of a process' stderr, or 'unknown'."""
        return proc.stderr[:200] if proc.stderr else "unknown"

    # Create local results directory with timestamp
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    results_dir = Path("benchmark_results") / f"waa_results_{timestamp}"
    results_dir.mkdir(parents=True, exist_ok=True)

    log("RUN", f"Saving results to {results_dir}/")

    # Create tarball of results inside container
    log("RUN", "Creating results archive...")
    tar_cmd = "docker exec winarena tar -czvf /tmp/results.tar.gz -C /client/results . 2>/dev/null"
    result = subprocess.run(
        ["ssh", *SSH_OPTS, remote, tar_cmd], capture_output=True, text=True
    )

    if result.returncode != 0:
        log("RUN", f"Warning: Failed to create archive: {_stderr_snippet(result)}")
        log("RUN", "Trying direct copy...")

        # Try copying results directory directly
        copy_cmd = "docker cp winarena:/client/results/. /tmp/waa-results/"
        subprocess.run(
            [
                "ssh",
                *SSH_OPTS,
                remote,
                f"rm -rf /tmp/waa-results && mkdir -p /tmp/waa-results && {copy_cmd}",
            ],
            capture_output=True,
        )

        # Download via scp
        scp_result = subprocess.run(
            [
                "scp",
                "-r",
                *SSH_OPTS,
                f"{remote}:/tmp/waa-results/*",
                str(results_dir),
            ],
            capture_output=True,
            text=True,
        )
        if scp_result.returncode == 0:
            log("RUN", f"Results saved to: {results_dir}")
            return str(results_dir)
        log("RUN", f"Warning: Failed to download results: {_stderr_snippet(scp_result)}")
        return None

    # Copy tarball from container to VM host
    copy_tar_cmd = "docker cp winarena:/tmp/results.tar.gz /tmp/results.tar.gz"
    subprocess.run(["ssh", *SSH_OPTS, remote, copy_tar_cmd], capture_output=True)

    # Download tarball
    local_tar = results_dir / "results.tar.gz"
    scp_result = subprocess.run(
        ["scp", *SSH_OPTS, f"{remote}:/tmp/results.tar.gz", str(local_tar)],
        capture_output=True,
        text=True,
    )

    if scp_result.returncode != 0:
        log("RUN", f"Warning: Failed to download tarball: {_stderr_snippet(scp_result)}")
        return None

    # Extract tarball
    log("RUN", "Extracting results...")
    try:
        with tarfile.open(local_tar, "r:gz") as tar:
            # The archive was produced on a remote machine; use the "data"
            # extraction filter where this Python provides it so members
            # cannot escape results_dir.
            if hasattr(tarfile, "data_filter"):
                tar.extractall(path=results_dir, filter="data")
            else:
                tar.extractall(path=results_dir)
        local_tar.unlink()  # Remove tarball after extraction
    except (tarfile.TarError, OSError) as e:
        # Best effort: keep the tarball so it can be inspected by hand.
        log("RUN", f"Warning: Failed to extract: {e}")
        log("RUN", f"Tarball saved at: {local_tar}")

    # Clean up remote tarball
    subprocess.run(
        ["ssh", *SSH_OPTS, remote, "rm -f /tmp/results.tar.gz"],
        capture_output=True,
    )

    # List what we downloaded (regular files only; directories are not counted)
    result_files = [p for p in results_dir.glob("**/*") if p.is_file()]
    log("RUN", f"Downloaded {len(result_files)} files to {results_dir}/")

    # Show summary if available
    summary_file = results_dir / "summary.json"
    if summary_file.exists():
        try:
            with open(summary_file) as f:
                summary = json.load(f)
            log("RUN", f"Summary: {json.dumps(summary, indent=2)[:500]}")
        except (OSError, ValueError) as e:
            # An unreadable or malformed summary must not fail the download.
            log("RUN", f"Warning: Could not read summary.json: {e}")

    return str(results_dir)
|
|
370
1133
|
|
|
371
|
-
def cmd_test_collection(args: argparse.Namespace) -> None:
|
|
372
|
-
"""Test benchmark data collection with mock adapter.
|
|
373
1134
|
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
"""
|
|
378
|
-
import json
|
|
379
|
-
from pathlib import Path
|
|
1135
|
+
def cmd_download(args):
    """Fetch benchmark results from the VM via download_benchmark_results.

    Returns:
        0 when a results path was returned, 1 if the VM is not found or the
        download reported failure.
    """
    init_logging()

    vm_ip = get_vm_ip()
    if not vm_ip:
        log("DOWNLOAD", "ERROR: VM not found")
        return 1

    log("DOWNLOAD", "Downloading benchmark results...")
    saved_to = download_benchmark_results(vm_ip)

    # A falsy path means the download did not succeed.
    if not saved_to:
        log("DOWNLOAD", "Failed to download results")
        return 1

    log("DOWNLOAD", f"Results saved to: {saved_to}")
    return 0
|
|
454
1153
|
|
|
455
1154
|
|
|
456
|
-
def
|
|
457
|
-
"""
|
|
1155
|
+
def cmd_analyze(args):
    """Analyze benchmark results from downloaded logs.

    Parses ``logs/normal-*.log`` under the results directory and prints the
    number of finished tasks, the success rate overall and per domain, and
    the first few task errors.

    Args:
        args: Namespace with ``results_dir``. When it is falsy, the newest
            ``benchmark_results/waa_results_*`` directory (by name) is used.

    Returns:
        1 if no results directory or no log files are found, otherwise 0.
    """
    import re
    from collections import defaultdict

    # Resolve the directory to analyze: explicit path, else most recent run.
    if args.results_dir:
        target_dir = Path(args.results_dir)
    else:
        dirs = sorted(Path("benchmark_results").glob("waa_results_*"), reverse=True)
        if not dirs:
            print("No results found in benchmark_results/")
            print("Run 'cli download' first to get results from VM")
            return 1
        target_dir = dirs[0]

    print(f"Analyzing: {target_dir}")
    print("=" * 60)

    # Find log files
    log_files = list(target_dir.glob("logs/normal-*.log"))
    if not log_files:
        print("No log files found")
        return 1

    # Patterns are compiled once instead of once per log line.
    ansi_re = re.compile(r"\x1b\[[0-9;]*m")
    domain_re = re.compile(r"\[Domain\]: (.+)")
    example_re = re.compile(r"\[Example ID\]: (.+)")
    reward_re = re.compile(r"Reward: ([0-9.]+)")
    error_re = re.compile(r"Exception in .+: (.+)")

    # Parse results
    tasks = []
    current_task = None
    pending_domain = None

    for log_file in sorted(log_files):
        # Logs may contain bytes that are not valid UTF-8; replace them
        # rather than aborting the whole analysis.
        with open(log_file, encoding="utf-8", errors="replace") as f:
            for line in f:
                # Strip ANSI codes
                clean = ansi_re.sub("", line)

                # Domain comes before Example ID
                match = domain_re.search(clean)
                if match:
                    pending_domain = match.group(1).strip()

                # Task start (Example ID comes after Domain)
                match = example_re.search(clean)
                if match:
                    current_task = {
                        "id": match.group(1).strip(),
                        "domain": pending_domain,
                        "reward": None,
                        "error": None,
                    }
                    pending_domain = None

                # Task result
                if current_task:
                    match = reward_re.search(clean)
                    if match:
                        try:
                            reward = float(match.group(1))
                        except ValueError:
                            # e.g. "Reward: ..." - not a number, ignore line
                            reward = None
                        if reward is not None:
                            current_task["reward"] = reward
                            tasks.append(current_task)
                            current_task = None

                # Task error (counts as a finished task with reward 0)
                if current_task:
                    match = error_re.search(clean)
                    if match:
                        current_task["error"] = match.group(1).strip()
                        current_task["reward"] = 0.0
                        tasks.append(current_task)
                        current_task = None

    # Summary
    print(f"\nTotal tasks attempted: {len(tasks)}")

    if not tasks:
        print("No completed tasks found")
        return 0

    def _count_successes(items):
        """Count tasks whose reward is strictly positive."""
        return sum(1 for t in items if t["reward"] and t["reward"] > 0)

    # Success rate
    successes = _count_successes(tasks)
    print(f"Successful: {successes} ({100 * successes / len(tasks):.1f}%)")

    # By domain
    by_domain = defaultdict(list)
    for t in tasks:
        by_domain[t["domain"] or "unknown"].append(t)

    print("\nBy domain:")
    for domain in sorted(by_domain):
        domain_tasks = by_domain[domain]
        domain_success = _count_successes(domain_tasks)
        print(
            f" {domain}: {domain_success}/{len(domain_tasks)} ({100 * domain_success / len(domain_tasks):.1f}%)"
        )

    # Errors
    errors = [t for t in tasks if t.get("error")]
    if errors:
        print(f"\nErrors ({len(errors)}):")
        for t in errors[:5]:  # Show first 5
            print(f" {t['id']}: {t['error'][:50]}")
        if len(errors) > 5:
            print(f" ... and {len(errors) - 5} more")

    return 0
|
|
1264
|
+
|
|
1265
|
+
|
|
1266
|
+
def cmd_tasks(args):
|
|
1267
|
+
"""List available WAA benchmark tasks."""
|
|
1268
|
+
ip = get_vm_ip()
|
|
1269
|
+
if not ip:
|
|
1270
|
+
print("ERROR: VM not found")
|
|
1271
|
+
return 1
|
|
1272
|
+
|
|
1273
|
+
print("Fetching available tasks from WAA container...")
|
|
1274
|
+
print("-" * 60)
|
|
1275
|
+
|
|
1276
|
+
# Get list of domains (subdirectories in examples/)
|
|
1277
|
+
result = subprocess.run(
|
|
1278
|
+
[
|
|
1279
|
+
"ssh",
|
|
1280
|
+
*SSH_OPTS,
|
|
1281
|
+
f"azureuser@{ip}",
|
|
1282
|
+
"docker exec winarena ls /client/evaluation_examples_windows/examples/",
|
|
1283
|
+
],
|
|
1284
|
+
capture_output=True,
|
|
1285
|
+
text=True,
|
|
1286
|
+
)
|
|
1287
|
+
|
|
1288
|
+
if result.returncode != 0:
|
|
1289
|
+
print("ERROR: Could not fetch domain list")
|
|
1290
|
+
return 1
|
|
1291
|
+
|
|
1292
|
+
domains = result.stdout.strip().split("\n")
|
|
1293
|
+
|
|
1294
|
+
# Count tasks per domain
|
|
1295
|
+
domain_tasks = {}
|
|
1296
|
+
total_tasks = 0
|
|
1297
|
+
|
|
1298
|
+
for domain in domains:
|
|
1299
|
+
if not domain:
|
|
1300
|
+
continue
|
|
1301
|
+
count_result = subprocess.run(
|
|
1302
|
+
[
|
|
1303
|
+
"ssh",
|
|
1304
|
+
*SSH_OPTS,
|
|
1305
|
+
f"azureuser@{ip}",
|
|
1306
|
+
f"docker exec winarena ls /client/evaluation_examples_windows/examples/{domain}/ 2>/dev/null | wc -l",
|
|
1307
|
+
],
|
|
1308
|
+
capture_output=True,
|
|
1309
|
+
text=True,
|
|
537
1310
|
)
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
metrics = compute_metrics(results)
|
|
546
|
-
print("\n=== Results ===")
|
|
547
|
-
print(f"Tasks: {metrics['num_tasks']}")
|
|
548
|
-
print(f"Success rate: {metrics['success_rate']:.1%}")
|
|
549
|
-
print(f"Successes: {metrics['success_count']}")
|
|
550
|
-
print(f"Failures: {metrics['fail_count']}")
|
|
551
|
-
print(f"Avg score: {metrics['avg_score']:.3f}")
|
|
552
|
-
print(f"Avg steps: {metrics['avg_steps']:.1f}")
|
|
553
|
-
print()
|
|
1311
|
+
count = (
|
|
1312
|
+
int(count_result.stdout.strip())
|
|
1313
|
+
if count_result.stdout.strip().isdigit()
|
|
1314
|
+
else 0
|
|
1315
|
+
)
|
|
1316
|
+
domain_tasks[domain] = count
|
|
1317
|
+
total_tasks += count
|
|
554
1318
|
|
|
555
|
-
#
|
|
556
|
-
tasks
|
|
557
|
-
|
|
558
|
-
if domain_metrics:
|
|
559
|
-
print("=== By Domain ===")
|
|
560
|
-
for domain, dm in domain_metrics.items():
|
|
561
|
-
print(f" {domain}: {dm['success_rate']:.1%} ({dm['success_count']}/{dm['num_tasks']})")
|
|
1319
|
+
# Print summary
|
|
1320
|
+
print(f"Total tasks: {total_tasks}")
|
|
1321
|
+
print(f"Domains: {len(domains)}")
|
|
562
1322
|
print()
|
|
563
1323
|
|
|
564
|
-
#
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
|
|
569
|
-
|
|
570
|
-
|
|
1324
|
+
# Print by domain
|
|
1325
|
+
for domain in sorted(domain_tasks.keys()):
|
|
1326
|
+
count = domain_tasks[domain]
|
|
1327
|
+
print(f" {domain}: {count} tasks")
|
|
1328
|
+
|
|
1329
|
+
if args.verbose and count > 0:
|
|
1330
|
+
# List actual task IDs
|
|
1331
|
+
tasks_result = subprocess.run(
|
|
1332
|
+
[
|
|
1333
|
+
"ssh",
|
|
1334
|
+
*SSH_OPTS,
|
|
1335
|
+
f"azureuser@{ip}",
|
|
1336
|
+
f"docker exec winarena ls /client/evaluation_examples_windows/examples/{domain}/",
|
|
1337
|
+
],
|
|
1338
|
+
capture_output=True,
|
|
1339
|
+
text=True,
|
|
1340
|
+
)
|
|
1341
|
+
for task_file in tasks_result.stdout.strip().split("\n")[:5]: # Limit to 5
|
|
1342
|
+
task_id = task_file.replace(".json", "")
|
|
1343
|
+
print(f" - {task_id}")
|
|
1344
|
+
if count > 5:
|
|
1345
|
+
print(f" ... and {count - 5} more")
|
|
1346
|
+
|
|
571
1347
|
print()
|
|
1348
|
+
print("Usage examples:")
|
|
1349
|
+
print(" Run all notepad tasks: cli_v2 run --domain notepad")
|
|
1350
|
+
print(" Run all chrome tasks: cli_v2 run --domain chrome")
|
|
1351
|
+
print(
|
|
1352
|
+
" Run specific task: cli_v2 run --task 366de66e-cbae-4d72-b042-26390db2b145-WOS"
|
|
1353
|
+
)
|
|
572
1354
|
|
|
1355
|
+
return 0
|
|
573
1356
|
|
|
574
|
-
def cmd_create_config(args: argparse.Namespace) -> None:
|
|
575
|
-
"""Create a sample Azure config file."""
|
|
576
|
-
from openadapt_ml.benchmarks.azure import AzureConfig
|
|
577
1357
|
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
vm_size="Standard_D4_v3",
|
|
583
|
-
)
|
|
1358
|
+
def cmd_deallocate(args):
|
|
1359
|
+
"""Stop VM (preserves disk, stops billing)."""
|
|
1360
|
+
init_logging()
|
|
1361
|
+
log("DEALLOCATE", f"Deallocating VM '{VM_NAME}'...")
|
|
584
1362
|
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
1363
|
+
result = subprocess.run(
|
|
1364
|
+
["az", "vm", "deallocate", "-g", RESOURCE_GROUP, "-n", VM_NAME],
|
|
1365
|
+
capture_output=True,
|
|
1366
|
+
text=True,
|
|
1367
|
+
)
|
|
589
1368
|
|
|
1369
|
+
if result.returncode == 0:
|
|
1370
|
+
log("DEALLOCATE", "VM deallocated (billing stopped)")
|
|
1371
|
+
log("DEALLOCATE", "Use 'vm-start' to resume")
|
|
1372
|
+
return 0
|
|
1373
|
+
else:
|
|
1374
|
+
log("DEALLOCATE", f"ERROR: {result.stderr}")
|
|
1375
|
+
return 1
|
|
590
1376
|
|
|
591
|
-
def cmd_status(args: argparse.Namespace) -> None:
|
|
592
|
-
"""Check Azure workspace and compute status."""
|
|
593
|
-
setup_logging(args.verbose)
|
|
594
1377
|
|
|
595
|
-
|
|
596
|
-
|
|
1378
|
+
def cmd_vm_start(args):
|
|
1379
|
+
"""Start a deallocated VM."""
|
|
1380
|
+
init_logging()
|
|
1381
|
+
log("VM-START", f"Starting VM '{VM_NAME}'...")
|
|
597
1382
|
|
|
598
|
-
|
|
1383
|
+
result = subprocess.run(
|
|
1384
|
+
["az", "vm", "start", "-g", RESOURCE_GROUP, "-n", VM_NAME],
|
|
1385
|
+
capture_output=True,
|
|
1386
|
+
text=True,
|
|
1387
|
+
)
|
|
599
1388
|
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
print(f"Workspace: {config.workspace_name}")
|
|
606
|
-
print(f"VM Size: {config.vm_size}")
|
|
607
|
-
except ValueError as e:
|
|
608
|
-
print(f"Config Error: {e}")
|
|
609
|
-
print("\nRun 'python scripts/setup_azure.py' to configure.")
|
|
610
|
-
return
|
|
611
|
-
|
|
612
|
-
# Check WAA
|
|
613
|
-
waa_path = find_waa_path()
|
|
614
|
-
if waa_path:
|
|
615
|
-
print(f"WAA Path: {waa_path}")
|
|
1389
|
+
if result.returncode == 0:
|
|
1390
|
+
ip = get_vm_ip()
|
|
1391
|
+
log("VM-START", f"VM started: {ip}")
|
|
1392
|
+
log("VM-START", "Run 'build' then 'start' to launch WAA container")
|
|
1393
|
+
return 0
|
|
616
1394
|
else:
|
|
617
|
-
|
|
618
|
-
|
|
1395
|
+
log("VM-START", f"ERROR: {result.stderr}")
|
|
1396
|
+
return 1
|
|
1397
|
+
|
|
1398
|
+
|
|
1399
|
+
def cmd_exec(args):
|
|
1400
|
+
"""Run command on VM host."""
|
|
1401
|
+
ip = get_vm_ip()
|
|
1402
|
+
if not ip:
|
|
1403
|
+
print("ERROR: VM not found or not running")
|
|
1404
|
+
return 1
|
|
1405
|
+
|
|
1406
|
+
cmd = args.cmd
|
|
1407
|
+
if not cmd:
|
|
1408
|
+
print("ERROR: --cmd is required")
|
|
1409
|
+
return 1
|
|
1410
|
+
|
|
1411
|
+
result = ssh_run(ip, cmd, stream=True)
|
|
1412
|
+
return result.returncode
|
|
1413
|
+
|
|
1414
|
+
|
|
1415
|
+
def cmd_docker_exec(args):
|
|
1416
|
+
"""Run command inside winarena container."""
|
|
1417
|
+
ip = get_vm_ip()
|
|
1418
|
+
if not ip:
|
|
1419
|
+
print("ERROR: VM not found or not running")
|
|
1420
|
+
return 1
|
|
1421
|
+
|
|
1422
|
+
cmd = args.cmd
|
|
1423
|
+
if not cmd:
|
|
1424
|
+
print("ERROR: --cmd is required")
|
|
1425
|
+
return 1
|
|
1426
|
+
|
|
1427
|
+
docker_cmd = f"docker exec winarena {cmd}"
|
|
1428
|
+
result = ssh_run(ip, docker_cmd, stream=True)
|
|
1429
|
+
return result.returncode
|
|
1430
|
+
|
|
1431
|
+
|
|
1432
|
+
def cmd_vnc(args):
|
|
1433
|
+
"""Open VNC to view Windows desktop via SSH tunnel."""
|
|
1434
|
+
ip = get_vm_ip()
|
|
1435
|
+
if not ip:
|
|
1436
|
+
print("ERROR: VM not found or not running")
|
|
1437
|
+
return 1
|
|
1438
|
+
|
|
1439
|
+
print(f"Setting up SSH tunnel to VM ({ip})...")
|
|
1440
|
+
print("VNC will be available at: http://localhost:8006")
|
|
1441
|
+
print("-" * 60)
|
|
1442
|
+
|
|
1443
|
+
# Kill any existing tunnel on port 8006
|
|
1444
|
+
subprocess.run(["pkill", "-f", "ssh.*8006:localhost:8006"], capture_output=True)
|
|
1445
|
+
|
|
1446
|
+
# Start SSH tunnel in background
|
|
1447
|
+
tunnel_proc = subprocess.Popen(
|
|
1448
|
+
["ssh", *SSH_OPTS, "-N", "-L", "8006:localhost:8006", f"azureuser@{ip}"],
|
|
1449
|
+
stdout=subprocess.DEVNULL,
|
|
1450
|
+
stderr=subprocess.DEVNULL,
|
|
1451
|
+
)
|
|
619
1452
|
|
|
620
|
-
#
|
|
621
|
-
|
|
622
|
-
try:
|
|
623
|
-
client = AzureMLClient(config)
|
|
624
|
-
computes = client.list_compute_instances(prefix="w")
|
|
625
|
-
print(f"Connection: OK")
|
|
626
|
-
|
|
627
|
-
if computes:
|
|
628
|
-
print(f"\nActive Compute Instances ({len(computes)}):")
|
|
629
|
-
for name in computes:
|
|
630
|
-
try:
|
|
631
|
-
status = client.get_compute_status(name)
|
|
632
|
-
print(f" - {name}: {status}")
|
|
633
|
-
except Exception:
|
|
634
|
-
print(f" - {name}: (status unknown)")
|
|
635
|
-
else:
|
|
636
|
-
print("\nNo active compute instances.")
|
|
1453
|
+
# Give tunnel a moment to establish
|
|
1454
|
+
time.sleep(2)
|
|
637
1455
|
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
print(
|
|
1456
|
+
# Check if tunnel is running
|
|
1457
|
+
if tunnel_proc.poll() is not None:
|
|
1458
|
+
print("ERROR: SSH tunnel failed to start")
|
|
1459
|
+
return 1
|
|
1460
|
+
|
|
1461
|
+
print(f"SSH tunnel established (PID: {tunnel_proc.pid})")
|
|
1462
|
+
|
|
1463
|
+
# Open browser
|
|
1464
|
+
import webbrowser
|
|
1465
|
+
|
|
1466
|
+
vnc_url = "http://localhost:8006"
|
|
1467
|
+
print(f"Opening {vnc_url} in browser...")
|
|
1468
|
+
webbrowser.open(vnc_url)
|
|
641
1469
|
|
|
642
1470
|
print()
|
|
1471
|
+
print("VNC is now accessible at: http://localhost:8006")
|
|
1472
|
+
print("Press Ctrl+C to close the tunnel")
|
|
1473
|
+
print("-" * 60)
|
|
643
1474
|
|
|
1475
|
+
try:
|
|
1476
|
+
# Keep tunnel alive
|
|
1477
|
+
tunnel_proc.wait()
|
|
1478
|
+
except KeyboardInterrupt:
|
|
1479
|
+
print("\nClosing SSH tunnel...")
|
|
1480
|
+
tunnel_proc.terminate()
|
|
644
1481
|
|
|
645
|
-
|
|
646
|
-
"""Clean up all Azure compute resources."""
|
|
647
|
-
setup_logging(args.verbose)
|
|
1482
|
+
return 0
|
|
648
1483
|
|
|
649
|
-
from openadapt_ml.benchmarks.azure import AzureConfig, AzureMLClient
|
|
650
1484
|
|
|
651
|
-
|
|
1485
|
+
def _show_benchmark_progress(ip: str) -> int:
|
|
1486
|
+
"""Show benchmark progress with estimated completion time.
|
|
652
1487
|
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
|
|
657
|
-
|
|
1488
|
+
Parses the run log to count completed tasks and estimate remaining time.
|
|
1489
|
+
"""
|
|
1490
|
+
# Find the most recent run log
|
|
1491
|
+
result = ssh_run(
|
|
1492
|
+
ip, "ls -t /home/azureuser/cli_logs/run_*.log 2>/dev/null | head -1"
|
|
1493
|
+
)
|
|
1494
|
+
log_file = result.stdout.strip()
|
|
1495
|
+
|
|
1496
|
+
if not log_file:
|
|
1497
|
+
print("No benchmark running. Start one with: run --num-tasks N")
|
|
1498
|
+
return 1
|
|
1499
|
+
|
|
1500
|
+
# Get task count and timestamps
|
|
1501
|
+
result = ssh_run(
|
|
1502
|
+
ip,
|
|
1503
|
+
f"""
|
|
1504
|
+
echo "=== WAA Benchmark Progress ==="
|
|
1505
|
+
echo ""
|
|
1506
|
+
|
|
1507
|
+
# Count completed tasks (each "Result:" line = 1 task done)
|
|
1508
|
+
COMPLETED=$(grep -c "Result:" {log_file} 2>/dev/null || echo 0)
|
|
1509
|
+
# Count total tasks from task list (sum of all domain counts)
|
|
1510
|
+
TOTAL=$(grep -A20 "Left tasks:" {log_file} | grep -E "^[a-z_]+: [0-9]+" | awk -F': ' '{{sum+=$2}} END {{print sum}}')
|
|
1511
|
+
[ -z "$TOTAL" ] || [ "$TOTAL" -eq 0 ] && TOTAL=154
|
|
1512
|
+
|
|
1513
|
+
# Get timestamps
|
|
1514
|
+
FIRST_TS=$(grep -oE '\\[2026-[0-9-]+ [0-9:]+' {log_file} | head -1 | tr -d '[')
|
|
1515
|
+
LAST_TS=$(grep -oE '\\[2026-[0-9-]+ [0-9:]+' {log_file} | tail -1 | tr -d '[')
|
|
1516
|
+
|
|
1517
|
+
echo "Log: {log_file}"
|
|
1518
|
+
echo "Started: $FIRST_TS"
|
|
1519
|
+
echo "Latest: $LAST_TS"
|
|
1520
|
+
echo ""
|
|
1521
|
+
echo "Tasks completed: $COMPLETED / $TOTAL"
|
|
1522
|
+
|
|
1523
|
+
# Calculate elapsed minutes
|
|
1524
|
+
if [ -n "$FIRST_TS" ] && [ -n "$LAST_TS" ]; then
|
|
1525
|
+
START_H=$(echo "$FIRST_TS" | awk '{{print $2}}' | cut -d: -f1)
|
|
1526
|
+
START_M=$(echo "$FIRST_TS" | awk '{{print $2}}' | cut -d: -f2)
|
|
1527
|
+
NOW_H=$(echo "$LAST_TS" | awk '{{print $2}}' | cut -d: -f1)
|
|
1528
|
+
NOW_M=$(echo "$LAST_TS" | awk '{{print $2}}' | cut -d: -f2)
|
|
1529
|
+
|
|
1530
|
+
ELAPSED_MIN=$(( (NOW_H - START_H) * 60 + (NOW_M - START_M) ))
|
|
1531
|
+
echo "Elapsed: $ELAPSED_MIN minutes"
|
|
1532
|
+
|
|
1533
|
+
if [ "$COMPLETED" -gt 0 ] && [ "$ELAPSED_MIN" -gt 0 ]; then
|
|
1534
|
+
MIN_PER_TASK=$((ELAPSED_MIN / COMPLETED))
|
|
1535
|
+
REMAINING=$((TOTAL - COMPLETED))
|
|
1536
|
+
EST_MIN=$((REMAINING * MIN_PER_TASK))
|
|
1537
|
+
EST_H=$((EST_MIN / 60))
|
|
1538
|
+
EST_M=$((EST_MIN % 60))
|
|
1539
|
+
|
|
1540
|
+
echo ""
|
|
1541
|
+
echo "Avg time per task: ~$MIN_PER_TASK min"
|
|
1542
|
+
echo "Remaining tasks: $REMAINING"
|
|
1543
|
+
echo "Estimated remaining: ~${{EST_H}}h ${{EST_M}}m"
|
|
1544
|
+
|
|
1545
|
+
# Progress bar
|
|
1546
|
+
PCT=$((COMPLETED * 100 / TOTAL))
|
|
1547
|
+
echo ""
|
|
1548
|
+
echo "Progress: $PCT% [$COMPLETED/$TOTAL]"
|
|
1549
|
+
fi
|
|
1550
|
+
fi
|
|
1551
|
+
""",
|
|
1552
|
+
)
|
|
1553
|
+
print(result.stdout)
|
|
1554
|
+
return 0
|
|
658
1555
|
|
|
659
|
-
print(f"Workspace: {config.workspace_name}")
|
|
660
|
-
print(f"Resource Group: {config.resource_group}")
|
|
661
|
-
print()
|
|
662
1556
|
|
|
663
|
-
|
|
1557
|
+
def _show_run_logs(ip: str, follow: bool = False, tail: Optional[int] = None) -> int:
|
|
1558
|
+
"""Show the most recent run command log file.
|
|
664
1559
|
|
|
665
|
-
|
|
666
|
-
|
|
667
|
-
|
|
1560
|
+
Args:
|
|
1561
|
+
ip: VM IP address
|
|
1562
|
+
follow: If True, use tail -f to stream the log
|
|
1563
|
+
tail: Number of lines to show (default: entire file or 100 for follow)
|
|
1564
|
+
|
|
1565
|
+
Returns:
|
|
1566
|
+
Exit code (0 for success, 1 for error)
|
|
1567
|
+
"""
|
|
1568
|
+
# Find the most recent run log file
|
|
1569
|
+
result = ssh_run(
|
|
1570
|
+
ip, "ls -t /home/azureuser/cli_logs/run_*.log 2>/dev/null | head -1"
|
|
1571
|
+
)
|
|
1572
|
+
log_file = result.stdout.strip()
|
|
668
1573
|
|
|
669
|
-
if not
|
|
670
|
-
print("
|
|
1574
|
+
if not log_file:
|
|
1575
|
+
print("No run logs found at /home/azureuser/cli_logs/run_*.log")
|
|
1576
|
+
print("Run a benchmark first: cli_v2 run --task <task_id>")
|
|
1577
|
+
return 1
|
|
1578
|
+
|
|
1579
|
+
print(f"Run log: {log_file}")
|
|
1580
|
+
print("-" * 60)
|
|
1581
|
+
|
|
1582
|
+
if follow:
|
|
1583
|
+
# Stream the log file
|
|
1584
|
+
print("Streaming log (Ctrl+C to stop)...")
|
|
1585
|
+
subprocess.run(["ssh", *SSH_OPTS, f"azureuser@{ip}", f"tail -f {log_file}"])
|
|
671
1586
|
else:
|
|
672
|
-
|
|
673
|
-
|
|
674
|
-
|
|
675
|
-
|
|
676
|
-
|
|
677
|
-
|
|
678
|
-
|
|
1587
|
+
# Show the log file contents
|
|
1588
|
+
if tail:
|
|
1589
|
+
cmd = f"tail -n {tail} {log_file}"
|
|
1590
|
+
else:
|
|
1591
|
+
# Check file size first - if small, cat it; if large, use tail
|
|
1592
|
+
size_result = ssh_run(ip, f"wc -l < {log_file}")
|
|
1593
|
+
line_count = (
|
|
1594
|
+
int(size_result.stdout.strip())
|
|
1595
|
+
if size_result.stdout.strip().isdigit()
|
|
1596
|
+
else 0
|
|
1597
|
+
)
|
|
679
1598
|
|
|
680
|
-
|
|
681
|
-
|
|
682
|
-
|
|
683
|
-
|
|
684
|
-
|
|
685
|
-
|
|
686
|
-
|
|
687
|
-
|
|
688
|
-
|
|
689
|
-
|
|
690
|
-
|
|
691
|
-
except Exception as e:
|
|
692
|
-
print(f" FAILED: {e}")
|
|
693
|
-
|
|
694
|
-
print("\nCleanup complete.")
|
|
695
|
-
print("Note: Resource deletion may take a few minutes to free quota.")
|
|
696
|
-
print()
|
|
1599
|
+
if line_count <= 200:
|
|
1600
|
+
cmd = f"cat {log_file}"
|
|
1601
|
+
else:
|
|
1602
|
+
print(
|
|
1603
|
+
f"(Showing last 100 of {line_count} lines, use --tail N for more)"
|
|
1604
|
+
)
|
|
1605
|
+
cmd = f"tail -n 100 {log_file}"
|
|
1606
|
+
|
|
1607
|
+
subprocess.run(["ssh", *SSH_OPTS, f"azureuser@{ip}", cmd])
|
|
1608
|
+
|
|
1609
|
+
return 0
|
|
697
1610
|
|
|
698
1611
|
|
|
699
|
-
def
|
|
700
|
-
"""
|
|
701
|
-
import subprocess
|
|
1612
|
+
def cmd_logs(args):
|
|
1613
|
+
"""Show comprehensive logs from the WAA container.
|
|
702
1614
|
|
|
703
|
-
|
|
1615
|
+
Default behavior shows all relevant logs (docker, storage, probe status).
|
|
1616
|
+
Use --follow to stream docker logs continuously.
|
|
1617
|
+
Use --run to show run command output instead of container logs.
|
|
1618
|
+
Use --progress to show benchmark progress and ETA.
|
|
1619
|
+
"""
|
|
1620
|
+
ip = get_vm_ip()
|
|
1621
|
+
if not ip:
|
|
1622
|
+
print("ERROR: VM not found")
|
|
1623
|
+
return 1
|
|
1624
|
+
|
|
1625
|
+
# Handle --progress flag: show benchmark progress
|
|
1626
|
+
if getattr(args, "progress", False):
|
|
1627
|
+
return _show_benchmark_progress(ip)
|
|
1628
|
+
|
|
1629
|
+
# Handle --run flag: show run command output
|
|
1630
|
+
if args.run:
|
|
1631
|
+
return _show_run_logs(ip, args.follow, args.tail)
|
|
1632
|
+
|
|
1633
|
+
# Check if container exists
|
|
1634
|
+
result = ssh_run(ip, "docker ps -a --filter name=winarena --format '{{.Status}}'")
|
|
1635
|
+
container_status = result.stdout.strip()
|
|
1636
|
+
container_exists = bool(container_status)
|
|
1637
|
+
|
|
1638
|
+
# If --follow, stream the most relevant logs
|
|
1639
|
+
if args.follow:
|
|
1640
|
+
# Priority 1: If container is running, stream container logs
|
|
1641
|
+
if container_exists and "Up" in container_status:
|
|
1642
|
+
print(f"Streaming container logs from VM ({ip}):")
|
|
1643
|
+
print("Press Ctrl+C to stop")
|
|
1644
|
+
print("-" * 60)
|
|
1645
|
+
subprocess.run(
|
|
1646
|
+
["ssh", *SSH_OPTS, f"azureuser@{ip}", "docker logs -f winarena 2>&1"]
|
|
1647
|
+
)
|
|
1648
|
+
return 0
|
|
704
1649
|
|
|
705
|
-
|
|
706
|
-
|
|
707
|
-
|
|
708
|
-
|
|
709
|
-
|
|
1650
|
+
# Priority 2: Check for active docker build
|
|
1651
|
+
result = ssh_run(
|
|
1652
|
+
ip,
|
|
1653
|
+
"pgrep -f 'docker build' >/dev/null && echo BUILD_RUNNING || echo NO_BUILD",
|
|
1654
|
+
)
|
|
1655
|
+
if "BUILD_RUNNING" in result.stdout:
|
|
1656
|
+
print(f"Docker build in progress on VM ({ip})")
|
|
1657
|
+
print("Streaming build logs (Ctrl+C to stop):")
|
|
1658
|
+
print("-" * 60)
|
|
1659
|
+
# Find and tail the most recent build log
|
|
1660
|
+
subprocess.run(
|
|
1661
|
+
[
|
|
1662
|
+
"ssh",
|
|
1663
|
+
*SSH_OPTS,
|
|
1664
|
+
f"azureuser@{ip}",
|
|
1665
|
+
"tail -f $(ls -t ~/cli_logs/build_*.log 2>/dev/null | head -1) 2>/dev/null || "
|
|
1666
|
+
"tail -f ~/build.log 2>/dev/null || "
|
|
1667
|
+
"echo 'No build logs found - build may have just started'",
|
|
1668
|
+
]
|
|
1669
|
+
)
|
|
1670
|
+
return 0
|
|
1671
|
+
|
|
1672
|
+
# Priority 3: No container, no build - show helpful message
|
|
1673
|
+
print(f"Container 'winarena' not running on VM ({ip})")
|
|
1674
|
+
print()
|
|
1675
|
+
# Check if image exists
|
|
1676
|
+
result = ssh_run(
|
|
1677
|
+
ip, "docker images waa-auto:latest --format '{{.Repository}}:{{.Tag}}'"
|
|
1678
|
+
)
|
|
1679
|
+
if result.stdout.strip():
|
|
1680
|
+
print("Image 'waa-auto:latest' is ready.")
|
|
1681
|
+
print("Run: uv run python -m openadapt_ml.benchmarks.cli_v2 start")
|
|
1682
|
+
else:
|
|
1683
|
+
print("Image not yet built.")
|
|
1684
|
+
print("Run: uv run python -m openadapt_ml.benchmarks.cli_v2 build")
|
|
1685
|
+
return 1
|
|
1686
|
+
|
|
1687
|
+
# Default: show comprehensive status
|
|
1688
|
+
import sys
|
|
1689
|
+
|
|
1690
|
+
print(f"WAA Status ({ip})")
|
|
1691
|
+
print("=" * 60)
|
|
1692
|
+
sys.stdout.flush()
|
|
1693
|
+
|
|
1694
|
+
# Docker images
|
|
1695
|
+
print("\n[Docker Images]", flush=True)
|
|
1696
|
+
subprocess.run(
|
|
1697
|
+
[
|
|
1698
|
+
"ssh",
|
|
1699
|
+
*SSH_OPTS,
|
|
1700
|
+
f"azureuser@{ip}",
|
|
1701
|
+
"docker images --format 'table {{.Repository}}\\t{{.Tag}}\\t{{.Size}}' 2>/dev/null | head -5",
|
|
1702
|
+
]
|
|
1703
|
+
)
|
|
1704
|
+
|
|
1705
|
+
# Container status
|
|
1706
|
+
print("\n[Container]", flush=True)
|
|
1707
|
+
if container_exists:
|
|
1708
|
+
print(f" Status: {container_status}", flush=True)
|
|
710
1709
|
else:
|
|
711
|
-
print("
|
|
712
|
-
|
|
1710
|
+
print(" Container 'winarena' not created yet", flush=True)
|
|
1711
|
+
# Check for active build
|
|
1712
|
+
result = ssh_run(
|
|
1713
|
+
ip,
|
|
1714
|
+
"pgrep -f 'docker build' >/dev/null && echo BUILD_RUNNING || echo NO_BUILD",
|
|
1715
|
+
)
|
|
1716
|
+
if "BUILD_RUNNING" in result.stdout:
|
|
1717
|
+
print(" Docker build in progress...", flush=True)
|
|
1718
|
+
|
|
1719
|
+
# Only show these sections if container exists
|
|
1720
|
+
if container_exists and "Up" in container_status:
|
|
1721
|
+
# Storage info
|
|
1722
|
+
print("\n[Storage]", flush=True)
|
|
1723
|
+
subprocess.run(
|
|
1724
|
+
[
|
|
1725
|
+
"ssh",
|
|
1726
|
+
*SSH_OPTS,
|
|
1727
|
+
f"azureuser@{ip}",
|
|
1728
|
+
"docker exec winarena sh -c '"
|
|
1729
|
+
'echo " Total: $(du -sh /storage/ 2>/dev/null | cut -f1)"; '
|
|
1730
|
+
'ls -lh /storage/*.img 2>/dev/null | awk "{print \\" Disk image: \\" \\$5}" || true'
|
|
1731
|
+
"'",
|
|
1732
|
+
]
|
|
1733
|
+
)
|
|
1734
|
+
|
|
1735
|
+
# QEMU VM status
|
|
1736
|
+
print("\n[QEMU VM]", flush=True)
|
|
1737
|
+
subprocess.run(
|
|
1738
|
+
[
|
|
1739
|
+
"ssh",
|
|
1740
|
+
*SSH_OPTS,
|
|
1741
|
+
f"azureuser@{ip}",
|
|
1742
|
+
"docker exec winarena sh -c '"
|
|
1743
|
+
"QPID=$(pgrep -f qemu-system 2>/dev/null | head -1); "
|
|
1744
|
+
'if [ -n "$QPID" ]; then '
|
|
1745
|
+
' echo " Status: Running (PID $QPID)"; '
|
|
1746
|
+
' ps -o %cpu,%mem,etime -p $QPID 2>/dev/null | tail -1 | awk "{print \\" CPU: \\" \\$1 \\"%, MEM: \\" \\$2 \\"%, Uptime: \\" \\$3}"; '
|
|
1747
|
+
"else "
|
|
1748
|
+
' echo " Status: Not running"; '
|
|
1749
|
+
"fi"
|
|
1750
|
+
"'",
|
|
1751
|
+
]
|
|
1752
|
+
)
|
|
1753
|
+
|
|
1754
|
+
# WAA server probe
|
|
1755
|
+
print("\n[WAA Server]", flush=True)
|
|
1756
|
+
subprocess.run(
|
|
1757
|
+
[
|
|
1758
|
+
"ssh",
|
|
1759
|
+
*SSH_OPTS,
|
|
1760
|
+
f"azureuser@{ip}",
|
|
1761
|
+
"docker exec winarena curl -s --max-time 5 http://172.30.0.2:5000/probe 2>/dev/null && echo ' (READY)' || echo 'Not ready (Windows installing - check VNC for progress)'",
|
|
1762
|
+
]
|
|
1763
|
+
)
|
|
1764
|
+
|
|
1765
|
+
# Windows install log (written by install.bat to Samba share at Z:\install_log.txt)
|
|
1766
|
+
# The Samba share \\host.lan\Data maps to /tmp/smb inside the container
|
|
1767
|
+
result = ssh_run(
|
|
1768
|
+
ip, "docker exec winarena cat /tmp/smb/install_log.txt 2>/dev/null | wc -l"
|
|
1769
|
+
)
|
|
1770
|
+
install_log_lines = result.stdout.strip()
|
|
1771
|
+
if install_log_lines and install_log_lines != "0":
|
|
1772
|
+
print("\n[Windows Install Log]", flush=True)
|
|
1773
|
+
# Show last 10 lines of the install log (shows current step like [5/14] Installing Git...)
|
|
713
1774
|
subprocess.run(
|
|
714
|
-
[
|
|
715
|
-
|
|
716
|
-
|
|
1775
|
+
[
|
|
1776
|
+
"ssh",
|
|
1777
|
+
*SSH_OPTS,
|
|
1778
|
+
f"azureuser@{ip}",
|
|
1779
|
+
"docker exec winarena tail -10 /tmp/smb/install_log.txt 2>/dev/null",
|
|
1780
|
+
]
|
|
717
1781
|
)
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
|
|
721
|
-
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
|
|
726
|
-
|
|
727
|
-
|
|
728
|
-
|
|
729
|
-
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
|
|
733
|
-
|
|
734
|
-
|
|
735
|
-
|
|
736
|
-
sys.exit(1)
|
|
1782
|
+
|
|
1783
|
+
# Recent docker logs
|
|
1784
|
+
tail_lines = args.tail if args.tail else 20
|
|
1785
|
+
print(f"\n[Recent Logs (last {tail_lines} lines)]", flush=True)
|
|
1786
|
+
print("-" * 60, flush=True)
|
|
1787
|
+
subprocess.run(
|
|
1788
|
+
[
|
|
1789
|
+
"ssh",
|
|
1790
|
+
*SSH_OPTS,
|
|
1791
|
+
f"azureuser@{ip}",
|
|
1792
|
+
f"docker logs --tail {tail_lines} winarena 2>&1",
|
|
1793
|
+
]
|
|
1794
|
+
)
|
|
1795
|
+
|
|
1796
|
+
print("\n" + "=" * 60, flush=True)
|
|
1797
|
+
print("VNC: ssh -L 8006:localhost:8006 azureuser@" + ip, flush=True)
|
|
1798
|
+
print(" Then open http://localhost:8006", flush=True)
|
|
1799
|
+
print(" (Windows installation % visible on VNC screen)", flush=True)
|
|
737
1800
|
else:
|
|
738
|
-
|
|
739
|
-
print("
|
|
740
|
-
|
|
741
|
-
|
|
742
|
-
|
|
743
|
-
|
|
744
|
-
|
|
745
|
-
|
|
746
|
-
|
|
1801
|
+
# Show next steps
|
|
1802
|
+
print("\n[Next Steps]")
|
|
1803
|
+
result = ssh_run(ip, "docker images waa-auto:latest --format '{{.Repository}}'")
|
|
1804
|
+
if result.stdout.strip():
|
|
1805
|
+
print(" Image ready. Run: cli_v2 start")
|
|
1806
|
+
else:
|
|
1807
|
+
print(" Build image first. Run: cli_v2 build")
|
|
1808
|
+
|
|
1809
|
+
return 0
|
|
1810
|
+
|
|
1811
|
+
|
|
1812
|
+
# =============================================================================
|
|
1813
|
+
# Main
|
|
1814
|
+
# =============================================================================
|
|
747
1815
|
|
|
748
1816
|
|
|
749
|
-
def main()
|
|
1817
|
+
def main():
    """Parse the command line and dispatch to the selected sub-command.

    Each sub-command registers its handler through ``set_defaults(func=...)``.
    The handler is called with the parsed ``argparse.Namespace`` and its
    return value is passed to ``sys.exit`` as the process exit status, so
    this function does not return normally.
    """
    parser = argparse.ArgumentParser(
        description="WAA Benchmark CLI v2 - Minimal working CLI",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        # NOTE: every example below must use a sub-command and flags that are
        # actually registered further down; argparse rejects anything else
        # with "invalid choice" / "unrecognized arguments".
        epilog="""
Examples:
  # Full setup workflow
  %(prog)s create                  # Create Azure VM
  %(prog)s build                   # Build WAA image from waa_deploy/Dockerfile
  %(prog)s start                   # Start container + Windows
  %(prog)s probe --wait            # Wait for WAA server
  %(prog)s run --num-tasks 1       # Run benchmark (navi agent)
  %(prog)s deallocate              # Stop billing

  # Monitor in separate terminal
  %(prog)s logs --follow           # Stream docker container logs
  %(prog)s vnc                     # View Windows desktop

  # Cleanup
  %(prog)s delete
""",
    )

    # A sub-command is mandatory, so ``args.func`` is always set after parsing.
    subparsers = parser.add_subparsers(dest="command", required=True)

    # create
    p_create = subparsers.add_parser("create", help="Create Azure VM")
    p_create.add_argument(
        "--fast",
        action="store_true",
        # "%%" is required: argparse %-formats help strings.
        help="Use larger VM (D8ds_v5, $0.38/hr) for ~30%% faster install, ~40%% faster eval",
    )
    p_create.set_defaults(func=cmd_create)

    # delete
    p_delete = subparsers.add_parser("delete", help="Delete VM and all resources")
    p_delete.set_defaults(func=cmd_delete)

    # status
    p_status = subparsers.add_parser("status", help="Show VM status")
    p_status.set_defaults(func=cmd_status)

    # build
    p_build = subparsers.add_parser(
        "build", help="Build WAA image from waa_deploy/Dockerfile"
    )
    p_build.set_defaults(func=cmd_build)

    # start
    p_start = subparsers.add_parser("start", help="Start WAA container")
    p_start.add_argument(
        "--fresh", action="store_true", help="Clean storage for fresh Windows install"
    )
    p_start.add_argument(
        "--no-vnc", action="store_true", help="Don't auto-launch VNC viewer"
    )
    p_start.add_argument(
        "--fast",
        action="store_true",
        help="Allocate more CPU/RAM to QEMU (use with D8ds_v5 VM)",
    )
    p_start.set_defaults(func=cmd_start)

    # stop
    p_stop = subparsers.add_parser("stop", help="Stop and remove WAA container")
    p_stop.add_argument(
        "--clean", action="store_true", help="Also clean Windows storage"
    )
    p_stop.set_defaults(func=cmd_stop)

    # probe
    p_probe = subparsers.add_parser("probe", help="Check if WAA server is ready")
    p_probe.add_argument("--wait", action="store_true", help="Wait until ready")
    p_probe.add_argument(
        "--timeout", type=int, default=1200, help="Timeout in seconds (default: 1200)"
    )
    p_probe.set_defaults(func=cmd_probe)

    # run
    p_run = subparsers.add_parser(
        "run", help="Run benchmark tasks (uses vanilla WAA navi agent)"
    )
    p_run.add_argument(
        "--num-tasks",
        type=int,
        default=1,
        help="Number of tasks to run (ignored if --task specified)",
    )
    p_run.add_argument("--task", help="Specific task ID to run")
    p_run.add_argument(
        "--domain",
        default="all",
        help="Domain filter (e.g., 'notepad', 'chrome', 'all')",
    )
    p_run.add_argument(
        "--model", default="gpt-4o", help="Model for navi agent (default: gpt-4o)"
    )
    p_run.add_argument(
        "--api-key", help="OpenAI API key (or set OPENAI_API_KEY in .env)"
    )
    p_run.add_argument(
        "--no-download", action="store_true", help="Skip downloading results"
    )
    p_run.add_argument(
        "--worker-id",
        type=int,
        default=0,
        help="Worker ID for parallel execution (0-indexed)",
    )
    p_run.add_argument(
        "--num-workers",
        type=int,
        default=1,
        help="Total number of parallel workers",
    )
    p_run.set_defaults(func=cmd_run)

    # download
    p_download = subparsers.add_parser(
        "download", help="Download benchmark results from VM"
    )
    p_download.set_defaults(func=cmd_download)

    # analyze
    p_analyze = subparsers.add_parser("analyze", help="Analyze benchmark results")
    p_analyze.add_argument(
        "--results-dir",
        help="Results directory (default: most recent in benchmark_results/)",
    )
    p_analyze.set_defaults(func=cmd_analyze)

    # tasks
    p_tasks = subparsers.add_parser("tasks", help="List available WAA benchmark tasks")
    p_tasks.add_argument(
        "--verbose", "-v", action="store_true", help="Show all task IDs"
    )
    p_tasks.set_defaults(func=cmd_tasks)

    # deallocate
    p_dealloc = subparsers.add_parser("deallocate", help="Stop VM (preserves disk)")
    p_dealloc.set_defaults(func=cmd_deallocate)

    # vm-start
    p_vmstart = subparsers.add_parser("vm-start", help="Start a deallocated VM")
    p_vmstart.set_defaults(func=cmd_vm_start)

    # logs
    p_logs = subparsers.add_parser("logs", help="Show WAA status and logs")
    p_logs.add_argument(
        "--follow", "-f", action="store_true", help="Stream docker logs continuously"
    )
    # No argparse default here: ``args.tail`` is None unless the flag is given.
    p_logs.add_argument(
        "--tail", "-n", type=int, help="Number of log lines to show (default: 20)"
    )
    p_logs.add_argument(
        "--run",
        action="store_true",
        help="Show run command output instead of container logs",
    )
    p_logs.add_argument(
        "--progress",
        "-p",
        action="store_true",
        help="Show benchmark progress and estimated completion time",
    )
    p_logs.set_defaults(func=cmd_logs)

    # exec
    p_exec = subparsers.add_parser("exec", help="Run command on VM host")
    p_exec.add_argument("--cmd", required=True, help="Command to run")
    p_exec.set_defaults(func=cmd_exec)

    # docker-exec
    p_dexec = subparsers.add_parser(
        "docker-exec", help="Run command inside winarena container"
    )
    p_dexec.add_argument("--cmd", required=True, help="Command to run")
    p_dexec.set_defaults(func=cmd_docker_exec)

    # vnc
    p_vnc = subparsers.add_parser(
        "vnc", help="Open VNC to view Windows desktop via SSH tunnel"
    )
    p_vnc.set_defaults(func=cmd_vnc)

    args = parser.parse_args()
    # The handler's return value becomes the process exit status.
    sys.exit(args.func(args))
|
|
881
2004
|
|
|
882
2005
|
|
|
883
2006
|
if __name__ == "__main__":
|