openadapt-ml 0.3.0__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openadapt_ml/benchmarks/cli.py +37 -42
- {openadapt_ml-0.3.0.dist-info → openadapt_ml-0.3.1.dist-info}/METADATA +141 -4
- {openadapt_ml-0.3.0.dist-info → openadapt_ml-0.3.1.dist-info}/RECORD +5 -5
- {openadapt_ml-0.3.0.dist-info → openadapt_ml-0.3.1.dist-info}/WHEEL +0 -0
- {openadapt_ml-0.3.0.dist-info → openadapt_ml-0.3.1.dist-info}/licenses/LICENSE +0 -0
openadapt_ml/benchmarks/cli.py
CHANGED
|
@@ -783,7 +783,7 @@ def cmd_pool_create(args):
|
|
|
783
783
|
working_region = region
|
|
784
784
|
working_cost = cost
|
|
785
785
|
# Delete the test VM and wait for completion
|
|
786
|
-
log("POOL",
|
|
786
|
+
log("POOL", " Found working combo, cleaning up test VM...")
|
|
787
787
|
subprocess.run(
|
|
788
788
|
["az", "vm", "delete", "-g", RESOURCE_GROUP, "-n", test_name, "--yes", "--force-deletion", "true"],
|
|
789
789
|
capture_output=True,
|
|
@@ -1089,7 +1089,7 @@ echo "STARTED"
|
|
|
1089
1089
|
workers_ready.append(worker)
|
|
1090
1090
|
del workers_pending[name]
|
|
1091
1091
|
registry.update_worker(name, waa_ready=True, status="ready")
|
|
1092
|
-
except Exception
|
|
1092
|
+
except Exception:
|
|
1093
1093
|
pass # Keep trying
|
|
1094
1094
|
|
|
1095
1095
|
if workers_pending:
|
|
@@ -1494,7 +1494,6 @@ def cmd_pool_logs(args):
|
|
|
1494
1494
|
Shows Docker container logs from each worker with [worker-name] prefix.
|
|
1495
1495
|
Use Ctrl+C to stop.
|
|
1496
1496
|
"""
|
|
1497
|
-
import sys
|
|
1498
1497
|
import threading
|
|
1499
1498
|
from queue import Queue, Empty
|
|
1500
1499
|
|
|
@@ -1516,7 +1515,7 @@ def cmd_pool_logs(args):
|
|
|
1516
1515
|
|
|
1517
1516
|
pool_id = pool.get("pool_id", "unknown")
|
|
1518
1517
|
print(f"[pool-logs] Streaming logs from {len(workers)} workers (pool: {pool_id})")
|
|
1519
|
-
print(
|
|
1518
|
+
print("[pool-logs] Press Ctrl+C to stop\n", flush=True)
|
|
1520
1519
|
|
|
1521
1520
|
# Queue for collecting output from all workers
|
|
1522
1521
|
output_queue = Queue()
|
|
@@ -2041,10 +2040,10 @@ def cmd_start(args):
|
|
|
2041
2040
|
log("START", "Starting container with VERSION=11e...")
|
|
2042
2041
|
|
|
2043
2042
|
# Get agent and model from args (defaults match WAA defaults)
|
|
2044
|
-
|
|
2045
|
-
|
|
2046
|
-
|
|
2047
|
-
|
|
2043
|
+
getattr(args, "agent", "navi")
|
|
2044
|
+
getattr(args, "model", "gpt-4o")
|
|
2045
|
+
getattr(args, "som_origin", "oss")
|
|
2046
|
+
getattr(args, "a11y_backend", "uia")
|
|
2048
2047
|
|
|
2049
2048
|
# The vanilla windowsarena/winarena:latest image uses --entrypoint /bin/bash
|
|
2050
2049
|
# and requires entry.sh as the command argument
|
|
@@ -2210,8 +2209,8 @@ def cmd_test_golden_image(args):
|
|
|
2210
2209
|
|
|
2211
2210
|
if "FAIL" not in result.stdout and result.stdout.strip():
|
|
2212
2211
|
elapsed = time.time() - start_time
|
|
2213
|
-
log("TEST",
|
|
2214
|
-
log("TEST",
|
|
2212
|
+
log("TEST", "")
|
|
2213
|
+
log("TEST", "✅ GOLDEN IMAGE TEST PASSED")
|
|
2215
2214
|
log("TEST", f" Boot time: {elapsed:.1f} seconds")
|
|
2216
2215
|
log("TEST", f" Image size: {image_size}")
|
|
2217
2216
|
log("TEST", f" Response: {result.stdout.strip()[:80]}")
|
|
@@ -2219,11 +2218,11 @@ def cmd_test_golden_image(args):
|
|
|
2219
2218
|
|
|
2220
2219
|
elapsed = time.time() - start_time
|
|
2221
2220
|
if elapsed > timeout:
|
|
2222
|
-
log("TEST",
|
|
2223
|
-
log("TEST",
|
|
2221
|
+
log("TEST", "")
|
|
2222
|
+
log("TEST", "❌ GOLDEN IMAGE TEST FAILED")
|
|
2224
2223
|
log("TEST", f" WAA server did not respond after {timeout}s")
|
|
2225
|
-
log("TEST",
|
|
2226
|
-
log("TEST",
|
|
2224
|
+
log("TEST", " This may indicate a corrupted golden image")
|
|
2225
|
+
log("TEST", " Try: cli.py start --fresh # to create new golden image")
|
|
2227
2226
|
return 1
|
|
2228
2227
|
|
|
2229
2228
|
# Progress display
|
|
@@ -2289,7 +2288,7 @@ def cmd_test_blob_access(args):
|
|
|
2289
2288
|
if "NOT_FOUND" in result.stdout:
|
|
2290
2289
|
log("TEST-BLOB", f"❌ Storage account '{storage_account}' not found or not accessible")
|
|
2291
2290
|
return 1
|
|
2292
|
-
log("TEST-BLOB",
|
|
2291
|
+
log("TEST-BLOB", " ✓ Storage account accessible")
|
|
2293
2292
|
|
|
2294
2293
|
# Check container exists - try candidates in order
|
|
2295
2294
|
log("TEST-BLOB", "3. Checking for usable container...")
|
|
@@ -2306,7 +2305,7 @@ def cmd_test_blob_access(args):
|
|
|
2306
2305
|
break
|
|
2307
2306
|
|
|
2308
2307
|
if not container_name:
|
|
2309
|
-
log("TEST-BLOB",
|
|
2308
|
+
log("TEST-BLOB", "❌ No suitable container found")
|
|
2310
2309
|
log("TEST-BLOB", f" Available: {available_containers}")
|
|
2311
2310
|
log("TEST-BLOB", f" Create one with: az storage container create --name waa-golden-images --account-name {storage_account}")
|
|
2312
2311
|
return 1
|
|
@@ -2325,7 +2324,7 @@ def cmd_test_blob_access(args):
|
|
|
2325
2324
|
if not blob_output or "Name" not in blob_output:
|
|
2326
2325
|
log("TEST-BLOB", " ⚠ Container is empty (no golden image uploaded yet)")
|
|
2327
2326
|
else:
|
|
2328
|
-
log("TEST-BLOB",
|
|
2327
|
+
log("TEST-BLOB", " ✓ Blobs found:")
|
|
2329
2328
|
for line in blob_output.split("\n")[:10]: # Show first 10
|
|
2330
2329
|
log("TEST-BLOB", f" {line}")
|
|
2331
2330
|
|
|
@@ -2383,7 +2382,7 @@ def cmd_test_api_key(args):
|
|
|
2383
2382
|
|
|
2384
2383
|
if response.status_code == 200:
|
|
2385
2384
|
log("TEST-API", "✅ API KEY TEST PASSED")
|
|
2386
|
-
log("TEST-API",
|
|
2385
|
+
log("TEST-API", " Model: gpt-4o-mini responded successfully")
|
|
2387
2386
|
return 0
|
|
2388
2387
|
elif response.status_code == 401:
|
|
2389
2388
|
log("TEST-API", "❌ API KEY INVALID (401 Unauthorized)")
|
|
@@ -2439,7 +2438,7 @@ def cmd_test_waa_tasks(args):
|
|
|
2439
2438
|
if "NOT_FOUND" in result.stdout:
|
|
2440
2439
|
log("TEST-TASKS", "❌ Task directory not found")
|
|
2441
2440
|
return 1
|
|
2442
|
-
log("TEST-TASKS",
|
|
2441
|
+
log("TEST-TASKS", "Task files found:")
|
|
2443
2442
|
for line in result.stdout.strip().split("\n")[:10]:
|
|
2444
2443
|
log("TEST-TASKS", f" {line}")
|
|
2445
2444
|
else:
|
|
@@ -3819,7 +3818,6 @@ def check_golden_image_in_blob(storage_account: str, storage_key: str, blob_cont
|
|
|
3819
3818
|
# Check for required files (only data.img is truly required, OVMF files come from Docker image)
|
|
3820
3819
|
# windows.vars contains UEFI variables for the specific Windows install
|
|
3821
3820
|
required_files = ["data.img"]
|
|
3822
|
-
optional_files = ["windows.vars", "windows.ver", "windows.rom", "windows.mac", "windows.base"]
|
|
3823
3821
|
found_files = [f["name"].replace("storage/", "") for f in files]
|
|
3824
3822
|
has_required = all(rf in found_files for rf in required_files)
|
|
3825
3823
|
|
|
@@ -4527,7 +4525,7 @@ def show_azure_ml_resources() -> dict:
|
|
|
4527
4525
|
code_share = get_azure_ml_file_share_name()
|
|
4528
4526
|
if code_share:
|
|
4529
4527
|
log("AZURE-ML", f" Share: {code_share}")
|
|
4530
|
-
log("AZURE-ML",
|
|
4528
|
+
log("AZURE-ML", " - Users/openadapt/compute-instance-startup.sh")
|
|
4531
4529
|
result["file_share_files"].append("Users/openadapt/compute-instance-startup.sh")
|
|
4532
4530
|
else:
|
|
4533
4531
|
log("AZURE-ML", " (not found)")
|
|
@@ -4599,7 +4597,7 @@ def teardown_azure_ml_resources(confirm: bool = False, keep_image: bool = False)
|
|
|
4599
4597
|
log("AZURE-ML", "")
|
|
4600
4598
|
log("AZURE-ML", " File Share:")
|
|
4601
4599
|
if code_share:
|
|
4602
|
-
log("AZURE-ML",
|
|
4600
|
+
log("AZURE-ML", " - Users/openadapt/compute-instance-startup.sh")
|
|
4603
4601
|
else:
|
|
4604
4602
|
log("AZURE-ML", " (none)")
|
|
4605
4603
|
|
|
@@ -4723,7 +4721,7 @@ def cmd_run_azure_ml_auto(args):
|
|
|
4723
4721
|
skip_benchmark = getattr(args, "skip_benchmark", False)
|
|
4724
4722
|
fast_vm = getattr(args, "fast", False)
|
|
4725
4723
|
|
|
4726
|
-
log("AUTO",
|
|
4724
|
+
log("AUTO", "Configuration:")
|
|
4727
4725
|
log("AUTO", f" Workers: {num_workers}")
|
|
4728
4726
|
log("AUTO", f" Setup timeout: {timeout_minutes} min")
|
|
4729
4727
|
log("AUTO", f" Probe timeout: {probe_timeout} sec")
|
|
@@ -4742,7 +4740,7 @@ def cmd_run_azure_ml_auto(args):
|
|
|
4742
4740
|
if state and "running" in state.lower():
|
|
4743
4741
|
log("AUTO", f" VM already running at {ip}")
|
|
4744
4742
|
elif state and "deallocated" in state.lower():
|
|
4745
|
-
log("AUTO",
|
|
4743
|
+
log("AUTO", " VM deallocated, starting...")
|
|
4746
4744
|
result = subprocess.run(
|
|
4747
4745
|
["az", "vm", "start", "-g", RESOURCE_GROUP, "-n", VM_NAME],
|
|
4748
4746
|
capture_output=True,
|
|
@@ -4866,7 +4864,7 @@ def cmd_run_azure_ml_auto(args):
|
|
|
4866
4864
|
# Step 3: Wait for WAA server to become ready
|
|
4867
4865
|
# =========================================================================
|
|
4868
4866
|
log("AUTO", "[Step 3/5] Waiting for WAA server...")
|
|
4869
|
-
log("AUTO",
|
|
4867
|
+
log("AUTO", " (This may take 15-20 minutes on first run)")
|
|
4870
4868
|
log("AUTO", f" Timeout: {probe_timeout} seconds")
|
|
4871
4869
|
|
|
4872
4870
|
probe_start = time.time()
|
|
@@ -5016,7 +5014,7 @@ def cmd_run_azure_ml_auto(args):
|
|
|
5016
5014
|
result = subprocess.run(cmd, cwd=waa_scripts)
|
|
5017
5015
|
|
|
5018
5016
|
if result.returncode != 0:
|
|
5019
|
-
log("AUTO",
|
|
5017
|
+
log("AUTO", " ERROR: run_azure.py failed")
|
|
5020
5018
|
return 1
|
|
5021
5019
|
|
|
5022
5020
|
# =========================================================================
|
|
@@ -5883,7 +5881,7 @@ def find_best_region_for_vm(vm_size: str, min_vcpus: int = 8, preferred_regions:
|
|
|
5883
5881
|
"warning": warning
|
|
5884
5882
|
}
|
|
5885
5883
|
|
|
5886
|
-
except Exception
|
|
5884
|
+
except Exception:
|
|
5887
5885
|
continue
|
|
5888
5886
|
|
|
5889
5887
|
# No ideal region found - return best available
|
|
@@ -6196,7 +6194,6 @@ def cmd_azure_ml_monitor(args):
|
|
|
6196
6194
|
|
|
6197
6195
|
# Set up VNC if requested
|
|
6198
6196
|
vnc_port = 8007
|
|
6199
|
-
tunnel_proc = None
|
|
6200
6197
|
|
|
6201
6198
|
if auto_vnc:
|
|
6202
6199
|
# Find compute instance for this job
|
|
@@ -6350,7 +6347,6 @@ def cmd_azure_ml_logs(args):
|
|
|
6350
6347
|
log("AZURE-ML-LOGS", "=" * 60)
|
|
6351
6348
|
|
|
6352
6349
|
last_size = 0
|
|
6353
|
-
process = None
|
|
6354
6350
|
|
|
6355
6351
|
try:
|
|
6356
6352
|
while True:
|
|
@@ -6388,7 +6384,7 @@ def cmd_azure_ml_logs(args):
|
|
|
6388
6384
|
# Clean up temp file
|
|
6389
6385
|
try:
|
|
6390
6386
|
os.unlink(temp_file)
|
|
6391
|
-
except:
|
|
6387
|
+
except Exception:
|
|
6392
6388
|
pass
|
|
6393
6389
|
|
|
6394
6390
|
if not follow:
|
|
@@ -6591,7 +6587,7 @@ def cmd_azure_ml_stream_logs(args):
|
|
|
6591
6587
|
log("PROGRESS", f"Last: {progress['messages'][-1].get('text', '')}")
|
|
6592
6588
|
last_progress = progress.copy()
|
|
6593
6589
|
log("STREAM", "")
|
|
6594
|
-
except Exception
|
|
6590
|
+
except Exception:
|
|
6595
6591
|
pass # Progress file may be partially written
|
|
6596
6592
|
|
|
6597
6593
|
# Show events if requested
|
|
@@ -6604,10 +6600,10 @@ def cmd_azure_ml_stream_logs(args):
|
|
|
6604
6600
|
try:
|
|
6605
6601
|
event = json.loads(line.strip())
|
|
6606
6602
|
log("EVENT", f"{event['type']}: {json.dumps(event.get('data', {}))}")
|
|
6607
|
-
except:
|
|
6603
|
+
except Exception:
|
|
6608
6604
|
pass
|
|
6609
6605
|
last_event_count = len(lines)
|
|
6610
|
-
except:
|
|
6606
|
+
except Exception:
|
|
6611
6607
|
pass
|
|
6612
6608
|
|
|
6613
6609
|
# Show log content (default)
|
|
@@ -6620,7 +6616,7 @@ def cmd_azure_ml_stream_logs(args):
|
|
|
6620
6616
|
new_content = content[last_log_size:]
|
|
6621
6617
|
print(new_content, end='', flush=True)
|
|
6622
6618
|
last_log_size = len(content)
|
|
6623
|
-
except:
|
|
6619
|
+
except Exception:
|
|
6624
6620
|
pass
|
|
6625
6621
|
|
|
6626
6622
|
# If no logs available yet
|
|
@@ -6631,7 +6627,7 @@ def cmd_azure_ml_stream_logs(args):
|
|
|
6631
6627
|
try:
|
|
6632
6628
|
job = ml_client.jobs.get(job_name)
|
|
6633
6629
|
status = job.status
|
|
6634
|
-
except:
|
|
6630
|
+
except Exception:
|
|
6635
6631
|
status = "Unknown"
|
|
6636
6632
|
|
|
6637
6633
|
if not follow:
|
|
@@ -6762,11 +6758,11 @@ def cmd_azure_ml_progress(args):
|
|
|
6762
6758
|
print(f"Last Update: {progress.get('last_update', 'N/A')}")
|
|
6763
6759
|
|
|
6764
6760
|
if progress.get('messages'):
|
|
6765
|
-
print(
|
|
6761
|
+
print("\nRecent Messages:")
|
|
6766
6762
|
for msg in progress['messages'][-5:]:
|
|
6767
6763
|
print(f" {msg.get('time', '')} - {msg.get('text', '')}")
|
|
6768
6764
|
return
|
|
6769
|
-
except Exception
|
|
6765
|
+
except Exception:
|
|
6770
6766
|
# If can't download, just show job status
|
|
6771
6767
|
pass
|
|
6772
6768
|
|
|
@@ -6778,7 +6774,7 @@ def cmd_azure_ml_progress(args):
|
|
|
6778
6774
|
|
|
6779
6775
|
try:
|
|
6780
6776
|
if watch:
|
|
6781
|
-
log("PROGRESS",
|
|
6777
|
+
log("PROGRESS", "Watching progress (Ctrl+C to stop)")
|
|
6782
6778
|
while True:
|
|
6783
6779
|
show_progress()
|
|
6784
6780
|
|
|
@@ -6788,7 +6784,7 @@ def cmd_azure_ml_progress(args):
|
|
|
6788
6784
|
if job.status in ["Completed", "Failed", "Canceled"]:
|
|
6789
6785
|
print(f"\nJob {job.status}")
|
|
6790
6786
|
break
|
|
6791
|
-
except:
|
|
6787
|
+
except Exception:
|
|
6792
6788
|
pass
|
|
6793
6789
|
|
|
6794
6790
|
time.sleep(poll_interval)
|
|
@@ -6862,7 +6858,6 @@ def cmd_azure_ml_delete_compute(args):
|
|
|
6862
6858
|
uv run python -m openadapt_ml.benchmarks.cli azure-ml-delete-compute --all # Deletes all instances
|
|
6863
6859
|
"""
|
|
6864
6860
|
init_logging()
|
|
6865
|
-
from openadapt_ml.config import settings
|
|
6866
6861
|
|
|
6867
6862
|
compute_name = getattr(args, "name", None)
|
|
6868
6863
|
delete_all = getattr(args, "all", False)
|
|
@@ -7129,7 +7124,7 @@ def cmd_azure_ml_cost(args):
|
|
|
7129
7124
|
log("COST", f" {name}")
|
|
7130
7125
|
log("COST", f" Status: {state}")
|
|
7131
7126
|
log("COST", f" Size: {vm_size} (${hourly_rate:.2f}/hr)")
|
|
7132
|
-
log("COST",
|
|
7127
|
+
log("COST", " Created: (unknown)")
|
|
7133
7128
|
|
|
7134
7129
|
log("COST", "=" * 60)
|
|
7135
7130
|
log("COST", f"Total Running Cost: ${total_cost:.2f}")
|
|
@@ -7265,7 +7260,7 @@ def cmd_azure_ml_teardown(args):
|
|
|
7265
7260
|
text=True,
|
|
7266
7261
|
)
|
|
7267
7262
|
if result.returncode == 0:
|
|
7268
|
-
log("TEARDOWN",
|
|
7263
|
+
log("TEARDOWN", "Resource group deletion initiated (async)")
|
|
7269
7264
|
else:
|
|
7270
7265
|
log("TEARDOWN", f"Failed to delete resource group: {result.stderr}")
|
|
7271
7266
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: openadapt-ml
|
|
3
|
-
Version: 0.3.
|
|
3
|
+
Version: 0.3.1
|
|
4
4
|
Summary: Model-agnostic, domain-agnostic ML engine for GUI automation agents
|
|
5
5
|
Project-URL: Homepage, https://github.com/OpenAdaptAI/openadapt-ml
|
|
6
6
|
Project-URL: Repository, https://github.com/OpenAdaptAI/openadapt-ml
|
|
@@ -58,7 +58,7 @@ Description-Content-Type: text/markdown
|
|
|
58
58
|
|
|
59
59
|
# OpenAdapt-ML
|
|
60
60
|
|
|
61
|
-
[](https://github.com/OpenAdaptAI/openadapt-ml/actions/workflows/release.yml)
|
|
62
62
|
[](https://pypi.org/project/openadapt-ml/)
|
|
63
63
|
[](https://pypi.org/project/openadapt-ml/)
|
|
64
64
|
[](https://opensource.org/licenses/MIT)
|
|
@@ -88,6 +88,38 @@ The design is described in detail in [`docs/design.md`](docs/design.md).
|
|
|
88
88
|
|
|
89
89
|
---
|
|
90
90
|
|
|
91
|
+
## Parallel WAA Benchmark Evaluation (New in v0.3.0)
|
|
92
|
+
|
|
93
|
+
Run Windows Agent Arena benchmarks across multiple Azure VMs in parallel for faster evaluation:
|
|
94
|
+
|
|
95
|
+
```bash
|
|
96
|
+
# Create a pool of 5 workers
|
|
97
|
+
uv run python -m openadapt_ml.benchmarks.cli pool-create --workers 5
|
|
98
|
+
|
|
99
|
+
# Wait for all workers to be ready
|
|
100
|
+
uv run python -m openadapt_ml.benchmarks.cli pool-wait
|
|
101
|
+
|
|
102
|
+
# Run 154 tasks distributed across workers (~5x faster)
|
|
103
|
+
uv run python -m openadapt_ml.benchmarks.cli pool-run --tasks 154
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
**Key features:**
|
|
107
|
+
- **Parallel execution**: Distribute 154 WAA tasks across N workers
|
|
108
|
+
- **Automatic task distribution**: Uses WAA's native `--worker_id`/`--num_workers` for round-robin assignment
|
|
109
|
+
- **VNC access**: View each Windows VM via SSH tunnels (`localhost:8006`, `localhost:8007`, etc.)
|
|
110
|
+
- **Cost tracking**: Monitor Azure VM costs in real-time
|
|
111
|
+
|
|
112
|
+
**Performance:**
|
|
113
|
+
| Workers | Estimated Time (154 tasks) |
|
|
114
|
+
|---------|---------------------------|
|
|
115
|
+
| 1 | ~50-80 hours |
|
|
116
|
+
| 5 | ~10-16 hours |
|
|
117
|
+
| 10 | ~5-8 hours |
|
|
118
|
+
|
|
119
|
+
See [WAA Benchmark Workflow](#waa-benchmark-workflow) for complete setup instructions.
|
|
120
|
+
|
|
121
|
+
---
|
|
122
|
+
|
|
91
123
|
## 1. Installation
|
|
92
124
|
|
|
93
125
|
### 1.1 From PyPI (recommended)
|
|
@@ -1029,7 +1061,112 @@ uv run python -m openadapt_ml.benchmarks.cli screenshot --target terminal --no-t
|
|
|
1029
1061
|
|
|
1030
1062
|
---
|
|
1031
1063
|
|
|
1032
|
-
|
|
1064
|
+
<a id="waa-benchmark-workflow"></a>
|
|
1065
|
+
|
|
1066
|
+
## 14. WAA Benchmark Workflow
|
|
1067
|
+
|
|
1068
|
+
Windows Agent Arena (WAA) is a benchmark of 154 tasks across 11 Windows domains. OpenAdapt-ML provides infrastructure to run WAA evaluations on Azure VMs with parallel execution.
|
|
1069
|
+
|
|
1070
|
+
### 14.1 Prerequisites
|
|
1071
|
+
|
|
1072
|
+
1. **Azure CLI**: `brew install azure-cli && az login`
|
|
1073
|
+
2. **OpenAI API Key**: Set in `.env` file (`OPENAI_API_KEY=sk-...`)
|
|
1074
|
+
3. **Azure quota**: Ddsv5 family VMs (8+ vCPUs per worker)
|
|
1075
|
+
|
|
1076
|
+
### 14.2 Single VM Workflow
|
|
1077
|
+
|
|
1078
|
+
For quick testing or small runs:
|
|
1079
|
+
|
|
1080
|
+
```bash
|
|
1081
|
+
# Setup VM with WAA
|
|
1082
|
+
uv run python -m openadapt_ml.benchmarks.cli vm setup-waa
|
|
1083
|
+
|
|
1084
|
+
# Start monitoring dashboard (auto-opens VNC, manages SSH tunnels)
|
|
1085
|
+
uv run python -m openadapt_ml.benchmarks.cli vm monitor
|
|
1086
|
+
|
|
1087
|
+
# Run benchmark
|
|
1088
|
+
uv run python -m openadapt_ml.benchmarks.cli waa --num-tasks 10
|
|
1089
|
+
|
|
1090
|
+
# Deallocate when done (stops compute billing; disk storage is still charged)
|
|
1091
|
+
uv run python -m openadapt_ml.benchmarks.cli vm deallocate -y
|
|
1092
|
+
```
|
|
1093
|
+
|
|
1094
|
+
### 14.3 Parallel Pool Workflow (Recommended)
|
|
1095
|
+
|
|
1096
|
+
For full 154-task evaluations, use multiple VMs:
|
|
1097
|
+
|
|
1098
|
+
```bash
|
|
1099
|
+
# 1. Create pool (provisions N Azure VMs with Docker + WAA)
|
|
1100
|
+
uv run python -m openadapt_ml.benchmarks.cli pool-create --workers 5
|
|
1101
|
+
|
|
1102
|
+
# 2. Wait for all workers to be ready (Windows boot + WAA server startup)
|
|
1103
|
+
uv run python -m openadapt_ml.benchmarks.cli pool-wait
|
|
1104
|
+
|
|
1105
|
+
# 3. Run benchmark across all workers
|
|
1106
|
+
# Tasks are distributed using WAA's native --worker_id/--num_workers
|
|
1107
|
+
uv run python -m openadapt_ml.benchmarks.cli pool-run --tasks 154
|
|
1108
|
+
|
|
1109
|
+
# 4. Monitor progress
|
|
1110
|
+
uv run python -m openadapt_ml.benchmarks.cli pool-status
|
|
1111
|
+
uv run python -m openadapt_ml.benchmarks.cli pool-logs
|
|
1112
|
+
|
|
1113
|
+
# 5. Cleanup (delete all VMs - IMPORTANT to stop billing!)
|
|
1114
|
+
uv run python -m openadapt_ml.benchmarks.cli pool-cleanup
|
|
1115
|
+
```
|
|
1116
|
+
|
|
1117
|
+
### 14.4 VNC Access to Workers
|
|
1118
|
+
|
|
1119
|
+
View what each Windows VM is doing:
|
|
1120
|
+
|
|
1121
|
+
```bash
|
|
1122
|
+
# Get worker IPs
|
|
1123
|
+
uv run python -m openadapt_ml.benchmarks.cli pool-status
|
|
1124
|
+
|
|
1125
|
+
# Set up SSH tunnels (tunnels are created automatically, but you can also do this manually)
|
|
1126
|
+
ssh -f -N -L 8006:localhost:8006 azureuser@<worker-0-ip> # localhost:8006
|
|
1127
|
+
ssh -f -N -L 8007:localhost:8006 azureuser@<worker-1-ip> # localhost:8007
|
|
1128
|
+
# etc.
|
|
1129
|
+
|
|
1130
|
+
# Open in browser
|
|
1131
|
+
open http://localhost:8006 # Worker 0
|
|
1132
|
+
open http://localhost:8007 # Worker 1
|
|
1133
|
+
```
|
|
1134
|
+
|
|
1135
|
+
### 14.5 Architecture
|
|
1136
|
+
|
|
1137
|
+
```
|
|
1138
|
+
Local Machine
|
|
1139
|
+
├── openadapt-ml CLI (pool-create, pool-wait, pool-run)
|
|
1140
|
+
│ └── SSH tunnels to each worker
|
|
1141
|
+
│
|
|
1142
|
+
Azure (N VMs, Standard_D8ds_v5)
|
|
1143
|
+
├── waa-pool-00
|
|
1144
|
+
│ └── Docker
|
|
1145
|
+
│ └── windowsarena/winarena:latest
|
|
1146
|
+
│ └── QEMU (Windows 11)
|
|
1147
|
+
│ ├── WAA Flask server (port 5000)
|
|
1148
|
+
│ └── Navi agent (GPT-4o-mini)
|
|
1149
|
+
├── waa-pool-01
|
|
1150
|
+
│ └── ...
|
|
1151
|
+
└── waa-pool-N
|
|
1152
|
+
└── ...
|
|
1153
|
+
```
|
|
1154
|
+
|
|
1155
|
+
### 14.6 Cost Estimates
|
|
1156
|
+
|
|
1157
|
+
| VM Size | vCPUs | RAM | Cost/hr | 5 VMs for 10hrs |
|
|
1158
|
+
|---------|-------|-----|---------|-----------------|
|
|
1159
|
+
| Standard_D8ds_v5 | 8 | 32GB | ~$0.38 | ~$19 |
|
|
1160
|
+
|
|
1161
|
+
**Tips:**
|
|
1162
|
+
- Always run `pool-cleanup` when done to delete VMs and stop billing
|
|
1163
|
+
- Use `vm deallocate` (not delete) to pause compute billing while keeping the disk (disk storage is still billed)
|
|
1164
|
+
- Set `--auto-shutdown-hours 2` on `vm monitor` for safety
|
|
1165
|
+
- Prices vary by Azure region
|
|
1166
|
+
|
|
1167
|
+
---
|
|
1168
|
+
|
|
1169
|
+
## 15. Limitations & Notes
|
|
1033
1170
|
|
|
1034
1171
|
- **Apple Silicon / bitsandbytes**:
|
|
1035
1172
|
- Example configs are sized for CPU / Apple Silicon development runs; see
|
|
@@ -1053,7 +1190,7 @@ For deeper architectural details, see [`docs/design.md`](docs/design.md).
|
|
|
1053
1190
|
|
|
1054
1191
|
---
|
|
1055
1192
|
|
|
1056
|
-
##
|
|
1193
|
+
## 16. Roadmap
|
|
1057
1194
|
|
|
1058
1195
|
For the up-to-date, prioritized roadmap (including concrete implementation
|
|
1059
1196
|
targets and agent-executable acceptance criteria), see
|
|
@@ -10,7 +10,7 @@ openadapt_ml/benchmarks/__init__.py,sha256=FaEGc7pRM-eLUXEEpJXcIckwkIWKhfaDkaxGM
|
|
|
10
10
|
openadapt_ml/benchmarks/agent.py,sha256=8UcS9skCy6l18fGYaYt0JzJmYSGNB_WxDWhApbM7QH0,26940
|
|
11
11
|
openadapt_ml/benchmarks/azure.py,sha256=dCrxi90X5NmFNMTT-2WG4AF3-IOO4zQs7yPpnqR-jLc,28238
|
|
12
12
|
openadapt_ml/benchmarks/azure_ops_tracker.py,sha256=NOW21LPagOWIThSCIotI5cBvve92dtIktRIDLuyJ2CI,19309
|
|
13
|
-
openadapt_ml/benchmarks/cli.py,sha256=
|
|
13
|
+
openadapt_ml/benchmarks/cli.py,sha256=DwBZJEZF2XwajAazWWXxePbuH7J_W8G9N0y7iv3l7FI,288566
|
|
14
14
|
openadapt_ml/benchmarks/trace_export.py,sha256=Zx-pryEuLe734YHY8MgJsNdj3I3TcTY61OQ9iurgGB0,21746
|
|
15
15
|
openadapt_ml/benchmarks/viewer.py,sha256=Jztt_IoDW1u0WjPqlikfR8dunYzj66xCx0bMDDzJHQ8,41586
|
|
16
16
|
openadapt_ml/benchmarks/vm_monitor.py,sha256=EYgPRok2MPqs8Yajg7EJaqyb4EtRpqt8URQMLhE9Ego,35991
|
|
@@ -110,7 +110,7 @@ openadapt_ml/training/trainer.py,sha256=yGK79alY9Z0xGRQ2r9EaiWbzGlmE5WZJQL_2TWgc
|
|
|
110
110
|
openadapt_ml/training/trl_trainer.py,sha256=AL1KFWXMub4vWE2w8eoAoQbSgm2fXO82CIqXULLYwVo,13223
|
|
111
111
|
openadapt_ml/training/viewer.py,sha256=rXpREFbDK_tsu719VUej6iXrgnB8eNP0SEuvB9NUUhA,128104
|
|
112
112
|
openadapt_ml/training/viewer_components.py,sha256=XilaX7r4YXFMT1QkooNnPWqR14SpsiTf7YbrN_g-Lq0,5478
|
|
113
|
-
openadapt_ml-0.3.
|
|
114
|
-
openadapt_ml-0.3.
|
|
115
|
-
openadapt_ml-0.3.
|
|
116
|
-
openadapt_ml-0.3.
|
|
113
|
+
openadapt_ml-0.3.1.dist-info/METADATA,sha256=h5Xf2LEjMlBOsuwDCRiF5_cGlEwlgbRp8Vkqw1HOo4Q,40990
|
|
114
|
+
openadapt_ml-0.3.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
115
|
+
openadapt_ml-0.3.1.dist-info/licenses/LICENSE,sha256=2E5UY67RVLedJuNnwGudkAMtfM3LZNUcHgmaL89TAfw,1068
|
|
116
|
+
openadapt_ml-0.3.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|