openadapt-ml 0.2.2__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -154,11 +154,14 @@ class VMMonitor:
154
154
  def check_waa_probe(self) -> tuple[bool, str | None]:
155
155
  """Check if WAA /probe endpoint responds.
156
156
 
157
+ The probe must run INSIDE the container to reach 172.30.0.2 (Docker internal network).
158
+
157
159
  Returns:
158
160
  Tuple of (ready, response_text).
159
161
  """
160
162
  try:
161
- cmd = f"curl -s --connect-timeout {self.timeout} http://{self.config.internal_ip}:{self.config.waa_port}/probe"
163
+ # Run curl inside container to access Docker internal network
164
+ cmd = f"docker exec {self.config.docker_container} curl -s --max-time {self.timeout} http://{self.config.internal_ip}:{self.config.waa_port}/probe 2>/dev/null || echo FAIL"
162
165
  result = subprocess.run(
163
166
  [
164
167
  "ssh",
@@ -173,10 +176,10 @@ class VMMonitor:
173
176
  ],
174
177
  capture_output=True,
175
178
  text=True,
176
- timeout=self.timeout + 10,
179
+ timeout=self.timeout + 15,
177
180
  )
178
181
  response = result.stdout.strip()
179
- if response and "error" not in response.lower():
182
+ if response and "FAIL" not in response and "error" not in response.lower():
180
183
  return True, response
181
184
  return False, response or None
182
185
  except (subprocess.TimeoutExpired, Exception) as e:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: openadapt-ml
3
- Version: 0.2.2
3
+ Version: 0.3.1
4
4
  Summary: Model-agnostic, domain-agnostic ML engine for GUI automation agents
5
5
  Project-URL: Homepage, https://github.com/OpenAdaptAI/openadapt-ml
6
6
  Project-URL: Repository, https://github.com/OpenAdaptAI/openadapt-ml
@@ -58,7 +58,7 @@ Description-Content-Type: text/markdown
58
58
 
59
59
  # OpenAdapt-ML
60
60
 
61
- [![Build Status](https://github.com/OpenAdaptAI/openadapt-ml/actions/workflows/publish.yml/badge.svg)](https://github.com/OpenAdaptAI/openadapt-ml/actions/workflows/publish.yml)
61
+ [![Build Status](https://github.com/OpenAdaptAI/openadapt-ml/actions/workflows/release.yml/badge.svg)](https://github.com/OpenAdaptAI/openadapt-ml/actions/workflows/release.yml)
62
62
  [![PyPI version](https://img.shields.io/pypi/v/openadapt-ml.svg)](https://pypi.org/project/openadapt-ml/)
63
63
  [![Downloads](https://img.shields.io/pypi/dm/openadapt-ml.svg)](https://pypi.org/project/openadapt-ml/)
64
64
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
@@ -88,6 +88,38 @@ The design is described in detail in [`docs/design.md`](docs/design.md).
88
88
 
89
89
  ---
90
90
 
91
+ ## Parallel WAA Benchmark Evaluation (New in v0.3.0)
92
+
93
+ Run Windows Agent Arena benchmarks across multiple Azure VMs in parallel for faster evaluation:
94
+
95
+ ```bash
96
+ # Create a pool of 5 workers
97
+ uv run python -m openadapt_ml.benchmarks.cli pool-create --workers 5
98
+
99
+ # Wait for all workers to be ready
100
+ uv run python -m openadapt_ml.benchmarks.cli pool-wait
101
+
102
+ # Run 154 tasks distributed across the workers (~5x faster than a single VM with 5 workers)
103
+ uv run python -m openadapt_ml.benchmarks.cli pool-run --tasks 154
104
+ ```
105
+
106
+ **Key features:**
107
+ - **Parallel execution**: Distribute 154 WAA tasks across N workers
108
+ - **Automatic task distribution**: Uses WAA's native `--worker_id`/`--num_workers` for round-robin assignment
109
+ - **VNC access**: View each Windows VM via SSH tunnels (`localhost:8006`, `localhost:8007`, etc.)
110
+ - **Cost tracking**: Monitor Azure VM costs in real-time
111
+
112
+ **Performance:**
113
+ | Workers | Estimated Time (154 tasks) |
114
+ |---------|---------------------------|
115
+ | 1 | ~50-80 hours |
116
+ | 5 | ~10-16 hours |
117
+ | 10 | ~5-8 hours |
118
+
119
+ See [WAA Benchmark Workflow](#waa-benchmark-workflow) for complete setup instructions.
120
+
121
+ ---
122
+
91
123
  ## 1. Installation
92
124
 
93
125
  ### 1.1 From PyPI (recommended)
@@ -1029,7 +1061,112 @@ uv run python -m openadapt_ml.benchmarks.cli screenshot --target terminal --no-t
1029
1061
 
1030
1062
  ---
1031
1063
 
1032
- ## 14. Limitations & Notes
1064
+ <a id="waa-benchmark-workflow"></a>
1065
+
1066
+ ## 14. WAA Benchmark Workflow
1067
+
1068
+ Windows Agent Arena (WAA) is a benchmark of 154 tasks across 11 Windows domains. OpenAdapt-ML provides infrastructure to run WAA evaluations on Azure VMs with parallel execution.
1069
+
1070
+ ### 14.1 Prerequisites
1071
+
1072
+ 1. **Azure CLI**: Install it (e.g. `brew install azure-cli` on macOS), then sign in with `az login`
1073
+ 2. **OpenAI API Key**: Set in `.env` file (`OPENAI_API_KEY=sk-...`)
1074
+ 3. **Azure quota**: Enough Ddsv5-family vCPU quota for the pool (at least 8 vCPUs per worker)
1075
+
1076
+ ### 14.2 Single VM Workflow
1077
+
1078
+ For quick testing or small runs:
1079
+
1080
+ ```bash
1081
+ # Setup VM with WAA
1082
+ uv run python -m openadapt_ml.benchmarks.cli vm setup-waa
1083
+
1084
+ # Start monitoring dashboard (auto-opens VNC, manages SSH tunnels)
1085
+ uv run python -m openadapt_ml.benchmarks.cli vm monitor
1086
+
1087
+ # Run benchmark
1088
+ uv run python -m openadapt_ml.benchmarks.cli waa --num-tasks 10
1089
+
1090
+ # Deallocate when done (stops billing)
1091
+ uv run python -m openadapt_ml.benchmarks.cli vm deallocate -y
1092
+ ```
1093
+
1094
+ ### 14.3 Parallel Pool Workflow (Recommended)
1095
+
1096
+ For full 154-task evaluations, use multiple VMs:
1097
+
1098
+ ```bash
1099
+ # 1. Create pool (provisions N Azure VMs with Docker + WAA)
1100
+ uv run python -m openadapt_ml.benchmarks.cli pool-create --workers 5
1101
+
1102
+ # 2. Wait for all workers to be ready (Windows boot + WAA server startup)
1103
+ uv run python -m openadapt_ml.benchmarks.cli pool-wait
1104
+
1105
+ # 3. Run benchmark across all workers
1106
+ # Tasks are distributed using WAA's native --worker_id/--num_workers
1107
+ uv run python -m openadapt_ml.benchmarks.cli pool-run --tasks 154
1108
+
1109
+ # 4. Monitor progress
1110
+ uv run python -m openadapt_ml.benchmarks.cli pool-status
1111
+ uv run python -m openadapt_ml.benchmarks.cli pool-logs
1112
+
1113
+ # 5. Cleanup (delete all VMs - IMPORTANT to stop billing!)
1114
+ uv run python -m openadapt_ml.benchmarks.cli pool-cleanup
1115
+ ```
1116
+
1117
+ ### 14.4 VNC Access to Workers
1118
+
1119
+ View what each Windows VM is doing:
1120
+
1121
+ ```bash
1122
+ # Get worker IPs
1123
+ uv run python -m openadapt_ml.benchmarks.cli pool-status
1124
+
1125
+ # Optional: set up SSH tunnels manually (they are normally created automatically)
1126
+ ssh -f -N -L 8006:localhost:8006 azureuser@<worker-0-ip> # localhost:8006
1127
+ ssh -f -N -L 8007:localhost:8006 azureuser@<worker-1-ip> # localhost:8007
1128
+ # etc.
1129
+
1130
+ # Open in browser
1131
+ open http://localhost:8006 # Worker 0
1132
+ open http://localhost:8007 # Worker 1
1133
+ ```
1134
+
1135
+ ### 14.5 Architecture
1136
+
1137
+ ```
1138
+ Local Machine
1139
+ ├── openadapt-ml CLI (pool-create, pool-wait, pool-run)
1140
+ │ └── SSH tunnels to each worker
1141
+
1142
+ Azure (N VMs, Standard_D8ds_v5)
1143
+ ├── waa-pool-00
1144
+ │ └── Docker
1145
+ │ └── windowsarena/winarena:latest
1146
+ │ └── QEMU (Windows 11)
1147
+ │ ├── WAA Flask server (port 5000)
1148
+ │ └── Navi agent (GPT-4o-mini)
1149
+ ├── waa-pool-01
1150
+ │ └── ...
1151
+ └── waa-pool-N
1152
+ └── ...
1153
+ ```
1154
+
1155
+ ### 14.6 Cost Estimates
1156
+
1157
+ | VM Size | vCPUs | RAM | Cost/hr | 5 VMs for 10hrs |
1158
+ |---------|-------|-----|---------|-----------------|
1159
+ | Standard_D8ds_v5 | 8 | 32GB | ~$0.38 | ~$19 |
1160
+
1161
+ **Tips:**
1162
+ - Always run `pool-cleanup` when done to delete VMs and stop billing
1163
+ - Use `vm deallocate` (not delete) to stop compute billing while keeping the disk (disk storage is still billed)
1164
+ - Set `--auto-shutdown-hours 2` on `vm monitor` for safety
1165
+ - Prices vary by Azure region
1166
+
1167
+ ---
1168
+
1169
+ ## 15. Limitations & Notes
1033
1170
 
1034
1171
  - **Apple Silicon / bitsandbytes**:
1035
1172
  - Example configs are sized for CPU / Apple Silicon development runs; see
@@ -1053,7 +1190,7 @@ For deeper architectural details, see [`docs/design.md`](docs/design.md).
1053
1190
 
1054
1191
  ---
1055
1192
 
1056
- ## 15. Roadmap
1193
+ ## 16. Roadmap
1057
1194
 
1058
1195
  For the up-to-date, prioritized roadmap (including concrete implementation
1059
1196
  targets and agent-executable acceptance criteria), see
@@ -10,10 +10,10 @@ openadapt_ml/benchmarks/__init__.py,sha256=FaEGc7pRM-eLUXEEpJXcIckwkIWKhfaDkaxGM
10
10
  openadapt_ml/benchmarks/agent.py,sha256=8UcS9skCy6l18fGYaYt0JzJmYSGNB_WxDWhApbM7QH0,26940
11
11
  openadapt_ml/benchmarks/azure.py,sha256=dCrxi90X5NmFNMTT-2WG4AF3-IOO4zQs7yPpnqR-jLc,28238
12
12
  openadapt_ml/benchmarks/azure_ops_tracker.py,sha256=NOW21LPagOWIThSCIotI5cBvve92dtIktRIDLuyJ2CI,19309
13
- openadapt_ml/benchmarks/cli.py,sha256=t4cIGN68GdphCX0AGkWJa_M6D4oUO_M0rfJDzD_POGA,62730
13
+ openadapt_ml/benchmarks/cli.py,sha256=DwBZJEZF2XwajAazWWXxePbuH7J_W8G9N0y7iv3l7FI,288566
14
14
  openadapt_ml/benchmarks/trace_export.py,sha256=Zx-pryEuLe734YHY8MgJsNdj3I3TcTY61OQ9iurgGB0,21746
15
15
  openadapt_ml/benchmarks/viewer.py,sha256=Jztt_IoDW1u0WjPqlikfR8dunYzj66xCx0bMDDzJHQ8,41586
16
- openadapt_ml/benchmarks/vm_monitor.py,sha256=FzmRrzqm0sZTcydfqMtRefBLfTr4fjoyWCxdHLovUj0,35733
16
+ openadapt_ml/benchmarks/vm_monitor.py,sha256=EYgPRok2MPqs8Yajg7EJaqyb4EtRpqt8URQMLhE9Ego,35991
17
17
  openadapt_ml/benchmarks/waa_deploy/Dockerfile,sha256=F4GzVUoAUHvGlTFj-gGIPSlncG-JIz1_JyeaHvTnLpA,10853
18
18
  openadapt_ml/benchmarks/waa_deploy/__init__.py,sha256=KV71HrrgETytfY0i4vFSi-yM0KjoQP2hd9Bl03cZ9yc,320
19
19
  openadapt_ml/benchmarks/waa_deploy/api_agent.py,sha256=A5ZFhtBTKz0Q1GarNV51JhkEJwAgJfm9tK4CTJ1UEnE,20040
@@ -110,7 +110,7 @@ openadapt_ml/training/trainer.py,sha256=yGK79alY9Z0xGRQ2r9EaiWbzGlmE5WZJQL_2TWgc
110
110
  openadapt_ml/training/trl_trainer.py,sha256=AL1KFWXMub4vWE2w8eoAoQbSgm2fXO82CIqXULLYwVo,13223
111
111
  openadapt_ml/training/viewer.py,sha256=rXpREFbDK_tsu719VUej6iXrgnB8eNP0SEuvB9NUUhA,128104
112
112
  openadapt_ml/training/viewer_components.py,sha256=XilaX7r4YXFMT1QkooNnPWqR14SpsiTf7YbrN_g-Lq0,5478
113
- openadapt_ml-0.2.2.dist-info/METADATA,sha256=XNDolFy-sWkfPPCZ36qFRUwKzMRLk9WhzLMVufPp8i8,36696
114
- openadapt_ml-0.2.2.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
115
- openadapt_ml-0.2.2.dist-info/licenses/LICENSE,sha256=2E5UY67RVLedJuNnwGudkAMtfM3LZNUcHgmaL89TAfw,1068
116
- openadapt_ml-0.2.2.dist-info/RECORD,,
113
+ openadapt_ml-0.3.1.dist-info/METADATA,sha256=h5Xf2LEjMlBOsuwDCRiF5_cGlEwlgbRp8Vkqw1HOo4Q,40990
114
+ openadapt_ml-0.3.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
115
+ openadapt_ml-0.3.1.dist-info/licenses/LICENSE,sha256=2E5UY67RVLedJuNnwGudkAMtfM3LZNUcHgmaL89TAfw,1068
116
+ openadapt_ml-0.3.1.dist-info/RECORD,,