openadapt-ml 0.2.0-py3-none-any.whl → 0.2.1-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Files changed (95)
  1. openadapt_ml/baselines/__init__.py +121 -0
  2. openadapt_ml/baselines/adapter.py +185 -0
  3. openadapt_ml/baselines/cli.py +314 -0
  4. openadapt_ml/baselines/config.py +448 -0
  5. openadapt_ml/baselines/parser.py +922 -0
  6. openadapt_ml/baselines/prompts.py +787 -0
  7. openadapt_ml/benchmarks/__init__.py +13 -115
  8. openadapt_ml/benchmarks/agent.py +265 -421
  9. openadapt_ml/benchmarks/azure.py +28 -19
  10. openadapt_ml/benchmarks/azure_ops_tracker.py +521 -0
  11. openadapt_ml/benchmarks/cli.py +1722 -4847
  12. openadapt_ml/benchmarks/trace_export.py +631 -0
  13. openadapt_ml/benchmarks/viewer.py +22 -5
  14. openadapt_ml/benchmarks/vm_monitor.py +530 -29
  15. openadapt_ml/benchmarks/waa_deploy/Dockerfile +47 -53
  16. openadapt_ml/benchmarks/waa_deploy/api_agent.py +21 -20
  17. openadapt_ml/cloud/azure_inference.py +3 -5
  18. openadapt_ml/cloud/lambda_labs.py +722 -307
  19. openadapt_ml/cloud/local.py +2038 -487
  20. openadapt_ml/cloud/ssh_tunnel.py +68 -26
  21. openadapt_ml/datasets/next_action.py +40 -30
  22. openadapt_ml/evals/grounding.py +8 -3
  23. openadapt_ml/evals/plot_eval_metrics.py +15 -13
  24. openadapt_ml/evals/trajectory_matching.py +41 -26
  25. openadapt_ml/experiments/demo_prompt/format_demo.py +16 -6
  26. openadapt_ml/experiments/demo_prompt/run_experiment.py +26 -16
  27. openadapt_ml/experiments/representation_shootout/__init__.py +70 -0
  28. openadapt_ml/experiments/representation_shootout/conditions.py +708 -0
  29. openadapt_ml/experiments/representation_shootout/config.py +390 -0
  30. openadapt_ml/experiments/representation_shootout/evaluator.py +659 -0
  31. openadapt_ml/experiments/representation_shootout/runner.py +687 -0
  32. openadapt_ml/experiments/waa_demo/runner.py +29 -14
  33. openadapt_ml/export/parquet.py +36 -24
  34. openadapt_ml/grounding/detector.py +18 -14
  35. openadapt_ml/ingest/__init__.py +8 -6
  36. openadapt_ml/ingest/capture.py +25 -22
  37. openadapt_ml/ingest/loader.py +7 -4
  38. openadapt_ml/ingest/synthetic.py +189 -100
  39. openadapt_ml/models/api_adapter.py +14 -4
  40. openadapt_ml/models/base_adapter.py +10 -2
  41. openadapt_ml/models/providers/__init__.py +288 -0
  42. openadapt_ml/models/providers/anthropic.py +266 -0
  43. openadapt_ml/models/providers/base.py +299 -0
  44. openadapt_ml/models/providers/google.py +376 -0
  45. openadapt_ml/models/providers/openai.py +342 -0
  46. openadapt_ml/models/qwen_vl.py +46 -19
  47. openadapt_ml/perception/__init__.py +35 -0
  48. openadapt_ml/perception/integration.py +399 -0
  49. openadapt_ml/retrieval/demo_retriever.py +50 -24
  50. openadapt_ml/retrieval/embeddings.py +9 -8
  51. openadapt_ml/retrieval/retriever.py +3 -1
  52. openadapt_ml/runtime/__init__.py +50 -0
  53. openadapt_ml/runtime/policy.py +18 -5
  54. openadapt_ml/runtime/safety_gate.py +471 -0
  55. openadapt_ml/schema/__init__.py +9 -0
  56. openadapt_ml/schema/converters.py +74 -27
  57. openadapt_ml/schema/episode.py +31 -18
  58. openadapt_ml/scripts/capture_screenshots.py +530 -0
  59. openadapt_ml/scripts/compare.py +85 -54
  60. openadapt_ml/scripts/demo_policy.py +4 -1
  61. openadapt_ml/scripts/eval_policy.py +15 -9
  62. openadapt_ml/scripts/make_gif.py +1 -1
  63. openadapt_ml/scripts/prepare_synthetic.py +3 -1
  64. openadapt_ml/scripts/train.py +21 -9
  65. openadapt_ml/segmentation/README.md +920 -0
  66. openadapt_ml/segmentation/__init__.py +97 -0
  67. openadapt_ml/segmentation/adapters/__init__.py +5 -0
  68. openadapt_ml/segmentation/adapters/capture_adapter.py +420 -0
  69. openadapt_ml/segmentation/annotator.py +610 -0
  70. openadapt_ml/segmentation/cache.py +290 -0
  71. openadapt_ml/segmentation/cli.py +674 -0
  72. openadapt_ml/segmentation/deduplicator.py +656 -0
  73. openadapt_ml/segmentation/frame_describer.py +788 -0
  74. openadapt_ml/segmentation/pipeline.py +340 -0
  75. openadapt_ml/segmentation/schemas.py +622 -0
  76. openadapt_ml/segmentation/segment_extractor.py +634 -0
  77. openadapt_ml/training/azure_ops_viewer.py +1097 -0
  78. openadapt_ml/training/benchmark_viewer.py +52 -41
  79. openadapt_ml/training/shared_ui.py +7 -7
  80. openadapt_ml/training/stub_provider.py +57 -35
  81. openadapt_ml/training/trainer.py +143 -86
  82. openadapt_ml/training/trl_trainer.py +70 -21
  83. openadapt_ml/training/viewer.py +323 -108
  84. openadapt_ml/training/viewer_components.py +180 -0
  85. {openadapt_ml-0.2.0.dist-info → openadapt_ml-0.2.1.dist-info}/METADATA +215 -14
  86. openadapt_ml-0.2.1.dist-info/RECORD +116 -0
  87. openadapt_ml/benchmarks/base.py +0 -366
  88. openadapt_ml/benchmarks/data_collection.py +0 -432
  89. openadapt_ml/benchmarks/live_tracker.py +0 -180
  90. openadapt_ml/benchmarks/runner.py +0 -418
  91. openadapt_ml/benchmarks/waa.py +0 -761
  92. openadapt_ml/benchmarks/waa_live.py +0 -619
  93. openadapt_ml-0.2.0.dist-info/RECORD +0 -86
  94. {openadapt_ml-0.2.0.dist-info → openadapt_ml-0.2.1.dist-info}/WHEEL +0 -0
  95. {openadapt_ml-0.2.0.dist-info → openadapt_ml-0.2.1.dist-info}/licenses/LICENSE +0 -0
openadapt_ml/benchmarks/waa_deploy/Dockerfile

@@ -53,8 +53,14 @@ COPY --from=windowsarena/winarena:latest /oem /oem
 RUN sed -i '/^return 0$/i cp -r /oem/* /tmp/smb/ 2>/dev/null || true' /run/samba.sh && \
     echo "Inserted OEM copy before return in samba.sh"
 
-# Copy unattend.xml for automated Windows installation
-COPY --from=windowsarena/winarena:latest /run/assets/win11x64-enterprise-eval.xml /run/assets/win11x64.xml
+# DO NOT replace dockurr/windows's autounattend.xml - it handles OOBE properly
+# Instead, only PATCH it to add InstallFrom element (prevents "Select OS" dialog)
+# This preserves dockurr/windows's native OEM mechanism
+RUN for xml in /run/assets/win11x64.xml /run/assets/win11x64-enterprise-eval.xml; do \
+        if [ -f "$xml" ] && ! grep -q "InstallFrom" "$xml"; then \
+            sed -i 's|<InstallTo>|<InstallFrom>\n <MetaData wcm:action="add">\n <Key>/IMAGE/INDEX</Key>\n <Value>1</Value>\n </MetaData>\n </InstallFrom>\n <InstallTo>|' "$xml"; \
+        fi; \
+    done && echo "Added InstallFrom element for automatic image selection"
 
 # -----------------------------------------------------------------------------
 # Create start_vm.sh that uses our dockurr/windows entrypoint
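The sed call in the new RUN step only splices an <InstallFrom> element (selecting image index 1) in front of the existing <InstallTo> element of each unattend file. A rough Python equivalent of that substitution, for illustration only; it is not part of the image build and the exact whitespace of the injected fragment is cosmetic:

# Rough Python equivalent of the sed patch above (illustrative only).
# It injects an <InstallFrom> block selecting image index 1 ahead of <InstallTo>,
# which is what suppresses the interactive "Select OS" dialog during Windows setup.
INSTALL_FROM = (
    "<InstallFrom>\n"
    '  <MetaData wcm:action="add">\n'
    "    <Key>/IMAGE/INDEX</Key>\n"
    "    <Value>1</Value>\n"
    "  </MetaData>\n"
    "</InstallFrom>\n"
)


def patch_unattend(xml_text: str) -> str:
    # Mirrors the `grep -q InstallFrom` guard: already-patched files are left alone.
    if "InstallFrom" in xml_text:
        return xml_text
    return xml_text.replace("<InstallTo>", INSTALL_FROM + "<InstallTo>")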
@@ -77,23 +83,15 @@ RUN find /client -name "*.py" -exec sed -i 's|20.20.20.21|172.30.0.2|g' {} \; &&
     echo "Patched client Python files"
 
 # -----------------------------------------------------------------------------
-# Add API-backed agent support (Claude Sonnet 4.5 / GPT-5.1)
-# This allows using --agent api-claude or --agent api-openai instead of navi
+# Add API-backed agent support (Claude / OpenAI)
+# NOTE: API agents (api-claude, api-openai) are run EXTERNALLY via openadapt-evals CLI
+# which connects to the WAA server over SSH tunnel. No internal patching needed.
+# The api_agent.py is included for reference/future use.
 # -----------------------------------------------------------------------------
 
-# Copy api_agent.py to the client mm_agents directory
+# Copy api_agent.py for reference (used externally by openadapt-evals)
 COPY api_agent.py /client/mm_agents/api_agent.py
 
-# Patch run.py to support api-claude and api-openai agents
-# This adds elif blocks after the "navi" agent handling
-# Using Python to insert the patch with proper indentation
-RUN python3 -c "import re; \
-    f = open('/client/run.py', 'r'); c = f.read(); f.close(); \
-    patch = ''' elif cfg_args[\"agent_name\"] in [\"api-claude\", \"api-openai\"]:\n from mm_agents.api_agent import ApiAgent\n provider = \"anthropic\" if cfg_args[\"agent_name\"] == \"api-claude\" else \"openai\"\n agent = ApiAgent(provider=provider, temperature=args.temperature)\n'''; \
-    c = c.replace('raise ValueError(f\"Unknown agent name: {cfg_args', patch + ' raise ValueError(f\"Unknown agent name: {cfg_args'); \
-    f = open('/client/run.py', 'w'); f.write(c); f.close(); \
-    print('Patched run.py for API agents')"
-
 # -----------------------------------------------------------------------------
 # Fix Windows setup for automation
 # -----------------------------------------------------------------------------
@@ -157,15 +155,33 @@ RUN if grep -q "</FirstLogonCommands>" /run/assets/win11x64.xml; then \
     fi
 
 # -----------------------------------------------------------------------------
-# Install Python and dependencies directly
-# dockurr/windows base is Debian trixie which has Python 3.12
+# Copy Python 3.9 and all packages from vanilla image
 # -----------------------------------------------------------------------------
-
-# Install Python 3 and system dependencies
+# IMPORTANT: Do NOT install Python from apt or pip install packages ourselves.
+# The vanilla image has Python 3.9.20 with transformers 4.46.2 which is compatible
+# with GroundingDINO. Installing our own Python (3.13) with latest transformers (5.0)
+# breaks the navi agent with: AttributeError: 'BertModel' has no attribute 'get_head_mask'
+
+# Copy Python 3.9 installation from vanilla (binaries, libraries, packages)
+COPY --from=windowsarena/winarena:latest /usr/local/bin/python* /usr/local/bin/
+COPY --from=windowsarena/winarena:latest /usr/local/bin/pip* /usr/local/bin/
+COPY --from=windowsarena/winarena:latest /usr/local/lib/python3.9 /usr/local/lib/python3.9
+COPY --from=windowsarena/winarena:latest /usr/local/lib/libpython3.9.so* /usr/local/lib/
+COPY --from=windowsarena/winarena:latest /usr/local/include/python3.9 /usr/local/include/python3.9
+
+# Ensure the shared library is found
+RUN ldconfig
+
+# Create symlinks for python/pip commands
+RUN ln -sf /usr/local/bin/python3.9 /usr/local/bin/python && \
+    ln -sf /usr/local/bin/python3.9 /usr/bin/python && \
+    ln -sf /usr/local/bin/python3.9 /usr/bin/python3 && \
+    ln -sf /usr/local/bin/pip3.9 /usr/local/bin/pip && \
+    ln -sf /usr/local/bin/pip3.9 /usr/bin/pip && \
+    ln -sf /usr/local/bin/pip3.9 /usr/bin/pip3
+
+# Install only system dependencies that Python packages need (not Python itself)
 RUN apt-get update && apt-get install -y --no-install-recommends \
-    python3 \
-    python3-venv \
-    python3-pip \
     tesseract-ocr \
     libgl1 \
     libglib2.0-0 \
@@ -173,32 +189,10 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     libxext6 \
     libxrender-dev \
     ffmpeg \
-    && rm -rf /var/lib/apt/lists/* \
-    && ln -sf /usr/bin/python3 /usr/bin/python
-
-# Install Python dependencies for WAA client
-# Using --break-system-packages since we're in a container
-# Full dependency list from: github.com/microsoft/WindowsAgentArena/blob/main/src/win-arena-container/client/requirements.txt
-RUN pip3 install --no-cache-dir --break-system-packages \
-    torch torchvision --index-url https://download.pytorch.org/whl/cpu && \
-    pip3 install --no-cache-dir --break-system-packages \
-    gymnasium farama-notifications cloudpickle packaging typer rich tqdm colorama \
-    openai anthropic google-generativeai groq tiktoken \
-    pyyaml jsonschema tenacity httpx backoff toml func-timeout wrapt-timeout-decorator \
-    psutil pyperclip screeninfo mss pyautogui fabric \
-    easyocr pillow pytesseract opencv-python-headless scikit-image ImageHash \
-    requests flask beautifulsoup4 lxml cssselect xmltodict playwright requests-toolbelt \
-    pydrive openpyxl python-docx python-pptx odfpy pypdf PyPDF2 pdfplumber pymupdf borb \
-    xlrd xlwt xlsxwriter mammoth pdf2image \
-    google-api-python-client google-auth-httplib2 google-auth-oauthlib gdown \
-    numpy pandas scipy formulas rapidfuzz anytree addict \
-    transformers accelerate "timm>=0.9.0,<1.0.0" ultralytics supervision pycocotools einops \
-    mutagen pyacoustid chardet librosa fastdtw \
-    py7zr LnkParse3 \
-    matplotlib wandb yapf
-
-# Install Playwright browsers
-RUN playwright install chromium
+    && rm -rf /var/lib/apt/lists/*
+
+# Note: Playwright browsers not copied - not needed for navi agent (uses GroundingDINO)
+# If needed later, install via: python -m playwright install chromium
 
 # -----------------------------------------------------------------------------
 # Environment configuration
@@ -215,8 +209,8 @@ ENV ARGUMENTS="-qmp tcp:0.0.0.0:7200,server,nowait"
 # Expose ports
 EXPOSE 8006 5000 7200 3389
 
-# Default entrypoint - copy OEM files then run entry.sh
-# Use: /entry.sh --start-client true --model gpt-4o
-# Or: /entry.sh --start-client false (just start Windows, no benchmark)
-ENTRYPOINT ["/bin/bash", "-c"]
-CMD ["/copy-oem.sh /entry.sh --start-client false"]
+# Default entrypoint - use dockurr/windows's native entry point
+# The OEM files are copied by samba.sh (patched above) when Samba starts
+# dockurr/windows handles: QEMU VM startup, Samba, VNC, Windows boot
+# Our patched autounattend.xml handles: FirstLogonCommands that run install.bat
+ENTRYPOINT ["/usr/bin/tini", "-s", "/run/entry.sh"]
openadapt_ml/benchmarks/waa_deploy/api_agent.py

@@ -43,7 +43,7 @@ import logging
 import os
 import re
 from io import BytesIO
-from typing import Any, Dict, List
+from typing import Dict, List
 
 from PIL import Image
 
@@ -210,6 +210,7 @@ class ApiAgent:
             )
         try:
             from anthropic import Anthropic
+
             self._client = Anthropic(api_key=self.api_key)
         except ImportError:
             raise RuntimeError(
@@ -225,6 +226,7 @@ class ApiAgent:
             )
         try:
             from openai import OpenAI
+
             self._client = OpenAI(api_key=self.api_key)
         except ImportError:
             raise RuntimeError(
@@ -240,9 +242,13 @@ class ApiAgent:
         self.memory_block_text = "# empty memory block"
         self.step_counter = 0
 
-        logger.info(f"ApiAgent initialized with provider={provider}, model={self.model}")
+        logger.info(
+            f"ApiAgent initialized with provider={provider}, model={self.model}"
+        )
         if self.demo:
-            logger.info(f"Demo trajectory provided ({len(self.demo)} chars) - will persist across all steps")
+            logger.info(
+                f"Demo trajectory provided ({len(self.demo)} chars) - will persist across all steps"
+            )
 
     def predict(self, instruction: str, obs: Dict) -> tuple:
         """Predict the next action based on observation.
@@ -325,10 +331,9 @@ class ApiAgent:
         # Add action history if enabled (enhanced: includes reasoning, not just raw actions)
         if self.use_history and self.history:
             # Use rich history with reasoning (like PC Agent-E)
-            history_entries = self.history[-self.history_cutoff:]
+            history_entries = self.history[-self.history_cutoff :]
             history_str = "\n\n".join(
-                f"[Step {i+1}] {entry}"
-                for i, entry in enumerate(history_entries)
+                f"[Step {i + 1}] {entry}" for i, entry in enumerate(history_entries)
             )
             content_parts.append(f"History of previous steps:\n{history_str}")
             logs["history_entries"] = len(history_entries)
@@ -381,14 +386,18 @@ class ApiAgent:
             actions = [code_text]
             self.prev_actions.append(code_text)
             # Store rich history with reasoning (memory + action)
-            self._add_to_history(f"Thought: {self.memory_block_text}\nAction: {code_text}")
+            self._add_to_history(
+                f"Thought: {self.memory_block_text}\nAction: {code_text}"
+            )
         else:
             # Try to extract action from response text
             action = self._parse_action_from_text(response_text, w, h)
             if action:
                 actions = [action]
                 self.prev_actions.append(action)
-                self._add_to_history(f"Thought: {self.memory_block_text}\nAction: {action}")
+                self._add_to_history(
+                    f"Thought: {self.memory_block_text}\nAction: {action}"
+                )
             else:
                 logger.warning("Could not extract action from response")
                 actions = ["# Could not parse action"]
@@ -483,33 +492,25 @@ class ApiAgent:
             Python code string or None if parsing failed.
         """
         # Try to find click coordinates
-        click_match = re.search(
-            r"click.*?(\d+)\s*,\s*(\d+)", text, re.IGNORECASE
-        )
+        click_match = re.search(r"click.*?(\d+)\s*,\s*(\d+)", text, re.IGNORECASE)
         if click_match:
             x, y = int(click_match.group(1)), int(click_match.group(2))
             return f"computer.click({x}, {y})"
 
         # Try to find type text
-        type_match = re.search(
-            r'type[:\s]+["\'](.+?)["\']', text, re.IGNORECASE
-        )
+        type_match = re.search(r'type[:\s]+["\'](.+?)["\']', text, re.IGNORECASE)
         if type_match:
             text_to_type = type_match.group(1)
             return f'computer.type("{text_to_type}")'
 
         # Try to find key press
-        key_match = re.search(
-            r"press[:\s]+(\w+)", text, re.IGNORECASE
-        )
+        key_match = re.search(r"press[:\s]+(\w+)", text, re.IGNORECASE)
         if key_match:
             key = key_match.group(1).lower()
             return f'computer.press("{key}")'
 
         # Try to find hotkey
-        hotkey_match = re.search(
-            r"hotkey[:\s]+(\w+)\s*\+\s*(\w+)", text, re.IGNORECASE
-        )
+        hotkey_match = re.search(r"hotkey[:\s]+(\w+)\s*\+\s*(\w+)", text, re.IGNORECASE)
         if hotkey_match:
             key1, key2 = hotkey_match.group(1).lower(), hotkey_match.group(2).lower()
             return f'computer.hotkey("{key1}", "{key2}")'
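The reformatted regexes above make up the agent's whole text fallback: when the model response contains no code block, it is scraped for a click, type, press, or hotkey pattern. A standalone sketch of that fallback using the same patterns, illustrative only (the real method also receives the screen width and height):

# Standalone sketch of the regex fallback shown above (same patterns, simplified wrapper).
import re
from typing import Optional


def parse_action(text: str) -> Optional[str]:
    click = re.search(r"click.*?(\d+)\s*,\s*(\d+)", text, re.IGNORECASE)
    if click:
        return f"computer.click({int(click.group(1))}, {int(click.group(2))})"
    typed = re.search(r'type[:\s]+["\'](.+?)["\']', text, re.IGNORECASE)
    if typed:
        return f'computer.type("{typed.group(1)}")'
    key = re.search(r"press[:\s]+(\w+)", text, re.IGNORECASE)
    if key:
        return f'computer.press("{key.group(1).lower()}")'
    hotkey = re.search(r"hotkey[:\s]+(\w+)\s*\+\s*(\w+)", text, re.IGNORECASE)
    if hotkey:
        return f'computer.hotkey("{hotkey.group(1).lower()}", "{hotkey.group(2).lower()}")'
    return None


print(parse_action("I will click at (412, 583)"))  # computer.click(412, 583)
print(parse_action('Type: "hello world"'))         # computer.type("hello world")
print(parse_action("Press: enter"))                # computer.press("enter")
print(parse_action("Hotkey: ctrl + s"))            # computer.hotkey("ctrl", "s")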
openadapt_ml/cloud/azure_inference.py

@@ -144,7 +144,7 @@ class AzureInferenceQueue:
         blob_name = f"checkpoints/epoch_{epoch}/{checkpoint_path.name}"
         logger.info(f"Uploading checkpoint to {blob_name}...")
 
-        checkpoint_blob_client = self.blob_service.get_blob_client(
+        self.blob_service.get_blob_client(
             container=self.checkpoints_container, blob=blob_name
         )
 
@@ -378,9 +378,7 @@ def main():
     submit_parser.add_argument(
         "--checkpoint", "-c", required=True, help="Path to checkpoint directory"
     )
-    submit_parser.add_argument(
-        "--capture", required=True, help="Path to capture data"
-    )
+    submit_parser.add_argument("--capture", required=True, help="Path to capture data")
     submit_parser.add_argument(
         "--epoch", "-e", type=int, default=0, help="Epoch number"
     )
@@ -415,7 +413,7 @@ def main():
 
     if args.command == "inference-submit":
         # Submit checkpoint for inference
-        print(f"Submitting checkpoint for inference...")
+        print("Submitting checkpoint for inference...")
         job = queue.submit_checkpoint(
             checkpoint_path=args.checkpoint,
             capture_path=args.capture,
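For context, the arguments touched in these hunks belong to the inference-submit subcommand of the module's main() CLI. A hypothetical programmatic invocation, for illustration only: the subcommand name and argument names come from the diff, while the entry point, the assumption that main() parses sys.argv, and the paths are placeholders, and a real run would additionally need the Azure configuration the queue expects:

# Hypothetical invocation of the CLI handled by main() above (paths are placeholders).
import sys

from openadapt_ml.cloud.azure_inference import main

sys.argv = [
    "azure_inference",
    "inference-submit",
    "--checkpoint", "./checkpoints/epoch_3",
    "--capture", "./captures/run_001",
    "--epoch", "3",
]
main()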