openadapt-ml 0.1.0__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112)
  1. openadapt_ml/baselines/__init__.py +121 -0
  2. openadapt_ml/baselines/adapter.py +185 -0
  3. openadapt_ml/baselines/cli.py +314 -0
  4. openadapt_ml/baselines/config.py +448 -0
  5. openadapt_ml/baselines/parser.py +922 -0
  6. openadapt_ml/baselines/prompts.py +787 -0
  7. openadapt_ml/benchmarks/__init__.py +13 -107
  8. openadapt_ml/benchmarks/agent.py +297 -374
  9. openadapt_ml/benchmarks/azure.py +62 -24
  10. openadapt_ml/benchmarks/azure_ops_tracker.py +521 -0
  11. openadapt_ml/benchmarks/cli.py +1874 -751
  12. openadapt_ml/benchmarks/trace_export.py +631 -0
  13. openadapt_ml/benchmarks/viewer.py +1236 -0
  14. openadapt_ml/benchmarks/vm_monitor.py +1111 -0
  15. openadapt_ml/benchmarks/waa_deploy/Dockerfile +216 -0
  16. openadapt_ml/benchmarks/waa_deploy/__init__.py +10 -0
  17. openadapt_ml/benchmarks/waa_deploy/api_agent.py +540 -0
  18. openadapt_ml/benchmarks/waa_deploy/start_waa_server.bat +53 -0
  19. openadapt_ml/cloud/azure_inference.py +3 -5
  20. openadapt_ml/cloud/lambda_labs.py +722 -307
  21. openadapt_ml/cloud/local.py +3194 -89
  22. openadapt_ml/cloud/ssh_tunnel.py +595 -0
  23. openadapt_ml/datasets/next_action.py +125 -96
  24. openadapt_ml/evals/grounding.py +32 -9
  25. openadapt_ml/evals/plot_eval_metrics.py +15 -13
  26. openadapt_ml/evals/trajectory_matching.py +120 -57
  27. openadapt_ml/experiments/demo_prompt/__init__.py +19 -0
  28. openadapt_ml/experiments/demo_prompt/format_demo.py +236 -0
  29. openadapt_ml/experiments/demo_prompt/results/experiment_20251231_002125.json +83 -0
  30. openadapt_ml/experiments/demo_prompt/results/experiment_n30_20251231_165958.json +1100 -0
  31. openadapt_ml/experiments/demo_prompt/results/multistep_20251231_025051.json +182 -0
  32. openadapt_ml/experiments/demo_prompt/run_experiment.py +541 -0
  33. openadapt_ml/experiments/representation_shootout/__init__.py +70 -0
  34. openadapt_ml/experiments/representation_shootout/conditions.py +708 -0
  35. openadapt_ml/experiments/representation_shootout/config.py +390 -0
  36. openadapt_ml/experiments/representation_shootout/evaluator.py +659 -0
  37. openadapt_ml/experiments/representation_shootout/runner.py +687 -0
  38. openadapt_ml/experiments/waa_demo/__init__.py +10 -0
  39. openadapt_ml/experiments/waa_demo/demos.py +357 -0
  40. openadapt_ml/experiments/waa_demo/runner.py +732 -0
  41. openadapt_ml/experiments/waa_demo/tasks.py +151 -0
  42. openadapt_ml/export/__init__.py +9 -0
  43. openadapt_ml/export/__main__.py +6 -0
  44. openadapt_ml/export/cli.py +89 -0
  45. openadapt_ml/export/parquet.py +277 -0
  46. openadapt_ml/grounding/detector.py +18 -14
  47. openadapt_ml/ingest/__init__.py +11 -10
  48. openadapt_ml/ingest/capture.py +97 -86
  49. openadapt_ml/ingest/loader.py +120 -69
  50. openadapt_ml/ingest/synthetic.py +344 -193
  51. openadapt_ml/models/api_adapter.py +14 -4
  52. openadapt_ml/models/base_adapter.py +10 -2
  53. openadapt_ml/models/providers/__init__.py +288 -0
  54. openadapt_ml/models/providers/anthropic.py +266 -0
  55. openadapt_ml/models/providers/base.py +299 -0
  56. openadapt_ml/models/providers/google.py +376 -0
  57. openadapt_ml/models/providers/openai.py +342 -0
  58. openadapt_ml/models/qwen_vl.py +46 -19
  59. openadapt_ml/perception/__init__.py +35 -0
  60. openadapt_ml/perception/integration.py +399 -0
  61. openadapt_ml/retrieval/README.md +226 -0
  62. openadapt_ml/retrieval/USAGE.md +391 -0
  63. openadapt_ml/retrieval/__init__.py +91 -0
  64. openadapt_ml/retrieval/demo_retriever.py +843 -0
  65. openadapt_ml/retrieval/embeddings.py +630 -0
  66. openadapt_ml/retrieval/index.py +194 -0
  67. openadapt_ml/retrieval/retriever.py +162 -0
  68. openadapt_ml/runtime/__init__.py +50 -0
  69. openadapt_ml/runtime/policy.py +27 -14
  70. openadapt_ml/runtime/safety_gate.py +471 -0
  71. openadapt_ml/schema/__init__.py +113 -0
  72. openadapt_ml/schema/converters.py +588 -0
  73. openadapt_ml/schema/episode.py +470 -0
  74. openadapt_ml/scripts/capture_screenshots.py +530 -0
  75. openadapt_ml/scripts/compare.py +102 -61
  76. openadapt_ml/scripts/demo_policy.py +4 -1
  77. openadapt_ml/scripts/eval_policy.py +19 -14
  78. openadapt_ml/scripts/make_gif.py +1 -1
  79. openadapt_ml/scripts/prepare_synthetic.py +16 -17
  80. openadapt_ml/scripts/train.py +98 -75
  81. openadapt_ml/segmentation/README.md +920 -0
  82. openadapt_ml/segmentation/__init__.py +97 -0
  83. openadapt_ml/segmentation/adapters/__init__.py +5 -0
  84. openadapt_ml/segmentation/adapters/capture_adapter.py +420 -0
  85. openadapt_ml/segmentation/annotator.py +610 -0
  86. openadapt_ml/segmentation/cache.py +290 -0
  87. openadapt_ml/segmentation/cli.py +674 -0
  88. openadapt_ml/segmentation/deduplicator.py +656 -0
  89. openadapt_ml/segmentation/frame_describer.py +788 -0
  90. openadapt_ml/segmentation/pipeline.py +340 -0
  91. openadapt_ml/segmentation/schemas.py +622 -0
  92. openadapt_ml/segmentation/segment_extractor.py +634 -0
  93. openadapt_ml/training/azure_ops_viewer.py +1097 -0
  94. openadapt_ml/training/benchmark_viewer.py +3255 -19
  95. openadapt_ml/training/shared_ui.py +7 -7
  96. openadapt_ml/training/stub_provider.py +57 -35
  97. openadapt_ml/training/trainer.py +255 -441
  98. openadapt_ml/training/trl_trainer.py +403 -0
  99. openadapt_ml/training/viewer.py +323 -108
  100. openadapt_ml/training/viewer_components.py +180 -0
  101. {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.1.dist-info}/METADATA +312 -69
  102. openadapt_ml-0.2.1.dist-info/RECORD +116 -0
  103. openadapt_ml/benchmarks/base.py +0 -366
  104. openadapt_ml/benchmarks/data_collection.py +0 -432
  105. openadapt_ml/benchmarks/runner.py +0 -381
  106. openadapt_ml/benchmarks/waa.py +0 -704
  107. openadapt_ml/schemas/__init__.py +0 -53
  108. openadapt_ml/schemas/sessions.py +0 -122
  109. openadapt_ml/schemas/validation.py +0 -252
  110. openadapt_ml-0.1.0.dist-info/RECORD +0 -55
  111. {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.1.dist-info}/WHEEL +0 -0
  112. {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.1.dist-info}/licenses/LICENSE +0 -0
openadapt_ml/benchmarks/waa_deploy/api_agent.py (new file)
@@ -0,0 +1,540 @@
+ """WAA-compatible API Agent that uses Claude Sonnet 4.5 or GPT-5.1 directly.
+
+ This module provides a drop-in replacement for the Navi agent in Windows Agent Arena
+ that uses hosted VLM APIs (Claude or GPT-5.1) instead of the buggy Navi agent.
+
+ The agent receives observations from WAA and returns actions in WAA's expected format
+ (code blocks for the pyautogui action space).
+
+ Why this exists:
+     The default Navi agent in WAA has NoneType errors and other bugs.
+     This API agent provides a reliable alternative that uses Claude Sonnet 4.5
+     or GPT-5.1 directly, bypassing the problematic Navi implementation.
+
+ Usage from CLI:
+     # Run with Claude Sonnet 4.5 (requires ANTHROPIC_API_KEY)
+     uv run python -m openadapt_ml.benchmarks.cli vm run-waa --agent api-claude --num-tasks 5
+
+     # Run with GPT-5.1 (requires OPENAI_API_KEY)
+     uv run python -m openadapt_ml.benchmarks.cli vm run-waa --agent api-openai --num-tasks 5
+
+ How it works:
+     1. The Dockerfile copies this file to /client/mm_agents/api_agent.py
+     2. The Dockerfile patches run.py to recognize "api-claude" and "api-openai" agents
+     3. When the agent is selected, it:
+        - Receives screenshots from WAA's DesktopEnv
+        - Sends them to Claude or GPT-5.1 via their respective APIs
+        - Parses the response into pyautogui code blocks
+        - Returns actions in WAA's expected format
+
+ Example usage in WAA run.py (auto-patched by Dockerfile):
+     if cfg_args["agent_name"] == "api-claude":
+         from mm_agents.api_agent import ApiAgent
+         agent = ApiAgent(provider="anthropic")
+     elif cfg_args["agent_name"] == "api-openai":
+         from mm_agents.api_agent import ApiAgent
+         agent = ApiAgent(provider="openai")
+ """
+
+ from __future__ import annotations
+
+ import base64
+ import logging
+ import os
+ import re
+ from io import BytesIO
+ from typing import Dict, List
+
+ from PIL import Image
+
+ logger = logging.getLogger("desktopenv.agent.api")
+
+
+ # System prompt for GUI automation - adapted from APIBenchmarkAgent
+ SYSTEM_PROMPT = """You are a GUI automation agent controlling a Windows desktop. Given a screenshot and task instruction, determine the next action to take.
+
+ You must respond with a Python code block that uses the pyautogui API. Available functions:
+ - computer.click(x, y) - Click at pixel coordinates
+ - computer.double_click(x, y) - Double-click at pixel coordinates
+ - computer.right_click(x, y) - Right-click at pixel coordinates
+ - computer.type(text) - Type the given text
+ - computer.hotkey(key1, key2, ...) - Press key combination (e.g., 'ctrl', 'c')
+ - computer.press(key) - Press a single key (e.g., 'enter', 'tab', 'escape')
+ - computer.scroll(direction) - Scroll up (-3) or down (3)
+ - computer.drag(x1, y1, x2, y2) - Drag from (x1,y1) to (x2,y2)
+
+ Coordinates are pixel values within the screen (1920x1200 by default).
+
+ Format your response as:
+
+ ```memory
+ # Your notes about the task state (optional)
+ ```
+
+ ```decision
+ CONTINUE
+ ```
+
+ ```python
+ computer.click(500, 300)
+ ```
+
+ Important:
+ - Use DONE in the decision block when the task is complete
+ - Use FAIL if the task cannot be completed
+ - Always output exactly one action per response
+ - Click on UI elements by their visual center coordinates
+ - For text input, first click to focus the field, then type
+
+ Think step by step:
+ 1. What is the current state of the UI?
+ 2. What is the goal?
+ 3. What is the next logical action?
+ """
+
+
+ def format_accessibility_tree(tree: dict, indent: int = 0, max_depth: int = 5) -> str:
+     """Format accessibility tree for prompt.
+
+     Args:
+         tree: Accessibility tree dict from WAA.
+         indent: Current indentation level.
+         max_depth: Maximum depth to traverse.
+
+     Returns:
+         Formatted string representation.
+     """
+     if indent >= max_depth:
+         return ""
+
+     lines = []
+     prefix = " " * indent
+
+     role = tree.get("role", tree.get("control_type", "unknown"))
+     name = tree.get("name", "")
+     node_id = tree.get("id", tree.get("node_id", ""))
+
+     # Get bounding box if available
+     bbox_str = ""
+     if "bounding_rectangle" in tree:
+         br = tree["bounding_rectangle"]
+         bbox_str = f" [{br.get('left', 0)},{br.get('top', 0)},{br.get('right', 0)},{br.get('bottom', 0)}]"
+
+     line = f"{prefix}[{node_id}] {role}"
+     if name:
+         line += f": {name[:50]}"  # Truncate long names
+     if bbox_str:
+         line += bbox_str
+     lines.append(line)
+
+     for child in tree.get("children", []):
+         child_text = format_accessibility_tree(child, indent + 1, max_depth)
+         if child_text:
+             lines.append(child_text)
+
+     return "\n".join(lines)
+
+
+ def prev_actions_to_string(prev_actions: List[str], n_prev: int = 3) -> str:
+     """Format previous actions for the prompt.
+
+     Args:
+         prev_actions: List of previous action strings.
+         n_prev: Number of previous actions to include.
+
+     Returns:
+         Formatted string of previous actions.
+     """
+     result = ""
+     n_prev = min(n_prev, len(prev_actions))
+     for i in range(1, n_prev + 1):
+         action = prev_actions[-i]
+         result += f"Action at T-{i}:\n{action}\n\n"
+     return result
+
+
+ class ApiAgent:
+     """WAA-compatible agent that uses Claude or GPT-5.1 API directly.
+
+     This agent implements the same interface as NaviAgent but uses hosted
+     VLM APIs instead of the local Navi implementation (which has NoneType bugs).
+
+     Args:
+         provider: API provider - "anthropic" (Claude) or "openai" (GPT-5.1).
+         api_key: Optional API key. If not provided, uses environment variables.
+         model: Optional model name override.
+         temperature: Sampling temperature (0.0-1.0).
+         max_tokens: Maximum tokens for API response.
+         use_accessibility_tree: Whether to include a11y tree in prompts.
+         use_history: Whether to include action history in prompts.
+         demo: Optional demonstration trajectory to include at every step.
+             This is the key fix for 100% first-action / 0% episode success:
+             the demo must persist across ALL steps, not just step 1.
+     """
+
+     # Default models for each provider
+     DEFAULT_MODELS = {
+         "anthropic": "claude-sonnet-4-5-20250929",
+         "openai": "gpt-5.1",
+     }
+
+     def __init__(
+         self,
+         provider: str = "anthropic",
+         api_key: str | None = None,
+         model: str | None = None,
+         temperature: float = 0.5,
+         max_tokens: int = 1500,
+         use_accessibility_tree: bool = True,
+         use_history: bool = True,
+         demo: str | None = None,
+     ):
+         self.provider = provider
+         self.model = model or self.DEFAULT_MODELS.get(provider)
+         self.temperature = temperature
+         self.max_tokens = max_tokens
+         self.use_accessibility_tree = use_accessibility_tree
+         self.use_history = use_history
+         self.demo = demo  # Demo persists across ALL steps
+
+         # WAA compatibility
+         self.action_space = "code_block"
+
+         # Get API key
+         if provider == "anthropic":
+             self.api_key = api_key or os.getenv("ANTHROPIC_API_KEY")
+             if not self.api_key:
+                 raise RuntimeError(
+                     "ANTHROPIC_API_KEY is required for provider='anthropic'. "
+                     "Set it in environment or pass api_key parameter."
+                 )
+             try:
+                 from anthropic import Anthropic
+
+                 self._client = Anthropic(api_key=self.api_key)
+             except ImportError:
+                 raise RuntimeError(
+                     "anthropic package required. Install with: pip install anthropic"
+                 )
+
+         elif provider == "openai":
+             self.api_key = api_key or os.getenv("OPENAI_API_KEY")
+             if not self.api_key:
+                 raise RuntimeError(
+                     "OPENAI_API_KEY is required for provider='openai'. "
+                     "Set it in environment or pass api_key parameter."
+                 )
+             try:
+                 from openai import OpenAI
+
+                 self._client = OpenAI(api_key=self.api_key)
+             except ImportError:
+                 raise RuntimeError(
+                     "openai package required. Install with: pip install openai"
+                 )
+         else:
+             raise ValueError(f"Unsupported provider: {provider}")
+
+         # State tracking
+         self.prev_actions: List[str] = []  # Raw action codes for WAA compatibility
+         self.history: List[str] = []  # Rich history with reasoning (like PC Agent-E)
+         self.history_cutoff = 10  # Max history entries to include
+         self.memory_block_text = "# empty memory block"
+         self.step_counter = 0
+
+         logger.info(
+             f"ApiAgent initialized with provider={provider}, model={self.model}"
+         )
+         if self.demo:
+             logger.info(
+                 f"Demo trajectory provided ({len(self.demo)} chars) - will persist across all steps"
+             )
+
+     def predict(self, instruction: str, obs: Dict) -> tuple:
+         """Predict the next action based on observation.
+
+         This method implements the same interface as NaviAgent.predict().
+
+         Args:
+             instruction: The task instruction.
+             obs: Observation dict containing:
+                 - screenshot: PNG bytes of current screen
+                 - accessibility_tree: A11y tree dict (optional)
+                 - window_title: Current window title
+                 - window_names_str: List of open windows
+                 - computer_clipboard: Current clipboard content
+
+         Returns:
+             Tuple of (response_text, actions_list, logs_dict, computer_update_args)
+         """
+         logs = {}
+         self.step_counter += 1
+
+         # Extract screenshot
+         screenshot_bytes = obs.get("screenshot")
+         if screenshot_bytes is None:
+             logger.error("No screenshot in observation")
+             return "", ["# No screenshot available"], logs, {}
+
+         # Convert screenshot to PIL Image
+         try:
+             image = Image.open(BytesIO(screenshot_bytes))
+             w, h = image.size
+         except Exception as e:
+             logger.error(f"Failed to load screenshot: {e}")
+             return "", ["# Failed to load screenshot"], logs, {}
+
+         logs["image_width"] = w
+         logs["image_height"] = h
+
+         # Build the prompt
+         content_parts = [f"TASK: {instruction}"]
+
+         # CRITICAL FIX: Include demo at EVERY step, not just step 1
+         # This is the key fix for 100% first-action / 0% episode success
+         if self.demo:
+             content_parts.append(
+                 f"DEMONSTRATION (follow this pattern):\n"
+                 f"---\n{self.demo}\n---\n"
+                 f"Use the demonstration above as a guide. You are currently at step {self.step_counter}."
+             )
+             logs["demo_included"] = True
+             logs["demo_length"] = len(self.demo)
+
+         # Add context
+         window_title = obs.get("window_title", "")
+         if window_title:
+             content_parts.append(f"Current window: {window_title}")
+             logs["window_title"] = window_title
+
+         window_names_str = obs.get("window_names_str", "")
+         if window_names_str:
+             content_parts.append(f"Open windows: {window_names_str}")
+             logs["window_names_str"] = window_names_str
+
+         clipboard = obs.get("computer_clipboard", "")
+         if clipboard:
+             content_parts.append(f"Clipboard: {clipboard[:100]}")
+             logs["computer_clipboard"] = clipboard
+
+         # Add accessibility tree if available and enabled
+         if self.use_accessibility_tree:
+             a11y_tree = obs.get("accessibility_tree")
+             if a11y_tree:
+                 tree_str = format_accessibility_tree(a11y_tree)
+                 # Truncate if too long
+                 if len(tree_str) > 4000:
+                     tree_str = tree_str[:4000] + "\n... (truncated)"
+                 content_parts.append(f"UI Elements:\n{tree_str}")
+                 logs["accessibility_tree_len"] = len(tree_str)
+
+         # Add action history if enabled (enhanced: includes reasoning, not just raw actions)
+         if self.use_history and self.history:
+             # Use rich history with reasoning (like PC Agent-E)
+             history_entries = self.history[-self.history_cutoff :]
+             history_str = "\n\n".join(
+                 f"[Step {i + 1}] {entry}" for i, entry in enumerate(history_entries)
+             )
+             content_parts.append(f"History of previous steps:\n{history_str}")
+             logs["history_entries"] = len(history_entries)
+         elif self.use_history and self.prev_actions:
+             # Fallback to raw action history
+             history_str = prev_actions_to_string(self.prev_actions, n_prev=5)
+             content_parts.append(f"Previous actions:\n{history_str}")
+
+         # Add memory block
+         content_parts.append(f"Your memory:\n```memory\n{self.memory_block_text}\n```")
+
+         content_parts.append(f"\nScreen dimensions: {w}x{h} pixels")
+         content_parts.append("\nWhat is the next action?")
+
+         user_prompt = "\n\n".join(content_parts)
+         logs["user_question"] = user_prompt
+
+         # Call the API
+         try:
+             response_text = self._call_api(screenshot_bytes, user_prompt)
+         except Exception as e:
+             logger.error(f"API call failed: {e}")
+             return "", ["# API call failed"], logs, {}
+
+         logs["plan_result"] = response_text
+
+         # Extract memory block
+         memory_match = re.search(r"```memory\n(.*?)```", response_text, re.DOTALL)
+         if memory_match:
+             self.memory_block_text = memory_match.group(1).strip()
+
+         # Extract decision block
+         decision_match = re.search(r"```decision\n(.*?)```", response_text, re.DOTALL)
+         if decision_match:
+             decision = decision_match.group(1).strip().upper()
+             if "DONE" in decision:
+                 self.prev_actions.append("DONE")
+                 return "", ["DONE"], logs, {}
+             elif "FAIL" in decision:
+                 self.prev_actions.append("FAIL")
+                 return "", ["FAIL"], logs, {}
+             elif "WAIT" in decision:
+                 self.prev_actions.append("WAIT")
+                 return "", ["WAIT"], logs, {}
+
+         # Extract Python code block
+         code_match = re.search(r"```python\n(.*?)```", response_text, re.DOTALL)
+         if code_match:
+             code_text = code_match.group(1).strip()
+             actions = [code_text]
+             self.prev_actions.append(code_text)
+             # Store rich history with reasoning (memory + action)
+             self._add_to_history(
+                 f"Thought: {self.memory_block_text}\nAction: {code_text}"
+             )
+         else:
+             # Try to extract action from response text
+             action = self._parse_action_from_text(response_text, w, h)
+             if action:
+                 actions = [action]
+                 self.prev_actions.append(action)
+                 self._add_to_history(
+                     f"Thought: {self.memory_block_text}\nAction: {action}"
+                 )
+             else:
+                 logger.warning("Could not extract action from response")
+                 actions = ["# Could not parse action"]
+
+         # Build computer_update_args (for WAA compatibility)
+         computer_update_args = {
+             "rects": [],
+             "window_rect": [0, 0, w, h],
+             "screenshot": image,
+             "scale": (1.0, 1.0),
+             "clipboard_content": clipboard,
+             "swap_ctrl_alt": False,
+         }
+
+         return "", actions, logs, computer_update_args
+
+     def _call_api(self, screenshot_bytes: bytes, user_prompt: str) -> str:
+         """Call the VLM API with screenshot and prompt.
+
+         Args:
+             screenshot_bytes: PNG image bytes.
+             user_prompt: User prompt text.
+
+         Returns:
+             Response text from the API.
+         """
+         image_b64 = base64.b64encode(screenshot_bytes).decode("utf-8")
+
+         if self.provider == "anthropic":
+             content = [
+                 {"type": "text", "text": user_prompt},
+                 {
+                     "type": "image",
+                     "source": {
+                         "type": "base64",
+                         "media_type": "image/png",
+                         "data": image_b64,
+                     },
+                 },
+             ]
+
+             resp = self._client.messages.create(
+                 model=self.model,
+                 max_tokens=self.max_tokens,
+                 system=SYSTEM_PROMPT,
+                 messages=[{"role": "user", "content": content}],
+             )
+
+             # Extract text from response
+             parts = getattr(resp, "content", [])
+             texts = [
+                 getattr(p, "text", "")
+                 for p in parts
+                 if getattr(p, "type", "") == "text"
+             ]
+             return "\n".join([t for t in texts if t]).strip()
+
+         elif self.provider == "openai":
+             messages = [
+                 {"role": "system", "content": SYSTEM_PROMPT},
+                 {
+                     "role": "user",
+                     "content": [
+                         {"type": "text", "text": user_prompt},
+                         {
+                             "type": "image_url",
+                             "image_url": {"url": f"data:image/png;base64,{image_b64}"},
+                         },
+                     ],
+                 },
+             ]
+
+             resp = self._client.chat.completions.create(
+                 model=self.model,
+                 messages=messages,
+                 max_completion_tokens=self.max_tokens,
+                 temperature=self.temperature,
+             )
+             return resp.choices[0].message.content or ""
+
+         raise ValueError(f"Unsupported provider: {self.provider}")
+
+     def _parse_action_from_text(self, text: str, width: int, height: int) -> str | None:
+         """Try to parse an action from free-form text response.
+
+         Args:
+             text: Response text to parse.
+             width: Screen width.
+             height: Screen height.
+
+         Returns:
+             Python code string or None if parsing failed.
+         """
+         # Try to find click coordinates
+         click_match = re.search(r"click.*?(\d+)\s*,\s*(\d+)", text, re.IGNORECASE)
+         if click_match:
+             x, y = int(click_match.group(1)), int(click_match.group(2))
+             return f"computer.click({x}, {y})"
+
+         # Try to find type text
+         type_match = re.search(r'type[:\s]+["\'](.+?)["\']', text, re.IGNORECASE)
+         if type_match:
+             text_to_type = type_match.group(1)
+             return f'computer.type("{text_to_type}")'
+
+         # Try to find key press
+         key_match = re.search(r"press[:\s]+(\w+)", text, re.IGNORECASE)
+         if key_match:
+             key = key_match.group(1).lower()
+             return f'computer.press("{key}")'
+
+         # Try to find hotkey
+         hotkey_match = re.search(r"hotkey[:\s]+(\w+)\s*\+\s*(\w+)", text, re.IGNORECASE)
+         if hotkey_match:
+             key1, key2 = hotkey_match.group(1).lower(), hotkey_match.group(2).lower()
+             return f'computer.hotkey("{key1}", "{key2}")'
+
+         return None
+
+     def _add_to_history(self, entry: str) -> None:
+         """Add an entry to the rich history (reasoning + action)."""
+         self.history.append(entry)
+
+     def set_demo(self, demo: str) -> None:
+         """Set or update the demo trajectory.
+
+         This allows setting the demo after initialization,
+         useful for dynamic demo retrieval.
+         """
+         self.demo = demo
+         logger.info(f"Demo set ({len(demo)} chars) - will persist across all steps")
+
+     def reset(self) -> None:
+         """Reset agent state between tasks."""
+         self.prev_actions = []
+         self.history = []  # Clear rich history too
+         self.memory_block_text = "# empty memory block"
+         self.step_counter = 0
+         # Note: demo is NOT reset - it persists across resets if set
+         logger.info("ApiAgent reset")
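The predict() interface above mirrors NaviAgent: WAA passes a task instruction plus an observation dict and receives (response_text, actions, logs, computer_update_args) back. A minimal driver sketch, assuming the container layout described in the module docstring (the file copied to mm_agents/api_agent.py) and a valid ANTHROPIC_API_KEY; the blank synthetic screenshot and window fields below are placeholders for what WAA's DesktopEnv would actually supply:

```python
# Sketch of how a patched WAA run.py might step ApiAgent; obs keys follow the
# predict() docstring above. The blank image stands in for a real screenshot.
from io import BytesIO

from PIL import Image

from mm_agents.api_agent import ApiAgent  # path inside the WAA container

agent = ApiAgent(provider="anthropic")  # requires ANTHROPIC_API_KEY
agent.reset()

buf = BytesIO()
Image.new("RGB", (1920, 1200), "white").save(buf, format="PNG")

obs = {
    "screenshot": buf.getvalue(),
    "window_title": "Notepad",
    "window_names_str": "Notepad, File Explorer",
    "computer_clipboard": "",
}
_, actions, logs, update_args = agent.predict("Open the File menu", obs)
print(actions)  # e.g. ['computer.click(512, 38)'], or ['DONE'] when finished
```

At each step the harness executes the returned code block and calls predict() again with a fresh observation, so the persistent demo, memory block, and history accumulate across the episode.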
openadapt_ml/benchmarks/waa_deploy/start_waa_server.bat (new file)
@@ -0,0 +1,53 @@
+ @echo off
+ REM start_waa_server.bat - Start WAA Flask server on Windows boot
+ REM This script ensures the WAA server starts automatically on every boot
+
+ echo [WAA Startup] Starting WAA server...
+
+ REM Wait for network to be available
+ ping -n 5 127.0.0.1 > nul
+
+ REM Check if server is already running
+ netstat -an | find ":5000" | find "LISTENING" > nul
+ if %errorlevel% == 0 (
+     echo [WAA Startup] Server already running on port 5000
+     exit /b 0
+ )
+
+ REM Try multiple possible server locations
+ REM Location 1: OEM server path (official WAA location)
+ if exist "C:\oem\server\main.py" (
+     cd /d C:\oem\server
+     start /b python main.py
+     echo [WAA Startup] Started from C:\oem\server
+     exit /b 0
+ )
+
+ REM Location 2: Network share (Samba)
+ if exist "\\host.lan\Data\server\main.py" (
+     cd /d \\host.lan\Data\server
+     start /b python main.py
+     echo [WAA Startup] Started from network share
+     exit /b 0
+ )
+
+ REM Location 3: Legacy path
+ if exist "C:\waa\server\main.py" (
+     cd /d C:\waa\server
+     start /b python main.py
+     echo [WAA Startup] Started from C:\waa\server
+     exit /b 0
+ )
+
+ REM If none found, try running from network directly
+ echo [WAA Startup] Trying network server path...
+ cd /d \\host.lan\Data\server 2>nul
+ if %errorlevel% == 0 (
+     start /b python main.py
+     echo [WAA Startup] Started from network path
+     exit /b 0
+ )
+
+ echo [WAA Startup] ERROR: WAA server not found in any expected location
+ echo Checked: C:\oem\server, \\host.lan\Data\server, C:\waa\server
+ exit /b 1
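Once the script has launched main.py, the only externally visible signal is port 5000 accepting connections (the same port the netstat check above inspects). A small host-side readiness probe sketch; the VM address is a placeholder assumption, since the actual host or IP depends on how the VM is exposed:

```python
# Hypothetical readiness probe: poll the WAA Flask port until it accepts a TCP
# connection or the timeout elapses. The IP below is a placeholder.
import socket
import time


def wait_for_waa_server(host: str, port: int = 5000, timeout_s: float = 300.0) -> bool:
    """Return True once host:port accepts a TCP connection, False on timeout."""
    deadline = time.monotonic() + timeout_s
    while time.monotonic() < deadline:
        try:
            with socket.create_connection((host, port), timeout=5):
                return True
        except OSError:
            time.sleep(5)  # server not up yet; retry
    return False


if __name__ == "__main__":
    print("WAA server up:", wait_for_waa_server("192.168.122.10"))  # placeholder IP
```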
openadapt_ml/cloud/azure_inference.py
@@ -144,7 +144,7 @@ class AzureInferenceQueue:
          blob_name = f"checkpoints/epoch_{epoch}/{checkpoint_path.name}"
          logger.info(f"Uploading checkpoint to {blob_name}...")

-         checkpoint_blob_client = self.blob_service.get_blob_client(
+         self.blob_service.get_blob_client(
              container=self.checkpoints_container, blob=blob_name
          )

@@ -378,9 +378,7 @@ def main():
      submit_parser.add_argument(
          "--checkpoint", "-c", required=True, help="Path to checkpoint directory"
      )
-     submit_parser.add_argument(
-         "--capture", required=True, help="Path to capture data"
-     )
+     submit_parser.add_argument("--capture", required=True, help="Path to capture data")
      submit_parser.add_argument(
          "--epoch", "-e", type=int, default=0, help="Epoch number"
      )
@@ -415,7 +413,7 @@ def main():

      if args.command == "inference-submit":
          # Submit checkpoint for inference
-         print(f"Submitting checkpoint for inference...")
+         print("Submitting checkpoint for inference...")
          job = queue.submit_checkpoint(
              checkpoint_path=args.checkpoint,
              capture_path=args.capture,