openadapt-ml 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. openadapt_ml/benchmarks/__init__.py +8 -0
  2. openadapt_ml/benchmarks/agent.py +90 -11
  3. openadapt_ml/benchmarks/azure.py +35 -6
  4. openadapt_ml/benchmarks/cli.py +4449 -201
  5. openadapt_ml/benchmarks/live_tracker.py +180 -0
  6. openadapt_ml/benchmarks/runner.py +41 -4
  7. openadapt_ml/benchmarks/viewer.py +1219 -0
  8. openadapt_ml/benchmarks/vm_monitor.py +610 -0
  9. openadapt_ml/benchmarks/waa.py +61 -4
  10. openadapt_ml/benchmarks/waa_deploy/Dockerfile +222 -0
  11. openadapt_ml/benchmarks/waa_deploy/__init__.py +10 -0
  12. openadapt_ml/benchmarks/waa_deploy/api_agent.py +539 -0
  13. openadapt_ml/benchmarks/waa_deploy/start_waa_server.bat +53 -0
  14. openadapt_ml/benchmarks/waa_live.py +619 -0
  15. openadapt_ml/cloud/local.py +1555 -1
  16. openadapt_ml/cloud/ssh_tunnel.py +553 -0
  17. openadapt_ml/datasets/next_action.py +87 -68
  18. openadapt_ml/evals/grounding.py +26 -8
  19. openadapt_ml/evals/trajectory_matching.py +84 -36
  20. openadapt_ml/experiments/demo_prompt/__init__.py +19 -0
  21. openadapt_ml/experiments/demo_prompt/format_demo.py +226 -0
  22. openadapt_ml/experiments/demo_prompt/results/experiment_20251231_002125.json +83 -0
  23. openadapt_ml/experiments/demo_prompt/results/experiment_n30_20251231_165958.json +1100 -0
  24. openadapt_ml/experiments/demo_prompt/results/multistep_20251231_025051.json +182 -0
  25. openadapt_ml/experiments/demo_prompt/run_experiment.py +531 -0
  26. openadapt_ml/experiments/waa_demo/__init__.py +10 -0
  27. openadapt_ml/experiments/waa_demo/demos.py +357 -0
  28. openadapt_ml/experiments/waa_demo/runner.py +717 -0
  29. openadapt_ml/experiments/waa_demo/tasks.py +151 -0
  30. openadapt_ml/export/__init__.py +9 -0
  31. openadapt_ml/export/__main__.py +6 -0
  32. openadapt_ml/export/cli.py +89 -0
  33. openadapt_ml/export/parquet.py +265 -0
  34. openadapt_ml/ingest/__init__.py +3 -4
  35. openadapt_ml/ingest/capture.py +89 -81
  36. openadapt_ml/ingest/loader.py +116 -68
  37. openadapt_ml/ingest/synthetic.py +221 -159
  38. openadapt_ml/retrieval/README.md +226 -0
  39. openadapt_ml/retrieval/USAGE.md +391 -0
  40. openadapt_ml/retrieval/__init__.py +91 -0
  41. openadapt_ml/retrieval/demo_retriever.py +817 -0
  42. openadapt_ml/retrieval/embeddings.py +629 -0
  43. openadapt_ml/retrieval/index.py +194 -0
  44. openadapt_ml/retrieval/retriever.py +160 -0
  45. openadapt_ml/runtime/policy.py +10 -10
  46. openadapt_ml/schema/__init__.py +104 -0
  47. openadapt_ml/schema/converters.py +541 -0
  48. openadapt_ml/schema/episode.py +457 -0
  49. openadapt_ml/scripts/compare.py +26 -16
  50. openadapt_ml/scripts/eval_policy.py +4 -5
  51. openadapt_ml/scripts/prepare_synthetic.py +14 -17
  52. openadapt_ml/scripts/train.py +81 -70
  53. openadapt_ml/training/benchmark_viewer.py +3225 -0
  54. openadapt_ml/training/trainer.py +120 -363
  55. openadapt_ml/training/trl_trainer.py +354 -0
  56. {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.0.dist-info}/METADATA +102 -60
  57. openadapt_ml-0.2.0.dist-info/RECORD +86 -0
  58. openadapt_ml/schemas/__init__.py +0 -53
  59. openadapt_ml/schemas/sessions.py +0 -122
  60. openadapt_ml/schemas/validation.py +0 -252
  61. openadapt_ml-0.1.0.dist-info/RECORD +0 -55
  62. {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.0.dist-info}/WHEEL +0 -0
  63. {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.0.dist-info}/licenses/LICENSE +0 -0
openadapt_ml/benchmarks/waa_deploy/api_agent.py (new file)
@@ -0,0 +1,539 @@
+ """WAA-compatible API Agent that uses Claude Sonnet 4.5 or GPT-5.1 directly.
+
+ This module provides a drop-in replacement for the Navi agent in Windows Agent Arena
+ that uses hosted VLM APIs (Claude or GPT-5.1) instead of the buggy Navi agent.
+
+ The agent receives observations from WAA and returns actions in WAA's expected format
+ (code blocks for the pyautogui action space).
+
+ Why this exists:
+     The default Navi agent in WAA has NoneType errors and other bugs.
+     This API agent provides a reliable alternative that uses Claude Sonnet 4.5
+     or GPT-5.1 directly, bypassing the problematic Navi implementation.
+
+ Usage from CLI:
+     # Run with Claude Sonnet 4.5 (requires ANTHROPIC_API_KEY)
+     uv run python -m openadapt_ml.benchmarks.cli vm run-waa --agent api-claude --num-tasks 5
+
+     # Run with GPT-5.1 (requires OPENAI_API_KEY)
+     uv run python -m openadapt_ml.benchmarks.cli vm run-waa --agent api-openai --num-tasks 5
+
+ How it works:
+     1. The Dockerfile copies this file to /client/mm_agents/api_agent.py
+     2. The Dockerfile patches run.py to recognize "api-claude" and "api-openai" agents
+     3. When the agent is selected, it:
+        - Receives screenshots from WAA's DesktopEnv
+        - Sends them to Claude or GPT-5.1 via their respective APIs
+        - Parses the response into pyautogui code blocks
+        - Returns actions in WAA's expected format
+
+ Example usage in WAA run.py (auto-patched by Dockerfile):
+     if cfg_args["agent_name"] == "api-claude":
+         from mm_agents.api_agent import ApiAgent
+         agent = ApiAgent(provider="anthropic")
+     elif cfg_args["agent_name"] == "api-openai":
+         from mm_agents.api_agent import ApiAgent
+         agent = ApiAgent(provider="openai")
+ """
+
+ from __future__ import annotations
+
+ import base64
+ import logging
+ import os
+ import re
+ from io import BytesIO
+ from typing import Any, Dict, List
+
+ from PIL import Image
+
+ logger = logging.getLogger("desktopenv.agent.api")
+
+
+ # System prompt for GUI automation - adapted from APIBenchmarkAgent
+ SYSTEM_PROMPT = """You are a GUI automation agent controlling a Windows desktop. Given a screenshot and task instruction, determine the next action to take.
+
+ You must respond with a Python code block that uses the pyautogui API. Available functions:
+ - computer.click(x, y) - Click at pixel coordinates
+ - computer.double_click(x, y) - Double-click at pixel coordinates
+ - computer.right_click(x, y) - Right-click at pixel coordinates
+ - computer.type(text) - Type the given text
+ - computer.hotkey(key1, key2, ...) - Press key combination (e.g., 'ctrl', 'c')
+ - computer.press(key) - Press a single key (e.g., 'enter', 'tab', 'escape')
+ - computer.scroll(direction) - Scroll up (-3) or down (3)
+ - computer.drag(x1, y1, x2, y2) - Drag from (x1,y1) to (x2,y2)
+
+ Coordinates are pixel values within the screen (1920x1200 by default).
+
+ Format your response as:
+
+ ```memory
+ # Your notes about the task state (optional)
+ ```
+
+ ```decision
+ CONTINUE
+ ```
+
+ ```python
+ computer.click(500, 300)
+ ```
+
+ Important:
+ - Use DONE in the decision block when the task is complete
+ - Use FAIL if the task cannot be completed
+ - Always output exactly one action per response
+ - Click on UI elements by their visual center coordinates
+ - For text input, first click to focus the field, then type
+
+ Think step by step:
+ 1. What is the current state of the UI?
+ 2. What is the goal?
+ 3. What is the next logical action?
+ """
+
+
+ def format_accessibility_tree(tree: dict, indent: int = 0, max_depth: int = 5) -> str:
+     """Format accessibility tree for prompt.
+
+     Args:
+         tree: Accessibility tree dict from WAA.
+         indent: Current indentation level.
+         max_depth: Maximum depth to traverse.
+
+     Returns:
+         Formatted string representation.
+     """
+     if indent >= max_depth:
+         return ""
+
+     lines = []
+     prefix = " " * indent
+
+     role = tree.get("role", tree.get("control_type", "unknown"))
+     name = tree.get("name", "")
+     node_id = tree.get("id", tree.get("node_id", ""))
+
+     # Get bounding box if available
+     bbox_str = ""
+     if "bounding_rectangle" in tree:
+         br = tree["bounding_rectangle"]
+         bbox_str = f" [{br.get('left', 0)},{br.get('top', 0)},{br.get('right', 0)},{br.get('bottom', 0)}]"
+
+     line = f"{prefix}[{node_id}] {role}"
+     if name:
+         line += f": {name[:50]}"  # Truncate long names
+     if bbox_str:
+         line += bbox_str
+     lines.append(line)
+
+     for child in tree.get("children", []):
+         child_text = format_accessibility_tree(child, indent + 1, max_depth)
+         if child_text:
+             lines.append(child_text)
+
+     return "\n".join(lines)
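To illustrate the string this helper builds, a minimal sketch (the tree dict is hypothetical; its field names match those read above):

    # Illustrative only: a two-node a11y tree using the keys format_accessibility_tree reads.
    sample_tree = {
        "role": "window",
        "name": "Notepad",
        "id": "1",
        "bounding_rectangle": {"left": 0, "top": 0, "right": 1920, "bottom": 1200},
        "children": [
            {"control_type": "button", "name": "Close", "node_id": "2"},
        ],
    }
    print(format_accessibility_tree(sample_tree))
    # [1] window: Notepad [0,0,1920,1200]
    #  [2] button: Close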
+
+
+ def prev_actions_to_string(prev_actions: List[str], n_prev: int = 3) -> str:
+     """Format previous actions for the prompt.
+
+     Args:
+         prev_actions: List of previous action strings.
+         n_prev: Number of previous actions to include.
+
+     Returns:
+         Formatted string of previous actions.
+     """
+     result = ""
+     n_prev = min(n_prev, len(prev_actions))
+     for i in range(1, n_prev + 1):
+         action = prev_actions[-i]
+         result += f"Action at T-{i}:\n{action}\n\n"
+     return result
+
+
+ class ApiAgent:
+     """WAA-compatible agent that uses Claude or GPT-5.1 API directly.
+
+     This agent implements the same interface as NaviAgent but uses hosted
+     VLM APIs instead of the local Navi implementation (which has NoneType bugs).
+
+     Args:
+         provider: API provider - "anthropic" (Claude) or "openai" (GPT-5.1).
+         api_key: Optional API key. If not provided, uses environment variables.
+         model: Optional model name override.
+         temperature: Sampling temperature (0.0-1.0).
+         max_tokens: Maximum tokens for API response.
+         use_accessibility_tree: Whether to include a11y tree in prompts.
+         use_history: Whether to include action history in prompts.
+         demo: Optional demonstration trajectory to include at every step.
+             This is the key fix for 100% first-action / 0% episode success:
+             the demo must persist across ALL steps, not just step 1.
+     """
+
+     # Default models for each provider
+     DEFAULT_MODELS = {
+         "anthropic": "claude-sonnet-4-5-20250929",
+         "openai": "gpt-5.1",
+     }
+
+     def __init__(
+         self,
+         provider: str = "anthropic",
+         api_key: str | None = None,
+         model: str | None = None,
+         temperature: float = 0.5,
+         max_tokens: int = 1500,
+         use_accessibility_tree: bool = True,
+         use_history: bool = True,
+         demo: str | None = None,
+     ):
+         self.provider = provider
+         self.model = model or self.DEFAULT_MODELS.get(provider)
+         self.temperature = temperature
+         self.max_tokens = max_tokens
+         self.use_accessibility_tree = use_accessibility_tree
+         self.use_history = use_history
+         self.demo = demo  # Demo persists across ALL steps
+
+         # WAA compatibility
+         self.action_space = "code_block"
+
+         # Get API key
+         if provider == "anthropic":
+             self.api_key = api_key or os.getenv("ANTHROPIC_API_KEY")
+             if not self.api_key:
+                 raise RuntimeError(
+                     "ANTHROPIC_API_KEY is required for provider='anthropic'. "
+                     "Set it in environment or pass api_key parameter."
+                 )
+             try:
+                 from anthropic import Anthropic
+                 self._client = Anthropic(api_key=self.api_key)
+             except ImportError:
+                 raise RuntimeError(
+                     "anthropic package required. Install with: pip install anthropic"
+                 )
+
+         elif provider == "openai":
+             self.api_key = api_key or os.getenv("OPENAI_API_KEY")
+             if not self.api_key:
+                 raise RuntimeError(
+                     "OPENAI_API_KEY is required for provider='openai'. "
+                     "Set it in environment or pass api_key parameter."
+                 )
+             try:
+                 from openai import OpenAI
+                 self._client = OpenAI(api_key=self.api_key)
+             except ImportError:
+                 raise RuntimeError(
+                     "openai package required. Install with: pip install openai"
+                 )
+         else:
+             raise ValueError(f"Unsupported provider: {provider}")
+
+         # State tracking
+         self.prev_actions: List[str] = []  # Raw action codes for WAA compatibility
+         self.history: List[str] = []  # Rich history with reasoning (like PC Agent-E)
+         self.history_cutoff = 10  # Max history entries to include
+         self.memory_block_text = "# empty memory block"
+         self.step_counter = 0
+
+         logger.info(f"ApiAgent initialized with provider={provider}, model={self.model}")
+         if self.demo:
+             logger.info(f"Demo trajectory provided ({len(self.demo)} chars) - will persist across all steps")
+
+     def predict(self, instruction: str, obs: Dict) -> tuple:
+         """Predict the next action based on observation.
+
+         This method implements the same interface as NaviAgent.predict().
+
+         Args:
+             instruction: The task instruction.
+             obs: Observation dict containing:
+                 - screenshot: PNG bytes of current screen
+                 - accessibility_tree: A11y tree dict (optional)
+                 - window_title: Current window title
+                 - window_names_str: List of open windows
+                 - computer_clipboard: Current clipboard content
+
+         Returns:
+             Tuple of (response_text, actions_list, logs_dict, computer_update_args)
+         """
+         logs = {}
+         self.step_counter += 1
+
+         # Extract screenshot
+         screenshot_bytes = obs.get("screenshot")
+         if screenshot_bytes is None:
+             logger.error("No screenshot in observation")
+             return "", ["# No screenshot available"], logs, {}
+
+         # Convert screenshot to PIL Image
+         try:
+             image = Image.open(BytesIO(screenshot_bytes))
+             w, h = image.size
+         except Exception as e:
+             logger.error(f"Failed to load screenshot: {e}")
+             return "", ["# Failed to load screenshot"], logs, {}
+
+         logs["image_width"] = w
+         logs["image_height"] = h
+
+         # Build the prompt
+         content_parts = [f"TASK: {instruction}"]
+
+         # CRITICAL FIX: Include demo at EVERY step, not just step 1
+         # This is the key fix for 100% first-action / 0% episode success
+         if self.demo:
+             content_parts.append(
+                 f"DEMONSTRATION (follow this pattern):\n"
+                 f"---\n{self.demo}\n---\n"
+                 f"Use the demonstration above as a guide. You are currently at step {self.step_counter}."
+             )
+             logs["demo_included"] = True
+             logs["demo_length"] = len(self.demo)
+
+         # Add context
+         window_title = obs.get("window_title", "")
+         if window_title:
+             content_parts.append(f"Current window: {window_title}")
+             logs["window_title"] = window_title
+
+         window_names_str = obs.get("window_names_str", "")
+         if window_names_str:
+             content_parts.append(f"Open windows: {window_names_str}")
+             logs["window_names_str"] = window_names_str
+
+         clipboard = obs.get("computer_clipboard", "")
+         if clipboard:
+             content_parts.append(f"Clipboard: {clipboard[:100]}")
+             logs["computer_clipboard"] = clipboard
+
+         # Add accessibility tree if available and enabled
+         if self.use_accessibility_tree:
+             a11y_tree = obs.get("accessibility_tree")
+             if a11y_tree:
+                 tree_str = format_accessibility_tree(a11y_tree)
+                 # Truncate if too long
+                 if len(tree_str) > 4000:
+                     tree_str = tree_str[:4000] + "\n... (truncated)"
+                 content_parts.append(f"UI Elements:\n{tree_str}")
+                 logs["accessibility_tree_len"] = len(tree_str)
+
+         # Add action history if enabled (enhanced: includes reasoning, not just raw actions)
+         if self.use_history and self.history:
+             # Use rich history with reasoning (like PC Agent-E)
+             history_entries = self.history[-self.history_cutoff:]
+             history_str = "\n\n".join(
+                 f"[Step {i+1}] {entry}"
+                 for i, entry in enumerate(history_entries)
+             )
+             content_parts.append(f"History of previous steps:\n{history_str}")
+             logs["history_entries"] = len(history_entries)
+         elif self.use_history and self.prev_actions:
+             # Fallback to raw action history
+             history_str = prev_actions_to_string(self.prev_actions, n_prev=5)
+             content_parts.append(f"Previous actions:\n{history_str}")
+
+         # Add memory block
+         content_parts.append(f"Your memory:\n```memory\n{self.memory_block_text}\n```")
+
+         content_parts.append(f"\nScreen dimensions: {w}x{h} pixels")
+         content_parts.append("\nWhat is the next action?")
+
+         user_prompt = "\n\n".join(content_parts)
+         logs["user_question"] = user_prompt
+
+         # Call the API
+         try:
+             response_text = self._call_api(screenshot_bytes, user_prompt)
+         except Exception as e:
+             logger.error(f"API call failed: {e}")
+             return "", ["# API call failed"], logs, {}
+
+         logs["plan_result"] = response_text
+
+         # Extract memory block
+         memory_match = re.search(r"```memory\n(.*?)```", response_text, re.DOTALL)
+         if memory_match:
+             self.memory_block_text = memory_match.group(1).strip()
+
+         # Extract decision block
+         decision_match = re.search(r"```decision\n(.*?)```", response_text, re.DOTALL)
+         if decision_match:
+             decision = decision_match.group(1).strip().upper()
+             if "DONE" in decision:
+                 self.prev_actions.append("DONE")
+                 return "", ["DONE"], logs, {}
+             elif "FAIL" in decision:
+                 self.prev_actions.append("FAIL")
+                 return "", ["FAIL"], logs, {}
+             elif "WAIT" in decision:
+                 self.prev_actions.append("WAIT")
+                 return "", ["WAIT"], logs, {}
+
+         # Extract Python code block
+         code_match = re.search(r"```python\n(.*?)```", response_text, re.DOTALL)
+         if code_match:
+             code_text = code_match.group(1).strip()
+             actions = [code_text]
+             self.prev_actions.append(code_text)
+             # Store rich history with reasoning (memory + action)
+             self._add_to_history(f"Thought: {self.memory_block_text}\nAction: {code_text}")
+         else:
+             # Try to extract action from response text
+             action = self._parse_action_from_text(response_text, w, h)
+             if action:
+                 actions = [action]
+                 self.prev_actions.append(action)
+                 self._add_to_history(f"Thought: {self.memory_block_text}\nAction: {action}")
+             else:
+                 logger.warning("Could not extract action from response")
+                 actions = ["# Could not parse action"]
+
+         # Build computer_update_args (for WAA compatibility)
+         computer_update_args = {
+             "rects": [],
+             "window_rect": [0, 0, w, h],
+             "screenshot": image,
+             "scale": (1.0, 1.0),
+             "clipboard_content": clipboard,
+             "swap_ctrl_alt": False,
+         }
+
+         return "", actions, logs, computer_update_args
+
+     def _call_api(self, screenshot_bytes: bytes, user_prompt: str) -> str:
+         """Call the VLM API with screenshot and prompt.
+
+         Args:
+             screenshot_bytes: PNG image bytes.
+             user_prompt: User prompt text.
+
+         Returns:
+             Response text from the API.
+         """
+         image_b64 = base64.b64encode(screenshot_bytes).decode("utf-8")
+
+         if self.provider == "anthropic":
+             content = [
+                 {"type": "text", "text": user_prompt},
+                 {
+                     "type": "image",
+                     "source": {
+                         "type": "base64",
+                         "media_type": "image/png",
+                         "data": image_b64,
+                     },
+                 },
+             ]
+
+             resp = self._client.messages.create(
+                 model=self.model,
+                 max_tokens=self.max_tokens,
+                 system=SYSTEM_PROMPT,
+                 messages=[{"role": "user", "content": content}],
+             )
+
+             # Extract text from response
+             parts = getattr(resp, "content", [])
+             texts = [
+                 getattr(p, "text", "")
+                 for p in parts
+                 if getattr(p, "type", "") == "text"
+             ]
+             return "\n".join([t for t in texts if t]).strip()
+
+         elif self.provider == "openai":
+             messages = [
+                 {"role": "system", "content": SYSTEM_PROMPT},
+                 {
+                     "role": "user",
+                     "content": [
+                         {"type": "text", "text": user_prompt},
+                         {
+                             "type": "image_url",
+                             "image_url": {"url": f"data:image/png;base64,{image_b64}"},
+                         },
+                     ],
+                 },
+             ]
+
+             resp = self._client.chat.completions.create(
+                 model=self.model,
+                 messages=messages,
+                 max_completion_tokens=self.max_tokens,
+                 temperature=self.temperature,
+             )
+             return resp.choices[0].message.content or ""
+
+         raise ValueError(f"Unsupported provider: {self.provider}")
+
+     def _parse_action_from_text(self, text: str, width: int, height: int) -> str | None:
+         """Try to parse an action from free-form text response.
+
+         Args:
+             text: Response text to parse.
+             width: Screen width.
+             height: Screen height.
+
+         Returns:
+             Python code string or None if parsing failed.
+         """
+         # Try to find click coordinates
+         click_match = re.search(
+             r"click.*?(\d+)\s*,\s*(\d+)", text, re.IGNORECASE
+         )
+         if click_match:
+             x, y = int(click_match.group(1)), int(click_match.group(2))
+             return f"computer.click({x}, {y})"
+
+         # Try to find type text
+         type_match = re.search(
+             r'type[:\s]+["\'](.+?)["\']', text, re.IGNORECASE
+         )
+         if type_match:
+             text_to_type = type_match.group(1)
+             return f'computer.type("{text_to_type}")'
+
+         # Try to find key press
+         key_match = re.search(
+             r"press[:\s]+(\w+)", text, re.IGNORECASE
+         )
+         if key_match:
+             key = key_match.group(1).lower()
+             return f'computer.press("{key}")'
+
+         # Try to find hotkey
+         hotkey_match = re.search(
+             r"hotkey[:\s]+(\w+)\s*\+\s*(\w+)", text, re.IGNORECASE
+         )
+         if hotkey_match:
+             key1, key2 = hotkey_match.group(1).lower(), hotkey_match.group(2).lower()
+             return f'computer.hotkey("{key1}", "{key2}")'
+
+         return None
+
+     def _add_to_history(self, entry: str) -> None:
+         """Add an entry to the rich history (reasoning + action)."""
+         self.history.append(entry)
+
+     def set_demo(self, demo: str) -> None:
+         """Set or update the demo trajectory.
+
+         This allows setting the demo after initialization,
+         useful for dynamic demo retrieval.
+         """
+         self.demo = demo
+         logger.info(f"Demo set ({len(demo)} chars) - will persist across all steps")
+
+     def reset(self) -> None:
+         """Reset agent state between tasks."""
+         self.prev_actions = []
+         self.history = []  # Clear rich history too
+         self.memory_block_text = "# empty memory block"
+         self.step_counter = 0
+         # Note: demo is NOT reset - it persists across resets if set
+         logger.info("ApiAgent reset")
openadapt_ml/benchmarks/waa_deploy/start_waa_server.bat (new file)
@@ -0,0 +1,53 @@
+ @echo off
+ REM start_waa_server.bat - Start WAA Flask server on Windows boot
+ REM This script ensures the WAA server starts automatically on every boot
+
+ echo [WAA Startup] Starting WAA server...
+
+ REM Wait for network to be available
+ ping -n 5 127.0.0.1 > nul
+
+ REM Check if server is already running
+ netstat -an | find ":5000" | find "LISTENING" > nul
+ if %errorlevel% == 0 (
+     echo [WAA Startup] Server already running on port 5000
+     exit /b 0
+ )
+
+ REM Try multiple possible server locations
+ REM Location 1: OEM server path (official WAA location)
+ if exist "C:\oem\server\main.py" (
+     cd /d C:\oem\server
+     start /b python main.py
+     echo [WAA Startup] Started from C:\oem\server
+     exit /b 0
+ )
+
+ REM Location 2: Network share (Samba)
+ if exist "\\host.lan\Data\server\main.py" (
+     cd /d \\host.lan\Data\server
+     start /b python main.py
+     echo [WAA Startup] Started from network share
+     exit /b 0
+ )
+
+ REM Location 3: Legacy path
+ if exist "C:\waa\server\main.py" (
+     cd /d C:\waa\server
+     start /b python main.py
+     echo [WAA Startup] Started from C:\waa\server
+     exit /b 0
+ )
+
+ REM If none found, try running from network directly
+ echo [WAA Startup] Trying network server path...
+ cd /d \\host.lan\Data\server 2>nul
+ if %errorlevel% == 0 (
+     start /b python main.py
+     echo [WAA Startup] Started from network path
+     exit /b 0
+ )
+
+ echo [WAA Startup] ERROR: WAA server not found in any expected location
+ echo Checked: C:\oem\server, \\host.lan\Data\server, C:\waa\server
+ exit /b 1
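The netstat check above verifies the port only from inside the VM. A minimal host-side sketch of the same check (Python; the VM address is a placeholder):

    # Illustrative only: confirm something is listening on the WAA server port (5000),
    # mirroring the netstat check in the startup script.
    import socket

    VM_IP = "192.168.0.2"  # hypothetical address; substitute the VM's actual IP

    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        sock.settimeout(5)
        try:
            sock.connect((VM_IP, 5000))
            print("WAA server port 5000 is reachable")
        except OSError as exc:
            print(f"WAA server not reachable: {exc}")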