openadapt-ml 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openadapt_ml/benchmarks/__init__.py +8 -0
- openadapt_ml/benchmarks/agent.py +90 -11
- openadapt_ml/benchmarks/azure.py +35 -6
- openadapt_ml/benchmarks/cli.py +4449 -201
- openadapt_ml/benchmarks/live_tracker.py +180 -0
- openadapt_ml/benchmarks/runner.py +41 -4
- openadapt_ml/benchmarks/viewer.py +1219 -0
- openadapt_ml/benchmarks/vm_monitor.py +610 -0
- openadapt_ml/benchmarks/waa.py +61 -4
- openadapt_ml/benchmarks/waa_deploy/Dockerfile +222 -0
- openadapt_ml/benchmarks/waa_deploy/__init__.py +10 -0
- openadapt_ml/benchmarks/waa_deploy/api_agent.py +539 -0
- openadapt_ml/benchmarks/waa_deploy/start_waa_server.bat +53 -0
- openadapt_ml/benchmarks/waa_live.py +619 -0
- openadapt_ml/cloud/local.py +1555 -1
- openadapt_ml/cloud/ssh_tunnel.py +553 -0
- openadapt_ml/datasets/next_action.py +87 -68
- openadapt_ml/evals/grounding.py +26 -8
- openadapt_ml/evals/trajectory_matching.py +84 -36
- openadapt_ml/experiments/demo_prompt/__init__.py +19 -0
- openadapt_ml/experiments/demo_prompt/format_demo.py +226 -0
- openadapt_ml/experiments/demo_prompt/results/experiment_20251231_002125.json +83 -0
- openadapt_ml/experiments/demo_prompt/results/experiment_n30_20251231_165958.json +1100 -0
- openadapt_ml/experiments/demo_prompt/results/multistep_20251231_025051.json +182 -0
- openadapt_ml/experiments/demo_prompt/run_experiment.py +531 -0
- openadapt_ml/experiments/waa_demo/__init__.py +10 -0
- openadapt_ml/experiments/waa_demo/demos.py +357 -0
- openadapt_ml/experiments/waa_demo/runner.py +717 -0
- openadapt_ml/experiments/waa_demo/tasks.py +151 -0
- openadapt_ml/export/__init__.py +9 -0
- openadapt_ml/export/__main__.py +6 -0
- openadapt_ml/export/cli.py +89 -0
- openadapt_ml/export/parquet.py +265 -0
- openadapt_ml/ingest/__init__.py +3 -4
- openadapt_ml/ingest/capture.py +89 -81
- openadapt_ml/ingest/loader.py +116 -68
- openadapt_ml/ingest/synthetic.py +221 -159
- openadapt_ml/retrieval/README.md +226 -0
- openadapt_ml/retrieval/USAGE.md +391 -0
- openadapt_ml/retrieval/__init__.py +91 -0
- openadapt_ml/retrieval/demo_retriever.py +817 -0
- openadapt_ml/retrieval/embeddings.py +629 -0
- openadapt_ml/retrieval/index.py +194 -0
- openadapt_ml/retrieval/retriever.py +160 -0
- openadapt_ml/runtime/policy.py +10 -10
- openadapt_ml/schema/__init__.py +104 -0
- openadapt_ml/schema/converters.py +541 -0
- openadapt_ml/schema/episode.py +457 -0
- openadapt_ml/scripts/compare.py +26 -16
- openadapt_ml/scripts/eval_policy.py +4 -5
- openadapt_ml/scripts/prepare_synthetic.py +14 -17
- openadapt_ml/scripts/train.py +81 -70
- openadapt_ml/training/benchmark_viewer.py +3225 -0
- openadapt_ml/training/trainer.py +120 -363
- openadapt_ml/training/trl_trainer.py +354 -0
- {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.0.dist-info}/METADATA +102 -60
- openadapt_ml-0.2.0.dist-info/RECORD +86 -0
- openadapt_ml/schemas/__init__.py +0 -53
- openadapt_ml/schemas/sessions.py +0 -122
- openadapt_ml/schemas/validation.py +0 -252
- openadapt_ml-0.1.0.dist-info/RECORD +0 -55
- {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.0.dist-info}/WHEEL +0 -0
- {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.0.dist-info}/licenses/LICENSE +0 -0
openadapt_ml/benchmarks/waa_deploy/api_agent.py
@@ -0,0 +1,539 @@
+"""WAA-compatible API Agent that uses Claude Sonnet 4.5 or GPT-5.1 directly.
+
+This module provides a drop-in replacement for the Navi agent in Windows Agent Arena
+that uses hosted VLM APIs (Claude or GPT-5.1) instead of the buggy Navi agent.
+
+The agent receives observations from WAA and returns actions in WAA's expected format
+(code blocks for the pyautogui action space).
+
+Why this exists:
+    The default Navi agent in WAA has NoneType errors and other bugs.
+    This API agent provides a reliable alternative that uses Claude Sonnet 4.5
+    or GPT-5.1 directly, bypassing the problematic Navi implementation.
+
+Usage from CLI:
+    # Run with Claude Sonnet 4.5 (requires ANTHROPIC_API_KEY)
+    uv run python -m openadapt_ml.benchmarks.cli vm run-waa --agent api-claude --num-tasks 5
+
+    # Run with GPT-5.1 (requires OPENAI_API_KEY)
+    uv run python -m openadapt_ml.benchmarks.cli vm run-waa --agent api-openai --num-tasks 5
+
+How it works:
+    1. The Dockerfile copies this file to /client/mm_agents/api_agent.py
+    2. The Dockerfile patches run.py to recognize "api-claude" and "api-openai" agents
+    3. When the agent is selected, it:
+       - Receives screenshots from WAA's DesktopEnv
+       - Sends them to Claude or GPT-5.1 via their respective APIs
+       - Parses the response into pyautogui code blocks
+       - Returns actions in WAA's expected format
+
+Example usage in WAA run.py (auto-patched by Dockerfile):
+    if cfg_args["agent_name"] == "api-claude":
+        from mm_agents.api_agent import ApiAgent
+        agent = ApiAgent(provider="anthropic")
+    elif cfg_args["agent_name"] == "api-openai":
+        from mm_agents.api_agent import ApiAgent
+        agent = ApiAgent(provider="openai")
+"""
+
+from __future__ import annotations
+
+import base64
+import logging
+import os
+import re
+from io import BytesIO
+from typing import Any, Dict, List
+
+from PIL import Image
+
+logger = logging.getLogger("desktopenv.agent.api")
+
+
+# System prompt for GUI automation - adapted from APIBenchmarkAgent
+SYSTEM_PROMPT = """You are a GUI automation agent controlling a Windows desktop. Given a screenshot and task instruction, determine the next action to take.
+
+You must respond with a Python code block that uses the pyautogui API. Available functions:
+- computer.click(x, y) - Click at pixel coordinates
+- computer.double_click(x, y) - Double-click at pixel coordinates
+- computer.right_click(x, y) - Right-click at pixel coordinates
+- computer.type(text) - Type the given text
+- computer.hotkey(key1, key2, ...) - Press key combination (e.g., 'ctrl', 'c')
+- computer.press(key) - Press a single key (e.g., 'enter', 'tab', 'escape')
+- computer.scroll(direction) - Scroll up (-3) or down (3)
+- computer.drag(x1, y1, x2, y2) - Drag from (x1,y1) to (x2,y2)
+
+Coordinates are pixel values within the screen (1920x1200 by default).
+
+Format your response as:
+
+```memory
+# Your notes about the task state (optional)
+```
+
+```decision
+CONTINUE
+```
+
+```python
+computer.click(500, 300)
+```
+
+Important:
+- Use DONE in the decision block when the task is complete
+- Use FAIL if the task cannot be completed
+- Always output exactly one action per response
+- Click on UI elements by their visual center coordinates
+- For text input, first click to focus the field, then type
+
+Think step by step:
+1. What is the current state of the UI?
+2. What is the goal?
+3. What is the next logical action?
+"""
+
+
+def format_accessibility_tree(tree: dict, indent: int = 0, max_depth: int = 5) -> str:
+    """Format accessibility tree for prompt.
+
+    Args:
+        tree: Accessibility tree dict from WAA.
+        indent: Current indentation level.
+        max_depth: Maximum depth to traverse.
+
+    Returns:
+        Formatted string representation.
+    """
+    if indent >= max_depth:
+        return ""
+
+    lines = []
+    prefix = "  " * indent
+
+    role = tree.get("role", tree.get("control_type", "unknown"))
+    name = tree.get("name", "")
+    node_id = tree.get("id", tree.get("node_id", ""))
+
+    # Get bounding box if available
+    bbox_str = ""
+    if "bounding_rectangle" in tree:
+        br = tree["bounding_rectangle"]
+        bbox_str = f" [{br.get('left', 0)},{br.get('top', 0)},{br.get('right', 0)},{br.get('bottom', 0)}]"
+
+    line = f"{prefix}[{node_id}] {role}"
+    if name:
+        line += f": {name[:50]}"  # Truncate long names
+    if bbox_str:
+        line += bbox_str
+    lines.append(line)
+
+    for child in tree.get("children", []):
+        child_text = format_accessibility_tree(child, indent + 1, max_depth)
+        if child_text:
+            lines.append(child_text)
+
+    return "\n".join(lines)
+
+
+def prev_actions_to_string(prev_actions: List[str], n_prev: int = 3) -> str:
+    """Format previous actions for the prompt.
+
+    Args:
+        prev_actions: List of previous action strings.
+        n_prev: Number of previous actions to include.
+
+    Returns:
+        Formatted string of previous actions.
+    """
+    result = ""
+    n_prev = min(n_prev, len(prev_actions))
+    for i in range(1, n_prev + 1):
+        action = prev_actions[-i]
+        result += f"Action at T-{i}:\n{action}\n\n"
+    return result
+
+
+class ApiAgent:
+    """WAA-compatible agent that uses Claude or GPT-5.1 API directly.
+
+    This agent implements the same interface as NaviAgent but uses hosted
+    VLM APIs instead of the local Navi implementation (which has NoneType bugs).
+
+    Args:
+        provider: API provider - "anthropic" (Claude) or "openai" (GPT-5.1).
+        api_key: Optional API key. If not provided, uses environment variables.
+        model: Optional model name override.
+        temperature: Sampling temperature (0.0-1.0).
+        max_tokens: Maximum tokens for API response.
+        use_accessibility_tree: Whether to include a11y tree in prompts.
+        use_history: Whether to include action history in prompts.
+        demo: Optional demonstration trajectory to include at every step.
+            This is the key fix for 100% first-action / 0% episode success:
+            the demo must persist across ALL steps, not just step 1.
+    """
+
+    # Default models for each provider
+    DEFAULT_MODELS = {
+        "anthropic": "claude-sonnet-4-5-20250929",
+        "openai": "gpt-5.1",
+    }
+
+    def __init__(
+        self,
+        provider: str = "anthropic",
+        api_key: str | None = None,
+        model: str | None = None,
+        temperature: float = 0.5,
+        max_tokens: int = 1500,
+        use_accessibility_tree: bool = True,
+        use_history: bool = True,
+        demo: str | None = None,
+    ):
+        self.provider = provider
+        self.model = model or self.DEFAULT_MODELS.get(provider)
+        self.temperature = temperature
+        self.max_tokens = max_tokens
+        self.use_accessibility_tree = use_accessibility_tree
+        self.use_history = use_history
+        self.demo = demo  # Demo persists across ALL steps
+
+        # WAA compatibility
+        self.action_space = "code_block"
+
+        # Get API key
+        if provider == "anthropic":
+            self.api_key = api_key or os.getenv("ANTHROPIC_API_KEY")
+            if not self.api_key:
+                raise RuntimeError(
+                    "ANTHROPIC_API_KEY is required for provider='anthropic'. "
+                    "Set it in environment or pass api_key parameter."
+                )
+            try:
+                from anthropic import Anthropic
+                self._client = Anthropic(api_key=self.api_key)
+            except ImportError:
+                raise RuntimeError(
+                    "anthropic package required. Install with: pip install anthropic"
+                )
+
+        elif provider == "openai":
+            self.api_key = api_key or os.getenv("OPENAI_API_KEY")
+            if not self.api_key:
+                raise RuntimeError(
+                    "OPENAI_API_KEY is required for provider='openai'. "
+                    "Set it in environment or pass api_key parameter."
+                )
+            try:
+                from openai import OpenAI
+                self._client = OpenAI(api_key=self.api_key)
+            except ImportError:
+                raise RuntimeError(
+                    "openai package required. Install with: pip install openai"
+                )
+        else:
+            raise ValueError(f"Unsupported provider: {provider}")
+
+        # State tracking
+        self.prev_actions: List[str] = []  # Raw action codes for WAA compatibility
+        self.history: List[str] = []  # Rich history with reasoning (like PC Agent-E)
+        self.history_cutoff = 10  # Max history entries to include
+        self.memory_block_text = "# empty memory block"
+        self.step_counter = 0
+
+        logger.info(f"ApiAgent initialized with provider={provider}, model={self.model}")
+        if self.demo:
+            logger.info(f"Demo trajectory provided ({len(self.demo)} chars) - will persist across all steps")
+
+    def predict(self, instruction: str, obs: Dict) -> tuple:
+        """Predict the next action based on observation.
+
+        This method implements the same interface as NaviAgent.predict().
+
+        Args:
+            instruction: The task instruction.
+            obs: Observation dict containing:
+                - screenshot: PNG bytes of current screen
+                - accessibility_tree: A11y tree dict (optional)
+                - window_title: Current window title
+                - window_names_str: List of open windows
+                - computer_clipboard: Current clipboard content
+
+        Returns:
+            Tuple of (response_text, actions_list, logs_dict, computer_update_args)
+        """
+        logs = {}
+        self.step_counter += 1
+
+        # Extract screenshot
+        screenshot_bytes = obs.get("screenshot")
+        if screenshot_bytes is None:
+            logger.error("No screenshot in observation")
+            return "", ["# No screenshot available"], logs, {}
+
+        # Convert screenshot to PIL Image
+        try:
+            image = Image.open(BytesIO(screenshot_bytes))
+            w, h = image.size
+        except Exception as e:
+            logger.error(f"Failed to load screenshot: {e}")
+            return "", ["# Failed to load screenshot"], logs, {}
+
+        logs["image_width"] = w
+        logs["image_height"] = h
+
+        # Build the prompt
+        content_parts = [f"TASK: {instruction}"]
+
+        # CRITICAL FIX: Include demo at EVERY step, not just step 1
+        # This is the key fix for 100% first-action / 0% episode success
+        if self.demo:
+            content_parts.append(
+                f"DEMONSTRATION (follow this pattern):\n"
+                f"---\n{self.demo}\n---\n"
+                f"Use the demonstration above as a guide. You are currently at step {self.step_counter}."
+            )
+            logs["demo_included"] = True
+            logs["demo_length"] = len(self.demo)
+
+        # Add context
+        window_title = obs.get("window_title", "")
+        if window_title:
+            content_parts.append(f"Current window: {window_title}")
+            logs["window_title"] = window_title
+
+        window_names_str = obs.get("window_names_str", "")
+        if window_names_str:
+            content_parts.append(f"Open windows: {window_names_str}")
+            logs["window_names_str"] = window_names_str
+
+        clipboard = obs.get("computer_clipboard", "")
+        if clipboard:
+            content_parts.append(f"Clipboard: {clipboard[:100]}")
+            logs["computer_clipboard"] = clipboard
+
+        # Add accessibility tree if available and enabled
+        if self.use_accessibility_tree:
+            a11y_tree = obs.get("accessibility_tree")
+            if a11y_tree:
+                tree_str = format_accessibility_tree(a11y_tree)
+                # Truncate if too long
+                if len(tree_str) > 4000:
+                    tree_str = tree_str[:4000] + "\n... (truncated)"
+                content_parts.append(f"UI Elements:\n{tree_str}")
+                logs["accessibility_tree_len"] = len(tree_str)
+
+        # Add action history if enabled (enhanced: includes reasoning, not just raw actions)
+        if self.use_history and self.history:
+            # Use rich history with reasoning (like PC Agent-E)
+            history_entries = self.history[-self.history_cutoff:]
+            history_str = "\n\n".join(
+                f"[Step {i+1}] {entry}"
+                for i, entry in enumerate(history_entries)
+            )
+            content_parts.append(f"History of previous steps:\n{history_str}")
+            logs["history_entries"] = len(history_entries)
+        elif self.use_history and self.prev_actions:
+            # Fallback to raw action history
+            history_str = prev_actions_to_string(self.prev_actions, n_prev=5)
+            content_parts.append(f"Previous actions:\n{history_str}")
+
+        # Add memory block
+        content_parts.append(f"Your memory:\n```memory\n{self.memory_block_text}\n```")
+
+        content_parts.append(f"\nScreen dimensions: {w}x{h} pixels")
+        content_parts.append("\nWhat is the next action?")
+
+        user_prompt = "\n\n".join(content_parts)
+        logs["user_question"] = user_prompt
+
+        # Call the API
+        try:
+            response_text = self._call_api(screenshot_bytes, user_prompt)
+        except Exception as e:
+            logger.error(f"API call failed: {e}")
+            return "", ["# API call failed"], logs, {}
+
+        logs["plan_result"] = response_text
+
+        # Extract memory block
+        memory_match = re.search(r"```memory\n(.*?)```", response_text, re.DOTALL)
+        if memory_match:
+            self.memory_block_text = memory_match.group(1).strip()
+
+        # Extract decision block
+        decision_match = re.search(r"```decision\n(.*?)```", response_text, re.DOTALL)
+        if decision_match:
+            decision = decision_match.group(1).strip().upper()
+            if "DONE" in decision:
+                self.prev_actions.append("DONE")
+                return "", ["DONE"], logs, {}
+            elif "FAIL" in decision:
+                self.prev_actions.append("FAIL")
+                return "", ["FAIL"], logs, {}
+            elif "WAIT" in decision:
+                self.prev_actions.append("WAIT")
+                return "", ["WAIT"], logs, {}
+
+        # Extract Python code block
+        code_match = re.search(r"```python\n(.*?)```", response_text, re.DOTALL)
+        if code_match:
+            code_text = code_match.group(1).strip()
+            actions = [code_text]
+            self.prev_actions.append(code_text)
+            # Store rich history with reasoning (memory + action)
+            self._add_to_history(f"Thought: {self.memory_block_text}\nAction: {code_text}")
+        else:
+            # Try to extract action from response text
+            action = self._parse_action_from_text(response_text, w, h)
+            if action:
+                actions = [action]
+                self.prev_actions.append(action)
+                self._add_to_history(f"Thought: {self.memory_block_text}\nAction: {action}")
+            else:
+                logger.warning("Could not extract action from response")
+                actions = ["# Could not parse action"]
+
+        # Build computer_update_args (for WAA compatibility)
+        computer_update_args = {
+            "rects": [],
+            "window_rect": [0, 0, w, h],
+            "screenshot": image,
+            "scale": (1.0, 1.0),
+            "clipboard_content": clipboard,
+            "swap_ctrl_alt": False,
+        }
+
+        return "", actions, logs, computer_update_args
+
+    def _call_api(self, screenshot_bytes: bytes, user_prompt: str) -> str:
+        """Call the VLM API with screenshot and prompt.
+
+        Args:
+            screenshot_bytes: PNG image bytes.
+            user_prompt: User prompt text.
+
+        Returns:
+            Response text from the API.
+        """
+        image_b64 = base64.b64encode(screenshot_bytes).decode("utf-8")
+
+        if self.provider == "anthropic":
+            content = [
+                {"type": "text", "text": user_prompt},
+                {
+                    "type": "image",
+                    "source": {
+                        "type": "base64",
+                        "media_type": "image/png",
+                        "data": image_b64,
+                    },
+                },
+            ]
+
+            resp = self._client.messages.create(
+                model=self.model,
+                max_tokens=self.max_tokens,
+                system=SYSTEM_PROMPT,
+                messages=[{"role": "user", "content": content}],
+            )
+
+            # Extract text from response
+            parts = getattr(resp, "content", [])
+            texts = [
+                getattr(p, "text", "")
+                for p in parts
+                if getattr(p, "type", "") == "text"
+            ]
+            return "\n".join([t for t in texts if t]).strip()
+
+        elif self.provider == "openai":
+            messages = [
+                {"role": "system", "content": SYSTEM_PROMPT},
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": user_prompt},
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": f"data:image/png;base64,{image_b64}"},
+                        },
+                    ],
+                },
+            ]
+
+            resp = self._client.chat.completions.create(
+                model=self.model,
+                messages=messages,
+                max_completion_tokens=self.max_tokens,
+                temperature=self.temperature,
+            )
+            return resp.choices[0].message.content or ""
+
+        raise ValueError(f"Unsupported provider: {self.provider}")
+
+    def _parse_action_from_text(self, text: str, width: int, height: int) -> str | None:
+        """Try to parse an action from free-form text response.
+
+        Args:
+            text: Response text to parse.
+            width: Screen width.
+            height: Screen height.
+
+        Returns:
+            Python code string or None if parsing failed.
+        """
+        # Try to find click coordinates
+        click_match = re.search(
+            r"click.*?(\d+)\s*,\s*(\d+)", text, re.IGNORECASE
+        )
+        if click_match:
+            x, y = int(click_match.group(1)), int(click_match.group(2))
+            return f"computer.click({x}, {y})"
+
+        # Try to find type text
+        type_match = re.search(
+            r'type[:\s]+["\'](.+?)["\']', text, re.IGNORECASE
+        )
+        if type_match:
+            text_to_type = type_match.group(1)
+            return f'computer.type("{text_to_type}")'
+
+        # Try to find key press
+        key_match = re.search(
+            r"press[:\s]+(\w+)", text, re.IGNORECASE
+        )
+        if key_match:
+            key = key_match.group(1).lower()
+            return f'computer.press("{key}")'
+
+        # Try to find hotkey
+        hotkey_match = re.search(
+            r"hotkey[:\s]+(\w+)\s*\+\s*(\w+)", text, re.IGNORECASE
+        )
+        if hotkey_match:
+            key1, key2 = hotkey_match.group(1).lower(), hotkey_match.group(2).lower()
+            return f'computer.hotkey("{key1}", "{key2}")'
+
+        return None
+
+    def _add_to_history(self, entry: str) -> None:
+        """Add an entry to the rich history (reasoning + action)."""
+        self.history.append(entry)
+
+    def set_demo(self, demo: str) -> None:
+        """Set or update the demo trajectory.
+
+        This allows setting the demo after initialization,
+        useful for dynamic demo retrieval.
+        """
+        self.demo = demo
+        logger.info(f"Demo set ({len(demo)} chars) - will persist across all steps")
+
+    def reset(self) -> None:
+        """Reset agent state between tasks."""
+        self.prev_actions = []
+        self.history = []  # Clear rich history too
+        self.memory_block_text = "# empty memory block"
+        self.step_counter = 0
+        # Note: demo is NOT reset - it persists across resets if set
+        logger.info("ApiAgent reset")
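For reference, here is a minimal sketch of driving the agent outside WAA's harness, using a synthetic observation shaped like the one the docstring describes. The blank screenshot, window fields, and task string are stand-ins rather than real WAA output; inside the container the import path is `mm_agents.api_agent`, per the Dockerfile notes above, and `ANTHROPIC_API_KEY` must be set.

```python
# Hypothetical driver: exercises ApiAgent.predict() with a synthetic observation.
# All obs values below are stand-ins, not real DesktopEnv output.
from io import BytesIO

from PIL import Image

from mm_agents.api_agent import ApiAgent  # path inside the WAA container

# A blank 1920x1200 PNG stands in for a real screenshot.
buf = BytesIO()
Image.new("RGB", (1920, 1200), "white").save(buf, format="PNG")

agent = ApiAgent(provider="anthropic")  # raises unless ANTHROPIC_API_KEY is set
obs = {
    "screenshot": buf.getvalue(),
    "accessibility_tree": None,  # optional; a dict enables the UI-elements section
    "window_title": "Untitled - Notepad",
    "window_names_str": "Notepad",
    "computer_clipboard": "",
}
response_text, actions, logs, update_args = agent.predict("Type 'hello' in Notepad", obs)
print(actions)  # e.g. ['computer.click(960, 600)'], ['DONE'], or ['FAIL']
```

Each `predict()` call appends to `prev_actions` and `history`, so `reset()` should be called between tasks; the demo, if set, deliberately survives the reset.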
openadapt_ml/benchmarks/waa_deploy/start_waa_server.bat
@@ -0,0 +1,53 @@
+@echo off
+REM start_waa_server.bat - Start WAA Flask server on Windows boot
+REM This script ensures the WAA server starts automatically on every boot
+
+echo [WAA Startup] Starting WAA server...
+
+REM Wait for network to be available
+ping -n 5 127.0.0.1 > nul
+
+REM Check if server is already running
+netstat -an | find ":5000" | find "LISTENING" > nul
+if %errorlevel% == 0 (
+    echo [WAA Startup] Server already running on port 5000
+    exit /b 0
+)
+
+REM Try multiple possible server locations
+REM Location 1: OEM server path (official WAA location)
+if exist "C:\oem\server\main.py" (
+    cd /d C:\oem\server
+    start /b python main.py
+    echo [WAA Startup] Started from C:\oem\server
+    exit /b 0
+)
+
+REM Location 2: Network share (Samba)
+if exist "\\host.lan\Data\server\main.py" (
+    cd /d \\host.lan\Data\server
+    start /b python main.py
+    echo [WAA Startup] Started from network share
+    exit /b 0
+)
+
+REM Location 3: Legacy path
+if exist "C:\waa\server\main.py" (
+    cd /d C:\waa\server
+    start /b python main.py
+    echo [WAA Startup] Started from C:\waa\server
+    exit /b 0
+)
+
+REM If none found, try running from network directly
+echo [WAA Startup] Trying network server path...
+cd /d \\host.lan\Data\server 2>nul
+if %errorlevel% == 0 (
+    start /b python main.py
+    echo [WAA Startup] Started from network path
+    exit /b 0
+)
+
+echo [WAA Startup] ERROR: WAA server not found in any expected location
+echo Checked: C:\oem\server, \\host.lan\Data\server, C:\waa\server
+exit /b 1