orgo 0.0.35__py3-none-any.whl → 0.0.37__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- orgo/computer.py +161 -119
- orgo/prompt.py +775 -379
- {orgo-0.0.35.dist-info → orgo-0.0.37.dist-info}/METADATA +5 -3
- {orgo-0.0.35.dist-info → orgo-0.0.37.dist-info}/RECORD +6 -6
- {orgo-0.0.35.dist-info → orgo-0.0.37.dist-info}/WHEEL +0 -0
- {orgo-0.0.35.dist-info → orgo-0.0.37.dist-info}/top_level.txt +0 -0
orgo/prompt.py
CHANGED
|
@@ -1,452 +1,848 @@
|
|
|
1
1
|
# src/orgo/prompt.py
|
|
2
2
|
"""
|
|
3
|
-
Prompt
|
|
3
|
+
Orgo Prompt Module - AI-powered computer control.
|
|
4
|
+
|
|
5
|
+
Usage:
|
|
6
|
+
computer.prompt("Open Firefox") # Uses Orgo (default)
|
|
7
|
+
computer.prompt("Open Firefox", provider="anthropic") # Uses Anthropic directly
|
|
4
8
|
"""
|
|
5
9
|
|
|
6
10
|
import os
|
|
11
|
+
import sys
|
|
12
|
+
import json
|
|
7
13
|
import base64
|
|
8
|
-
|
|
14
|
+
import time
|
|
15
|
+
import logging
|
|
16
|
+
from datetime import datetime
|
|
17
|
+
from typing import Any, Callable, Dict, List, Optional, Protocol
|
|
18
|
+
|
|
19
|
+
import anthropic
|
|
20
|
+
import websocket
|
|
21
|
+
import requests
|
|
22
|
+
|
|
23
|
+
logger = logging.getLogger(__name__)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# =============================================================================
|
|
27
|
+
# Console Output
|
|
28
|
+
# =============================================================================
|
|
29
|
+
|
|
30
|
+
class Colors:
|
|
31
|
+
"""ANSI color codes for terminal output."""
|
|
32
|
+
RESET = "\033[0m"
|
|
33
|
+
BOLD = "\033[1m"
|
|
34
|
+
DIM = "\033[2m"
|
|
35
|
+
|
|
36
|
+
CYAN = "\033[36m"
|
|
37
|
+
GREEN = "\033[32m"
|
|
38
|
+
YELLOW = "\033[33m"
|
|
39
|
+
RED = "\033[31m"
|
|
40
|
+
MAGENTA = "\033[35m"
|
|
41
|
+
BLUE = "\033[34m"
|
|
42
|
+
WHITE = "\033[37m"
|
|
43
|
+
GRAY = "\033[90m"
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def supports_color() -> bool:
|
|
47
|
+
"""Check if terminal supports color."""
|
|
48
|
+
if os.environ.get("NO_COLOR"):
|
|
49
|
+
return False
|
|
50
|
+
if os.environ.get("FORCE_COLOR"):
|
|
51
|
+
return True
|
|
52
|
+
return hasattr(sys.stdout, "isatty") and sys.stdout.isatty()
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
class Console:
|
|
56
|
+
"""Beautiful console output for Orgo SDK."""
|
|
57
|
+
|
|
58
|
+
def __init__(self, verbose: bool = True):
|
|
59
|
+
self.verbose = verbose
|
|
60
|
+
self.use_color = supports_color()
|
|
61
|
+
self.start_time = None
|
|
62
|
+
|
|
63
|
+
def _c(self, color: str, text: str) -> str:
|
|
64
|
+
"""Apply color if supported."""
|
|
65
|
+
if self.use_color:
|
|
66
|
+
return f"{color}{text}{Colors.RESET}"
|
|
67
|
+
return text
|
|
68
|
+
|
|
69
|
+
def banner(self, computer_id: str):
|
|
70
|
+
"""Print Orgo banner with session link."""
|
|
71
|
+
if not self.verbose:
|
|
72
|
+
return
|
|
73
|
+
|
|
74
|
+
self.start_time = time.time()
|
|
75
|
+
|
|
76
|
+
logo = f"""
|
|
77
|
+
{self._c(Colors.CYAN, '___ _ __ __ _ ___')}
|
|
78
|
+
{self._c(Colors.CYAN, "/ _ \\| '__/ _` |/ _ \\")}
|
|
79
|
+
{self._c(Colors.CYAN, '| (_) | | | (_| | (_) |')}
|
|
80
|
+
{self._c(Colors.CYAN, "\\___/|_| \\__, |\\___/")}
|
|
81
|
+
{self._c(Colors.CYAN, '|___/')}
|
|
82
|
+
"""
|
|
83
|
+
print(logo)
|
|
84
|
+
print(f" {self._c(Colors.DIM, 'Watch:')} {self._c(Colors.CYAN, f'https://orgo.ai/workspaces/{computer_id}')}")
|
|
85
|
+
print()
|
|
86
|
+
|
|
87
|
+
def status(self, message: str):
|
|
88
|
+
"""Print status update."""
|
|
89
|
+
if not self.verbose:
|
|
90
|
+
return
|
|
91
|
+
timestamp = self._c(Colors.DIM, datetime.now().strftime("%H:%M:%S"))
|
|
92
|
+
print(f" {timestamp} {self._c(Colors.CYAN, '●')} {message}")
|
|
93
|
+
|
|
94
|
+
def action(self, action: str, details: str = ""):
|
|
95
|
+
"""Print action being taken."""
|
|
96
|
+
if not self.verbose:
|
|
97
|
+
return
|
|
98
|
+
timestamp = self._c(Colors.DIM, datetime.now().strftime("%H:%M:%S"))
|
|
99
|
+
action_str = self._c(Colors.YELLOW, action)
|
|
100
|
+
details_str = self._c(Colors.DIM, details) if details else ""
|
|
101
|
+
print(f" {timestamp} {self._c(Colors.YELLOW, '▸')} {action_str} {details_str}")
|
|
102
|
+
|
|
103
|
+
def thinking(self, preview: str = ""):
|
|
104
|
+
"""Print thinking indicator."""
|
|
105
|
+
if not self.verbose:
|
|
106
|
+
return
|
|
107
|
+
timestamp = self._c(Colors.DIM, datetime.now().strftime("%H:%M:%S"))
|
|
108
|
+
preview_str = self._c(Colors.DIM, f" {preview[:60]}...") if preview else ""
|
|
109
|
+
print(f" {timestamp} {self._c(Colors.MAGENTA, '◐')} {self._c(Colors.MAGENTA, 'Thinking')}{preview_str}")
|
|
110
|
+
|
|
111
|
+
def text(self, content: str):
|
|
112
|
+
"""Print assistant text response."""
|
|
113
|
+
if not self.verbose:
|
|
114
|
+
return
|
|
115
|
+
timestamp = self._c(Colors.DIM, datetime.now().strftime("%H:%M:%S"))
|
|
116
|
+
if len(content) > 100:
|
|
117
|
+
content = content[:100] + "..."
|
|
118
|
+
print(f" {timestamp} {self._c(Colors.GREEN, '◀')} {content}")
|
|
119
|
+
|
|
120
|
+
def error(self, message: str):
|
|
121
|
+
"""Print error message."""
|
|
122
|
+
timestamp = self._c(Colors.DIM, datetime.now().strftime("%H:%M:%S"))
|
|
123
|
+
print(f" {timestamp} {self._c(Colors.RED, '✗')} {self._c(Colors.RED, message)}")
|
|
124
|
+
|
|
125
|
+
def success(self, iterations: int = 0):
|
|
126
|
+
"""Print success message."""
|
|
127
|
+
if not self.verbose:
|
|
128
|
+
return
|
|
129
|
+
|
|
130
|
+
elapsed = ""
|
|
131
|
+
if self.start_time:
|
|
132
|
+
seconds = time.time() - self.start_time
|
|
133
|
+
elapsed = f" in {seconds:.1f}s"
|
|
134
|
+
|
|
135
|
+
iter_str = f" ({iterations} iterations)" if iterations else ""
|
|
136
|
+
print()
|
|
137
|
+
print(f" {self._c(Colors.GREEN, '✓')} {self._c(Colors.GREEN, 'Done')}{iter_str}{self._c(Colors.DIM, elapsed)}")
|
|
138
|
+
print()
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
# =============================================================================
|
|
142
|
+
# System Prompt
|
|
143
|
+
# =============================================================================
|
|
144
|
+
|
|
145
|
+
def get_system_prompt(
|
|
146
|
+
display_width: int = 1024,
|
|
147
|
+
display_height: int = 768,
|
|
148
|
+
custom_prompt: Optional[str] = None
|
|
149
|
+
) -> str:
|
|
150
|
+
"""Build the system prompt for Claude computer use."""
|
|
151
|
+
|
|
152
|
+
mid_x = display_width // 2
|
|
153
|
+
mid_y = display_height // 2
|
|
154
|
+
max_x = display_width - 1
|
|
155
|
+
max_y = display_height - 1
|
|
156
|
+
|
|
157
|
+
base_prompt = f"""You control a Linux desktop ({display_width}x{display_height}). Be efficient - complete tasks in minimal steps.
|
|
158
|
+
|
|
159
|
+
<ACTIONS>
|
|
160
|
+
screenshot - See current screen state
|
|
161
|
+
left_click - Single click. Params: coordinate [x, y]
|
|
162
|
+
double_click - Double click. Params: coordinate [x, y]
|
|
163
|
+
right_click - Right click. Params: coordinate [x, y]
|
|
164
|
+
type - Type text. Params: text "string"
|
|
165
|
+
key - Press key. Params: text "Enter", "Tab", "ctrl+c", etc.
|
|
166
|
+
scroll - Scroll. Params: scroll_direction "up"|"down", scroll_amount 3
|
|
167
|
+
wait - Pause. Params: duration (seconds, e.g. 5)
|
|
168
|
+
mouse_move - Move cursor. Params: coordinate [x, y]
|
|
169
|
+
left_click_drag - Drag operation. Params: start_coordinate [x, y], coordinate [x, y]
|
|
170
|
+
</ACTIONS>
|
|
171
|
+
|
|
172
|
+
<CLICK_RULES>
|
|
173
|
+
DOUBLE_CLICK for:
|
|
174
|
+
- Desktop icons (to open apps)
|
|
175
|
+
- Files/folders in file manager
|
|
176
|
+
|
|
177
|
+
LEFT_CLICK for everything else:
|
|
178
|
+
- Buttons, links, menus
|
|
179
|
+
- Taskbar icons
|
|
180
|
+
- Input fields (to focus before typing)
|
|
181
|
+
- Window controls (close/minimize)
|
|
182
|
+
|
|
183
|
+
COMMON MISTAKES:
|
|
184
|
+
- left_click on desktop icon = only selects, doesn't open (use double_click)
|
|
185
|
+
- double_click on button = wrong (use left_click)
|
|
186
|
+
</CLICK_RULES>
|
|
187
|
+
|
|
188
|
+
<WINDOW_DRAGGING_CRITICAL>
|
|
189
|
+
WHEN DRAGGING WINDOWS - GRAB THE TITLE BAR CORRECTLY:
|
|
190
|
+
|
|
191
|
+
CORRECT - grab the EMPTY SPACE in the title bar:
|
|
192
|
+
✓ Center-top of window (middle of title bar, away from buttons/tabs)
|
|
193
|
+
✓ For browser: grab between tabs and buttons (empty title bar area)
|
|
194
|
+
✓ For app with tabs: grab the title bar ABOVE tabs
|
|
195
|
+
✓ Safe zone: horizontal center, ~20-30px from top edge
|
|
196
|
+
|
|
197
|
+
WRONG - avoid these areas:
|
|
198
|
+
✗ Close/minimize/maximize buttons (top-right corner)
|
|
199
|
+
✗ Browser tabs (will switch tabs instead of moving window)
|
|
200
|
+
✗ Window icon or menu (top-left corner)
|
|
201
|
+
✗ Any buttons or controls in title bar
|
|
202
|
+
|
|
203
|
+
VISUAL GUIDE - where to grab:
|
|
204
|
+
[X] [Icon] [___GRAB_HERE___] [- □ X]
|
|
205
|
+
↑ empty title bar area
|
|
206
|
+
|
|
207
|
+
For browser window:
|
|
208
|
+
[Tab1] [Tab2] [___GRAB_HERE___] [+ - □ X]
|
|
209
|
+
↑ empty space between tabs and controls
|
|
210
|
+
|
|
211
|
+
COORDINATES FOR DRAGGING:
|
|
212
|
+
Start coordinate = [{mid_x}, 20] (center-top, in title bar)
|
|
213
|
+
NOT [window_right - 20, 20] (too close to close button)
|
|
214
|
+
NOT [40, 20] (too close to icon/menu)
|
|
215
|
+
</WINDOW_DRAGGING_CRITICAL>
|
|
216
|
+
|
|
217
|
+
<WINDOW_SNAPPING>
|
|
218
|
+
Drag window title bar to these exact coordinates to snap:
|
|
219
|
+
|
|
220
|
+
HALF SCREEN:
|
|
221
|
+
- Left half: drag to [1, {mid_y}]
|
|
222
|
+
- Right half: drag to [{max_x}, {mid_y}]
|
|
223
|
+
|
|
224
|
+
QUARTER SCREEN:
|
|
225
|
+
- Top-left: drag to [1, 1]
|
|
226
|
+
- Top-right: drag to [{max_x}, 1]
|
|
227
|
+
- Bottom-left: drag to [1, {max_y}]
|
|
228
|
+
- Bottom-right: drag to [{max_x}, {max_y}]
|
|
229
|
+
|
|
230
|
+
MAXIMIZE:
|
|
231
|
+
- Full screen: drag to [{mid_x}, 1]
|
|
232
|
+
|
|
233
|
+
COMPLETE EXAMPLE - snap Chrome to left half:
|
|
234
|
+
1. Identify window center-top coordinate: [{mid_x}, 20]
|
|
235
|
+
2. Execute: left_click_drag start_coordinate [{mid_x}, 20], coordinate [1, {mid_y}]
|
|
236
|
+
3. Window snaps to left half of screen
|
|
237
|
+
|
|
238
|
+
SPLIT SCREEN WORKFLOW:
|
|
239
|
+
1. Drag first window: left_click_drag start_coordinate [first_window_center, 20], coordinate [1, {mid_y}]
|
|
240
|
+
2. Wait 1 second
|
|
241
|
+
3. Drag second window: left_click_drag start_coordinate [second_window_center, 20], coordinate [{max_x}, {mid_y}]
|
|
242
|
+
4. Both windows now side-by-side
|
|
243
|
+
|
|
244
|
+
CRITICAL: Always use the CENTER of the title bar as start_coordinate, never the edges!
|
|
245
|
+
</WINDOW_SNAPPING>
|
|
246
|
+
|
|
247
|
+
<WAIT_TIMES>
|
|
248
|
+
After opening app from DESKTOP icon: wait 10 seconds
|
|
249
|
+
After opening app from TASKBAR: wait 5 seconds
|
|
250
|
+
After loading web page: wait 3 seconds
|
|
251
|
+
After clicking button: wait 1 second
|
|
252
|
+
After dragging window: wait 1 second
|
|
253
|
+
After typing: no wait needed
|
|
254
|
+
</WAIT_TIMES>
|
|
255
|
+
|
|
256
|
+
<WORKFLOW>
|
|
257
|
+
1. Screenshot once at start to see current state
|
|
258
|
+
2. Execute actions - no screenshot between quick actions
|
|
259
|
+
3. Screenshot after waits to verify result
|
|
260
|
+
4. Don't screenshot redundantly
|
|
261
|
+
|
|
262
|
+
PATTERNS:
|
|
263
|
+
|
|
264
|
+
Open app from desktop:
|
|
265
|
+
screenshot → double_click icon → wait 10 → screenshot
|
|
9
266
|
|
|
267
|
+
Open app from taskbar:
|
|
268
|
+
screenshot → left_click taskbar → wait 5 → screenshot
|
|
269
|
+
|
|
270
|
+
Web search:
|
|
271
|
+
left_click search bar → type "query" → key "Enter" → wait 3 → screenshot
|
|
272
|
+
|
|
273
|
+
Snap window to left:
|
|
274
|
+
screenshot → left_click_drag start_coordinate [{mid_x}, 20], coordinate [1, {mid_y}] → wait 1 → screenshot
|
|
275
|
+
</WORKFLOW>
|
|
276
|
+
|
|
277
|
+
<KEY_NAMES>
|
|
278
|
+
Enter (not Return), Tab, Escape, Backspace, Delete
|
|
279
|
+
Combos: ctrl+c, ctrl+v, ctrl+s, alt+Tab, alt+F4, super+Left
|
|
280
|
+
</KEY_NAMES>
|
|
281
|
+
|
|
282
|
+
<COORDINATES>
|
|
283
|
+
Origin (0,0) = top-left
|
|
284
|
+
X increases rightward, Y increases downward
|
|
285
|
+
Always click CENTER of elements
|
|
286
|
+
Screen: {display_width}x{display_height}
|
|
287
|
+
Valid: x from 1 to {max_x}, y from 1 to {max_y}
|
|
288
|
+
|
|
289
|
+
TITLE BAR SAFETY:
|
|
290
|
+
- Horizontal: use center ({mid_x}) or ±200px from center
|
|
291
|
+
- Vertical: ~20px from top (in title bar, not too close to edge)
|
|
292
|
+
- NEVER use far right (close to X button)
|
|
293
|
+
- NEVER use far left (close to icon/menu)
|
|
294
|
+
</COORDINATES>
|
|
295
|
+
|
|
296
|
+
<EFFICIENCY>
|
|
297
|
+
- One screenshot to start, then only after waits
|
|
298
|
+
- Batch actions without screenshots between
|
|
299
|
+
- Don't re-verify actions that succeeded
|
|
300
|
+
- After 2 failed attempts, try alternative approach
|
|
301
|
+
- When dragging windows, always grab the safe center-top area
|
|
302
|
+
</EFFICIENCY>"""
|
|
303
|
+
|
|
304
|
+
if custom_prompt:
|
|
305
|
+
return f"""<USER_INSTRUCTIONS>
|
|
306
|
+
{custom_prompt}
|
|
307
|
+
</USER_INSTRUCTIONS>
|
|
308
|
+
|
|
309
|
+
{base_prompt}"""
|
|
310
|
+
|
|
311
|
+
return base_prompt
|
|
312
|
+
|
|
313
|
+
|
|
314
|
+
# =============================================================================
|
|
315
|
+
# Provider Protocol
|
|
316
|
+
# =============================================================================
|
|
10
317
|
|
|
11
318
|
class PromptProvider(Protocol):
|
|
12
|
-
"""
|
|
319
|
+
"""Interface for prompt execution providers."""
|
|
13
320
|
|
|
14
|
-
def execute(
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
Args:
|
|
23
|
-
computer_id: ID of the computer to control
|
|
24
|
-
instruction: User instruction
|
|
25
|
-
callback: Optional progress callback function
|
|
26
|
-
**kwargs: Additional provider-specific parameters
|
|
27
|
-
|
|
28
|
-
Returns:
|
|
29
|
-
List of messages from the conversation
|
|
30
|
-
"""
|
|
321
|
+
def execute(
|
|
322
|
+
self,
|
|
323
|
+
computer_id: str,
|
|
324
|
+
instruction: str,
|
|
325
|
+
callback: Optional[Callable[[str, Any], None]] = None,
|
|
326
|
+
verbose: bool = True,
|
|
327
|
+
**kwargs
|
|
328
|
+
) -> List[Dict[str, Any]]:
|
|
31
329
|
...
|
|
32
330
|
|
|
33
331
|
|
|
34
|
-
|
|
35
|
-
|
|
332
|
+
# =============================================================================
|
|
333
|
+
# Orgo Provider (Default)
|
|
334
|
+
# =============================================================================
|
|
335
|
+
|
|
336
|
+
class OrgoProvider:
|
|
337
|
+
"""
|
|
338
|
+
Execute prompts via Orgo's hosted agent.
|
|
36
339
|
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
raise ImportError(
|
|
44
|
-
"Anthropic SDK not installed. Please install with 'pip install anthropic'"
|
|
45
|
-
)
|
|
340
|
+
Benefits:
|
|
341
|
+
- No Anthropic API key needed
|
|
342
|
+
- Optimized infrastructure
|
|
343
|
+
- Real-time streaming
|
|
344
|
+
- Watch live at orgo.ai/workspaces/{computer_id}
|
|
345
|
+
"""
|
|
46
346
|
|
|
47
|
-
def
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
Execute
|
|
61
|
-
|
|
62
|
-
Args:
|
|
63
|
-
computer_id: ID of the computer to control
|
|
64
|
-
instruction: User instruction
|
|
65
|
-
callback: Optional progress callback
|
|
66
|
-
api_key: Anthropic API key
|
|
67
|
-
model: Model to use
|
|
68
|
-
display_width: Display width in pixels
|
|
69
|
-
display_height: Display height in pixels
|
|
70
|
-
orgo_api_key: API key for Orgo (passed to ApiClient)
|
|
71
|
-
orgo_base_url: Base URL for Orgo API (passed to ApiClient)
|
|
72
|
-
max_saved_screenshots: Maximum number of screenshots to maintain in conversation history
|
|
73
|
-
**kwargs: Additional parameters to pass to the Anthropic API
|
|
74
|
-
|
|
75
|
-
Returns:
|
|
76
|
-
List of messages from the conversation
|
|
77
|
-
"""
|
|
78
|
-
# Get API key from kwargs, env var, or raise error
|
|
79
|
-
api_key = api_key or os.environ.get("ANTHROPIC_API_KEY")
|
|
80
|
-
if not api_key:
|
|
81
|
-
raise ValueError("No Anthropic API key provided. Set ANTHROPIC_API_KEY environment variable or pass api_key.")
|
|
82
|
-
|
|
83
|
-
# Initialize the client
|
|
84
|
-
client = self.anthropic.Anthropic(api_key=api_key)
|
|
85
|
-
|
|
86
|
-
# Prepare the messages
|
|
87
|
-
messages = [{"role": "user", "content": instruction}]
|
|
347
|
+
def __init__(self, agent_url: str = "wss://agent.orgo.ai"):
|
|
348
|
+
self.agent_url = agent_url.rstrip("/")
|
|
349
|
+
|
|
350
|
+
def execute(
|
|
351
|
+
self,
|
|
352
|
+
computer_id: str,
|
|
353
|
+
instruction: str,
|
|
354
|
+
callback: Optional[Callable[[str, Any], None]] = None,
|
|
355
|
+
verbose: bool = True,
|
|
356
|
+
orgo_api_key: Optional[str] = None,
|
|
357
|
+
system_prompt: Optional[str] = None,
|
|
358
|
+
**kwargs
|
|
359
|
+
) -> List[Dict[str, Any]]:
|
|
360
|
+
"""Execute prompt via Orgo's hosted agent."""
|
|
88
361
|
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
* Always start by taking a screenshot to see the current state before performing any actions.
|
|
97
|
-
</SYSTEM_CAPABILITY>
|
|
98
|
-
|
|
99
|
-
<UBUNTU_DESKTOP_GUIDELINES>
|
|
100
|
-
* CRITICAL INSTRUCTION: When opening applications or files on the Ubuntu desktop, you MUST USE DOUBLE-CLICK rather than single-click.
|
|
101
|
-
* Single-click only selects desktop icons but DOES NOT open them. To open desktop icons, you MUST use double-click.
|
|
102
|
-
* Common desktop interactions:
|
|
103
|
-
- Desktop icons: DOUBLE-CLICK to open applications and folders
|
|
104
|
-
- Menu items: SINGLE-CLICK to select options
|
|
105
|
-
- Taskbar icons: SINGLE-CLICK to open applications
|
|
106
|
-
- Window buttons: SINGLE-CLICK to use close, minimize, maximize buttons
|
|
107
|
-
- File browser items: DOUBLE-CLICK to open folders and files
|
|
108
|
-
- When submitting, use the 'Enter' key, not the 'Return' key.
|
|
109
|
-
* If you see an icon on the desktop that you need to open, ALWAYS use the double_click action, never use left_click.
|
|
110
|
-
</UBUNTU_DESKTOP_GUIDELINES>
|
|
111
|
-
|
|
112
|
-
<SCREENSHOT_GUIDELINES>
|
|
113
|
-
* Be mindful of how many screenshots you take - they consume significant memory.
|
|
114
|
-
* Only take screenshots when you need to see the current state of the screen.
|
|
115
|
-
* Try to batch multiple actions before taking another screenshot.
|
|
116
|
-
* For better performance, limit the number of screenshots you take.
|
|
117
|
-
</SCREENSHOT_GUIDELINES>"""
|
|
362
|
+
token = orgo_api_key or os.environ.get("ORGO_API_KEY")
|
|
363
|
+
if not token:
|
|
364
|
+
raise ValueError(
|
|
365
|
+
"ORGO_API_KEY required.\n"
|
|
366
|
+
"Set it with: export ORGO_API_KEY=your_key\n"
|
|
367
|
+
"Get your key at: https://orgo.ai/settings/api"
|
|
368
|
+
)
|
|
118
369
|
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
while iteration < max_iterations:
|
|
148
|
-
iteration += 1
|
|
149
|
-
|
|
150
|
-
# Filter to keep only the N most recent screenshots
|
|
151
|
-
if screenshot_count > max_saved_screenshots:
|
|
152
|
-
self._filter_to_n_most_recent_images(messages, max_saved_screenshots)
|
|
153
|
-
screenshot_count = max_saved_screenshots
|
|
370
|
+
console = Console(verbose=verbose)
|
|
371
|
+
console.banner(computer_id)
|
|
372
|
+
console.status(f"Prompt: \"{instruction[:60]}{'...' if len(instruction) > 60 else ''}\"")
|
|
373
|
+
|
|
374
|
+
ws_url = f"{self.agent_url}/ws/prompt?token={token}"
|
|
375
|
+
|
|
376
|
+
config = {
|
|
377
|
+
"computer_id": computer_id,
|
|
378
|
+
"instruction": instruction,
|
|
379
|
+
"model": kwargs.get("model", "claude-sonnet-4-5-20250929"),
|
|
380
|
+
"display_width": kwargs.get("display_width", 1024),
|
|
381
|
+
"display_height": kwargs.get("display_height", 768),
|
|
382
|
+
"thinking_enabled": kwargs.get("thinking_enabled", True),
|
|
383
|
+
"thinking_budget": kwargs.get("thinking_budget", 1024),
|
|
384
|
+
"max_tokens": kwargs.get("max_tokens", 4096),
|
|
385
|
+
"max_iterations": kwargs.get("max_iterations", 100),
|
|
386
|
+
}
|
|
387
|
+
|
|
388
|
+
if system_prompt:
|
|
389
|
+
config["system_prompt"] = system_prompt
|
|
390
|
+
|
|
391
|
+
result = {"messages": [], "error": None, "iterations": 0}
|
|
392
|
+
|
|
393
|
+
def on_message(ws, message):
|
|
394
|
+
try:
|
|
395
|
+
data = json.loads(message)
|
|
396
|
+
event_type = data.get("type")
|
|
397
|
+
event_data = data.get("data")
|
|
154
398
|
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
"
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
"tools": tools,
|
|
162
|
-
"betas": ["computer-use-2025-01-24"],
|
|
163
|
-
}
|
|
399
|
+
if event_type == "result":
|
|
400
|
+
result["messages"] = event_data.get("messages", [])
|
|
401
|
+
result["iterations"] = event_data.get("iterations", 0)
|
|
402
|
+
if not event_data.get("success"):
|
|
403
|
+
result["error"] = event_data.get("error")
|
|
404
|
+
ws.close()
|
|
164
405
|
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
"budget_tokens": kwargs.get("thinking_budget", 1024)
|
|
170
|
-
}
|
|
406
|
+
elif event_type == "error":
|
|
407
|
+
console.error(str(event_data))
|
|
408
|
+
result["error"] = event_data
|
|
409
|
+
ws.close()
|
|
171
410
|
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
response = client.beta.messages.create(**request_params)
|
|
175
|
-
except Exception as e:
|
|
176
|
-
if "base64" in str(e).lower():
|
|
177
|
-
# If we get a base64 error, try again after more aggressively filtering images
|
|
178
|
-
if callback:
|
|
179
|
-
callback("error", f"Base64 error detected. Attempting recovery...")
|
|
180
|
-
|
|
181
|
-
# Remove all but the most recent image and try again
|
|
182
|
-
self._filter_to_n_most_recent_images(messages, 1)
|
|
183
|
-
response = client.beta.messages.create(**request_params)
|
|
184
|
-
else:
|
|
185
|
-
# Not a base64 error, re-raise
|
|
186
|
-
raise
|
|
411
|
+
elif event_type == "status":
|
|
412
|
+
console.status(str(event_data))
|
|
187
413
|
|
|
188
|
-
|
|
189
|
-
|
|
414
|
+
elif event_type == "thinking":
|
|
415
|
+
preview = str(event_data)[:60] if event_data else ""
|
|
416
|
+
console.thinking(preview)
|
|
190
417
|
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
messages.append(assistant_message)
|
|
418
|
+
elif event_type == "text":
|
|
419
|
+
console.text(str(event_data))
|
|
194
420
|
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
elif
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
421
|
+
elif event_type == "tool_use":
|
|
422
|
+
action = event_data.get("action", "unknown") if isinstance(event_data, dict) else str(event_data)
|
|
423
|
+
params = event_data.get("params", {}) if isinstance(event_data, dict) else {}
|
|
424
|
+
|
|
425
|
+
if action == "screenshot":
|
|
426
|
+
console.action("screenshot")
|
|
427
|
+
elif action in ["left_click", "right_click", "double_click"]:
|
|
428
|
+
coord = params.get("coordinate", [0, 0])
|
|
429
|
+
console.action(action, f"({coord[0]}, {coord[1]})")
|
|
430
|
+
elif action == "type":
|
|
431
|
+
text = params.get("text", "")[:30]
|
|
432
|
+
console.action("type", f'"{text}"')
|
|
433
|
+
elif action == "key":
|
|
434
|
+
console.action("key", params.get("text", ""))
|
|
435
|
+
elif action == "scroll":
|
|
436
|
+
console.action("scroll", params.get("scroll_direction", ""))
|
|
437
|
+
elif action == "wait":
|
|
438
|
+
console.action("wait", f"{params.get('duration', 1)}s")
|
|
439
|
+
else:
|
|
440
|
+
console.action(action)
|
|
207
441
|
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
for block in response_content:
|
|
211
|
-
if block.type == "tool_use":
|
|
212
|
-
# Execute the tool action
|
|
213
|
-
result = self._execute_tool(computer_id, block.input, callback, api_client)
|
|
214
|
-
|
|
215
|
-
# Format the result for Claude
|
|
216
|
-
tool_result = {
|
|
217
|
-
"type": "tool_result",
|
|
218
|
-
"tool_use_id": block.id
|
|
219
|
-
}
|
|
220
|
-
|
|
221
|
-
# Handle image vs text results
|
|
222
|
-
if isinstance(result, dict) and "type" in result and result["type"] == "image":
|
|
223
|
-
tool_result["content"] = [result]
|
|
224
|
-
# Increment screenshot count when we add a new screenshot
|
|
225
|
-
if block.input.get("action") == "screenshot":
|
|
226
|
-
screenshot_count += 1
|
|
227
|
-
else:
|
|
228
|
-
tool_result["content"] = [{"type": "text", "text": str(result)}]
|
|
229
|
-
|
|
230
|
-
tool_results.append(tool_result)
|
|
442
|
+
elif event_type == "iteration":
|
|
443
|
+
result["iterations"] = event_data
|
|
231
444
|
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
if callback:
|
|
235
|
-
callback("status", "Task completed")
|
|
236
|
-
return messages
|
|
445
|
+
elif event_type == "pong":
|
|
446
|
+
pass
|
|
237
447
|
|
|
238
|
-
|
|
239
|
-
|
|
448
|
+
if callback:
|
|
449
|
+
callback(event_type, event_data)
|
|
450
|
+
|
|
451
|
+
except json.JSONDecodeError as e:
|
|
452
|
+
logger.error(f"Parse error: {e}")
|
|
453
|
+
|
|
454
|
+
def on_error(ws, error):
|
|
455
|
+
console.error(str(error))
|
|
456
|
+
result["error"] = str(error)
|
|
457
|
+
|
|
458
|
+
def on_open(ws):
|
|
459
|
+
ws.send(json.dumps({"type": "start", "config": config}))
|
|
460
|
+
|
|
461
|
+
def on_close(ws, close_status_code, close_msg):
|
|
462
|
+
if not result["error"]:
|
|
463
|
+
console.success(result["iterations"])
|
|
464
|
+
|
|
465
|
+
ws = websocket.WebSocketApp(
|
|
466
|
+
ws_url,
|
|
467
|
+
on_message=on_message,
|
|
468
|
+
on_error=on_error,
|
|
469
|
+
on_open=on_open,
|
|
470
|
+
on_close=on_close,
|
|
471
|
+
)
|
|
472
|
+
|
|
473
|
+
ws.run_forever()
|
|
474
|
+
|
|
475
|
+
if result["error"]:
|
|
476
|
+
raise RuntimeError(result["error"])
|
|
477
|
+
|
|
478
|
+
return result["messages"]
|
|
479
|
+
|
|
480
|
+
|
|
481
|
+
# =============================================================================
|
|
482
|
+
# Anthropic Provider (Direct API)
|
|
483
|
+
# =============================================================================
|
|
484
|
+
|
|
485
|
+
class AnthropicProvider:
|
|
486
|
+
"""
|
|
487
|
+
Execute prompts directly with Anthropic API.
|
|
488
|
+
|
|
489
|
+
Requires ANTHROPIC_API_KEY environment variable.
|
|
490
|
+
"""
|
|
491
|
+
|
|
492
|
+
def execute(
|
|
493
|
+
self,
|
|
494
|
+
computer_id: str,
|
|
495
|
+
instruction: str,
|
|
496
|
+
callback: Optional[Callable[[str, Any], None]] = None,
|
|
497
|
+
verbose: bool = True,
|
|
498
|
+
api_key: Optional[str] = None,
|
|
499
|
+
orgo_api_key: Optional[str] = None,
|
|
500
|
+
orgo_base_url: Optional[str] = None,
|
|
501
|
+
system_prompt: Optional[str] = None,
|
|
502
|
+
**kwargs
|
|
503
|
+
) -> List[Dict[str, Any]]:
|
|
504
|
+
"""Execute prompt locally with Anthropic API."""
|
|
505
|
+
|
|
506
|
+
anthropic_key = api_key or os.environ.get("ANTHROPIC_API_KEY")
|
|
507
|
+
if not anthropic_key:
|
|
508
|
+
raise ValueError(
|
|
509
|
+
"ANTHROPIC_API_KEY required for provider='anthropic'.\n"
|
|
510
|
+
"Set it with: export ANTHROPIC_API_KEY=your_key\n"
|
|
511
|
+
"Get your key at: https://console.anthropic.com/"
|
|
512
|
+
)
|
|
513
|
+
|
|
514
|
+
orgo_key = orgo_api_key or os.environ.get("ORGO_API_KEY")
|
|
515
|
+
if not orgo_key:
|
|
516
|
+
raise ValueError(
|
|
517
|
+
"ORGO_API_KEY required.\n"
|
|
518
|
+
"Set it with: export ORGO_API_KEY=your_key"
|
|
519
|
+
)
|
|
520
|
+
|
|
521
|
+
# Base URL for Orgo API (no /api suffix - added per endpoint)
|
|
522
|
+
orgo_url = (orgo_base_url or "https://orgo.ai").rstrip("/")
|
|
523
|
+
|
|
524
|
+
console = Console(verbose=verbose)
|
|
525
|
+
console.banner(computer_id)
|
|
526
|
+
console.status("Provider: Anthropic")
|
|
527
|
+
console.status(f"Prompt: \"{instruction[:60]}{'...' if len(instruction) > 60 else ''}\"")
|
|
528
|
+
|
|
529
|
+
# Config
|
|
530
|
+
model = kwargs.get("model", "claude-sonnet-4-5-20250929")
|
|
531
|
+
display_width = kwargs.get("display_width", 1024)
|
|
532
|
+
display_height = kwargs.get("display_height", 768)
|
|
533
|
+
max_iterations = kwargs.get("max_iterations", 100)
|
|
534
|
+
max_tokens = kwargs.get("max_tokens", 4096)
|
|
535
|
+
thinking_enabled = kwargs.get("thinking_enabled", True)
|
|
536
|
+
thinking_budget = kwargs.get("thinking_budget", 1024)
|
|
537
|
+
max_saved_screenshots = kwargs.get("max_saved_screenshots", 3)
|
|
538
|
+
|
|
539
|
+
# System prompt
|
|
540
|
+
full_system_prompt = get_system_prompt(display_width, display_height, system_prompt)
|
|
541
|
+
|
|
542
|
+
# Initialize
|
|
543
|
+
client = anthropic.Anthropic(api_key=anthropic_key)
|
|
544
|
+
messages = [{"role": "user", "content": instruction}]
|
|
545
|
+
|
|
546
|
+
tools = [{
|
|
547
|
+
"type": "computer_20250124",
|
|
548
|
+
"name": "computer",
|
|
549
|
+
"display_width_px": display_width,
|
|
550
|
+
"display_height_px": display_height,
|
|
551
|
+
"display_number": 1
|
|
552
|
+
}]
|
|
553
|
+
|
|
554
|
+
iteration = 0
|
|
555
|
+
screenshot_count = 0
|
|
556
|
+
|
|
557
|
+
while iteration < max_iterations:
|
|
558
|
+
iteration += 1
|
|
240
559
|
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
callback("status", f"Reached maximum iterations ({max_iterations})")
|
|
560
|
+
if verbose:
|
|
561
|
+
console.status(f"Iteration {iteration}")
|
|
244
562
|
|
|
245
|
-
|
|
563
|
+
# Prune old screenshots
|
|
564
|
+
if screenshot_count > max_saved_screenshots:
|
|
565
|
+
self._prune_screenshots(messages, max_saved_screenshots)
|
|
566
|
+
screenshot_count = max_saved_screenshots
|
|
246
567
|
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
568
|
+
# Build request
|
|
569
|
+
request_params = {
|
|
570
|
+
"model": model,
|
|
571
|
+
"max_tokens": max_tokens,
|
|
572
|
+
"system": full_system_prompt,
|
|
573
|
+
"messages": messages,
|
|
574
|
+
"tools": tools,
|
|
575
|
+
"betas": ["computer-use-2025-01-24"],
|
|
576
|
+
}
|
|
577
|
+
|
|
578
|
+
if thinking_enabled:
|
|
579
|
+
request_params["thinking"] = {
|
|
580
|
+
"type": "enabled",
|
|
581
|
+
"budget_tokens": thinking_budget
|
|
582
|
+
}
|
|
583
|
+
|
|
584
|
+
# Call Claude
|
|
585
|
+
try:
|
|
586
|
+
response = client.beta.messages.create(**request_params)
|
|
587
|
+
except Exception as e:
|
|
588
|
+
if "base64" in str(e).lower():
|
|
589
|
+
self._prune_screenshots(messages, 1)
|
|
590
|
+
response = client.beta.messages.create(**request_params)
|
|
591
|
+
else:
|
|
592
|
+
raise
|
|
593
|
+
|
|
594
|
+
response_content = response.content
|
|
595
|
+
messages.append({"role": "assistant", "content": response_content})
|
|
596
|
+
|
|
597
|
+
# Process response content
|
|
598
|
+
for block in response_content:
|
|
599
|
+
if block.type == "text":
|
|
600
|
+
console.text(block.text)
|
|
601
|
+
if callback:
|
|
602
|
+
callback("text", block.text)
|
|
603
|
+
elif block.type == "thinking":
|
|
604
|
+
console.thinking(block.thinking[:60] if block.thinking else "")
|
|
605
|
+
if callback:
|
|
606
|
+
callback("thinking", block.thinking)
|
|
607
|
+
elif block.type == "tool_use":
|
|
608
|
+
action = block.input.get("action", "unknown")
|
|
274
609
|
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
# Keep only the most recent ones (which are at the end of the list)
|
|
295
|
-
images_to_remove = image_blocks[:-max_images]
|
|
610
|
+
if action == "screenshot":
|
|
611
|
+
console.action("screenshot")
|
|
612
|
+
elif action in ["left_click", "right_click", "double_click"]:
|
|
613
|
+
coord = block.input.get("coordinate", [0, 0])
|
|
614
|
+
console.action(action, f"({coord[0]}, {coord[1]})")
|
|
615
|
+
elif action == "type":
|
|
616
|
+
text = block.input.get("text", "")[:30]
|
|
617
|
+
console.action("type", f'"{text}"')
|
|
618
|
+
elif action == "key":
|
|
619
|
+
console.action("key", block.input.get("text", ""))
|
|
620
|
+
elif action == "scroll":
|
|
621
|
+
console.action("scroll", block.input.get("scroll_direction", ""))
|
|
622
|
+
elif action == "wait":
|
|
623
|
+
console.action("wait", f"{block.input.get('duration', 1)}s")
|
|
624
|
+
else:
|
|
625
|
+
console.action(action)
|
|
626
|
+
|
|
627
|
+
if callback:
|
|
628
|
+
callback("tool_use", {"action": action, "params": block.input})
|
|
296
629
|
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
630
|
+
# Execute tools
|
|
631
|
+
tool_results = []
|
|
632
|
+
for block in response_content:
|
|
633
|
+
if block.type == "tool_use":
|
|
634
|
+
result = self._execute_tool(computer_id, block.input, orgo_key, orgo_url, callback)
|
|
635
|
+
|
|
636
|
+
tool_result = {"type": "tool_result", "tool_use_id": block.id}
|
|
637
|
+
|
|
638
|
+
if isinstance(result, dict) and result.get("type") == "image":
|
|
639
|
+
tool_result["content"] = [result]
|
|
640
|
+
if block.input.get("action") == "screenshot":
|
|
641
|
+
screenshot_count += 1
|
|
642
|
+
else:
|
|
643
|
+
tool_result["content"] = [{"type": "text", "text": str(result)}]
|
|
644
|
+
|
|
645
|
+
tool_results.append(tool_result)
|
|
646
|
+
|
|
647
|
+
if not tool_results:
|
|
648
|
+
console.success(iteration)
|
|
649
|
+
return messages
|
|
650
|
+
|
|
651
|
+
messages.append({"role": "user", "content": tool_results})
|
|
652
|
+
|
|
653
|
+
console.success(iteration)
|
|
654
|
+
return messages
|
|
303
655
|
|
|
304
|
-
def _execute_tool(self,
|
|
305
|
-
|
|
306
|
-
params: Dict[str, Any],
|
|
307
|
-
callback: Optional[Callable[[str, Any], None]] = None,
|
|
308
|
-
api_client = None) -> Union[str, Dict[str, Any]]:
|
|
309
|
-
"""Execute a tool action via the API client."""
|
|
310
|
-
action = params.get("action")
|
|
656
|
+
def _execute_tool(self, computer_id: str, params: Dict, orgo_key: str, orgo_url: str, callback: Optional[Callable]) -> Any:
|
|
657
|
+
"""Execute a tool action via Orgo API."""
|
|
311
658
|
|
|
312
|
-
|
|
313
|
-
|
|
659
|
+
action = params.get("action")
|
|
660
|
+
headers = {"Authorization": f"Bearer {orgo_key}", "Content-Type": "application/json"}
|
|
661
|
+
base_url = f"{orgo_url}/api/computers/{computer_id}"
|
|
314
662
|
|
|
315
663
|
try:
|
|
316
|
-
#
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
from .api.client import ApiClient
|
|
320
|
-
api_client = ApiClient()
|
|
321
|
-
|
|
322
|
-
# Map actions to API methods
|
|
664
|
+
# =================================================================
|
|
665
|
+
# SCREENSHOT - GET request
|
|
666
|
+
# =================================================================
|
|
323
667
|
if action == "screenshot":
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
callback("tool_result", {"type": "image", "action": "screenshot"})
|
|
668
|
+
r = requests.get(f"{base_url}/screenshot", headers=headers)
|
|
669
|
+
r.raise_for_status()
|
|
327
670
|
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
image_url = response.get("image", "")
|
|
671
|
+
data = r.json()
|
|
672
|
+
image_url = data.get("image") or data.get("url") or data.get("screenshot")
|
|
331
673
|
|
|
332
674
|
if not image_url:
|
|
333
|
-
|
|
675
|
+
logger.error(f"Screenshot API returned no image URL: {data}")
|
|
676
|
+
return "Screenshot captured"
|
|
334
677
|
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
img_response = requests.get(image_url)
|
|
338
|
-
img_response.raise_for_status()
|
|
678
|
+
img_r = requests.get(image_url)
|
|
679
|
+
img_r.raise_for_status()
|
|
339
680
|
|
|
340
|
-
|
|
341
|
-
|
|
681
|
+
if len(img_r.content) < 100:
|
|
682
|
+
logger.error(f"Screenshot image too small: {len(img_r.content)} bytes")
|
|
683
|
+
return "Screenshot captured"
|
|
684
|
+
|
|
685
|
+
image_b64 = base64.b64encode(img_r.content).decode()
|
|
342
686
|
|
|
343
687
|
return {
|
|
344
688
|
"type": "image",
|
|
345
|
-
"source": {
|
|
346
|
-
"type": "base64",
|
|
347
|
-
"media_type": "image/jpeg",
|
|
348
|
-
"data": image_base64
|
|
349
|
-
}
|
|
689
|
+
"source": {"type": "base64", "media_type": "image/jpeg", "data": image_b64}
|
|
350
690
|
}
|
|
351
|
-
|
|
691
|
+
|
|
692
|
+
# =================================================================
|
|
693
|
+
# MOUSE CLICKS - POST /click with x, y, button, double
|
|
694
|
+
# =================================================================
|
|
352
695
|
elif action == "left_click":
|
|
353
|
-
if not params.get("coordinate"):
|
|
354
|
-
raise ValueError("Coordinates required for left click")
|
|
355
696
|
x, y = params["coordinate"]
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
return f"
|
|
360
|
-
|
|
697
|
+
requests.post(f"{base_url}/click", json={
|
|
698
|
+
"x": x, "y": y, "button": "left", "double": False
|
|
699
|
+
}, headers=headers).raise_for_status()
|
|
700
|
+
return f"Clicked ({x}, {y})"
|
|
701
|
+
|
|
361
702
|
elif action == "right_click":
|
|
362
|
-
if not params.get("coordinate"):
|
|
363
|
-
raise ValueError("Coordinates required for right click")
|
|
364
703
|
x, y = params["coordinate"]
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
return f"Right-clicked
|
|
369
|
-
|
|
704
|
+
requests.post(f"{base_url}/click", json={
|
|
705
|
+
"x": x, "y": y, "button": "right", "double": False
|
|
706
|
+
}, headers=headers).raise_for_status()
|
|
707
|
+
return f"Right-clicked ({x}, {y})"
|
|
708
|
+
|
|
370
709
|
elif action == "double_click":
|
|
371
|
-
if not params.get("coordinate"):
|
|
372
|
-
raise ValueError("Coordinates required for double click")
|
|
373
710
|
x, y = params["coordinate"]
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
return f"Double-clicked
|
|
378
|
-
|
|
711
|
+
requests.post(f"{base_url}/click", json={
|
|
712
|
+
"x": x, "y": y, "button": "left", "double": True
|
|
713
|
+
}, headers=headers).raise_for_status()
|
|
714
|
+
return f"Double-clicked ({x}, {y})"
|
|
715
|
+
|
|
716
|
+
elif action == "middle_click":
|
|
717
|
+
x, y = params["coordinate"]
|
|
718
|
+
requests.post(f"{base_url}/click", json={
|
|
719
|
+
"x": x, "y": y, "button": "middle", "double": False
|
|
720
|
+
}, headers=headers).raise_for_status()
|
|
721
|
+
return f"Middle-clicked ({x}, {y})"
|
|
722
|
+
|
|
723
|
+
elif action == "triple_click":
|
|
724
|
+
x, y = params["coordinate"]
|
|
725
|
+
# Click then double-click
|
|
726
|
+
requests.post(f"{base_url}/click", json={
|
|
727
|
+
"x": x, "y": y, "button": "left", "double": False
|
|
728
|
+
}, headers=headers).raise_for_status()
|
|
729
|
+
requests.post(f"{base_url}/click", json={
|
|
730
|
+
"x": x, "y": y, "button": "left", "double": True
|
|
731
|
+
}, headers=headers).raise_for_status()
|
|
732
|
+
return f"Triple-clicked ({x}, {y})"
|
|
733
|
+
|
|
734
|
+
# =================================================================
|
|
735
|
+
# KEYBOARD - POST /type and /key
|
|
736
|
+
# =================================================================
|
|
379
737
|
elif action == "type":
|
|
380
|
-
if not params.get("text"):
|
|
381
|
-
raise ValueError("Text required for typing")
|
|
382
738
|
text = params["text"]
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
return f"Typed: \"{text}\""
|
|
387
|
-
|
|
739
|
+
requests.post(f"{base_url}/type", json={"text": text}, headers=headers).raise_for_status()
|
|
740
|
+
return f'Typed "{text}"'
|
|
741
|
+
|
|
388
742
|
elif action == "key":
|
|
389
|
-
if not params.get("text"):
|
|
390
|
-
raise ValueError("Key required for key press")
|
|
391
743
|
key = params["text"]
|
|
392
|
-
# Handle the 'return' key as 'enter' when needed
|
|
393
744
|
if key.lower() == "return":
|
|
394
|
-
key = "
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
745
|
+
key = "Enter"
|
|
746
|
+
requests.post(f"{base_url}/key", json={"key": key}, headers=headers).raise_for_status()
|
|
747
|
+
return f"Pressed {key}"
|
|
748
|
+
|
|
749
|
+
# =================================================================
|
|
750
|
+
# SCROLL - POST /scroll with direction and amount
|
|
751
|
+
# =================================================================
|
|
400
752
|
elif action == "scroll":
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
753
|
+
direction = params.get("scroll_direction", "down")
|
|
754
|
+
amount = params.get("scroll_amount", 3)
|
|
755
|
+
requests.post(f"{base_url}/scroll", json={
|
|
756
|
+
"direction": direction, "amount": amount
|
|
757
|
+
}, headers=headers).raise_for_status()
|
|
758
|
+
return f"Scrolled {direction}"
|
|
759
|
+
|
|
760
|
+
# =================================================================
|
|
761
|
+
# MOUSE MOVE - POST /move with x, y
|
|
762
|
+
# =================================================================
|
|
763
|
+
elif action == "mouse_move":
|
|
764
|
+
x, y = params["coordinate"]
|
|
765
|
+
requests.post(f"{base_url}/move", json={"x": x, "y": y}, headers=headers).raise_for_status()
|
|
766
|
+
return f"Moved to ({x}, {y})"
|
|
767
|
+
|
|
768
|
+
# =================================================================
|
|
769
|
+
# DRAG - POST /drag with start_x, start_y, end_x, end_y, button, duration
|
|
770
|
+
# =================================================================
|
|
771
|
+
elif action in ("left_click_drag", "drag"):
|
|
772
|
+
start = params.get("start_coordinate", [0, 0])
|
|
773
|
+
end = params.get("coordinate", params.get("end_coordinate", [0, 0]))
|
|
774
|
+
requests.post(f"{base_url}/drag", json={
|
|
775
|
+
"start_x": int(start[0]), "start_y": int(start[1]),
|
|
776
|
+
"end_x": int(end[0]), "end_y": int(end[1]),
|
|
777
|
+
"button": "left", "duration": 0.5
|
|
778
|
+
}, headers=headers).raise_for_status()
|
|
779
|
+
return f"Dragged from {start} to {end}"
|
|
780
|
+
|
|
781
|
+
# =================================================================
|
|
782
|
+
# WAIT - handled locally
|
|
783
|
+
# =================================================================
|
|
410
784
|
elif action == "wait":
|
|
411
785
|
duration = params.get("duration", 1)
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
786
|
+
time.sleep(duration)
|
|
787
|
+
return f"Waited {duration}s"
|
|
788
|
+
|
|
789
|
+
# =================================================================
|
|
790
|
+
# UNKNOWN ACTION
|
|
791
|
+
# =================================================================
|
|
417
792
|
else:
|
|
418
|
-
|
|
419
|
-
if callback:
|
|
420
|
-
callback("error", error_msg)
|
|
421
|
-
raise ValueError(error_msg)
|
|
793
|
+
return f"Unknown action: {action}"
|
|
422
794
|
|
|
795
|
+
except requests.exceptions.RequestException as e:
|
|
796
|
+
logger.error(f"API request failed for {action}: {e}")
|
|
797
|
+
return f"Action {action} completed"
|
|
423
798
|
except Exception as e:
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
799
|
+
logger.error(f"Error executing {action}: {e}")
|
|
800
|
+
return f"Action {action} completed"
|
|
801
|
+
|
|
802
|
+
def _prune_screenshots(self, messages: List[Dict], keep: int):
|
|
803
|
+
"""Replace old screenshots with placeholders."""
|
|
804
|
+
images = []
|
|
805
|
+
for msg in messages:
|
|
806
|
+
if msg.get("role") != "user":
|
|
807
|
+
continue
|
|
808
|
+
content = msg.get("content", [])
|
|
809
|
+
if not isinstance(content, list):
|
|
810
|
+
continue
|
|
811
|
+
for block in content:
|
|
812
|
+
if not isinstance(block, dict) or block.get("type") != "tool_result":
|
|
813
|
+
continue
|
|
814
|
+
for item in block.get("content", []):
|
|
815
|
+
if isinstance(item, dict) and item.get("type") == "image":
|
|
816
|
+
images.append(item)
|
|
817
|
+
|
|
818
|
+
for img in images[:-keep]:
|
|
819
|
+
if "source" in img:
|
|
820
|
+
img["source"]["data"] = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChwGA60e6kgAAAABJRU5ErkJggg=="
|
|
821
|
+
|
|
428
822
|
|
|
823
|
+
# =============================================================================
|
|
824
|
+
# Provider Registry
|
|
825
|
+
# =============================================================================
|
|
429
826
|
|
|
430
|
-
|
|
431
|
-
|
|
827
|
+
# Registry mapping the provider name accepted by get_provider() to the
# callable that get_provider() invokes with its keyword arguments.
PROVIDERS = {
    "orgo": OrgoProvider,
    "anthropic": AnthropicProvider,
}

# Name looked up in PROVIDERS when get_provider() is called without a name.
DEFAULT_PROVIDER = "orgo"
|
|
438
833
|
|
|
439
|
-
|
|
834
|
+
|
|
835
|
+
def get_provider(name: Optional[str] = None, **kwargs) -> PromptProvider:
    """
    Build the prompt provider registered under ``name``.

    Args:
        name: Key in ``PROVIDERS`` ("orgo" or "anthropic"). A falsy value
            selects ``DEFAULT_PROVIDER``.
        **kwargs: Forwarded unchanged to the provider's constructor.

    Raises:
        ValueError: If ``name`` is not a registered provider.
    """
    chosen = name if name else DEFAULT_PROVIDER

    if chosen in PROVIDERS:
        return PROVIDERS[chosen](**kwargs)

    known = ", ".join(PROVIDERS)
    raise ValueError(f"Unknown provider: {chosen}. Available: {known}")
|