orgo 0.0.40__py3-none-any.whl → 0.0.41__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- orgo/__init__.py +12 -11
- orgo/api/__init__.py +5 -5
- orgo/api/client.py +227 -227
- orgo/computer.py +491 -471
- orgo/forge.py +176 -176
- orgo/project.py +86 -86
- orgo/prompt.py +1015 -1015
- orgo/template.py +347 -0
- orgo/utils/__init__.py +5 -5
- orgo/utils/auth.py +16 -16
- {orgo-0.0.40.dist-info → orgo-0.0.41.dist-info}/METADATA +47 -47
- orgo-0.0.41.dist-info/RECORD +14 -0
- orgo-0.0.40.dist-info/RECORD +0 -13
- {orgo-0.0.40.dist-info → orgo-0.0.41.dist-info}/WHEEL +0 -0
- {orgo-0.0.40.dist-info → orgo-0.0.41.dist-info}/top_level.txt +0 -0
orgo/prompt.py
CHANGED
|
@@ -1,1016 +1,1016 @@
|
|
|
1
|
-
# src/orgo/prompt.py
|
|
2
|
-
"""
|
|
3
|
-
Orgo Prompt Module - AI-powered computer control.
|
|
4
|
-
|
|
5
|
-
Usage:
|
|
6
|
-
computer.prompt("Open Firefox") # Uses Orgo (default)
|
|
7
|
-
computer.prompt("Open Firefox", provider="anthropic") # Uses Anthropic directly
|
|
8
|
-
"""
|
|
9
|
-
|
|
10
|
-
import os
|
|
11
|
-
import sys
|
|
12
|
-
import json
|
|
13
|
-
import base64
|
|
14
|
-
import time
|
|
15
|
-
import logging
|
|
16
|
-
from datetime import datetime
|
|
17
|
-
from typing import Any, Callable, Dict, List, Optional, Protocol
|
|
18
|
-
|
|
19
|
-
import anthropic
|
|
20
|
-
import websocket
|
|
21
|
-
import requests
|
|
22
|
-
|
|
23
|
-
logger = logging.getLogger(__name__)
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
# =============================================================================
|
|
27
|
-
# Console Output
|
|
28
|
-
# =============================================================================
|
|
29
|
-
|
|
30
|
-
class Colors:
    """ANSI escape sequences used to style terminal output."""

    # --- text attributes ---
    RESET = "\033[0m"  # clears every attribute set before it
    BOLD = "\033[1m"
    DIM = "\033[2m"

    # --- foreground colors ---
    CYAN = "\033[36m"
    GREEN = "\033[32m"
    YELLOW = "\033[33m"
    RED = "\033[31m"
    MAGENTA = "\033[35m"
    BLUE = "\033[34m"
    WHITE = "\033[37m"
    GRAY = "\033[90m"
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
def supports_color() -> bool:
    """Decide whether ANSI colors should be written to stdout.

    A non-empty ``NO_COLOR`` environment variable disables color, and is
    checked before ``FORCE_COLOR``, which enables it when non-empty.
    Otherwise color is used only when ``sys.stdout`` reports being a TTY.
    """
    env = os.environ
    if env.get("NO_COLOR"):
        return False
    if env.get("FORCE_COLOR"):
        return True

    stream = sys.stdout
    # Replacement stdout objects may lack isatty(); treat those as "no color".
    return hasattr(stream, "isatty") and stream.isatty()
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
class Console:
    """Human-readable progress output for the Orgo SDK.

    All methods print to stdout. Every method except ``error`` is a no-op
    when ``verbose`` is false; ``error`` always prints.
    """

    # Rows of the ASCII-art logo. They are kept outside any f-string on
    # purpose: a backslash inside an f-string replacement field is a
    # SyntaxError before Python 3.12.
    # NOTE(review): spacing is reproduced as found - confirm the art lines up.
    _LOGO_ROWS = (
        '___ _ __ __ _ ___',
        "/ _ \\| '__/ _` |/ _ \\",
        '| (_) | | | (_| | (_) |',
        "\\___/|_| \\__, |\\___/",
        '|___/',
    )

    def __init__(self, verbose: bool = True):
        """Create a console.

        Args:
            verbose: When false, only ``error`` produces output.
        """
        self.verbose = verbose
        self.use_color = supports_color()
        self.start_time = None  # set by banner(), read by success()

    def _c(self, color: str, text: str) -> str:
        """Apply color if supported."""
        if self.use_color:
            return f"{color}{text}{Colors.RESET}"
        return text

    def _timestamp(self) -> str:
        """Return the current local time as a dimmed ``HH:MM:SS`` string."""
        return self._c(Colors.DIM, datetime.now().strftime("%H:%M:%S"))

    def banner(self, computer_id: str):
        """Print the Orgo logo and the session link, and start the run timer."""
        if not self.verbose:
            return

        self.start_time = time.time()

        rows = "\n".join(self._c(Colors.CYAN, row) for row in self._LOGO_ROWS)
        print(f"\n{rows}\n")

        label = self._c(Colors.DIM, 'Watch:')
        link = self._c(Colors.CYAN, f'https://orgo.ai/workspaces/{computer_id}')
        print(f" {label} {link}")
        print()

    def status(self, message: str):
        """Print status update."""
        if not self.verbose:
            return
        marker = self._c(Colors.CYAN, '●')
        print(f" {self._timestamp()} {marker} {message}")

    def action(self, action: str, details: str = ""):
        """Print the action being taken, with optional dimmed details."""
        if not self.verbose:
            return
        timestamp = self._timestamp()
        marker = self._c(Colors.YELLOW, '▸')
        action_str = self._c(Colors.YELLOW, action)
        details_str = self._c(Colors.DIM, details) if details else ""
        print(f" {timestamp} {marker} {action_str} {details_str}")

    def thinking(self, preview: str = ""):
        """Print a thinking indicator, followed by up to 60 chars of preview."""
        if not self.verbose:
            return
        timestamp = self._timestamp()
        preview_str = self._c(Colors.DIM, f" {preview[:60]}...") if preview else ""
        marker = self._c(Colors.MAGENTA, '◐')
        label = self._c(Colors.MAGENTA, 'Thinking')
        print(f" {timestamp} {marker} {label}{preview_str}")

    def text(self, content: str):
        """Print assistant text, truncated to 100 characters plus an ellipsis."""
        if not self.verbose:
            return
        timestamp = self._timestamp()
        if len(content) > 100:
            content = content[:100] + "..."
        marker = self._c(Colors.GREEN, '◀')
        print(f" {timestamp} {marker} {content}")

    def error(self, message: str):
        """Print an error message. Unlike the other methods, ignores ``verbose``."""
        marker = self._c(Colors.RED, '✗')
        print(f" {self._timestamp()} {marker} {self._c(Colors.RED, message)}")

    def retry(self, attempt: int, max_attempts: int, delay: float):
        """Print retry message."""
        if not self.verbose:
            return
        marker = self._c(Colors.YELLOW, '↻')
        print(f" {self._timestamp()} {marker} Retry {attempt}/{max_attempts} in {delay:.1f}s")

    def success(self, iterations: int = 0):
        """Print the completion line.

        The elapsed time is included only if ``banner`` started the timer;
        the iteration count is included only when it is non-zero.
        """
        if not self.verbose:
            return

        elapsed = ""
        if self.start_time:
            seconds = time.time() - self.start_time
            elapsed = f" in {seconds:.1f}s"

        iter_str = f" ({iterations} iterations)" if iterations else ""
        check = self._c(Colors.GREEN, '✓')
        done = self._c(Colors.GREEN, 'Done')
        print()
        print(f" {check} {done}{iter_str}{self._c(Colors.DIM, elapsed)}")
        print()
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
# =============================================================================
|
|
149
|
-
# Exceptions
|
|
150
|
-
# =============================================================================
|
|
151
|
-
|
|
152
|
-
class ScreenshotError(Exception):
    """Signals that a screenshot could not be captured."""
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
class TransientVisionError(Exception):
    """Signals a temporary failure of Claude's vision API."""
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
# =============================================================================
|
|
163
|
-
# System Prompt
|
|
164
|
-
# =============================================================================
|
|
165
|
-
|
|
166
|
-
def get_system_prompt(
    display_width: int = 1024,
    display_height: int = 768,
    custom_prompt: Optional[str] = None
) -> str:
    """Assemble the system prompt for Claude computer use.

    Args:
        display_width: Screen width in pixels, used for the coordinate hints.
        display_height: Screen height in pixels, used for the coordinate hints.
        custom_prompt: Optional caller instructions. When truthy they are
            placed in a ``<USER_INSTRUCTIONS>`` section ahead of the base
            prompt.

    Returns:
        The complete prompt text.
    """
    # Reference points substituted into the drag / snap instructions below.
    mid_x, mid_y = display_width // 2, display_height // 2
    max_x, max_y = display_width - 1, display_height - 1

    base_prompt = f"""You control a Linux desktop ({display_width}x{display_height}). Be efficient - complete tasks in minimal steps.

<ACTIONS>
screenshot - See current screen state
left_click - Single click. Params: coordinate [x, y]
double_click - Double click. Params: coordinate [x, y]
right_click - Right click. Params: coordinate [x, y]
type - Type text. Params: text "string"
key - Press key. Params: text "Enter", "Tab", "ctrl+c", etc.
scroll - Scroll. Params: scroll_direction "up"|"down", scroll_amount 3
wait - Pause. Params: duration (seconds, e.g. 5)
mouse_move - Move cursor. Params: coordinate [x, y]
left_click_drag - Drag operation. Params: start_coordinate [x, y], coordinate [x, y]
</ACTIONS>

<CLICK_RULES>
DOUBLE_CLICK for:
- Desktop icons (to open apps)
- Files/folders in file manager

LEFT_CLICK for everything else:
- Buttons, links, menus
- Taskbar icons
- Input fields (to focus before typing)
- Window controls (close/minimize)

COMMON MISTAKES:
- left_click on desktop icon = only selects, doesn't open (use double_click)
- double_click on button = wrong (use left_click)
</CLICK_RULES>

<WINDOW_DRAGGING_CRITICAL>
WHEN DRAGGING WINDOWS - GRAB THE TITLE BAR CORRECTLY:

CORRECT - grab the EMPTY SPACE in the title bar:
✓ Center-top of window (middle of title bar, away from buttons/tabs)
✓ For browser: grab between tabs and buttons (empty title bar area)
✓ For app with tabs: grab the title bar ABOVE tabs
✓ Safe zone: horizontal center, ~20-30px from top edge

WRONG - avoid these areas:
✗ Close/minimize/maximize buttons (top-right corner)
✗ Browser tabs (will switch tabs instead of moving window)
✗ Window icon or menu (top-left corner)
✗ Any buttons or controls in title bar

VISUAL GUIDE - where to grab:
[X] [Icon] [___GRAB_HERE___] [- □ X]
↑ empty title bar area

For browser window:
[Tab1] [Tab2] [___GRAB_HERE___] [+ - □ X]
↑ empty space between tabs and controls

COORDINATES FOR DRAGGING:
Start coordinate = [{mid_x}, 20] (center-top, in title bar)
NOT [window_right - 20, 20] (too close to close button)
NOT [40, 20] (too close to icon/menu)
</WINDOW_DRAGGING_CRITICAL>

<WINDOW_SNAPPING>
Drag window title bar to these exact coordinates to snap:

HALF SCREEN:
- Left half: drag to [1, {mid_y}]
- Right half: drag to [{max_x}, {mid_y}]

QUARTER SCREEN:
- Top-left: drag to [1, 1]
- Top-right: drag to [{max_x}, 1]
- Bottom-left: drag to [1, {max_y}]
- Bottom-right: drag to [{max_x}, {max_y}]

MAXIMIZE:
- Full screen: drag to [{mid_x}, 1]

COMPLETE EXAMPLE - snap Chrome to left half:
1. Identify window center-top coordinate: [{mid_x}, 20]
2. Execute: left_click_drag start_coordinate [{mid_x}, 20], coordinate [1, {mid_y}]
3. Window snaps to left half of screen

SPLIT SCREEN WORKFLOW:
1. Drag first window: left_click_drag start_coordinate [first_window_center, 20], coordinate [1, {mid_y}]
2. Wait 1 second
3. Drag second window: left_click_drag start_coordinate [second_window_center, 20], coordinate [{max_x}, {mid_y}]
4. Both windows now side-by-side

CRITICAL: Always use the CENTER of the title bar as start_coordinate, never the edges!
</WINDOW_SNAPPING>

<WAIT_TIMES>
After opening app from DESKTOP icon: wait 10 seconds
After opening app from TASKBAR: wait 5 seconds
After loading web page: wait 3 seconds
After clicking button: wait 1 second
After dragging window: wait 1 second
After typing: no wait needed
</WAIT_TIMES>

<WORKFLOW>
1. Screenshot once at start to see current state
2. Execute actions - no screenshot between quick actions
3. Screenshot after waits to verify result
4. Don't screenshot redundantly

PATTERNS:

Open app from desktop:
screenshot → double_click icon → wait 10 → screenshot

Open app from taskbar:
screenshot → left_click taskbar → wait 5 → screenshot

Web search:
left_click search bar → type "query" → key "Enter" → wait 3 → screenshot

Snap window to left:
screenshot → left_click_drag start_coordinate [{mid_x}, 20], coordinate [1, {mid_y}] → wait 1 → screenshot
</WORKFLOW>

<KEY_NAMES>
Enter (not Return), Tab, Escape, Backspace, Delete
Combos: ctrl+c, ctrl+v, ctrl+s, alt+Tab, alt+F4, super+Left
</KEY_NAMES>

<COORDINATES>
Origin (0,0) = top-left
X increases rightward, Y increases downward
Always click CENTER of elements
Screen: {display_width}x{display_height}
Valid: x from 1 to {max_x}, y from 1 to {max_y}

TITLE BAR SAFETY:
- Horizontal: use center ({mid_x}) or ±200px from center
- Vertical: ~20px from top (in title bar, not too close to edge)
- NEVER use far right (close to X button)
- NEVER use far left (close to icon/menu)
</COORDINATES>

<EFFICIENCY>
- One screenshot to start, then only after waits
- Batch actions without screenshots between
- Don't re-verify actions that succeeded
- After 2 failed attempts, try alternative approach
- When dragging windows, always grab the safe center-top area
</EFFICIENCY>"""

    if not custom_prompt:
        return base_prompt

    # Caller instructions go first, separated from the base prompt by a blank line.
    return (
        "<USER_INSTRUCTIONS>\n"
        f"{custom_prompt}\n"
        "</USER_INSTRUCTIONS>\n"
        "\n"
        f"{base_prompt}"
    )
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
# =============================================================================
|
|
336
|
-
# Provider Protocol
|
|
337
|
-
# =============================================================================
|
|
338
|
-
|
|
339
|
-
class PromptProvider(Protocol):
    """Structural interface that every prompt execution provider satisfies."""

    def execute(
        self,
        computer_id: str,
        instruction: str,
        callback: Optional[Callable[[str, Any], None]] = None,
        verbose: bool = True,
        **kwargs
    ) -> List[Dict[str, Any]]:
        """Run ``instruction`` against ``computer_id`` and return a list of message dicts."""
        ...
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
# =============================================================================
|
|
354
|
-
# Orgo Provider (Default)
|
|
355
|
-
# =============================================================================
|
|
356
|
-
|
|
357
|
-
class OrgoProvider:
    """
    Execute prompts via Orgo's hosted agent.

    Benefits:
    - No Anthropic API key needed
    - Optimized infrastructure
    - Real-time streaming
    - Watch live at orgo.ai/workspaces/{computer_id}
    """

    def __init__(self, agent_url: str = "wss://agent.orgo.ai"):
        """Remember the agent's WebSocket base URL (trailing slashes removed)."""
        self.agent_url = agent_url.rstrip("/")

    @staticmethod
    def _show_tool_use(console: "Console", event_data: Any) -> None:
        """Print a one-line summary of a ``tool_use`` event."""
        if isinstance(event_data, dict):
            action = event_data.get("action", "unknown")
            params = event_data.get("params", {})
        else:
            action, params = str(event_data), {}

        if action in ("left_click", "right_click", "double_click"):
            coord = params.get("coordinate", [0, 0])
            console.action(action, f"({coord[0]}, {coord[1]})")
        elif action == "type":
            text = params.get("text", "")[:30]
            console.action("type", f'"{text}"')
        elif action == "key":
            console.action("key", params.get("text", ""))
        elif action == "scroll":
            console.action("scroll", params.get("scroll_direction", ""))
        elif action == "wait":
            console.action("wait", f"{params.get('duration', 1)}s")
        else:
            # Covers "screenshot" and any action without a dedicated format.
            console.action(action)

    def execute(
        self,
        computer_id: str,
        instruction: str,
        callback: Optional[Callable[[str, Any], None]] = None,
        verbose: bool = True,
        orgo_api_key: Optional[str] = None,
        system_prompt: Optional[str] = None,
        **kwargs
    ) -> List[Dict[str, Any]]:
        """Execute prompt via Orgo's hosted agent.

        Opens a WebSocket to ``{agent_url}/ws/prompt``, sends one ``start``
        message carrying the run configuration, and processes events until
        the connection closes.

        Args:
            computer_id: Target computer.
            instruction: Task for the agent.
            callback: Called as ``callback(event_type, event_data)`` for
                every successfully parsed event.
            verbose: Enables console progress output.
            orgo_api_key: API key; falls back to the ``ORGO_API_KEY``
                environment variable.
            system_prompt: Sent as ``system_prompt`` in the config when truthy.
            **kwargs: Optional overrides for ``model``, ``display_width``,
                ``display_height``, ``thinking_enabled``, ``thinking_budget``,
                ``max_tokens`` and ``max_iterations``.

        Returns:
            The ``messages`` list from the agent's ``result`` event.

        Raises:
            ValueError: No API key was supplied or found in the environment.
            RuntimeError: The agent or the connection reported a failure.
        """
        token = orgo_api_key or os.environ.get("ORGO_API_KEY")
        if not token:
            raise ValueError(
                "ORGO_API_KEY required.\n"
                "Set it with: export ORGO_API_KEY=your_key\n"
                "Get your key at: https://orgo.ai/settings/api"
            )

        console = Console(verbose=verbose)
        console.banner(computer_id)
        shown = instruction[:60] + ('...' if len(instruction) > 60 else '')
        console.status(f'Prompt: "{shown}"')

        ws_url = f"{self.agent_url}/ws/prompt?token={token}"

        config = {
            "computer_id": computer_id,
            "instruction": instruction,
            "model": kwargs.get("model", "claude-sonnet-4-5-20250929"),
            "display_width": kwargs.get("display_width", 1024),
            "display_height": kwargs.get("display_height", 768),
            "thinking_enabled": kwargs.get("thinking_enabled", True),
            "thinking_budget": kwargs.get("thinking_budget", 1024),
            "max_tokens": kwargs.get("max_tokens", 4096),
            "max_iterations": kwargs.get("max_iterations", 100),
        }

        if system_prompt:
            config["system_prompt"] = system_prompt

        # Shared mutable state written by the WebSocket callbacks below.
        result = {"messages": [], "error": None, "iterations": 0}

        def on_message(ws, message):
            try:
                data = json.loads(message)
            except json.JSONDecodeError as e:
                logger.error(f"Parse error: {e}")
                return

            event_type = data.get("type")
            event_data = data.get("data")

            if event_type == "result":
                # Tolerate a missing or malformed payload instead of raising
                # AttributeError inside the callback.
                payload = event_data if isinstance(event_data, dict) else {}
                result["messages"] = payload.get("messages", [])
                result["iterations"] = payload.get("iterations", 0)
                if not payload.get("success"):
                    # A failure without a message must still count as a
                    # failure, otherwise the run is reported as "Done".
                    result["error"] = payload.get("error") or "Prompt execution failed"
                ws.close()

            elif event_type == "error":
                console.error(str(event_data))
                # Same reasoning: an empty error payload is still an error.
                result["error"] = event_data or "Unknown agent error"
                ws.close()

            elif event_type == "status":
                console.status(str(event_data))

            elif event_type == "thinking":
                preview = str(event_data)[:60] if event_data else ""
                console.thinking(preview)

            elif event_type == "text":
                console.text(str(event_data))

            elif event_type == "tool_use":
                self._show_tool_use(console, event_data)

            elif event_type == "iteration":
                result["iterations"] = event_data

            # "pong" and unrecognised events need no handling of their own.

            if callback:
                callback(event_type, event_data)

        def on_error(ws, error):
            console.error(str(error))
            result["error"] = str(error)

        def on_open(ws):
            ws.send(json.dumps({"type": "start", "config": config}))

        def on_close(ws, close_status_code, close_msg):
            if not result["error"]:
                console.success(result["iterations"])

        ws = websocket.WebSocketApp(
            ws_url,
            on_message=on_message,
            on_error=on_error,
            on_open=on_open,
            on_close=on_close,
        )

        ws.run_forever()

        if result["error"]:
            raise RuntimeError(result["error"])

        return result["messages"]
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
# =============================================================================
|
|
503
|
-
# Anthropic Provider (Direct API)
|
|
504
|
-
# =============================================================================
|
|
505
|
-
|
|
506
|
-
class AnthropicProvider:
|
|
507
|
-
"""
|
|
508
|
-
Execute prompts directly with Anthropic API.
|
|
509
|
-
|
|
510
|
-
Requires ANTHROPIC_API_KEY environment variable.
|
|
511
|
-
"""
|
|
512
|
-
|
|
513
|
-
def execute(
|
|
514
|
-
self,
|
|
515
|
-
computer_id: str,
|
|
516
|
-
instruction: str,
|
|
517
|
-
callback: Optional[Callable[[str, Any], None]] = None,
|
|
518
|
-
verbose: bool = True,
|
|
519
|
-
api_key: Optional[str] = None,
|
|
520
|
-
orgo_api_key: Optional[str] = None,
|
|
521
|
-
orgo_base_url: Optional[str] = None,
|
|
522
|
-
system_prompt: Optional[str] = None,
|
|
523
|
-
**kwargs
|
|
524
|
-
) -> List[Dict[str, Any]]:
|
|
525
|
-
"""Execute prompt locally with Anthropic API."""
|
|
526
|
-
|
|
527
|
-
anthropic_key = api_key or os.environ.get("ANTHROPIC_API_KEY")
|
|
528
|
-
if not anthropic_key:
|
|
529
|
-
raise ValueError(
|
|
530
|
-
"ANTHROPIC_API_KEY required for provider='anthropic'.\n"
|
|
531
|
-
"Set it with: export ANTHROPIC_API_KEY=your_key\n"
|
|
532
|
-
"Get your key at: https://console.anthropic.com/"
|
|
533
|
-
)
|
|
534
|
-
|
|
535
|
-
orgo_key = orgo_api_key or os.environ.get("ORGO_API_KEY")
|
|
536
|
-
if not orgo_key:
|
|
537
|
-
raise ValueError(
|
|
538
|
-
"ORGO_API_KEY required.\n"
|
|
539
|
-
"Set it with: export ORGO_API_KEY=your_key"
|
|
540
|
-
)
|
|
541
|
-
|
|
542
|
-
# Base URL for Orgo API (no /api suffix - added per endpoint)
|
|
543
|
-
orgo_url = (orgo_base_url or "https://orgo.ai").rstrip("/")
|
|
544
|
-
|
|
545
|
-
console = Console(verbose=verbose)
|
|
546
|
-
console.banner(computer_id)
|
|
547
|
-
console.status("Provider: Anthropic")
|
|
548
|
-
console.status(f"Prompt: \"{instruction[:60]}{'...' if len(instruction) > 60 else ''}\"")
|
|
549
|
-
|
|
550
|
-
# Config
|
|
551
|
-
model = kwargs.get("model", "claude-sonnet-4-5-20250929")
|
|
552
|
-
display_width = kwargs.get("display_width", 1024)
|
|
553
|
-
display_height = kwargs.get("display_height", 768)
|
|
554
|
-
max_iterations = kwargs.get("max_iterations", 100)
|
|
555
|
-
max_tokens = kwargs.get("max_tokens", 4096)
|
|
556
|
-
thinking_enabled = kwargs.get("thinking_enabled", True)
|
|
557
|
-
thinking_budget = kwargs.get("thinking_budget", 1024)
|
|
558
|
-
max_saved_screenshots = kwargs.get("max_saved_screenshots", 3)
|
|
559
|
-
screenshot_retry_attempts = kwargs.get("screenshot_retry_attempts", 3)
|
|
560
|
-
screenshot_retry_delay = kwargs.get("screenshot_retry_delay", 2.0)
|
|
561
|
-
|
|
562
|
-
# System prompt
|
|
563
|
-
full_system_prompt = get_system_prompt(display_width, display_height, system_prompt)
|
|
564
|
-
|
|
565
|
-
# Initialize
|
|
566
|
-
client = anthropic.Anthropic(api_key=anthropic_key)
|
|
567
|
-
messages = [{"role": "user", "content": instruction}]
|
|
568
|
-
|
|
569
|
-
tools = [{
|
|
570
|
-
"type": "computer_20250124",
|
|
571
|
-
"name": "computer",
|
|
572
|
-
"display_width_px": display_width,
|
|
573
|
-
"display_height_px": display_height,
|
|
574
|
-
"display_number": 1
|
|
575
|
-
}]
|
|
576
|
-
|
|
577
|
-
iteration = 0
|
|
578
|
-
screenshot_count = 0
|
|
579
|
-
|
|
580
|
-
while iteration < max_iterations:
|
|
581
|
-
iteration += 1
|
|
582
|
-
|
|
583
|
-
if verbose:
|
|
584
|
-
console.status(f"Iteration {iteration}")
|
|
585
|
-
|
|
586
|
-
# Prune old screenshots
|
|
587
|
-
if screenshot_count > max_saved_screenshots:
|
|
588
|
-
self._prune_screenshots(messages, max_saved_screenshots)
|
|
589
|
-
screenshot_count = max_saved_screenshots
|
|
590
|
-
|
|
591
|
-
# Build request
|
|
592
|
-
request_params = {
|
|
593
|
-
"model": model,
|
|
594
|
-
"max_tokens": max_tokens,
|
|
595
|
-
"system": full_system_prompt,
|
|
596
|
-
"messages": messages,
|
|
597
|
-
"tools": tools,
|
|
598
|
-
"betas": ["computer-use-2025-01-24"],
|
|
599
|
-
}
|
|
600
|
-
|
|
601
|
-
if thinking_enabled:
|
|
602
|
-
request_params["thinking"] = {
|
|
603
|
-
"type": "enabled",
|
|
604
|
-
"budget_tokens": thinking_budget
|
|
605
|
-
}
|
|
606
|
-
|
|
607
|
-
# Call Claude with retry logic
|
|
608
|
-
response = self._call_claude_with_retry(
|
|
609
|
-
client=client,
|
|
610
|
-
request_params=request_params,
|
|
611
|
-
messages=messages,
|
|
612
|
-
console=console,
|
|
613
|
-
max_retries=screenshot_retry_attempts,
|
|
614
|
-
retry_delay=screenshot_retry_delay
|
|
615
|
-
)
|
|
616
|
-
|
|
617
|
-
response_content = response.content
|
|
618
|
-
messages.append({"role": "assistant", "content": response_content})
|
|
619
|
-
|
|
620
|
-
# Process response content
|
|
621
|
-
for block in response_content:
|
|
622
|
-
if block.type == "text":
|
|
623
|
-
console.text(block.text)
|
|
624
|
-
if callback:
|
|
625
|
-
callback("text", block.text)
|
|
626
|
-
elif block.type == "thinking":
|
|
627
|
-
console.thinking(block.thinking[:60] if block.thinking else "")
|
|
628
|
-
if callback:
|
|
629
|
-
callback("thinking", block.thinking)
|
|
630
|
-
elif block.type == "tool_use":
|
|
631
|
-
action = block.input.get("action", "unknown")
|
|
632
|
-
|
|
633
|
-
if action == "screenshot":
|
|
634
|
-
console.action("screenshot")
|
|
635
|
-
elif action in ["left_click", "right_click", "double_click"]:
|
|
636
|
-
coord = block.input.get("coordinate", [0, 0])
|
|
637
|
-
console.action(action, f"({coord[0]}, {coord[1]})")
|
|
638
|
-
elif action == "type":
|
|
639
|
-
text = block.input.get("text", "")[:30]
|
|
640
|
-
console.action("type", f'"{text}"')
|
|
641
|
-
elif action == "key":
|
|
642
|
-
console.action("key", block.input.get("text", ""))
|
|
643
|
-
elif action == "scroll":
|
|
644
|
-
console.action("scroll", block.input.get("scroll_direction", ""))
|
|
645
|
-
elif action == "wait":
|
|
646
|
-
console.action("wait", f"{block.input.get('duration', 1)}s")
|
|
647
|
-
else:
|
|
648
|
-
console.action(action)
|
|
649
|
-
|
|
650
|
-
if callback:
|
|
651
|
-
callback("tool_use", {"action": action, "params": block.input})
|
|
652
|
-
|
|
653
|
-
# Execute tools with retry logic
|
|
654
|
-
tool_results = []
|
|
655
|
-
for block in response_content:
|
|
656
|
-
if block.type == "tool_use":
|
|
657
|
-
result = self._execute_tool_with_retry(
|
|
658
|
-
computer_id=computer_id,
|
|
659
|
-
params=block.input,
|
|
660
|
-
orgo_key=orgo_key,
|
|
661
|
-
orgo_url=orgo_url,
|
|
662
|
-
console=console,
|
|
663
|
-
callback=callback,
|
|
664
|
-
max_retries=screenshot_retry_attempts,
|
|
665
|
-
retry_delay=screenshot_retry_delay
|
|
666
|
-
)
|
|
667
|
-
|
|
668
|
-
tool_result = {"type": "tool_result", "tool_use_id": block.id}
|
|
669
|
-
|
|
670
|
-
if isinstance(result, dict) and result.get("type") == "image":
|
|
671
|
-
tool_result["content"] = [result]
|
|
672
|
-
if block.input.get("action") == "screenshot":
|
|
673
|
-
screenshot_count += 1
|
|
674
|
-
else:
|
|
675
|
-
tool_result["content"] = [{"type": "text", "text": str(result)}]
|
|
676
|
-
|
|
677
|
-
tool_results.append(tool_result)
|
|
678
|
-
|
|
679
|
-
if not tool_results:
|
|
680
|
-
console.success(iteration)
|
|
681
|
-
return messages
|
|
682
|
-
|
|
683
|
-
messages.append({"role": "user", "content": tool_results})
|
|
684
|
-
|
|
685
|
-
console.success(iteration)
|
|
686
|
-
return messages
|
|
687
|
-
|
|
688
|
-
def _call_claude_with_retry(
|
|
689
|
-
self,
|
|
690
|
-
client: anthropic.Anthropic,
|
|
691
|
-
request_params: Dict[str, Any],
|
|
692
|
-
messages: List[Dict[str, Any]],
|
|
693
|
-
console: Console,
|
|
694
|
-
max_retries: int = 3,
|
|
695
|
-
retry_delay: float = 2.0
|
|
696
|
-
) -> Any:
|
|
697
|
-
"""Call Claude API with exponential backoff retry logic."""
|
|
698
|
-
|
|
699
|
-
last_error = None
|
|
700
|
-
|
|
701
|
-
for attempt in range(max_retries):
|
|
702
|
-
try:
|
|
703
|
-
return client.beta.messages.create(**request_params)
|
|
704
|
-
|
|
705
|
-
except anthropic.BadRequestError as e:
|
|
706
|
-
error_msg = str(e).lower()
|
|
707
|
-
|
|
708
|
-
# Check for vision/image processing errors
|
|
709
|
-
if "image" in error_msg or "vision" in error_msg or "could not process" in error_msg:
|
|
710
|
-
last_error = TransientVisionError(f"Vision API error: {e}")
|
|
711
|
-
|
|
712
|
-
if attempt < max_retries - 1:
|
|
713
|
-
delay = retry_delay * (2 ** attempt) # Exponential backoff: 2s, 4s, 8s
|
|
714
|
-
console.retry(attempt + 1, max_retries, delay)
|
|
715
|
-
time.sleep(delay)
|
|
716
|
-
|
|
717
|
-
# Prune screenshots to reduce payload size
|
|
718
|
-
self._prune_screenshots(messages, 1)
|
|
719
|
-
request_params["messages"] = messages
|
|
720
|
-
continue
|
|
721
|
-
else:
|
|
722
|
-
raise last_error
|
|
723
|
-
|
|
724
|
-
# Check for base64 errors (fallback from old code)
|
|
725
|
-
elif "base64" in error_msg:
|
|
726
|
-
if attempt < max_retries - 1:
|
|
727
|
-
delay = retry_delay * (2 ** attempt)
|
|
728
|
-
console.retry(attempt + 1, max_retries, delay)
|
|
729
|
-
time.sleep(delay)
|
|
730
|
-
|
|
731
|
-
self._prune_screenshots(messages, 1)
|
|
732
|
-
request_params["messages"] = messages
|
|
733
|
-
continue
|
|
734
|
-
else:
|
|
735
|
-
raise
|
|
736
|
-
else:
|
|
737
|
-
# Non-retryable error
|
|
738
|
-
raise
|
|
739
|
-
|
|
740
|
-
except (anthropic.APIConnectionError, anthropic.APITimeoutError) as e:
|
|
741
|
-
# Network errors - retry with backoff
|
|
742
|
-
last_error = e
|
|
743
|
-
|
|
744
|
-
if attempt < max_retries - 1:
|
|
745
|
-
delay = retry_delay * (2 ** attempt)
|
|
746
|
-
console.retry(attempt + 1, max_retries, delay)
|
|
747
|
-
time.sleep(delay)
|
|
748
|
-
continue
|
|
749
|
-
else:
|
|
750
|
-
raise
|
|
751
|
-
|
|
752
|
-
except Exception as e:
|
|
753
|
-
# Unexpected errors - don't retry
|
|
754
|
-
raise
|
|
755
|
-
|
|
756
|
-
# Should never reach here, but just in case
|
|
757
|
-
if last_error:
|
|
758
|
-
raise last_error
|
|
759
|
-
raise RuntimeError("Max retries exceeded")
|
|
760
|
-
|
|
761
|
-
def _execute_tool_with_retry(
|
|
762
|
-
self,
|
|
763
|
-
computer_id: str,
|
|
764
|
-
params: Dict,
|
|
765
|
-
orgo_key: str,
|
|
766
|
-
orgo_url: str,
|
|
767
|
-
console: Console,
|
|
768
|
-
callback: Optional[Callable],
|
|
769
|
-
max_retries: int = 3,
|
|
770
|
-
retry_delay: float = 2.0
|
|
771
|
-
) -> Any:
|
|
772
|
-
"""Execute tool with retry logic for screenshots."""
|
|
773
|
-
|
|
774
|
-
action = params.get("action")
|
|
775
|
-
|
|
776
|
-
# Only retry screenshots, execute other actions directly
|
|
777
|
-
if action != "screenshot":
|
|
778
|
-
return self._execute_tool(computer_id, params, orgo_key, orgo_url, callback)
|
|
779
|
-
|
|
780
|
-
last_error = None
|
|
781
|
-
|
|
782
|
-
for attempt in range(max_retries):
|
|
783
|
-
try:
|
|
784
|
-
return self._execute_tool(computer_id, params, orgo_key, orgo_url, callback)
|
|
785
|
-
|
|
786
|
-
except (ScreenshotError, requests.exceptions.RequestException) as e:
|
|
787
|
-
last_error = e
|
|
788
|
-
|
|
789
|
-
if attempt < max_retries - 1:
|
|
790
|
-
delay = retry_delay * (2 ** attempt) # Exponential backoff
|
|
791
|
-
console.retry(attempt + 1, max_retries, delay)
|
|
792
|
-
time.sleep(delay)
|
|
793
|
-
continue
|
|
794
|
-
else:
|
|
795
|
-
# Return placeholder after all retries exhausted
|
|
796
|
-
logger.error(f"Screenshot failed after {max_retries} attempts: {e}")
|
|
797
|
-
return "Screenshot captured (degraded quality)"
|
|
798
|
-
|
|
799
|
-
except Exception as e:
|
|
800
|
-
# Unexpected errors - don't retry
|
|
801
|
-
raise
|
|
802
|
-
|
|
803
|
-
# Fallback if all retries failed
|
|
804
|
-
if last_error:
|
|
805
|
-
logger.error(f"Screenshot failed: {last_error}")
|
|
806
|
-
return "Screenshot captured (degraded quality)"
|
|
807
|
-
|
|
808
|
-
return "Screenshot captured"
|
|
809
|
-
|
|
810
|
-
def _execute_tool(self, computer_id: str, params: Dict, orgo_key: str, orgo_url: str, callback: Optional[Callable]) -> Any:
|
|
811
|
-
"""Execute a tool action via Orgo API."""
|
|
812
|
-
|
|
813
|
-
action = params.get("action")
|
|
814
|
-
headers = {"Authorization": f"Bearer {orgo_key}", "Content-Type": "application/json"}
|
|
815
|
-
base_url = f"{orgo_url}/api/computers/{computer_id}"
|
|
816
|
-
|
|
817
|
-
try:
|
|
818
|
-
# =================================================================
|
|
819
|
-
# SCREENSHOT - GET request with validation
|
|
820
|
-
# =================================================================
|
|
821
|
-
if action == "screenshot":
|
|
822
|
-
r = requests.get(f"{base_url}/screenshot", headers=headers, timeout=30)
|
|
823
|
-
r.raise_for_status()
|
|
824
|
-
|
|
825
|
-
data = r.json()
|
|
826
|
-
image_url = data.get("image") or data.get("url") or data.get("screenshot")
|
|
827
|
-
|
|
828
|
-
if not image_url:
|
|
829
|
-
logger.error(f"Screenshot API returned no image URL: {data}")
|
|
830
|
-
raise ScreenshotError("No image URL in response")
|
|
831
|
-
|
|
832
|
-
# Fetch the actual image
|
|
833
|
-
img_r = requests.get(image_url, timeout=30)
|
|
834
|
-
img_r.raise_for_status()
|
|
835
|
-
|
|
836
|
-
# Validate image size
|
|
837
|
-
if len(img_r.content) < 100:
|
|
838
|
-
logger.error(f"Screenshot image too small: {len(img_r.content)} bytes")
|
|
839
|
-
raise ScreenshotError(f"Invalid image size: {len(img_r.content)} bytes")
|
|
840
|
-
|
|
841
|
-
# Validate it's actually an image
|
|
842
|
-
if not img_r.headers.get('content-type', '').startswith('image/'):
|
|
843
|
-
logger.error(f"Invalid content type: {img_r.headers.get('content-type')}")
|
|
844
|
-
raise ScreenshotError("Response is not an image")
|
|
845
|
-
|
|
846
|
-
image_b64 = base64.b64encode(img_r.content).decode()
|
|
847
|
-
|
|
848
|
-
return {
|
|
849
|
-
"type": "image",
|
|
850
|
-
"source": {"type": "base64", "media_type": "image/jpeg", "data": image_b64}
|
|
851
|
-
}
|
|
852
|
-
|
|
853
|
-
# =================================================================
|
|
854
|
-
# MOUSE CLICKS - POST /click with x, y, button, double
|
|
855
|
-
# =================================================================
|
|
856
|
-
elif action == "left_click":
|
|
857
|
-
x, y = params["coordinate"]
|
|
858
|
-
requests.post(f"{base_url}/click", json={
|
|
859
|
-
"x": x, "y": y, "button": "left", "double": False
|
|
860
|
-
}, headers=headers).raise_for_status()
|
|
861
|
-
return f"Clicked ({x}, {y})"
|
|
862
|
-
|
|
863
|
-
elif action == "right_click":
|
|
864
|
-
x, y = params["coordinate"]
|
|
865
|
-
requests.post(f"{base_url}/click", json={
|
|
866
|
-
"x": x, "y": y, "button": "right", "double": False
|
|
867
|
-
}, headers=headers).raise_for_status()
|
|
868
|
-
return f"Right-clicked ({x}, {y})"
|
|
869
|
-
|
|
870
|
-
elif action == "double_click":
|
|
871
|
-
x, y = params["coordinate"]
|
|
872
|
-
requests.post(f"{base_url}/click", json={
|
|
873
|
-
"x": x, "y": y, "button": "left", "double": True
|
|
874
|
-
}, headers=headers).raise_for_status()
|
|
875
|
-
return f"Double-clicked ({x}, {y})"
|
|
876
|
-
|
|
877
|
-
elif action == "middle_click":
|
|
878
|
-
x, y = params["coordinate"]
|
|
879
|
-
requests.post(f"{base_url}/click", json={
|
|
880
|
-
"x": x, "y": y, "button": "middle", "double": False
|
|
881
|
-
}, headers=headers).raise_for_status()
|
|
882
|
-
return f"Middle-clicked ({x}, {y})"
|
|
883
|
-
|
|
884
|
-
elif action == "triple_click":
|
|
885
|
-
x, y = params["coordinate"]
|
|
886
|
-
# Click then double-click
|
|
887
|
-
requests.post(f"{base_url}/click", json={
|
|
888
|
-
"x": x, "y": y, "button": "left", "double": False
|
|
889
|
-
}, headers=headers).raise_for_status()
|
|
890
|
-
requests.post(f"{base_url}/click", json={
|
|
891
|
-
"x": x, "y": y, "button": "left", "double": True
|
|
892
|
-
}, headers=headers).raise_for_status()
|
|
893
|
-
return f"Triple-clicked ({x}, {y})"
|
|
894
|
-
|
|
895
|
-
# =================================================================
|
|
896
|
-
# KEYBOARD - POST /type and /key
|
|
897
|
-
# =================================================================
|
|
898
|
-
elif action == "type":
|
|
899
|
-
text = params["text"]
|
|
900
|
-
requests.post(f"{base_url}/type", json={"text": text}, headers=headers).raise_for_status()
|
|
901
|
-
return f'Typed "{text}"'
|
|
902
|
-
|
|
903
|
-
elif action == "key":
|
|
904
|
-
key = params["text"]
|
|
905
|
-
if key.lower() == "return":
|
|
906
|
-
key = "Enter"
|
|
907
|
-
requests.post(f"{base_url}/key", json={"key": key}, headers=headers).raise_for_status()
|
|
908
|
-
return f"Pressed {key}"
|
|
909
|
-
|
|
910
|
-
# =================================================================
|
|
911
|
-
# SCROLL - POST /scroll with direction and amount
|
|
912
|
-
# =================================================================
|
|
913
|
-
elif action == "scroll":
|
|
914
|
-
direction = params.get("scroll_direction", "down")
|
|
915
|
-
amount = params.get("scroll_amount", 3)
|
|
916
|
-
requests.post(f"{base_url}/scroll", json={
|
|
917
|
-
"direction": direction, "amount": amount
|
|
918
|
-
}, headers=headers).raise_for_status()
|
|
919
|
-
return f"Scrolled {direction}"
|
|
920
|
-
|
|
921
|
-
# =================================================================
|
|
922
|
-
# MOUSE MOVE - POST /move with x, y
|
|
923
|
-
# =================================================================
|
|
924
|
-
elif action == "mouse_move":
|
|
925
|
-
x, y = params["coordinate"]
|
|
926
|
-
requests.post(f"{base_url}/move", json={"x": x, "y": y}, headers=headers).raise_for_status()
|
|
927
|
-
return f"Moved to ({x}, {y})"
|
|
928
|
-
|
|
929
|
-
# =================================================================
|
|
930
|
-
# DRAG - POST /drag with start_x, start_y, end_x, end_y, button, duration
|
|
931
|
-
# =================================================================
|
|
932
|
-
elif action in ("left_click_drag", "drag"):
|
|
933
|
-
start = params.get("start_coordinate", [0, 0])
|
|
934
|
-
end = params.get("coordinate", params.get("end_coordinate", [0, 0]))
|
|
935
|
-
requests.post(f"{base_url}/drag", json={
|
|
936
|
-
"start_x": int(start[0]), "start_y": int(start[1]),
|
|
937
|
-
"end_x": int(end[0]), "end_y": int(end[1]),
|
|
938
|
-
"button": "left", "duration": 0.5
|
|
939
|
-
}, headers=headers).raise_for_status()
|
|
940
|
-
return f"Dragged from {start} to {end}"
|
|
941
|
-
|
|
942
|
-
# =================================================================
|
|
943
|
-
# WAIT - handled locally
|
|
944
|
-
# =================================================================
|
|
945
|
-
elif action == "wait":
|
|
946
|
-
duration = params.get("duration", 1)
|
|
947
|
-
time.sleep(duration)
|
|
948
|
-
return f"Waited {duration}s"
|
|
949
|
-
|
|
950
|
-
# =================================================================
|
|
951
|
-
# UNKNOWN ACTION
|
|
952
|
-
# =================================================================
|
|
953
|
-
else:
|
|
954
|
-
return f"Unknown action: {action}"
|
|
955
|
-
|
|
956
|
-
except requests.exceptions.RequestException as e:
|
|
957
|
-
if action == "screenshot":
|
|
958
|
-
# Re-raise as ScreenshotError for retry logic
|
|
959
|
-
raise ScreenshotError(f"Screenshot request failed: {e}") from e
|
|
960
|
-
else:
|
|
961
|
-
logger.error(f"API request failed for {action}: {e}")
|
|
962
|
-
return f"Action {action} completed"
|
|
963
|
-
except Exception as e:
|
|
964
|
-
logger.error(f"Error executing {action}: {e}")
|
|
965
|
-
if action == "screenshot":
|
|
966
|
-
raise ScreenshotError(f"Screenshot processing failed: {e}") from e
|
|
967
|
-
return f"Action {action} completed"
|
|
968
|
-
|
|
969
|
-
def _prune_screenshots(self, messages: List[Dict], keep: int):
|
|
970
|
-
"""Replace old screenshots with placeholders."""
|
|
971
|
-
images = []
|
|
972
|
-
for msg in messages:
|
|
973
|
-
if msg.get("role") != "user":
|
|
974
|
-
continue
|
|
975
|
-
content = msg.get("content", [])
|
|
976
|
-
if not isinstance(content, list):
|
|
977
|
-
continue
|
|
978
|
-
for block in content:
|
|
979
|
-
if not isinstance(block, dict) or block.get("type") != "tool_result":
|
|
980
|
-
continue
|
|
981
|
-
for item in block.get("content", []):
|
|
982
|
-
if isinstance(item, dict) and item.get("type") == "image":
|
|
983
|
-
images.append(item)
|
|
984
|
-
|
|
985
|
-
# Replace older screenshots with 1x1 transparent PNG
|
|
986
|
-
for img in images[:-keep]:
|
|
987
|
-
if "source" in img:
|
|
988
|
-
img["source"]["data"] = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChwGA60e6kgAAAABJRU5ErkJggg=="
|
|
989
|
-
|
|
990
|
-
|
|
991
|
-
# =============================================================================
|
|
992
|
-
# Provider Registry
|
|
993
|
-
# =============================================================================
|
|
994
|
-
|
|
995
|
-
# Name used when the caller does not ask for a specific provider.
DEFAULT_PROVIDER = "orgo"

# Provider name -> provider class, in the order the names are listed to users.
PROVIDERS = dict(
    orgo=OrgoProvider,
    anthropic=AnthropicProvider,
)
|
|
1001
|
-
|
|
1002
|
-
|
|
1003
|
-
def get_provider(name: Optional[str] = None, **kwargs) -> PromptProvider:
|
|
1004
|
-
"""
|
|
1005
|
-
Get a prompt provider.
|
|
1006
|
-
|
|
1007
|
-
Args:
|
|
1008
|
-
name: "orgo" (default) or "anthropic"
|
|
1009
|
-
"""
|
|
1010
|
-
provider_name = name or DEFAULT_PROVIDER
|
|
1011
|
-
|
|
1012
|
-
if provider_name not in PROVIDERS:
|
|
1013
|
-
available = ", ".join(PROVIDERS.keys())
|
|
1014
|
-
raise ValueError(f"Unknown provider: {provider_name}. Available: {available}")
|
|
1015
|
-
|
|
1
|
+
# src/orgo/prompt.py
|
|
2
|
+
"""
|
|
3
|
+
Orgo Prompt Module - AI-powered computer control.
|
|
4
|
+
|
|
5
|
+
Usage:
|
|
6
|
+
computer.prompt("Open Firefox") # Uses Orgo (default)
|
|
7
|
+
computer.prompt("Open Firefox", provider="anthropic") # Uses Anthropic directly
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import os
|
|
11
|
+
import sys
|
|
12
|
+
import json
|
|
13
|
+
import base64
|
|
14
|
+
import time
|
|
15
|
+
import logging
|
|
16
|
+
from datetime import datetime
|
|
17
|
+
from typing import Any, Callable, Dict, List, Optional, Protocol
|
|
18
|
+
|
|
19
|
+
import anthropic
|
|
20
|
+
import websocket
|
|
21
|
+
import requests
|
|
22
|
+
|
|
23
|
+
logger = logging.getLogger(__name__)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# =============================================================================
|
|
27
|
+
# Console Output
|
|
28
|
+
# =============================================================================
|
|
29
|
+
|
|
30
|
+
class Colors:
    """ANSI escape sequences for styling terminal text."""

    # Text attributes
    RESET = "\x1b[0m"
    BOLD = "\x1b[1m"
    DIM = "\x1b[2m"

    # Foreground colours
    CYAN = "\x1b[36m"
    GREEN = "\x1b[32m"
    YELLOW = "\x1b[33m"
    RED = "\x1b[31m"
    MAGENTA = "\x1b[35m"
    BLUE = "\x1b[34m"
    WHITE = "\x1b[37m"
    GRAY = "\x1b[90m"
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def supports_color() -> bool:
    """Decide whether coloured output should be emitted.

    A non-empty ``NO_COLOR`` environment variable disables colour; otherwise a
    non-empty ``FORCE_COLOR`` enables it. With neither set, colour is used
    only when ``sys.stdout`` reports that it is a TTY.
    """
    # Environment overrides, checked in priority order.
    for variable, verdict in (("NO_COLOR", False), ("FORCE_COLOR", True)):
        if os.environ.get(variable):
            return verdict

    # Some stdout replacements have no isatty(); treat those as "no colour".
    isatty = getattr(sys.stdout, "isatty", None)
    if isatty is None:
        return False
    return isatty()
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
class Console:
    """Formatted progress output for the Orgo SDK.

    Every method except ``error`` prints nothing when ``verbose`` is false.
    Colour codes are applied only when ``supports_color()`` returned true at
    construction time.
    """

    def __init__(self, verbose: bool = True):
        self.verbose = verbose
        self.use_color = supports_color()
        # Set by banner(); used by success() to report the elapsed time.
        self.start_time = None

    def _c(self, color: str, text: str) -> str:
        """Wrap ``text`` in ``color`` and a reset code if colour is enabled."""
        if self.use_color:
            return f"{color}{text}{Colors.RESET}"
        return text

    def _timestamp(self) -> str:
        """Return the current wall-clock time as a dimmed ``HH:MM:SS`` string."""
        return self._c(Colors.DIM, datetime.now().strftime("%H:%M:%S"))

    def banner(self, computer_id: str):
        """Print the Orgo logo and the session link, and start the timer."""
        if not self.verbose:
            return

        self.start_time = time.time()

        # The rows are kept outside the f-string: a backslash inside an
        # f-string replacement field is a SyntaxError before Python 3.12.
        logo_rows = (
            '___ _ __ __ _ ___',
            "/ _ \\| '__/ _` |/ _ \\",
            '| (_) | | | (_| | (_) |',
            "\\___/|_| \\__, |\\___/",
            '|___/',
        )
        logo = "\n" + "".join(f"{self._c(Colors.CYAN, row)}\n" for row in logo_rows)
        print(logo)

        label = self._c(Colors.DIM, 'Watch:')
        link = self._c(Colors.CYAN, f'https://orgo.ai/workspaces/{computer_id}')
        print(f" {label} {link}")
        print()

    def status(self, message: str):
        """Print status update."""
        if not self.verbose:
            return
        print(f" {self._timestamp()} {self._c(Colors.CYAN, '●')} {message}")

    def action(self, action: str, details: str = ""):
        """Print action being taken."""
        if not self.verbose:
            return
        timestamp = self._timestamp()
        action_str = self._c(Colors.YELLOW, action)
        details_str = self._c(Colors.DIM, details) if details else ""
        print(f" {timestamp} {self._c(Colors.YELLOW, '▸')} {action_str} {details_str}")

    def thinking(self, preview: str = ""):
        """Print thinking indicator, followed by up to 60 characters of preview."""
        if not self.verbose:
            return
        timestamp = self._timestamp()
        preview_str = self._c(Colors.DIM, f" {preview[:60]}...") if preview else ""
        print(f" {timestamp} {self._c(Colors.MAGENTA, '◐')} {self._c(Colors.MAGENTA, 'Thinking')}{preview_str}")

    def text(self, content: str):
        """Print assistant text response, truncated to 100 characters."""
        if not self.verbose:
            return
        timestamp = self._timestamp()
        if len(content) > 100:
            content = content[:100] + "..."
        print(f" {timestamp} {self._c(Colors.GREEN, '◀')} {content}")

    def error(self, message: str):
        """Print error message (shown even when ``verbose`` is false)."""
        print(f" {self._timestamp()} {self._c(Colors.RED, '✗')} {self._c(Colors.RED, message)}")

    def retry(self, attempt: int, max_attempts: int, delay: float):
        """Print retry message."""
        if not self.verbose:
            return
        print(f" {self._timestamp()} {self._c(Colors.YELLOW, '↻')} Retry {attempt}/{max_attempts} in {delay:.1f}s")

    def success(self, iterations: int = 0):
        """Print success message with the iteration count and elapsed time."""
        if not self.verbose:
            return

        elapsed = ""
        if self.start_time:
            seconds = time.time() - self.start_time
            elapsed = f" in {seconds:.1f}s"

        iter_str = f" ({iterations} iterations)" if iterations else ""
        print()
        print(f" {self._c(Colors.GREEN, '✓')} {self._c(Colors.GREEN, 'Done')}{iter_str}{self._c(Colors.DIM, elapsed)}")
        print()
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
# =============================================================================
|
|
149
|
+
# Exceptions
|
|
150
|
+
# =============================================================================
|
|
151
|
+
|
|
152
|
+
class ScreenshotError(Exception):
    """Signals that a screenshot could not be captured."""
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
class TransientVisionError(Exception):
    """Signals a temporary failure of Claude's vision API."""
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
# =============================================================================
|
|
163
|
+
# System Prompt
|
|
164
|
+
# =============================================================================
|
|
165
|
+
|
|
166
|
+
def get_system_prompt(
    display_width: int = 1024,
    display_height: int = 768,
    custom_prompt: Optional[str] = None
) -> str:
    """Build the system prompt for Claude computer use.

    The prompt describes the available actions, click and window-dragging
    rules, wait times, workflow patterns, key names and the coordinate system.
    The example coordinates in it are derived from the display size.

    Args:
        display_width: Screen width in pixels.
        display_height: Screen height in pixels.
        custom_prompt: Optional caller instructions. When truthy they are
            placed in a ``<USER_INSTRUCTIONS>`` section ahead of the base
            prompt.

    Returns:
        The complete system prompt text.
    """

    # Reference coordinates interpolated into the prompt text below.
    mid_x = display_width // 2
    mid_y = display_height // 2
    max_x = display_width - 1
    max_y = display_height - 1

    # The literal below is sent to the model verbatim.
    base_prompt = f"""You control a Linux desktop ({display_width}x{display_height}). Be efficient - complete tasks in minimal steps.

<ACTIONS>
screenshot - See current screen state
left_click - Single click. Params: coordinate [x, y]
double_click - Double click. Params: coordinate [x, y]
right_click - Right click. Params: coordinate [x, y]
type - Type text. Params: text "string"
key - Press key. Params: text "Enter", "Tab", "ctrl+c", etc.
scroll - Scroll. Params: scroll_direction "up"|"down", scroll_amount 3
wait - Pause. Params: duration (seconds, e.g. 5)
mouse_move - Move cursor. Params: coordinate [x, y]
left_click_drag - Drag operation. Params: start_coordinate [x, y], coordinate [x, y]
</ACTIONS>

<CLICK_RULES>
DOUBLE_CLICK for:
- Desktop icons (to open apps)
- Files/folders in file manager

LEFT_CLICK for everything else:
- Buttons, links, menus
- Taskbar icons
- Input fields (to focus before typing)
- Window controls (close/minimize)

COMMON MISTAKES:
- left_click on desktop icon = only selects, doesn't open (use double_click)
- double_click on button = wrong (use left_click)
</CLICK_RULES>

<WINDOW_DRAGGING_CRITICAL>
WHEN DRAGGING WINDOWS - GRAB THE TITLE BAR CORRECTLY:

CORRECT - grab the EMPTY SPACE in the title bar:
✓ Center-top of window (middle of title bar, away from buttons/tabs)
✓ For browser: grab between tabs and buttons (empty title bar area)
✓ For app with tabs: grab the title bar ABOVE tabs
✓ Safe zone: horizontal center, ~20-30px from top edge

WRONG - avoid these areas:
✗ Close/minimize/maximize buttons (top-right corner)
✗ Browser tabs (will switch tabs instead of moving window)
✗ Window icon or menu (top-left corner)
✗ Any buttons or controls in title bar

VISUAL GUIDE - where to grab:
[X] [Icon] [___GRAB_HERE___] [- □ X]
↑ empty title bar area

For browser window:
[Tab1] [Tab2] [___GRAB_HERE___] [+ - □ X]
↑ empty space between tabs and controls

COORDINATES FOR DRAGGING:
Start coordinate = [{mid_x}, 20] (center-top, in title bar)
NOT [window_right - 20, 20] (too close to close button)
NOT [40, 20] (too close to icon/menu)
</WINDOW_DRAGGING_CRITICAL>

<WINDOW_SNAPPING>
Drag window title bar to these exact coordinates to snap:

HALF SCREEN:
- Left half: drag to [1, {mid_y}]
- Right half: drag to [{max_x}, {mid_y}]

QUARTER SCREEN:
- Top-left: drag to [1, 1]
- Top-right: drag to [{max_x}, 1]
- Bottom-left: drag to [1, {max_y}]
- Bottom-right: drag to [{max_x}, {max_y}]

MAXIMIZE:
- Full screen: drag to [{mid_x}, 1]

COMPLETE EXAMPLE - snap Chrome to left half:
1. Identify window center-top coordinate: [{mid_x}, 20]
2. Execute: left_click_drag start_coordinate [{mid_x}, 20], coordinate [1, {mid_y}]
3. Window snaps to left half of screen

SPLIT SCREEN WORKFLOW:
1. Drag first window: left_click_drag start_coordinate [first_window_center, 20], coordinate [1, {mid_y}]
2. Wait 1 second
3. Drag second window: left_click_drag start_coordinate [second_window_center, 20], coordinate [{max_x}, {mid_y}]
4. Both windows now side-by-side

CRITICAL: Always use the CENTER of the title bar as start_coordinate, never the edges!
</WINDOW_SNAPPING>

<WAIT_TIMES>
After opening app from DESKTOP icon: wait 10 seconds
After opening app from TASKBAR: wait 5 seconds
After loading web page: wait 3 seconds
After clicking button: wait 1 second
After dragging window: wait 1 second
After typing: no wait needed
</WAIT_TIMES>

<WORKFLOW>
1. Screenshot once at start to see current state
2. Execute actions - no screenshot between quick actions
3. Screenshot after waits to verify result
4. Don't screenshot redundantly

PATTERNS:

Open app from desktop:
screenshot → double_click icon → wait 10 → screenshot

Open app from taskbar:
screenshot → left_click taskbar → wait 5 → screenshot

Web search:
left_click search bar → type "query" → key "Enter" → wait 3 → screenshot

Snap window to left:
screenshot → left_click_drag start_coordinate [{mid_x}, 20], coordinate [1, {mid_y}] → wait 1 → screenshot
</WORKFLOW>

<KEY_NAMES>
Enter (not Return), Tab, Escape, Backspace, Delete
Combos: ctrl+c, ctrl+v, ctrl+s, alt+Tab, alt+F4, super+Left
</KEY_NAMES>

<COORDINATES>
Origin (0,0) = top-left
X increases rightward, Y increases downward
Always click CENTER of elements
Screen: {display_width}x{display_height}
Valid: x from 1 to {max_x}, y from 1 to {max_y}

TITLE BAR SAFETY:
- Horizontal: use center ({mid_x}) or ±200px from center
- Vertical: ~20px from top (in title bar, not too close to edge)
- NEVER use far right (close to X button)
- NEVER use far left (close to icon/menu)
</COORDINATES>

<EFFICIENCY>
- One screenshot to start, then only after waits
- Batch actions without screenshots between
- Don't re-verify actions that succeeded
- After 2 failed attempts, try alternative approach
- When dragging windows, always grab the safe center-top area
</EFFICIENCY>"""

    # Caller-supplied instructions go first, ahead of the built-in guidance.
    if custom_prompt:
        return f"""<USER_INSTRUCTIONS>
{custom_prompt}
</USER_INSTRUCTIONS>

{base_prompt}"""

    return base_prompt
|
|
333
|
+
|
|
334
|
+
|
|
335
|
+
# =============================================================================
|
|
336
|
+
# Provider Protocol
|
|
337
|
+
# =============================================================================
|
|
338
|
+
|
|
339
|
+
class PromptProvider(Protocol):
    """Structural interface that every prompt execution provider satisfies."""

    def execute(
        self,
        computer_id: str,
        instruction: str,
        callback: Optional[Callable[[str, Any], None]] = None,
        verbose: bool = True,
        **kwargs
    ) -> List[Dict[str, Any]]:
        """Run ``instruction`` on the computer ``computer_id``.

        ``callback``, when given, takes a string and an arbitrary value.
        Provider-specific options are passed through ``kwargs``.
        """
|
|
351
|
+
|
|
352
|
+
|
|
353
|
+
# =============================================================================
|
|
354
|
+
# Orgo Provider (Default)
|
|
355
|
+
# =============================================================================
|
|
356
|
+
|
|
357
|
+
class OrgoProvider:
    """
    Execute prompts via Orgo's hosted agent.

    Benefits:
    - No Anthropic API key needed
    - Optimized infrastructure
    - Real-time streaming
    - Watch live at orgo.ai/workspaces/{computer_id}
    """

    def __init__(self, agent_url: str = "wss://agent.orgo.ai"):
        # Trailing slashes are dropped so the endpoint path can be appended.
        self.agent_url = agent_url.rstrip("/")

    @staticmethod
    def _describe_tool_use(event_data: Any) -> tuple:
        """Return ``(action, details)`` for a ``tool_use`` event, for console display."""
        if not isinstance(event_data, dict):
            return str(event_data), ""

        action = event_data.get("action", "unknown")
        params = event_data.get("params", {})

        if action in ["left_click", "right_click", "double_click"]:
            coord = params.get("coordinate", [0, 0])
            return action, f"({coord[0]}, {coord[1]})"
        if action == "type":
            text = params.get("text", "")[:30]
            return action, f'"{text}"'
        if action == "key":
            return action, params.get("text", "")
        if action == "scroll":
            return action, params.get("scroll_direction", "")
        if action == "wait":
            return action, f"{params.get('duration', 1)}s"
        # "screenshot" and any other action are shown without details.
        return action, ""

    def execute(
        self,
        computer_id: str,
        instruction: str,
        callback: Optional[Callable[[str, Any], None]] = None,
        verbose: bool = True,
        orgo_api_key: Optional[str] = None,
        system_prompt: Optional[str] = None,
        **kwargs
    ) -> List[Dict[str, Any]]:
        """Execute prompt via Orgo's hosted agent.

        Opens a WebSocket to ``{agent_url}/ws/prompt``, sends a ``start``
        message carrying the run configuration, prints each streamed event to
        the console and blocks until the socket closes.

        Args:
            computer_id: Computer the agent should control.
            instruction: Natural-language task for the agent.
            callback: Optional function called as ``callback(event_type,
                event_data)`` for every successfully parsed event.
            verbose: Whether to print progress to the console.
            orgo_api_key: Orgo API key; falls back to the ``ORGO_API_KEY``
                environment variable.
            system_prompt: Optional system prompt forwarded in the config.
            **kwargs: Optional overrides for ``model``, ``display_width``,
                ``display_height``, ``thinking_enabled``, ``thinking_budget``,
                ``max_tokens`` and ``max_iterations``.

        Returns:
            The ``messages`` list from the agent's ``result`` event, or an
            empty list if the socket closed without one.

        Raises:
            ValueError: If no Orgo API key is available.
            RuntimeError: If the agent or the socket reported an error, or the
                result was not marked successful.
        """

        token = orgo_api_key or os.environ.get("ORGO_API_KEY")
        if not token:
            raise ValueError(
                "ORGO_API_KEY required.\n"
                "Set it with: export ORGO_API_KEY=your_key\n"
                "Get your key at: https://orgo.ai/settings/api"
            )

        console = Console(verbose=verbose)
        console.banner(computer_id)
        console.status(f"Prompt: \"{instruction[:60]}{'...' if len(instruction) > 60 else ''}\"")

        ws_url = f"{self.agent_url}/ws/prompt?token={token}"

        config = {
            "computer_id": computer_id,
            "instruction": instruction,
            "model": kwargs.get("model", "claude-sonnet-4-5-20250929"),
            "display_width": kwargs.get("display_width", 1024),
            "display_height": kwargs.get("display_height", 768),
            "thinking_enabled": kwargs.get("thinking_enabled", True),
            "thinking_budget": kwargs.get("thinking_budget", 1024),
            "max_tokens": kwargs.get("max_tokens", 4096),
            "max_iterations": kwargs.get("max_iterations", 100),
        }

        if system_prompt:
            config["system_prompt"] = system_prompt

        # Shared with the socket callbacks below, which fill it in as events arrive.
        result = {"messages": [], "error": None, "iterations": 0}

        def on_message(ws, message):
            try:
                data = json.loads(message)
            except json.JSONDecodeError as e:
                logger.error(f"Parse error: {e}")
                return
            if not isinstance(data, dict):
                # Valid JSON but not an event object; there is nothing to dispatch.
                logger.error(f"Parse error: expected a JSON object, got {type(data).__name__}")
                return

            event_type = data.get("type")
            event_data = data.get("data")

            if event_type == "result":
                payload = event_data if isinstance(event_data, dict) else {}
                result["messages"] = payload.get("messages", [])
                result["iterations"] = payload.get("iterations", 0)
                if not payload.get("success"):
                    # A failure without an error text must still count as a failure.
                    result["error"] = payload.get("error") or "Prompt execution failed"
                ws.close()

            elif event_type == "error":
                # An empty error payload must not be mistaken for success.
                failure = event_data or "Unknown agent error"
                console.error(str(failure))
                result["error"] = failure
                ws.close()

            elif event_type == "status":
                console.status(str(event_data))

            elif event_type == "thinking":
                preview = str(event_data)[:60] if event_data else ""
                console.thinking(preview)

            elif event_type == "text":
                console.text(str(event_data))

            elif event_type == "tool_use":
                action, details = self._describe_tool_use(event_data)
                console.action(action, details)

            elif event_type == "iteration":
                result["iterations"] = event_data

            # "pong" replies and unrecognised event types need no console output.

            if callback:
                callback(event_type, event_data)

        def on_error(ws, error):
            # Some exceptions stringify to ""; fall back to repr so the error is kept.
            description = str(error) or repr(error)
            console.error(description)
            result["error"] = description

        def on_open(ws):
            ws.send(json.dumps({"type": "start", "config": config}))

        def on_close(ws, close_status_code, close_msg):
            if not result["error"]:
                console.success(result["iterations"])

        ws = websocket.WebSocketApp(
            ws_url,
            on_message=on_message,
            on_error=on_error,
            on_open=on_open,
            on_close=on_close,
        )

        ws.run_forever()

        if result["error"]:
            raise RuntimeError(result["error"])

        return result["messages"]
|
|
500
|
+
|
|
501
|
+
|
|
502
|
+
# =============================================================================
|
|
503
|
+
# Anthropic Provider (Direct API)
|
|
504
|
+
# =============================================================================
|
|
505
|
+
|
|
506
|
+
class AnthropicProvider:
|
|
507
|
+
"""
|
|
508
|
+
Execute prompts directly with Anthropic API.
|
|
509
|
+
|
|
510
|
+
Requires ANTHROPIC_API_KEY environment variable.
|
|
511
|
+
"""
|
|
512
|
+
|
|
513
|
+
def execute(
    self,
    computer_id: str,
    instruction: str,
    callback: Optional[Callable[[str, Any], None]] = None,
    verbose: bool = True,
    api_key: Optional[str] = None,
    orgo_api_key: Optional[str] = None,
    orgo_base_url: Optional[str] = None,
    system_prompt: Optional[str] = None,
    **kwargs
) -> List[Dict[str, Any]]:
    """Run an instruction by looping between the Anthropic API and the Orgo computer.

    Each iteration sends the conversation to Claude, reports the returned
    blocks to the console/callback, executes every ``tool_use`` block
    against the computer, and appends the tool results as the next user
    message. The loop ends when a response contains no ``tool_use`` block
    or when ``max_iterations`` requests have been made.

    Args:
        computer_id: Identifier of the Orgo computer to control.
        instruction: Natural-language task; sent as the first user message.
        callback: Optional ``callback(event_type, data)`` invoked for
            "text", "thinking" and "tool_use" events.
        verbose: Passed to ``Console``; also gates the per-iteration status line.
        api_key: Anthropic key; falls back to ``ANTHROPIC_API_KEY``.
        orgo_api_key: Orgo key; falls back to ``ORGO_API_KEY``.
        orgo_base_url: Orgo base URL; defaults to ``https://orgo.ai``.
        system_prompt: Extra prompt text handed to ``get_system_prompt``.
        **kwargs: Optional overrides: ``model``, ``display_width``,
            ``display_height``, ``max_iterations``, ``max_tokens``,
            ``thinking_enabled``, ``thinking_budget``,
            ``max_saved_screenshots``, ``screenshot_retry_attempts``,
            ``screenshot_retry_delay``.

    Returns:
        The full message list exchanged with the model.

    Raises:
        ValueError: If either API key is missing.
    """
    # ---- credentials ---------------------------------------------------
    claude_key = api_key if api_key else os.environ.get("ANTHROPIC_API_KEY")
    if not claude_key:
        raise ValueError(
            "ANTHROPIC_API_KEY required for provider='anthropic'.\n"
            "Set it with: export ANTHROPIC_API_KEY=your_key\n"
            "Get your key at: https://console.anthropic.com/"
        )

    orgo_key = orgo_api_key if orgo_api_key else os.environ.get("ORGO_API_KEY")
    if not orgo_key:
        raise ValueError(
            "ORGO_API_KEY required.\n"
            "Set it with: export ORGO_API_KEY=your_key"
        )

    # Bare host only; the "/api/..." path is appended per endpoint.
    orgo_url = (orgo_base_url if orgo_base_url else "https://orgo.ai").rstrip("/")

    # ---- console preamble ----------------------------------------------
    console = Console(verbose=verbose)
    console.banner(computer_id)
    console.status("Provider: Anthropic")
    ellipsis = "..." if len(instruction) > 60 else ""
    console.status(f"Prompt: \"{instruction[:60]}{ellipsis}\"")

    # ---- tunables (each overridable through **kwargs) --------------------
    option_defaults = {
        "model": "claude-sonnet-4-5-20250929",
        "display_width": 1024,
        "display_height": 768,
        "max_iterations": 100,
        "max_tokens": 4096,
        "thinking_enabled": True,
        "thinking_budget": 1024,
        "max_saved_screenshots": 3,
        "screenshot_retry_attempts": 3,
        "screenshot_retry_delay": 2.0,
    }
    opts = {key: kwargs.get(key, fallback) for key, fallback in option_defaults.items()}

    full_system_prompt = get_system_prompt(
        opts["display_width"], opts["display_height"], system_prompt
    )

    client = anthropic.Anthropic(api_key=claude_key)
    messages = [{"role": "user", "content": instruction}]
    tools = [{
        "type": "computer_20250124",
        "name": "computer",
        "display_width_px": opts["display_width"],
        "display_height_px": opts["display_height"],
        "display_number": 1
    }]

    def console_args(tool_input):
        """Return ``(action, args)`` where ``args`` are the positionals for ``console.action``."""
        name = tool_input.get("action", "unknown")
        if name == "screenshot":
            return name, ("screenshot",)
        if name in ["left_click", "right_click", "double_click"]:
            point = tool_input.get("coordinate", [0, 0])
            return name, (name, f"({point[0]}, {point[1]})")
        if name == "type":
            snippet = tool_input.get("text", "")[:30]
            return name, ("type", f'"{snippet}"')
        if name == "key":
            return name, ("key", tool_input.get("text", ""))
        if name == "scroll":
            return name, ("scroll", tool_input.get("scroll_direction", ""))
        if name == "wait":
            return name, ("wait", f"{tool_input.get('duration', 1)}s")
        return name, (name,)

    iteration = 0
    screenshots_in_history = 0

    while iteration < opts["max_iterations"]:
        iteration += 1

        if verbose:
            console.status(f"Iteration {iteration}")

        # Keep the request payload bounded by blanking out older screenshots.
        if screenshots_in_history > opts["max_saved_screenshots"]:
            self._prune_screenshots(messages, opts["max_saved_screenshots"])
            screenshots_in_history = opts["max_saved_screenshots"]

        request_params = {
            "model": opts["model"],
            "max_tokens": opts["max_tokens"],
            "system": full_system_prompt,
            "messages": messages,
            "tools": tools,
            "betas": ["computer-use-2025-01-24"],
        }
        if opts["thinking_enabled"]:
            request_params["thinking"] = {
                "type": "enabled",
                "budget_tokens": opts["thinking_budget"]
            }

        response = self._call_claude_with_retry(
            client=client,
            request_params=request_params,
            messages=messages,
            console=console,
            max_retries=opts["screenshot_retry_attempts"],
            retry_delay=opts["screenshot_retry_delay"]
        )

        reply_blocks = response.content
        messages.append({"role": "assistant", "content": reply_blocks})

        # First pass: report every block before any tool is executed.
        for block in reply_blocks:
            kind = block.type
            if kind == "text":
                console.text(block.text)
                if callback:
                    callback("text", block.text)
            elif kind == "thinking":
                console.thinking(block.thinking[:60] if block.thinking else "")
                if callback:
                    callback("thinking", block.thinking)
            elif kind == "tool_use":
                action, shown = console_args(block.input)
                console.action(*shown)
                if callback:
                    callback("tool_use", {"action": action, "params": block.input})

        # Second pass: run the requested tools and collect their results.
        tool_results = []
        for block in reply_blocks:
            if block.type != "tool_use":
                continue

            outcome = self._execute_tool_with_retry(
                computer_id=computer_id,
                params=block.input,
                orgo_key=orgo_key,
                orgo_url=orgo_url,
                console=console,
                callback=callback,
                max_retries=opts["screenshot_retry_attempts"],
                retry_delay=opts["screenshot_retry_delay"]
            )

            entry = {"type": "tool_result", "tool_use_id": block.id}
            is_image = isinstance(outcome, dict) and outcome.get("type") == "image"
            if is_image:
                entry["content"] = [outcome]
                if block.input.get("action") == "screenshot":
                    screenshots_in_history += 1
            else:
                entry["content"] = [{"type": "text", "text": str(outcome)}]
            tool_results.append(entry)

        # No tool call means the model considers the task finished.
        if not tool_results:
            console.success(iteration)
            return messages

        messages.append({"role": "user", "content": tool_results})

    # Iteration budget exhausted.
    console.success(iteration)
    return messages
|
|
687
|
+
|
|
688
|
+
def _call_claude_with_retry(
|
|
689
|
+
self,
|
|
690
|
+
client: anthropic.Anthropic,
|
|
691
|
+
request_params: Dict[str, Any],
|
|
692
|
+
messages: List[Dict[str, Any]],
|
|
693
|
+
console: Console,
|
|
694
|
+
max_retries: int = 3,
|
|
695
|
+
retry_delay: float = 2.0
|
|
696
|
+
) -> Any:
|
|
697
|
+
"""Call Claude API with exponential backoff retry logic."""
|
|
698
|
+
|
|
699
|
+
last_error = None
|
|
700
|
+
|
|
701
|
+
for attempt in range(max_retries):
|
|
702
|
+
try:
|
|
703
|
+
return client.beta.messages.create(**request_params)
|
|
704
|
+
|
|
705
|
+
except anthropic.BadRequestError as e:
|
|
706
|
+
error_msg = str(e).lower()
|
|
707
|
+
|
|
708
|
+
# Check for vision/image processing errors
|
|
709
|
+
if "image" in error_msg or "vision" in error_msg or "could not process" in error_msg:
|
|
710
|
+
last_error = TransientVisionError(f"Vision API error: {e}")
|
|
711
|
+
|
|
712
|
+
if attempt < max_retries - 1:
|
|
713
|
+
delay = retry_delay * (2 ** attempt) # Exponential backoff: 2s, 4s, 8s
|
|
714
|
+
console.retry(attempt + 1, max_retries, delay)
|
|
715
|
+
time.sleep(delay)
|
|
716
|
+
|
|
717
|
+
# Prune screenshots to reduce payload size
|
|
718
|
+
self._prune_screenshots(messages, 1)
|
|
719
|
+
request_params["messages"] = messages
|
|
720
|
+
continue
|
|
721
|
+
else:
|
|
722
|
+
raise last_error
|
|
723
|
+
|
|
724
|
+
# Check for base64 errors (fallback from old code)
|
|
725
|
+
elif "base64" in error_msg:
|
|
726
|
+
if attempt < max_retries - 1:
|
|
727
|
+
delay = retry_delay * (2 ** attempt)
|
|
728
|
+
console.retry(attempt + 1, max_retries, delay)
|
|
729
|
+
time.sleep(delay)
|
|
730
|
+
|
|
731
|
+
self._prune_screenshots(messages, 1)
|
|
732
|
+
request_params["messages"] = messages
|
|
733
|
+
continue
|
|
734
|
+
else:
|
|
735
|
+
raise
|
|
736
|
+
else:
|
|
737
|
+
# Non-retryable error
|
|
738
|
+
raise
|
|
739
|
+
|
|
740
|
+
except (anthropic.APIConnectionError, anthropic.APITimeoutError) as e:
|
|
741
|
+
# Network errors - retry with backoff
|
|
742
|
+
last_error = e
|
|
743
|
+
|
|
744
|
+
if attempt < max_retries - 1:
|
|
745
|
+
delay = retry_delay * (2 ** attempt)
|
|
746
|
+
console.retry(attempt + 1, max_retries, delay)
|
|
747
|
+
time.sleep(delay)
|
|
748
|
+
continue
|
|
749
|
+
else:
|
|
750
|
+
raise
|
|
751
|
+
|
|
752
|
+
except Exception as e:
|
|
753
|
+
# Unexpected errors - don't retry
|
|
754
|
+
raise
|
|
755
|
+
|
|
756
|
+
# Should never reach here, but just in case
|
|
757
|
+
if last_error:
|
|
758
|
+
raise last_error
|
|
759
|
+
raise RuntimeError("Max retries exceeded")
|
|
760
|
+
|
|
761
|
+
def _execute_tool_with_retry(
|
|
762
|
+
self,
|
|
763
|
+
computer_id: str,
|
|
764
|
+
params: Dict,
|
|
765
|
+
orgo_key: str,
|
|
766
|
+
orgo_url: str,
|
|
767
|
+
console: Console,
|
|
768
|
+
callback: Optional[Callable],
|
|
769
|
+
max_retries: int = 3,
|
|
770
|
+
retry_delay: float = 2.0
|
|
771
|
+
) -> Any:
|
|
772
|
+
"""Execute tool with retry logic for screenshots."""
|
|
773
|
+
|
|
774
|
+
action = params.get("action")
|
|
775
|
+
|
|
776
|
+
# Only retry screenshots, execute other actions directly
|
|
777
|
+
if action != "screenshot":
|
|
778
|
+
return self._execute_tool(computer_id, params, orgo_key, orgo_url, callback)
|
|
779
|
+
|
|
780
|
+
last_error = None
|
|
781
|
+
|
|
782
|
+
for attempt in range(max_retries):
|
|
783
|
+
try:
|
|
784
|
+
return self._execute_tool(computer_id, params, orgo_key, orgo_url, callback)
|
|
785
|
+
|
|
786
|
+
except (ScreenshotError, requests.exceptions.RequestException) as e:
|
|
787
|
+
last_error = e
|
|
788
|
+
|
|
789
|
+
if attempt < max_retries - 1:
|
|
790
|
+
delay = retry_delay * (2 ** attempt) # Exponential backoff
|
|
791
|
+
console.retry(attempt + 1, max_retries, delay)
|
|
792
|
+
time.sleep(delay)
|
|
793
|
+
continue
|
|
794
|
+
else:
|
|
795
|
+
# Return placeholder after all retries exhausted
|
|
796
|
+
logger.error(f"Screenshot failed after {max_retries} attempts: {e}")
|
|
797
|
+
return "Screenshot captured (degraded quality)"
|
|
798
|
+
|
|
799
|
+
except Exception as e:
|
|
800
|
+
# Unexpected errors - don't retry
|
|
801
|
+
raise
|
|
802
|
+
|
|
803
|
+
# Fallback if all retries failed
|
|
804
|
+
if last_error:
|
|
805
|
+
logger.error(f"Screenshot failed: {last_error}")
|
|
806
|
+
return "Screenshot captured (degraded quality)"
|
|
807
|
+
|
|
808
|
+
return "Screenshot captured"
|
|
809
|
+
|
|
810
|
+
def _execute_tool(self, computer_id: str, params: Dict, orgo_key: str, orgo_url: str, callback: Optional[Callable]) -> Any:
|
|
811
|
+
"""Execute a tool action via Orgo API."""
|
|
812
|
+
|
|
813
|
+
action = params.get("action")
|
|
814
|
+
headers = {"Authorization": f"Bearer {orgo_key}", "Content-Type": "application/json"}
|
|
815
|
+
base_url = f"{orgo_url}/api/computers/{computer_id}"
|
|
816
|
+
|
|
817
|
+
try:
|
|
818
|
+
# =================================================================
|
|
819
|
+
# SCREENSHOT - GET request with validation
|
|
820
|
+
# =================================================================
|
|
821
|
+
if action == "screenshot":
|
|
822
|
+
r = requests.get(f"{base_url}/screenshot", headers=headers, timeout=30)
|
|
823
|
+
r.raise_for_status()
|
|
824
|
+
|
|
825
|
+
data = r.json()
|
|
826
|
+
image_url = data.get("image") or data.get("url") or data.get("screenshot")
|
|
827
|
+
|
|
828
|
+
if not image_url:
|
|
829
|
+
logger.error(f"Screenshot API returned no image URL: {data}")
|
|
830
|
+
raise ScreenshotError("No image URL in response")
|
|
831
|
+
|
|
832
|
+
# Fetch the actual image
|
|
833
|
+
img_r = requests.get(image_url, timeout=30)
|
|
834
|
+
img_r.raise_for_status()
|
|
835
|
+
|
|
836
|
+
# Validate image size
|
|
837
|
+
if len(img_r.content) < 100:
|
|
838
|
+
logger.error(f"Screenshot image too small: {len(img_r.content)} bytes")
|
|
839
|
+
raise ScreenshotError(f"Invalid image size: {len(img_r.content)} bytes")
|
|
840
|
+
|
|
841
|
+
# Validate it's actually an image
|
|
842
|
+
if not img_r.headers.get('content-type', '').startswith('image/'):
|
|
843
|
+
logger.error(f"Invalid content type: {img_r.headers.get('content-type')}")
|
|
844
|
+
raise ScreenshotError("Response is not an image")
|
|
845
|
+
|
|
846
|
+
image_b64 = base64.b64encode(img_r.content).decode()
|
|
847
|
+
|
|
848
|
+
return {
|
|
849
|
+
"type": "image",
|
|
850
|
+
"source": {"type": "base64", "media_type": "image/jpeg", "data": image_b64}
|
|
851
|
+
}
|
|
852
|
+
|
|
853
|
+
# =================================================================
|
|
854
|
+
# MOUSE CLICKS - POST /click with x, y, button, double
|
|
855
|
+
# =================================================================
|
|
856
|
+
elif action == "left_click":
|
|
857
|
+
x, y = params["coordinate"]
|
|
858
|
+
requests.post(f"{base_url}/click", json={
|
|
859
|
+
"x": x, "y": y, "button": "left", "double": False
|
|
860
|
+
}, headers=headers).raise_for_status()
|
|
861
|
+
return f"Clicked ({x}, {y})"
|
|
862
|
+
|
|
863
|
+
elif action == "right_click":
|
|
864
|
+
x, y = params["coordinate"]
|
|
865
|
+
requests.post(f"{base_url}/click", json={
|
|
866
|
+
"x": x, "y": y, "button": "right", "double": False
|
|
867
|
+
}, headers=headers).raise_for_status()
|
|
868
|
+
return f"Right-clicked ({x}, {y})"
|
|
869
|
+
|
|
870
|
+
elif action == "double_click":
|
|
871
|
+
x, y = params["coordinate"]
|
|
872
|
+
requests.post(f"{base_url}/click", json={
|
|
873
|
+
"x": x, "y": y, "button": "left", "double": True
|
|
874
|
+
}, headers=headers).raise_for_status()
|
|
875
|
+
return f"Double-clicked ({x}, {y})"
|
|
876
|
+
|
|
877
|
+
elif action == "middle_click":
|
|
878
|
+
x, y = params["coordinate"]
|
|
879
|
+
requests.post(f"{base_url}/click", json={
|
|
880
|
+
"x": x, "y": y, "button": "middle", "double": False
|
|
881
|
+
}, headers=headers).raise_for_status()
|
|
882
|
+
return f"Middle-clicked ({x}, {y})"
|
|
883
|
+
|
|
884
|
+
elif action == "triple_click":
|
|
885
|
+
x, y = params["coordinate"]
|
|
886
|
+
# Click then double-click
|
|
887
|
+
requests.post(f"{base_url}/click", json={
|
|
888
|
+
"x": x, "y": y, "button": "left", "double": False
|
|
889
|
+
}, headers=headers).raise_for_status()
|
|
890
|
+
requests.post(f"{base_url}/click", json={
|
|
891
|
+
"x": x, "y": y, "button": "left", "double": True
|
|
892
|
+
}, headers=headers).raise_for_status()
|
|
893
|
+
return f"Triple-clicked ({x}, {y})"
|
|
894
|
+
|
|
895
|
+
# =================================================================
|
|
896
|
+
# KEYBOARD - POST /type and /key
|
|
897
|
+
# =================================================================
|
|
898
|
+
elif action == "type":
|
|
899
|
+
text = params["text"]
|
|
900
|
+
requests.post(f"{base_url}/type", json={"text": text}, headers=headers).raise_for_status()
|
|
901
|
+
return f'Typed "{text}"'
|
|
902
|
+
|
|
903
|
+
elif action == "key":
|
|
904
|
+
key = params["text"]
|
|
905
|
+
if key.lower() == "return":
|
|
906
|
+
key = "Enter"
|
|
907
|
+
requests.post(f"{base_url}/key", json={"key": key}, headers=headers).raise_for_status()
|
|
908
|
+
return f"Pressed {key}"
|
|
909
|
+
|
|
910
|
+
# =================================================================
|
|
911
|
+
# SCROLL - POST /scroll with direction and amount
|
|
912
|
+
# =================================================================
|
|
913
|
+
elif action == "scroll":
|
|
914
|
+
direction = params.get("scroll_direction", "down")
|
|
915
|
+
amount = params.get("scroll_amount", 3)
|
|
916
|
+
requests.post(f"{base_url}/scroll", json={
|
|
917
|
+
"direction": direction, "amount": amount
|
|
918
|
+
}, headers=headers).raise_for_status()
|
|
919
|
+
return f"Scrolled {direction}"
|
|
920
|
+
|
|
921
|
+
# =================================================================
|
|
922
|
+
# MOUSE MOVE - POST /move with x, y
|
|
923
|
+
# =================================================================
|
|
924
|
+
elif action == "mouse_move":
|
|
925
|
+
x, y = params["coordinate"]
|
|
926
|
+
requests.post(f"{base_url}/move", json={"x": x, "y": y}, headers=headers).raise_for_status()
|
|
927
|
+
return f"Moved to ({x}, {y})"
|
|
928
|
+
|
|
929
|
+
# =================================================================
|
|
930
|
+
# DRAG - POST /drag with start_x, start_y, end_x, end_y, button, duration
|
|
931
|
+
# =================================================================
|
|
932
|
+
elif action in ("left_click_drag", "drag"):
|
|
933
|
+
start = params.get("start_coordinate", [0, 0])
|
|
934
|
+
end = params.get("coordinate", params.get("end_coordinate", [0, 0]))
|
|
935
|
+
requests.post(f"{base_url}/drag", json={
|
|
936
|
+
"start_x": int(start[0]), "start_y": int(start[1]),
|
|
937
|
+
"end_x": int(end[0]), "end_y": int(end[1]),
|
|
938
|
+
"button": "left", "duration": 0.5
|
|
939
|
+
}, headers=headers).raise_for_status()
|
|
940
|
+
return f"Dragged from {start} to {end}"
|
|
941
|
+
|
|
942
|
+
# =================================================================
|
|
943
|
+
# WAIT - handled locally
|
|
944
|
+
# =================================================================
|
|
945
|
+
elif action == "wait":
|
|
946
|
+
duration = params.get("duration", 1)
|
|
947
|
+
time.sleep(duration)
|
|
948
|
+
return f"Waited {duration}s"
|
|
949
|
+
|
|
950
|
+
# =================================================================
|
|
951
|
+
# UNKNOWN ACTION
|
|
952
|
+
# =================================================================
|
|
953
|
+
else:
|
|
954
|
+
return f"Unknown action: {action}"
|
|
955
|
+
|
|
956
|
+
except requests.exceptions.RequestException as e:
|
|
957
|
+
if action == "screenshot":
|
|
958
|
+
# Re-raise as ScreenshotError for retry logic
|
|
959
|
+
raise ScreenshotError(f"Screenshot request failed: {e}") from e
|
|
960
|
+
else:
|
|
961
|
+
logger.error(f"API request failed for {action}: {e}")
|
|
962
|
+
return f"Action {action} completed"
|
|
963
|
+
except Exception as e:
|
|
964
|
+
logger.error(f"Error executing {action}: {e}")
|
|
965
|
+
if action == "screenshot":
|
|
966
|
+
raise ScreenshotError(f"Screenshot processing failed: {e}") from e
|
|
967
|
+
return f"Action {action} completed"
|
|
968
|
+
|
|
969
|
+
def _prune_screenshots(self, messages: List[Dict], keep: int):
|
|
970
|
+
"""Replace old screenshots with placeholders."""
|
|
971
|
+
images = []
|
|
972
|
+
for msg in messages:
|
|
973
|
+
if msg.get("role") != "user":
|
|
974
|
+
continue
|
|
975
|
+
content = msg.get("content", [])
|
|
976
|
+
if not isinstance(content, list):
|
|
977
|
+
continue
|
|
978
|
+
for block in content:
|
|
979
|
+
if not isinstance(block, dict) or block.get("type") != "tool_result":
|
|
980
|
+
continue
|
|
981
|
+
for item in block.get("content", []):
|
|
982
|
+
if isinstance(item, dict) and item.get("type") == "image":
|
|
983
|
+
images.append(item)
|
|
984
|
+
|
|
985
|
+
# Replace older screenshots with 1x1 transparent PNG
|
|
986
|
+
for img in images[:-keep]:
|
|
987
|
+
if "source" in img:
|
|
988
|
+
img["source"]["data"] = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChwGA60e6kgAAAABJRU5ErkJggg=="
|
|
989
|
+
|
|
990
|
+
|
|
991
|
+
# =============================================================================
|
|
992
|
+
# Provider Registry
|
|
993
|
+
# =============================================================================
|
|
994
|
+
|
|
995
|
+
# Registry mapping the provider name accepted by get_provider() to the
# class that is instantiated for it.
PROVIDERS = {
    "orgo": OrgoProvider,
    "anthropic": AnthropicProvider,
}

# Name looked up in PROVIDERS when get_provider() is called without a name.
DEFAULT_PROVIDER = "orgo"
|
|
1001
|
+
|
|
1002
|
+
|
|
1003
|
+
def get_provider(name: Optional[str] = None, **kwargs) -> PromptProvider:
    """
    Instantiate the prompt provider registered under ``name``.

    Args:
        name: Key in ``PROVIDERS`` ("orgo" or "anthropic"). A missing or
            empty name selects ``DEFAULT_PROVIDER``.
        **kwargs: Forwarded to the provider class constructor.

    Returns:
        A new instance of the selected provider class.

    Raises:
        ValueError: If ``name`` is not a registered provider.
    """
    selected = name if name else DEFAULT_PROVIDER

    provider_cls = PROVIDERS.get(selected)
    if provider_cls is None:
        available = ", ".join(PROVIDERS.keys())
        raise ValueError(f"Unknown provider: {selected}. Available: {available}")

    return provider_cls(**kwargs)
|