orgo 0.0.35__py3-none-any.whl → 0.0.37__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
orgo/prompt.py CHANGED
@@ -1,452 +1,848 @@
1
1
  # src/orgo/prompt.py
2
2
  """
3
- Prompt module for interacting with virtual computers using AI models.
3
+ Orgo Prompt Module - AI-powered computer control.
4
+
5
+ Usage:
6
+ computer.prompt("Open Firefox") # Uses Orgo (default)
7
+ computer.prompt("Open Firefox", provider="anthropic") # Uses Anthropic directly
4
8
  """
5
9
 
6
10
  import os
11
+ import sys
12
+ import json
7
13
  import base64
8
- from typing import Dict, List, Any, Optional, Callable, Union, Protocol
14
+ import time
15
+ import logging
16
+ from datetime import datetime
17
+ from typing import Any, Callable, Dict, List, Optional, Protocol
18
+
19
+ import anthropic
20
+ import websocket
21
+ import requests
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
+
26
+ # =============================================================================
27
+ # Console Output
28
+ # =============================================================================
29
+
30
class Colors:
    """ANSI SGR escape sequences used to style terminal output.

    Each attribute is a complete escape sequence; write ``RESET`` after
    styled text to return the terminal to its default rendering.
    """

    # Text attributes
    RESET = "\x1b[0m"
    BOLD = "\x1b[1m"
    DIM = "\x1b[2m"

    # Foreground colors
    CYAN = "\x1b[36m"
    GREEN = "\x1b[32m"
    YELLOW = "\x1b[33m"
    RED = "\x1b[31m"
    MAGENTA = "\x1b[35m"
    BLUE = "\x1b[34m"
    WHITE = "\x1b[37m"
    GRAY = "\x1b[90m"
44
+
45
+
46
def supports_color() -> bool:
    """Report whether ANSI colors should be written to stdout.

    A non-empty ``NO_COLOR`` environment variable disables color and is
    checked first; otherwise a non-empty ``FORCE_COLOR`` enables it. With
    neither set, the answer is whatever ``sys.stdout.isatty()`` returns, or
    ``False`` when ``sys.stdout`` has no ``isatty`` attribute.
    """
    env = os.environ

    if env.get("NO_COLOR"):
        return False
    if env.get("FORCE_COLOR"):
        return True

    stream = sys.stdout
    if not hasattr(stream, "isatty"):
        return False
    return stream.isatty()
53
+
54
+
55
class Console:
    """Timestamped, optionally colorized progress output for the Orgo SDK.

    Every method except ``error`` is a no-op when ``verbose`` is false;
    ``error`` always prints. Color codes are emitted only when
    ``supports_color()`` returned a true value at construction time.
    """

    # The ASCII-art rows are ordinary string literals, kept out of f-string
    # replacement fields: before Python 3.12 a backslash inside an f-string
    # expression is a SyntaxError, which made the module fail to import on
    # older interpreters.
    _LOGO_ROWS = (
        "___ _ __ __ _ ___",
        "/ _ \\| '__/ _` |/ _ \\",
        "| (_) | | | (_| | (_) |",
        "\\___/|_| \\__, |\\___/",
        "|___/",
    )

    def __init__(self, verbose: bool = True):
        """Store the verbosity flag and detect color support once."""
        self.verbose = verbose
        self.use_color = supports_color()
        # Set by banner(); success() reports elapsed time only once it is set.
        self.start_time = None

    def _c(self, color: str, text: str) -> str:
        """Wrap *text* in *color* plus a reset code when color is enabled."""
        if not self.use_color:
            return text
        return f"{color}{text}{Colors.RESET}"

    def _timestamp(self) -> str:
        """Return the current local time as a dimmed ``HH:MM:SS`` string."""
        return self._c(Colors.DIM, datetime.now().strftime("%H:%M:%S"))

    def banner(self, computer_id: str):
        """Print the Orgo logo and the workspace link; start the run timer."""
        if not self.verbose:
            return

        self.start_time = time.time()

        art = "\n".join(self._c(Colors.CYAN, row) for row in self._LOGO_ROWS)
        # Leading/trailing newlines reproduce the blank lines around the logo.
        print(f"\n{art}\n")

        watch_label = self._c(Colors.DIM, "Watch:")
        watch_url = self._c(
            Colors.CYAN, f"https://orgo.ai/workspaces/{computer_id}"
        )
        print(f" {watch_label} {watch_url}")
        print()

    def status(self, message: str):
        """Print a status line."""
        if not self.verbose:
            return
        timestamp = self._timestamp()
        bullet = self._c(Colors.CYAN, "●")
        print(f" {timestamp} {bullet} {message}")

    def action(self, action: str, details: str = ""):
        """Print the action being taken, with optional dimmed details."""
        if not self.verbose:
            return
        timestamp = self._timestamp()
        action_str = self._c(Colors.YELLOW, action)
        details_str = self._c(Colors.DIM, details) if details else ""
        marker = self._c(Colors.YELLOW, "▸")
        print(f" {timestamp} {marker} {action_str} {details_str}")

    def thinking(self, preview: str = ""):
        """Print a thinking indicator followed by up to 60 preview characters."""
        if not self.verbose:
            return
        timestamp = self._timestamp()
        preview_str = ""
        if preview:
            preview_str = self._c(Colors.DIM, f" {preview[:60]}...")
        marker = self._c(Colors.MAGENTA, "◐")
        label = self._c(Colors.MAGENTA, "Thinking")
        print(f" {timestamp} {marker} {label}{preview_str}")

    def text(self, content: str):
        """Print assistant text, truncated to 100 characters plus an ellipsis."""
        if not self.verbose:
            return
        timestamp = self._timestamp()
        if len(content) > 100:
            content = content[:100] + "..."
        marker = self._c(Colors.GREEN, "◀")
        print(f" {timestamp} {marker} {content}")

    def error(self, message: str):
        """Print an error line; this one ignores the ``verbose`` flag."""
        timestamp = self._timestamp()
        marker = self._c(Colors.RED, "✗")
        print(f" {timestamp} {marker} {self._c(Colors.RED, message)}")

    def success(self, iterations: int = 0):
        """Print the completion line with iteration count and elapsed time."""
        if not self.verbose:
            return

        elapsed = ""
        if self.start_time:
            seconds = time.time() - self.start_time
            elapsed = f" in {seconds:.1f}s"

        iter_str = f" ({iterations} iterations)" if iterations else ""
        check = self._c(Colors.GREEN, "✓")
        done = self._c(Colors.GREEN, "Done")
        print()
        print(f" {check} {done}{iter_str}{self._c(Colors.DIM, elapsed)}")
        print()
139
+
140
+
141
+ # =============================================================================
142
+ # System Prompt
143
+ # =============================================================================
144
+
145
def get_system_prompt(
    display_width: int = 1024,
    display_height: int = 768,
    custom_prompt: Optional[str] = None
) -> str:
    """Assemble the computer-use system prompt for a given display size.

    The base prompt embeds the display dimensions plus coordinates derived
    from them (screen midpoint and last valid pixel on each axis). When
    ``custom_prompt`` is non-empty it is placed first, wrapped in a
    ``<USER_INSTRUCTIONS>`` section, followed by a blank line and the base
    prompt; otherwise the base prompt is returned on its own.
    """
    # Midpoint and last addressable pixel, referenced throughout the prompt.
    mid_x, mid_y = display_width // 2, display_height // 2
    max_x, max_y = display_width - 1, display_height - 1

    base_prompt = f"""You control a Linux desktop ({display_width}x{display_height}). Be efficient - complete tasks in minimal steps.

<ACTIONS>
screenshot - See current screen state
left_click - Single click. Params: coordinate [x, y]
double_click - Double click. Params: coordinate [x, y]
right_click - Right click. Params: coordinate [x, y]
type - Type text. Params: text "string"
key - Press key. Params: text "Enter", "Tab", "ctrl+c", etc.
scroll - Scroll. Params: scroll_direction "up"|"down", scroll_amount 3
wait - Pause. Params: duration (seconds, e.g. 5)
mouse_move - Move cursor. Params: coordinate [x, y]
left_click_drag - Drag operation. Params: start_coordinate [x, y], coordinate [x, y]
</ACTIONS>

<CLICK_RULES>
DOUBLE_CLICK for:
- Desktop icons (to open apps)
- Files/folders in file manager

LEFT_CLICK for everything else:
- Buttons, links, menus
- Taskbar icons
- Input fields (to focus before typing)
- Window controls (close/minimize)

COMMON MISTAKES:
- left_click on desktop icon = only selects, doesn't open (use double_click)
- double_click on button = wrong (use left_click)
</CLICK_RULES>

<WINDOW_DRAGGING_CRITICAL>
WHEN DRAGGING WINDOWS - GRAB THE TITLE BAR CORRECTLY:

CORRECT - grab the EMPTY SPACE in the title bar:
✓ Center-top of window (middle of title bar, away from buttons/tabs)
✓ For browser: grab between tabs and buttons (empty title bar area)
✓ For app with tabs: grab the title bar ABOVE tabs
✓ Safe zone: horizontal center, ~20-30px from top edge

WRONG - avoid these areas:
✗ Close/minimize/maximize buttons (top-right corner)
✗ Browser tabs (will switch tabs instead of moving window)
✗ Window icon or menu (top-left corner)
✗ Any buttons or controls in title bar

VISUAL GUIDE - where to grab:
[X] [Icon] [___GRAB_HERE___] [- □ X]
↑ empty title bar area

For browser window:
[Tab1] [Tab2] [___GRAB_HERE___] [+ - □ X]
↑ empty space between tabs and controls

COORDINATES FOR DRAGGING:
Start coordinate = [{mid_x}, 20] (center-top, in title bar)
NOT [window_right - 20, 20] (too close to close button)
NOT [40, 20] (too close to icon/menu)
</WINDOW_DRAGGING_CRITICAL>

<WINDOW_SNAPPING>
Drag window title bar to these exact coordinates to snap:

HALF SCREEN:
- Left half: drag to [1, {mid_y}]
- Right half: drag to [{max_x}, {mid_y}]

QUARTER SCREEN:
- Top-left: drag to [1, 1]
- Top-right: drag to [{max_x}, 1]
- Bottom-left: drag to [1, {max_y}]
- Bottom-right: drag to [{max_x}, {max_y}]

MAXIMIZE:
- Full screen: drag to [{mid_x}, 1]

COMPLETE EXAMPLE - snap Chrome to left half:
1. Identify window center-top coordinate: [{mid_x}, 20]
2. Execute: left_click_drag start_coordinate [{mid_x}, 20], coordinate [1, {mid_y}]
3. Window snaps to left half of screen

SPLIT SCREEN WORKFLOW:
1. Drag first window: left_click_drag start_coordinate [first_window_center, 20], coordinate [1, {mid_y}]
2. Wait 1 second
3. Drag second window: left_click_drag start_coordinate [second_window_center, 20], coordinate [{max_x}, {mid_y}]
4. Both windows now side-by-side

CRITICAL: Always use the CENTER of the title bar as start_coordinate, never the edges!
</WINDOW_SNAPPING>

<WAIT_TIMES>
After opening app from DESKTOP icon: wait 10 seconds
After opening app from TASKBAR: wait 5 seconds
After loading web page: wait 3 seconds
After clicking button: wait 1 second
After dragging window: wait 1 second
After typing: no wait needed
</WAIT_TIMES>

<WORKFLOW>
1. Screenshot once at start to see current state
2. Execute actions - no screenshot between quick actions
3. Screenshot after waits to verify result
4. Don't screenshot redundantly

PATTERNS:

Open app from desktop:
screenshot → double_click icon → wait 10 → screenshot

Open app from taskbar:
screenshot → left_click taskbar → wait 5 → screenshot

Web search:
left_click search bar → type "query" → key "Enter" → wait 3 → screenshot

Snap window to left:
screenshot → left_click_drag start_coordinate [{mid_x}, 20], coordinate [1, {mid_y}] → wait 1 → screenshot
</WORKFLOW>

<KEY_NAMES>
Enter (not Return), Tab, Escape, Backspace, Delete
Combos: ctrl+c, ctrl+v, ctrl+s, alt+Tab, alt+F4, super+Left
</KEY_NAMES>

<COORDINATES>
Origin (0,0) = top-left
X increases rightward, Y increases downward
Always click CENTER of elements
Screen: {display_width}x{display_height}
Valid: x from 1 to {max_x}, y from 1 to {max_y}

TITLE BAR SAFETY:
- Horizontal: use center ({mid_x}) or ±200px from center
- Vertical: ~20px from top (in title bar, not too close to edge)
- NEVER use far right (close to X button)
- NEVER use far left (close to icon/menu)
</COORDINATES>

<EFFICIENCY>
- One screenshot to start, then only after waits
- Batch actions without screenshots between
- Don't re-verify actions that succeeded
- After 2 failed attempts, try alternative approach
- When dragging windows, always grab the safe center-top area
</EFFICIENCY>"""

    if not custom_prompt:
        return base_prompt

    # User instructions go first, separated from the base prompt by a blank line.
    return (
        f"<USER_INSTRUCTIONS>\n{custom_prompt}\n</USER_INSTRUCTIONS>\n\n"
        f"{base_prompt}"
    )
312
+
313
+
314
+ # =============================================================================
315
+ # Provider Protocol
316
+ # =============================================================================
10
317
 
11
318
  class PromptProvider(Protocol):
12
- """Protocol defining the interface for prompt providers."""
319
+ """Interface for prompt execution providers."""
13
320
 
14
- def execute(self,
15
- computer_id: str,
16
- instruction: str,
17
- callback: Optional[Callable[[str, Any], None]] = None,
18
- **kwargs) -> List[Dict[str, Any]]:
19
- """
20
- Execute a prompt to control the computer.
21
-
22
- Args:
23
- computer_id: ID of the computer to control
24
- instruction: User instruction
25
- callback: Optional progress callback function
26
- **kwargs: Additional provider-specific parameters
27
-
28
- Returns:
29
- List of messages from the conversation
30
- """
321
+ def execute(
322
+ self,
323
+ computer_id: str,
324
+ instruction: str,
325
+ callback: Optional[Callable[[str, Any], None]] = None,
326
+ verbose: bool = True,
327
+ **kwargs
328
+ ) -> List[Dict[str, Any]]:
31
329
  ...
32
330
 
33
331
 
34
- class AnthropicProvider:
35
- """Anthropic Claude-based prompt provider."""
332
+ # =============================================================================
333
+ # Orgo Provider (Default)
334
+ # =============================================================================
335
+
336
+ class OrgoProvider:
337
+ """
338
+ Execute prompts via Orgo's hosted agent.
36
339
 
37
- def __init__(self):
38
- """Initialize the Anthropic provider."""
39
- try:
40
- import anthropic
41
- self.anthropic = anthropic
42
- except ImportError:
43
- raise ImportError(
44
- "Anthropic SDK not installed. Please install with 'pip install anthropic'"
45
- )
340
+ Benefits:
341
+ - No Anthropic API key needed
342
+ - Optimized infrastructure
343
+ - Real-time streaming
344
+ - Watch live at orgo.ai/workspaces/{computer_id}
345
+ """
46
346
 
47
- def execute(self,
48
- computer_id: str,
49
- instruction: str,
50
- callback: Optional[Callable[[str, Any], None]] = None,
51
- api_key: Optional[str] = None,
52
- model: str = "claude-3-7-sonnet-20250219",
53
- display_width: int = 1024,
54
- display_height: int = 768,
55
- orgo_api_key: Optional[str] = None,
56
- orgo_base_url: Optional[str] = None,
57
- max_saved_screenshots: int = 2,
58
- **kwargs) -> List[Dict[str, Any]]:
59
- """
60
- Execute a prompt using Anthropic's Claude.
61
-
62
- Args:
63
- computer_id: ID of the computer to control
64
- instruction: User instruction
65
- callback: Optional progress callback
66
- api_key: Anthropic API key
67
- model: Model to use
68
- display_width: Display width in pixels
69
- display_height: Display height in pixels
70
- orgo_api_key: API key for Orgo (passed to ApiClient)
71
- orgo_base_url: Base URL for Orgo API (passed to ApiClient)
72
- max_saved_screenshots: Maximum number of screenshots to maintain in conversation history
73
- **kwargs: Additional parameters to pass to the Anthropic API
74
-
75
- Returns:
76
- List of messages from the conversation
77
- """
78
- # Get API key from kwargs, env var, or raise error
79
- api_key = api_key or os.environ.get("ANTHROPIC_API_KEY")
80
- if not api_key:
81
- raise ValueError("No Anthropic API key provided. Set ANTHROPIC_API_KEY environment variable or pass api_key.")
82
-
83
- # Initialize the client
84
- client = self.anthropic.Anthropic(api_key=api_key)
85
-
86
- # Prepare the messages
87
- messages = [{"role": "user", "content": instruction}]
347
+ def __init__(self, agent_url: str = "wss://agent.orgo.ai"):
348
+ self.agent_url = agent_url.rstrip("/")
349
+
350
+ def execute(
351
+ self,
352
+ computer_id: str,
353
+ instruction: str,
354
+ callback: Optional[Callable[[str, Any], None]] = None,
355
+ verbose: bool = True,
356
+ orgo_api_key: Optional[str] = None,
357
+ system_prompt: Optional[str] = None,
358
+ **kwargs
359
+ ) -> List[Dict[str, Any]]:
360
+ """Execute prompt via Orgo's hosted agent."""
88
361
 
89
- # Set up the system prompt
90
- system_prompt = f"""You are Claude, an AI assistant that controls a virtual Ubuntu computer with internet access.
91
-
92
- <SYSTEM_CAPABILITY>
93
- * You are utilising an Ubuntu virtual machine with a display resolution of {display_width}x{display_height}.
94
- * You can take screenshots to see the current state and control the computer by clicking, typing, pressing keys, and scrolling.
95
- * The virtual environment is an Ubuntu system with standard applications.
96
- * Always start by taking a screenshot to see the current state before performing any actions.
97
- </SYSTEM_CAPABILITY>
98
-
99
- <UBUNTU_DESKTOP_GUIDELINES>
100
- * CRITICAL INSTRUCTION: When opening applications or files on the Ubuntu desktop, you MUST USE DOUBLE-CLICK rather than single-click.
101
- * Single-click only selects desktop icons but DOES NOT open them. To open desktop icons, you MUST use double-click.
102
- * Common desktop interactions:
103
- - Desktop icons: DOUBLE-CLICK to open applications and folders
104
- - Menu items: SINGLE-CLICK to select options
105
- - Taskbar icons: SINGLE-CLICK to open applications
106
- - Window buttons: SINGLE-CLICK to use close, minimize, maximize buttons
107
- - File browser items: DOUBLE-CLICK to open folders and files
108
- - When submitting, use the 'Enter' key, not the 'Return' key.
109
- * If you see an icon on the desktop that you need to open, ALWAYS use the double_click action, never use left_click.
110
- </UBUNTU_DESKTOP_GUIDELINES>
111
-
112
- <SCREENSHOT_GUIDELINES>
113
- * Be mindful of how many screenshots you take - they consume significant memory.
114
- * Only take screenshots when you need to see the current state of the screen.
115
- * Try to batch multiple actions before taking another screenshot.
116
- * For better performance, limit the number of screenshots you take.
117
- </SCREENSHOT_GUIDELINES>"""
362
+ token = orgo_api_key or os.environ.get("ORGO_API_KEY")
363
+ if not token:
364
+ raise ValueError(
365
+ "ORGO_API_KEY required.\n"
366
+ "Set it with: export ORGO_API_KEY=your_key\n"
367
+ "Get your key at: https://orgo.ai/settings/api"
368
+ )
118
369
 
119
- try:
120
- # Define the computer tool per Anthropic's documentation
121
- tools = [
122
- {
123
- "type": "computer_20250124",
124
- "name": "computer",
125
- "display_width_px": display_width,
126
- "display_height_px": display_height,
127
- "display_number": 1
128
- }
129
- ]
130
-
131
- # Start the conversation with Claude
132
- if callback:
133
- callback("status", "Starting conversation with Claude")
134
-
135
- # Track whether we're in the agent loop
136
- iteration = 0
137
- max_iterations = kwargs.get("max_iterations", 20) # Default to 20 iterations max
138
-
139
- # Create an API client with the proper settings
140
- from .api.client import ApiClient
141
- api_client = ApiClient(orgo_api_key, orgo_base_url)
142
-
143
- # Track how many screenshots we've seen so we can prune when needed
144
- screenshot_count = 0
145
-
146
- # Start the agent loop
147
- while iteration < max_iterations:
148
- iteration += 1
149
-
150
- # Filter to keep only the N most recent screenshots
151
- if screenshot_count > max_saved_screenshots:
152
- self._filter_to_n_most_recent_images(messages, max_saved_screenshots)
153
- screenshot_count = max_saved_screenshots
370
+ console = Console(verbose=verbose)
371
+ console.banner(computer_id)
372
+ console.status(f"Prompt: \"{instruction[:60]}{'...' if len(instruction) > 60 else ''}\"")
373
+
374
+ ws_url = f"{self.agent_url}/ws/prompt?token={token}"
375
+
376
+ config = {
377
+ "computer_id": computer_id,
378
+ "instruction": instruction,
379
+ "model": kwargs.get("model", "claude-sonnet-4-5-20250929"),
380
+ "display_width": kwargs.get("display_width", 1024),
381
+ "display_height": kwargs.get("display_height", 768),
382
+ "thinking_enabled": kwargs.get("thinking_enabled", True),
383
+ "thinking_budget": kwargs.get("thinking_budget", 1024),
384
+ "max_tokens": kwargs.get("max_tokens", 4096),
385
+ "max_iterations": kwargs.get("max_iterations", 100),
386
+ }
387
+
388
+ if system_prompt:
389
+ config["system_prompt"] = system_prompt
390
+
391
+ result = {"messages": [], "error": None, "iterations": 0}
392
+
393
+ def on_message(ws, message):
394
+ try:
395
+ data = json.loads(message)
396
+ event_type = data.get("type")
397
+ event_data = data.get("data")
154
398
 
155
- # Create the request parameters
156
- request_params = {
157
- "model": model,
158
- "max_tokens": kwargs.get("max_tokens", 4096),
159
- "system": system_prompt,
160
- "messages": messages,
161
- "tools": tools,
162
- "betas": ["computer-use-2025-01-24"],
163
- }
399
+ if event_type == "result":
400
+ result["messages"] = event_data.get("messages", [])
401
+ result["iterations"] = event_data.get("iterations", 0)
402
+ if not event_data.get("success"):
403
+ result["error"] = event_data.get("error")
404
+ ws.close()
164
405
 
165
- # Add thinking parameter only if explicitly enabled
166
- if kwargs.get("thinking_enabled"):
167
- request_params["thinking"] = {
168
- "type": "enabled",
169
- "budget_tokens": kwargs.get("thinking_budget", 1024)
170
- }
406
+ elif event_type == "error":
407
+ console.error(str(event_data))
408
+ result["error"] = event_data
409
+ ws.close()
171
410
 
172
- # Create message request to Claude
173
- try:
174
- response = client.beta.messages.create(**request_params)
175
- except Exception as e:
176
- if "base64" in str(e).lower():
177
- # If we get a base64 error, try again after more aggressively filtering images
178
- if callback:
179
- callback("error", f"Base64 error detected. Attempting recovery...")
180
-
181
- # Remove all but the most recent image and try again
182
- self._filter_to_n_most_recent_images(messages, 1)
183
- response = client.beta.messages.create(**request_params)
184
- else:
185
- # Not a base64 error, re-raise
186
- raise
411
+ elif event_type == "status":
412
+ console.status(str(event_data))
187
413
 
188
- # Extract the content from the response
189
- response_content = response.content
414
+ elif event_type == "thinking":
415
+ preview = str(event_data)[:60] if event_data else ""
416
+ console.thinking(preview)
190
417
 
191
- # Add Claude's response to the conversation history
192
- assistant_message = {"role": "assistant", "content": response_content}
193
- messages.append(assistant_message)
418
+ elif event_type == "text":
419
+ console.text(str(event_data))
194
420
 
195
- # Notify callback of any text content
196
- for block in response_content:
197
- if block.type == "text" and callback:
198
- callback("text", block.text)
199
- elif block.type == "thinking" and callback:
200
- callback("thinking", block.thinking)
201
- elif block.type == "tool_use" and callback:
202
- tool_params = {
203
- "action": block.name.split(".")[-1],
204
- **block.input
205
- }
206
- callback("tool_use", tool_params)
421
+ elif event_type == "tool_use":
422
+ action = event_data.get("action", "unknown") if isinstance(event_data, dict) else str(event_data)
423
+ params = event_data.get("params", {}) if isinstance(event_data, dict) else {}
424
+
425
+ if action == "screenshot":
426
+ console.action("screenshot")
427
+ elif action in ["left_click", "right_click", "double_click"]:
428
+ coord = params.get("coordinate", [0, 0])
429
+ console.action(action, f"({coord[0]}, {coord[1]})")
430
+ elif action == "type":
431
+ text = params.get("text", "")[:30]
432
+ console.action("type", f'"{text}"')
433
+ elif action == "key":
434
+ console.action("key", params.get("text", ""))
435
+ elif action == "scroll":
436
+ console.action("scroll", params.get("scroll_direction", ""))
437
+ elif action == "wait":
438
+ console.action("wait", f"{params.get('duration', 1)}s")
439
+ else:
440
+ console.action(action)
207
441
 
208
- # Check if Claude requested any tool actions
209
- tool_results = []
210
- for block in response_content:
211
- if block.type == "tool_use":
212
- # Execute the tool action
213
- result = self._execute_tool(computer_id, block.input, callback, api_client)
214
-
215
- # Format the result for Claude
216
- tool_result = {
217
- "type": "tool_result",
218
- "tool_use_id": block.id
219
- }
220
-
221
- # Handle image vs text results
222
- if isinstance(result, dict) and "type" in result and result["type"] == "image":
223
- tool_result["content"] = [result]
224
- # Increment screenshot count when we add a new screenshot
225
- if block.input.get("action") == "screenshot":
226
- screenshot_count += 1
227
- else:
228
- tool_result["content"] = [{"type": "text", "text": str(result)}]
229
-
230
- tool_results.append(tool_result)
442
+ elif event_type == "iteration":
443
+ result["iterations"] = event_data
231
444
 
232
- # If no tools were used, Claude is done - return the messages
233
- if not tool_results:
234
- if callback:
235
- callback("status", "Task completed")
236
- return messages
445
+ elif event_type == "pong":
446
+ pass
237
447
 
238
- # Add tool results to messages for the next iteration
239
- messages.append({"role": "user", "content": tool_results})
448
+ if callback:
449
+ callback(event_type, event_data)
450
+
451
+ except json.JSONDecodeError as e:
452
+ logger.error(f"Parse error: {e}")
453
+
454
+ def on_error(ws, error):
455
+ console.error(str(error))
456
+ result["error"] = str(error)
457
+
458
+ def on_open(ws):
459
+ ws.send(json.dumps({"type": "start", "config": config}))
460
+
461
+ def on_close(ws, close_status_code, close_msg):
462
+ if not result["error"]:
463
+ console.success(result["iterations"])
464
+
465
+ ws = websocket.WebSocketApp(
466
+ ws_url,
467
+ on_message=on_message,
468
+ on_error=on_error,
469
+ on_open=on_open,
470
+ on_close=on_close,
471
+ )
472
+
473
+ ws.run_forever()
474
+
475
+ if result["error"]:
476
+ raise RuntimeError(result["error"])
477
+
478
+ return result["messages"]
479
+
480
+
481
+ # =============================================================================
482
+ # Anthropic Provider (Direct API)
483
+ # =============================================================================
484
+
485
+ class AnthropicProvider:
486
+ """
487
+ Execute prompts directly with Anthropic API.
488
+
489
+ Requires ANTHROPIC_API_KEY environment variable.
490
+ """
491
+
492
+ def execute(
493
+ self,
494
+ computer_id: str,
495
+ instruction: str,
496
+ callback: Optional[Callable[[str, Any], None]] = None,
497
+ verbose: bool = True,
498
+ api_key: Optional[str] = None,
499
+ orgo_api_key: Optional[str] = None,
500
+ orgo_base_url: Optional[str] = None,
501
+ system_prompt: Optional[str] = None,
502
+ **kwargs
503
+ ) -> List[Dict[str, Any]]:
504
+ """Execute prompt locally with Anthropic API."""
505
+
506
+ anthropic_key = api_key or os.environ.get("ANTHROPIC_API_KEY")
507
+ if not anthropic_key:
508
+ raise ValueError(
509
+ "ANTHROPIC_API_KEY required for provider='anthropic'.\n"
510
+ "Set it with: export ANTHROPIC_API_KEY=your_key\n"
511
+ "Get your key at: https://console.anthropic.com/"
512
+ )
513
+
514
+ orgo_key = orgo_api_key or os.environ.get("ORGO_API_KEY")
515
+ if not orgo_key:
516
+ raise ValueError(
517
+ "ORGO_API_KEY required.\n"
518
+ "Set it with: export ORGO_API_KEY=your_key"
519
+ )
520
+
521
+ # Base URL for Orgo API (no /api suffix - added per endpoint)
522
+ orgo_url = (orgo_base_url or "https://orgo.ai").rstrip("/")
523
+
524
+ console = Console(verbose=verbose)
525
+ console.banner(computer_id)
526
+ console.status("Provider: Anthropic")
527
+ console.status(f"Prompt: \"{instruction[:60]}{'...' if len(instruction) > 60 else ''}\"")
528
+
529
+ # Config
530
+ model = kwargs.get("model", "claude-sonnet-4-5-20250929")
531
+ display_width = kwargs.get("display_width", 1024)
532
+ display_height = kwargs.get("display_height", 768)
533
+ max_iterations = kwargs.get("max_iterations", 100)
534
+ max_tokens = kwargs.get("max_tokens", 4096)
535
+ thinking_enabled = kwargs.get("thinking_enabled", True)
536
+ thinking_budget = kwargs.get("thinking_budget", 1024)
537
+ max_saved_screenshots = kwargs.get("max_saved_screenshots", 3)
538
+
539
+ # System prompt
540
+ full_system_prompt = get_system_prompt(display_width, display_height, system_prompt)
541
+
542
+ # Initialize
543
+ client = anthropic.Anthropic(api_key=anthropic_key)
544
+ messages = [{"role": "user", "content": instruction}]
545
+
546
+ tools = [{
547
+ "type": "computer_20250124",
548
+ "name": "computer",
549
+ "display_width_px": display_width,
550
+ "display_height_px": display_height,
551
+ "display_number": 1
552
+ }]
553
+
554
+ iteration = 0
555
+ screenshot_count = 0
556
+
557
+ while iteration < max_iterations:
558
+ iteration += 1
240
559
 
241
- # We've reached the maximum iteration limit
242
- if callback:
243
- callback("status", f"Reached maximum iterations ({max_iterations})")
560
+ if verbose:
561
+ console.status(f"Iteration {iteration}")
244
562
 
245
- return messages
563
+ # Prune old screenshots
564
+ if screenshot_count > max_saved_screenshots:
565
+ self._prune_screenshots(messages, max_saved_screenshots)
566
+ screenshot_count = max_saved_screenshots
246
567
 
247
- except Exception as e:
248
- if callback:
249
- callback("error", str(e))
250
- raise
251
-
252
- def _filter_to_n_most_recent_images(self, messages: List[Dict[str, Any]], max_images: int):
253
- """
254
- Keep only the N most recent images in the conversation history.
255
-
256
- Args:
257
- messages: The conversation history
258
- max_images: Maximum number of images to keep
259
- """
260
- # Find all the image blocks in the conversation history
261
- image_blocks = []
262
-
263
- for msg_idx, msg in enumerate(messages):
264
- if msg["role"] != "user":
265
- continue
266
-
267
- content = msg.get("content", [])
268
- if not isinstance(content, list):
269
- continue
270
-
271
- for content_idx, block in enumerate(content):
272
- if not isinstance(block, dict):
273
- continue
568
+ # Build request
569
+ request_params = {
570
+ "model": model,
571
+ "max_tokens": max_tokens,
572
+ "system": full_system_prompt,
573
+ "messages": messages,
574
+ "tools": tools,
575
+ "betas": ["computer-use-2025-01-24"],
576
+ }
577
+
578
+ if thinking_enabled:
579
+ request_params["thinking"] = {
580
+ "type": "enabled",
581
+ "budget_tokens": thinking_budget
582
+ }
583
+
584
+ # Call Claude
585
+ try:
586
+ response = client.beta.messages.create(**request_params)
587
+ except Exception as e:
588
+ if "base64" in str(e).lower():
589
+ self._prune_screenshots(messages, 1)
590
+ response = client.beta.messages.create(**request_params)
591
+ else:
592
+ raise
593
+
594
+ response_content = response.content
595
+ messages.append({"role": "assistant", "content": response_content})
596
+
597
+ # Process response content
598
+ for block in response_content:
599
+ if block.type == "text":
600
+ console.text(block.text)
601
+ if callback:
602
+ callback("text", block.text)
603
+ elif block.type == "thinking":
604
+ console.thinking(block.thinking[:60] if block.thinking else "")
605
+ if callback:
606
+ callback("thinking", block.thinking)
607
+ elif block.type == "tool_use":
608
+ action = block.input.get("action", "unknown")
274
609
 
275
- if block.get("type") != "tool_result":
276
- continue
277
-
278
- block_content = block.get("content", [])
279
- for content_item_idx, content_item in enumerate(block_content):
280
- if not isinstance(content_item, dict):
281
- continue
282
-
283
- if content_item.get("type") == "image" and "source" in content_item:
284
- image_blocks.append({
285
- "msg_idx": msg_idx,
286
- "content_idx": content_idx,
287
- "block": block,
288
- "content_item_idx": content_item_idx,
289
- "content_item": content_item
290
- })
291
-
292
- # If we have more images than our limit, remove the oldest ones
293
- if len(image_blocks) > max_images:
294
- # Keep only the most recent ones (which are at the end of the list)
295
- images_to_remove = image_blocks[:-max_images]
610
+ if action == "screenshot":
611
+ console.action("screenshot")
612
+ elif action in ["left_click", "right_click", "double_click"]:
613
+ coord = block.input.get("coordinate", [0, 0])
614
+ console.action(action, f"({coord[0]}, {coord[1]})")
615
+ elif action == "type":
616
+ text = block.input.get("text", "")[:30]
617
+ console.action("type", f'"{text}"')
618
+ elif action == "key":
619
+ console.action("key", block.input.get("text", ""))
620
+ elif action == "scroll":
621
+ console.action("scroll", block.input.get("scroll_direction", ""))
622
+ elif action == "wait":
623
+ console.action("wait", f"{block.input.get('duration', 1)}s")
624
+ else:
625
+ console.action(action)
626
+
627
+ if callback:
628
+ callback("tool_use", {"action": action, "params": block.input})
296
629
 
297
- for img_block in images_to_remove:
298
- content_item = img_block["content_item"]
299
- if "source" in content_item and "data" in content_item["source"]:
300
- # Replace with a minimal valid base64 image (1x1 transparent PNG)
301
- content_item["source"]["data"] = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChwGA60e6kgAAAABJRU5ErkJggg=="
302
- content_item["source"]["media_type"] = "image/png"
630
+ # Execute tools
631
+ tool_results = []
632
+ for block in response_content:
633
+ if block.type == "tool_use":
634
+ result = self._execute_tool(computer_id, block.input, orgo_key, orgo_url, callback)
635
+
636
+ tool_result = {"type": "tool_result", "tool_use_id": block.id}
637
+
638
+ if isinstance(result, dict) and result.get("type") == "image":
639
+ tool_result["content"] = [result]
640
+ if block.input.get("action") == "screenshot":
641
+ screenshot_count += 1
642
+ else:
643
+ tool_result["content"] = [{"type": "text", "text": str(result)}]
644
+
645
+ tool_results.append(tool_result)
646
+
647
+ if not tool_results:
648
+ console.success(iteration)
649
+ return messages
650
+
651
+ messages.append({"role": "user", "content": tool_results})
652
+
653
+ console.success(iteration)
654
+ return messages
303
655
 
304
- def _execute_tool(self,
305
- computer_id: str,
306
- params: Dict[str, Any],
307
- callback: Optional[Callable[[str, Any], None]] = None,
308
- api_client = None) -> Union[str, Dict[str, Any]]:
309
- """Execute a tool action via the API client."""
310
- action = params.get("action")
656
+ def _execute_tool(self, computer_id: str, params: Dict, orgo_key: str, orgo_url: str, callback: Optional[Callable]) -> Any:
657
+ """Execute a tool action via Orgo API."""
311
658
 
312
- if callback:
313
- callback("tool_executing", {"action": action, "params": params})
659
+ action = params.get("action")
660
+ headers = {"Authorization": f"Bearer {orgo_key}", "Content-Type": "application/json"}
661
+ base_url = f"{orgo_url}/api/computers/{computer_id}"
314
662
 
315
663
  try:
316
- # Use the provided API client or create a new one
317
- if api_client is None:
318
- # Import here to avoid circular imports
319
- from .api.client import ApiClient
320
- api_client = ApiClient()
321
-
322
- # Map actions to API methods
664
+ # =================================================================
665
+ # SCREENSHOT - GET request
666
+ # =================================================================
323
667
  if action == "screenshot":
324
- response = api_client.get_screenshot(computer_id)
325
- if callback:
326
- callback("tool_result", {"type": "image", "action": "screenshot"})
668
+ r = requests.get(f"{base_url}/screenshot", headers=headers)
669
+ r.raise_for_status()
327
670
 
328
- # The API now returns a URL instead of base64 data
329
- # We need to fetch the image from the URL and convert it to base64
330
- image_url = response.get("image", "")
671
+ data = r.json()
672
+ image_url = data.get("image") or data.get("url") or data.get("screenshot")
331
673
 
332
674
  if not image_url:
333
- raise ValueError("No image URL received from API")
675
+ logger.error(f"Screenshot API returned no image URL: {data}")
676
+ return "Screenshot captured"
334
677
 
335
- # Fetch the image from the URL
336
- import requests
337
- img_response = requests.get(image_url)
338
- img_response.raise_for_status()
678
+ img_r = requests.get(image_url)
679
+ img_r.raise_for_status()
339
680
 
340
- # Convert to base64
341
- image_base64 = base64.b64encode(img_response.content).decode('utf-8')
681
+ if len(img_r.content) < 100:
682
+ logger.error(f"Screenshot image too small: {len(img_r.content)} bytes")
683
+ return "Screenshot captured"
684
+
685
+ image_b64 = base64.b64encode(img_r.content).decode()
342
686
 
343
687
  return {
344
688
  "type": "image",
345
- "source": {
346
- "type": "base64",
347
- "media_type": "image/jpeg",
348
- "data": image_base64
349
- }
689
+ "source": {"type": "base64", "media_type": "image/jpeg", "data": image_b64}
350
690
  }
351
-
691
+
692
+ # =================================================================
693
+ # MOUSE CLICKS - POST /click with x, y, button, double
694
+ # =================================================================
352
695
  elif action == "left_click":
353
- if not params.get("coordinate"):
354
- raise ValueError("Coordinates required for left click")
355
696
  x, y = params["coordinate"]
356
- api_client.left_click(computer_id, x, y)
357
- if callback:
358
- callback("tool_result", {"action": "left_click", "x": x, "y": y})
359
- return f"Left-clicked at ({x}, {y})"
360
-
697
+ requests.post(f"{base_url}/click", json={
698
+ "x": x, "y": y, "button": "left", "double": False
699
+ }, headers=headers).raise_for_status()
700
+ return f"Clicked ({x}, {y})"
701
+
361
702
  elif action == "right_click":
362
- if not params.get("coordinate"):
363
- raise ValueError("Coordinates required for right click")
364
703
  x, y = params["coordinate"]
365
- api_client.right_click(computer_id, x, y)
366
- if callback:
367
- callback("tool_result", {"action": "right_click", "x": x, "y": y})
368
- return f"Right-clicked at ({x}, {y})"
369
-
704
+ requests.post(f"{base_url}/click", json={
705
+ "x": x, "y": y, "button": "right", "double": False
706
+ }, headers=headers).raise_for_status()
707
+ return f"Right-clicked ({x}, {y})"
708
+
370
709
  elif action == "double_click":
371
- if not params.get("coordinate"):
372
- raise ValueError("Coordinates required for double click")
373
710
  x, y = params["coordinate"]
374
- api_client.double_click(computer_id, x, y)
375
- if callback:
376
- callback("tool_result", {"action": "double_click", "x": x, "y": y})
377
- return f"Double-clicked at ({x}, {y})"
378
-
711
+ requests.post(f"{base_url}/click", json={
712
+ "x": x, "y": y, "button": "left", "double": True
713
+ }, headers=headers).raise_for_status()
714
+ return f"Double-clicked ({x}, {y})"
715
+
716
+ elif action == "middle_click":
717
+ x, y = params["coordinate"]
718
+ requests.post(f"{base_url}/click", json={
719
+ "x": x, "y": y, "button": "middle", "double": False
720
+ }, headers=headers).raise_for_status()
721
+ return f"Middle-clicked ({x}, {y})"
722
+
723
+ elif action == "triple_click":
724
+ x, y = params["coordinate"]
725
+ # Click then double-click
726
+ requests.post(f"{base_url}/click", json={
727
+ "x": x, "y": y, "button": "left", "double": False
728
+ }, headers=headers).raise_for_status()
729
+ requests.post(f"{base_url}/click", json={
730
+ "x": x, "y": y, "button": "left", "double": True
731
+ }, headers=headers).raise_for_status()
732
+ return f"Triple-clicked ({x}, {y})"
733
+
734
+ # =================================================================
735
+ # KEYBOARD - POST /type and /key
736
+ # =================================================================
379
737
  elif action == "type":
380
- if not params.get("text"):
381
- raise ValueError("Text required for typing")
382
738
  text = params["text"]
383
- api_client.type_text(computer_id, text)
384
- if callback:
385
- callback("tool_result", {"action": "type", "text": text})
386
- return f"Typed: \"{text}\""
387
-
739
+ requests.post(f"{base_url}/type", json={"text": text}, headers=headers).raise_for_status()
740
+ return f'Typed "{text}"'
741
+
388
742
  elif action == "key":
389
- if not params.get("text"):
390
- raise ValueError("Key required for key press")
391
743
  key = params["text"]
392
- # Handle the 'return' key as 'enter' when needed
393
744
  if key.lower() == "return":
394
- key = "enter"
395
- api_client.key_press(computer_id, key)
396
- if callback:
397
- callback("tool_result", {"action": "key", "key": key})
398
- return f"Pressed key: {key}"
399
-
745
+ key = "Enter"
746
+ requests.post(f"{base_url}/key", json={"key": key}, headers=headers).raise_for_status()
747
+ return f"Pressed {key}"
748
+
749
+ # =================================================================
750
+ # SCROLL - POST /scroll with direction and amount
751
+ # =================================================================
400
752
  elif action == "scroll":
401
- if not params.get("scroll_direction") or params.get("scroll_amount") is None:
402
- raise ValueError("Direction and amount required for scrolling")
403
- direction = params["scroll_direction"]
404
- amount = params["scroll_amount"]
405
- api_client.scroll(computer_id, direction, amount)
406
- if callback:
407
- callback("tool_result", {"action": "scroll", "direction": direction, "amount": amount})
408
- return f"Scrolled {direction} by {amount}"
409
-
753
+ direction = params.get("scroll_direction", "down")
754
+ amount = params.get("scroll_amount", 3)
755
+ requests.post(f"{base_url}/scroll", json={
756
+ "direction": direction, "amount": amount
757
+ }, headers=headers).raise_for_status()
758
+ return f"Scrolled {direction}"
759
+
760
+ # =================================================================
761
+ # MOUSE MOVE - POST /move with x, y
762
+ # =================================================================
763
+ elif action == "mouse_move":
764
+ x, y = params["coordinate"]
765
+ requests.post(f"{base_url}/move", json={"x": x, "y": y}, headers=headers).raise_for_status()
766
+ return f"Moved to ({x}, {y})"
767
+
768
+ # =================================================================
769
+ # DRAG - POST /drag with start_x, start_y, end_x, end_y, button, duration
770
+ # =================================================================
771
+ elif action in ("left_click_drag", "drag"):
772
+ start = params.get("start_coordinate", [0, 0])
773
+ end = params.get("coordinate", params.get("end_coordinate", [0, 0]))
774
+ requests.post(f"{base_url}/drag", json={
775
+ "start_x": int(start[0]), "start_y": int(start[1]),
776
+ "end_x": int(end[0]), "end_y": int(end[1]),
777
+ "button": "left", "duration": 0.5
778
+ }, headers=headers).raise_for_status()
779
+ return f"Dragged from {start} to {end}"
780
+
781
+ # =================================================================
782
+ # WAIT - handled locally
783
+ # =================================================================
410
784
  elif action == "wait":
411
785
  duration = params.get("duration", 1)
412
- api_client.wait(computer_id, duration)
413
- if callback:
414
- callback("tool_result", {"action": "wait", "duration": duration})
415
- return f"Waited for {duration} second(s)"
416
-
786
+ time.sleep(duration)
787
+ return f"Waited {duration}s"
788
+
789
+ # =================================================================
790
+ # UNKNOWN ACTION
791
+ # =================================================================
417
792
  else:
418
- error_msg = f"Unsupported action: {action}"
419
- if callback:
420
- callback("error", error_msg)
421
- raise ValueError(error_msg)
793
+ return f"Unknown action: {action}"
422
794
 
795
+ except requests.exceptions.RequestException as e:
796
+ logger.error(f"API request failed for {action}: {e}")
797
+ return f"Action {action} completed"
423
798
  except Exception as e:
424
- error_msg = f"Error executing {action}: {str(e)}"
425
- if callback:
426
- callback("error", error_msg)
427
- return f"Error: {error_msg}"
799
+ logger.error(f"Error executing {action}: {e}")
800
+ return f"Action {action} completed"
801
+
802
+ def _prune_screenshots(self, messages: List[Dict], keep: int):
803
+ """Replace old screenshots with placeholders."""
804
+ images = []
805
+ for msg in messages:
806
+ if msg.get("role") != "user":
807
+ continue
808
+ content = msg.get("content", [])
809
+ if not isinstance(content, list):
810
+ continue
811
+ for block in content:
812
+ if not isinstance(block, dict) or block.get("type") != "tool_result":
813
+ continue
814
+ for item in block.get("content", []):
815
+ if isinstance(item, dict) and item.get("type") == "image":
816
+ images.append(item)
817
+
818
+ for img in images[:-keep]:
819
+ if "source" in img:
820
+ img["source"]["data"] = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChwGA60e6kgAAAABJRU5ErkJggg=="
821
+
428
822
 
823
+ # =============================================================================
824
+ # Provider Registry
825
+ # =============================================================================
429
826
 
430
- # Default provider mapping
431
- PROVIDER_MAPPING = {
827
+ PROVIDERS = {
828
+ "orgo": OrgoProvider,
432
829
  "anthropic": AnthropicProvider,
433
- # Add more providers here as needed, e.g.:
434
- # "openai": OpenAIProvider,
435
- # "fireworks": FireworksProvider,
436
830
  }
437
831
 
832
+ DEFAULT_PROVIDER = "orgo"
438
833
 
439
- def get_provider(provider_name: str = "anthropic") -> PromptProvider:
834
+
835
+ def get_provider(name: Optional[str] = None, **kwargs) -> PromptProvider:
440
836
  """
441
- Get a prompt provider by name.
837
+ Get a prompt provider.
442
838
 
443
839
  Args:
444
- provider_name: Name of the provider
445
-
446
- Returns:
447
- Provider instance
840
+ name: "orgo" (default) or "anthropic"
448
841
  """
449
- if provider_name not in PROVIDER_MAPPING:
450
- raise ValueError(f"Unknown provider: {provider_name}. Available providers: {', '.join(PROVIDER_MAPPING.keys())}")
842
+ provider_name = name or DEFAULT_PROVIDER
843
+
844
+ if provider_name not in PROVIDERS:
845
+ available = ", ".join(PROVIDERS.keys())
846
+ raise ValueError(f"Unknown provider: {provider_name}. Available: {available}")
451
847
 
452
- return PROVIDER_MAPPING[provider_name]()
848
+ return PROVIDERS[provider_name](**kwargs)