cua-agent 0.4.12__py3-none-any.whl → 0.4.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic. Click here for more details.

@@ -0,0 +1,70 @@
1
+ """
2
+ Base computer interface protocol for agent interactions.
3
+ """
4
+
5
+ from typing import Protocol, Literal, List, Dict, Any, Union, Optional, runtime_checkable
6
+
7
+
8
@runtime_checkable
class AsyncComputerHandler(Protocol):
    """Structural interface every asynchronous computer handler has to provide.

    The protocol is ``runtime_checkable``, so ``isinstance`` can be used to
    test a candidate object; such a check only confirms that the methods
    exist, not that their signatures match.
    """

    # ==== Computer-Use-Preview Action Space ====

    async def get_environment(self) -> Literal["windows", "mac", "linux", "browser"]:
        """Report which kind of environment is being driven."""
        ...

    async def get_dimensions(self) -> tuple[int, int]:
        """Return the screen size as a ``(width, height)`` pair."""
        ...

    async def screenshot(self) -> str:
        """Capture the screen and return the image as a base64 string."""
        ...

    async def click(self, x: int, y: int, button: str = "left") -> None:
        """Click at ``(x, y)`` using the given mouse button."""
        ...

    async def double_click(self, x: int, y: int) -> None:
        """Double-click at ``(x, y)``."""
        ...

    async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
        """Scroll by ``(scroll_x, scroll_y)`` at position ``(x, y)``."""
        ...

    async def type(self, text: str) -> None:
        """Type the given text."""
        ...

    async def wait(self, ms: int = 1000) -> None:
        """Pause for ``ms`` milliseconds."""
        ...

    async def move(self, x: int, y: int) -> None:
        """Move the cursor to ``(x, y)``."""
        ...

    async def keypress(self, keys: Union[List[str], str]) -> None:
        """Press a key or key combination."""
        ...

    async def drag(self, path: List[Dict[str, int]]) -> None:
        """Drag the pointer along the given path."""
        ...

    async def get_current_url(self) -> str:
        """Return the current URL (meaningful for browser environments)."""
        ...

    # ==== Anthropic Action Space ====

    async def left_mouse_down(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
        """Press the left mouse button, optionally at ``(x, y)``."""
        ...

    async def left_mouse_up(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
        """Release the left mouse button, optionally at ``(x, y)``."""
        ...
@@ -4,35 +4,44 @@ Computer handler implementation for OpenAI computer-use-preview protocol.
4
4
 
5
5
  import base64
6
6
  from typing import Dict, List, Any, Literal, Union, Optional
7
- from .types import Computer
7
+ from .base import AsyncComputerHandler
8
+ from computer import Computer
8
9
 
9
-
10
- class OpenAIComputerHandler:
10
+ class cuaComputerHandler(AsyncComputerHandler):
11
11
  """Computer handler that implements the Computer protocol using the computer interface."""
12
12
 
13
- def __init__(self, computer_interface):
13
+ def __init__(self, cua_computer: Computer):
14
14
  """Initialize with a computer interface (from tool schema)."""
15
- self.interface = computer_interface
15
+ self.cua_computer = cua_computer
16
+ self.interface = None
17
+
18
+ async def _initialize(self):
19
+ if hasattr(self.cua_computer, '_initialized') and not self.cua_computer._initialized:
20
+ await self.cua_computer.run()
21
+ self.interface = self.cua_computer.interface
16
22
 
17
23
  # ==== Computer-Use-Preview Action Space ====
18
24
 
19
25
  async def get_environment(self) -> Literal["windows", "mac", "linux", "browser"]:
20
26
  """Get the current environment type."""
21
- # For now, return a default - this could be enhanced to detect actual environment
22
- return "windows"
27
+ # TODO: detect actual environment
28
+ return "linux"
23
29
 
24
30
  async def get_dimensions(self) -> tuple[int, int]:
25
31
  """Get screen dimensions as (width, height)."""
32
+ assert self.interface is not None
26
33
  screen_size = await self.interface.get_screen_size()
27
34
  return screen_size["width"], screen_size["height"]
28
35
 
29
36
  async def screenshot(self) -> str:
30
37
  """Take a screenshot and return as base64 string."""
38
+ assert self.interface is not None
31
39
  screenshot_bytes = await self.interface.screenshot()
32
40
  return base64.b64encode(screenshot_bytes).decode('utf-8')
33
41
 
34
42
  async def click(self, x: int, y: int, button: str = "left") -> None:
35
43
  """Click at coordinates with specified button."""
44
+ assert self.interface is not None
36
45
  if button == "left":
37
46
  await self.interface.left_click(x, y)
38
47
  elif button == "right":
@@ -43,28 +52,34 @@ class OpenAIComputerHandler:
43
52
 
44
53
  async def double_click(self, x: int, y: int) -> None:
45
54
  """Double click at coordinates."""
55
+ assert self.interface is not None
46
56
  await self.interface.double_click(x, y)
47
57
 
48
58
  async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
49
59
  """Scroll at coordinates with specified scroll amounts."""
60
+ assert self.interface is not None
50
61
  await self.interface.move_cursor(x, y)
51
62
  await self.interface.scroll(scroll_x, scroll_y)
52
63
 
53
64
  async def type(self, text: str) -> None:
54
65
  """Type text."""
66
+ assert self.interface is not None
55
67
  await self.interface.type_text(text)
56
68
 
57
69
  async def wait(self, ms: int = 1000) -> None:
58
70
  """Wait for specified milliseconds."""
71
+ assert self.interface is not None
59
72
  import asyncio
60
73
  await asyncio.sleep(ms / 1000.0)
61
74
 
62
75
  async def move(self, x: int, y: int) -> None:
63
76
  """Move cursor to coordinates."""
77
+ assert self.interface is not None
64
78
  await self.interface.move_cursor(x, y)
65
79
 
66
80
  async def keypress(self, keys: Union[List[str], str]) -> None:
67
81
  """Press key combination."""
82
+ assert self.interface is not None
68
83
  if isinstance(keys, str):
69
84
  keys = keys.replace("-", "+").split("+")
70
85
  if len(keys) == 1:
@@ -75,6 +90,7 @@ class OpenAIComputerHandler:
75
90
 
76
91
  async def drag(self, path: List[Dict[str, int]]) -> None:
77
92
  """Drag along specified path."""
93
+ assert self.interface is not None
78
94
  if not path:
79
95
  return
80
96
 
@@ -99,23 +115,10 @@ class OpenAIComputerHandler:
99
115
  # ==== Anthropic Computer Action Space ====
100
116
  async def left_mouse_down(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
101
117
  """Left mouse down at coordinates."""
118
+ assert self.interface is not None
102
119
  await self.interface.mouse_down(x, y, button="left")
103
120
 
104
121
  async def left_mouse_up(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
105
122
  """Left mouse up at coordinates."""
106
- await self.interface.mouse_up(x, y, button="left")
107
-
108
- def acknowledge_safety_check_callback(message: str, allow_always: bool = False) -> bool:
109
- """Safety check callback for user acknowledgment."""
110
- if allow_always:
111
- return True
112
- response = input(
113
- f"Safety Check Warning: {message}\nDo you want to acknowledge and proceed? (y/n): "
114
- ).lower()
115
- return response.strip() == "y"
116
-
117
-
118
- def check_blocklisted_url(url: str) -> None:
119
- """Check if URL is blocklisted (placeholder implementation)."""
120
- # This would contain actual URL checking logic
121
- pass
123
+ assert self.interface is not None
124
+ await self.interface.mouse_up(x, y, button="left")
@@ -0,0 +1,209 @@
1
+ """
2
+ Custom computer handler implementation that accepts a dictionary of functions.
3
+ """
4
+
5
+ import base64
6
+ from typing import Dict, List, Any, Literal, Union, Optional, Callable
7
+ from PIL import Image
8
+ import io
9
+ from .base import AsyncComputerHandler
10
+
11
+
12
class CustomComputerHandler(AsyncComputerHandler):
    """Computer handler backed by a user-supplied dictionary of functions.

    Each protocol method looks up an entry with the same name in
    ``functions``. An entry may be a plain function, a coroutine function,
    any other callable that returns an awaitable, or (for value lookups such
    as ``environment``) a constant. Only ``'screenshot'`` is mandatory; an
    action without an entry is silently skipped.
    """

    # Values get_environment() is allowed to return.
    _ENVIRONMENTS = ("windows", "mac", "linux", "browser")

    def __init__(self, functions: Dict[str, Callable]):
        """
        Initialize with a dictionary of functions.

        Args:
            functions: Dictionary where keys are method names and values are callable functions.
                Only 'screenshot' is required, all others are optional.

        Raises:
            ValueError: If required 'screenshot' function is not provided.
        """
        if 'screenshot' not in functions:
            raise ValueError("'screenshot' function is required in functions dictionary")

        self.functions = functions
        # Size of the most recent screenshot; fallback for get_dimensions().
        self._last_screenshot_size: Optional[tuple[int, int]] = None

    async def _call_function(self, func, *args, **kwargs):
        """
        Call a function, handling both async and sync functions.

        The callable is invoked first and its result awaited only if it is
        awaitable. This also covers partials, lambdas and callable objects
        that wrap a coroutine function, which
        ``inspect.iscoroutinefunction`` does not recognise.

        Args:
            func: The function to call (a non-callable is returned as-is)
            *args: Positional arguments to pass to the function
            **kwargs: Keyword arguments to pass to the function

        Returns:
            The result of the function call
        """
        import inspect

        if not callable(func):
            return func

        result = func(*args, **kwargs)
        if inspect.isawaitable(result):
            result = await result
        return result

    async def _call_optional(self, name: str, *args) -> None:
        """Invoke the optional action ``name`` if it was supplied; otherwise do nothing."""
        if name in self.functions:
            await self._call_function(self.functions[name], *args)

    async def _get_value(self, attribute: str):
        """
        Get value for an attribute, checking both 'get_{attribute}' and '{attribute}' keys.

        Args:
            attribute: The attribute name to look for

        Returns:
            The value from the functions dict, called if callable, returned directly if not.
            None when neither key is present.
        """
        # 'get_{attribute}' takes precedence over the bare '{attribute}' key.
        for key in (f"get_{attribute}", attribute):
            if key in self.functions:
                return await self._call_function(self.functions[key])
        return None

    def _to_b64_str(self, img: Union[bytes, Image.Image, str]) -> str:
        """
        Convert image to base64 string.

        Args:
            img: Image as bytes, PIL Image, or base64 string

        Returns:
            str: Base64 encoded image string

        Raises:
            ValueError: If ``img`` is none of the supported types.
        """
        if isinstance(img, str):
            # Already a base64 string
            return img
        if isinstance(img, bytes):
            # Raw bytes
            return base64.b64encode(img).decode('utf-8')
        if isinstance(img, Image.Image):
            # PIL Image: serialise as PNG before encoding
            buffer = io.BytesIO()
            img.save(buffer, format='PNG')
            return base64.b64encode(buffer.getvalue()).decode('utf-8')
        raise ValueError(f"Unsupported image type: {type(img)}")

    def _record_screenshot_size(self, shot) -> None:
        """Best-effort: remember the pixel size of ``shot`` for get_dimensions()."""
        try:
            if isinstance(shot, Image.Image):
                self._last_screenshot_size = shot.size
                return
            if isinstance(shot, str):
                # Base64 text must be decoded before the image header can be read;
                # tolerate a "data:...;base64," prefix in front of the payload.
                if shot.startswith("data:") and "," in shot:
                    shot = shot.split(",", 1)[1]
                shot = base64.b64decode(shot)
            if isinstance(shot, bytes):
                self._last_screenshot_size = Image.open(io.BytesIO(shot)).size
        except Exception:
            # Deliberately swallowed: dimensions are only a convenience, and an
            # undecodable payload must not make screenshot() itself fail.
            pass

    # ==== Computer-Use-Preview Action Space ====

    async def get_environment(self) -> Literal["windows", "mac", "linux", "browser"]:
        """Get the current environment type.

        Uses the 'get_environment' / 'environment' entry when present and
        defaults to "linux" otherwise.

        Raises:
            ValueError: If the supplied value is not a supported environment.
        """
        result = await self._get_value('environment')
        if result is None:
            return "linux"
        # Explicit check instead of ``assert`` so it still runs under ``python -O``.
        if result not in self._ENVIRONMENTS:
            raise ValueError(
                f"Unsupported environment {result!r}; expected one of {list(self._ENVIRONMENTS)}"
            )
        return result  # type: ignore

    async def get_dimensions(self) -> tuple[int, int]:
        """Get screen dimensions as (width, height).

        Uses the 'get_dimensions' / 'dimensions' entry when present; otherwise
        falls back to the size of the last screenshot, taking one if needed.

        Raises:
            RuntimeError: If no size could be derived from a screenshot.
        """
        result = await self._get_value('dimensions')
        if result is not None:
            return result  # type: ignore

        # Fallback: use last screenshot size if available
        if not self._last_screenshot_size:
            await self.screenshot()
        if self._last_screenshot_size is None:
            raise RuntimeError("Failed to get screenshot size")

        return self._last_screenshot_size

    async def screenshot(self) -> str:
        """Take a screenshot and return as base64 string.

        Also records the image size (when it can be determined) for the
        get_dimensions() fallback.
        """
        result = await self._call_function(self.functions['screenshot'])
        b64_str = self._to_b64_str(result)  # type: ignore
        self._record_screenshot_size(result)
        return b64_str

    async def click(self, x: int, y: int, button: str = "left") -> None:
        """Click at coordinates with specified button (no-op if not implemented)."""
        await self._call_optional('click', x, y, button)

    async def double_click(self, x: int, y: int) -> None:
        """Double click at coordinates (no-op if not implemented)."""
        await self._call_optional('double_click', x, y)

    async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
        """Scroll at coordinates with specified scroll amounts (no-op if not implemented)."""
        await self._call_optional('scroll', x, y, scroll_x, scroll_y)

    async def type(self, text: str) -> None:
        """Type text (no-op if not implemented)."""
        await self._call_optional('type', text)

    async def wait(self, ms: int = 1000) -> None:
        """Wait for specified milliseconds; sleeps via asyncio when no 'wait' entry exists."""
        if 'wait' in self.functions:
            await self._call_function(self.functions['wait'], ms)
        else:
            # Default implementation
            import asyncio
            await asyncio.sleep(ms / 1000.0)

    async def move(self, x: int, y: int) -> None:
        """Move cursor to coordinates (no-op if not implemented)."""
        await self._call_optional('move', x, y)

    async def keypress(self, keys: Union[List[str], str]) -> None:
        """Press key combination (no-op if not implemented)."""
        await self._call_optional('keypress', keys)

    async def drag(self, path: List[Dict[str, int]]) -> None:
        """Drag along specified path (no-op if not implemented)."""
        await self._call_optional('drag', path)

    async def get_current_url(self) -> str:
        """Get current URL (for browser environments).

        Accepts either a 'get_current_url' or a 'current_url' entry, like the
        other value lookups; returns "" when neither is supplied or the
        entry yields None.
        """
        result = await self._get_value('current_url')
        return "" if result is None else result  # type: ignore

    # ==== Anthropic Action Space ====

    async def left_mouse_down(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
        """Left mouse down at coordinates (no-op if not implemented)."""
        await self._call_optional('left_mouse_down', x, y)

    async def left_mouse_up(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
        """Left mouse up at coordinates (no-op if not implemented)."""
        await self._call_optional('left_mouse_up', x, y)
@@ -0,0 +1,29 @@
1
+ """
2
+ Human-in-the-Loop Completion Tool
3
+
4
+ This package provides a human-in-the-loop completion system that allows
5
+ AI agents to request human assistance for complex decisions or responses.
6
+
7
+ Components:
8
+ - server.py: FastAPI server with completion queue management
9
+ - ui.py: Gradio UI for human interaction
10
+ - __main__.py: Combined server and UI application
11
+
12
+ Usage:
13
+ # Run the server and UI
14
+ python -m agent.human_tool
15
+
16
+ # Or run components separately
17
+ python -m agent.human_tool.server # API server only
18
+ python -m agent.human_tool.ui # UI only
19
+ """
20
+
21
+ from .server import CompletionQueue, completion_queue
22
+ from .ui import HumanCompletionUI, create_ui
23
+
24
+ __all__ = [
25
+ "CompletionQueue",
26
+ "completion_queue",
27
+ "HumanCompletionUI",
28
+ "create_ui"
29
+ ]
@@ -0,0 +1,38 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Human-in-the-Loop Completion Server and UI
4
+
5
+ This module combines the FastAPI server for handling completion requests
6
+ with a Gradio UI for human interaction.
7
+ """
8
+
9
+ import gradio as gr
10
+ from fastapi import FastAPI
11
+ from .server import app as fastapi_app
12
+ from .ui import create_ui
13
+
14
# URL prefix under which the Gradio UI is exposed on the FastAPI application.
CUSTOM_PATH = "/gradio"

# Build the Gradio interface, then attach it to the existing FastAPI app so a
# single ASGI application serves both the API and the UI.
gradio_demo = create_ui()
app = gr.mount_gradio_app(fastapi_app, gradio_demo, path=CUSTOM_PATH)
20
+
21
# Describe the service at the root URL (informational JSON, not an HTTP redirect).
@fastapi_app.get("/")
async def redirect_to_ui():
    """Return a JSON status payload pointing at the Gradio UI and the API docs.

    Despite the function name (kept so existing references keep working), no
    HTTP redirect is issued; clients receive the locations and decide
    themselves where to go.

    Returns:
        dict with the keys "message", "ui_url" and "api_docs".
    """
    return {
        "message": "Human Completion Server is running",
        # Reuse the mount path so this cannot drift from where the UI is served.
        "ui_url": CUSTOM_PATH,
        "api_docs": "/docs",
    }
30
+
31
if __name__ == "__main__":
    import uvicorn

    # Single source of truth: the printed URLs are derived from the same port
    # and mount path that are actually used below, so they cannot drift apart.
    port = 8002
    base_url = f"http://localhost:{port}"

    print("🚀 Starting Human-in-the-Loop Completion Server...")
    print(f"📊 API Server: {base_url}")
    print(f"🎨 Gradio UI: {base_url}{CUSTOM_PATH}")
    print(f"📚 API Docs: {base_url}/docs")

    # NOTE(review): "0.0.0.0" listens on every network interface although the
    # messages above advertise localhost -- confirm that exposing this server
    # beyond the local machine is intended.
    uvicorn.run(app, host="0.0.0.0", port=port)