cua-agent 0.4.22__py3-none-any.whl → 0.7.16__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cua-agent might be problematic. Click here for more details.
- agent/__init__.py +4 -10
- agent/__main__.py +2 -1
- agent/adapters/__init__.py +4 -0
- agent/adapters/azure_ml_adapter.py +283 -0
- agent/adapters/cua_adapter.py +161 -0
- agent/adapters/huggingfacelocal_adapter.py +67 -125
- agent/adapters/human_adapter.py +116 -114
- agent/adapters/mlxvlm_adapter.py +110 -99
- agent/adapters/models/__init__.py +41 -0
- agent/adapters/models/generic.py +78 -0
- agent/adapters/models/internvl.py +290 -0
- agent/adapters/models/opencua.py +115 -0
- agent/adapters/models/qwen2_5_vl.py +78 -0
- agent/agent.py +337 -185
- agent/callbacks/__init__.py +9 -4
- agent/callbacks/base.py +45 -31
- agent/callbacks/budget_manager.py +22 -10
- agent/callbacks/image_retention.py +54 -98
- agent/callbacks/logging.py +55 -42
- agent/callbacks/operator_validator.py +35 -33
- agent/callbacks/otel.py +291 -0
- agent/callbacks/pii_anonymization.py +19 -16
- agent/callbacks/prompt_instructions.py +47 -0
- agent/callbacks/telemetry.py +99 -61
- agent/callbacks/trajectory_saver.py +95 -69
- agent/cli.py +269 -119
- agent/computers/__init__.py +14 -9
- agent/computers/base.py +32 -19
- agent/computers/cua.py +52 -25
- agent/computers/custom.py +78 -71
- agent/decorators.py +23 -14
- agent/human_tool/__init__.py +2 -7
- agent/human_tool/__main__.py +6 -2
- agent/human_tool/server.py +48 -37
- agent/human_tool/ui.py +359 -235
- agent/integrations/hud/__init__.py +38 -99
- agent/integrations/hud/agent.py +369 -0
- agent/integrations/hud/proxy.py +166 -52
- agent/loops/__init__.py +44 -14
- agent/loops/anthropic.py +579 -492
- agent/loops/base.py +19 -15
- agent/loops/composed_grounded.py +136 -150
- agent/loops/fara/__init__.py +8 -0
- agent/loops/fara/config.py +506 -0
- agent/loops/fara/helpers.py +357 -0
- agent/loops/fara/schema.py +143 -0
- agent/loops/gelato.py +183 -0
- agent/loops/gemini.py +935 -0
- agent/loops/generic_vlm.py +601 -0
- agent/loops/glm45v.py +140 -135
- agent/loops/gta1.py +48 -51
- agent/loops/holo.py +218 -0
- agent/loops/internvl.py +180 -0
- agent/loops/moondream3.py +493 -0
- agent/loops/omniparser.py +326 -226
- agent/loops/openai.py +50 -51
- agent/loops/opencua.py +134 -0
- agent/loops/uiins.py +175 -0
- agent/loops/uitars.py +247 -206
- agent/loops/uitars2.py +951 -0
- agent/playground/__init__.py +5 -0
- agent/playground/server.py +301 -0
- agent/proxy/examples.py +61 -57
- agent/proxy/handlers.py +46 -39
- agent/responses.py +447 -347
- agent/tools/__init__.py +24 -0
- agent/tools/base.py +253 -0
- agent/tools/browser_tool.py +423 -0
- agent/types.py +11 -5
- agent/ui/__init__.py +1 -1
- agent/ui/__main__.py +1 -1
- agent/ui/gradio/app.py +25 -22
- agent/ui/gradio/ui_components.py +314 -167
- cua_agent-0.7.16.dist-info/METADATA +85 -0
- cua_agent-0.7.16.dist-info/RECORD +79 -0
- {cua_agent-0.4.22.dist-info → cua_agent-0.7.16.dist-info}/WHEEL +1 -1
- cua_agent-0.4.22.dist-info/METADATA +0 -436
- cua_agent-0.4.22.dist-info/RECORD +0 -51
- {cua_agent-0.4.22.dist-info → cua_agent-0.7.16.dist-info}/entry_points.txt +0 -0
agent/computers/base.py
CHANGED
|
@@ -2,69 +2,82 @@
|
|
|
2
2
|
Base computer interface protocol for agent interactions.
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
|
-
from typing import
|
|
5
|
+
from typing import (
|
|
6
|
+
Any,
|
|
7
|
+
Dict,
|
|
8
|
+
List,
|
|
9
|
+
Literal,
|
|
10
|
+
Optional,
|
|
11
|
+
Protocol,
|
|
12
|
+
Union,
|
|
13
|
+
runtime_checkable,
|
|
14
|
+
)
|
|
6
15
|
|
|
7
16
|
|
|
8
17
|
@runtime_checkable
|
|
9
18
|
class AsyncComputerHandler(Protocol):
|
|
10
19
|
"""Protocol defining the interface for computer interactions."""
|
|
11
|
-
|
|
12
|
-
# ==== Computer-Use-Preview Action Space ====
|
|
20
|
+
|
|
21
|
+
# ==== Computer-Use-Preview Action Space ====
|
|
13
22
|
|
|
14
23
|
async def get_environment(self) -> Literal["windows", "mac", "linux", "browser"]:
|
|
15
24
|
"""Get the current environment type."""
|
|
16
25
|
...
|
|
17
|
-
|
|
26
|
+
|
|
18
27
|
async def get_dimensions(self) -> tuple[int, int]:
|
|
19
28
|
"""Get screen dimensions as (width, height)."""
|
|
20
29
|
...
|
|
21
|
-
|
|
22
|
-
async def screenshot(self) -> str:
|
|
23
|
-
"""Take a screenshot and return as base64 string.
|
|
30
|
+
|
|
31
|
+
async def screenshot(self, text: Optional[str] = None) -> str:
|
|
32
|
+
"""Take a screenshot and return as base64 string.
|
|
33
|
+
|
|
34
|
+
Args:
|
|
35
|
+
text: Optional descriptive text (for compatibility with GPT-4o models, ignored)
|
|
36
|
+
"""
|
|
24
37
|
...
|
|
25
|
-
|
|
38
|
+
|
|
26
39
|
async def click(self, x: int, y: int, button: str = "left") -> None:
|
|
27
40
|
"""Click at coordinates with specified button."""
|
|
28
41
|
...
|
|
29
|
-
|
|
42
|
+
|
|
30
43
|
async def double_click(self, x: int, y: int) -> None:
|
|
31
44
|
"""Double click at coordinates."""
|
|
32
45
|
...
|
|
33
|
-
|
|
46
|
+
|
|
34
47
|
async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
|
|
35
48
|
"""Scroll at coordinates with specified scroll amounts."""
|
|
36
49
|
...
|
|
37
|
-
|
|
50
|
+
|
|
38
51
|
async def type(self, text: str) -> None:
|
|
39
52
|
"""Type text."""
|
|
40
53
|
...
|
|
41
|
-
|
|
54
|
+
|
|
42
55
|
async def wait(self, ms: int = 1000) -> None:
|
|
43
56
|
"""Wait for specified milliseconds."""
|
|
44
57
|
...
|
|
45
|
-
|
|
58
|
+
|
|
46
59
|
async def move(self, x: int, y: int) -> None:
|
|
47
60
|
"""Move cursor to coordinates."""
|
|
48
61
|
...
|
|
49
|
-
|
|
62
|
+
|
|
50
63
|
async def keypress(self, keys: Union[List[str], str]) -> None:
|
|
51
64
|
"""Press key combination."""
|
|
52
65
|
...
|
|
53
|
-
|
|
66
|
+
|
|
54
67
|
async def drag(self, path: List[Dict[str, int]]) -> None:
|
|
55
68
|
"""Drag along specified path."""
|
|
56
69
|
...
|
|
57
|
-
|
|
70
|
+
|
|
58
71
|
async def get_current_url(self) -> str:
|
|
59
72
|
"""Get current URL (for browser environments)."""
|
|
60
73
|
...
|
|
61
|
-
|
|
62
|
-
# ==== Anthropic Action Space ====
|
|
74
|
+
|
|
75
|
+
# ==== Anthropic Action Space ====
|
|
63
76
|
|
|
64
77
|
async def left_mouse_down(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
|
|
65
78
|
"""Left mouse down at coordinates."""
|
|
66
79
|
...
|
|
67
|
-
|
|
80
|
+
|
|
68
81
|
async def left_mouse_up(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
|
|
69
82
|
"""Left mouse up at coordinates."""
|
|
70
83
|
...
|
agent/computers/cua.py
CHANGED
|
@@ -3,24 +3,27 @@ Computer handler implementation for OpenAI computer-use-preview protocol.
|
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
5
|
import base64
|
|
6
|
-
from typing import Dict, List,
|
|
7
|
-
|
|
6
|
+
from typing import Any, Dict, List, Literal, Optional, Union
|
|
7
|
+
|
|
8
8
|
from computer import Computer
|
|
9
9
|
|
|
10
|
+
from .base import AsyncComputerHandler
|
|
11
|
+
|
|
12
|
+
|
|
10
13
|
class cuaComputerHandler(AsyncComputerHandler):
|
|
11
14
|
"""Computer handler that implements the Computer protocol using the computer interface."""
|
|
12
|
-
|
|
15
|
+
|
|
13
16
|
def __init__(self, cua_computer: Computer):
|
|
14
17
|
"""Initialize with a computer interface (from tool schema)."""
|
|
15
18
|
self.cua_computer = cua_computer
|
|
16
19
|
self.interface = None
|
|
17
20
|
|
|
18
21
|
async def _initialize(self):
|
|
19
|
-
if hasattr(self.cua_computer,
|
|
22
|
+
if hasattr(self.cua_computer, "_initialized") and not self.cua_computer._initialized:
|
|
20
23
|
await self.cua_computer.run()
|
|
21
24
|
self.interface = self.cua_computer.interface
|
|
22
|
-
|
|
23
|
-
# ==== Computer-Use-Preview Action Space ====
|
|
25
|
+
|
|
26
|
+
# ==== Computer-Use-Preview Action Space ====
|
|
24
27
|
|
|
25
28
|
async def get_environment(self) -> Literal["windows", "mac", "linux", "browser"]:
|
|
26
29
|
"""Get the current environment type."""
|
|
@@ -32,13 +35,17 @@ class cuaComputerHandler(AsyncComputerHandler):
|
|
|
32
35
|
assert self.interface is not None
|
|
33
36
|
screen_size = await self.interface.get_screen_size()
|
|
34
37
|
return screen_size["width"], screen_size["height"]
|
|
35
|
-
|
|
36
|
-
async def screenshot(self) -> str:
|
|
37
|
-
"""Take a screenshot and return as base64 string.
|
|
38
|
+
|
|
39
|
+
async def screenshot(self, text: Optional[str] = None) -> str:
|
|
40
|
+
"""Take a screenshot and return as base64 string.
|
|
41
|
+
|
|
42
|
+
Args:
|
|
43
|
+
text: Optional descriptive text (for compatibility with GPT-4o models, ignored)
|
|
44
|
+
"""
|
|
38
45
|
assert self.interface is not None
|
|
39
46
|
screenshot_bytes = await self.interface.screenshot()
|
|
40
|
-
return base64.b64encode(screenshot_bytes).decode(
|
|
41
|
-
|
|
47
|
+
return base64.b64encode(screenshot_bytes).decode("utf-8")
|
|
48
|
+
|
|
42
49
|
async def click(self, x: int, y: int, button: str = "left") -> None:
|
|
43
50
|
"""Click at coordinates with specified button."""
|
|
44
51
|
assert self.interface is not None
|
|
@@ -49,34 +56,35 @@ class cuaComputerHandler(AsyncComputerHandler):
|
|
|
49
56
|
else:
|
|
50
57
|
# Default to left click for unknown buttons
|
|
51
58
|
await self.interface.left_click(x, y)
|
|
52
|
-
|
|
59
|
+
|
|
53
60
|
async def double_click(self, x: int, y: int) -> None:
|
|
54
61
|
"""Double click at coordinates."""
|
|
55
62
|
assert self.interface is not None
|
|
56
63
|
await self.interface.double_click(x, y)
|
|
57
|
-
|
|
64
|
+
|
|
58
65
|
async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
|
|
59
66
|
"""Scroll at coordinates with specified scroll amounts."""
|
|
60
67
|
assert self.interface is not None
|
|
61
68
|
await self.interface.move_cursor(x, y)
|
|
62
69
|
await self.interface.scroll(scroll_x, scroll_y)
|
|
63
|
-
|
|
70
|
+
|
|
64
71
|
async def type(self, text: str) -> None:
|
|
65
72
|
"""Type text."""
|
|
66
73
|
assert self.interface is not None
|
|
67
74
|
await self.interface.type_text(text)
|
|
68
|
-
|
|
75
|
+
|
|
69
76
|
async def wait(self, ms: int = 1000) -> None:
|
|
70
77
|
"""Wait for specified milliseconds."""
|
|
71
78
|
assert self.interface is not None
|
|
72
79
|
import asyncio
|
|
80
|
+
|
|
73
81
|
await asyncio.sleep(ms / 1000.0)
|
|
74
|
-
|
|
82
|
+
|
|
75
83
|
async def move(self, x: int, y: int) -> None:
|
|
76
84
|
"""Move cursor to coordinates."""
|
|
77
85
|
assert self.interface is not None
|
|
78
86
|
await self.interface.move_cursor(x, y)
|
|
79
|
-
|
|
87
|
+
|
|
80
88
|
async def keypress(self, keys: Union[List[str], str]) -> None:
|
|
81
89
|
"""Press key combination."""
|
|
82
90
|
assert self.interface is not None
|
|
@@ -87,38 +95,57 @@ class cuaComputerHandler(AsyncComputerHandler):
|
|
|
87
95
|
else:
|
|
88
96
|
# Handle key combinations
|
|
89
97
|
await self.interface.hotkey(*keys)
|
|
90
|
-
|
|
98
|
+
|
|
91
99
|
async def drag(self, path: List[Dict[str, int]]) -> None:
|
|
92
100
|
"""Drag along specified path."""
|
|
93
101
|
assert self.interface is not None
|
|
94
102
|
if not path:
|
|
95
103
|
return
|
|
96
|
-
|
|
104
|
+
|
|
97
105
|
# Start drag from first point
|
|
98
106
|
start = path[0]
|
|
99
107
|
await self.interface.mouse_down(start["x"], start["y"])
|
|
100
|
-
|
|
108
|
+
|
|
101
109
|
# Move through path
|
|
102
110
|
for point in path[1:]:
|
|
103
111
|
await self.interface.move_cursor(point["x"], point["y"])
|
|
104
|
-
|
|
112
|
+
|
|
105
113
|
# End drag at last point
|
|
106
114
|
end = path[-1]
|
|
107
115
|
await self.interface.mouse_up(end["x"], end["y"])
|
|
108
|
-
|
|
116
|
+
|
|
109
117
|
async def get_current_url(self) -> str:
|
|
110
118
|
"""Get current URL (for browser environments)."""
|
|
111
119
|
# This would need to be implemented based on the specific browser interface
|
|
112
120
|
# For now, return empty string
|
|
113
121
|
return ""
|
|
114
122
|
|
|
115
|
-
# ==== Anthropic Computer Action Space ====
|
|
123
|
+
# ==== Anthropic Computer Action Space ====
|
|
116
124
|
async def left_mouse_down(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
|
|
117
125
|
"""Left mouse down at coordinates."""
|
|
118
126
|
assert self.interface is not None
|
|
119
127
|
await self.interface.mouse_down(x, y, button="left")
|
|
120
|
-
|
|
128
|
+
|
|
121
129
|
async def left_mouse_up(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
|
|
122
130
|
"""Left mouse up at coordinates."""
|
|
123
131
|
assert self.interface is not None
|
|
124
|
-
await self.interface.mouse_up(x, y, button="left")
|
|
132
|
+
await self.interface.mouse_up(x, y, button="left")
|
|
133
|
+
|
|
134
|
+
# ==== Browser Control Methods (via Playwright) ====
|
|
135
|
+
async def playwright_exec(
|
|
136
|
+
self, command: str, params: Optional[Dict[str, Any]] = None
|
|
137
|
+
) -> Dict[str, Any]:
|
|
138
|
+
"""Execute a Playwright browser command.
|
|
139
|
+
|
|
140
|
+
Supports: visit_url, click, type, scroll, web_search, screenshot,
|
|
141
|
+
get_current_url, go_back, go_forward
|
|
142
|
+
|
|
143
|
+
Args:
|
|
144
|
+
command: The browser command to execute
|
|
145
|
+
params: Command parameters
|
|
146
|
+
|
|
147
|
+
Returns:
|
|
148
|
+
Dict containing the command result
|
|
149
|
+
"""
|
|
150
|
+
assert self.interface is not None
|
|
151
|
+
return await self.interface.playwright_exec(command, params or {})
|
agent/computers/custom.py
CHANGED
|
@@ -3,47 +3,49 @@ Custom computer handler implementation that accepts a dictionary of functions.
|
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
5
|
import base64
|
|
6
|
-
from typing import Dict, List, Any, Literal, Union, Optional, Callable
|
|
7
|
-
from PIL import Image
|
|
8
6
|
import io
|
|
7
|
+
from typing import Any, Callable, Dict, List, Literal, Optional, Union
|
|
8
|
+
|
|
9
|
+
from PIL import Image
|
|
10
|
+
|
|
9
11
|
from .base import AsyncComputerHandler
|
|
10
12
|
|
|
11
13
|
|
|
12
14
|
class CustomComputerHandler(AsyncComputerHandler):
|
|
13
15
|
"""Computer handler that implements the Computer protocol using a dictionary of custom functions."""
|
|
14
|
-
|
|
16
|
+
|
|
15
17
|
def __init__(self, functions: Dict[str, Callable]):
|
|
16
18
|
"""
|
|
17
19
|
Initialize with a dictionary of functions.
|
|
18
|
-
|
|
20
|
+
|
|
19
21
|
Args:
|
|
20
22
|
functions: Dictionary where keys are method names and values are callable functions.
|
|
21
23
|
Only 'screenshot' is required, all others are optional.
|
|
22
|
-
|
|
24
|
+
|
|
23
25
|
Raises:
|
|
24
26
|
ValueError: If required 'screenshot' function is not provided.
|
|
25
27
|
"""
|
|
26
|
-
if
|
|
28
|
+
if "screenshot" not in functions:
|
|
27
29
|
raise ValueError("'screenshot' function is required in functions dictionary")
|
|
28
|
-
|
|
30
|
+
|
|
29
31
|
self.functions = functions
|
|
30
32
|
self._last_screenshot_size: Optional[tuple[int, int]] = None
|
|
31
|
-
|
|
33
|
+
|
|
32
34
|
async def _call_function(self, func, *args, **kwargs):
|
|
33
35
|
"""
|
|
34
36
|
Call a function, handling both async and sync functions.
|
|
35
|
-
|
|
37
|
+
|
|
36
38
|
Args:
|
|
37
39
|
func: The function to call
|
|
38
40
|
*args: Positional arguments to pass to the function
|
|
39
41
|
**kwargs: Keyword arguments to pass to the function
|
|
40
|
-
|
|
42
|
+
|
|
41
43
|
Returns:
|
|
42
44
|
The result of the function call
|
|
43
45
|
"""
|
|
44
46
|
import asyncio
|
|
45
47
|
import inspect
|
|
46
|
-
|
|
48
|
+
|
|
47
49
|
if callable(func):
|
|
48
50
|
if inspect.iscoroutinefunction(func):
|
|
49
51
|
return await func(*args, **kwargs)
|
|
@@ -51,14 +53,14 @@ class CustomComputerHandler(AsyncComputerHandler):
|
|
|
51
53
|
return func(*args, **kwargs)
|
|
52
54
|
else:
|
|
53
55
|
return func
|
|
54
|
-
|
|
56
|
+
|
|
55
57
|
async def _get_value(self, attribute: str):
|
|
56
58
|
"""
|
|
57
59
|
Get value for an attribute, checking both 'get_{attribute}' and '{attribute}' keys.
|
|
58
|
-
|
|
60
|
+
|
|
59
61
|
Args:
|
|
60
62
|
attribute: The attribute name to look for
|
|
61
|
-
|
|
63
|
+
|
|
62
64
|
Returns:
|
|
63
65
|
The value from the functions dict, called if callable, returned directly if not
|
|
64
66
|
"""
|
|
@@ -66,20 +68,20 @@ class CustomComputerHandler(AsyncComputerHandler):
|
|
|
66
68
|
get_key = f"get_{attribute}"
|
|
67
69
|
if get_key in self.functions:
|
|
68
70
|
return await self._call_function(self.functions[get_key])
|
|
69
|
-
|
|
70
|
-
# Check for '{attribute}'
|
|
71
|
+
|
|
72
|
+
# Check for '{attribute}'
|
|
71
73
|
if attribute in self.functions:
|
|
72
74
|
return await self._call_function(self.functions[attribute])
|
|
73
|
-
|
|
75
|
+
|
|
74
76
|
return None
|
|
75
|
-
|
|
77
|
+
|
|
76
78
|
def _to_b64_str(self, img: Union[bytes, Image.Image, str]) -> str:
|
|
77
79
|
"""
|
|
78
80
|
Convert image to base64 string.
|
|
79
|
-
|
|
81
|
+
|
|
80
82
|
Args:
|
|
81
83
|
img: Image as bytes, PIL Image, or base64 string
|
|
82
|
-
|
|
84
|
+
|
|
83
85
|
Returns:
|
|
84
86
|
str: Base64 encoded image string
|
|
85
87
|
"""
|
|
@@ -88,43 +90,47 @@ class CustomComputerHandler(AsyncComputerHandler):
|
|
|
88
90
|
return img
|
|
89
91
|
elif isinstance(img, bytes):
|
|
90
92
|
# Raw bytes
|
|
91
|
-
return base64.b64encode(img).decode(
|
|
93
|
+
return base64.b64encode(img).decode("utf-8")
|
|
92
94
|
elif isinstance(img, Image.Image):
|
|
93
95
|
# PIL Image
|
|
94
96
|
buffer = io.BytesIO()
|
|
95
|
-
img.save(buffer, format=
|
|
96
|
-
return base64.b64encode(buffer.getvalue()).decode(
|
|
97
|
+
img.save(buffer, format="PNG")
|
|
98
|
+
return base64.b64encode(buffer.getvalue()).decode("utf-8")
|
|
97
99
|
else:
|
|
98
100
|
raise ValueError(f"Unsupported image type: {type(img)}")
|
|
99
|
-
|
|
100
|
-
# ==== Computer-Use-Preview Action Space ====
|
|
101
|
+
|
|
102
|
+
# ==== Computer-Use-Preview Action Space ====
|
|
101
103
|
|
|
102
104
|
async def get_environment(self) -> Literal["windows", "mac", "linux", "browser"]:
|
|
103
105
|
"""Get the current environment type."""
|
|
104
|
-
result = await self._get_value(
|
|
106
|
+
result = await self._get_value("environment")
|
|
105
107
|
if result is None:
|
|
106
108
|
return "linux"
|
|
107
109
|
assert result in ["windows", "mac", "linux", "browser"]
|
|
108
|
-
return result
|
|
110
|
+
return result # type: ignore
|
|
109
111
|
|
|
110
112
|
async def get_dimensions(self) -> tuple[int, int]:
|
|
111
113
|
"""Get screen dimensions as (width, height)."""
|
|
112
|
-
result = await self._get_value(
|
|
114
|
+
result = await self._get_value("dimensions")
|
|
113
115
|
if result is not None:
|
|
114
|
-
return result
|
|
115
|
-
|
|
116
|
+
return result # type: ignore
|
|
117
|
+
|
|
116
118
|
# Fallback: use last screenshot size if available
|
|
117
119
|
if not self._last_screenshot_size:
|
|
118
120
|
await self.screenshot()
|
|
119
121
|
assert self._last_screenshot_size is not None, "Failed to get screenshot size"
|
|
120
|
-
|
|
122
|
+
|
|
121
123
|
return self._last_screenshot_size
|
|
122
|
-
|
|
123
|
-
async def screenshot(self) -> str:
|
|
124
|
-
"""Take a screenshot and return as base64 string.
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
124
|
+
|
|
125
|
+
async def screenshot(self, text: Optional[str] = None) -> str:
|
|
126
|
+
"""Take a screenshot and return as base64 string.
|
|
127
|
+
|
|
128
|
+
Args:
|
|
129
|
+
text: Optional descriptive text (for compatibility with GPT-4o models, ignored)
|
|
130
|
+
"""
|
|
131
|
+
result = await self._call_function(self.functions["screenshot"])
|
|
132
|
+
b64_str = self._to_b64_str(result) # type: ignore
|
|
133
|
+
|
|
128
134
|
# Try to extract dimensions for fallback use
|
|
129
135
|
try:
|
|
130
136
|
if isinstance(result, Image.Image):
|
|
@@ -136,74 +142,75 @@ class CustomComputerHandler(AsyncComputerHandler):
|
|
|
136
142
|
except Exception:
|
|
137
143
|
# If we can't get dimensions, that's okay
|
|
138
144
|
pass
|
|
139
|
-
|
|
145
|
+
|
|
140
146
|
return b64_str
|
|
141
|
-
|
|
147
|
+
|
|
142
148
|
async def click(self, x: int, y: int, button: str = "left") -> None:
|
|
143
149
|
"""Click at coordinates with specified button."""
|
|
144
|
-
if
|
|
145
|
-
await self._call_function(self.functions[
|
|
150
|
+
if "click" in self.functions:
|
|
151
|
+
await self._call_function(self.functions["click"], x, y, button)
|
|
146
152
|
# No-op if not implemented
|
|
147
|
-
|
|
153
|
+
|
|
148
154
|
async def double_click(self, x: int, y: int) -> None:
|
|
149
155
|
"""Double click at coordinates."""
|
|
150
|
-
if
|
|
151
|
-
await self._call_function(self.functions[
|
|
156
|
+
if "double_click" in self.functions:
|
|
157
|
+
await self._call_function(self.functions["double_click"], x, y)
|
|
152
158
|
# No-op if not implemented
|
|
153
|
-
|
|
159
|
+
|
|
154
160
|
async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
|
|
155
161
|
"""Scroll at coordinates with specified scroll amounts."""
|
|
156
|
-
if
|
|
157
|
-
await self._call_function(self.functions[
|
|
162
|
+
if "scroll" in self.functions:
|
|
163
|
+
await self._call_function(self.functions["scroll"], x, y, scroll_x, scroll_y)
|
|
158
164
|
# No-op if not implemented
|
|
159
|
-
|
|
165
|
+
|
|
160
166
|
async def type(self, text: str) -> None:
|
|
161
167
|
"""Type text."""
|
|
162
|
-
if
|
|
163
|
-
await self._call_function(self.functions[
|
|
168
|
+
if "type" in self.functions:
|
|
169
|
+
await self._call_function(self.functions["type"], text)
|
|
164
170
|
# No-op if not implemented
|
|
165
|
-
|
|
171
|
+
|
|
166
172
|
async def wait(self, ms: int = 1000) -> None:
|
|
167
173
|
"""Wait for specified milliseconds."""
|
|
168
|
-
if
|
|
169
|
-
await self._call_function(self.functions[
|
|
174
|
+
if "wait" in self.functions:
|
|
175
|
+
await self._call_function(self.functions["wait"], ms)
|
|
170
176
|
else:
|
|
171
177
|
# Default implementation
|
|
172
178
|
import asyncio
|
|
179
|
+
|
|
173
180
|
await asyncio.sleep(ms / 1000.0)
|
|
174
|
-
|
|
181
|
+
|
|
175
182
|
async def move(self, x: int, y: int) -> None:
|
|
176
183
|
"""Move cursor to coordinates."""
|
|
177
|
-
if
|
|
178
|
-
await self._call_function(self.functions[
|
|
184
|
+
if "move" in self.functions:
|
|
185
|
+
await self._call_function(self.functions["move"], x, y)
|
|
179
186
|
# No-op if not implemented
|
|
180
|
-
|
|
187
|
+
|
|
181
188
|
async def keypress(self, keys: Union[List[str], str]) -> None:
|
|
182
189
|
"""Press key combination."""
|
|
183
|
-
if
|
|
184
|
-
await self._call_function(self.functions[
|
|
190
|
+
if "keypress" in self.functions:
|
|
191
|
+
await self._call_function(self.functions["keypress"], keys)
|
|
185
192
|
# No-op if not implemented
|
|
186
|
-
|
|
193
|
+
|
|
187
194
|
async def drag(self, path: List[Dict[str, int]]) -> None:
|
|
188
195
|
"""Drag along specified path."""
|
|
189
|
-
if
|
|
190
|
-
await self._call_function(self.functions[
|
|
196
|
+
if "drag" in self.functions:
|
|
197
|
+
await self._call_function(self.functions["drag"], path)
|
|
191
198
|
# No-op if not implemented
|
|
192
|
-
|
|
199
|
+
|
|
193
200
|
async def get_current_url(self) -> str:
|
|
194
201
|
"""Get current URL (for browser environments)."""
|
|
195
|
-
if
|
|
196
|
-
return await self._get_value(
|
|
202
|
+
if "get_current_url" in self.functions:
|
|
203
|
+
return await self._get_value("current_url") # type: ignore
|
|
197
204
|
return "" # Default fallback
|
|
198
|
-
|
|
205
|
+
|
|
199
206
|
async def left_mouse_down(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
|
|
200
207
|
"""Left mouse down at coordinates."""
|
|
201
|
-
if
|
|
202
|
-
await self._call_function(self.functions[
|
|
208
|
+
if "left_mouse_down" in self.functions:
|
|
209
|
+
await self._call_function(self.functions["left_mouse_down"], x, y)
|
|
203
210
|
# No-op if not implemented
|
|
204
|
-
|
|
211
|
+
|
|
205
212
|
async def left_mouse_up(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
|
|
206
213
|
"""Left mouse up at coordinates."""
|
|
207
|
-
if
|
|
208
|
-
await self._call_function(self.functions[
|
|
214
|
+
if "left_mouse_up" in self.functions:
|
|
215
|
+
await self._call_function(self.functions["left_mouse_up"], x, y)
|
|
209
216
|
# No-op if not implemented
|
agent/decorators.py
CHANGED
|
@@ -3,47 +3,56 @@ Decorators for agent - agent_loop decorator
|
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
5
|
from typing import List, Optional
|
|
6
|
+
|
|
6
7
|
from .types import AgentConfigInfo
|
|
7
8
|
|
|
8
9
|
# Global registry
|
|
9
10
|
_agent_configs: List[AgentConfigInfo] = []
|
|
10
11
|
|
|
12
|
+
|
|
11
13
|
def register_agent(models: str, priority: int = 0):
|
|
12
14
|
"""
|
|
13
15
|
Decorator to register an AsyncAgentConfig class.
|
|
14
|
-
|
|
16
|
+
|
|
15
17
|
Args:
|
|
16
18
|
models: Regex pattern to match supported models
|
|
17
19
|
priority: Priority for agent selection (higher = more priority)
|
|
18
20
|
"""
|
|
21
|
+
|
|
19
22
|
def decorator(agent_class: type):
|
|
20
23
|
# Validate that the class implements AsyncAgentConfig protocol
|
|
21
|
-
if not hasattr(agent_class,
|
|
22
|
-
raise ValueError(
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
if not hasattr(agent_class,
|
|
26
|
-
raise ValueError(
|
|
27
|
-
|
|
24
|
+
if not hasattr(agent_class, "predict_step"):
|
|
25
|
+
raise ValueError(
|
|
26
|
+
f"Agent class {agent_class.__name__} must implement predict_step method"
|
|
27
|
+
)
|
|
28
|
+
if not hasattr(agent_class, "predict_click"):
|
|
29
|
+
raise ValueError(
|
|
30
|
+
f"Agent class {agent_class.__name__} must implement predict_click method"
|
|
31
|
+
)
|
|
32
|
+
if not hasattr(agent_class, "get_capabilities"):
|
|
33
|
+
raise ValueError(
|
|
34
|
+
f"Agent class {agent_class.__name__} must implement get_capabilities method"
|
|
35
|
+
)
|
|
36
|
+
|
|
28
37
|
# Register the agent config
|
|
29
38
|
config_info = AgentConfigInfo(
|
|
30
|
-
agent_class=agent_class,
|
|
31
|
-
models_regex=models,
|
|
32
|
-
priority=priority
|
|
39
|
+
agent_class=agent_class, models_regex=models, priority=priority
|
|
33
40
|
)
|
|
34
41
|
_agent_configs.append(config_info)
|
|
35
|
-
|
|
42
|
+
|
|
36
43
|
# Sort by priority (highest first)
|
|
37
44
|
_agent_configs.sort(key=lambda x: x.priority, reverse=True)
|
|
38
|
-
|
|
45
|
+
|
|
39
46
|
return agent_class
|
|
40
|
-
|
|
47
|
+
|
|
41
48
|
return decorator
|
|
42
49
|
|
|
50
|
+
|
|
43
51
|
def get_agent_configs() -> List[AgentConfigInfo]:
|
|
44
52
|
"""Get all registered agent configs"""
|
|
45
53
|
return _agent_configs.copy()
|
|
46
54
|
|
|
55
|
+
|
|
47
56
|
def find_agent_config(model: str) -> Optional[AgentConfigInfo]:
|
|
48
57
|
"""Find the best matching agent config for a model"""
|
|
49
58
|
for config_info in _agent_configs:
|
agent/human_tool/__init__.py
CHANGED
|
@@ -12,7 +12,7 @@ Components:
|
|
|
12
12
|
Usage:
|
|
13
13
|
# Run the server and UI
|
|
14
14
|
python -m agent.human_tool
|
|
15
|
-
|
|
15
|
+
|
|
16
16
|
# Or run components separately
|
|
17
17
|
python -m agent.human_tool.server # API server only
|
|
18
18
|
python -m agent.human_tool.ui # UI only
|
|
@@ -21,9 +21,4 @@ Usage:
|
|
|
21
21
|
from .server import CompletionQueue, completion_queue
|
|
22
22
|
from .ui import HumanCompletionUI, create_ui
|
|
23
23
|
|
|
24
|
-
__all__ = [
|
|
25
|
-
"CompletionQueue",
|
|
26
|
-
"completion_queue",
|
|
27
|
-
"HumanCompletionUI",
|
|
28
|
-
"create_ui"
|
|
29
|
-
]
|
|
24
|
+
__all__ = ["CompletionQueue", "completion_queue", "HumanCompletionUI", "create_ui"]
|
agent/human_tool/__main__.py
CHANGED
|
@@ -8,6 +8,7 @@ with a Gradio UI for human interaction.
|
|
|
8
8
|
|
|
9
9
|
import gradio as gr
|
|
10
10
|
from fastapi import FastAPI
|
|
11
|
+
|
|
11
12
|
from .server import app as fastapi_app
|
|
12
13
|
from .ui import create_ui
|
|
13
14
|
|
|
@@ -18,6 +19,7 @@ gradio_demo = create_ui()
|
|
|
18
19
|
CUSTOM_PATH = "/gradio"
|
|
19
20
|
app = gr.mount_gradio_app(fastapi_app, gradio_demo, path=CUSTOM_PATH)
|
|
20
21
|
|
|
22
|
+
|
|
21
23
|
# Add a redirect from root to Gradio UI
|
|
22
24
|
@fastapi_app.get("/")
|
|
23
25
|
async def redirect_to_ui():
|
|
@@ -25,14 +27,16 @@ async def redirect_to_ui():
|
|
|
25
27
|
return {
|
|
26
28
|
"message": "Human Completion Server is running",
|
|
27
29
|
"ui_url": "/gradio",
|
|
28
|
-
"api_docs": "/docs"
|
|
30
|
+
"api_docs": "/docs",
|
|
29
31
|
}
|
|
30
32
|
|
|
33
|
+
|
|
31
34
|
if __name__ == "__main__":
|
|
32
35
|
import uvicorn
|
|
36
|
+
|
|
33
37
|
print("🚀 Starting Human-in-the-Loop Completion Server...")
|
|
34
38
|
print("📊 API Server: http://localhost:8002")
|
|
35
39
|
print("🎨 Gradio UI: http://localhost:8002/gradio")
|
|
36
40
|
print("📚 API Docs: http://localhost:8002/docs")
|
|
37
|
-
|
|
41
|
+
|
|
38
42
|
uvicorn.run(app, host="0.0.0.0", port=8002)
|