cua-agent 0.3.1__py3-none-any.whl → 0.4.0b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic. Click here for more details.

Files changed (111) hide show
  1. agent/__init__.py +15 -51
  2. agent/__main__.py +21 -0
  3. agent/adapters/__init__.py +9 -0
  4. agent/adapters/huggingfacelocal_adapter.py +216 -0
  5. agent/agent.py +577 -0
  6. agent/callbacks/__init__.py +17 -0
  7. agent/callbacks/base.py +153 -0
  8. agent/callbacks/budget_manager.py +44 -0
  9. agent/callbacks/image_retention.py +139 -0
  10. agent/callbacks/logging.py +247 -0
  11. agent/callbacks/pii_anonymization.py +259 -0
  12. agent/callbacks/trajectory_saver.py +305 -0
  13. agent/cli.py +290 -0
  14. agent/computer_handler.py +107 -0
  15. agent/decorators.py +90 -0
  16. agent/loops/__init__.py +11 -0
  17. agent/loops/anthropic.py +728 -0
  18. agent/loops/omniparser.py +339 -0
  19. agent/loops/openai.py +95 -0
  20. agent/loops/uitars.py +688 -0
  21. agent/responses.py +207 -0
  22. agent/types.py +79 -0
  23. agent/ui/__init__.py +7 -1
  24. agent/ui/gradio/__init__.py +6 -19
  25. agent/ui/gradio/app.py +80 -1299
  26. agent/ui/gradio/ui_components.py +703 -0
  27. cua_agent-0.4.0b1.dist-info/METADATA +424 -0
  28. cua_agent-0.4.0b1.dist-info/RECORD +30 -0
  29. {cua_agent-0.3.1.dist-info → cua_agent-0.4.0b1.dist-info}/WHEEL +1 -1
  30. agent/core/__init__.py +0 -27
  31. agent/core/agent.py +0 -210
  32. agent/core/base.py +0 -217
  33. agent/core/callbacks.py +0 -200
  34. agent/core/experiment.py +0 -249
  35. agent/core/factory.py +0 -122
  36. agent/core/messages.py +0 -332
  37. agent/core/provider_config.py +0 -21
  38. agent/core/telemetry.py +0 -142
  39. agent/core/tools/__init__.py +0 -21
  40. agent/core/tools/base.py +0 -74
  41. agent/core/tools/bash.py +0 -52
  42. agent/core/tools/collection.py +0 -46
  43. agent/core/tools/computer.py +0 -113
  44. agent/core/tools/edit.py +0 -67
  45. agent/core/tools/manager.py +0 -56
  46. agent/core/tools.py +0 -32
  47. agent/core/types.py +0 -88
  48. agent/core/visualization.py +0 -197
  49. agent/providers/__init__.py +0 -4
  50. agent/providers/anthropic/__init__.py +0 -6
  51. agent/providers/anthropic/api/client.py +0 -360
  52. agent/providers/anthropic/api/logging.py +0 -150
  53. agent/providers/anthropic/api_handler.py +0 -140
  54. agent/providers/anthropic/callbacks/__init__.py +0 -5
  55. agent/providers/anthropic/callbacks/manager.py +0 -65
  56. agent/providers/anthropic/loop.py +0 -568
  57. agent/providers/anthropic/prompts.py +0 -23
  58. agent/providers/anthropic/response_handler.py +0 -226
  59. agent/providers/anthropic/tools/__init__.py +0 -33
  60. agent/providers/anthropic/tools/base.py +0 -88
  61. agent/providers/anthropic/tools/bash.py +0 -66
  62. agent/providers/anthropic/tools/collection.py +0 -34
  63. agent/providers/anthropic/tools/computer.py +0 -396
  64. agent/providers/anthropic/tools/edit.py +0 -326
  65. agent/providers/anthropic/tools/manager.py +0 -54
  66. agent/providers/anthropic/tools/run.py +0 -42
  67. agent/providers/anthropic/types.py +0 -16
  68. agent/providers/anthropic/utils.py +0 -367
  69. agent/providers/omni/__init__.py +0 -8
  70. agent/providers/omni/api_handler.py +0 -42
  71. agent/providers/omni/clients/anthropic.py +0 -103
  72. agent/providers/omni/clients/base.py +0 -35
  73. agent/providers/omni/clients/oaicompat.py +0 -195
  74. agent/providers/omni/clients/ollama.py +0 -122
  75. agent/providers/omni/clients/openai.py +0 -155
  76. agent/providers/omni/clients/utils.py +0 -25
  77. agent/providers/omni/image_utils.py +0 -34
  78. agent/providers/omni/loop.py +0 -990
  79. agent/providers/omni/parser.py +0 -307
  80. agent/providers/omni/prompts.py +0 -64
  81. agent/providers/omni/tools/__init__.py +0 -30
  82. agent/providers/omni/tools/base.py +0 -29
  83. agent/providers/omni/tools/bash.py +0 -74
  84. agent/providers/omni/tools/computer.py +0 -179
  85. agent/providers/omni/tools/manager.py +0 -61
  86. agent/providers/omni/utils.py +0 -236
  87. agent/providers/openai/__init__.py +0 -6
  88. agent/providers/openai/api_handler.py +0 -456
  89. agent/providers/openai/loop.py +0 -472
  90. agent/providers/openai/response_handler.py +0 -205
  91. agent/providers/openai/tools/__init__.py +0 -15
  92. agent/providers/openai/tools/base.py +0 -79
  93. agent/providers/openai/tools/computer.py +0 -326
  94. agent/providers/openai/tools/manager.py +0 -106
  95. agent/providers/openai/types.py +0 -36
  96. agent/providers/openai/utils.py +0 -98
  97. agent/providers/uitars/__init__.py +0 -1
  98. agent/providers/uitars/clients/base.py +0 -35
  99. agent/providers/uitars/clients/mlxvlm.py +0 -263
  100. agent/providers/uitars/clients/oaicompat.py +0 -214
  101. agent/providers/uitars/loop.py +0 -660
  102. agent/providers/uitars/prompts.py +0 -63
  103. agent/providers/uitars/tools/__init__.py +0 -1
  104. agent/providers/uitars/tools/computer.py +0 -283
  105. agent/providers/uitars/tools/manager.py +0 -60
  106. agent/providers/uitars/utils.py +0 -264
  107. agent/telemetry.py +0 -21
  108. agent/ui/__main__.py +0 -15
  109. cua_agent-0.3.1.dist-info/METADATA +0 -295
  110. cua_agent-0.3.1.dist-info/RECORD +0 -87
  111. {cua_agent-0.3.1.dist-info → cua_agent-0.4.0b1.dist-info}/entry_points.txt +0 -0
agent/responses.py ADDED
@@ -0,0 +1,207 @@
1
+ """
2
+ Functions for making various Responses API items from different types of responses.
3
+ Based on the OpenAI spec for Responses API items.
4
+ """
5
+
6
+ import base64
7
+ import json
8
+ import uuid
9
+ from typing import List, Dict, Any, Literal, Union, Optional
10
+
11
+ from openai.types.responses.response_computer_tool_call_param import (
12
+ ResponseComputerToolCallParam,
13
+ ActionClick,
14
+ ActionDoubleClick,
15
+ ActionDrag,
16
+ ActionDragPath,
17
+ ActionKeypress,
18
+ ActionMove,
19
+ ActionScreenshot,
20
+ ActionScroll,
21
+ ActionType as ActionTypeAction,
22
+ ActionWait,
23
+ PendingSafetyCheck
24
+ )
25
+
26
+ from openai.types.responses.response_function_tool_call_param import ResponseFunctionToolCallParam
27
+ from openai.types.responses.response_output_text_param import ResponseOutputTextParam
28
+ from openai.types.responses.response_reasoning_item_param import ResponseReasoningItemParam, Summary
29
+ from openai.types.responses.response_output_message_param import ResponseOutputMessageParam
30
+ from openai.types.responses.easy_input_message_param import EasyInputMessageParam
31
+ from openai.types.responses.response_input_image_param import ResponseInputImageParam
32
+
33
def random_id():
    """Return a newly generated random (version 4) UUID rendered as a string."""
    fresh_uuid = uuid.uuid4()
    return str(fresh_uuid)
35
+
36
+ # User message items
37
def make_input_image_item(image_data: Union[str, bytes]) -> EasyInputMessageParam:
    """Build a user message item that carries a single image.

    Args:
        image_data: The image, given as raw bytes (base64-encoded here), as an
            already base64-encoded string, or as a complete ``data:`` URL.

    Returns:
        An ``EasyInputMessageParam`` with role ``"user"`` whose content is one
        ``input_image`` part.
    """
    if isinstance(image_data, bytes):
        image_data = base64.b64encode(image_data).decode("utf-8")

    # A base64 payload can never contain ":", so a "data:" prefix means the
    # caller already passed a full data URL; prefixing it again would yield an
    # invalid "data:image/png;base64,data:..." URL.
    if image_data.startswith("data:"):
        image_url = image_data
    else:
        image_url = f"data:image/png;base64,{image_data}"

    return EasyInputMessageParam(
        content=[
            ResponseInputImageParam(
                type="input_image",
                image_url=image_url,
            )
        ],
        role="user",
        type="message",
    )
48
+
49
+ # Text items
50
def make_reasoning_item(reasoning: str) -> ResponseReasoningItemParam:
    """Wrap a reasoning string in a ``reasoning`` item.

    The text becomes the only ``summary_text`` entry of the item's summary,
    and the item receives a freshly generated id.
    """
    summary_entry = Summary(text=reasoning, type="summary_text")
    return ResponseReasoningItemParam(
        id=random_id(),
        summary=[summary_entry],
        type="reasoning",
    )
58
+
59
def make_output_text_item(content: str) -> ResponseOutputMessageParam:
    """Wrap plain text in a completed assistant ``message`` item.

    The text is placed in a single ``output_text`` part with an empty
    annotations list, and the message receives a freshly generated id.
    """
    text_part = ResponseOutputTextParam(
        text=content,
        type="output_text",
        annotations=[],
    )
    return ResponseOutputMessageParam(
        id=random_id(),
        content=[text_part],
        role="assistant",
        status="completed",
        type="message",
    )
73
+
74
+ # Function call items
75
def make_function_call_item(function_name: str, arguments: Dict[str, Any], call_id: Optional[str] = None) -> ResponseFunctionToolCallParam:
    """Build a completed ``function_call`` item.

    Args:
        function_name: Name of the function being called.
        arguments: Call arguments; serialized to a JSON string.
        call_id: Call identifier to reuse; a random one is generated when it
            is missing or empty.
    """
    return ResponseFunctionToolCallParam(
        id=random_id(),
        call_id=call_id or random_id(),
        name=function_name,
        arguments=json.dumps(arguments),
        status="completed",
        type="function_call",
    )
84
+
85
+ # Computer tool call items
86
def make_click_item(x: int, y: int, button: Literal["left", "right", "wheel", "back", "forward"] = "left", call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
    """Build a completed ``computer_call`` item for a click.

    Args:
        x: Horizontal coordinate of the click.
        y: Vertical coordinate of the click.
        button: Which button to click with; defaults to ``"left"``.
        call_id: Call identifier to reuse; a random one is generated when it
            is missing or empty.
    """
    click_action = ActionClick(button=button, type="click", x=x, y=y)
    return ResponseComputerToolCallParam(
        id=random_id(),
        call_id=call_id or random_id(),
        action=click_action,
        pending_safety_checks=[],
        status="completed",
        type="computer_call",
    )
100
+
101
def make_double_click_item(x: int, y: int, call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
    """Build a completed ``computer_call`` item for a double click at (x, y).

    A random call id is generated when ``call_id`` is missing or empty.
    """
    double_click_action = ActionDoubleClick(type="double_click", x=x, y=y)
    return ResponseComputerToolCallParam(
        id=random_id(),
        call_id=call_id or random_id(),
        action=double_click_action,
        pending_safety_checks=[],
        status="completed",
        type="computer_call",
    )
114
+
115
def make_drag_item(path: List[Dict[str, int]], call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
    """Build a completed ``computer_call`` item for a drag along ``path``.

    Args:
        path: Points to drag through, in order; each point is a mapping that
            must provide ``"x"`` and ``"y"`` entries.
        call_id: Call identifier to reuse; a random one is generated when it
            is missing or empty.
    """
    waypoints = [
        ActionDragPath(x=waypoint["x"], y=waypoint["y"])
        for waypoint in path
    ]
    drag_action = ActionDrag(path=waypoints, type="drag")
    return ResponseComputerToolCallParam(
        id=random_id(),
        call_id=call_id or random_id(),
        action=drag_action,
        pending_safety_checks=[],
        status="completed",
        type="computer_call",
    )
128
+
129
def make_keypress_item(keys: List[str], call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
    """Build a completed ``computer_call`` item for pressing ``keys``.

    A random call id is generated when ``call_id`` is missing or empty.
    """
    keypress_action = ActionKeypress(keys=keys, type="keypress")
    return ResponseComputerToolCallParam(
        id=random_id(),
        call_id=call_id or random_id(),
        action=keypress_action,
        pending_safety_checks=[],
        status="completed",
        type="computer_call",
    )
141
+
142
def make_move_item(x: int, y: int, call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
    """Build a completed ``computer_call`` item for a move to (x, y).

    A random call id is generated when ``call_id`` is missing or empty.
    """
    move_action = ActionMove(type="move", x=x, y=y)
    return ResponseComputerToolCallParam(
        id=random_id(),
        call_id=call_id or random_id(),
        action=move_action,
        pending_safety_checks=[],
        status="completed",
        type="computer_call",
    )
155
+
156
def make_screenshot_item(call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
    """Build a completed ``computer_call`` item for a screenshot action.

    A random call id is generated when ``call_id`` is missing or empty.
    """
    screenshot_action = ActionScreenshot(type="screenshot")
    return ResponseComputerToolCallParam(
        id=random_id(),
        call_id=call_id or random_id(),
        action=screenshot_action,
        pending_safety_checks=[],
        status="completed",
        type="computer_call",
    )
167
+
168
def make_scroll_item(x: int, y: int, scroll_x: int, scroll_y: int, call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
    """Build a completed ``computer_call`` item for a scroll.

    Args:
        x: Horizontal coordinate at which to scroll.
        y: Vertical coordinate at which to scroll.
        scroll_x: Horizontal scroll amount.
        scroll_y: Vertical scroll amount.
        call_id: Call identifier to reuse; a random one is generated when it
            is missing or empty.
    """
    scroll_action = ActionScroll(
        scroll_x=scroll_x,
        scroll_y=scroll_y,
        type="scroll",
        x=x,
        y=y,
    )
    return ResponseComputerToolCallParam(
        id=random_id(),
        call_id=call_id or random_id(),
        action=scroll_action,
        pending_safety_checks=[],
        status="completed",
        type="computer_call",
    )
183
+
184
def make_type_item(text: str, call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
    """Build a completed ``computer_call`` item for typing ``text``.

    A random call id is generated when ``call_id`` is missing or empty.
    """
    type_action = ActionTypeAction(text=text, type="type")
    return ResponseComputerToolCallParam(
        id=random_id(),
        call_id=call_id or random_id(),
        action=type_action,
        pending_safety_checks=[],
        status="completed",
        type="computer_call",
    )
196
+
197
def make_wait_item(call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
    """Build a completed ``computer_call`` item for a wait action.

    A random call id is generated when ``call_id`` is missing or empty.
    """
    wait_action = ActionWait(type="wait")
    return ResponseComputerToolCallParam(
        id=random_id(),
        call_id=call_id or random_id(),
        action=wait_action,
        pending_safety_checks=[],
        status="completed",
        type="computer_call",
    )
agent/types.py ADDED
@@ -0,0 +1,79 @@
1
+ """
2
+ Type definitions for agent
3
+ """
4
+
5
+ from typing import Dict, List, Any, Optional, Callable, Protocol, Literal
6
+ from pydantic import BaseModel
7
+ import re
8
+ from litellm import ResponseInputParam, ResponsesAPIResponse, ToolParam
9
+ from collections.abc import Iterable
10
+
11
# Agent input types
# Messages: either a plain string or a ResponseInputParam (imported from litellm).
Messages = str | ResponseInputParam
# Tools: an iterable of ToolParam entries, or None.
Tools = Optional[Iterable[ToolParam]]

# Agent output types
# AgentResponse: alias for litellm's ResponsesAPIResponse.
AgentResponse = ResponsesAPIResponse
17
+
18
+ # Agent loop registration
19
class AgentLoopInfo(BaseModel):
    """Record describing one registered agent loop."""

    # Callable registered for this loop.
    func: Callable
    # Regular expression that model names are tested against.
    models_regex: str
    # Priority value of this loop; defaults to 0.
    priority: int = 0

    def matches_model(self, model: str) -> bool:
        """Return True when ``models_regex`` matches at the start of ``model``."""
        # re.match only anchors at the beginning of the string, so the pattern
        # need not cover the whole model name.
        return re.match(self.models_regex, model) is not None
28
+
29
+ # Computer tool interface
30
class Computer(Protocol):
    """Protocol defining the interface for computer interactions.

    This is a structural interface: every method is an ``async`` stub, and an
    implementation is expected to provide coroutines with these signatures.
    """

    async def get_environment(self) -> Literal["windows", "mac", "linux", "browser"]:
        """Get the current environment type.

        Returns:
            One of ``"windows"``, ``"mac"``, ``"linux"`` or ``"browser"``.
        """
        ...

    async def get_dimensions(self) -> tuple[int, int]:
        """Get screen dimensions as (width, height)."""
        ...

    async def screenshot(self) -> str:
        """Take a screenshot and return it as a base64 string."""
        ...

    async def click(self, x: int, y: int, button: str = "left") -> None:
        """Click at coordinates (x, y) with the specified button.

        Args:
            x: Horizontal coordinate of the click.
            y: Vertical coordinate of the click.
            button: Button to click with; defaults to ``"left"``.
        """
        ...

    async def double_click(self, x: int, y: int) -> None:
        """Double click at coordinates (x, y)."""
        ...

    async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
        """Scroll at coordinates (x, y) with the specified scroll amounts.

        Args:
            x: Horizontal coordinate at which to scroll.
            y: Vertical coordinate at which to scroll.
            scroll_x: Horizontal scroll amount.
            scroll_y: Vertical scroll amount.
        """
        ...

    async def type(self, text: str) -> None:
        """Type the given text."""
        ...

    async def wait(self, ms: int = 1000) -> None:
        """Wait for the specified number of milliseconds (default 1000)."""
        ...

    async def move(self, x: int, y: int) -> None:
        """Move the cursor to coordinates (x, y)."""
        ...

    async def keypress(self, keys: List[str]) -> None:
        """Press a key combination given as a list of key names."""
        ...

    async def drag(self, path: List[Dict[str, int]]) -> None:
        """Drag along the specified path.

        Args:
            path: Points to drag through, each a mapping of string keys to
                integers (presumably ``"x"`` and ``"y"`` — confirm against
                implementations).
        """
        ...

    async def get_current_url(self) -> str:
        """Get the current URL (for browser environments)."""
        ...
agent/ui/__init__.py CHANGED
@@ -1 +1,7 @@
1
- """UI modules for the Computer-Use Agent."""
1
+ """
2
+ UI components for agent
3
+ """
4
+
5
+ from .gradio import test_cua, create_gradio_ui
6
+
7
+ __all__ = ["test_cua", "create_gradio_ui"]
agent/ui/gradio/__init__.py CHANGED
@@ -1,21 +1,8 @@
1
- """Gradio UI for Computer-Use Agent."""
1
+ """
2
+ Gradio UI for agent
3
+ """
2
4
 
3
- import gradio as gr
4
- from typing import Optional
5
+ from .app import test_cua
6
+ from .ui_components import create_gradio_ui
5
7
 
6
- from .app import create_gradio_ui
7
-
8
-
9
- def registry(name: str = "cua:gpt-4o") -> gr.Blocks:
10
- """Create and register a Gradio UI for the Computer-Use Agent.
11
-
12
- Args:
13
- name: The name to use for the Gradio app, in format 'provider:model'
14
-
15
- Returns:
16
- A Gradio Blocks application
17
- """
18
- provider, model = name.split(":", 1) if ":" in name else ("openai", name)
19
-
20
- # Create and return the Gradio UI
21
- return create_gradio_ui(provider_name=provider, model_name=model)
8
+ __all__ = ["test_cua", "create_gradio_ui"]