cua-agent 0.1.6__tar.gz → 0.1.18__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cua-agent might be problematic. Click here for more details.
- cua_agent-0.1.18/PKG-INFO +165 -0
- cua_agent-0.1.18/README.md +116 -0
- {cua_agent-0.1.6 → cua_agent-0.1.18}/agent/__init__.py +3 -2
- {cua_agent-0.1.6 → cua_agent-0.1.18}/agent/core/__init__.py +1 -6
- cua_agent-0.1.6/agent/core/computer_agent.py → cua_agent-0.1.18/agent/core/agent.py +31 -76
- cua_agent-0.1.6/agent/core/loop.py → cua_agent-0.1.18/agent/core/base.py +68 -127
- cua_agent-0.1.18/agent/core/factory.py +104 -0
- cua_agent-0.1.18/agent/core/messages.py +399 -0
- cua_agent-0.1.18/agent/core/provider_config.py +15 -0
- cua_agent-0.1.18/agent/core/types.py +45 -0
- cua_agent-0.1.18/agent/core/visualization.py +197 -0
- {cua_agent-0.1.6 → cua_agent-0.1.18}/agent/providers/anthropic/api/client.py +142 -1
- cua_agent-0.1.18/agent/providers/anthropic/api_handler.py +140 -0
- cua_agent-0.1.18/agent/providers/anthropic/callbacks/__init__.py +5 -0
- {cua_agent-0.1.6 → cua_agent-0.1.18}/agent/providers/anthropic/loop.py +207 -221
- cua_agent-0.1.18/agent/providers/anthropic/response_handler.py +226 -0
- cua_agent-0.1.18/agent/providers/anthropic/tools/bash.py +66 -0
- cua_agent-0.1.18/agent/providers/anthropic/utils.py +368 -0
- cua_agent-0.1.18/agent/providers/omni/__init__.py +8 -0
- cua_agent-0.1.18/agent/providers/omni/api_handler.py +42 -0
- {cua_agent-0.1.6 → cua_agent-0.1.18}/agent/providers/omni/clients/anthropic.py +4 -0
- cua_agent-0.1.18/agent/providers/omni/image_utils.py +34 -0
- cua_agent-0.1.18/agent/providers/omni/loop.py +855 -0
- {cua_agent-0.1.6 → cua_agent-0.1.18}/agent/providers/omni/parser.py +58 -4
- cua_agent-0.1.18/agent/providers/omni/tools/__init__.py +30 -0
- cua_agent-0.1.18/agent/providers/omni/tools/base.py +29 -0
- cua_agent-0.1.18/agent/providers/omni/tools/bash.py +74 -0
- cua_agent-0.1.18/agent/providers/omni/tools/computer.py +179 -0
- cua_agent-0.1.18/agent/providers/omni/tools/manager.py +61 -0
- {cua_agent-0.1.6 → cua_agent-0.1.18}/agent/providers/omni/types.py +1 -3
- cua_agent-0.1.18/agent/providers/omni/utils.py +236 -0
- cua_agent-0.1.18/agent/providers/openai/__init__.py +6 -0
- cua_agent-0.1.18/agent/providers/openai/api_handler.py +453 -0
- cua_agent-0.1.18/agent/providers/openai/loop.py +440 -0
- cua_agent-0.1.18/agent/providers/openai/response_handler.py +205 -0
- cua_agent-0.1.18/agent/providers/openai/tools/__init__.py +15 -0
- cua_agent-0.1.18/agent/providers/openai/tools/base.py +79 -0
- cua_agent-0.1.18/agent/providers/openai/tools/computer.py +319 -0
- cua_agent-0.1.18/agent/providers/openai/tools/manager.py +106 -0
- cua_agent-0.1.18/agent/providers/openai/types.py +36 -0
- cua_agent-0.1.18/agent/providers/openai/utils.py +98 -0
- {cua_agent-0.1.6 → cua_agent-0.1.18}/pyproject.toml +7 -3
- cua_agent-0.1.6/PKG-INFO +0 -120
- cua_agent-0.1.6/README.md +0 -74
- cua_agent-0.1.6/agent/README.md +0 -63
- cua_agent-0.1.6/agent/core/messages.py +0 -245
- cua_agent-0.1.6/agent/providers/anthropic/messages/manager.py +0 -112
- cua_agent-0.1.6/agent/providers/anthropic/tools/bash.py +0 -163
- cua_agent-0.1.6/agent/providers/omni/__init__.py +0 -27
- cua_agent-0.1.6/agent/providers/omni/callbacks.py +0 -78
- cua_agent-0.1.6/agent/providers/omni/clients/groq.py +0 -101
- cua_agent-0.1.6/agent/providers/omni/experiment.py +0 -276
- cua_agent-0.1.6/agent/providers/omni/image_utils.py +0 -106
- cua_agent-0.1.6/agent/providers/omni/loop.py +0 -971
- cua_agent-0.1.6/agent/providers/omni/messages.py +0 -171
- cua_agent-0.1.6/agent/providers/omni/tool_manager.py +0 -91
- cua_agent-0.1.6/agent/providers/omni/tools/__init__.py +0 -12
- cua_agent-0.1.6/agent/providers/omni/tools/bash.py +0 -69
- cua_agent-0.1.6/agent/providers/omni/tools/computer.py +0 -217
- cua_agent-0.1.6/agent/providers/omni/tools/manager.py +0 -81
- cua_agent-0.1.6/agent/providers/omni/utils.py +0 -157
- cua_agent-0.1.6/agent/providers/omni/visualization.py +0 -130
- cua_agent-0.1.6/agent/types/__init__.py +0 -23
- cua_agent-0.1.6/agent/types/base.py +0 -41
- cua_agent-0.1.6/agent/types/messages.py +0 -36
- cua_agent-0.1.6/tests/test_agent.py +0 -91
- {cua_agent-0.1.6 → cua_agent-0.1.18}/agent/core/README.md +0 -0
- {cua_agent-0.1.6 → cua_agent-0.1.18}/agent/core/callbacks.py +0 -0
- {cua_agent-0.1.6 → cua_agent-0.1.18}/agent/core/experiment.py +0 -0
- {cua_agent-0.1.6 → cua_agent-0.1.18}/agent/core/telemetry.py +0 -0
- {cua_agent-0.1.6 → cua_agent-0.1.18}/agent/core/tools/__init__.py +0 -0
- {cua_agent-0.1.6 → cua_agent-0.1.18}/agent/core/tools/base.py +0 -0
- {cua_agent-0.1.6 → cua_agent-0.1.18}/agent/core/tools/bash.py +0 -0
- {cua_agent-0.1.6 → cua_agent-0.1.18}/agent/core/tools/collection.py +0 -0
- {cua_agent-0.1.6 → cua_agent-0.1.18}/agent/core/tools/computer.py +0 -0
- {cua_agent-0.1.6 → cua_agent-0.1.18}/agent/core/tools/edit.py +0 -0
- {cua_agent-0.1.6 → cua_agent-0.1.18}/agent/core/tools/manager.py +0 -0
- {cua_agent-0.1.6/agent/types → cua_agent-0.1.18/agent/core}/tools.py +0 -0
- {cua_agent-0.1.6 → cua_agent-0.1.18}/agent/providers/__init__.py +0 -0
- {cua_agent-0.1.6 → cua_agent-0.1.18}/agent/providers/anthropic/__init__.py +0 -0
- {cua_agent-0.1.6 → cua_agent-0.1.18}/agent/providers/anthropic/api/logging.py +0 -0
- {cua_agent-0.1.6 → cua_agent-0.1.18}/agent/providers/anthropic/callbacks/manager.py +0 -0
- {cua_agent-0.1.6 → cua_agent-0.1.18}/agent/providers/anthropic/prompts.py +0 -0
- {cua_agent-0.1.6 → cua_agent-0.1.18}/agent/providers/anthropic/tools/__init__.py +0 -0
- {cua_agent-0.1.6 → cua_agent-0.1.18}/agent/providers/anthropic/tools/base.py +0 -0
- {cua_agent-0.1.6 → cua_agent-0.1.18}/agent/providers/anthropic/tools/collection.py +0 -0
- {cua_agent-0.1.6 → cua_agent-0.1.18}/agent/providers/anthropic/tools/computer.py +0 -0
- {cua_agent-0.1.6 → cua_agent-0.1.18}/agent/providers/anthropic/tools/edit.py +0 -0
- {cua_agent-0.1.6 → cua_agent-0.1.18}/agent/providers/anthropic/tools/manager.py +0 -0
- {cua_agent-0.1.6 → cua_agent-0.1.18}/agent/providers/anthropic/tools/run.py +0 -0
- {cua_agent-0.1.6 → cua_agent-0.1.18}/agent/providers/anthropic/types.py +0 -0
- {cua_agent-0.1.6 → cua_agent-0.1.18}/agent/providers/omni/clients/base.py +0 -0
- {cua_agent-0.1.6 → cua_agent-0.1.18}/agent/providers/omni/clients/openai.py +0 -0
- {cua_agent-0.1.6 → cua_agent-0.1.18}/agent/providers/omni/clients/utils.py +0 -0
- {cua_agent-0.1.6 → cua_agent-0.1.18}/agent/providers/omni/prompts.py +0 -0
- {cua_agent-0.1.6 → cua_agent-0.1.18}/agent/telemetry.py +0 -0
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: cua-agent
|
|
3
|
+
Version: 0.1.18
|
|
4
|
+
Summary: CUA (Computer Use) Agent for AI-driven computer interaction
|
|
5
|
+
Author-Email: TryCua <gh@trycua.com>
|
|
6
|
+
Requires-Python: <3.13,>=3.10
|
|
7
|
+
Requires-Dist: httpx<0.29.0,>=0.27.0
|
|
8
|
+
Requires-Dist: aiohttp<4.0.0,>=3.9.3
|
|
9
|
+
Requires-Dist: asyncio
|
|
10
|
+
Requires-Dist: anyio<5.0.0,>=4.4.1
|
|
11
|
+
Requires-Dist: typing-extensions<5.0.0,>=4.12.2
|
|
12
|
+
Requires-Dist: pydantic<3.0.0,>=2.6.4
|
|
13
|
+
Requires-Dist: rich<14.0.0,>=13.7.1
|
|
14
|
+
Requires-Dist: python-dotenv<2.0.0,>=1.0.1
|
|
15
|
+
Requires-Dist: cua-computer<0.2.0,>=0.1.0
|
|
16
|
+
Requires-Dist: cua-core<0.2.0,>=0.1.0
|
|
17
|
+
Requires-Dist: certifi>=2024.2.2
|
|
18
|
+
Provides-Extra: anthropic
|
|
19
|
+
Requires-Dist: anthropic>=0.49.0; extra == "anthropic"
|
|
20
|
+
Requires-Dist: boto3<2.0.0,>=1.35.81; extra == "anthropic"
|
|
21
|
+
Provides-Extra: openai
|
|
22
|
+
Requires-Dist: openai<2.0.0,>=1.14.0; extra == "openai"
|
|
23
|
+
Requires-Dist: httpx<0.29.0,>=0.27.0; extra == "openai"
|
|
24
|
+
Provides-Extra: som
|
|
25
|
+
Requires-Dist: torch>=2.2.1; extra == "som"
|
|
26
|
+
Requires-Dist: torchvision>=0.17.1; extra == "som"
|
|
27
|
+
Requires-Dist: ultralytics>=8.0.0; extra == "som"
|
|
28
|
+
Requires-Dist: transformers>=4.38.2; extra == "som"
|
|
29
|
+
Requires-Dist: cua-som<0.2.0,>=0.1.0; extra == "som"
|
|
30
|
+
Requires-Dist: anthropic<0.47.0,>=0.46.0; extra == "som"
|
|
31
|
+
Requires-Dist: boto3<2.0.0,>=1.35.81; extra == "som"
|
|
32
|
+
Requires-Dist: openai<2.0.0,>=1.14.0; extra == "som"
|
|
33
|
+
Requires-Dist: groq<0.5.0,>=0.4.0; extra == "som"
|
|
34
|
+
Requires-Dist: dashscope<2.0.0,>=1.13.0; extra == "som"
|
|
35
|
+
Requires-Dist: requests<3.0.0,>=2.31.0; extra == "som"
|
|
36
|
+
Provides-Extra: all
|
|
37
|
+
Requires-Dist: torch>=2.2.1; extra == "all"
|
|
38
|
+
Requires-Dist: torchvision>=0.17.1; extra == "all"
|
|
39
|
+
Requires-Dist: ultralytics>=8.0.0; extra == "all"
|
|
40
|
+
Requires-Dist: transformers>=4.38.2; extra == "all"
|
|
41
|
+
Requires-Dist: cua-som<0.2.0,>=0.1.0; extra == "all"
|
|
42
|
+
Requires-Dist: anthropic<0.47.0,>=0.46.0; extra == "all"
|
|
43
|
+
Requires-Dist: boto3<2.0.0,>=1.35.81; extra == "all"
|
|
44
|
+
Requires-Dist: openai<2.0.0,>=1.14.0; extra == "all"
|
|
45
|
+
Requires-Dist: groq<0.5.0,>=0.4.0; extra == "all"
|
|
46
|
+
Requires-Dist: dashscope<2.0.0,>=1.13.0; extra == "all"
|
|
47
|
+
Requires-Dist: requests<3.0.0,>=2.31.0; extra == "all"
|
|
48
|
+
Description-Content-Type: text/markdown
|
|
49
|
+
|
|
50
|
+
<div align="center">
|
|
51
|
+
<h1>
|
|
52
|
+
<div class="image-wrapper" style="display: inline-block;">
|
|
53
|
+
<picture>
|
|
54
|
+
<source media="(prefers-color-scheme: dark)" alt="logo" height="150" srcset="../../img/logo_white.png" style="display: block; margin: auto;">
|
|
55
|
+
<source media="(prefers-color-scheme: light)" alt="logo" height="150" srcset="../../img/logo_black.png" style="display: block; margin: auto;">
|
|
56
|
+
<img alt="Shows my svg">
|
|
57
|
+
</picture>
|
|
58
|
+
</div>
|
|
59
|
+
|
|
60
|
+
[](#)
|
|
61
|
+
[](#)
|
|
62
|
+
[](https://discord.com/invite/mVnXXpdE85)
|
|
63
|
+
[](https://pypi.org/project/cua-computer/)
|
|
64
|
+
</h1>
|
|
65
|
+
</div>
|
|
66
|
+
|
|
67
|
+
**cua-agent** is a general Computer-Use framework for running multi-app agentic workflows targeting macOS and Linux sandboxes created with Cua, supporting local (Ollama) and cloud model providers (OpenAI, Anthropic, Groq, DeepSeek, Qwen).
|
|
68
|
+
|
|
69
|
+
### Get started with Agent
|
|
70
|
+
|
|
71
|
+
<div align="center">
|
|
72
|
+
<img src="../../img/agent.png"/>
|
|
73
|
+
</div>
|
|
74
|
+
|
|
75
|
+
## Install
|
|
76
|
+
|
|
77
|
+
```bash
|
|
78
|
+
pip install "cua-agent[all]"
|
|
79
|
+
|
|
80
|
+
# or install specific loop providers
|
|
81
|
+
pip install "cua-agent[openai]" # OpenAI Cua Loop
|
|
82
|
+
pip install "cua-agent[anthropic]" # Anthropic Cua Loop
|
|
83
|
+
pip install "cua-agent[omni]" # Cua Loop based on OmniParser
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
## Run
|
|
87
|
+
|
|
88
|
+
```python
|
|
89
|
+
async with Computer() as macos_computer:
|
|
90
|
+
# Create agent with loop and provider
|
|
91
|
+
agent = ComputerAgent(
|
|
92
|
+
computer=macos_computer,
|
|
93
|
+
loop=AgentLoop.OPENAI,
|
|
94
|
+
model=LLM(provider=LLMProvider.OPENAI)
|
|
95
|
+
)
|
|
96
|
+
|
|
97
|
+
tasks = [
|
|
98
|
+
"Look for a repository named trycua/cua on GitHub.",
|
|
99
|
+
"Check the open issues, open the most recent one and read it.",
|
|
100
|
+
"Clone the repository in users/lume/projects if it doesn't exist yet.",
|
|
101
|
+
"Open the repository with an app named Cursor (on the dock, black background and white cube icon).",
|
|
102
|
+
"From Cursor, open Composer if not already open.",
|
|
103
|
+
"Focus on the Composer text area, then write and submit a task to help resolve the GitHub issue.",
|
|
104
|
+
]
|
|
105
|
+
|
|
106
|
+
for i, task in enumerate(tasks):
|
|
107
|
+
print(f"\nExecuting task {i}/{len(tasks)}: {task}")
|
|
108
|
+
async for result in agent.run(task):
|
|
109
|
+
print(result)
|
|
110
|
+
|
|
111
|
+
print(f"\n✅ Task {i+1}/{len(tasks)} completed: {task}")
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
Refer to these notebooks for step-by-step guides on how to use the Computer-Use Agent (CUA):
|
|
115
|
+
|
|
116
|
+
- [Agent Notebook](../../notebooks/agent_nb.ipynb) - Complete examples and workflows
|
|
117
|
+
|
|
118
|
+
## Agent Loops
|
|
119
|
+
|
|
120
|
+
The `cua-agent` package provides three agent loop variations, based on different CUA model providers and techniques:
|
|
121
|
+
|
|
122
|
+
| Agent Loop | Supported Models | Description | Set-Of-Marks |
|
|
123
|
+
|:-----------|:-----------------|:------------|:-------------|
|
|
124
|
+
| `AgentLoop.OPENAI` | • `computer_use_preview` | Use OpenAI Operator CUA model | Not Required |
|
|
125
|
+
| `AgentLoop.ANTHROPIC` | • `claude-3-5-sonnet-20240620`<br>• `claude-3-7-sonnet-20250219` | Use Anthropic Computer-Use | Not Required |
|
|
126
|
+
| `AgentLoop.OMNI` <br>(preview) | • `claude-3-5-sonnet-20240620`<br>• `claude-3-7-sonnet-20250219`<br>• `gpt-4.5-preview`<br>• `gpt-4o`<br>• `gpt-4`<br>• `gpt-3.5-turbo` | Use OmniParser for element pixel-detection (SoM) and any VLMs | OmniParser |
|
|
127
|
+
|
|
128
|
+
## AgentResponse
|
|
129
|
+
The `AgentResponse` class represents the structured output returned after each agent turn. It contains the agent's response, reasoning, tool usage, and other metadata. The response format aligns with the new [OpenAI Agent SDK specification](https://platform.openai.com/docs/api-reference/responses) for better consistency across different agent loops.
|
|
130
|
+
|
|
131
|
+
```python
|
|
132
|
+
async for result in agent.run(task):
|
|
133
|
+
print("Response ID: ", result.get("id"))
|
|
134
|
+
|
|
135
|
+
# Print detailed usage information
|
|
136
|
+
usage = result.get("usage")
|
|
137
|
+
if usage:
|
|
138
|
+
print("\nUsage Details:")
|
|
139
|
+
print(f" Input Tokens: {usage.get('input_tokens')}")
|
|
140
|
+
if "input_tokens_details" in usage:
|
|
141
|
+
print(f" Input Tokens Details: {usage.get('input_tokens_details')}")
|
|
142
|
+
print(f" Output Tokens: {usage.get('output_tokens')}")
|
|
143
|
+
if "output_tokens_details" in usage:
|
|
144
|
+
print(f" Output Tokens Details: {usage.get('output_tokens_details')}")
|
|
145
|
+
print(f" Total Tokens: {usage.get('total_tokens')}")
|
|
146
|
+
|
|
147
|
+
print("Response Text: ", result.get("text"))
|
|
148
|
+
|
|
149
|
+
# Print tools information
|
|
150
|
+
tools = result.get("tools")
|
|
151
|
+
if tools:
|
|
152
|
+
print("\nTools:")
|
|
153
|
+
print(tools)
|
|
154
|
+
|
|
155
|
+
# Print reasoning and tool call outputs
|
|
156
|
+
outputs = result.get("output", [])
|
|
157
|
+
for output in outputs:
|
|
158
|
+
output_type = output.get("type")
|
|
159
|
+
if output_type == "reasoning":
|
|
160
|
+
print("\nReasoning Output:")
|
|
161
|
+
print(output)
|
|
162
|
+
elif output_type == "computer_call":
|
|
163
|
+
print("\nTool Call Output:")
|
|
164
|
+
print(output)
|
|
165
|
+
```
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
<div align="center">
|
|
2
|
+
<h1>
|
|
3
|
+
<div class="image-wrapper" style="display: inline-block;">
|
|
4
|
+
<picture>
|
|
5
|
+
<source media="(prefers-color-scheme: dark)" alt="logo" height="150" srcset="../../img/logo_white.png" style="display: block; margin: auto;">
|
|
6
|
+
<source media="(prefers-color-scheme: light)" alt="logo" height="150" srcset="../../img/logo_black.png" style="display: block; margin: auto;">
|
|
7
|
+
<img alt="Shows my svg">
|
|
8
|
+
</picture>
|
|
9
|
+
</div>
|
|
10
|
+
|
|
11
|
+
[](#)
|
|
12
|
+
[](#)
|
|
13
|
+
[](https://discord.com/invite/mVnXXpdE85)
|
|
14
|
+
[](https://pypi.org/project/cua-computer/)
|
|
15
|
+
</h1>
|
|
16
|
+
</div>
|
|
17
|
+
|
|
18
|
+
**cua-agent** is a general Computer-Use framework for running multi-app agentic workflows targeting macOS and Linux sandboxes created with Cua, supporting local (Ollama) and cloud model providers (OpenAI, Anthropic, Groq, DeepSeek, Qwen).
|
|
19
|
+
|
|
20
|
+
### Get started with Agent
|
|
21
|
+
|
|
22
|
+
<div align="center">
|
|
23
|
+
<img src="../../img/agent.png"/>
|
|
24
|
+
</div>
|
|
25
|
+
|
|
26
|
+
## Install
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
pip install "cua-agent[all]"
|
|
30
|
+
|
|
31
|
+
# or install specific loop providers
|
|
32
|
+
pip install "cua-agent[openai]" # OpenAI Cua Loop
|
|
33
|
+
pip install "cua-agent[anthropic]" # Anthropic Cua Loop
|
|
34
|
+
pip install "cua-agent[omni]" # Cua Loop based on OmniParser
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## Run
|
|
38
|
+
|
|
39
|
+
```python
|
|
40
|
+
async with Computer() as macos_computer:
|
|
41
|
+
# Create agent with loop and provider
|
|
42
|
+
agent = ComputerAgent(
|
|
43
|
+
computer=macos_computer,
|
|
44
|
+
loop=AgentLoop.OPENAI,
|
|
45
|
+
model=LLM(provider=LLMProvider.OPENAI)
|
|
46
|
+
)
|
|
47
|
+
|
|
48
|
+
tasks = [
|
|
49
|
+
"Look for a repository named trycua/cua on GitHub.",
|
|
50
|
+
"Check the open issues, open the most recent one and read it.",
|
|
51
|
+
"Clone the repository in users/lume/projects if it doesn't exist yet.",
|
|
52
|
+
"Open the repository with an app named Cursor (on the dock, black background and white cube icon).",
|
|
53
|
+
"From Cursor, open Composer if not already open.",
|
|
54
|
+
"Focus on the Composer text area, then write and submit a task to help resolve the GitHub issue.",
|
|
55
|
+
]
|
|
56
|
+
|
|
57
|
+
for i, task in enumerate(tasks):
|
|
58
|
+
print(f"\nExecuting task {i}/{len(tasks)}: {task}")
|
|
59
|
+
async for result in agent.run(task):
|
|
60
|
+
print(result)
|
|
61
|
+
|
|
62
|
+
print(f"\n✅ Task {i+1}/{len(tasks)} completed: {task}")
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
Refer to these notebooks for step-by-step guides on how to use the Computer-Use Agent (CUA):
|
|
66
|
+
|
|
67
|
+
- [Agent Notebook](../../notebooks/agent_nb.ipynb) - Complete examples and workflows
|
|
68
|
+
|
|
69
|
+
## Agent Loops
|
|
70
|
+
|
|
71
|
+
The `cua-agent` package provides three agent loop variations, based on different CUA model providers and techniques:
|
|
72
|
+
|
|
73
|
+
| Agent Loop | Supported Models | Description | Set-Of-Marks |
|
|
74
|
+
|:-----------|:-----------------|:------------|:-------------|
|
|
75
|
+
| `AgentLoop.OPENAI` | • `computer_use_preview` | Use OpenAI Operator CUA model | Not Required |
|
|
76
|
+
| `AgentLoop.ANTHROPIC` | • `claude-3-5-sonnet-20240620`<br>• `claude-3-7-sonnet-20250219` | Use Anthropic Computer-Use | Not Required |
|
|
77
|
+
| `AgentLoop.OMNI` <br>(preview) | • `claude-3-5-sonnet-20240620`<br>• `claude-3-7-sonnet-20250219`<br>• `gpt-4.5-preview`<br>• `gpt-4o`<br>• `gpt-4`<br>• `gpt-3.5-turbo` | Use OmniParser for element pixel-detection (SoM) and any VLMs | OmniParser |
|
|
78
|
+
|
|
79
|
+
## AgentResponse
|
|
80
|
+
The `AgentResponse` class represents the structured output returned after each agent turn. It contains the agent's response, reasoning, tool usage, and other metadata. The response format aligns with the new [OpenAI Agent SDK specification](https://platform.openai.com/docs/api-reference/responses) for better consistency across different agent loops.
|
|
81
|
+
|
|
82
|
+
```python
|
|
83
|
+
async for result in agent.run(task):
|
|
84
|
+
print("Response ID: ", result.get("id"))
|
|
85
|
+
|
|
86
|
+
# Print detailed usage information
|
|
87
|
+
usage = result.get("usage")
|
|
88
|
+
if usage:
|
|
89
|
+
print("\nUsage Details:")
|
|
90
|
+
print(f" Input Tokens: {usage.get('input_tokens')}")
|
|
91
|
+
if "input_tokens_details" in usage:
|
|
92
|
+
print(f" Input Tokens Details: {usage.get('input_tokens_details')}")
|
|
93
|
+
print(f" Output Tokens: {usage.get('output_tokens')}")
|
|
94
|
+
if "output_tokens_details" in usage:
|
|
95
|
+
print(f" Output Tokens Details: {usage.get('output_tokens_details')}")
|
|
96
|
+
print(f" Total Tokens: {usage.get('total_tokens')}")
|
|
97
|
+
|
|
98
|
+
print("Response Text: ", result.get("text"))
|
|
99
|
+
|
|
100
|
+
# Print tools information
|
|
101
|
+
tools = result.get("tools")
|
|
102
|
+
if tools:
|
|
103
|
+
print("\nTools:")
|
|
104
|
+
print(tools)
|
|
105
|
+
|
|
106
|
+
# Print reasoning and tool call outputs
|
|
107
|
+
outputs = result.get("output", [])
|
|
108
|
+
for output in outputs:
|
|
109
|
+
output_type = output.get("type")
|
|
110
|
+
if output_type == "reasoning":
|
|
111
|
+
print("\nReasoning Output:")
|
|
112
|
+
print(output)
|
|
113
|
+
elif output_type == "computer_call":
|
|
114
|
+
print("\nTool Call Output:")
|
|
115
|
+
print(output)
|
|
116
|
+
```
|
|
@@ -49,6 +49,7 @@ except Exception as e:
|
|
|
49
49
|
logger.warning(f"Error initializing telemetry: {e}")
|
|
50
50
|
|
|
51
51
|
from .providers.omni.types import LLMProvider, LLM
|
|
52
|
-
from .
|
|
52
|
+
from .core.factory import AgentLoop
|
|
53
|
+
from .core.agent import ComputerAgent
|
|
53
54
|
|
|
54
|
-
__all__ = ["AgentLoop", "LLMProvider", "LLM"]
|
|
55
|
+
__all__ = ["AgentLoop", "LLMProvider", "LLM", "ComputerAgent"]
|
|
@@ -1,12 +1,7 @@
|
|
|
1
1
|
"""Core agent components."""
|
|
2
2
|
|
|
3
|
-
from .
|
|
3
|
+
from .factory import BaseLoop
|
|
4
4
|
from .messages import (
|
|
5
|
-
create_user_message,
|
|
6
|
-
create_assistant_message,
|
|
7
|
-
create_system_message,
|
|
8
|
-
create_image_message,
|
|
9
|
-
create_screen_message,
|
|
10
5
|
BaseMessageManager,
|
|
11
6
|
ImageRetentionConfig,
|
|
12
7
|
)
|
|
@@ -3,31 +3,18 @@
|
|
|
3
3
|
import asyncio
|
|
4
4
|
import logging
|
|
5
5
|
import os
|
|
6
|
-
from typing import
|
|
7
|
-
from dataclasses import dataclass
|
|
6
|
+
from typing import AsyncGenerator, Optional
|
|
8
7
|
|
|
9
8
|
from computer import Computer
|
|
10
|
-
from ..providers.
|
|
11
|
-
from ..providers.omni.loop import OmniLoop
|
|
12
|
-
from ..providers.omni.parser import OmniParser
|
|
13
|
-
from ..providers.omni.types import LLMProvider, LLM
|
|
9
|
+
from ..providers.omni.types import LLM
|
|
14
10
|
from .. import AgentLoop
|
|
11
|
+
from .types import AgentResponse
|
|
12
|
+
from .factory import LoopFactory
|
|
13
|
+
from .provider_config import DEFAULT_MODELS, ENV_VARS
|
|
15
14
|
|
|
16
15
|
logging.basicConfig(level=logging.INFO)
|
|
17
16
|
logger = logging.getLogger(__name__)
|
|
18
17
|
|
|
19
|
-
# Default models for different providers
|
|
20
|
-
DEFAULT_MODELS = {
|
|
21
|
-
LLMProvider.OPENAI: "gpt-4o",
|
|
22
|
-
LLMProvider.ANTHROPIC: "claude-3-7-sonnet-20250219",
|
|
23
|
-
}
|
|
24
|
-
|
|
25
|
-
# Map providers to their environment variable names
|
|
26
|
-
ENV_VARS = {
|
|
27
|
-
LLMProvider.OPENAI: "OPENAI_API_KEY",
|
|
28
|
-
LLMProvider.ANTHROPIC: "ANTHROPIC_API_KEY",
|
|
29
|
-
}
|
|
30
|
-
|
|
31
18
|
|
|
32
19
|
class ComputerAgent:
|
|
33
20
|
"""A computer agent that can perform automated tasks using natural language instructions."""
|
|
@@ -44,7 +31,6 @@ class ComputerAgent:
|
|
|
44
31
|
save_trajectory: bool = True,
|
|
45
32
|
trajectory_dir: str = "trajectories",
|
|
46
33
|
only_n_most_recent_images: Optional[int] = None,
|
|
47
|
-
parser: Optional[OmniParser] = None,
|
|
48
34
|
verbosity: int = logging.INFO,
|
|
49
35
|
):
|
|
50
36
|
"""Initialize the ComputerAgent.
|
|
@@ -61,12 +47,11 @@ class ComputerAgent:
|
|
|
61
47
|
save_trajectory: Whether to save the trajectory.
|
|
62
48
|
trajectory_dir: Directory to save the trajectory.
|
|
63
49
|
only_n_most_recent_images: Maximum number of recent screenshots to include in API requests.
|
|
64
|
-
parser: Parser instance for the OmniLoop. Only used if provider is not ANTHROPIC.
|
|
65
50
|
verbosity: Logging level.
|
|
66
51
|
"""
|
|
67
52
|
# Basic agent configuration
|
|
68
53
|
self.max_retries = max_retries
|
|
69
|
-
self.computer = computer
|
|
54
|
+
self.computer = computer
|
|
70
55
|
self.queue = asyncio.Queue()
|
|
71
56
|
self.screenshot_dir = screenshot_dir
|
|
72
57
|
self.log_dir = log_dir
|
|
@@ -99,39 +84,30 @@ class ComputerAgent:
|
|
|
99
84
|
f"No model specified for provider {self.provider} and no default found"
|
|
100
85
|
)
|
|
101
86
|
|
|
102
|
-
# Ensure computer is properly cast for typing purposes
|
|
103
|
-
computer_instance = cast(Computer, self.computer)
|
|
104
|
-
|
|
105
87
|
# Get API key from environment if not provided
|
|
106
88
|
actual_api_key = api_key or os.environ.get(ENV_VARS[self.provider], "")
|
|
107
89
|
if not actual_api_key:
|
|
108
90
|
raise ValueError(f"No API key provided for {self.provider}")
|
|
109
91
|
|
|
110
|
-
#
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
computer=computer_instance,
|
|
116
|
-
save_trajectory=save_trajectory,
|
|
117
|
-
base_dir=trajectory_dir,
|
|
118
|
-
only_n_most_recent_images=only_n_most_recent_images,
|
|
119
|
-
)
|
|
120
|
-
else:
|
|
121
|
-
# Default to OmniLoop for other loop types
|
|
122
|
-
# Initialize parser if not provided
|
|
123
|
-
actual_parser = parser or OmniParser()
|
|
124
|
-
|
|
125
|
-
self._loop = OmniLoop(
|
|
92
|
+
# Create the appropriate loop using the factory
|
|
93
|
+
try:
|
|
94
|
+
# Let the factory create the appropriate loop with needed components
|
|
95
|
+
self._loop = LoopFactory.create_loop(
|
|
96
|
+
loop_type=loop,
|
|
126
97
|
provider=self.provider,
|
|
98
|
+
computer=self.computer,
|
|
99
|
+
model_name=actual_model_name,
|
|
127
100
|
api_key=actual_api_key,
|
|
128
|
-
model=actual_model_name,
|
|
129
|
-
computer=computer_instance,
|
|
130
101
|
save_trajectory=save_trajectory,
|
|
131
|
-
|
|
102
|
+
trajectory_dir=trajectory_dir,
|
|
132
103
|
only_n_most_recent_images=only_n_most_recent_images,
|
|
133
|
-
parser=actual_parser,
|
|
134
104
|
)
|
|
105
|
+
except ValueError as e:
|
|
106
|
+
logger.error(f"Failed to create loop: {str(e)}")
|
|
107
|
+
raise
|
|
108
|
+
|
|
109
|
+
# Initialize the message manager from the loop
|
|
110
|
+
self.message_manager = self._loop.message_manager
|
|
135
111
|
|
|
136
112
|
logger.info(
|
|
137
113
|
f"ComputerAgent initialized with provider: {self.provider}, model: {actual_model_name}"
|
|
@@ -154,21 +130,6 @@ class ComputerAgent:
|
|
|
154
130
|
else:
|
|
155
131
|
logger.info("Computer already initialized, skipping initialization")
|
|
156
132
|
|
|
157
|
-
# Take a test screenshot to verify the computer is working
|
|
158
|
-
logger.info("Testing computer with a screenshot...")
|
|
159
|
-
try:
|
|
160
|
-
test_screenshot = await self.computer.interface.screenshot()
|
|
161
|
-
# Determine the screenshot size based on its type
|
|
162
|
-
if isinstance(test_screenshot, (bytes, bytearray, memoryview)):
|
|
163
|
-
size = len(test_screenshot)
|
|
164
|
-
elif hasattr(test_screenshot, "base64_image"):
|
|
165
|
-
size = len(test_screenshot.base64_image)
|
|
166
|
-
else:
|
|
167
|
-
size = "unknown"
|
|
168
|
-
logger.info(f"Screenshot test successful, size: {size}")
|
|
169
|
-
except Exception as e:
|
|
170
|
-
logger.error(f"Screenshot test failed: {str(e)}")
|
|
171
|
-
# Even though screenshot failed, we continue since some tests might not need it
|
|
172
133
|
except Exception as e:
|
|
173
134
|
logger.error(f"Error initializing computer in __aenter__: {str(e)}")
|
|
174
135
|
raise
|
|
@@ -201,36 +162,30 @@ class ComputerAgent:
|
|
|
201
162
|
await self.computer.run()
|
|
202
163
|
self._initialized = True
|
|
203
164
|
|
|
204
|
-
async def
|
|
205
|
-
"""Initialize the computer interface if it hasn't been initialized yet."""
|
|
206
|
-
if not self.computer._initialized:
|
|
207
|
-
logger.info("Computer not initialized, initializing now...")
|
|
208
|
-
try:
|
|
209
|
-
# Call run directly
|
|
210
|
-
await self.computer.run()
|
|
211
|
-
logger.info("Computer interface initialized successfully")
|
|
212
|
-
except Exception as e:
|
|
213
|
-
logger.error(f"Error initializing computer interface: {str(e)}")
|
|
214
|
-
raise
|
|
215
|
-
|
|
216
|
-
async def run(self, task: str) -> AsyncGenerator[Dict[str, Any], None]:
|
|
165
|
+
async def run(self, task: str) -> AsyncGenerator[AgentResponse, None]:
|
|
217
166
|
"""Run a task using the computer agent.
|
|
218
167
|
|
|
219
168
|
Args:
|
|
220
169
|
task: Task description
|
|
221
170
|
|
|
222
171
|
Yields:
|
|
223
|
-
|
|
172
|
+
Agent response format
|
|
224
173
|
"""
|
|
225
174
|
try:
|
|
226
175
|
logger.info(f"Running task: {task}")
|
|
176
|
+
logger.info(
|
|
177
|
+
f"Message history before task has {len(self.message_manager.messages)} messages"
|
|
178
|
+
)
|
|
227
179
|
|
|
228
180
|
# Initialize the computer if needed
|
|
229
181
|
if not self._initialized:
|
|
230
182
|
await self.initialize()
|
|
231
183
|
|
|
232
|
-
#
|
|
233
|
-
|
|
184
|
+
# Add task as a user message using the message manager
|
|
185
|
+
self.message_manager.add_user_message([{"type": "text", "text": task}])
|
|
186
|
+
logger.info(
|
|
187
|
+
f"Added task message. Message history now has {len(self.message_manager.messages)} messages"
|
|
188
|
+
)
|
|
234
189
|
|
|
235
190
|
# Pass properly formatted messages to the loop
|
|
236
191
|
if self._loop is None:
|
|
@@ -239,7 +194,7 @@ class ComputerAgent:
|
|
|
239
194
|
return
|
|
240
195
|
|
|
241
196
|
# Execute the task and yield results
|
|
242
|
-
async for result in self._loop.run(messages):
|
|
197
|
+
async for result in self._loop.run(self.message_manager.messages):
|
|
243
198
|
yield result
|
|
244
199
|
|
|
245
200
|
except Exception as e:
|