cua-agent 0.1.17__tar.gz → 0.1.18__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cua-agent might be problematic. Click here for more details.
- cua_agent-0.1.18/PKG-INFO +165 -0
- cua_agent-0.1.18/README.md +116 -0
- {cua_agent-0.1.17 → cua_agent-0.1.18}/agent/__init__.py +2 -2
- {cua_agent-0.1.17 → cua_agent-0.1.18}/agent/core/__init__.py +1 -1
- cua_agent-0.1.17/agent/core/computer_agent.py → cua_agent-0.1.18/agent/core/agent.py +15 -53
- cua_agent-0.1.17/agent/core/loop.py → cua_agent-0.1.18/agent/core/base.py +12 -25
- cua_agent-0.1.18/agent/core/factory.py +104 -0
- cua_agent-0.1.18/agent/core/provider_config.py +15 -0
- {cua_agent-0.1.17 → cua_agent-0.1.18}/agent/core/types.py +10 -0
- {cua_agent-0.1.17 → cua_agent-0.1.18}/agent/providers/anthropic/loop.py +1 -1
- {cua_agent-0.1.17 → cua_agent-0.1.18}/agent/providers/anthropic/response_handler.py +1 -4
- {cua_agent-0.1.17 → cua_agent-0.1.18}/agent/providers/anthropic/utils.py +1 -3
- {cua_agent-0.1.17 → cua_agent-0.1.18}/agent/providers/omni/loop.py +1 -1
- {cua_agent-0.1.17 → cua_agent-0.1.18}/agent/providers/omni/types.py +2 -0
- cua_agent-0.1.18/agent/providers/openai/__init__.py +6 -0
- cua_agent-0.1.18/agent/providers/openai/api_handler.py +453 -0
- cua_agent-0.1.18/agent/providers/openai/loop.py +440 -0
- cua_agent-0.1.18/agent/providers/openai/response_handler.py +205 -0
- cua_agent-0.1.18/agent/providers/openai/tools/__init__.py +15 -0
- cua_agent-0.1.18/agent/providers/openai/tools/base.py +79 -0
- cua_agent-0.1.18/agent/providers/openai/tools/computer.py +319 -0
- cua_agent-0.1.18/agent/providers/openai/tools/manager.py +106 -0
- cua_agent-0.1.18/agent/providers/openai/types.py +36 -0
- cua_agent-0.1.18/agent/providers/openai/utils.py +98 -0
- {cua_agent-0.1.17 → cua_agent-0.1.18}/pyproject.toml +7 -3
- cua_agent-0.1.17/PKG-INFO +0 -90
- cua_agent-0.1.17/README.md +0 -44
- cua_agent-0.1.17/agent/README.md +0 -63
- cua_agent-0.1.17/agent/providers/anthropic/messages/manager.py +0 -112
- cua_agent-0.1.17/tests/test_agent.py +0 -91
- {cua_agent-0.1.17 → cua_agent-0.1.18}/agent/core/README.md +0 -0
- {cua_agent-0.1.17 → cua_agent-0.1.18}/agent/core/callbacks.py +0 -0
- {cua_agent-0.1.17 → cua_agent-0.1.18}/agent/core/experiment.py +0 -0
- {cua_agent-0.1.17 → cua_agent-0.1.18}/agent/core/messages.py +0 -0
- {cua_agent-0.1.17 → cua_agent-0.1.18}/agent/core/telemetry.py +0 -0
- {cua_agent-0.1.17 → cua_agent-0.1.18}/agent/core/tools/__init__.py +0 -0
- {cua_agent-0.1.17 → cua_agent-0.1.18}/agent/core/tools/base.py +0 -0
- {cua_agent-0.1.17 → cua_agent-0.1.18}/agent/core/tools/bash.py +0 -0
- {cua_agent-0.1.17 → cua_agent-0.1.18}/agent/core/tools/collection.py +0 -0
- {cua_agent-0.1.17 → cua_agent-0.1.18}/agent/core/tools/computer.py +0 -0
- {cua_agent-0.1.17 → cua_agent-0.1.18}/agent/core/tools/edit.py +0 -0
- {cua_agent-0.1.17 → cua_agent-0.1.18}/agent/core/tools/manager.py +0 -0
- {cua_agent-0.1.17 → cua_agent-0.1.18}/agent/core/tools.py +0 -0
- {cua_agent-0.1.17 → cua_agent-0.1.18}/agent/core/visualization.py +0 -0
- {cua_agent-0.1.17 → cua_agent-0.1.18}/agent/providers/__init__.py +0 -0
- {cua_agent-0.1.17 → cua_agent-0.1.18}/agent/providers/anthropic/__init__.py +0 -0
- {cua_agent-0.1.17 → cua_agent-0.1.18}/agent/providers/anthropic/api/client.py +0 -0
- {cua_agent-0.1.17 → cua_agent-0.1.18}/agent/providers/anthropic/api/logging.py +0 -0
- {cua_agent-0.1.17 → cua_agent-0.1.18}/agent/providers/anthropic/api_handler.py +0 -0
- {cua_agent-0.1.17 → cua_agent-0.1.18}/agent/providers/anthropic/callbacks/__init__.py +0 -0
- {cua_agent-0.1.17 → cua_agent-0.1.18}/agent/providers/anthropic/callbacks/manager.py +0 -0
- {cua_agent-0.1.17 → cua_agent-0.1.18}/agent/providers/anthropic/prompts.py +0 -0
- {cua_agent-0.1.17 → cua_agent-0.1.18}/agent/providers/anthropic/tools/__init__.py +0 -0
- {cua_agent-0.1.17 → cua_agent-0.1.18}/agent/providers/anthropic/tools/base.py +0 -0
- {cua_agent-0.1.17 → cua_agent-0.1.18}/agent/providers/anthropic/tools/bash.py +0 -0
- {cua_agent-0.1.17 → cua_agent-0.1.18}/agent/providers/anthropic/tools/collection.py +0 -0
- {cua_agent-0.1.17 → cua_agent-0.1.18}/agent/providers/anthropic/tools/computer.py +0 -0
- {cua_agent-0.1.17 → cua_agent-0.1.18}/agent/providers/anthropic/tools/edit.py +0 -0
- {cua_agent-0.1.17 → cua_agent-0.1.18}/agent/providers/anthropic/tools/manager.py +0 -0
- {cua_agent-0.1.17 → cua_agent-0.1.18}/agent/providers/anthropic/tools/run.py +0 -0
- {cua_agent-0.1.17 → cua_agent-0.1.18}/agent/providers/anthropic/types.py +0 -0
- {cua_agent-0.1.17 → cua_agent-0.1.18}/agent/providers/omni/__init__.py +0 -0
- {cua_agent-0.1.17 → cua_agent-0.1.18}/agent/providers/omni/api_handler.py +0 -0
- {cua_agent-0.1.17 → cua_agent-0.1.18}/agent/providers/omni/clients/anthropic.py +0 -0
- {cua_agent-0.1.17 → cua_agent-0.1.18}/agent/providers/omni/clients/base.py +0 -0
- {cua_agent-0.1.17 → cua_agent-0.1.18}/agent/providers/omni/clients/openai.py +0 -0
- {cua_agent-0.1.17 → cua_agent-0.1.18}/agent/providers/omni/clients/utils.py +0 -0
- {cua_agent-0.1.17 → cua_agent-0.1.18}/agent/providers/omni/image_utils.py +0 -0
- {cua_agent-0.1.17 → cua_agent-0.1.18}/agent/providers/omni/parser.py +0 -0
- {cua_agent-0.1.17 → cua_agent-0.1.18}/agent/providers/omni/prompts.py +0 -0
- {cua_agent-0.1.17 → cua_agent-0.1.18}/agent/providers/omni/tools/__init__.py +0 -0
- {cua_agent-0.1.17 → cua_agent-0.1.18}/agent/providers/omni/tools/base.py +0 -0
- {cua_agent-0.1.17 → cua_agent-0.1.18}/agent/providers/omni/tools/bash.py +0 -0
- {cua_agent-0.1.17 → cua_agent-0.1.18}/agent/providers/omni/tools/computer.py +0 -0
- {cua_agent-0.1.17 → cua_agent-0.1.18}/agent/providers/omni/tools/manager.py +0 -0
- {cua_agent-0.1.17 → cua_agent-0.1.18}/agent/providers/omni/utils.py +0 -0
- {cua_agent-0.1.17 → cua_agent-0.1.18}/agent/telemetry.py +0 -0
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: cua-agent
|
|
3
|
+
Version: 0.1.18
|
|
4
|
+
Summary: CUA (Computer Use) Agent for AI-driven computer interaction
|
|
5
|
+
Author-Email: TryCua <gh@trycua.com>
|
|
6
|
+
Requires-Python: <3.13,>=3.10
|
|
7
|
+
Requires-Dist: httpx<0.29.0,>=0.27.0
|
|
8
|
+
Requires-Dist: aiohttp<4.0.0,>=3.9.3
|
|
9
|
+
Requires-Dist: asyncio
|
|
10
|
+
Requires-Dist: anyio<5.0.0,>=4.4.1
|
|
11
|
+
Requires-Dist: typing-extensions<5.0.0,>=4.12.2
|
|
12
|
+
Requires-Dist: pydantic<3.0.0,>=2.6.4
|
|
13
|
+
Requires-Dist: rich<14.0.0,>=13.7.1
|
|
14
|
+
Requires-Dist: python-dotenv<2.0.0,>=1.0.1
|
|
15
|
+
Requires-Dist: cua-computer<0.2.0,>=0.1.0
|
|
16
|
+
Requires-Dist: cua-core<0.2.0,>=0.1.0
|
|
17
|
+
Requires-Dist: certifi>=2024.2.2
|
|
18
|
+
Provides-Extra: anthropic
|
|
19
|
+
Requires-Dist: anthropic>=0.49.0; extra == "anthropic"
|
|
20
|
+
Requires-Dist: boto3<2.0.0,>=1.35.81; extra == "anthropic"
|
|
21
|
+
Provides-Extra: openai
|
|
22
|
+
Requires-Dist: openai<2.0.0,>=1.14.0; extra == "openai"
|
|
23
|
+
Requires-Dist: httpx<0.29.0,>=0.27.0; extra == "openai"
|
|
24
|
+
Provides-Extra: som
|
|
25
|
+
Requires-Dist: torch>=2.2.1; extra == "som"
|
|
26
|
+
Requires-Dist: torchvision>=0.17.1; extra == "som"
|
|
27
|
+
Requires-Dist: ultralytics>=8.0.0; extra == "som"
|
|
28
|
+
Requires-Dist: transformers>=4.38.2; extra == "som"
|
|
29
|
+
Requires-Dist: cua-som<0.2.0,>=0.1.0; extra == "som"
|
|
30
|
+
Requires-Dist: anthropic<0.47.0,>=0.46.0; extra == "som"
|
|
31
|
+
Requires-Dist: boto3<2.0.0,>=1.35.81; extra == "som"
|
|
32
|
+
Requires-Dist: openai<2.0.0,>=1.14.0; extra == "som"
|
|
33
|
+
Requires-Dist: groq<0.5.0,>=0.4.0; extra == "som"
|
|
34
|
+
Requires-Dist: dashscope<2.0.0,>=1.13.0; extra == "som"
|
|
35
|
+
Requires-Dist: requests<3.0.0,>=2.31.0; extra == "som"
|
|
36
|
+
Provides-Extra: all
|
|
37
|
+
Requires-Dist: torch>=2.2.1; extra == "all"
|
|
38
|
+
Requires-Dist: torchvision>=0.17.1; extra == "all"
|
|
39
|
+
Requires-Dist: ultralytics>=8.0.0; extra == "all"
|
|
40
|
+
Requires-Dist: transformers>=4.38.2; extra == "all"
|
|
41
|
+
Requires-Dist: cua-som<0.2.0,>=0.1.0; extra == "all"
|
|
42
|
+
Requires-Dist: anthropic<0.47.0,>=0.46.0; extra == "all"
|
|
43
|
+
Requires-Dist: boto3<2.0.0,>=1.35.81; extra == "all"
|
|
44
|
+
Requires-Dist: openai<2.0.0,>=1.14.0; extra == "all"
|
|
45
|
+
Requires-Dist: groq<0.5.0,>=0.4.0; extra == "all"
|
|
46
|
+
Requires-Dist: dashscope<2.0.0,>=1.13.0; extra == "all"
|
|
47
|
+
Requires-Dist: requests<3.0.0,>=2.31.0; extra == "all"
|
|
48
|
+
Description-Content-Type: text/markdown
|
|
49
|
+
|
|
50
|
+
<div align="center">
|
|
51
|
+
<h1>
|
|
52
|
+
<div class="image-wrapper" style="display: inline-block;">
|
|
53
|
+
<picture>
|
|
54
|
+
<source media="(prefers-color-scheme: dark)" alt="logo" height="150" srcset="../../img/logo_white.png" style="display: block; margin: auto;">
|
|
55
|
+
<source media="(prefers-color-scheme: light)" alt="logo" height="150" srcset="../../img/logo_black.png" style="display: block; margin: auto;">
|
|
56
|
+
<img alt="Shows my svg">
|
|
57
|
+
</picture>
|
|
58
|
+
</div>
|
|
59
|
+
|
|
60
|
+
[](#)
|
|
61
|
+
[](#)
|
|
62
|
+
[](https://discord.com/invite/mVnXXpdE85)
|
|
63
|
+
[](https://pypi.org/project/cua-computer/)
|
|
64
|
+
</h1>
|
|
65
|
+
</div>
|
|
66
|
+
|
|
67
|
+
**cua-agent** is a general Computer-Use framework for running multi-app agentic workflows targeting macOS and Linux sandboxes created with Cua, supporting local (Ollama) and cloud model providers (OpenAI, Anthropic, Groq, DeepSeek, Qwen).
|
|
68
|
+
|
|
69
|
+
### Get started with Agent
|
|
70
|
+
|
|
71
|
+
<div align="center">
|
|
72
|
+
<img src="../../img/agent.png"/>
|
|
73
|
+
</div>
|
|
74
|
+
|
|
75
|
+
## Install
|
|
76
|
+
|
|
77
|
+
```bash
|
|
78
|
+
pip install "cua-agent[all]"
|
|
79
|
+
|
|
80
|
+
# or install specific loop providers
|
|
81
|
+
pip install "cua-agent[openai]" # OpenAI Cua Loop
|
|
82
|
+
pip install "cua-agent[anthropic]" # Anthropic Cua Loop
|
|
83
|
+
pip install "cua-agent[omni]" # Cua Loop based on OmniParser
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
## Run
|
|
87
|
+
|
|
88
|
+
```python
|
|
89
|
+
async with Computer() as macos_computer:
|
|
90
|
+
# Create agent with loop and provider
|
|
91
|
+
agent = ComputerAgent(
|
|
92
|
+
computer=macos_computer,
|
|
93
|
+
loop=AgentLoop.OPENAI,
|
|
94
|
+
model=LLM(provider=LLMProvider.OPENAI)
|
|
95
|
+
)
|
|
96
|
+
|
|
97
|
+
tasks = [
|
|
98
|
+
"Look for a repository named trycua/cua on GitHub.",
|
|
99
|
+
"Check the open issues, open the most recent one and read it.",
|
|
100
|
+
"Clone the repository in users/lume/projects if it doesn't exist yet.",
|
|
101
|
+
"Open the repository with an app named Cursor (on the dock, black background and white cube icon).",
|
|
102
|
+
"From Cursor, open Composer if not already open.",
|
|
103
|
+
"Focus on the Composer text area, then write and submit a task to help resolve the GitHub issue.",
|
|
104
|
+
]
|
|
105
|
+
|
|
106
|
+
for i, task in enumerate(tasks):
|
|
107
|
+
print(f"\nExecuting task {i+1}/{len(tasks)}: {task}")
|
|
108
|
+
async for result in agent.run(task):
|
|
109
|
+
print(result)
|
|
110
|
+
|
|
111
|
+
print(f"\n✅ Task {i+1}/{len(tasks)} completed: {task}")
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
Refer to these notebooks for step-by-step guides on how to use the Computer-Use Agent (CUA):
|
|
115
|
+
|
|
116
|
+
- [Agent Notebook](../../notebooks/agent_nb.ipynb) - Complete examples and workflows
|
|
117
|
+
|
|
118
|
+
## Agent Loops
|
|
119
|
+
|
|
120
|
+
The `cua-agent` package provides three agent loop variations, based on different CUA model providers and techniques:
|
|
121
|
+
|
|
122
|
+
| Agent Loop | Supported Models | Description | Set-Of-Marks |
|
|
123
|
+
|:-----------|:-----------------|:------------|:-------------|
|
|
124
|
+
| `AgentLoop.OPENAI` | • `computer_use_preview` | Use OpenAI Operator CUA model | Not Required |
|
|
125
|
+
| `AgentLoop.ANTHROPIC` | • `claude-3-5-sonnet-20240620`<br>• `claude-3-7-sonnet-20250219` | Use Anthropic Computer-Use | Not Required |
|
|
126
|
+
| `AgentLoop.OMNI` <br>(preview) | • `claude-3-5-sonnet-20240620`<br>• `claude-3-7-sonnet-20250219`<br>• `gpt-4.5-preview`<br>• `gpt-4o`<br>• `gpt-4`<br>• `gpt-3.5-turbo` | Use OmniParser for element pixel-detection (SoM) and any VLMs | OmniParser |
|
|
127
|
+
|
|
128
|
+
## AgentResponse
|
|
129
|
+
The `AgentResponse` class represents the structured output returned after each agent turn. It contains the agent's response, reasoning, tool usage, and other metadata. The response format aligns with the new [OpenAI Agent SDK specification](https://platform.openai.com/docs/api-reference/responses) for better consistency across different agent loops.
|
|
130
|
+
|
|
131
|
+
```python
|
|
132
|
+
async for result in agent.run(task):
|
|
133
|
+
print("Response ID: ", result.get("id"))
|
|
134
|
+
|
|
135
|
+
# Print detailed usage information
|
|
136
|
+
usage = result.get("usage")
|
|
137
|
+
if usage:
|
|
138
|
+
print("\nUsage Details:")
|
|
139
|
+
print(f" Input Tokens: {usage.get('input_tokens')}")
|
|
140
|
+
if "input_tokens_details" in usage:
|
|
141
|
+
print(f" Input Tokens Details: {usage.get('input_tokens_details')}")
|
|
142
|
+
print(f" Output Tokens: {usage.get('output_tokens')}")
|
|
143
|
+
if "output_tokens_details" in usage:
|
|
144
|
+
print(f" Output Tokens Details: {usage.get('output_tokens_details')}")
|
|
145
|
+
print(f" Total Tokens: {usage.get('total_tokens')}")
|
|
146
|
+
|
|
147
|
+
print("Response Text: ", result.get("text"))
|
|
148
|
+
|
|
149
|
+
# Print tools information
|
|
150
|
+
tools = result.get("tools")
|
|
151
|
+
if tools:
|
|
152
|
+
print("\nTools:")
|
|
153
|
+
print(tools)
|
|
154
|
+
|
|
155
|
+
# Print reasoning and tool call outputs
|
|
156
|
+
outputs = result.get("output", [])
|
|
157
|
+
for output in outputs:
|
|
158
|
+
output_type = output.get("type")
|
|
159
|
+
if output_type == "reasoning":
|
|
160
|
+
print("\nReasoning Output:")
|
|
161
|
+
print(output)
|
|
162
|
+
elif output_type == "computer_call":
|
|
163
|
+
print("\nTool Call Output:")
|
|
164
|
+
print(output)
|
|
165
|
+
```
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
<div align="center">
|
|
2
|
+
<h1>
|
|
3
|
+
<div class="image-wrapper" style="display: inline-block;">
|
|
4
|
+
<picture>
|
|
5
|
+
<source media="(prefers-color-scheme: dark)" alt="logo" height="150" srcset="../../img/logo_white.png" style="display: block; margin: auto;">
|
|
6
|
+
<source media="(prefers-color-scheme: light)" alt="logo" height="150" srcset="../../img/logo_black.png" style="display: block; margin: auto;">
|
|
7
|
+
<img alt="Shows my svg">
|
|
8
|
+
</picture>
|
|
9
|
+
</div>
|
|
10
|
+
|
|
11
|
+
[](#)
|
|
12
|
+
[](#)
|
|
13
|
+
[](https://discord.com/invite/mVnXXpdE85)
|
|
14
|
+
[](https://pypi.org/project/cua-computer/)
|
|
15
|
+
</h1>
|
|
16
|
+
</div>
|
|
17
|
+
|
|
18
|
+
**cua-agent** is a general Computer-Use framework for running multi-app agentic workflows targeting macOS and Linux sandboxes created with Cua, supporting local (Ollama) and cloud model providers (OpenAI, Anthropic, Groq, DeepSeek, Qwen).
|
|
19
|
+
|
|
20
|
+
### Get started with Agent
|
|
21
|
+
|
|
22
|
+
<div align="center">
|
|
23
|
+
<img src="../../img/agent.png"/>
|
|
24
|
+
</div>
|
|
25
|
+
|
|
26
|
+
## Install
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
pip install "cua-agent[all]"
|
|
30
|
+
|
|
31
|
+
# or install specific loop providers
|
|
32
|
+
pip install "cua-agent[openai]" # OpenAI Cua Loop
|
|
33
|
+
pip install "cua-agent[anthropic]" # Anthropic Cua Loop
|
|
34
|
+
pip install "cua-agent[omni]" # Cua Loop based on OmniParser
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## Run
|
|
38
|
+
|
|
39
|
+
```python
|
|
40
|
+
async with Computer() as macos_computer:
|
|
41
|
+
# Create agent with loop and provider
|
|
42
|
+
agent = ComputerAgent(
|
|
43
|
+
computer=macos_computer,
|
|
44
|
+
loop=AgentLoop.OPENAI,
|
|
45
|
+
model=LLM(provider=LLMProvider.OPENAI)
|
|
46
|
+
)
|
|
47
|
+
|
|
48
|
+
tasks = [
|
|
49
|
+
"Look for a repository named trycua/cua on GitHub.",
|
|
50
|
+
"Check the open issues, open the most recent one and read it.",
|
|
51
|
+
"Clone the repository in users/lume/projects if it doesn't exist yet.",
|
|
52
|
+
"Open the repository with an app named Cursor (on the dock, black background and white cube icon).",
|
|
53
|
+
"From Cursor, open Composer if not already open.",
|
|
54
|
+
"Focus on the Composer text area, then write and submit a task to help resolve the GitHub issue.",
|
|
55
|
+
]
|
|
56
|
+
|
|
57
|
+
for i, task in enumerate(tasks):
|
|
58
|
+
print(f"\nExecuting task {i+1}/{len(tasks)}: {task}")
|
|
59
|
+
async for result in agent.run(task):
|
|
60
|
+
print(result)
|
|
61
|
+
|
|
62
|
+
print(f"\n✅ Task {i+1}/{len(tasks)} completed: {task}")
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
Refer to these notebooks for step-by-step guides on how to use the Computer-Use Agent (CUA):
|
|
66
|
+
|
|
67
|
+
- [Agent Notebook](../../notebooks/agent_nb.ipynb) - Complete examples and workflows
|
|
68
|
+
|
|
69
|
+
## Agent Loops
|
|
70
|
+
|
|
71
|
+
The `cua-agent` package provides three agent loop variations, based on different CUA model providers and techniques:
|
|
72
|
+
|
|
73
|
+
| Agent Loop | Supported Models | Description | Set-Of-Marks |
|
|
74
|
+
|:-----------|:-----------------|:------------|:-------------|
|
|
75
|
+
| `AgentLoop.OPENAI` | • `computer_use_preview` | Use OpenAI Operator CUA model | Not Required |
|
|
76
|
+
| `AgentLoop.ANTHROPIC` | • `claude-3-5-sonnet-20240620`<br>• `claude-3-7-sonnet-20250219` | Use Anthropic Computer-Use | Not Required |
|
|
77
|
+
| `AgentLoop.OMNI` <br>(preview) | • `claude-3-5-sonnet-20240620`<br>• `claude-3-7-sonnet-20250219`<br>• `gpt-4.5-preview`<br>• `gpt-4o`<br>• `gpt-4`<br>• `gpt-3.5-turbo` | Use OmniParser for element pixel-detection (SoM) and any VLMs | OmniParser |
|
|
78
|
+
|
|
79
|
+
## AgentResponse
|
|
80
|
+
The `AgentResponse` class represents the structured output returned after each agent turn. It contains the agent's response, reasoning, tool usage, and other metadata. The response format aligns with the new [OpenAI Agent SDK specification](https://platform.openai.com/docs/api-reference/responses) for better consistency across different agent loops.
|
|
81
|
+
|
|
82
|
+
```python
|
|
83
|
+
async for result in agent.run(task):
|
|
84
|
+
print("Response ID: ", result.get("id"))
|
|
85
|
+
|
|
86
|
+
# Print detailed usage information
|
|
87
|
+
usage = result.get("usage")
|
|
88
|
+
if usage:
|
|
89
|
+
print("\nUsage Details:")
|
|
90
|
+
print(f" Input Tokens: {usage.get('input_tokens')}")
|
|
91
|
+
if "input_tokens_details" in usage:
|
|
92
|
+
print(f" Input Tokens Details: {usage.get('input_tokens_details')}")
|
|
93
|
+
print(f" Output Tokens: {usage.get('output_tokens')}")
|
|
94
|
+
if "output_tokens_details" in usage:
|
|
95
|
+
print(f" Output Tokens Details: {usage.get('output_tokens_details')}")
|
|
96
|
+
print(f" Total Tokens: {usage.get('total_tokens')}")
|
|
97
|
+
|
|
98
|
+
print("Response Text: ", result.get("text"))
|
|
99
|
+
|
|
100
|
+
# Print tools information
|
|
101
|
+
tools = result.get("tools")
|
|
102
|
+
if tools:
|
|
103
|
+
print("\nTools:")
|
|
104
|
+
print(tools)
|
|
105
|
+
|
|
106
|
+
# Print reasoning and tool call outputs
|
|
107
|
+
outputs = result.get("output", [])
|
|
108
|
+
for output in outputs:
|
|
109
|
+
output_type = output.get("type")
|
|
110
|
+
if output_type == "reasoning":
|
|
111
|
+
print("\nReasoning Output:")
|
|
112
|
+
print(output)
|
|
113
|
+
elif output_type == "computer_call":
|
|
114
|
+
print("\nTool Call Output:")
|
|
115
|
+
print(output)
|
|
116
|
+
```
|
|
@@ -49,7 +49,7 @@ except Exception as e:
|
|
|
49
49
|
logger.warning(f"Error initializing telemetry: {e}")
|
|
50
50
|
|
|
51
51
|
from .providers.omni.types import LLMProvider, LLM
|
|
52
|
-
from .core.
|
|
53
|
-
from .core.
|
|
52
|
+
from .core.factory import AgentLoop
|
|
53
|
+
from .core.agent import ComputerAgent
|
|
54
54
|
|
|
55
55
|
__all__ = ["AgentLoop", "LLMProvider", "LLM", "ComputerAgent"]
|
|
@@ -3,32 +3,18 @@
|
|
|
3
3
|
import asyncio
|
|
4
4
|
import logging
|
|
5
5
|
import os
|
|
6
|
-
from typing import
|
|
6
|
+
from typing import AsyncGenerator, Optional
|
|
7
7
|
|
|
8
8
|
from computer import Computer
|
|
9
|
-
from ..providers.
|
|
10
|
-
from ..providers.omni.loop import OmniLoop
|
|
11
|
-
from ..providers.omni.parser import OmniParser
|
|
12
|
-
from ..providers.omni.types import LLMProvider, LLM
|
|
9
|
+
from ..providers.omni.types import LLM
|
|
13
10
|
from .. import AgentLoop
|
|
14
|
-
from .messages import StandardMessageManager, ImageRetentionConfig
|
|
15
11
|
from .types import AgentResponse
|
|
12
|
+
from .factory import LoopFactory
|
|
13
|
+
from .provider_config import DEFAULT_MODELS, ENV_VARS
|
|
16
14
|
|
|
17
15
|
logging.basicConfig(level=logging.INFO)
|
|
18
16
|
logger = logging.getLogger(__name__)
|
|
19
17
|
|
|
20
|
-
# Default models for different providers
|
|
21
|
-
DEFAULT_MODELS = {
|
|
22
|
-
LLMProvider.OPENAI: "gpt-4o",
|
|
23
|
-
LLMProvider.ANTHROPIC: "claude-3-7-sonnet-20250219",
|
|
24
|
-
}
|
|
25
|
-
|
|
26
|
-
# Map providers to their environment variable names
|
|
27
|
-
ENV_VARS = {
|
|
28
|
-
LLMProvider.OPENAI: "OPENAI_API_KEY",
|
|
29
|
-
LLMProvider.ANTHROPIC: "ANTHROPIC_API_KEY",
|
|
30
|
-
}
|
|
31
|
-
|
|
32
18
|
|
|
33
19
|
class ComputerAgent:
|
|
34
20
|
"""A computer agent that can perform automated tasks using natural language instructions."""
|
|
@@ -98,35 +84,27 @@ class ComputerAgent:
|
|
|
98
84
|
f"No model specified for provider {self.provider} and no default found"
|
|
99
85
|
)
|
|
100
86
|
|
|
101
|
-
# Ensure computer is properly cast for typing purposes
|
|
102
|
-
computer_instance = self.computer
|
|
103
|
-
|
|
104
87
|
# Get API key from environment if not provided
|
|
105
88
|
actual_api_key = api_key or os.environ.get(ENV_VARS[self.provider], "")
|
|
106
89
|
if not actual_api_key:
|
|
107
90
|
raise ValueError(f"No API key provided for {self.provider}")
|
|
108
91
|
|
|
109
|
-
#
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
computer=computer_instance,
|
|
115
|
-
save_trajectory=save_trajectory,
|
|
116
|
-
base_dir=trajectory_dir,
|
|
117
|
-
only_n_most_recent_images=only_n_most_recent_images,
|
|
118
|
-
)
|
|
119
|
-
else:
|
|
120
|
-
self._loop = OmniLoop(
|
|
92
|
+
# Create the appropriate loop using the factory
|
|
93
|
+
try:
|
|
94
|
+
# Let the factory create the appropriate loop with needed components
|
|
95
|
+
self._loop = LoopFactory.create_loop(
|
|
96
|
+
loop_type=loop,
|
|
121
97
|
provider=self.provider,
|
|
98
|
+
computer=self.computer,
|
|
99
|
+
model_name=actual_model_name,
|
|
122
100
|
api_key=actual_api_key,
|
|
123
|
-
model=actual_model_name,
|
|
124
|
-
computer=computer_instance,
|
|
125
101
|
save_trajectory=save_trajectory,
|
|
126
|
-
|
|
102
|
+
trajectory_dir=trajectory_dir,
|
|
127
103
|
only_n_most_recent_images=only_n_most_recent_images,
|
|
128
|
-
parser=OmniParser(),
|
|
129
104
|
)
|
|
105
|
+
except ValueError as e:
|
|
106
|
+
logger.error(f"Failed to create loop: {str(e)}")
|
|
107
|
+
raise
|
|
130
108
|
|
|
131
109
|
# Initialize the message manager from the loop
|
|
132
110
|
self.message_manager = self._loop.message_manager
|
|
@@ -152,21 +130,6 @@ class ComputerAgent:
|
|
|
152
130
|
else:
|
|
153
131
|
logger.info("Computer already initialized, skipping initialization")
|
|
154
132
|
|
|
155
|
-
# Take a test screenshot to verify the computer is working
|
|
156
|
-
logger.info("Testing computer with a screenshot...")
|
|
157
|
-
try:
|
|
158
|
-
test_screenshot = await self.computer.interface.screenshot()
|
|
159
|
-
# Determine the screenshot size based on its type
|
|
160
|
-
if isinstance(test_screenshot, (bytes, bytearray, memoryview)):
|
|
161
|
-
size = len(test_screenshot)
|
|
162
|
-
elif hasattr(test_screenshot, "base64_image"):
|
|
163
|
-
size = len(test_screenshot.base64_image)
|
|
164
|
-
else:
|
|
165
|
-
size = "unknown"
|
|
166
|
-
logger.info(f"Screenshot test successful, size: {size}")
|
|
167
|
-
except Exception as e:
|
|
168
|
-
logger.error(f"Screenshot test failed: {str(e)}")
|
|
169
|
-
# Even though screenshot failed, we continue since some tests might not need it
|
|
170
133
|
except Exception as e:
|
|
171
134
|
logger.error(f"Error initializing computer in __aenter__: {str(e)}")
|
|
172
135
|
raise
|
|
@@ -232,7 +195,6 @@ class ComputerAgent:
|
|
|
232
195
|
|
|
233
196
|
# Execute the task and yield results
|
|
234
197
|
async for result in self._loop.run(self.message_manager.messages):
|
|
235
|
-
# Yield the result to the caller
|
|
236
198
|
yield result
|
|
237
199
|
|
|
238
200
|
except Exception as e:
|
|
@@ -1,35 +1,21 @@
|
|
|
1
|
-
"""Base
|
|
1
|
+
"""Base loop definitions."""
|
|
2
2
|
|
|
3
3
|
import logging
|
|
4
4
|
import asyncio
|
|
5
5
|
from abc import ABC, abstractmethod
|
|
6
|
-
from
|
|
7
|
-
from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple
|
|
8
|
-
from datetime import datetime
|
|
6
|
+
from typing import Any, AsyncGenerator, Dict, List, Optional
|
|
9
7
|
|
|
10
8
|
from computer import Computer
|
|
11
|
-
from .experiment import ExperimentManager
|
|
12
9
|
from .messages import StandardMessageManager, ImageRetentionConfig
|
|
13
10
|
from .types import AgentResponse
|
|
11
|
+
from .experiment import ExperimentManager
|
|
14
12
|
|
|
15
13
|
logger = logging.getLogger(__name__)
|
|
16
14
|
|
|
17
15
|
|
|
18
|
-
class AgentLoop(Enum):
|
|
19
|
-
"""Enumeration of available loop types."""
|
|
20
|
-
|
|
21
|
-
ANTHROPIC = auto() # Anthropic implementation
|
|
22
|
-
OMNI = auto() # OmniLoop implementation
|
|
23
|
-
# Add more loop types as needed
|
|
24
|
-
|
|
25
|
-
|
|
26
16
|
class BaseLoop(ABC):
|
|
27
17
|
"""Base class for agent loops that handle message processing and tool execution."""
|
|
28
18
|
|
|
29
|
-
###########################################
|
|
30
|
-
# INITIALIZATION AND CONFIGURATION
|
|
31
|
-
###########################################
|
|
32
|
-
|
|
33
19
|
def __init__(
|
|
34
20
|
self,
|
|
35
21
|
computer: Computer,
|
|
@@ -68,6 +54,11 @@ class BaseLoop(ABC):
|
|
|
68
54
|
self.only_n_most_recent_images = only_n_most_recent_images
|
|
69
55
|
self._kwargs = kwargs
|
|
70
56
|
|
|
57
|
+
# Initialize message manager
|
|
58
|
+
self.message_manager = StandardMessageManager(
|
|
59
|
+
config=ImageRetentionConfig(num_images_to_keep=only_n_most_recent_images)
|
|
60
|
+
)
|
|
61
|
+
|
|
71
62
|
# Initialize experiment manager
|
|
72
63
|
if self.save_trajectory and self.base_dir:
|
|
73
64
|
self.experiment_manager = ExperimentManager(
|
|
@@ -110,8 +101,7 @@ class BaseLoop(ABC):
|
|
|
110
101
|
)
|
|
111
102
|
raise RuntimeError(f"Failed to initialize: {str(e)}")
|
|
112
103
|
|
|
113
|
-
|
|
114
|
-
|
|
104
|
+
###########################################
|
|
115
105
|
# ABSTRACT METHODS TO BE IMPLEMENTED BY SUBCLASSES
|
|
116
106
|
###########################################
|
|
117
107
|
|
|
@@ -125,17 +115,14 @@ class BaseLoop(ABC):
|
|
|
125
115
|
raise NotImplementedError
|
|
126
116
|
|
|
127
117
|
@abstractmethod
|
|
128
|
-
|
|
118
|
+
def run(self, messages: List[Dict[str, Any]]) -> AsyncGenerator[AgentResponse, None]:
|
|
129
119
|
"""Run the agent loop with provided messages.
|
|
130
120
|
|
|
131
|
-
This method handles the main agent loop including message processing,
|
|
132
|
-
API calls, response handling, and action execution.
|
|
133
|
-
|
|
134
121
|
Args:
|
|
135
122
|
messages: List of message objects
|
|
136
123
|
|
|
137
|
-
|
|
138
|
-
|
|
124
|
+
Returns:
|
|
125
|
+
An async generator that yields agent responses
|
|
139
126
|
"""
|
|
140
127
|
raise NotImplementedError
|
|
141
128
|
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
"""Base agent loop implementation."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import importlib.util
|
|
5
|
+
from typing import Dict, Optional, Type, TYPE_CHECKING, Any, cast, Callable, Awaitable
|
|
6
|
+
|
|
7
|
+
from computer import Computer
|
|
8
|
+
from .types import AgentLoop
|
|
9
|
+
from .base import BaseLoop
|
|
10
|
+
|
|
11
|
+
# For type checking only
|
|
12
|
+
if TYPE_CHECKING:
|
|
13
|
+
from ..providers.omni.types import LLMProvider
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class LoopFactory:
|
|
19
|
+
"""Factory class for creating agent loops."""
|
|
20
|
+
|
|
21
|
+
# Registry to store loop implementations
|
|
22
|
+
_loop_registry: Dict[AgentLoop, Type[BaseLoop]] = {}
|
|
23
|
+
|
|
24
|
+
@classmethod
|
|
25
|
+
def create_loop(
|
|
26
|
+
cls,
|
|
27
|
+
loop_type: AgentLoop,
|
|
28
|
+
api_key: str,
|
|
29
|
+
model_name: str,
|
|
30
|
+
computer: Computer,
|
|
31
|
+
provider: Any = None,
|
|
32
|
+
save_trajectory: bool = True,
|
|
33
|
+
trajectory_dir: str = "trajectories",
|
|
34
|
+
only_n_most_recent_images: Optional[int] = None,
|
|
35
|
+
acknowledge_safety_check_callback: Optional[Callable[[str], Awaitable[bool]]] = None,
|
|
36
|
+
) -> BaseLoop:
|
|
37
|
+
"""Create and return an appropriate loop instance based on type."""
|
|
38
|
+
if loop_type == AgentLoop.ANTHROPIC:
|
|
39
|
+
# Lazy import AnthropicLoop only when needed
|
|
40
|
+
try:
|
|
41
|
+
from ..providers.anthropic.loop import AnthropicLoop
|
|
42
|
+
except ImportError:
|
|
43
|
+
raise ImportError(
|
|
44
|
+
"The 'anthropic' provider is not installed. "
|
|
45
|
+
"Install it with 'pip install cua-agent[anthropic]'"
|
|
46
|
+
)
|
|
47
|
+
|
|
48
|
+
return AnthropicLoop(
|
|
49
|
+
api_key=api_key,
|
|
50
|
+
model=model_name,
|
|
51
|
+
computer=computer,
|
|
52
|
+
save_trajectory=save_trajectory,
|
|
53
|
+
base_dir=trajectory_dir,
|
|
54
|
+
only_n_most_recent_images=only_n_most_recent_images,
|
|
55
|
+
)
|
|
56
|
+
elif loop_type == AgentLoop.OPENAI:
|
|
57
|
+
# Lazy import OpenAILoop only when needed
|
|
58
|
+
try:
|
|
59
|
+
from ..providers.openai.loop import OpenAILoop
|
|
60
|
+
except ImportError:
|
|
61
|
+
raise ImportError(
|
|
62
|
+
"The 'openai' provider is not installed. "
|
|
63
|
+
"Install it with 'pip install cua-agent[openai]'"
|
|
64
|
+
)
|
|
65
|
+
|
|
66
|
+
return OpenAILoop(
|
|
67
|
+
api_key=api_key,
|
|
68
|
+
model=model_name,
|
|
69
|
+
computer=computer,
|
|
70
|
+
save_trajectory=save_trajectory,
|
|
71
|
+
base_dir=trajectory_dir,
|
|
72
|
+
only_n_most_recent_images=only_n_most_recent_images,
|
|
73
|
+
acknowledge_safety_check_callback=acknowledge_safety_check_callback,
|
|
74
|
+
)
|
|
75
|
+
elif loop_type == AgentLoop.OMNI:
|
|
76
|
+
# Lazy import OmniLoop and related classes only when needed
|
|
77
|
+
try:
|
|
78
|
+
from ..providers.omni.loop import OmniLoop
|
|
79
|
+
from ..providers.omni.parser import OmniParser
|
|
80
|
+
from ..providers.omni.types import LLMProvider
|
|
81
|
+
except ImportError:
|
|
82
|
+
raise ImportError(
|
|
83
|
+
"The 'omni' provider is not installed. "
|
|
84
|
+
"Install it with 'pip install cua-agent[all]'"
|
|
85
|
+
)
|
|
86
|
+
|
|
87
|
+
if provider is None:
|
|
88
|
+
raise ValueError("Provider is required for OMNI loop type")
|
|
89
|
+
|
|
90
|
+
# We know provider is the correct type at this point, so cast it
|
|
91
|
+
provider_instance = cast(LLMProvider, provider)
|
|
92
|
+
|
|
93
|
+
return OmniLoop(
|
|
94
|
+
provider=provider_instance,
|
|
95
|
+
api_key=api_key,
|
|
96
|
+
model=model_name,
|
|
97
|
+
computer=computer,
|
|
98
|
+
save_trajectory=save_trajectory,
|
|
99
|
+
base_dir=trajectory_dir,
|
|
100
|
+
only_n_most_recent_images=only_n_most_recent_images,
|
|
101
|
+
parser=OmniParser(),
|
|
102
|
+
)
|
|
103
|
+
else:
|
|
104
|
+
raise ValueError(f"Unsupported loop type: {loop_type}")
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""Provider-specific configurations and constants."""
|
|
2
|
+
|
|
3
|
+
from ..providers.omni.types import LLMProvider
|
|
4
|
+
|
|
5
|
+
# Default models for different providers
|
|
6
|
+
DEFAULT_MODELS = {
|
|
7
|
+
LLMProvider.OPENAI: "gpt-4o",
|
|
8
|
+
LLMProvider.ANTHROPIC: "claude-3-7-sonnet-20250219",
|
|
9
|
+
}
|
|
10
|
+
|
|
11
|
+
# Map providers to their environment variable names
|
|
12
|
+
ENV_VARS = {
|
|
13
|
+
LLMProvider.OPENAI: "OPENAI_API_KEY",
|
|
14
|
+
LLMProvider.ANTHROPIC: "ANTHROPIC_API_KEY",
|
|
15
|
+
}
|
|
@@ -1,6 +1,16 @@
|
|
|
1
1
|
"""Core type definitions."""
|
|
2
2
|
|
|
3
3
|
from typing import Any, Dict, List, Optional, TypedDict, Union
|
|
4
|
+
from enum import Enum, auto
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class AgentLoop(Enum):
|
|
8
|
+
"""Enumeration of available loop types."""
|
|
9
|
+
|
|
10
|
+
ANTHROPIC = auto() # Anthropic implementation
|
|
11
|
+
OMNI = auto() # OmniLoop implementation
|
|
12
|
+
OPENAI = auto() # OpenAI implementation
|
|
13
|
+
# Add more loop types as needed
|
|
4
14
|
|
|
5
15
|
|
|
6
16
|
class AgentResponse(TypedDict, total=False):
|
|
@@ -16,7 +16,7 @@ from datetime import datetime
|
|
|
16
16
|
from computer import Computer
|
|
17
17
|
|
|
18
18
|
# Base imports
|
|
19
|
-
from ...core.
|
|
19
|
+
from ...core.base import BaseLoop
|
|
20
20
|
from ...core.messages import StandardMessageManager, ImageRetentionConfig
|
|
21
21
|
from ...core.types import AgentResponse
|
|
22
22
|
|
|
@@ -1,14 +1,11 @@
|
|
|
1
1
|
"""Response and tool handling for Anthropic provider."""
|
|
2
2
|
|
|
3
3
|
import logging
|
|
4
|
-
from typing import Any, Dict, List,
|
|
4
|
+
from typing import Any, Dict, List, Tuple, cast
|
|
5
5
|
|
|
6
6
|
from anthropic.types.beta import (
|
|
7
7
|
BetaMessage,
|
|
8
|
-
BetaMessageParam,
|
|
9
8
|
BetaTextBlock,
|
|
10
|
-
BetaTextBlockParam,
|
|
11
|
-
BetaToolUseBlockParam,
|
|
12
9
|
BetaContentBlockParam,
|
|
13
10
|
)
|
|
14
11
|
|
|
@@ -1,14 +1,12 @@
|
|
|
1
1
|
"""Utility functions for Anthropic message handling."""
|
|
2
2
|
|
|
3
|
-
import time
|
|
4
3
|
import logging
|
|
5
4
|
import re
|
|
6
5
|
from typing import Any, Dict, List, Optional, Tuple, cast
|
|
7
|
-
from anthropic.types.beta import BetaMessage
|
|
6
|
+
from anthropic.types.beta import BetaMessage
|
|
8
7
|
from ..omni.parser import ParseResult
|
|
9
8
|
from ...core.types import AgentResponse
|
|
10
9
|
from datetime import datetime
|
|
11
|
-
import json
|
|
12
10
|
|
|
13
11
|
# Configure module logger
|
|
14
12
|
logger = logging.getLogger(__name__)
|