cua-agent 0.1.2__tar.gz → 0.1.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cua-agent might be problematic. Click here for more details.
- cua_agent-0.1.4/PKG-INFO +120 -0
- {cua_agent-0.1.2 → cua_agent-0.1.4}/README.md +4 -56
- cua_agent-0.1.4/agent/__init__.py +56 -0
- cua_agent-0.1.4/agent/core/agent.py +252 -0
- {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/core/base_agent.py +1 -1
- {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/core/experiment.py +11 -1
- {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/core/loop.py +1 -1
- cua_agent-0.1.4/agent/core/telemetry.py +130 -0
- {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/omni/loop.py +2 -2
- {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/omni/parser.py +1 -1
- {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/omni/types.py +0 -6
- cua_agent-0.1.4/agent/telemetry.py +21 -0
- {cua_agent-0.1.2 → cua_agent-0.1.4}/pyproject.toml +5 -3
- cua_agent-0.1.2/PKG-INFO +0 -44
- cua_agent-0.1.2/agent/__init__.py +0 -10
- cua_agent-0.1.2/agent/core/agent.py +0 -327
- {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/README.md +0 -0
- {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/core/README.md +0 -0
- {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/core/__init__.py +0 -0
- {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/core/callbacks.py +0 -0
- {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/core/computer_agent.py +0 -0
- {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/core/factory.py +0 -0
- {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/core/messages.py +0 -0
- {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/core/tools/__init__.py +0 -0
- {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/core/tools/base.py +0 -0
- {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/core/tools/bash.py +0 -0
- {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/core/tools/collection.py +0 -0
- {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/core/tools/computer.py +0 -0
- {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/core/tools/edit.py +0 -0
- {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/core/tools/manager.py +0 -0
- {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/__init__.py +0 -0
- {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/anthropic/__init__.py +0 -0
- {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/anthropic/api/client.py +0 -0
- {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/anthropic/api/logging.py +0 -0
- {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/anthropic/callbacks/manager.py +0 -0
- {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/anthropic/loop.py +0 -0
- {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/anthropic/messages/manager.py +0 -0
- {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/anthropic/prompts.py +0 -0
- {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/anthropic/tools/__init__.py +0 -0
- {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/anthropic/tools/base.py +0 -0
- {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/anthropic/tools/bash.py +0 -0
- {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/anthropic/tools/collection.py +0 -0
- {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/anthropic/tools/computer.py +0 -0
- {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/anthropic/tools/edit.py +0 -0
- {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/anthropic/tools/manager.py +0 -0
- {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/anthropic/tools/run.py +0 -0
- {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/anthropic/types.py +0 -0
- {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/omni/__init__.py +0 -0
- {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/omni/callbacks.py +0 -0
- {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/omni/clients/anthropic.py +0 -0
- {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/omni/clients/base.py +0 -0
- {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/omni/clients/groq.py +0 -0
- {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/omni/clients/openai.py +0 -0
- {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/omni/clients/utils.py +0 -0
- {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/omni/experiment.py +0 -0
- {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/omni/image_utils.py +0 -0
- {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/omni/messages.py +0 -0
- {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/omni/prompts.py +0 -0
- {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/omni/tool_manager.py +0 -0
- {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/omni/tools/__init__.py +0 -0
- {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/omni/tools/bash.py +0 -0
- {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/omni/tools/computer.py +0 -0
- {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/omni/tools/manager.py +0 -0
- {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/omni/utils.py +0 -0
- {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/omni/visualization.py +0 -0
- {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/types/__init__.py +0 -0
- {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/types/base.py +0 -0
- {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/types/messages.py +0 -0
- {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/types/tools.py +0 -0
- {cua_agent-0.1.2 → cua_agent-0.1.4}/tests/test_agent.py +0 -0
cua_agent-0.1.4/PKG-INFO
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: cua-agent
|
|
3
|
+
Version: 0.1.4
|
|
4
|
+
Summary: CUA (Computer Use) Agent for AI-driven computer interaction
|
|
5
|
+
Author-Email: TryCua <gh@trycua.com>
|
|
6
|
+
Requires-Python: <3.13,>=3.10
|
|
7
|
+
Requires-Dist: httpx<0.29.0,>=0.27.0
|
|
8
|
+
Requires-Dist: aiohttp<4.0.0,>=3.9.3
|
|
9
|
+
Requires-Dist: asyncio
|
|
10
|
+
Requires-Dist: anyio<5.0.0,>=4.4.1
|
|
11
|
+
Requires-Dist: typing-extensions<5.0.0,>=4.12.2
|
|
12
|
+
Requires-Dist: pydantic<3.0.0,>=2.6.4
|
|
13
|
+
Requires-Dist: rich<14.0.0,>=13.7.1
|
|
14
|
+
Requires-Dist: python-dotenv<2.0.0,>=1.0.1
|
|
15
|
+
Requires-Dist: cua-computer<0.2.0,>=0.1.0
|
|
16
|
+
Requires-Dist: cua-core<0.2.0,>=0.1.0
|
|
17
|
+
Requires-Dist: certifi>=2024.2.2
|
|
18
|
+
Provides-Extra: anthropic
|
|
19
|
+
Requires-Dist: anthropic>=0.49.0; extra == "anthropic"
|
|
20
|
+
Requires-Dist: boto3<2.0.0,>=1.35.81; extra == "anthropic"
|
|
21
|
+
Provides-Extra: som
|
|
22
|
+
Requires-Dist: torch>=2.2.1; extra == "som"
|
|
23
|
+
Requires-Dist: torchvision>=0.17.1; extra == "som"
|
|
24
|
+
Requires-Dist: ultralytics>=8.0.0; extra == "som"
|
|
25
|
+
Requires-Dist: transformers>=4.38.2; extra == "som"
|
|
26
|
+
Requires-Dist: cua-som<0.2.0,>=0.1.0; extra == "som"
|
|
27
|
+
Requires-Dist: anthropic<0.47.0,>=0.46.0; extra == "som"
|
|
28
|
+
Requires-Dist: boto3<2.0.0,>=1.35.81; extra == "som"
|
|
29
|
+
Requires-Dist: openai<2.0.0,>=1.14.0; extra == "som"
|
|
30
|
+
Requires-Dist: groq<0.5.0,>=0.4.0; extra == "som"
|
|
31
|
+
Requires-Dist: dashscope<2.0.0,>=1.13.0; extra == "som"
|
|
32
|
+
Requires-Dist: requests<3.0.0,>=2.31.0; extra == "som"
|
|
33
|
+
Provides-Extra: all
|
|
34
|
+
Requires-Dist: torch>=2.2.1; extra == "all"
|
|
35
|
+
Requires-Dist: torchvision>=0.17.1; extra == "all"
|
|
36
|
+
Requires-Dist: ultralytics>=8.0.0; extra == "all"
|
|
37
|
+
Requires-Dist: transformers>=4.38.2; extra == "all"
|
|
38
|
+
Requires-Dist: cua-som<0.2.0,>=0.1.0; extra == "all"
|
|
39
|
+
Requires-Dist: anthropic<0.47.0,>=0.46.0; extra == "all"
|
|
40
|
+
Requires-Dist: boto3<2.0.0,>=1.35.81; extra == "all"
|
|
41
|
+
Requires-Dist: openai<2.0.0,>=1.14.0; extra == "all"
|
|
42
|
+
Requires-Dist: groq<0.5.0,>=0.4.0; extra == "all"
|
|
43
|
+
Requires-Dist: dashscope<2.0.0,>=1.13.0; extra == "all"
|
|
44
|
+
Requires-Dist: requests<3.0.0,>=2.31.0; extra == "all"
|
|
45
|
+
Description-Content-Type: text/markdown
|
|
46
|
+
|
|
47
|
+
<div align="center">
|
|
48
|
+
<h1>
|
|
49
|
+
<div class="image-wrapper" style="display: inline-block;">
|
|
50
|
+
<picture>
|
|
51
|
+
<source media="(prefers-color-scheme: dark)" alt="logo" height="150" srcset="../../img/logo_white.png" style="display: block; margin: auto;">
|
|
52
|
+
<source media="(prefers-color-scheme: light)" alt="logo" height="150" srcset="../../img/logo_black.png" style="display: block; margin: auto;">
|
|
53
|
+
<img alt="Shows my svg">
|
|
54
|
+
</picture>
|
|
55
|
+
</div>
|
|
56
|
+
|
|
57
|
+
[](#)
|
|
58
|
+
[](#)
|
|
59
|
+
[](https://discord.com/invite/mVnXXpdE85)
|
|
60
|
+
[](https://pypi.org/project/cua-computer/)
|
|
61
|
+
</h1>
|
|
62
|
+
</div>
|
|
63
|
+
|
|
64
|
+
**Agent** is a Computer Use (CUA) framework for running multi-app agentic workflows in macOS and Linux sandboxes. It supports local (Ollama) and cloud model providers (OpenAI, Anthropic, Groq, DeepSeek, Qwen) and integrates with Microsoft's OmniParser for enhanced UI understanding and interaction.
|
|
65
|
+
|
|
66
|
+
### Get started with Agent
|
|
67
|
+
|
|
68
|
+
```python
|
|
69
|
+
from agent import ComputerAgent, AgentLoop, LLM, LLMProvider
|
|
70
|
+
from computer import Computer
|
|
71
|
+
|
|
72
|
+
computer = Computer(verbosity=logging.INFO)
|
|
73
|
+
|
|
74
|
+
agent = ComputerAgent(
|
|
75
|
+
computer=computer,
|
|
76
|
+
loop=AgentLoop.ANTHROPIC,
|
|
77
|
+
# loop=AgentLoop.OMNI,
|
|
78
|
+
model=LLM(provider=LLMProvider.ANTHROPIC, name="claude-3-7-sonnet-20250219"),
|
|
79
|
+
# model=LLM(provider=LLMProvider.OPENAI, name="gpt-4.5-preview"),
|
|
80
|
+
save_trajectory=True,
|
|
81
|
+
trajectory_dir=str(Path("trajectories")),
|
|
82
|
+
only_n_most_recent_images=3,
|
|
83
|
+
verbosity=logging.INFO,
|
|
84
|
+
)
|
|
85
|
+
|
|
86
|
+
tasks = [
|
|
87
|
+
"""
|
|
88
|
+
Please help me with the following task:
|
|
89
|
+
1. Open Safari browser
|
|
90
|
+
2. Go to Wikipedia.org
|
|
91
|
+
3. Search for "Claude AI"
|
|
92
|
+
4. Summarize the main points you find about Claude AI
|
|
93
|
+
"""
|
|
94
|
+
]
|
|
95
|
+
|
|
96
|
+
async with agent:
|
|
97
|
+
for i, task in enumerate(tasks, 1):
|
|
98
|
+
print(f"\nExecuting task {i}/{len(tasks)}: {task}")
|
|
99
|
+
async for result in agent.run(task):
|
|
100
|
+
print(result)
|
|
101
|
+
print(f"Task {i} completed")
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
## Install
|
|
105
|
+
|
|
106
|
+
### cua-agent
|
|
107
|
+
|
|
108
|
+
```bash
|
|
109
|
+
pip install "cua-agent[all]"
|
|
110
|
+
|
|
111
|
+
# or install specific loop providers
|
|
112
|
+
pip install "cua-agent[anthropic]"
|
|
113
|
+
pip install "cua-agent[som]"
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
## Run
|
|
117
|
+
|
|
118
|
+
Refer to these notebooks for step-by-step guides on how to use the Computer-Use Agent (CUA):
|
|
119
|
+
|
|
120
|
+
- [Agent Notebook](../../notebooks/agent_nb.ipynb) - Complete examples and workflows
|
|
@@ -60,67 +60,15 @@ async with agent:
|
|
|
60
60
|
### cua-agent
|
|
61
61
|
|
|
62
62
|
```bash
|
|
63
|
-
|
|
64
|
-
pip install cua-agent[all]
|
|
63
|
+
pip install "cua-agent[all]"
|
|
65
64
|
|
|
66
65
|
# or install specific loop providers
|
|
67
|
-
pip install cua-agent[anthropic]
|
|
68
|
-
pip install cua-agent[omni]
|
|
69
|
-
|
|
70
|
-
|
|
66
|
+
pip install "cua-agent[anthropic]"
|
|
67
|
+
pip install "cua-agent[som]"
|
|
71
68
|
```
|
|
72
69
|
|
|
73
|
-
## Features
|
|
74
|
-
|
|
75
|
-
### OmniParser Integration
|
|
76
|
-
- Enhanced UI understanding with element detection
|
|
77
|
-
- Automatic bounding box detection for UI elements
|
|
78
|
-
- Improved accuracy for complex UI interactions
|
|
79
|
-
- Support for icon and text element recognition
|
|
80
|
-
|
|
81
|
-
### Basic Computer Control
|
|
82
|
-
- Direct keyboard and mouse control
|
|
83
|
-
- Window and application management
|
|
84
|
-
- Screenshot capabilities
|
|
85
|
-
- Basic UI element detection
|
|
86
|
-
|
|
87
|
-
### Provider Support
|
|
88
|
-
- OpenAI (GPT-4V) - Recommended for OmniParser integration
|
|
89
|
-
- Anthropic (Claude) - Strong general performance
|
|
90
|
-
- Groq - Fast inference with Llama models
|
|
91
|
-
- DeepSeek - Alternative model provider
|
|
92
|
-
- Qwen - Alibaba's multimodal model
|
|
93
|
-
|
|
94
70
|
## Run
|
|
95
71
|
|
|
96
72
|
Refer to these notebooks for step-by-step guides on how to use the Computer-Use Agent (CUA):
|
|
97
73
|
|
|
98
|
-
- [Agent Notebook](../../notebooks/agent_nb.ipynb) - Complete examples and workflows
|
|
99
|
-
|
|
100
|
-
## Components
|
|
101
|
-
|
|
102
|
-
The library consists of several components:
|
|
103
|
-
|
|
104
|
-
- **Core**
|
|
105
|
-
- `ComputerAgent`: Unified agent class supporting multiple loop types
|
|
106
|
-
- `BaseComputerAgent`: Abstract base class for computer agents
|
|
107
|
-
|
|
108
|
-
- **Providers**
|
|
109
|
-
- `Anthropic`: Implementation for Anthropic Claude models
|
|
110
|
-
- `Omni`: Implementation for multiple providers (OpenAI, Groq, etc.)
|
|
111
|
-
|
|
112
|
-
- **Loops**
|
|
113
|
-
- `AnthropicLoop`: Loop implementation for Anthropic
|
|
114
|
-
- `OmniLoop`: Generic loop supporting multiple providers
|
|
115
|
-
|
|
116
|
-
## Configuration
|
|
117
|
-
|
|
118
|
-
The agent can be configured with various parameters:
|
|
119
|
-
|
|
120
|
-
- **loop_type**: The type of loop to use (ANTHROPIC or OMNI)
|
|
121
|
-
- **provider**: AI provider to use with the loop
|
|
122
|
-
- **model**: The AI model to use
|
|
123
|
-
- **save_trajectory**: Whether to save screenshots and logs
|
|
124
|
-
- **only_n_most_recent_images**: Only keep a specific number of recent images
|
|
125
|
-
|
|
126
|
-
See the [Core README](./agent/core/README.md) for more details on the unified agent.
|
|
74
|
+
- [Agent Notebook](../../notebooks/agent_nb.ipynb) - Complete examples and workflows
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
"""CUA (Computer Use) Agent for AI-driven computer interaction."""
|
|
2
|
+
|
|
3
|
+
import sys
|
|
4
|
+
import logging
|
|
5
|
+
|
|
6
|
+
from importlib.metadata import PackageNotFoundError, version as _distribution_version

# Resolve the version from the installed distribution metadata so that the
# value reported to telemetry cannot drift from the released package version.
try:
    __version__ = _distribution_version("cua-agent")
except PackageNotFoundError:
    # Not installed as a distribution (e.g. running from a source checkout):
    # fall back to the version this source tree was released as.
    __version__ = "0.1.4"

# Initialize logging
logger = logging.getLogger("cua.agent")

# Initialize telemetry when the package is imported.
# Telemetry is strictly best-effort: any failure here is logged and swallowed
# so that importing the package never fails because of telemetry.
try:
    # Basic telemetry primitives come from the external `core` package.
    from core.telemetry import (
        is_telemetry_enabled,
        flush,
        record_event,
    )

    # set_dimension comes from this package's own telemetry module.
    from .core.telemetry import set_dimension

    if is_telemetry_enabled():
        logger.info("Telemetry is enabled")

        # Record package initialization
        record_event(
            "module_init",
            {
                "module": "agent",
                "version": __version__,
                "python_version": sys.version,
            },
        )

        # Set the package version as a dimension
        set_dimension("agent_version", __version__)

        # Flush events to ensure they're sent
        flush()
    else:
        logger.info("Telemetry is disabled")
except ImportError as e:
    # Telemetry not available
    logger.warning(f"Telemetry not available: {e}")
except Exception as e:
    # Other issues with telemetry
    logger.warning(f"Error initializing telemetry: {e}")
|
|
50
|
+
|
|
51
|
+
from .core.factory import AgentFactory
|
|
52
|
+
from .core.agent import ComputerAgent
|
|
53
|
+
from .providers.omni.types import LLMProvider, LLM
|
|
54
|
+
from .types.base import Provider, AgentLoop
|
|
55
|
+
|
|
56
|
+
__all__ = ["AgentFactory", "Provider", "ComputerAgent", "AgentLoop", "LLMProvider", "LLM"]
|
|
@@ -0,0 +1,252 @@
|
|
|
1
|
+
"""Unified computer agent implementation that supports multiple loops."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import logging
|
|
5
|
+
import asyncio
|
|
6
|
+
import time
|
|
7
|
+
import uuid
|
|
8
|
+
from typing import Any, AsyncGenerator, Dict, List, Optional, TYPE_CHECKING, Union, cast
|
|
9
|
+
from datetime import datetime
|
|
10
|
+
from enum import Enum
|
|
11
|
+
|
|
12
|
+
from computer import Computer
|
|
13
|
+
|
|
14
|
+
from ..types.base import Provider, AgentLoop
|
|
15
|
+
from .base_agent import BaseComputerAgent
|
|
16
|
+
from ..core.telemetry import record_agent_initialization
|
|
17
|
+
|
|
18
|
+
# Only import types for type checking to avoid circular imports
|
|
19
|
+
if TYPE_CHECKING:
|
|
20
|
+
from ..providers.anthropic.loop import AnthropicLoop
|
|
21
|
+
from ..providers.omni.loop import OmniLoop
|
|
22
|
+
from ..providers.omni.parser import OmniParser
|
|
23
|
+
|
|
24
|
+
# Import the provider types
|
|
25
|
+
from ..providers.omni.types import LLMProvider, LLM, Model, LLMModel
|
|
26
|
+
|
|
27
|
+
logger = logging.getLogger(__name__)
|
|
28
|
+
|
|
29
|
+
# Default models for different providers
|
|
30
|
+
# Default model name per provider, used when the caller gives no model name.
DEFAULT_MODELS = {
    LLMProvider.OPENAI: "gpt-4o",
    LLMProvider.ANTHROPIC: "claude-3-7-sonnet-20250219",
}

# Map providers to their environment variable names
ENV_VARS = {
    LLMProvider.OPENAI: "OPENAI_API_KEY",
    LLMProvider.ANTHROPIC: "ANTHROPIC_API_KEY",
}


class ComputerAgent(BaseComputerAgent):
    """Unified implementation of the computer agent supporting multiple loop types.

    This class consolidates the previous AnthropicComputerAgent and OmniComputerAgent
    into a single implementation with configurable loop type.
    """

    def __init__(
        self,
        computer: Computer,
        loop: AgentLoop = AgentLoop.OMNI,
        model: Optional[Union[LLM, Dict[str, str], str]] = None,
        api_key: Optional[str] = None,
        save_trajectory: bool = True,
        trajectory_dir: Optional[str] = "trajectories",
        only_n_most_recent_images: Optional[int] = None,
        max_retries: int = 3,
        verbosity: int = logging.INFO,
        telemetry_enabled: bool = True,
        **kwargs,
    ):
        """Initialize a ComputerAgent instance.

        Args:
            computer: The Computer instance to control
            loop: The agent loop to use: ANTHROPIC or OMNI
            model: The model to use. Can be a string, dict or LLM object.
                Defaults to LLM for the loop type.
            api_key: The API key to use. It is stored and forwarded to the loop as-is.
            save_trajectory: Whether to save the trajectory.
            trajectory_dir: The directory to save trajectories to.
            only_n_most_recent_images: Only keep this many most recent images.
            max_retries: Maximum number of retries for failed requests.
            verbosity: Logging level (standard Python logging levels).
            telemetry_enabled: Whether to enable telemetry tracking. Defaults to True.
            **kwargs: Additional keyword arguments to pass to the loop.

        Raises:
            ValueError: If the model configuration is unsupported, or if the OMNI
                loop is used with a provider that has no default model and no
                model name was given.
        """
        super().__init__(computer)
        self._configure_logging(verbosity)
        logger.info(f"Initializing ComputerAgent with {loop} loop")

        # Store telemetry preference
        self.telemetry_enabled = telemetry_enabled

        # Process the model configuration
        self.model = self._process_model_config(model, loop)
        self.loop_type = loop
        self.api_key = api_key

        # Store computer
        self.computer = computer

        # Save trajectory settings
        self.save_trajectory = save_trajectory
        self.trajectory_dir = trajectory_dir
        self.only_n_most_recent_images = only_n_most_recent_images

        # Store the max retries setting
        self.max_retries = max_retries

        # Initialize message history
        self.messages = []

        # Extra kwargs for the loop
        self.loop_kwargs = kwargs

        # Initialize the actual loop implementation
        self.loop = self._init_loop()

        # Record initialization in telemetry if enabled
        if telemetry_enabled:
            record_agent_initialization()

    def _process_model_config(
        self, model_input: Optional[Union[LLM, Dict[str, str], str]], loop: AgentLoop
    ) -> LLM:
        """Process and normalize model configuration.

        Args:
            model_input: Input model configuration (LLM, dict, string, or None)
            loop: The loop type being used

        Returns:
            Normalized LLM instance

        Raises:
            ValueError: If model_input is not None, an LLM, a dict or a string.
        """
        # Provider used whenever the caller does not name one explicitly:
        # Anthropic for the Anthropic loop, OpenAI for every other loop.
        loop_default_provider = (
            LLMProvider.ANTHROPIC if loop == AgentLoop.ANTHROPIC else LLMProvider.OPENAI
        )

        # No model given: fall back to the loop's default provider
        if model_input is None:
            return LLM(provider=loop_default_provider)

        # Already an LLM (or one of its aliases): use it unchanged
        if isinstance(model_input, (LLM, Model, LLMModel)):
            return model_input

        # Dict form: {"provider": ..., "name": ...}; both keys are optional.
        # A missing provider follows the loop default, consistent with the
        # None and string forms.
        if isinstance(model_input, dict):
            provider = model_input.get("provider", loop_default_provider)
            if isinstance(provider, str):
                provider = LLMProvider(provider)
            return LLM(provider=provider, name=model_input.get("name"))

        # String form: a bare model name for the loop's default provider
        if isinstance(model_input, str):
            return LLM(provider=loop_default_provider, name=model_input)

        raise ValueError(f"Unsupported model configuration: {model_input}")

    def _configure_logging(self, verbosity: int):
        """Configure logging based on verbosity level.

        Args:
            verbosity: A standard Python logging level; applied to this module's
                logger and to the "agent" logger.
        """
        # Use the logging level directly without mapping
        logger.setLevel(verbosity)
        logging.getLogger("agent").setLevel(verbosity)

        # Log the verbosity level that was set. The announcement is emitted at a
        # level that is still visible under the level just configured.
        if verbosity <= logging.DEBUG:
            logger.info("Agent logging set to DEBUG level (full debug information)")
        elif verbosity <= logging.INFO:
            logger.info("Agent logging set to INFO level (standard output)")
        elif verbosity <= logging.WARNING:
            logger.warning("Agent logging set to WARNING level (warnings and errors only)")
        elif verbosity <= logging.ERROR:
            logger.warning("Agent logging set to ERROR level (errors only)")
        elif verbosity <= logging.CRITICAL:
            logger.warning("Agent logging set to CRITICAL level (critical errors only)")

    def _init_loop(self) -> Any:
        """Initialize the loop based on the loop_type.

        Provider modules are imported lazily, and only for the loop that is
        actually selected, so the Anthropic loop does not require the Omni
        modules to be importable.

        Returns:
            Initialized loop instance

        Raises:
            ValueError: If the OMNI loop is selected, no model name was given and
                the provider has no entry in DEFAULT_MODELS.
        """
        if self.loop_type == AgentLoop.ANTHROPIC:
            from ..providers.anthropic.loop import AnthropicLoop

            # Ensure we always have a valid model name
            model_name = self.model.name or DEFAULT_MODELS[LLMProvider.ANTHROPIC]

            return AnthropicLoop(
                api_key=self.api_key,
                model=model_name,
                computer=self.computer,
                save_trajectory=self.save_trajectory,
                base_dir=self.trajectory_dir,
                only_n_most_recent_images=self.only_n_most_recent_images,
                **self.loop_kwargs,
            )

        # Lazy import OmniLoop and OmniParser to avoid circular imports
        from ..providers.omni.loop import OmniLoop
        from ..providers.omni.parser import OmniParser

        # Resolve the model name before building a parser, so that a
        # misconfiguration fails fast with an actionable message instead of a
        # bare KeyError.
        model_name = self.model.name or DEFAULT_MODELS.get(self.model.provider)
        if not model_name:
            raise ValueError(
                f"No default model is configured for provider {self.model.provider}; "
                "pass an explicit model name"
            )

        # Create a parser for OmniLoop unless the caller supplied one
        if "parser" not in self.loop_kwargs:
            self.loop_kwargs["parser"] = OmniParser()

        return OmniLoop(
            provider=self.model.provider,
            api_key=self.api_key,
            model=model_name,
            computer=self.computer,
            save_trajectory=self.save_trajectory,
            base_dir=self.trajectory_dir,
            only_n_most_recent_images=self.only_n_most_recent_images,
            **self.loop_kwargs,
        )

    async def _execute_task(self, task: str) -> AsyncGenerator[Dict[str, Any], None]:
        """Execute a task using the appropriate agent loop.

        The task is sent to the loop as a user message appended to a copy of the
        stored message history; self.messages itself is not modified here.

        Args:
            task: The task to execute

        Returns:
            AsyncGenerator yielding task outputs

        Raises:
            Exception: Any error raised by the loop is logged and re-raised.
        """
        logger.info(f"Executing task: {task}")

        try:
            # Create a message from the task
            task_message = {"role": "user", "content": task}
            messages_with_task = [*self.messages, task_message]

            # Use the run method of the loop
            async for output in self.loop.run(messages_with_task):
                yield output
        except Exception as e:
            logger.error(f"Error executing task: {e}")
            raise

    async def _execute_action(self, action_type: str, **action_params) -> Any:
        """Execute an action via the base class, logging any failure.

        Args:
            action_type: Name of the action to execute
            **action_params: Parameters forwarded to the base implementation

        Returns:
            Whatever the base class implementation returns

        Raises:
            Exception: Any error from the base implementation is logged with its
                traceback and re-raised.
        """
        try:
            return await super()._execute_action(action_type, **action_params)
        except Exception as e:
            logger.exception(f"Error executing action {action_type}: {e}")
            raise
|
|
@@ -113,7 +113,7 @@ class BaseComputerAgent(ABC):
|
|
|
113
113
|
# Take a test screenshot to verify the computer is working
|
|
114
114
|
logger.info("Testing computer with a screenshot...")
|
|
115
115
|
try:
|
|
116
|
-
test_screenshot = await self.computer.screenshot()
|
|
116
|
+
test_screenshot = await self.computer.interface.screenshot()
|
|
117
117
|
# Determine the screenshot size based on its type
|
|
118
118
|
if isinstance(test_screenshot, bytes):
|
|
119
119
|
size = len(test_screenshot)
|
|
@@ -8,6 +8,7 @@ from datetime import datetime
|
|
|
8
8
|
from typing import Any, Dict, List, Optional
|
|
9
9
|
from PIL import Image
|
|
10
10
|
import json
|
|
11
|
+
import re
|
|
11
12
|
|
|
12
13
|
logger = logging.getLogger(__name__)
|
|
13
14
|
|
|
@@ -106,9 +107,18 @@ class ExperimentManager:
|
|
|
106
107
|
# Increment screenshot counter
|
|
107
108
|
self.screenshot_count += 1
|
|
108
109
|
|
|
110
|
+
# Sanitize action_type to ensure valid filename
|
|
111
|
+
# Replace characters that are not safe for filenames
|
|
112
|
+
sanitized_action = ""
|
|
113
|
+
if action_type:
|
|
114
|
+
# Replace invalid filename characters with underscores
|
|
115
|
+
sanitized_action = re.sub(r'[\\/*?:"<>|]', "_", action_type)
|
|
116
|
+
# Limit the length to avoid excessively long filenames
|
|
117
|
+
sanitized_action = sanitized_action[:50]
|
|
118
|
+
|
|
109
119
|
# Create a descriptive filename
|
|
110
120
|
timestamp = int(datetime.now().timestamp() * 1000)
|
|
111
|
-
action_suffix = f"_{
|
|
121
|
+
action_suffix = f"_{sanitized_action}" if sanitized_action else ""
|
|
112
122
|
filename = f"screenshot_{self.screenshot_count:03d}{action_suffix}_{timestamp}.png"
|
|
113
123
|
|
|
114
124
|
# Save directly to the turn directory
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
"""Agent telemetry for tracking anonymous usage and feature usage."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import os
|
|
5
|
+
import platform
|
|
6
|
+
import sys
|
|
7
|
+
from typing import Dict, Any
|
|
8
|
+
|
|
9
|
+
# Logger shared by everything in this module.
logger = logging.getLogger("cua.agent.telemetry")


def _noop(*args: Any, **kwargs: Any) -> None:
    """Accept any arguments and do nothing; stand-in when telemetry is unavailable."""


# Bind the telemetry entry points: the real ones when the core telemetry module
# can be imported, otherwise no-op stand-ins so callers need no guard.
try:
    from core.telemetry import (
        record_event,
        increment,
        get_telemetry_client,
        flush,
        is_telemetry_enabled,
        is_telemetry_globally_disabled,
    )
except ImportError as import_error:
    TELEMETRY_AVAILABLE = False
    logger.warning(f"Could not import telemetry: {import_error}")

    # Local fallbacks in case core telemetry isn't available
    logger.debug("Telemetry not available, using no-op functions")
    record_event = _noop  # type: ignore
    increment_counter = _noop  # type: ignore
    set_dimension = _noop  # type: ignore
    get_telemetry_client = lambda: None  # type: ignore
    flush = _noop  # type: ignore
    is_telemetry_enabled = lambda: False  # type: ignore
    is_telemetry_globally_disabled = lambda: True  # type: ignore
else:

    def increment_counter(counter_name: str, value: int = 1) -> None:
        """Forward to the core ``increment`` when telemetry is enabled.

        Kept as a wrapper to maintain backward compatibility.
        """
        if not is_telemetry_enabled():
            return
        increment(counter_name, value)

    def set_dimension(name: str, value: Any) -> None:
        """Log the requested dimension at debug level.

        NOTE(review): this only logs; nothing here forwards the dimension to
        the telemetry client - confirm whether that is intended.
        """
        logger.debug(f"Setting dimension {name}={value}")

    TELEMETRY_AVAILABLE = True
    logger.info("Successfully imported telemetry")

# System details collected once at import time for use in telemetry
SYSTEM_INFO = dict(
    os=platform.system().lower(),
    os_version=platform.release(),
    python_version=platform.python_version(),
)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def enable_telemetry() -> bool:
    """Enable telemetry if available.

    If the core telemetry module was not importable when this module loaded,
    the import is retried here. On success the module-level telemetry entry
    points are rebound to the real core functions, replacing the no-op
    stand-ins. ``increment_counter`` and ``set_dimension`` are not rebound.

    Returns:
        bool: True if telemetry was successfully enabled, False otherwise
    """
    # All of these are module-level names. Declaring them global matters twice:
    # the early check below must read the module-level
    # is_telemetry_globally_disabled (a function-scope import of that name would
    # make it local and raise UnboundLocalError), and the rebinding at the end
    # must replace the module-level stand-ins, not create locals.
    global TELEMETRY_AVAILABLE
    global record_event, increment, get_telemetry_client, flush
    global is_telemetry_globally_disabled

    if TELEMETRY_AVAILABLE:
        # Check if globally disabled using core function
        if is_telemetry_globally_disabled():
            logger.info("Telemetry is globally disabled via environment variable - cannot enable")
            return False

        # Already enabled
        return True

    # Try to import and enable. Import under private aliases so that nothing
    # module-level changes unless enabling actually succeeds.
    try:
        from core.telemetry import (
            record_event as core_record_event,
            increment as core_increment,
            get_telemetry_client as core_get_telemetry_client,
            flush as core_flush,
            is_telemetry_globally_disabled as core_is_globally_disabled,
        )
    except ImportError as e:
        logger.warning(f"Could not enable telemetry: {e}")
        return False

    # Check again after import
    if core_is_globally_disabled():
        logger.info("Telemetry is globally disabled via environment variable - cannot enable")
        return False

    record_event = core_record_event
    increment = core_increment
    get_telemetry_client = core_get_telemetry_client
    flush = core_flush
    is_telemetry_globally_disabled = core_is_globally_disabled

    TELEMETRY_AVAILABLE = True
    logger.info("Telemetry successfully enabled")
    return True
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def is_telemetry_enabled() -> bool:
    """Report whether telemetry is currently enabled.

    Returns:
        bool: False when core telemetry is unavailable; otherwise whatever the
        core ``is_telemetry_enabled`` function reports.
    """
    if not TELEMETRY_AVAILABLE:
        return False

    # Imported at call time under an alias, because this function has the same
    # name as the core function it delegates to.
    from core.telemetry import is_telemetry_enabled as core_is_enabled

    return core_is_enabled()
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def record_agent_initialization() -> None:
    """Record an ``agent_initialized`` event when telemetry is available and enabled.

    After the event, each SYSTEM_INFO entry (os, os_version, python_version)
    is also passed to ``set_dimension``. Does nothing when telemetry is
    unavailable or disabled.
    """
    if not (TELEMETRY_AVAILABLE and is_telemetry_enabled()):
        return

    record_event("agent_initialized", SYSTEM_INFO)

    # Set dimensions that will be attached to all events
    for dimension in ("os", "os_version", "python_version"):
        set_dimension(dimension, SYSTEM_INFO[dimension])
|