cua-agent 0.1.2__tar.gz → 0.1.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic. Click here for more details.

Files changed (70) hide show
  1. cua_agent-0.1.4/PKG-INFO +120 -0
  2. {cua_agent-0.1.2 → cua_agent-0.1.4}/README.md +4 -56
  3. cua_agent-0.1.4/agent/__init__.py +56 -0
  4. cua_agent-0.1.4/agent/core/agent.py +252 -0
  5. {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/core/base_agent.py +1 -1
  6. {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/core/experiment.py +11 -1
  7. {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/core/loop.py +1 -1
  8. cua_agent-0.1.4/agent/core/telemetry.py +130 -0
  9. {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/omni/loop.py +2 -2
  10. {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/omni/parser.py +1 -1
  11. {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/omni/types.py +0 -6
  12. cua_agent-0.1.4/agent/telemetry.py +21 -0
  13. {cua_agent-0.1.2 → cua_agent-0.1.4}/pyproject.toml +5 -3
  14. cua_agent-0.1.2/PKG-INFO +0 -44
  15. cua_agent-0.1.2/agent/__init__.py +0 -10
  16. cua_agent-0.1.2/agent/core/agent.py +0 -327
  17. {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/README.md +0 -0
  18. {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/core/README.md +0 -0
  19. {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/core/__init__.py +0 -0
  20. {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/core/callbacks.py +0 -0
  21. {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/core/computer_agent.py +0 -0
  22. {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/core/factory.py +0 -0
  23. {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/core/messages.py +0 -0
  24. {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/core/tools/__init__.py +0 -0
  25. {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/core/tools/base.py +0 -0
  26. {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/core/tools/bash.py +0 -0
  27. {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/core/tools/collection.py +0 -0
  28. {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/core/tools/computer.py +0 -0
  29. {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/core/tools/edit.py +0 -0
  30. {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/core/tools/manager.py +0 -0
  31. {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/__init__.py +0 -0
  32. {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/anthropic/__init__.py +0 -0
  33. {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/anthropic/api/client.py +0 -0
  34. {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/anthropic/api/logging.py +0 -0
  35. {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/anthropic/callbacks/manager.py +0 -0
  36. {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/anthropic/loop.py +0 -0
  37. {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/anthropic/messages/manager.py +0 -0
  38. {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/anthropic/prompts.py +0 -0
  39. {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/anthropic/tools/__init__.py +0 -0
  40. {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/anthropic/tools/base.py +0 -0
  41. {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/anthropic/tools/bash.py +0 -0
  42. {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/anthropic/tools/collection.py +0 -0
  43. {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/anthropic/tools/computer.py +0 -0
  44. {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/anthropic/tools/edit.py +0 -0
  45. {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/anthropic/tools/manager.py +0 -0
  46. {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/anthropic/tools/run.py +0 -0
  47. {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/anthropic/types.py +0 -0
  48. {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/omni/__init__.py +0 -0
  49. {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/omni/callbacks.py +0 -0
  50. {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/omni/clients/anthropic.py +0 -0
  51. {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/omni/clients/base.py +0 -0
  52. {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/omni/clients/groq.py +0 -0
  53. {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/omni/clients/openai.py +0 -0
  54. {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/omni/clients/utils.py +0 -0
  55. {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/omni/experiment.py +0 -0
  56. {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/omni/image_utils.py +0 -0
  57. {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/omni/messages.py +0 -0
  58. {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/omni/prompts.py +0 -0
  59. {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/omni/tool_manager.py +0 -0
  60. {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/omni/tools/__init__.py +0 -0
  61. {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/omni/tools/bash.py +0 -0
  62. {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/omni/tools/computer.py +0 -0
  63. {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/omni/tools/manager.py +0 -0
  64. {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/omni/utils.py +0 -0
  65. {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/providers/omni/visualization.py +0 -0
  66. {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/types/__init__.py +0 -0
  67. {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/types/base.py +0 -0
  68. {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/types/messages.py +0 -0
  69. {cua_agent-0.1.2 → cua_agent-0.1.4}/agent/types/tools.py +0 -0
  70. {cua_agent-0.1.2 → cua_agent-0.1.4}/tests/test_agent.py +0 -0
@@ -0,0 +1,120 @@
1
+ Metadata-Version: 2.1
2
+ Name: cua-agent
3
+ Version: 0.1.4
4
+ Summary: CUA (Computer Use) Agent for AI-driven computer interaction
5
+ Author-Email: TryCua <gh@trycua.com>
6
+ Requires-Python: <3.13,>=3.10
7
+ Requires-Dist: httpx<0.29.0,>=0.27.0
8
+ Requires-Dist: aiohttp<4.0.0,>=3.9.3
9
+ Requires-Dist: asyncio
10
+ Requires-Dist: anyio<5.0.0,>=4.4.1
11
+ Requires-Dist: typing-extensions<5.0.0,>=4.12.2
12
+ Requires-Dist: pydantic<3.0.0,>=2.6.4
13
+ Requires-Dist: rich<14.0.0,>=13.7.1
14
+ Requires-Dist: python-dotenv<2.0.0,>=1.0.1
15
+ Requires-Dist: cua-computer<0.2.0,>=0.1.0
16
+ Requires-Dist: cua-core<0.2.0,>=0.1.0
17
+ Requires-Dist: certifi>=2024.2.2
18
+ Provides-Extra: anthropic
19
+ Requires-Dist: anthropic>=0.49.0; extra == "anthropic"
20
+ Requires-Dist: boto3<2.0.0,>=1.35.81; extra == "anthropic"
21
+ Provides-Extra: som
22
+ Requires-Dist: torch>=2.2.1; extra == "som"
23
+ Requires-Dist: torchvision>=0.17.1; extra == "som"
24
+ Requires-Dist: ultralytics>=8.0.0; extra == "som"
25
+ Requires-Dist: transformers>=4.38.2; extra == "som"
26
+ Requires-Dist: cua-som<0.2.0,>=0.1.0; extra == "som"
27
+ Requires-Dist: anthropic<0.47.0,>=0.46.0; extra == "som"
28
+ Requires-Dist: boto3<2.0.0,>=1.35.81; extra == "som"
29
+ Requires-Dist: openai<2.0.0,>=1.14.0; extra == "som"
30
+ Requires-Dist: groq<0.5.0,>=0.4.0; extra == "som"
31
+ Requires-Dist: dashscope<2.0.0,>=1.13.0; extra == "som"
32
+ Requires-Dist: requests<3.0.0,>=2.31.0; extra == "som"
33
+ Provides-Extra: all
34
+ Requires-Dist: torch>=2.2.1; extra == "all"
35
+ Requires-Dist: torchvision>=0.17.1; extra == "all"
36
+ Requires-Dist: ultralytics>=8.0.0; extra == "all"
37
+ Requires-Dist: transformers>=4.38.2; extra == "all"
38
+ Requires-Dist: cua-som<0.2.0,>=0.1.0; extra == "all"
39
+ Requires-Dist: anthropic<0.47.0,>=0.46.0; extra == "all"
40
+ Requires-Dist: boto3<2.0.0,>=1.35.81; extra == "all"
41
+ Requires-Dist: openai<2.0.0,>=1.14.0; extra == "all"
42
+ Requires-Dist: groq<0.5.0,>=0.4.0; extra == "all"
43
+ Requires-Dist: dashscope<2.0.0,>=1.13.0; extra == "all"
44
+ Requires-Dist: requests<3.0.0,>=2.31.0; extra == "all"
45
+ Description-Content-Type: text/markdown
46
+
47
+ <div align="center">
48
+ <h1>
49
+ <div class="image-wrapper" style="display: inline-block;">
50
+ <picture>
51
+ <source media="(prefers-color-scheme: dark)" alt="logo" height="150" srcset="../../img/logo_white.png" style="display: block; margin: auto;">
52
+ <source media="(prefers-color-scheme: light)" alt="logo" height="150" srcset="../../img/logo_black.png" style="display: block; margin: auto;">
53
+ <img alt="Shows my svg">
54
+ </picture>
55
+ </div>
56
+
57
+ [![Python](https://img.shields.io/badge/Python-333333?logo=python&logoColor=white&labelColor=333333)](#)
58
+ [![macOS](https://img.shields.io/badge/macOS-000000?logo=apple&logoColor=F0F0F0)](#)
59
+ [![Discord](https://img.shields.io/badge/Discord-%235865F2.svg?&logo=discord&logoColor=white)](https://discord.com/invite/mVnXXpdE85)
60
+ [![PyPI](https://img.shields.io/pypi/v/cua-agent?color=333333)](https://pypi.org/project/cua-agent/)
61
+ </h1>
62
+ </div>
63
+
64
+ **Agent** is a Computer Use (CUA) framework for running multi-app agentic workflows targeting macOS and Linux sandboxes, supporting local (Ollama) and cloud model providers (OpenAI, Anthropic, Groq, DeepSeek, Qwen). The framework integrates with Microsoft's OmniParser for enhanced UI understanding and interaction.
65
+
66
+ ### Get started with Agent
67
+
68
+ ```python
69
+ from agent import ComputerAgent, AgentLoop, LLMProvider, LLM
70
+ from computer import Computer
71
+
72
+ computer = Computer(verbosity=logging.INFO)
73
+
74
+ agent = ComputerAgent(
75
+ computer=computer,
76
+ loop=AgentLoop.ANTHROPIC,
77
+ # loop=AgentLoop.OMNI,
78
+ model=LLM(provider=LLMProvider.ANTHROPIC, name="claude-3-7-sonnet-20250219"),
79
+ # model=LLM(provider=LLMProvider.OPENAI, name="gpt-4.5-preview"),
80
+ save_trajectory=True,
81
+ trajectory_dir=str(Path("trajectories")),
82
+ only_n_most_recent_images=3,
83
+ verbosity=logging.INFO,
84
+ )
85
+
86
+ tasks = [
87
+ """
88
+ Please help me with the following task:
89
+ 1. Open Safari browser
90
+ 2. Go to Wikipedia.org
91
+ 3. Search for "Claude AI"
92
+ 4. Summarize the main points you find about Claude AI
93
+ """
94
+ ]
95
+
96
+ async with agent:
97
+ for i, task in enumerate(tasks, 1):
98
+ print(f"\nExecuting task {i}/{len(tasks)}: {task}")
99
+ async for result in agent.run(task):
100
+ print(result)
101
+ print(f"Task {i} completed")
102
+ ```
103
+
104
+ ## Install
105
+
106
+ ### cua-agent
107
+
108
+ ```bash
109
+ pip install "cua-agent[all]"
110
+
111
+ # or install specific loop providers
112
+ pip install "cua-agent[anthropic]"
113
+ pip install "cua-agent[som]"
114
+ ```
115
+
116
+ ## Run
117
+
118
+ Refer to these notebooks for step-by-step guides on how to use the Computer-Use Agent (CUA):
119
+
120
+ - [Agent Notebook](../../notebooks/agent_nb.ipynb) - Complete examples and workflows
@@ -60,67 +60,15 @@ async with agent:
60
60
  ### cua-agent
61
61
 
62
62
  ```bash
63
-
64
- pip install cua-agent[all]
63
+ pip install "cua-agent[all]"
65
64
 
66
65
  # or install specific loop providers
67
- pip install cua-agent[anthropic]
68
- pip install cua-agent[omni]
69
-
70
-
66
+ pip install "cua-agent[anthropic]"
67
+ pip install "cua-agent[som]"
71
68
  ```
72
69
 
73
- ## Features
74
-
75
- ### OmniParser Integration
76
- - Enhanced UI understanding with element detection
77
- - Automatic bounding box detection for UI elements
78
- - Improved accuracy for complex UI interactions
79
- - Support for icon and text element recognition
80
-
81
- ### Basic Computer Control
82
- - Direct keyboard and mouse control
83
- - Window and application management
84
- - Screenshot capabilities
85
- - Basic UI element detection
86
-
87
- ### Provider Support
88
- - OpenAI (GPT-4V) - Recommended for OmniParser integration
89
- - Anthropic (Claude) - Strong general performance
90
- - Groq - Fast inference with Llama models
91
- - DeepSeek - Alternative model provider
92
- - Qwen - Alibaba's multimodal model
93
-
94
70
  ## Run
95
71
 
96
72
  Refer to these notebooks for step-by-step guides on how to use the Computer-Use Agent (CUA):
97
73
 
98
- - [Agent Notebook](../../notebooks/agent_nb.ipynb) - Complete examples and workflows
99
-
100
- ## Components
101
-
102
- The library consists of several components:
103
-
104
- - **Core**
105
- - `ComputerAgent`: Unified agent class supporting multiple loop types
106
- - `BaseComputerAgent`: Abstract base class for computer agents
107
-
108
- - **Providers**
109
- - `Anthropic`: Implementation for Anthropic Claude models
110
- - `Omni`: Implementation for multiple providers (OpenAI, Groq, etc.)
111
-
112
- - **Loops**
113
- - `AnthropicLoop`: Loop implementation for Anthropic
114
- - `OmniLoop`: Generic loop supporting multiple providers
115
-
116
- ## Configuration
117
-
118
- The agent can be configured with various parameters:
119
-
120
- - **loop_type**: The type of loop to use (ANTHROPIC or OMNI)
121
- - **provider**: AI provider to use with the loop
122
- - **model**: The AI model to use
123
- - **save_trajectory**: Whether to save screenshots and logs
124
- - **only_n_most_recent_images**: Only keep a specific number of recent images
125
-
126
- See the [Core README](./agent/core/README.md) for more details on the unified agent.
74
+ - [Agent Notebook](../../notebooks/agent_nb.ipynb) - Complete examples and workflows
@@ -0,0 +1,56 @@
"""CUA (Computer Use) Agent for AI-driven computer interaction."""

import logging
import sys
from importlib.metadata import PackageNotFoundError, version as _dist_version

# Report the version of the installed distribution so that the value sent to
# telemetry cannot drift from the packaging metadata.
try:
    __version__ = _dist_version("cua-agent")
except PackageNotFoundError:
    # Not installed as a distribution (e.g. imported from a source checkout).
    __version__ = "0.1.4"

# Initialize logging
logger = logging.getLogger("cua.agent")

# Initialize telemetry when the package is imported. Every failure here is
# downgraded to a warning so that a telemetry problem never prevents the
# package itself from being imported.
try:
    # Absolute import of the top-level ``core`` package (a separate
    # distribution, presumably cua-core - confirm against the dependency list).
    from core.telemetry import (
        is_telemetry_enabled,
        flush,
        record_event,
    )

    # set_dimension comes from this package's own telemetry module.
    from .core.telemetry import set_dimension

    if is_telemetry_enabled():
        logger.info("Telemetry is enabled")

        # Record package initialization
        record_event(
            "module_init",
            {
                "module": "agent",
                "version": __version__,
                "python_version": sys.version,
            },
        )

        # Set the package version as a dimension
        set_dimension("agent_version", __version__)

        # Flush events to ensure they're sent
        flush()
    else:
        logger.info("Telemetry is disabled")
except ImportError as e:
    # Telemetry not available
    logger.warning("Telemetry not available: %s", e)
except Exception as e:
    # Other issues with telemetry
    logger.warning("Error initializing telemetry: %s", e)

# Public API re-exports. These run after the telemetry bootstrap, matching the
# original import order.
from .core.factory import AgentFactory
from .core.agent import ComputerAgent
from .providers.omni.types import LLMProvider, LLM
from .types.base import Provider, AgentLoop

__all__ = ["AgentFactory", "Provider", "ComputerAgent", "AgentLoop", "LLMProvider", "LLM"]
@@ -0,0 +1,252 @@
1
+ """Unified computer agent implementation that supports multiple loops."""
2
+
3
+ import os
4
+ import logging
5
+ import asyncio
6
+ import time
7
+ import uuid
8
+ from typing import Any, AsyncGenerator, Dict, List, Optional, TYPE_CHECKING, Union, cast
9
+ from datetime import datetime
10
+ from enum import Enum
11
+
12
+ from computer import Computer
13
+
14
+ from ..types.base import Provider, AgentLoop
15
+ from .base_agent import BaseComputerAgent
16
+ from ..core.telemetry import record_agent_initialization
17
+
18
+ # Only import types for type checking to avoid circular imports
19
+ if TYPE_CHECKING:
20
+ from ..providers.anthropic.loop import AnthropicLoop
21
+ from ..providers.omni.loop import OmniLoop
22
+ from ..providers.omni.parser import OmniParser
23
+
24
+ # Import the provider types
25
+ from ..providers.omni.types import LLMProvider, LLM, Model, LLMModel
26
+
27
+ logger = logging.getLogger(__name__)
28
+
29
+ # Default models for different providers
30
+ DEFAULT_MODELS = {
31
+ LLMProvider.OPENAI: "gpt-4o",
32
+ LLMProvider.ANTHROPIC: "claude-3-7-sonnet-20250219",
33
+ }
34
+
35
+ # Map providers to their environment variable names
36
+ ENV_VARS = {
37
+ LLMProvider.OPENAI: "OPENAI_API_KEY",
38
+ LLMProvider.ANTHROPIC: "ANTHROPIC_API_KEY",
39
+ }
40
+
41
+
class ComputerAgent(BaseComputerAgent):
    """Unified implementation of the computer agent supporting multiple loop types.

    This class consolidates the previous AnthropicComputerAgent and OmniComputerAgent
    into a single implementation with configurable loop type.
    """

    def __init__(
        self,
        computer: Computer,
        loop: AgentLoop = AgentLoop.OMNI,
        model: Optional[Union[LLM, Dict[str, str], str]] = None,
        api_key: Optional[str] = None,
        save_trajectory: bool = True,
        trajectory_dir: Optional[str] = "trajectories",
        only_n_most_recent_images: Optional[int] = None,
        max_retries: int = 3,
        verbosity: int = logging.INFO,
        telemetry_enabled: bool = True,
        **kwargs,
    ):
        """Initialize a ComputerAgent instance.

        Args:
            computer: The Computer instance to control
            loop: The agent loop to use: ANTHROPIC or OMNI
            model: The model to use. Can be a string, dict or LLM object.
                Defaults to LLM for the loop type.
            api_key: The API key, forwarded unchanged to the loop.
                NOTE(review): this class does not read ENV_VARS itself; any
                environment-variable fallback presumably happens inside the
                loop implementation - confirm.
            save_trajectory: Whether to save the trajectory.
            trajectory_dir: The directory to save trajectories to.
            only_n_most_recent_images: Only keep this many most recent images.
            max_retries: Maximum number of retries for failed requests.
            verbosity: Logging level (standard Python logging levels).
            telemetry_enabled: Whether to enable telemetry tracking. Defaults to True.
            **kwargs: Additional keyword arguments to pass to the loop.

        Raises:
            ValueError: If ``model`` has an unsupported type, or if no model
                name was given and the provider has no default model.
        """
        super().__init__(computer)
        self._configure_logging(verbosity)
        logger.info(f"Initializing ComputerAgent with {loop} loop")

        # Store telemetry preference
        self.telemetry_enabled = telemetry_enabled

        # Process the model configuration
        self.model = self._process_model_config(model, loop)
        self.loop_type = loop
        self.api_key = api_key

        # Store computer
        self.computer = computer

        # Save trajectory settings
        self.save_trajectory = save_trajectory
        self.trajectory_dir = trajectory_dir
        self.only_n_most_recent_images = only_n_most_recent_images

        # Store the max retries setting
        self.max_retries = max_retries

        # Initialize message history
        self.messages = []

        # Extra kwargs for the loop
        self.loop_kwargs = kwargs

        # Initialize the actual loop implementation. This must come last: it
        # reads the attributes assigned above.
        self.loop = self._init_loop()

        # Record initialization in telemetry if enabled
        if telemetry_enabled:
            record_agent_initialization()

    def _process_model_config(
        self, model_input: Optional[Union[LLM, Dict[str, str], str]], loop: AgentLoop
    ) -> LLM:
        """Process and normalize model configuration.

        Args:
            model_input: Input model configuration (LLM, dict, string, or None)
            loop: The loop type being used

        Returns:
            Normalized LLM instance

        Raises:
            ValueError: If ``model_input`` is none of the supported types.
        """
        # Provider implied by the loop: Anthropic for the Anthropic loop,
        # OpenAI for every other loop.
        loop_provider = (
            LLMProvider.ANTHROPIC if loop == AgentLoop.ANTHROPIC else LLMProvider.OPENAI
        )

        # No model given: fall back to the loop's provider with no name; the
        # name is filled in from DEFAULT_MODELS when the loop is created.
        if model_input is None:
            return LLM(provider=loop_provider)

        # Already an LLM or one of its aliases: use as-is.
        if isinstance(model_input, (LLM, Model, LLMModel)):
            return model_input

        # Dict form: {"provider": ..., "name": ...}. A missing provider
        # defaults to OpenAI regardless of the loop.
        if isinstance(model_input, dict):
            provider = model_input.get("provider", LLMProvider.OPENAI)
            if isinstance(provider, str):
                provider = LLMProvider(provider)
            return LLM(provider=provider, name=model_input.get("name"))

        # Bare string: treated as a model name for the loop's provider.
        if isinstance(model_input, str):
            return LLM(provider=loop_provider, name=model_input)

        raise ValueError(f"Unsupported model configuration: {model_input}")

    def _configure_logging(self, verbosity: int):
        """Configure logging based on verbosity level.

        Args:
            verbosity: A standard ``logging`` level, applied to this module's
                logger and to the "agent" logger.
        """
        # Use the logging level directly without mapping
        logger.setLevel(verbosity)
        logging.getLogger("agent").setLevel(verbosity)

        # Log the verbosity level that was set. Levels of WARNING and above
        # are announced with logger.warning because an info record would be
        # filtered out by the level that was just applied.
        if verbosity <= logging.DEBUG:
            logger.info("Agent logging set to DEBUG level (full debug information)")
        elif verbosity <= logging.INFO:
            logger.info("Agent logging set to INFO level (standard output)")
        elif verbosity <= logging.WARNING:
            logger.warning("Agent logging set to WARNING level (warnings and errors only)")
        elif verbosity <= logging.ERROR:
            logger.warning("Agent logging set to ERROR level (errors only)")
        elif verbosity <= logging.CRITICAL:
            logger.warning("Agent logging set to CRITICAL level (critical errors only)")

    def _resolve_model_name(self, provider: LLMProvider) -> str:
        """Return the configured model name, or the default for ``provider``.

        Raises:
            ValueError: If no name is configured and ``provider`` has no entry
                in DEFAULT_MODELS.
        """
        if self.model.name:
            return self.model.name
        try:
            return DEFAULT_MODELS[provider]
        except KeyError:
            raise ValueError(
                f"No model name given and no default model is defined for "
                f"provider {provider}; pass an explicit model name"
            ) from None

    def _init_loop(self) -> Any:
        """Initialize the loop based on the loop_type.

        Returns:
            Initialized loop instance

        Raises:
            ValueError: If no model name is configured and the provider has no
                default model.
        """
        if self.loop_type == AgentLoop.ANTHROPIC:
            # Lazy import to avoid circular imports
            from ..providers.anthropic.loop import AnthropicLoop

            return AnthropicLoop(
                api_key=self.api_key,
                model=self._resolve_model_name(LLMProvider.ANTHROPIC),
                computer=self.computer,
                save_trajectory=self.save_trajectory,
                base_dir=self.trajectory_dir,
                only_n_most_recent_images=self.only_n_most_recent_images,
                **self.loop_kwargs,
            )

        # Resolve the model name first so a configuration error surfaces
        # before the parser is constructed.
        model_name = self._resolve_model_name(self.model.provider)

        # Lazy import to avoid circular imports; done only on this path so
        # the Anthropic loop does not need the Omni modules to be importable.
        from ..providers.omni.loop import OmniLoop
        from ..providers.omni.parser import OmniParser

        # Supply a default parser unless the caller passed one via kwargs
        if "parser" not in self.loop_kwargs:
            self.loop_kwargs["parser"] = OmniParser()

        return OmniLoop(
            provider=self.model.provider,
            api_key=self.api_key,
            model=model_name,
            computer=self.computer,
            save_trajectory=self.save_trajectory,
            base_dir=self.trajectory_dir,
            only_n_most_recent_images=self.only_n_most_recent_images,
            **self.loop_kwargs,
        )

    async def _execute_task(self, task: str) -> AsyncGenerator[Dict[str, Any], None]:
        """Execute a task using the appropriate agent loop.

        Args:
            task: The task to execute

        Returns:
            AsyncGenerator yielding task outputs

        Raises:
            Exception: Any error raised by the loop is logged and re-raised.
        """
        logger.info(f"Executing task: {task}")

        try:
            # Build the message list for this run; self.messages itself is
            # left unmodified.
            task_message = {"role": "user", "content": task}
            messages_with_task = self.messages + [task_message]

            # Use the run method of the loop
            async for output in self.loop.run(messages_with_task):
                yield output
        except Exception as e:
            logger.error(f"Error executing task: {e}")
            raise

    async def _execute_action(self, action_type: str, **action_params) -> Any:
        """Execute an action by delegating to the base class.

        Args:
            action_type: Name of the action, forwarded to the base class.
            **action_params: Parameters forwarded to the base class.

        Returns:
            Whatever the base-class implementation returns.

        Raises:
            Exception: Any error from the base class is logged with its
                traceback and re-raised.
        """
        try:
            return await super()._execute_action(action_type, **action_params)
        except Exception as e:
            logger.exception(f"Error executing action {action_type}: {e}")
            raise
@@ -113,7 +113,7 @@ class BaseComputerAgent(ABC):
113
113
  # Take a test screenshot to verify the computer is working
114
114
  logger.info("Testing computer with a screenshot...")
115
115
  try:
116
- test_screenshot = await self.computer.screenshot()
116
+ test_screenshot = await self.computer.interface.screenshot()
117
117
  # Determine the screenshot size based on its type
118
118
  if isinstance(test_screenshot, bytes):
119
119
  size = len(test_screenshot)
@@ -8,6 +8,7 @@ from datetime import datetime
8
8
  from typing import Any, Dict, List, Optional
9
9
  from PIL import Image
10
10
  import json
11
+ import re
11
12
 
12
13
  logger = logging.getLogger(__name__)
13
14
 
@@ -106,9 +107,18 @@ class ExperimentManager:
106
107
  # Increment screenshot counter
107
108
  self.screenshot_count += 1
108
109
 
110
+ # Sanitize action_type to ensure valid filename
111
+ # Replace characters that are not safe for filenames
112
+ sanitized_action = ""
113
+ if action_type:
114
+ # Replace invalid filename characters with underscores
115
+ sanitized_action = re.sub(r'[\\/*?:"<>|]', "_", action_type)
116
+ # Limit the length to avoid excessively long filenames
117
+ sanitized_action = sanitized_action[:50]
118
+
109
119
  # Create a descriptive filename
110
120
  timestamp = int(datetime.now().timestamp() * 1000)
111
- action_suffix = f"_{action_type}" if action_type else ""
121
+ action_suffix = f"_{sanitized_action}" if sanitized_action else ""
112
122
  filename = f"screenshot_{self.screenshot_count:03d}{action_suffix}_{timestamp}.png"
113
123
 
114
124
  # Save directly to the turn directory
@@ -166,7 +166,7 @@ class BaseLoop(ABC):
166
166
  """
167
167
  try:
168
168
  # Take screenshot
169
- screenshot = await self.computer.screenshot()
169
+ screenshot = await self.computer.interface.screenshot()
170
170
 
171
171
  # Initialize with default values
172
172
  width, height = 1024, 768
@@ -0,0 +1,130 @@
"""Agent telemetry for tracking anonymous usage and feature usage."""

import logging
import os
import platform
import sys
from typing import Dict, Any

logger = logging.getLogger("cua.agent.telemetry")


def _noop(*args: Any, **kwargs: Any) -> None:
    """No-op function for when telemetry is not available."""
    pass


# Try the core telemetry module; when it cannot be imported, every public
# telemetry entry point is bound to a harmless stand-in instead.
try:
    from core.telemetry import (
        record_event,
        increment,
        get_telemetry_client,
        flush,
        is_telemetry_enabled,
        is_telemetry_globally_disabled,
    )
except ImportError as e:
    logger.warning(f"Could not import telemetry: {e}")
    TELEMETRY_AVAILABLE = False

    logger.debug("Telemetry not available, using no-op functions")
    record_event = _noop  # type: ignore
    increment_counter = _noop  # type: ignore
    set_dimension = _noop  # type: ignore
    get_telemetry_client = lambda: None  # type: ignore
    flush = _noop  # type: ignore
    is_telemetry_enabled = lambda: False  # type: ignore
    is_telemetry_globally_disabled = lambda: True  # type: ignore
else:

    def increment_counter(counter_name: str, value: int = 1) -> None:
        """Wrapper for increment to maintain backward compatibility."""
        if is_telemetry_enabled():
            increment(counter_name, value)

    def set_dimension(name: str, value: Any) -> None:
        """Log the dimension at debug level; nothing else is recorded here."""
        logging.getLogger("cua.agent.telemetry").debug(f"Setting dimension {name}={value}")

    TELEMETRY_AVAILABLE = True
    logger.info("Successfully imported telemetry")

# Get system info once to use in telemetry
SYSTEM_INFO = {
    "os": platform.system().lower(),
    "os_version": platform.release(),
    "python_version": platform.python_version(),
}
66
+
67
+
def enable_telemetry() -> bool:
    """Enable telemetry if available.

    When the core telemetry module was already imported, this only reports
    whether telemetry is globally disabled. Otherwise it retries the import
    and, on success, replaces the module-level no-op stand-ins with the real
    core functions.

    Returns:
        bool: True if telemetry was successfully enabled, False otherwise
    """
    # These names live at module level. Without the declarations, the import
    # below would bind function locals only: the module-level no-ops would
    # never be replaced, and reading is_telemetry_globally_disabled before
    # the import would raise UnboundLocalError.
    global TELEMETRY_AVAILABLE, record_event, increment_counter
    global get_telemetry_client, flush, is_telemetry_globally_disabled

    if TELEMETRY_AVAILABLE:
        # Already imported; only the global opt-out can still block it.
        if is_telemetry_globally_disabled():
            logger.info("Telemetry is globally disabled via environment variable - cannot enable")
            return False
        return True

    # Try to import and enable. Aliases keep the real functions apart from
    # the module-level names until it is known that enabling is allowed.
    try:
        from core.telemetry import (
            record_event as core_record_event,
            increment as core_increment,
            get_telemetry_client as core_get_telemetry_client,
            flush as core_flush,
            is_telemetry_globally_disabled as core_is_globally_disabled,
        )
    except ImportError as e:
        logger.warning(f"Could not enable telemetry: {e}")
        return False

    # Check again after import
    if core_is_globally_disabled():
        logger.info("Telemetry is globally disabled via environment variable - cannot enable")
        return False

    def _increment_counter(counter_name: str, value: int = 1) -> None:
        """Forward to the core counter only while telemetry is enabled."""
        if is_telemetry_enabled():
            core_increment(counter_name, value)

    record_event = core_record_event
    increment_counter = _increment_counter
    get_telemetry_client = core_get_telemetry_client
    flush = core_flush
    is_telemetry_globally_disabled = core_is_globally_disabled
    TELEMETRY_AVAILABLE = True
    logger.info("Telemetry successfully enabled")
    return True
106
+
107
+
def is_telemetry_enabled() -> bool:
    """Check if telemetry is enabled.

    Returns:
        bool: The core module's answer when core telemetry is available,
        False otherwise
    """
    if not TELEMETRY_AVAILABLE:
        return False

    # Imported under an alias because this function has the same name as the
    # core function it delegates to.
    from core.telemetry import is_telemetry_enabled as core_is_enabled

    return core_is_enabled()
120
+
121
+
def record_agent_initialization() -> None:
    """Record when an agent instance is initialized.

    Does nothing unless core telemetry is available and enabled.
    """
    if not (TELEMETRY_AVAILABLE and is_telemetry_enabled()):
        return

    record_event("agent_initialized", SYSTEM_INFO)

    # Set dimensions that will be attached to all events
    for dimension in ("os", "os_version", "python_version"):
        set_dimension(dimension, SYSTEM_INFO[dimension])