cua-agent 0.1.25__tar.gz → 0.1.27__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of cua-agent has been flagged as potentially problematic; consult the package registry's advisory page for details.

Files changed (76)
  1. {cua_agent-0.1.25 → cua_agent-0.1.27}/PKG-INFO +30 -37
  2. {cua_agent-0.1.25 → cua_agent-0.1.27}/README.md +28 -35
  3. {cua_agent-0.1.25 → cua_agent-0.1.27}/agent/core/base.py +20 -0
  4. {cua_agent-0.1.25 → cua_agent-0.1.27}/agent/core/callbacks.py +57 -2
  5. {cua_agent-0.1.25 → cua_agent-0.1.27}/agent/providers/anthropic/callbacks/manager.py +20 -10
  6. {cua_agent-0.1.25 → cua_agent-0.1.27}/agent/providers/omni/clients/oaicompat.py +11 -3
  7. {cua_agent-0.1.25 → cua_agent-0.1.27}/agent/providers/omni/loop.py +8 -2
  8. {cua_agent-0.1.25 → cua_agent-0.1.27}/agent/providers/openai/loop.py +13 -4
  9. cua_agent-0.1.27/agent/ui/gradio/app.py +972 -0
  10. {cua_agent-0.1.25 → cua_agent-0.1.27}/pyproject.toml +4 -4
  11. cua_agent-0.1.25/agent/ui/gradio/app.py +0 -877
  12. {cua_agent-0.1.25 → cua_agent-0.1.27}/agent/__init__.py +0 -0
  13. {cua_agent-0.1.25 → cua_agent-0.1.27}/agent/core/__init__.py +0 -0
  14. {cua_agent-0.1.25 → cua_agent-0.1.27}/agent/core/agent.py +0 -0
  15. {cua_agent-0.1.25 → cua_agent-0.1.27}/agent/core/experiment.py +0 -0
  16. {cua_agent-0.1.25 → cua_agent-0.1.27}/agent/core/factory.py +0 -0
  17. {cua_agent-0.1.25 → cua_agent-0.1.27}/agent/core/messages.py +0 -0
  18. {cua_agent-0.1.25 → cua_agent-0.1.27}/agent/core/provider_config.py +0 -0
  19. {cua_agent-0.1.25 → cua_agent-0.1.27}/agent/core/telemetry.py +0 -0
  20. {cua_agent-0.1.25 → cua_agent-0.1.27}/agent/core/tools/__init__.py +0 -0
  21. {cua_agent-0.1.25 → cua_agent-0.1.27}/agent/core/tools/base.py +0 -0
  22. {cua_agent-0.1.25 → cua_agent-0.1.27}/agent/core/tools/bash.py +0 -0
  23. {cua_agent-0.1.25 → cua_agent-0.1.27}/agent/core/tools/collection.py +0 -0
  24. {cua_agent-0.1.25 → cua_agent-0.1.27}/agent/core/tools/computer.py +0 -0
  25. {cua_agent-0.1.25 → cua_agent-0.1.27}/agent/core/tools/edit.py +0 -0
  26. {cua_agent-0.1.25 → cua_agent-0.1.27}/agent/core/tools/manager.py +0 -0
  27. {cua_agent-0.1.25 → cua_agent-0.1.27}/agent/core/tools.py +0 -0
  28. {cua_agent-0.1.25 → cua_agent-0.1.27}/agent/core/types.py +0 -0
  29. {cua_agent-0.1.25 → cua_agent-0.1.27}/agent/core/visualization.py +0 -0
  30. {cua_agent-0.1.25 → cua_agent-0.1.27}/agent/providers/__init__.py +0 -0
  31. {cua_agent-0.1.25 → cua_agent-0.1.27}/agent/providers/anthropic/__init__.py +0 -0
  32. {cua_agent-0.1.25 → cua_agent-0.1.27}/agent/providers/anthropic/api/client.py +0 -0
  33. {cua_agent-0.1.25 → cua_agent-0.1.27}/agent/providers/anthropic/api/logging.py +0 -0
  34. {cua_agent-0.1.25 → cua_agent-0.1.27}/agent/providers/anthropic/api_handler.py +0 -0
  35. {cua_agent-0.1.25 → cua_agent-0.1.27}/agent/providers/anthropic/callbacks/__init__.py +0 -0
  36. {cua_agent-0.1.25 → cua_agent-0.1.27}/agent/providers/anthropic/loop.py +0 -0
  37. {cua_agent-0.1.25 → cua_agent-0.1.27}/agent/providers/anthropic/prompts.py +0 -0
  38. {cua_agent-0.1.25 → cua_agent-0.1.27}/agent/providers/anthropic/response_handler.py +0 -0
  39. {cua_agent-0.1.25 → cua_agent-0.1.27}/agent/providers/anthropic/tools/__init__.py +0 -0
  40. {cua_agent-0.1.25 → cua_agent-0.1.27}/agent/providers/anthropic/tools/base.py +0 -0
  41. {cua_agent-0.1.25 → cua_agent-0.1.27}/agent/providers/anthropic/tools/bash.py +0 -0
  42. {cua_agent-0.1.25 → cua_agent-0.1.27}/agent/providers/anthropic/tools/collection.py +0 -0
  43. {cua_agent-0.1.25 → cua_agent-0.1.27}/agent/providers/anthropic/tools/computer.py +0 -0
  44. {cua_agent-0.1.25 → cua_agent-0.1.27}/agent/providers/anthropic/tools/edit.py +0 -0
  45. {cua_agent-0.1.25 → cua_agent-0.1.27}/agent/providers/anthropic/tools/manager.py +0 -0
  46. {cua_agent-0.1.25 → cua_agent-0.1.27}/agent/providers/anthropic/tools/run.py +0 -0
  47. {cua_agent-0.1.25 → cua_agent-0.1.27}/agent/providers/anthropic/types.py +0 -0
  48. {cua_agent-0.1.25 → cua_agent-0.1.27}/agent/providers/anthropic/utils.py +0 -0
  49. {cua_agent-0.1.25 → cua_agent-0.1.27}/agent/providers/omni/__init__.py +0 -0
  50. {cua_agent-0.1.25 → cua_agent-0.1.27}/agent/providers/omni/api_handler.py +0 -0
  51. {cua_agent-0.1.25 → cua_agent-0.1.27}/agent/providers/omni/clients/anthropic.py +0 -0
  52. {cua_agent-0.1.25 → cua_agent-0.1.27}/agent/providers/omni/clients/base.py +0 -0
  53. {cua_agent-0.1.25 → cua_agent-0.1.27}/agent/providers/omni/clients/ollama.py +0 -0
  54. {cua_agent-0.1.25 → cua_agent-0.1.27}/agent/providers/omni/clients/openai.py +0 -0
  55. {cua_agent-0.1.25 → cua_agent-0.1.27}/agent/providers/omni/clients/utils.py +0 -0
  56. {cua_agent-0.1.25 → cua_agent-0.1.27}/agent/providers/omni/image_utils.py +0 -0
  57. {cua_agent-0.1.25 → cua_agent-0.1.27}/agent/providers/omni/parser.py +0 -0
  58. {cua_agent-0.1.25 → cua_agent-0.1.27}/agent/providers/omni/prompts.py +0 -0
  59. {cua_agent-0.1.25 → cua_agent-0.1.27}/agent/providers/omni/tools/__init__.py +0 -0
  60. {cua_agent-0.1.25 → cua_agent-0.1.27}/agent/providers/omni/tools/base.py +0 -0
  61. {cua_agent-0.1.25 → cua_agent-0.1.27}/agent/providers/omni/tools/bash.py +0 -0
  62. {cua_agent-0.1.25 → cua_agent-0.1.27}/agent/providers/omni/tools/computer.py +0 -0
  63. {cua_agent-0.1.25 → cua_agent-0.1.27}/agent/providers/omni/tools/manager.py +0 -0
  64. {cua_agent-0.1.25 → cua_agent-0.1.27}/agent/providers/omni/utils.py +0 -0
  65. {cua_agent-0.1.25 → cua_agent-0.1.27}/agent/providers/openai/__init__.py +0 -0
  66. {cua_agent-0.1.25 → cua_agent-0.1.27}/agent/providers/openai/api_handler.py +0 -0
  67. {cua_agent-0.1.25 → cua_agent-0.1.27}/agent/providers/openai/response_handler.py +0 -0
  68. {cua_agent-0.1.25 → cua_agent-0.1.27}/agent/providers/openai/tools/__init__.py +0 -0
  69. {cua_agent-0.1.25 → cua_agent-0.1.27}/agent/providers/openai/tools/base.py +0 -0
  70. {cua_agent-0.1.25 → cua_agent-0.1.27}/agent/providers/openai/tools/computer.py +0 -0
  71. {cua_agent-0.1.25 → cua_agent-0.1.27}/agent/providers/openai/tools/manager.py +0 -0
  72. {cua_agent-0.1.25 → cua_agent-0.1.27}/agent/providers/openai/types.py +0 -0
  73. {cua_agent-0.1.25 → cua_agent-0.1.27}/agent/providers/openai/utils.py +0 -0
  74. {cua_agent-0.1.25 → cua_agent-0.1.27}/agent/telemetry.py +0 -0
  75. {cua_agent-0.1.25 → cua_agent-0.1.27}/agent/ui/__init__.py +0 -0
  76. {cua_agent-0.1.25 → cua_agent-0.1.27}/agent/ui/gradio/__init__.py +0 -0
@@ -1,9 +1,9 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cua-agent
3
- Version: 0.1.25
3
+ Version: 0.1.27
4
4
  Summary: CUA (Computer Use) Agent for AI-driven computer interaction
5
5
  Author-Email: TryCua <gh@trycua.com>
6
- Requires-Python: <3.13,>=3.10
6
+ Requires-Python: >=3.10
7
7
  Requires-Dist: httpx<0.29.0,>=0.27.0
8
8
  Requires-Dist: aiohttp<4.0.0,>=3.9.3
9
9
  Requires-Dist: asyncio
@@ -148,8 +148,10 @@ The agent includes a Gradio-based user interface for easy interaction. To use it
148
148
  ```bash
149
149
  # Install with Gradio support
150
150
  pip install "cua-agent[ui]"
151
+ ```
152
+
153
+ ### Create a simple launcher script
151
154
 
152
- # Create a simple launcher script
153
155
  ```python
154
156
  # launch_ui.py
155
157
  from agent.ui.gradio.app import create_gradio_ui
@@ -158,10 +160,6 @@ app = create_gradio_ui()
158
160
  app.launch(share=False)
159
161
  ```
160
162
 
161
- # Run the launcher
162
- python launch_ui.py
163
- ```
164
-
165
163
  ### Setting up API Keys
166
164
 
167
165
  For the Gradio UI to show available models, you need to set API keys as environment variables:
@@ -177,6 +175,8 @@ export ANTHROPIC_API_KEY=your_anthropic_key_here
177
175
  OPENAI_API_KEY=your_key ANTHROPIC_API_KEY=your_key python launch_ui.py
178
176
  ```
179
177
 
178
+ Without these environment variables, the UI will show "No models available" for the corresponding providers, but you can still use local models with the OMNI loop provider.
179
+
180
180
  ### Using Local Models
181
181
 
182
182
  You can use local models with the OMNI loop provider by selecting "Custom model..." from the dropdown. The default provider URL is set to `http://localhost:1234/v1` which works with LM Studio.
@@ -186,42 +186,12 @@ If you're using a different local model server:
186
186
  - LocalAI: `http://localhost:8080/v1`
187
187
  - Ollama with OpenAI compat API: `http://localhost:11434/v1`
188
188
 
189
- To change the URL, modify the `provider_base_url` in your launcher script:
190
-
191
- ```python
192
- # In your launcher script
193
- from agent.ui.gradio.app import create_gradio_ui
194
- from agent import LLM, LLMProvider
195
-
196
- # Create a custom model with a specific URL
197
- custom_model = LLM(
198
- provider=LLMProvider.OAICOMPAT,
199
- name="your-model-name",
200
- provider_base_url="http://localhost:8000/v1" # Change to your server URL
201
- )
202
-
203
- app = create_gradio_ui(custom_model=custom_model)
204
- app.launch()
205
- ```
206
-
207
- Without these environment variables, the UI will show "No models available" for the corresponding providers, but you can still use local models with the OMNI loop provider.
208
-
209
189
  The Gradio UI provides:
210
190
  - Selection of different agent loops (OpenAI, Anthropic, OMNI)
211
191
  - Model selection for each provider
212
192
  - Configuration of agent parameters
213
193
  - Chat interface for interacting with the agent
214
194
 
215
- You can also embed the Gradio UI in your own application:
216
-
217
- ```python
218
- # Import directly in your application
219
- from agent.ui.gradio.app import create_gradio_ui
220
-
221
- app = create_gradio_ui()
222
- app.launch()
223
- ```
224
-
225
195
  ## Agent Loops
226
196
 
227
197
  The `cua-agent` package provides three agent loops variations, based on different CUA models providers and techniques:
@@ -270,3 +240,26 @@ async for result in agent.run(task):
270
240
  print("\nTool Call Output:")
271
241
  print(output)
272
242
  ```
243
+
244
+ ### Gradio UI
245
+
246
+ You can also interact with the agent using a Gradio interface.
247
+
248
+ ```python
249
+ # Ensure environment variables (e.g., API keys) are loaded
250
+ # You might need a helper function like load_dotenv_files() if using .env
251
+ # from utils import load_dotenv_files
252
+ # load_dotenv_files()
253
+
254
+ from agent.ui.gradio.app import create_gradio_ui
255
+
256
+ app = create_gradio_ui()
257
+ app.launch(share=False)
258
+ ```
259
+
260
+ **Note on Settings Persistence:**
261
+
262
+ * The Gradio UI automatically saves your configuration (Agent Loop, Model Choice, Custom Base URL, Save Trajectory state, Recent Images count) to a file named `.gradio_settings.json` in the project's root directory when you successfully run a task.
263
+ * This allows your preferences to persist between sessions.
264
+ * API keys entered into the custom provider field are **not** saved in this file for security reasons. Manage API keys using environment variables (e.g., `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`) or a `.env` file.
265
+ * It's recommended to add `.gradio_settings.json` to your `.gitignore` file.
@@ -80,8 +80,10 @@ The agent includes a Gradio-based user interface for easy interaction. To use it
80
80
  ```bash
81
81
  # Install with Gradio support
82
82
  pip install "cua-agent[ui]"
83
+ ```
84
+
85
+ ### Create a simple launcher script
83
86
 
84
- # Create a simple launcher script
85
87
  ```python
86
88
  # launch_ui.py
87
89
  from agent.ui.gradio.app import create_gradio_ui
@@ -90,10 +92,6 @@ app = create_gradio_ui()
90
92
  app.launch(share=False)
91
93
  ```
92
94
 
93
- # Run the launcher
94
- python launch_ui.py
95
- ```
96
-
97
95
  ### Setting up API Keys
98
96
 
99
97
  For the Gradio UI to show available models, you need to set API keys as environment variables:
@@ -109,6 +107,8 @@ export ANTHROPIC_API_KEY=your_anthropic_key_here
109
107
  OPENAI_API_KEY=your_key ANTHROPIC_API_KEY=your_key python launch_ui.py
110
108
  ```
111
109
 
110
+ Without these environment variables, the UI will show "No models available" for the corresponding providers, but you can still use local models with the OMNI loop provider.
111
+
112
112
  ### Using Local Models
113
113
 
114
114
  You can use local models with the OMNI loop provider by selecting "Custom model..." from the dropdown. The default provider URL is set to `http://localhost:1234/v1` which works with LM Studio.
@@ -118,42 +118,12 @@ If you're using a different local model server:
118
118
  - LocalAI: `http://localhost:8080/v1`
119
119
  - Ollama with OpenAI compat API: `http://localhost:11434/v1`
120
120
 
121
- To change the URL, modify the `provider_base_url` in your launcher script:
122
-
123
- ```python
124
- # In your launcher script
125
- from agent.ui.gradio.app import create_gradio_ui
126
- from agent import LLM, LLMProvider
127
-
128
- # Create a custom model with a specific URL
129
- custom_model = LLM(
130
- provider=LLMProvider.OAICOMPAT,
131
- name="your-model-name",
132
- provider_base_url="http://localhost:8000/v1" # Change to your server URL
133
- )
134
-
135
- app = create_gradio_ui(custom_model=custom_model)
136
- app.launch()
137
- ```
138
-
139
- Without these environment variables, the UI will show "No models available" for the corresponding providers, but you can still use local models with the OMNI loop provider.
140
-
141
121
  The Gradio UI provides:
142
122
  - Selection of different agent loops (OpenAI, Anthropic, OMNI)
143
123
  - Model selection for each provider
144
124
  - Configuration of agent parameters
145
125
  - Chat interface for interacting with the agent
146
126
 
147
- You can also embed the Gradio UI in your own application:
148
-
149
- ```python
150
- # Import directly in your application
151
- from agent.ui.gradio.app import create_gradio_ui
152
-
153
- app = create_gradio_ui()
154
- app.launch()
155
- ```
156
-
157
127
  ## Agent Loops
158
128
 
159
129
  The `cua-agent` package provides three agent loops variations, based on different CUA models providers and techniques:
@@ -202,3 +172,26 @@ async for result in agent.run(task):
202
172
  print("\nTool Call Output:")
203
173
  print(output)
204
174
  ```
175
+
176
+ ### Gradio UI
177
+
178
+ You can also interact with the agent using a Gradio interface.
179
+
180
+ ```python
181
+ # Ensure environment variables (e.g., API keys) are loaded
182
+ # You might need a helper function like load_dotenv_files() if using .env
183
+ # from utils import load_dotenv_files
184
+ # load_dotenv_files()
185
+
186
+ from agent.ui.gradio.app import create_gradio_ui
187
+
188
+ app = create_gradio_ui()
189
+ app.launch(share=False)
190
+ ```
191
+
192
+ **Note on Settings Persistence:**
193
+
194
+ * The Gradio UI automatically saves your configuration (Agent Loop, Model Choice, Custom Base URL, Save Trajectory state, Recent Images count) to a file named `.gradio_settings.json` in the project's root directory when you successfully run a task.
195
+ * This allows your preferences to persist between sessions.
196
+ * API keys entered into the custom provider field are **not** saved in this file for security reasons. Manage API keys using environment variables (e.g., `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`) or a `.env` file.
197
+ * It's recommended to add `.gradio_settings.json` to your `.gitignore` file.
@@ -5,10 +5,12 @@ import asyncio
5
5
  from abc import ABC, abstractmethod
6
6
  from typing import Any, AsyncGenerator, Dict, List, Optional
7
7
 
8
+ from agent.providers.omni.parser import ParseResult
8
9
  from computer import Computer
9
10
  from .messages import StandardMessageManager, ImageRetentionConfig
10
11
  from .types import AgentResponse
11
12
  from .experiment import ExperimentManager
13
+ from .callbacks import CallbackManager, CallbackHandler
12
14
 
13
15
  logger = logging.getLogger(__name__)
14
16
 
@@ -27,6 +29,7 @@ class BaseLoop(ABC):
27
29
  base_dir: Optional[str] = "trajectories",
28
30
  save_trajectory: bool = True,
29
31
  only_n_most_recent_images: Optional[int] = 2,
32
+ callback_handlers: Optional[List[CallbackHandler]] = None,
30
33
  **kwargs,
31
34
  ):
32
35
  """Initialize base agent loop.
@@ -75,6 +78,9 @@ class BaseLoop(ABC):
75
78
 
76
79
  # Initialize basic tracking
77
80
  self.turn_count = 0
81
+
82
+ # Initialize callback manager
83
+ self.callback_manager = CallbackManager(handlers=callback_handlers or [])
78
84
 
79
85
  async def initialize(self) -> None:
80
86
  """Initialize both the API client and computer interface with retries."""
@@ -187,3 +193,17 @@ class BaseLoop(ABC):
187
193
  """
188
194
  if self.experiment_manager:
189
195
  self.experiment_manager.save_screenshot(img_base64, action_type)
196
+
197
+ ###########################################
198
+ # EVENT HOOKS / CALLBACKS
199
+ ###########################################
200
+
201
+ async def handle_screenshot(self, screenshot_base64: str, action_type: str = "", parsed_screen: Optional[ParseResult] = None) -> None:
202
+ """Process a screenshot through callback managers
203
+
204
+ Args:
205
+ screenshot_base64: Base64 encoded screenshot
206
+ action_type: Type of action that triggered the screenshot
207
+ """
208
+ if hasattr(self, 'callback_manager'):
209
+ await self.callback_manager.on_screenshot(screenshot_base64, action_type, parsed_screen)
@@ -6,6 +6,8 @@ from abc import ABC, abstractmethod
6
6
  from datetime import datetime
7
7
  from typing import Any, Dict, List, Optional, Protocol
8
8
 
9
+ from agent.providers.omni.parser import ParseResult
10
+
9
11
  logger = logging.getLogger(__name__)
10
12
 
11
13
  class ContentCallback(Protocol):
@@ -20,6 +22,10 @@ class APICallback(Protocol):
20
22
  """Protocol for API callbacks."""
21
23
  def __call__(self, request: Any, response: Any, error: Optional[Exception] = None) -> None: ...
22
24
 
25
+ class ScreenshotCallback(Protocol):
26
+ """Protocol for screenshot callbacks."""
27
+ def __call__(self, screenshot_base64: str, action_type: str = "") -> Optional[str]: ...
28
+
23
29
  class BaseCallbackManager(ABC):
24
30
  """Base class for callback managers."""
25
31
 
@@ -110,7 +116,20 @@ class CallbackManager:
110
116
  """
111
117
  for handler in self.handlers:
112
118
  await handler.on_error(error, **kwargs)
113
-
119
+
120
+ async def on_screenshot(self, screenshot_base64: str, action_type: str = "", parsed_screen: Optional[ParseResult] = None) -> None:
121
+ """Called when a screenshot is taken.
122
+
123
+ Args:
124
+ screenshot_base64: Base64 encoded screenshot
125
+ action_type: Type of action that triggered the screenshot
126
+ parsed_screen: Optional output from parsing the screenshot
127
+
128
+ Returns:
129
+ Modified screenshot or original if no modifications
130
+ """
131
+ for handler in self.handlers:
132
+ await handler.on_screenshot(screenshot_base64, action_type, parsed_screen)
114
133
 
115
134
  class CallbackHandler(ABC):
116
135
  """Base class for callback handlers."""
@@ -144,4 +163,40 @@ class CallbackHandler(ABC):
144
163
  error: Exception that occurred
145
164
  **kwargs: Additional data
146
165
  """
147
- pass
166
+ pass
167
+
168
+ @abstractmethod
169
+ async def on_screenshot(self, screenshot_base64: str, action_type: str = "", parsed_screen: Optional[ParseResult] = None) -> None:
170
+ """Called when a screenshot is taken.
171
+
172
+ Args:
173
+ screenshot_base64: Base64 encoded screenshot
174
+ action_type: Type of action that triggered the screenshot
175
+
176
+ Returns:
177
+ Optional modified screenshot
178
+ """
179
+ pass
180
+
181
+ class DefaultCallbackHandler(CallbackHandler):
182
+ """Default implementation of CallbackHandler with no-op methods.
183
+
184
+ This class implements all abstract methods from CallbackHandler,
185
+ allowing subclasses to override only the methods they need.
186
+ """
187
+
188
+ async def on_action_start(self, action: str, **kwargs) -> None:
189
+ """Default no-op implementation."""
190
+ pass
191
+
192
+ async def on_action_end(self, action: str, success: bool, **kwargs) -> None:
193
+ """Default no-op implementation."""
194
+ pass
195
+
196
+ async def on_error(self, error: Exception, **kwargs) -> None:
197
+ """Default no-op implementation."""
198
+ pass
199
+
200
+ async def on_screenshot(self, screenshot_base64: str, action_type: str = "") -> None:
201
+ """Default no-op implementation."""
202
+ pass
@@ -3,23 +3,33 @@ import httpx
3
3
  from anthropic.types.beta import BetaContentBlockParam
4
4
  from ..tools import ToolResult
5
5
 
6
+
6
7
  class APICallback(Protocol):
7
8
  """Protocol for API callbacks."""
8
- def __call__(self, request: httpx.Request | None,
9
- response: httpx.Response | object | None,
10
- error: Exception | None) -> None: ...
9
+
10
+ def __call__(
11
+ self,
12
+ request: httpx.Request | None,
13
+ response: httpx.Response | object | None,
14
+ error: Exception | None,
15
+ ) -> None: ...
16
+
11
17
 
12
18
  class ContentCallback(Protocol):
13
19
  """Protocol for content callbacks."""
20
+
14
21
  def __call__(self, content: BetaContentBlockParam) -> None: ...
15
22
 
23
+
16
24
  class ToolCallback(Protocol):
17
25
  """Protocol for tool callbacks."""
26
+
18
27
  def __call__(self, result: ToolResult, tool_id: str) -> None: ...
19
28
 
29
+
20
30
  class CallbackManager:
21
31
  """Manages various callbacks for the agent system."""
22
-
32
+
23
33
  def __init__(
24
34
  self,
25
35
  content_callback: ContentCallback,
@@ -27,7 +37,7 @@ class CallbackManager:
27
37
  api_callback: APICallback,
28
38
  ):
29
39
  """Initialize the callback manager.
30
-
40
+
31
41
  Args:
32
42
  content_callback: Callback for content updates
33
43
  tool_callback: Callback for tool execution results
@@ -36,20 +46,20 @@ class CallbackManager:
36
46
  self.content_callback = content_callback
37
47
  self.tool_callback = tool_callback
38
48
  self.api_callback = api_callback
39
-
49
+
40
50
  def on_content(self, content: BetaContentBlockParam) -> None:
41
51
  """Handle content updates."""
42
52
  self.content_callback(content)
43
-
53
+
44
54
  def on_tool_result(self, result: ToolResult, tool_id: str) -> None:
45
55
  """Handle tool execution results."""
46
56
  self.tool_callback(result, tool_id)
47
-
57
+
48
58
  def on_api_interaction(
49
59
  self,
50
60
  request: httpx.Request | None,
51
61
  response: httpx.Response | object | None,
52
- error: Exception | None
62
+ error: Exception | None,
53
63
  ) -> None:
54
64
  """Handle API interactions."""
55
- self.api_callback(request, response, error)
65
+ self.api_callback(request, response, error)
@@ -45,8 +45,8 @@ class OAICompatClient(BaseOmniClient):
45
45
  max_tokens: Maximum tokens to generate
46
46
  temperature: Generation temperature
47
47
  """
48
- super().__init__(api_key="EMPTY", model=model)
49
- self.api_key = "EMPTY" # Local endpoints typically don't require an API key
48
+ super().__init__(api_key=api_key or "EMPTY", model=model)
49
+ self.api_key = api_key or "EMPTY" # Local endpoints typically don't require an API key
50
50
  self.model = model
51
51
  self.provider_base_url = (
52
52
  provider_base_url or "http://localhost:8000/v1"
@@ -146,10 +146,18 @@ class OAICompatClient(BaseOmniClient):
146
146
  base_url = self.provider_base_url or "http://localhost:8000/v1"
147
147
 
148
148
  # Check if the base URL already includes the chat/completions endpoint
149
+
149
150
  endpoint_url = base_url
150
151
  if not endpoint_url.endswith("/chat/completions"):
152
+ # If URL is RunPod format, make it OpenAI compatible
153
+ if endpoint_url.startswith("https://api.runpod.ai/v2/"):
154
+ # Extract RunPod endpoint ID
155
+ parts = endpoint_url.split("/")
156
+ if len(parts) >= 5:
157
+ runpod_id = parts[4]
158
+ endpoint_url = f"https://api.runpod.ai/v2/{runpod_id}/openai/v1/chat/completions"
151
159
  # If the URL ends with /v1, append /chat/completions
152
- if endpoint_url.endswith("/v1"):
160
+ elif endpoint_url.endswith("/v1"):
153
161
  endpoint_url = f"{endpoint_url}/chat/completions"
154
162
  # If the URL doesn't end with /v1, make sure it has a proper structure
155
163
  elif not endpoint_url.endswith("/"):
@@ -147,7 +147,7 @@ class OmniLoop(BaseLoop):
147
147
  )
148
148
  elif self.provider == LLMProvider.OAICOMPAT:
149
149
  self.client = OAICompatClient(
150
- api_key="EMPTY", # Local endpoints typically don't require an API key
150
+ api_key=self.api_key or "EMPTY", # Local endpoints typically don't require an API key
151
151
  model=self.model,
152
152
  provider_base_url=self.provider_base_url,
153
153
  )
@@ -183,7 +183,7 @@ class OmniLoop(BaseLoop):
183
183
  )
184
184
  elif self.provider == LLMProvider.OAICOMPAT:
185
185
  self.client = OAICompatClient(
186
- api_key="EMPTY", # Local endpoints typically don't require an API key
186
+ api_key=self.api_key or "EMPTY", # Local endpoints typically don't require an API key
187
187
  model=self.model,
188
188
  provider_base_url=self.provider_base_url,
189
189
  )
@@ -548,6 +548,10 @@ class OmniLoop(BaseLoop):
548
548
  img_data = parsed_screen.annotated_image_base64
549
549
  if "," in img_data:
550
550
  img_data = img_data.split(",")[1]
551
+
552
+ # Process screenshot through hooks and save if needed
553
+ await self.handle_screenshot(img_data, action_type="state", parsed_screen=parsed_screen)
554
+
551
555
  # Save with a generic "state" action type to indicate this is the current screen state
552
556
  self._save_screenshot(img_data, action_type="state")
553
557
  except Exception as e:
@@ -663,6 +667,8 @@ class OmniLoop(BaseLoop):
663
667
  response=response,
664
668
  messages=self.message_manager.messages,
665
669
  model=self.model,
670
+ parsed_screen=parsed_screen,
671
+ parser=self.parser
666
672
  )
667
673
 
668
674
  # Yield the response to the caller
@@ -194,8 +194,13 @@ class OpenAILoop(BaseLoop):
194
194
  # Convert to base64 if needed
195
195
  if isinstance(screenshot, bytes):
196
196
  screenshot_base64 = base64.b64encode(screenshot).decode("utf-8")
197
+ elif isinstance(screenshot, (bytearray, memoryview)):
198
+ screenshot_base64 = base64.b64encode(screenshot).decode("utf-8")
197
199
  else:
198
- screenshot_base64 = screenshot
200
+ screenshot_base64 = str(screenshot)
201
+
202
+ # Emit screenshot callbacks
203
+ await self.handle_screenshot(screenshot_base64, action_type="initial_state")
199
204
 
200
205
  # Save screenshot if requested
201
206
  if self.save_trajectory:
@@ -204,8 +209,6 @@ class OpenAILoop(BaseLoop):
204
209
  logger.warning(
205
210
  "Converting non-string screenshot_base64 to string for _save_screenshot"
206
211
  )
207
- if isinstance(screenshot_base64, (bytearray, memoryview)):
208
- screenshot_base64 = base64.b64encode(screenshot_base64).decode("utf-8")
209
212
  self._save_screenshot(screenshot_base64, action_type="state")
210
213
  logger.info("Screenshot saved to trajectory")
211
214
 
@@ -336,8 +339,14 @@ class OpenAILoop(BaseLoop):
336
339
  screenshot = await self.computer.interface.screenshot()
337
340
  if isinstance(screenshot, bytes):
338
341
  screenshot_base64 = base64.b64encode(screenshot).decode("utf-8")
342
+ elif isinstance(screenshot, (bytearray, memoryview)):
343
+ screenshot_base64 = base64.b64encode(bytes(screenshot)).decode("utf-8")
339
344
  else:
340
- screenshot_base64 = screenshot
345
+ screenshot_base64 = str(screenshot)
346
+
347
+ # Process screenshot through hooks
348
+ action_type = f"after_{action.get('type', 'action')}"
349
+ await self.handle_screenshot(screenshot_base64, action_type=action_type)
341
350
 
342
351
  # Create computer_call_output
343
352
  computer_call_output = {