cua-agent 0.1.24__tar.gz → 0.1.26__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of cua-agent has been flagged as potentially problematic.

Files changed (76)
  1. {cua_agent-0.1.24 → cua_agent-0.1.26}/PKG-INFO +37 -23
  2. {cua_agent-0.1.24 → cua_agent-0.1.26}/README.md +36 -22
  3. {cua_agent-0.1.24 → cua_agent-0.1.26}/agent/core/base.py +20 -0
  4. {cua_agent-0.1.24 → cua_agent-0.1.26}/agent/core/callbacks.py +57 -2
  5. {cua_agent-0.1.24 → cua_agent-0.1.26}/agent/providers/anthropic/callbacks/manager.py +20 -10
  6. {cua_agent-0.1.24 → cua_agent-0.1.26}/agent/providers/omni/clients/oaicompat.py +11 -3
  7. {cua_agent-0.1.24 → cua_agent-0.1.26}/agent/providers/omni/loop.py +24 -4
  8. {cua_agent-0.1.24 → cua_agent-0.1.26}/agent/providers/openai/loop.py +13 -4
  9. cua_agent-0.1.26/agent/ui/gradio/app.py +972 -0
  10. {cua_agent-0.1.24 → cua_agent-0.1.26}/pyproject.toml +3 -3
  11. cua_agent-0.1.24/agent/ui/gradio/app.py +0 -872
  12. {cua_agent-0.1.24 → cua_agent-0.1.26}/agent/__init__.py +0 -0
  13. {cua_agent-0.1.24 → cua_agent-0.1.26}/agent/core/__init__.py +0 -0
  14. {cua_agent-0.1.24 → cua_agent-0.1.26}/agent/core/agent.py +0 -0
  15. {cua_agent-0.1.24 → cua_agent-0.1.26}/agent/core/experiment.py +0 -0
  16. {cua_agent-0.1.24 → cua_agent-0.1.26}/agent/core/factory.py +0 -0
  17. {cua_agent-0.1.24 → cua_agent-0.1.26}/agent/core/messages.py +0 -0
  18. {cua_agent-0.1.24 → cua_agent-0.1.26}/agent/core/provider_config.py +0 -0
  19. {cua_agent-0.1.24 → cua_agent-0.1.26}/agent/core/telemetry.py +0 -0
  20. {cua_agent-0.1.24 → cua_agent-0.1.26}/agent/core/tools/__init__.py +0 -0
  21. {cua_agent-0.1.24 → cua_agent-0.1.26}/agent/core/tools/base.py +0 -0
  22. {cua_agent-0.1.24 → cua_agent-0.1.26}/agent/core/tools/bash.py +0 -0
  23. {cua_agent-0.1.24 → cua_agent-0.1.26}/agent/core/tools/collection.py +0 -0
  24. {cua_agent-0.1.24 → cua_agent-0.1.26}/agent/core/tools/computer.py +0 -0
  25. {cua_agent-0.1.24 → cua_agent-0.1.26}/agent/core/tools/edit.py +0 -0
  26. {cua_agent-0.1.24 → cua_agent-0.1.26}/agent/core/tools/manager.py +0 -0
  27. {cua_agent-0.1.24 → cua_agent-0.1.26}/agent/core/tools.py +0 -0
  28. {cua_agent-0.1.24 → cua_agent-0.1.26}/agent/core/types.py +0 -0
  29. {cua_agent-0.1.24 → cua_agent-0.1.26}/agent/core/visualization.py +0 -0
  30. {cua_agent-0.1.24 → cua_agent-0.1.26}/agent/providers/__init__.py +0 -0
  31. {cua_agent-0.1.24 → cua_agent-0.1.26}/agent/providers/anthropic/__init__.py +0 -0
  32. {cua_agent-0.1.24 → cua_agent-0.1.26}/agent/providers/anthropic/api/client.py +0 -0
  33. {cua_agent-0.1.24 → cua_agent-0.1.26}/agent/providers/anthropic/api/logging.py +0 -0
  34. {cua_agent-0.1.24 → cua_agent-0.1.26}/agent/providers/anthropic/api_handler.py +0 -0
  35. {cua_agent-0.1.24 → cua_agent-0.1.26}/agent/providers/anthropic/callbacks/__init__.py +0 -0
  36. {cua_agent-0.1.24 → cua_agent-0.1.26}/agent/providers/anthropic/loop.py +0 -0
  37. {cua_agent-0.1.24 → cua_agent-0.1.26}/agent/providers/anthropic/prompts.py +0 -0
  38. {cua_agent-0.1.24 → cua_agent-0.1.26}/agent/providers/anthropic/response_handler.py +0 -0
  39. {cua_agent-0.1.24 → cua_agent-0.1.26}/agent/providers/anthropic/tools/__init__.py +0 -0
  40. {cua_agent-0.1.24 → cua_agent-0.1.26}/agent/providers/anthropic/tools/base.py +0 -0
  41. {cua_agent-0.1.24 → cua_agent-0.1.26}/agent/providers/anthropic/tools/bash.py +0 -0
  42. {cua_agent-0.1.24 → cua_agent-0.1.26}/agent/providers/anthropic/tools/collection.py +0 -0
  43. {cua_agent-0.1.24 → cua_agent-0.1.26}/agent/providers/anthropic/tools/computer.py +0 -0
  44. {cua_agent-0.1.24 → cua_agent-0.1.26}/agent/providers/anthropic/tools/edit.py +0 -0
  45. {cua_agent-0.1.24 → cua_agent-0.1.26}/agent/providers/anthropic/tools/manager.py +0 -0
  46. {cua_agent-0.1.24 → cua_agent-0.1.26}/agent/providers/anthropic/tools/run.py +0 -0
  47. {cua_agent-0.1.24 → cua_agent-0.1.26}/agent/providers/anthropic/types.py +0 -0
  48. {cua_agent-0.1.24 → cua_agent-0.1.26}/agent/providers/anthropic/utils.py +0 -0
  49. {cua_agent-0.1.24 → cua_agent-0.1.26}/agent/providers/omni/__init__.py +0 -0
  50. {cua_agent-0.1.24 → cua_agent-0.1.26}/agent/providers/omni/api_handler.py +0 -0
  51. {cua_agent-0.1.24 → cua_agent-0.1.26}/agent/providers/omni/clients/anthropic.py +0 -0
  52. {cua_agent-0.1.24 → cua_agent-0.1.26}/agent/providers/omni/clients/base.py +0 -0
  53. {cua_agent-0.1.24 → cua_agent-0.1.26}/agent/providers/omni/clients/ollama.py +0 -0
  54. {cua_agent-0.1.24 → cua_agent-0.1.26}/agent/providers/omni/clients/openai.py +0 -0
  55. {cua_agent-0.1.24 → cua_agent-0.1.26}/agent/providers/omni/clients/utils.py +0 -0
  56. {cua_agent-0.1.24 → cua_agent-0.1.26}/agent/providers/omni/image_utils.py +0 -0
  57. {cua_agent-0.1.24 → cua_agent-0.1.26}/agent/providers/omni/parser.py +0 -0
  58. {cua_agent-0.1.24 → cua_agent-0.1.26}/agent/providers/omni/prompts.py +0 -0
  59. {cua_agent-0.1.24 → cua_agent-0.1.26}/agent/providers/omni/tools/__init__.py +0 -0
  60. {cua_agent-0.1.24 → cua_agent-0.1.26}/agent/providers/omni/tools/base.py +0 -0
  61. {cua_agent-0.1.24 → cua_agent-0.1.26}/agent/providers/omni/tools/bash.py +0 -0
  62. {cua_agent-0.1.24 → cua_agent-0.1.26}/agent/providers/omni/tools/computer.py +0 -0
  63. {cua_agent-0.1.24 → cua_agent-0.1.26}/agent/providers/omni/tools/manager.py +0 -0
  64. {cua_agent-0.1.24 → cua_agent-0.1.26}/agent/providers/omni/utils.py +0 -0
  65. {cua_agent-0.1.24 → cua_agent-0.1.26}/agent/providers/openai/__init__.py +0 -0
  66. {cua_agent-0.1.24 → cua_agent-0.1.26}/agent/providers/openai/api_handler.py +0 -0
  67. {cua_agent-0.1.24 → cua_agent-0.1.26}/agent/providers/openai/response_handler.py +0 -0
  68. {cua_agent-0.1.24 → cua_agent-0.1.26}/agent/providers/openai/tools/__init__.py +0 -0
  69. {cua_agent-0.1.24 → cua_agent-0.1.26}/agent/providers/openai/tools/base.py +0 -0
  70. {cua_agent-0.1.24 → cua_agent-0.1.26}/agent/providers/openai/tools/computer.py +0 -0
  71. {cua_agent-0.1.24 → cua_agent-0.1.26}/agent/providers/openai/tools/manager.py +0 -0
  72. {cua_agent-0.1.24 → cua_agent-0.1.26}/agent/providers/openai/types.py +0 -0
  73. {cua_agent-0.1.24 → cua_agent-0.1.26}/agent/providers/openai/utils.py +0 -0
  74. {cua_agent-0.1.24 → cua_agent-0.1.26}/agent/telemetry.py +0 -0
  75. {cua_agent-0.1.24 → cua_agent-0.1.26}/agent/ui/__init__.py +0 -0
  76. {cua_agent-0.1.24 → cua_agent-0.1.26}/agent/ui/gradio/__init__.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: cua-agent
- Version: 0.1.24
+ Version: 0.1.26
  Summary: CUA (Computer Use) Agent for AI-driven computer interaction
  Author-Email: TryCua <gh@trycua.com>
  Requires-Python: <3.13,>=3.10
@@ -148,8 +148,10 @@ The agent includes a Gradio-based user interface for easy interaction. To use it
  ```bash
  # Install with Gradio support
  pip install "cua-agent[ui]"
+ ```
+
+ ### Create a simple launcher script

- # Create a simple launcher script
  ```python
  # launch_ui.py
  from agent.ui.gradio.app import create_gradio_ui
@@ -158,10 +160,6 @@ app = create_gradio_ui()
  app.launch(share=False)
  ```

- # Run the launcher
- python launch_ui.py
- ```
-
  ### Setting up API Keys

  For the Gradio UI to show available models, you need to set API keys as environment variables:
@@ -179,28 +177,21 @@ OPENAI_API_KEY=your_key ANTHROPIC_API_KEY=your_key python launch_ui.py

  Without these environment variables, the UI will show "No models available" for the corresponding providers, but you can still use local models with the OMNI loop provider.

+ ### Using Local Models
+
+ You can use local models with the OMNI loop provider by selecting "Custom model..." from the dropdown. The default provider URL is set to `http://localhost:1234/v1` which works with LM Studio.
+
+ If you're using a different local model server:
+ - vLLM: `http://localhost:8000/v1`
+ - LocalAI: `http://localhost:8080/v1`
+ - Ollama with OpenAI compat API: `http://localhost:11434/v1`
+
  The Gradio UI provides:
  - Selection of different agent loops (OpenAI, Anthropic, OMNI)
  - Model selection for each provider
  - Configuration of agent parameters
  - Chat interface for interacting with the agent

- You can also embed the Gradio UI in your own application:
-
- ```python
- # Import directly in your application
- from agent.ui.gradio.app import create_gradio_ui
-
- # Create the UI with advanced features
- demo = create_gradio_ui()
- demo.launch()
-
- # Or for a simpler interface
- from agent.ui.gradio import registry
- demo = registry(name='cua:gpt-4o')
- demo.launch()
- ```
-
  ## Agent Loops

  The `cua-agent` package provides three agent loops variations, based on different CUA models providers and techniques:
@@ -209,7 +200,7 @@ The `cua-agent` package provides three agent loops variations, based on differen
  |:-----------|:-----------------|:------------|:-------------|
  | `AgentLoop.OPENAI` | • `computer_use_preview` | Use OpenAI Operator CUA model | Not Required |
  | `AgentLoop.ANTHROPIC` | • `claude-3-5-sonnet-20240620`<br>• `claude-3-7-sonnet-20250219` | Use Anthropic Computer-Use | Not Required |
- | `AgentLoop.OMNI` | • `claude-3-5-sonnet-20240620`<br>• `claude-3-7-sonnet-20250219`<br>• `gpt-4.5-preview`<br>• `gpt-4o`<br>• `gpt-4`<br>• `phi4`<br>• `phi4-mini`<br>• `gemma3`<br>• `...`<br>• `Any Ollama-compatible model` | Use OmniParser for element pixel-detection (SoM) and any VLMs for UI Grounding and Reasoning | OmniParser |
+ | `AgentLoop.OMNI` | • `claude-3-5-sonnet-20240620`<br>• `claude-3-7-sonnet-20250219`<br>• `gpt-4.5-preview`<br>• `gpt-4o`<br>• `gpt-4`<br>• `phi4`<br>• `phi4-mini`<br>• `gemma3`<br>• `...`<br>• `Any Ollama or OpenAI-compatible model` | Use OmniParser for element pixel-detection (SoM) and any VLMs for UI Grounding and Reasoning | OmniParser |

  ## AgentResponse
  The `AgentResponse` class represents the structured output returned after each agent turn. It contains the agent's response, reasoning, tool usage, and other metadata. The response format aligns with the new [OpenAI Agent SDK specification](https://platform.openai.com/docs/api-reference/responses) for better consistency across different agent loops.
@@ -249,3 +240,26 @@ async for result in agent.run(task):
  print("\nTool Call Output:")
  print(output)
  ```
+
+ ### Gradio UI
+
+ You can also interact with the agent using a Gradio interface.
+
+ ```python
+ # Ensure environment variables (e.g., API keys) are loaded
+ # You might need a helper function like load_dotenv_files() if using .env
+ # from utils import load_dotenv_files
+ # load_dotenv_files()
+
+ from agent.ui.gradio.app import create_gradio_ui
+
+ app = create_gradio_ui()
+ app.launch(share=False)
+ ```
+
+ **Note on Settings Persistence:**
+
+ * The Gradio UI automatically saves your configuration (Agent Loop, Model Choice, Custom Base URL, Save Trajectory state, Recent Images count) to a file named `.gradio_settings.json` in the project's root directory when you successfully run a task.
+ * This allows your preferences to persist between sessions.
+ * API keys entered into the custom provider field are **not** saved in this file for security reasons. Manage API keys using environment variables (e.g., `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`) or a `.env` file.
+ * It's recommended to add `.gradio_settings.json` to your `.gitignore` file.
@@ -80,8 +80,10 @@ The agent includes a Gradio-based user interface for easy interaction. To use it
  ```bash
  # Install with Gradio support
  pip install "cua-agent[ui]"
+ ```
+
+ ### Create a simple launcher script

- # Create a simple launcher script
  ```python
  # launch_ui.py
  from agent.ui.gradio.app import create_gradio_ui
@@ -90,10 +92,6 @@ app = create_gradio_ui()
  app.launch(share=False)
  ```

- # Run the launcher
- python launch_ui.py
- ```
-
  ### Setting up API Keys

  For the Gradio UI to show available models, you need to set API keys as environment variables:
@@ -111,28 +109,21 @@ OPENAI_API_KEY=your_key ANTHROPIC_API_KEY=your_key python launch_ui.py

  Without these environment variables, the UI will show "No models available" for the corresponding providers, but you can still use local models with the OMNI loop provider.

+ ### Using Local Models
+
+ You can use local models with the OMNI loop provider by selecting "Custom model..." from the dropdown. The default provider URL is set to `http://localhost:1234/v1` which works with LM Studio.
+
+ If you're using a different local model server:
+ - vLLM: `http://localhost:8000/v1`
+ - LocalAI: `http://localhost:8080/v1`
+ - Ollama with OpenAI compat API: `http://localhost:11434/v1`
+
  The Gradio UI provides:
  - Selection of different agent loops (OpenAI, Anthropic, OMNI)
  - Model selection for each provider
  - Configuration of agent parameters
  - Chat interface for interacting with the agent

- You can also embed the Gradio UI in your own application:
-
- ```python
- # Import directly in your application
- from agent.ui.gradio.app import create_gradio_ui
-
- # Create the UI with advanced features
- demo = create_gradio_ui()
- demo.launch()
-
- # Or for a simpler interface
- from agent.ui.gradio import registry
- demo = registry(name='cua:gpt-4o')
- demo.launch()
- ```
-
  ## Agent Loops

  The `cua-agent` package provides three agent loops variations, based on different CUA models providers and techniques:
@@ -141,7 +132,7 @@ The `cua-agent` package provides three agent loops variations, based on differen
  |:-----------|:-----------------|:------------|:-------------|
  | `AgentLoop.OPENAI` | • `computer_use_preview` | Use OpenAI Operator CUA model | Not Required |
  | `AgentLoop.ANTHROPIC` | • `claude-3-5-sonnet-20240620`<br>• `claude-3-7-sonnet-20250219` | Use Anthropic Computer-Use | Not Required |
- | `AgentLoop.OMNI` | • `claude-3-5-sonnet-20240620`<br>• `claude-3-7-sonnet-20250219`<br>• `gpt-4.5-preview`<br>• `gpt-4o`<br>• `gpt-4`<br>• `phi4`<br>• `phi4-mini`<br>• `gemma3`<br>• `...`<br>• `Any Ollama-compatible model` | Use OmniParser for element pixel-detection (SoM) and any VLMs for UI Grounding and Reasoning | OmniParser |
+ | `AgentLoop.OMNI` | • `claude-3-5-sonnet-20240620`<br>• `claude-3-7-sonnet-20250219`<br>• `gpt-4.5-preview`<br>• `gpt-4o`<br>• `gpt-4`<br>• `phi4`<br>• `phi4-mini`<br>• `gemma3`<br>• `...`<br>• `Any Ollama or OpenAI-compatible model` | Use OmniParser for element pixel-detection (SoM) and any VLMs for UI Grounding and Reasoning | OmniParser |

  ## AgentResponse
  The `AgentResponse` class represents the structured output returned after each agent turn. It contains the agent's response, reasoning, tool usage, and other metadata. The response format aligns with the new [OpenAI Agent SDK specification](https://platform.openai.com/docs/api-reference/responses) for better consistency across different agent loops.
@@ -181,3 +172,26 @@ async for result in agent.run(task):
  print("\nTool Call Output:")
  print(output)
  ```
+
+ ### Gradio UI
+
+ You can also interact with the agent using a Gradio interface.
+
+ ```python
+ # Ensure environment variables (e.g., API keys) are loaded
+ # You might need a helper function like load_dotenv_files() if using .env
+ # from utils import load_dotenv_files
+ # load_dotenv_files()
+
+ from agent.ui.gradio.app import create_gradio_ui
+
+ app = create_gradio_ui()
+ app.launch(share=False)
+ ```
+
+ **Note on Settings Persistence:**
+
+ * The Gradio UI automatically saves your configuration (Agent Loop, Model Choice, Custom Base URL, Save Trajectory state, Recent Images count) to a file named `.gradio_settings.json` in the project's root directory when you successfully run a task.
+ * This allows your preferences to persist between sessions.
+ * API keys entered into the custom provider field are **not** saved in this file for security reasons. Manage API keys using environment variables (e.g., `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`) or a `.env` file.
+ * It's recommended to add `.gradio_settings.json` to your `.gitignore` file.
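Read together, the README sections above (the existing `agent.run` example, the OMNI loop table, and the new local OpenAI-compatible endpoints) suggest a setup along the following lines. This is a hedged sketch, not code shipped in the package: the `ComputerAgent`, `LLM`, and `Computer` constructor arguments are assumptions inferred from this README and may differ in 0.1.26.

```python
# Hedged sketch only: wiring the OMNI loop to a local OpenAI-compatible
# server (LM Studio's default URL). The ComputerAgent/LLM/Computer keyword
# arguments below are assumptions, not verified against the 0.1.26 API.
import asyncio

from computer import Computer
from agent import AgentLoop, ComputerAgent, LLM, LLMProvider


async def main() -> None:
    computer = Computer()  # assumed default constructor
    agent = ComputerAgent(
        computer=computer,
        loop=AgentLoop.OMNI,
        model=LLM(
            provider=LLMProvider.OAICOMPAT,
            name="gemma3",  # any model exposed by your local server
            provider_base_url="http://localhost:1234/v1",  # LM Studio default
        ),
    )
    async for result in agent.run("Open a browser and search for trycua"):
        print(result)


asyncio.run(main())
```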
@@ -5,10 +5,12 @@ import asyncio
  from abc import ABC, abstractmethod
  from typing import Any, AsyncGenerator, Dict, List, Optional

+ from agent.providers.omni.parser import ParseResult
  from computer import Computer
  from .messages import StandardMessageManager, ImageRetentionConfig
  from .types import AgentResponse
  from .experiment import ExperimentManager
+ from .callbacks import CallbackManager, CallbackHandler

  logger = logging.getLogger(__name__)
@@ -27,6 +29,7 @@ class BaseLoop(ABC):
  base_dir: Optional[str] = "trajectories",
  save_trajectory: bool = True,
  only_n_most_recent_images: Optional[int] = 2,
+ callback_handlers: Optional[List[CallbackHandler]] = None,
  **kwargs,
  ):
  """Initialize base agent loop.
@@ -75,6 +78,9 @@ class BaseLoop(ABC):

  # Initialize basic tracking
  self.turn_count = 0
+
+ # Initialize callback manager
+ self.callback_manager = CallbackManager(handlers=callback_handlers or [])

  async def initialize(self) -> None:
  """Initialize both the API client and computer interface with retries."""
@@ -187,3 +193,17 @@ class BaseLoop(ABC):
  """
  if self.experiment_manager:
  self.experiment_manager.save_screenshot(img_base64, action_type)
+
+ ###########################################
+ # EVENT HOOKS / CALLBACKS
+ ###########################################
+
+ async def handle_screenshot(self, screenshot_base64: str, action_type: str = "", parsed_screen: Optional[ParseResult] = None) -> None:
+ """Process a screenshot through callback managers
+
+ Args:
+ screenshot_base64: Base64 encoded screenshot
+ action_type: Type of action that triggered the screenshot
+ """
+ if hasattr(self, 'callback_manager'):
+ await self.callback_manager.on_screenshot(screenshot_base64, action_type, parsed_screen)
@@ -6,6 +6,8 @@ from abc import ABC, abstractmethod
  from datetime import datetime
  from typing import Any, Dict, List, Optional, Protocol

+ from agent.providers.omni.parser import ParseResult
+
  logger = logging.getLogger(__name__)

  class ContentCallback(Protocol):
@@ -20,6 +22,10 @@ class APICallback(Protocol):
  """Protocol for API callbacks."""
  def __call__(self, request: Any, response: Any, error: Optional[Exception] = None) -> None: ...

+ class ScreenshotCallback(Protocol):
+ """Protocol for screenshot callbacks."""
+ def __call__(self, screenshot_base64: str, action_type: str = "") -> Optional[str]: ...
+
  class BaseCallbackManager(ABC):
  """Base class for callback managers."""
@@ -110,7 +116,20 @@ class CallbackManager:
  """
  for handler in self.handlers:
  await handler.on_error(error, **kwargs)
-
+
+ async def on_screenshot(self, screenshot_base64: str, action_type: str = "", parsed_screen: Optional[ParseResult] = None) -> None:
+ """Called when a screenshot is taken.
+
+ Args:
+ screenshot_base64: Base64 encoded screenshot
+ action_type: Type of action that triggered the screenshot
+ parsed_screen: Optional output from parsing the screenshot
+
+ Returns:
+ Modified screenshot or original if no modifications
+ """
+ for handler in self.handlers:
+ await handler.on_screenshot(screenshot_base64, action_type, parsed_screen)

  class CallbackHandler(ABC):
  """Base class for callback handlers."""
@@ -144,4 +163,40 @@ class CallbackHandler(ABC):
  error: Exception that occurred
  **kwargs: Additional data
  """
- pass
+ pass
+
+ @abstractmethod
+ async def on_screenshot(self, screenshot_base64: str, action_type: str = "", parsed_screen: Optional[ParseResult] = None) -> None:
+ """Called when a screenshot is taken.
+
+ Args:
+ screenshot_base64: Base64 encoded screenshot
+ action_type: Type of action that triggered the screenshot
+
+ Returns:
+ Optional modified screenshot
+ """
+ pass
+
+ class DefaultCallbackHandler(CallbackHandler):
+ """Default implementation of CallbackHandler with no-op methods.
+
+ This class implements all abstract methods from CallbackHandler,
+ allowing subclasses to override only the methods they need.
+ """
+
+ async def on_action_start(self, action: str, **kwargs) -> None:
+ """Default no-op implementation."""
+ pass
+
+ async def on_action_end(self, action: str, success: bool, **kwargs) -> None:
+ """Default no-op implementation."""
+ pass
+
+ async def on_error(self, error: Exception, **kwargs) -> None:
+ """Default no-op implementation."""
+ pass
+
+ async def on_screenshot(self, screenshot_base64: str, action_type: str = "") -> None:
+ """Default no-op implementation."""
+ pass
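The hunks above (the new `callback_handlers` argument on `BaseLoop.__init__`, the `CallbackManager.on_screenshot` dispatch, and `DefaultCallbackHandler`) suggest a usage pattern along these lines. This is a hedged sketch rather than code from the package; `SomeLoop` stands in for any concrete `BaseLoop` subclass and is an assumption.

```python
# Hedged sketch of hooking into the new screenshot callback.
# DefaultCallbackHandler, the callback_handlers parameter, and the
# on_screenshot(screenshot_base64, action_type, parsed_screen) dispatch all
# appear in the hunks above; "SomeLoop" is a placeholder, not a real class.
import base64
from pathlib import Path

from agent.core.callbacks import DefaultCallbackHandler


class SaveScreenshotHandler(DefaultCallbackHandler):
    """Persist every screenshot the agent reports to a local folder."""

    def __init__(self, out_dir: str = "screenshots") -> None:
        self.out_dir = Path(out_dir)
        self.out_dir.mkdir(exist_ok=True)
        self.count = 0

    # CallbackManager.on_screenshot forwards parsed_screen as a third
    # argument, so the override accepts it even though it goes unused here.
    async def on_screenshot(self, screenshot_base64: str, action_type: str = "", parsed_screen=None) -> None:
        self.count += 1
        path = self.out_dir / f"{self.count:04d}_{action_type or 'screenshot'}.png"
        path.write_bytes(base64.b64decode(screenshot_base64))


# A concrete loop would then receive the handler through the new argument:
# loop = SomeLoop(..., callback_handlers=[SaveScreenshotHandler()])
```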
@@ -3,23 +3,33 @@ import httpx
  from anthropic.types.beta import BetaContentBlockParam
  from ..tools import ToolResult

+
  class APICallback(Protocol):
  """Protocol for API callbacks."""
- def __call__(self, request: httpx.Request | None,
- response: httpx.Response | object | None,
- error: Exception | None) -> None: ...
+
+ def __call__(
+ self,
+ request: httpx.Request | None,
+ response: httpx.Response | object | None,
+ error: Exception | None,
+ ) -> None: ...
+

  class ContentCallback(Protocol):
  """Protocol for content callbacks."""
+
  def __call__(self, content: BetaContentBlockParam) -> None: ...

+
  class ToolCallback(Protocol):
  """Protocol for tool callbacks."""
+
  def __call__(self, result: ToolResult, tool_id: str) -> None: ...

+
  class CallbackManager:
  """Manages various callbacks for the agent system."""
-
+
  def __init__(
  self,
  content_callback: ContentCallback,
@@ -27,7 +37,7 @@ class CallbackManager:
  api_callback: APICallback,
  ):
  """Initialize the callback manager.
-
+
  Args:
  content_callback: Callback for content updates
  tool_callback: Callback for tool execution results
@@ -36,20 +46,20 @@ class CallbackManager:
  self.content_callback = content_callback
  self.tool_callback = tool_callback
  self.api_callback = api_callback
-
+
  def on_content(self, content: BetaContentBlockParam) -> None:
  """Handle content updates."""
  self.content_callback(content)
-
+
  def on_tool_result(self, result: ToolResult, tool_id: str) -> None:
  """Handle tool execution results."""
  self.tool_callback(result, tool_id)
-
+
  def on_api_interaction(
  self,
  request: httpx.Request | None,
  response: httpx.Response | object | None,
- error: Exception | None
+ error: Exception | None,
  ) -> None:
  """Handle API interactions."""
- self.api_callback(request, response, error)
+ self.api_callback(request, response, error)
@@ -45,8 +45,8 @@ class OAICompatClient(BaseOmniClient):
  max_tokens: Maximum tokens to generate
  temperature: Generation temperature
  """
- super().__init__(api_key="EMPTY", model=model)
- self.api_key = "EMPTY" # Local endpoints typically don't require an API key
+ super().__init__(api_key=api_key or "EMPTY", model=model)
+ self.api_key = api_key or "EMPTY" # Local endpoints typically don't require an API key
  self.model = model
  self.provider_base_url = (
  provider_base_url or "http://localhost:8000/v1"
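With the `api_key` passthrough above, the OpenAI-compatible client can authenticate against hosted endpoints instead of always sending `"EMPTY"`. A hedged construction sketch follows, using only the keyword arguments visible in this diff (`api_key`, `model`, `provider_base_url`); any other `OAICompatClient` parameters are not shown here, and the key and URL values are placeholders.

```python
# Hedged sketch: only the keyword arguments visible in the hunk above are
# used; everything else about the constructor is an assumption.
from agent.providers.omni.clients.oaicompat import OAICompatClient

# Hosted OpenAI-compatible endpoint: the key is now forwarded instead of "EMPTY".
hosted = OAICompatClient(
    api_key="sk-...",  # placeholder key
    model="gemma3",
    provider_base_url="https://my-endpoint.example.com/v1",  # placeholder URL
)

# Local endpoint: api_key=None falls back to "EMPTY", as before.
local = OAICompatClient(
    api_key=None,
    model="gemma3",
    provider_base_url="http://localhost:1234/v1",  # e.g. LM Studio
)
```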
@@ -146,10 +146,18 @@ class OAICompatClient(BaseOmniClient):
  base_url = self.provider_base_url or "http://localhost:8000/v1"

  # Check if the base URL already includes the chat/completions endpoint
+
  endpoint_url = base_url
  if not endpoint_url.endswith("/chat/completions"):
+ # If URL is RunPod format, make it OpenAI compatible
+ if endpoint_url.startswith("https://api.runpod.ai/v2/"):
+ # Extract RunPod endpoint ID
+ parts = endpoint_url.split("/")
+ if len(parts) >= 5:
+ runpod_id = parts[4]
+ endpoint_url = f"https://api.runpod.ai/v2/{runpod_id}/openai/v1/chat/completions"
  # If the URL ends with /v1, append /chat/completions
- if endpoint_url.endswith("/v1"):
+ elif endpoint_url.endswith("/v1"):
  endpoint_url = f"{endpoint_url}/chat/completions"
  # If the URL doesn't end with /v1, make sure it has a proper structure
  elif not endpoint_url.endswith("/"):
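To make the new RunPod branch easier to follow, here is the same URL normalization re-expressed as a standalone function. Only the branches visible in the hunk are reproduced; the handling of URLs that end in neither `/chat/completions` nor `/v1` is truncated in the diff and therefore omitted, and the function name is mine, not the package's.

```python
# Standalone re-expression of the endpoint-URL normalization shown above.
def normalize_endpoint_url(base_url: str) -> str:
    endpoint_url = base_url
    if endpoint_url.endswith("/chat/completions"):
        return endpoint_url
    # RunPod-style URL: https://api.runpod.ai/v2/<endpoint-id>
    if endpoint_url.startswith("https://api.runpod.ai/v2/"):
        parts = endpoint_url.split("/")
        if len(parts) >= 5:
            runpod_id = parts[4]
            return f"https://api.runpod.ai/v2/{runpod_id}/openai/v1/chat/completions"
    # Plain OpenAI-compatible base URL ending in /v1
    elif endpoint_url.endswith("/v1"):
        return f"{endpoint_url}/chat/completions"
    return endpoint_url


# "https://api.runpod.ai/v2/abc123" -> ".../v2/abc123/openai/v1/chat/completions"
# "http://localhost:1234/v1"        -> "http://localhost:1234/v1/chat/completions"
```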
@@ -147,7 +147,7 @@ class OmniLoop(BaseLoop):
  )
  elif self.provider == LLMProvider.OAICOMPAT:
  self.client = OAICompatClient(
- api_key="EMPTY", # Local endpoints typically don't require an API key
+ api_key=self.api_key or "EMPTY", # Local endpoints typically don't require an API key
  model=self.model,
  provider_base_url=self.provider_base_url,
  )
@@ -183,7 +183,7 @@ class OmniLoop(BaseLoop):
  )
  elif self.provider == LLMProvider.OAICOMPAT:
  self.client = OAICompatClient(
- api_key="EMPTY", # Local endpoints typically don't require an API key
+ api_key=self.api_key or "EMPTY", # Local endpoints typically don't require an API key
  model=self.model,
  provider_base_url=self.provider_base_url,
  )
@@ -443,6 +443,8 @@ class OmniLoop(BaseLoop):
  except (json.JSONDecodeError, IndexError):
  try:
  # Look for JSON object pattern
+ import re # Local import to ensure availability
+
  json_pattern = r"\{[^}]+\}"
  json_match = re.search(json_pattern, raw_text)
  if json_match:
@@ -453,8 +455,20 @@ class OmniLoop(BaseLoop):
  logger.error(f"No JSON found in content")
  return True, action_screenshot_saved
  except json.JSONDecodeError as e:
- logger.error(f"Failed to parse JSON from text: {str(e)}")
- return True, action_screenshot_saved
+ # Try to sanitize the JSON string and retry
+ try:
+ # Remove or replace invalid control characters
+ import re # Local import to ensure availability
+
+ sanitized_text = re.sub(r"[\x00-\x1F\x7F]", "", raw_text)
+ # Try parsing again with sanitized text
+ parsed_content = json.loads(sanitized_text)
+ logger.info(
+ "Successfully parsed JSON after sanitizing control characters"
+ )
+ except json.JSONDecodeError:
+ logger.error(f"Failed to parse JSON from text: {str(e)}")
+ return True, action_screenshot_saved

  # Step 4: Process the parsed content if available
  if parsed_content:
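The sanitize-and-retry fallback above boils down to stripping ASCII control characters before a second `json.loads` attempt. A self-contained illustration of that idea follows; the helper name is hypothetical, while the control-character regex matches the one in the diff.

```python
# Illustration of the sanitize-and-retry idea used in the hunk above.
import json
import re
from typing import Any, Optional


def parse_json_lenient(raw_text: str) -> Optional[Any]:
    """json.loads with a retry that strips ASCII control characters first."""
    try:
        return json.loads(raw_text)
    except json.JSONDecodeError:
        # Drop control characters (0x00-0x1F and DEL) that some models emit
        # inside string literals, which makes the payload invalid JSON.
        sanitized = re.sub(r"[\x00-\x1F\x7F]", "", raw_text)
        try:
            return json.loads(sanitized)
        except json.JSONDecodeError:
            return None


# A raw newline inside a JSON string is rejected by strict json.loads but
# parses once the control character has been stripped out.
print(parse_json_lenient('{"action": "type", "text": "hello\nworld"}'))
```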
@@ -534,6 +548,10 @@ class OmniLoop(BaseLoop):
  img_data = parsed_screen.annotated_image_base64
  if "," in img_data:
  img_data = img_data.split(",")[1]
+
+ # Process screenshot through hooks and save if needed
+ await self.handle_screenshot(img_data, action_type="state", parsed_screen=parsed_screen)
+
  # Save with a generic "state" action type to indicate this is the current screen state
  self._save_screenshot(img_data, action_type="state")
  except Exception as e:
@@ -649,6 +667,8 @@ class OmniLoop(BaseLoop):
  response=response,
  messages=self.message_manager.messages,
  model=self.model,
+ parsed_screen=parsed_screen,
+ parser=self.parser
  )

  # Yield the response to the caller
@@ -194,8 +194,13 @@ class OpenAILoop(BaseLoop):
  # Convert to base64 if needed
  if isinstance(screenshot, bytes):
  screenshot_base64 = base64.b64encode(screenshot).decode("utf-8")
+ elif isinstance(screenshot, (bytearray, memoryview)):
+ screenshot_base64 = base64.b64encode(screenshot).decode("utf-8")
  else:
- screenshot_base64 = screenshot
+ screenshot_base64 = str(screenshot)
+
+ # Emit screenshot callbacks
+ await self.handle_screenshot(screenshot_base64, action_type="initial_state")

  # Save screenshot if requested
  if self.save_trajectory:
@@ -204,8 +209,6 @@ class OpenAILoop(BaseLoop):
  logger.warning(
  "Converting non-string screenshot_base64 to string for _save_screenshot"
  )
- if isinstance(screenshot_base64, (bytearray, memoryview)):
- screenshot_base64 = base64.b64encode(screenshot_base64).decode("utf-8")
  self._save_screenshot(screenshot_base64, action_type="state")
  logger.info("Screenshot saved to trajectory")
@@ -336,8 +339,14 @@ class OpenAILoop(BaseLoop):
  screenshot = await self.computer.interface.screenshot()
  if isinstance(screenshot, bytes):
  screenshot_base64 = base64.b64encode(screenshot).decode("utf-8")
+ elif isinstance(screenshot, (bytearray, memoryview)):
+ screenshot_base64 = base64.b64encode(bytes(screenshot)).decode("utf-8")
  else:
- screenshot_base64 = screenshot
+ screenshot_base64 = str(screenshot)
+
+ # Process screenshot through hooks
+ action_type = f"after_{action.get('type', 'action')}"
+ await self.handle_screenshot(screenshot_base64, action_type=action_type)

  # Create computer_call_output
  computer_call_output = {
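The two screenshot hunks above repeat the same normalization: whatever the interface returns is coerced into a base64 string before it reaches the new hooks and trajectory saving. A small standalone sketch of that coercion follows; the helper name is mine, while the type handling mirrors the diff.

```python
# Standalone sketch of the screenshot coercion repeated in the hunks above.
import base64


def to_base64_str(screenshot) -> str:
    """Coerce raw screenshot data (bytes-like or already-encoded str) to base64 text."""
    if isinstance(screenshot, bytes):
        return base64.b64encode(screenshot).decode("utf-8")
    if isinstance(screenshot, (bytearray, memoryview)):
        # bytes() copies into an immutable buffer before encoding
        return base64.b64encode(bytes(screenshot)).decode("utf-8")
    # Anything else (typically an already base64-encoded str) is stringified
    return str(screenshot)


assert to_base64_str(b"\x89PNG") == base64.b64encode(b"\x89PNG").decode("utf-8")
assert to_base64_str(bytearray(b"\x89PNG")) == to_base64_str(b"\x89PNG")
```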