cua-agent 0.1.22.tar.gz → 0.1.24.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of cua-agent has been flagged as possibly problematic.
Files changed (77)
  1. {cua_agent-0.1.22 → cua_agent-0.1.24}/PKG-INFO +74 -2
  2. {cua_agent-0.1.22 → cua_agent-0.1.24}/README.md +68 -1
  3. {cua_agent-0.1.22 → cua_agent-0.1.24}/agent/__init__.py +1 -1
  4. {cua_agent-0.1.22 → cua_agent-0.1.24}/agent/core/agent.py +9 -4
  5. {cua_agent-0.1.22 → cua_agent-0.1.24}/agent/core/factory.py +3 -5
  6. {cua_agent-0.1.22 → cua_agent-0.1.24}/agent/core/provider_config.py +4 -2
  7. {cua_agent-0.1.22 → cua_agent-0.1.24}/agent/core/types.py +41 -1
  8. {cua_agent-0.1.22 → cua_agent-0.1.24}/agent/providers/omni/__init__.py +1 -1
  9. cua_agent-0.1.24/agent/providers/omni/clients/oaicompat.py +177 -0
  10. {cua_agent-0.1.22 → cua_agent-0.1.24}/agent/providers/omni/loop.py +25 -1
  11. {cua_agent-0.1.22 → cua_agent-0.1.24}/agent/providers/omni/tools/manager.py +1 -1
  12. cua_agent-0.1.24/agent/ui/__init__.py +1 -0
  13. cua_agent-0.1.24/agent/ui/gradio/__init__.py +21 -0
  14. cua_agent-0.1.24/agent/ui/gradio/app.py +872 -0
  15. {cua_agent-0.1.22 → cua_agent-0.1.24}/pyproject.toml +9 -3
  16. cua_agent-0.1.22/agent/core/README.md +0 -101
  17. cua_agent-0.1.22/agent/providers/omni/types.py +0 -47
  18. {cua_agent-0.1.22 → cua_agent-0.1.24}/agent/core/__init__.py +0 -0
  19. {cua_agent-0.1.22 → cua_agent-0.1.24}/agent/core/base.py +0 -0
  20. {cua_agent-0.1.22 → cua_agent-0.1.24}/agent/core/callbacks.py +0 -0
  21. {cua_agent-0.1.22 → cua_agent-0.1.24}/agent/core/experiment.py +0 -0
  22. {cua_agent-0.1.22 → cua_agent-0.1.24}/agent/core/messages.py +0 -0
  23. {cua_agent-0.1.22 → cua_agent-0.1.24}/agent/core/telemetry.py +0 -0
  24. {cua_agent-0.1.22 → cua_agent-0.1.24}/agent/core/tools/__init__.py +0 -0
  25. {cua_agent-0.1.22 → cua_agent-0.1.24}/agent/core/tools/base.py +0 -0
  26. {cua_agent-0.1.22 → cua_agent-0.1.24}/agent/core/tools/bash.py +0 -0
  27. {cua_agent-0.1.22 → cua_agent-0.1.24}/agent/core/tools/collection.py +0 -0
  28. {cua_agent-0.1.22 → cua_agent-0.1.24}/agent/core/tools/computer.py +0 -0
  29. {cua_agent-0.1.22 → cua_agent-0.1.24}/agent/core/tools/edit.py +0 -0
  30. {cua_agent-0.1.22 → cua_agent-0.1.24}/agent/core/tools/manager.py +0 -0
  31. {cua_agent-0.1.22 → cua_agent-0.1.24}/agent/core/tools.py +0 -0
  32. {cua_agent-0.1.22 → cua_agent-0.1.24}/agent/core/visualization.py +0 -0
  33. {cua_agent-0.1.22 → cua_agent-0.1.24}/agent/providers/__init__.py +0 -0
  34. {cua_agent-0.1.22 → cua_agent-0.1.24}/agent/providers/anthropic/__init__.py +0 -0
  35. {cua_agent-0.1.22 → cua_agent-0.1.24}/agent/providers/anthropic/api/client.py +0 -0
  36. {cua_agent-0.1.22 → cua_agent-0.1.24}/agent/providers/anthropic/api/logging.py +0 -0
  37. {cua_agent-0.1.22 → cua_agent-0.1.24}/agent/providers/anthropic/api_handler.py +0 -0
  38. {cua_agent-0.1.22 → cua_agent-0.1.24}/agent/providers/anthropic/callbacks/__init__.py +0 -0
  39. {cua_agent-0.1.22 → cua_agent-0.1.24}/agent/providers/anthropic/callbacks/manager.py +0 -0
  40. {cua_agent-0.1.22 → cua_agent-0.1.24}/agent/providers/anthropic/loop.py +0 -0
  41. {cua_agent-0.1.22 → cua_agent-0.1.24}/agent/providers/anthropic/prompts.py +0 -0
  42. {cua_agent-0.1.22 → cua_agent-0.1.24}/agent/providers/anthropic/response_handler.py +0 -0
  43. {cua_agent-0.1.22 → cua_agent-0.1.24}/agent/providers/anthropic/tools/__init__.py +0 -0
  44. {cua_agent-0.1.22 → cua_agent-0.1.24}/agent/providers/anthropic/tools/base.py +0 -0
  45. {cua_agent-0.1.22 → cua_agent-0.1.24}/agent/providers/anthropic/tools/bash.py +0 -0
  46. {cua_agent-0.1.22 → cua_agent-0.1.24}/agent/providers/anthropic/tools/collection.py +0 -0
  47. {cua_agent-0.1.22 → cua_agent-0.1.24}/agent/providers/anthropic/tools/computer.py +0 -0
  48. {cua_agent-0.1.22 → cua_agent-0.1.24}/agent/providers/anthropic/tools/edit.py +0 -0
  49. {cua_agent-0.1.22 → cua_agent-0.1.24}/agent/providers/anthropic/tools/manager.py +0 -0
  50. {cua_agent-0.1.22 → cua_agent-0.1.24}/agent/providers/anthropic/tools/run.py +0 -0
  51. {cua_agent-0.1.22 → cua_agent-0.1.24}/agent/providers/anthropic/types.py +0 -0
  52. {cua_agent-0.1.22 → cua_agent-0.1.24}/agent/providers/anthropic/utils.py +0 -0
  53. {cua_agent-0.1.22 → cua_agent-0.1.24}/agent/providers/omni/api_handler.py +0 -0
  54. {cua_agent-0.1.22 → cua_agent-0.1.24}/agent/providers/omni/clients/anthropic.py +0 -0
  55. {cua_agent-0.1.22 → cua_agent-0.1.24}/agent/providers/omni/clients/base.py +0 -0
  56. {cua_agent-0.1.22 → cua_agent-0.1.24}/agent/providers/omni/clients/ollama.py +0 -0
  57. {cua_agent-0.1.22 → cua_agent-0.1.24}/agent/providers/omni/clients/openai.py +0 -0
  58. {cua_agent-0.1.22 → cua_agent-0.1.24}/agent/providers/omni/clients/utils.py +0 -0
  59. {cua_agent-0.1.22 → cua_agent-0.1.24}/agent/providers/omni/image_utils.py +0 -0
  60. {cua_agent-0.1.22 → cua_agent-0.1.24}/agent/providers/omni/parser.py +0 -0
  61. {cua_agent-0.1.22 → cua_agent-0.1.24}/agent/providers/omni/prompts.py +0 -0
  62. {cua_agent-0.1.22 → cua_agent-0.1.24}/agent/providers/omni/tools/__init__.py +0 -0
  63. {cua_agent-0.1.22 → cua_agent-0.1.24}/agent/providers/omni/tools/base.py +0 -0
  64. {cua_agent-0.1.22 → cua_agent-0.1.24}/agent/providers/omni/tools/bash.py +0 -0
  65. {cua_agent-0.1.22 → cua_agent-0.1.24}/agent/providers/omni/tools/computer.py +0 -0
  66. {cua_agent-0.1.22 → cua_agent-0.1.24}/agent/providers/omni/utils.py +0 -0
  67. {cua_agent-0.1.22 → cua_agent-0.1.24}/agent/providers/openai/__init__.py +0 -0
  68. {cua_agent-0.1.22 → cua_agent-0.1.24}/agent/providers/openai/api_handler.py +0 -0
  69. {cua_agent-0.1.22 → cua_agent-0.1.24}/agent/providers/openai/loop.py +0 -0
  70. {cua_agent-0.1.22 → cua_agent-0.1.24}/agent/providers/openai/response_handler.py +0 -0
  71. {cua_agent-0.1.22 → cua_agent-0.1.24}/agent/providers/openai/tools/__init__.py +0 -0
  72. {cua_agent-0.1.22 → cua_agent-0.1.24}/agent/providers/openai/tools/base.py +0 -0
  73. {cua_agent-0.1.22 → cua_agent-0.1.24}/agent/providers/openai/tools/computer.py +0 -0
  74. {cua_agent-0.1.22 → cua_agent-0.1.24}/agent/providers/openai/tools/manager.py +0 -0
  75. {cua_agent-0.1.22 → cua_agent-0.1.24}/agent/providers/openai/types.py +0 -0
  76. {cua_agent-0.1.22 → cua_agent-0.1.24}/agent/providers/openai/utils.py +0 -0
  77. {cua_agent-0.1.22 → cua_agent-0.1.24}/agent/telemetry.py +0 -0

{cua_agent-0.1.22 → cua_agent-0.1.24}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: cua-agent
- Version: 0.1.22
+ Version: 0.1.24
  Summary: CUA (Computer Use) Agent for AI-driven computer interaction
  Author-Email: TryCua <gh@trycua.com>
  Requires-Python: <3.13,>=3.10
@@ -21,6 +21,9 @@ Requires-Dist: boto3<2.0.0,>=1.35.81; extra == "anthropic"
  Provides-Extra: openai
  Requires-Dist: openai<2.0.0,>=1.14.0; extra == "openai"
  Requires-Dist: httpx<0.29.0,>=0.27.0; extra == "openai"
+ Provides-Extra: ui
+ Requires-Dist: gradio<6.0.0,>=5.23.3; extra == "ui"
+ Requires-Dist: python-dotenv<2.0.0,>=1.0.1; extra == "ui"
  Provides-Extra: som
  Requires-Dist: torch>=2.2.1; extra == "som"
  Requires-Dist: torchvision>=0.17.1; extra == "som"
@@ -59,6 +62,8 @@ Requires-Dist: groq<0.5.0,>=0.4.0; extra == "all"
  Requires-Dist: dashscope<2.0.0,>=1.13.0; extra == "all"
  Requires-Dist: requests<3.0.0,>=2.31.0; extra == "all"
  Requires-Dist: ollama<0.5.0,>=0.4.7; extra == "all"
+ Requires-Dist: gradio<6.0.0,>=5.23.3; extra == "all"
+ Requires-Dist: python-dotenv<2.0.0,>=1.0.1; extra == "all"
  Description-Content-Type: text/markdown
  
  <div align="center">
@@ -95,6 +100,7 @@ pip install "cua-agent[all]"
  pip install "cua-agent[openai]" # OpenAI Cua Loop
  pip install "cua-agent[anthropic]" # Anthropic Cua Loop
  pip install "cua-agent[omni]" # Cua Loop based on OmniParser (includes Ollama for local models)
+ pip install "cua-agent[ui]" # Gradio UI for the agent
  ```
  
  ## Run
@@ -106,6 +112,12 @@ async with Computer() as macos_computer:
  computer=macos_computer,
  loop=AgentLoop.OPENAI,
  model=LLM(provider=LLMProvider.OPENAI)
+ # or
+ # loop=AgentLoop.ANTHROPIC,
+ # model=LLM(provider=LLMProvider.ANTHROPIC)
+ # or
+ # loop=AgentLoop.OMNI,
+ # model=LLM(provider=LLMProvider.OLLAMA, name="gemma3")
  )
  
  tasks = [
@@ -129,6 +141,66 @@ Refer to these notebooks for step-by-step guides on how to use the Computer-Use
  
  - [Agent Notebook](../../notebooks/agent_nb.ipynb) - Complete examples and workflows
  
+ ## Using the Gradio UI
+ 
+ The agent includes a Gradio-based user interface for easy interaction. To use it:
+ 
+ ```bash
+ # Install with Gradio support
+ pip install "cua-agent[ui]"
+ ```
+ 
+ Create a simple launcher script:
+ 
+ ```python
+ # launch_ui.py
+ from agent.ui.gradio.app import create_gradio_ui
+ 
+ app = create_gradio_ui()
+ app.launch(share=False)
+ ```
+ 
+ ```bash
+ # Run the launcher
+ python launch_ui.py
+ ```
+ 
+ ### Setting up API Keys
+ 
+ For the Gradio UI to show available models, you need to set API keys as environment variables:
+ 
+ ```bash
+ # For OpenAI models
+ export OPENAI_API_KEY=your_openai_key_here
+ 
+ # For Anthropic models
+ export ANTHROPIC_API_KEY=your_anthropic_key_here
+ 
+ # Launch with both keys set
+ OPENAI_API_KEY=your_key ANTHROPIC_API_KEY=your_key python launch_ui.py
+ ```
+ 
+ Without these environment variables, the UI will show "No models available" for the corresponding providers, but you can still use local models with the OMNI loop provider.
+ 
+ The Gradio UI provides:
+ - Selection of different agent loops (OpenAI, Anthropic, OMNI)
+ - Model selection for each provider
+ - Configuration of agent parameters
+ - Chat interface for interacting with the agent
+ 
+ You can also embed the Gradio UI in your own application:
+ 
+ ```python
+ # Import directly in your application
+ from agent.ui.gradio.app import create_gradio_ui
+ 
+ # Create the UI with advanced features
+ demo = create_gradio_ui()
+ demo.launch()
+ 
+ # Or for a simpler interface
+ from agent.ui.gradio import registry
+ demo = registry(name='cua:gpt-4o')
+ demo.launch()
+ ```
+ 
  ## Agent Loops
  
  The `cua-agent` package provides three agent loop variations, based on different CUA model providers and techniques:
@@ -137,7 +209,7 @@ The cua-agent package provides three agent loops variations, based on differen
  |:-----------|:-----------------|:------------|:-------------|
  | `AgentLoop.OPENAI` | • `computer_use_preview` | Use OpenAI Operator CUA model | Not Required |
  | `AgentLoop.ANTHROPIC` | • `claude-3-5-sonnet-20240620`<br>• `claude-3-7-sonnet-20250219` | Use Anthropic Computer-Use | Not Required |
- | `AgentLoop.OMNI` <br>(experimental) | • `claude-3-5-sonnet-20240620`<br>• `claude-3-7-sonnet-20250219`<br>• `gpt-4.5-preview`<br>• `gpt-4o`<br>• `gpt-4` | Use OmniParser for element pixel-detection (SoM) and any VLMs for UI Grounding and Reasoning | OmniParser |
+ | `AgentLoop.OMNI` | • `claude-3-5-sonnet-20240620`<br>• `claude-3-7-sonnet-20250219`<br>• `gpt-4.5-preview`<br>• `gpt-4o`<br>• `gpt-4`<br>• `phi4`<br>• `phi4-mini`<br>• `gemma3`<br>• `...`<br>• `Any Ollama-compatible model` | Use OmniParser for element pixel-detection (SoM) and any VLMs for UI Grounding and Reasoning | OmniParser |
  
  ## AgentResponse
  The `AgentResponse` class represents the structured output returned after each agent turn. It contains the agent's response, reasoning, tool usage, and other metadata. The response format aligns with the new [OpenAI Agent SDK specification](https://platform.openai.com/docs/api-reference/responses) for better consistency across different agent loops.

{cua_agent-0.1.22 → cua_agent-0.1.24}/README.md

@@ -32,6 +32,7 @@ pip install "cua-agent[all]"
  pip install "cua-agent[openai]" # OpenAI Cua Loop
  pip install "cua-agent[anthropic]" # Anthropic Cua Loop
  pip install "cua-agent[omni]" # Cua Loop based on OmniParser (includes Ollama for local models)
+ pip install "cua-agent[ui]" # Gradio UI for the agent
  ```
  
  ## Run
@@ -43,6 +44,12 @@ async with Computer() as macos_computer:
  computer=macos_computer,
  loop=AgentLoop.OPENAI,
  model=LLM(provider=LLMProvider.OPENAI)
+ # or
+ # loop=AgentLoop.ANTHROPIC,
+ # model=LLM(provider=LLMProvider.ANTHROPIC)
+ # or
+ # loop=AgentLoop.OMNI,
+ # model=LLM(provider=LLMProvider.OLLAMA, name="gemma3")
  )
  
  tasks = [
@@ -66,6 +73,66 @@ Refer to these notebooks for step-by-step guides on how to use the Computer-Use
  
  - [Agent Notebook](../../notebooks/agent_nb.ipynb) - Complete examples and workflows
  
+ ## Using the Gradio UI
+ 
+ The agent includes a Gradio-based user interface for easy interaction. To use it:
+ 
+ ```bash
+ # Install with Gradio support
+ pip install "cua-agent[ui]"
+ ```
+ 
+ Create a simple launcher script:
+ 
+ ```python
+ # launch_ui.py
+ from agent.ui.gradio.app import create_gradio_ui
+ 
+ app = create_gradio_ui()
+ app.launch(share=False)
+ ```
+ 
+ ```bash
+ # Run the launcher
+ python launch_ui.py
+ ```
+ 
+ ### Setting up API Keys
+ 
+ For the Gradio UI to show available models, you need to set API keys as environment variables:
+ 
+ ```bash
+ # For OpenAI models
+ export OPENAI_API_KEY=your_openai_key_here
+ 
+ # For Anthropic models
+ export ANTHROPIC_API_KEY=your_anthropic_key_here
+ 
+ # Launch with both keys set
+ OPENAI_API_KEY=your_key ANTHROPIC_API_KEY=your_key python launch_ui.py
+ ```
+ 
+ Without these environment variables, the UI will show "No models available" for the corresponding providers, but you can still use local models with the OMNI loop provider.
+ 
+ The Gradio UI provides:
+ - Selection of different agent loops (OpenAI, Anthropic, OMNI)
+ - Model selection for each provider
+ - Configuration of agent parameters
+ - Chat interface for interacting with the agent
+ 
+ You can also embed the Gradio UI in your own application:
+ 
+ ```python
+ # Import directly in your application
+ from agent.ui.gradio.app import create_gradio_ui
+ 
+ # Create the UI with advanced features
+ demo = create_gradio_ui()
+ demo.launch()
+ 
+ # Or for a simpler interface
+ from agent.ui.gradio import registry
+ demo = registry(name='cua:gpt-4o')
+ demo.launch()
+ ```
+ 
  ## Agent Loops
  
  The `cua-agent` package provides three agent loop variations, based on different CUA model providers and techniques:
@@ -74,7 +141,7 @@ The cua-agent package provides three agent loops variations, based on differen
  |:-----------|:-----------------|:------------|:-------------|
  | `AgentLoop.OPENAI` | • `computer_use_preview` | Use OpenAI Operator CUA model | Not Required |
  | `AgentLoop.ANTHROPIC` | • `claude-3-5-sonnet-20240620`<br>• `claude-3-7-sonnet-20250219` | Use Anthropic Computer-Use | Not Required |
- | `AgentLoop.OMNI` <br>(experimental) | • `claude-3-5-sonnet-20240620`<br>• `claude-3-7-sonnet-20250219`<br>• `gpt-4.5-preview`<br>• `gpt-4o`<br>• `gpt-4` | Use OmniParser for element pixel-detection (SoM) and any VLMs for UI Grounding and Reasoning | OmniParser |
+ | `AgentLoop.OMNI` | • `claude-3-5-sonnet-20240620`<br>• `claude-3-7-sonnet-20250219`<br>• `gpt-4.5-preview`<br>• `gpt-4o`<br>• `gpt-4`<br>• `phi4`<br>• `phi4-mini`<br>• `gemma3`<br>• `...`<br>• `Any Ollama-compatible model` | Use OmniParser for element pixel-detection (SoM) and any VLMs for UI Grounding and Reasoning | OmniParser |
  
  ## AgentResponse
  The `AgentResponse` class represents the structured output returned after each agent turn. It contains the agent's response, reasoning, tool usage, and other metadata. The response format aligns with the new [OpenAI Agent SDK specification](https://platform.openai.com/docs/api-reference/responses) for better consistency across different agent loops.
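
Taken together, the README additions above and the new `OAICOMPAT` provider (introduced further down in `agent/core/types.py` and `agent/providers/omni/clients/oaicompat.py`) let the OMNI loop target any OpenAI-compatible endpoint. A minimal sketch, assuming the `ComputerAgent` usage shown in the README snippet and an async-generator `run()` method; the local vLLM URL, model name, and task text are illustrative:

```python
# Sketch only: endpoint URL, model name, and task are placeholders.
import asyncio

from computer import Computer
from agent import ComputerAgent, AgentLoop, LLM, LLMProvider


async def main():
    async with Computer() as macos_computer:
        agent = ComputerAgent(
            computer=macos_computer,
            loop=AgentLoop.OMNI,
            model=LLM(
                provider=LLMProvider.OAICOMPAT,
                name="Qwen2.5-VL-7B-Instruct",
                provider_base_url="http://localhost:8000/v1",  # e.g. a local vLLM server
            ),
        )
        # Assumes run() yields AgentResponse dicts, as described above.
        async for response in agent.run("Open Safari and search for trycua"):
            print(response)


asyncio.run(main())
```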

{cua_agent-0.1.22 → cua_agent-0.1.24}/agent/__init__.py

@@ -48,7 +48,7 @@ except Exception as e:
  # Other issues with telemetry
  logger.warning(f"Error initializing telemetry: {e}")
  
- from .providers.omni.types import LLMProvider, LLM
+ from .core.types import LLMProvider, LLM
  from .core.factory import AgentLoop
  from .core.agent import ComputerAgent
  

{cua_agent-0.1.22 → cua_agent-0.1.24}/agent/core/agent.py

@@ -6,8 +6,7 @@ import os
  from typing import AsyncGenerator, Optional
  
  from computer import Computer
- from ..providers.omni.types import LLM
- from .. import AgentLoop
+ from .types import LLM, AgentLoop
  from .types import AgentResponse
  from .factory import LoopFactory
  from .provider_config import DEFAULT_MODELS, ENV_VARS
@@ -75,6 +74,7 @@ class ComputerAgent:
  # Use the provided LLM object
  self.provider = model.provider
  actual_model_name = model.name or DEFAULT_MODELS.get(self.provider, "")
+ self.provider_base_url = getattr(model, "provider_base_url", None)
  
  # Ensure we have a valid model name
  if not actual_model_name:
@@ -86,8 +86,12 @@
  
  # Get API key from environment if not provided
  actual_api_key = api_key or os.environ.get(ENV_VARS[self.provider], "")
- # Ollama is local and doesn't require an API key
- if not actual_api_key and str(self.provider) != "ollama":
+ # Ollama and OpenAI-compatible APIs typically don't require an API key
+ if (
+ not actual_api_key
+ and str(self.provider) not in ["ollama", "oaicompat"]
+ and ENV_VARS[self.provider] != "none"
+ ):
  raise ValueError(f"No API key provided for {self.provider}")
  
  # Create the appropriate loop using the factory
@@ -102,6 +106,7 @@
  save_trajectory=save_trajectory,
  trajectory_dir=trajectory_dir,
  only_n_most_recent_images=only_n_most_recent_images,
+ provider_base_url=self.provider_base_url,
  )
  except ValueError as e:
  logger.error(f"Failed to create loop: {str(e)}")

{cua_agent-0.1.22 → cua_agent-0.1.24}/agent/core/factory.py

@@ -8,10 +8,6 @@ from computer import Computer
  from .types import AgentLoop
  from .base import BaseLoop
  
- # For type checking only
- if TYPE_CHECKING:
- from ..providers.omni.types import LLMProvider
- 
  logger = logging.getLogger(__name__)
  
  
@@ -33,6 +29,7 @@ class LoopFactory:
  trajectory_dir: str = "trajectories",
  only_n_most_recent_images: Optional[int] = None,
  acknowledge_safety_check_callback: Optional[Callable[[str], Awaitable[bool]]] = None,
+ provider_base_url: Optional[str] = None,
  ) -> BaseLoop:
  """Create and return an appropriate loop instance based on type."""
  if loop_type == AgentLoop.ANTHROPIC:
@@ -77,7 +74,7 @@
  try:
  from ..providers.omni.loop import OmniLoop
  from ..providers.omni.parser import OmniParser
- from ..providers.omni.types import LLMProvider
+ from .types import LLMProvider
  except ImportError:
  raise ImportError(
  "The 'omni' provider is not installed. "
@@ -99,6 +96,7 @@
  base_dir=trajectory_dir,
  only_n_most_recent_images=only_n_most_recent_images,
  parser=OmniParser(),
+ provider_base_url=provider_base_url,
  )
  else:
  raise ValueError(f"Unsupported loop type: {loop_type}")

{cua_agent-0.1.22 → cua_agent-0.1.24}/agent/core/provider_config.py

@@ -1,17 +1,19 @@
  """Provider-specific configurations and constants."""
  
- from ..providers.omni.types import LLMProvider
+ from .types import LLMProvider
  
  # Default models for different providers
  DEFAULT_MODELS = {
  LLMProvider.OPENAI: "gpt-4o",
  LLMProvider.ANTHROPIC: "claude-3-7-sonnet-20250219",
  LLMProvider.OLLAMA: "gemma3:4b-it-q4_K_M",
+ LLMProvider.OAICOMPAT: "Qwen2.5-VL-7B-Instruct",
  }
  
  # Map providers to their environment variable names
  ENV_VARS = {
  LLMProvider.OPENAI: "OPENAI_API_KEY",
  LLMProvider.ANTHROPIC: "ANTHROPIC_API_KEY",
- LLMProvider.OLLAMA: "OLLAMA_API_KEY",
+ LLMProvider.OLLAMA: "none",
+ LLMProvider.OAICOMPAT: "none",  # OpenAI-compatible API typically doesn't require an API key
  }
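
The `"none"` sentinel in `ENV_VARS` works together with the relaxed API-key check shown in the `agent/core/agent.py` hunk above: providers marked `"none"` (Ollama and OpenAI-compatible endpoints) no longer raise when no key is set. A small sketch of that resolution logic, mirroring the diff; `resolve_model_and_key` is an illustrative helper, not part of the package API:

```python
import os

from agent.core.provider_config import DEFAULT_MODELS, ENV_VARS
from agent.core.types import LLMProvider


def resolve_model_and_key(provider: LLMProvider, api_key: str | None = None) -> tuple[str, str]:
    """Illustrative helper mirroring the key check in ComputerAgent.__init__."""
    model_name = DEFAULT_MODELS.get(provider, "")
    env_var = ENV_VARS[provider]
    key = api_key or os.environ.get(env_var, "")
    # Providers whose env var is "none" (ollama, oaicompat) skip the key requirement.
    if not key and str(provider) not in ("ollama", "oaicompat") and env_var != "none":
        raise ValueError(f"No API key provided for {provider}")
    return model_name, key


# e.g. resolve_model_and_key(LLMProvider.OLLAMA) -> ("gemma3:4b-it-q4_K_M", "")
```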

{cua_agent-0.1.22 → cua_agent-0.1.24}/agent/core/types.py

@@ -1,7 +1,8 @@
  """Core type definitions."""
  
  from typing import Any, Dict, List, Optional, TypedDict, Union
- from enum import Enum, auto
+ from enum import Enum, StrEnum, auto
+ from dataclasses import dataclass
  
  
  class AgentLoop(Enum):
@@ -14,6 +15,45 @@ class AgentLoop(Enum):
  # Add more loop types as needed
  
  
+ class LLMProvider(StrEnum):
+ """Supported LLM providers."""
+ 
+ ANTHROPIC = "anthropic"
+ OPENAI = "openai"
+ OLLAMA = "ollama"
+ OAICOMPAT = "oaicompat"
+ 
+ 
+ @dataclass
+ class LLM:
+ """Configuration for LLM model and provider."""
+ 
+ provider: LLMProvider
+ name: Optional[str] = None
+ provider_base_url: Optional[str] = None
+ 
+ def __post_init__(self):
+ """Set default model name if not provided."""
+ if self.name is None:
+ from .provider_config import DEFAULT_MODELS
+ 
+ self.name = DEFAULT_MODELS.get(self.provider)
+ 
+ # Set default provider URL if none provided
+ if self.provider_base_url is None and self.provider == LLMProvider.OAICOMPAT:
+ # Default for vLLM
+ self.provider_base_url = "http://localhost:8000/v1"
+ # Common alternatives:
+ # - LM Studio: "http://localhost:1234/v1"
+ # - LocalAI: "http://localhost:8080/v1"
+ # - Ollama with OpenAI compatible API: "http://localhost:11434/v1"
+ 
+ 
+ # For backward compatibility
+ LLMModel = LLM
+ Model = LLM
+ 
+ 
  class AgentResponse(TypedDict, total=False):
  """Agent response format."""

{cua_agent-0.1.22 → cua_agent-0.1.24}/agent/providers/omni/__init__.py

@@ -1,6 +1,6 @@
  """Omni provider implementation."""
  
- from .types import LLMProvider
+ from ...core.types import LLMProvider
  from .image_utils import (
  decode_base64_image,
  )

cua_agent-0.1.24/agent/providers/omni/clients/oaicompat.py

@@ -0,0 +1,177 @@
+ """OpenAI-compatible client implementation."""
+ 
+ import os
+ import logging
+ from typing import Dict, List, Optional, Any
+ import aiohttp
+ import re
+ from .base import BaseOmniClient
+ 
+ logger = logging.getLogger(__name__)
+ 
+ 
+ # OpenAI-compatible client for the OmniLoop
+ class OAICompatClient(BaseOmniClient):
+ """OpenAI-compatible API client implementation.
+ 
+ This client can be used with any service that implements the OpenAI API protocol, including:
+ - vLLM
+ - LM Studio
+ - LocalAI
+ - Ollama (with OpenAI compatibility)
+ - Text Generation WebUI
+ - Any other service with OpenAI API compatibility
+ """
+ 
+ def __init__(
+ self,
+ api_key: Optional[str] = None,
+ model: str = "Qwen2.5-VL-7B-Instruct",
+ provider_base_url: Optional[str] = "http://localhost:8000/v1",
+ max_tokens: int = 4096,
+ temperature: float = 0.0,
+ ):
+ """Initialize the OpenAI-compatible client.
+ 
+ Args:
+ api_key: Not used for local endpoints, usually set to "EMPTY"
+ model: Model name to use
+ provider_base_url: API base URL. Typically in the format "http://localhost:PORT/v1"
+ Examples:
+ - vLLM: "http://localhost:8000/v1"
+ - LM Studio: "http://localhost:1234/v1"
+ - LocalAI: "http://localhost:8080/v1"
+ - Ollama: "http://localhost:11434/v1"
+ max_tokens: Maximum tokens to generate
+ temperature: Generation temperature
+ """
+ super().__init__(api_key="EMPTY", model=model)
+ self.api_key = "EMPTY"  # Local endpoints typically don't require an API key
+ self.model = model
+ self.provider_base_url = (
+ provider_base_url or "http://localhost:8000/v1"
+ )  # Use default if None
+ self.max_tokens = max_tokens
+ self.temperature = temperature
+ 
+ def _extract_base64_image(self, text: str) -> Optional[str]:
+ """Extract base64 image data from an HTML img tag."""
+ pattern = r'data:image/[^;]+;base64,([^"]+)'
+ match = re.search(pattern, text)
+ return match.group(1) if match else None
+ 
+ def _get_loggable_messages(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+ """Create a loggable version of messages with image data truncated."""
+ loggable_messages = []
+ for msg in messages:
+ if isinstance(msg.get("content"), list):
+ new_content = []
+ for content in msg["content"]:
+ if content.get("type") == "image":
+ new_content.append(
+ {"type": "image", "image_url": {"url": "[BASE64_IMAGE_DATA]"}}
+ )
+ else:
+ new_content.append(content)
+ loggable_messages.append({"role": msg["role"], "content": new_content})
+ else:
+ loggable_messages.append(msg)
+ return loggable_messages
+ 
+ async def run_interleaved(
+ self, messages: List[Dict[str, Any]], system: str, max_tokens: Optional[int] = None
+ ) -> Dict[str, Any]:
+ """Run interleaved chat completion.
+ 
+ Args:
+ messages: List of message dicts
+ system: System prompt
+ max_tokens: Optional max tokens override
+ 
+ Returns:
+ Response dict
+ """
+ headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_key}"}
+ 
+ final_messages = [{"role": "system", "content": system}]
+ 
+ # Process messages
+ for item in messages:
+ if isinstance(item, dict):
+ if isinstance(item["content"], list):
+ # Content is already in the correct format
+ final_messages.append(item)
+ else:
+ # Single string content, check for image
+ base64_img = self._extract_base64_image(item["content"])
+ if base64_img:
+ message = {
+ "role": item["role"],
+ "content": [
+ {
+ "type": "image_url",
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_img}"},
+ }
+ ],
+ }
+ else:
+ message = {
+ "role": item["role"],
+ "content": [{"type": "text", "text": item["content"]}],
+ }
+ final_messages.append(message)
+ else:
+ # String content, check for image
+ base64_img = self._extract_base64_image(item)
+ if base64_img:
+ message = {
+ "role": "user",
+ "content": [
+ {
+ "type": "image_url",
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_img}"},
+ }
+ ],
+ }
+ else:
+ message = {"role": "user", "content": [{"type": "text", "text": item}]}
+ final_messages.append(message)
+ 
+ payload = {"model": self.model, "messages": final_messages, "temperature": self.temperature}
+ payload["max_tokens"] = max_tokens or self.max_tokens
+ 
+ try:
+ async with aiohttp.ClientSession() as session:
+ # Use default base URL if none provided
+ base_url = self.provider_base_url or "http://localhost:8000/v1"
+ 
+ # Check if the base URL already includes the chat/completions endpoint
+ endpoint_url = base_url
+ if not endpoint_url.endswith("/chat/completions"):
+ # If the URL ends with /v1, append /chat/completions
+ if endpoint_url.endswith("/v1"):
+ endpoint_url = f"{endpoint_url}/chat/completions"
+ # If the URL doesn't end with /v1, make sure it has a proper structure
+ elif not endpoint_url.endswith("/"):
+ endpoint_url = f"{endpoint_url}/chat/completions"
+ else:
+ endpoint_url = f"{endpoint_url}chat/completions"
+ 
+ # Log the endpoint URL for debugging
+ logger.debug(f"Using endpoint URL: {endpoint_url}")
+ 
+ async with session.post(endpoint_url, headers=headers, json=payload) as response:
+ response_json = await response.json()
+ 
+ if response.status != 200:
+ error_msg = response_json.get("error", {}).get(
+ "message", str(response_json)
+ )
+ logger.error(f"Error in API call: {error_msg}")
+ raise Exception(f"API error: {error_msg}")
+ 
+ return response_json
+ 
+ except Exception as e:
+ logger.error(f"Error in API call: {str(e)}")
+ raise
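
Based on the constructor and `run_interleaved` signature above, a standalone use of the client might look like the sketch below; the server URL and model name are assumptions, and the response is read in the OpenAI chat-completions shape that the loop changes further down also expect:

```python
import asyncio

from agent.providers.omni.clients.oaicompat import OAICompatClient


async def main():
    client = OAICompatClient(
        model="Qwen2.5-VL-7B-Instruct",
        provider_base_url="http://localhost:8000/v1",  # e.g. a local vLLM server
    )
    # Messages may be dicts with string content; strings containing a
    # data:image/...;base64 payload are converted to image_url parts.
    response = await client.run_interleaved(
        messages=[{"role": "user", "content": "Describe what is on the screen."}],
        system="You are a UI grounding assistant.",
        max_tokens=512,
    )
    print(response["choices"][0]["message"]["content"])


asyncio.run(main())
```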

{cua_agent-0.1.22 → cua_agent-0.1.24}/agent/providers/omni/loop.py

@@ -16,10 +16,11 @@ from ...core.messages import StandardMessageManager, ImageRetentionConfig
  from .utils import to_openai_agent_response_format
  from ...core.types import AgentResponse
  from computer import Computer
- from .types import LLMProvider
+ from ...core.types import LLMProvider
  from .clients.openai import OpenAIClient
  from .clients.anthropic import AnthropicClient
  from .clients.ollama import OllamaClient
+ from .clients.oaicompat import OAICompatClient
  from .prompts import SYSTEM_PROMPT
  from .api_handler import OmniAPIHandler
  from .tools.manager import ToolManager
@@ -60,6 +61,7 @@ class OmniLoop(BaseLoop):
  max_retries: int = 3,
  retry_delay: float = 1.0,
  save_trajectory: bool = True,
+ provider_base_url: Optional[str] = None,
  **kwargs,
  ):
  """Initialize the loop.
@@ -75,10 +77,12 @@
  max_retries: Maximum number of retries for API calls
  retry_delay: Delay between retries in seconds
  save_trajectory: Whether to save trajectory data
+ provider_base_url: Base URL for the API provider (used for OAICOMPAT)
  """
  # Set parser and provider before initializing base class
  self.parser = parser
  self.provider = provider
+ self.provider_base_url = provider_base_url
  
  # Initialize message manager with image retention config
  self.message_manager = StandardMessageManager(
@@ -141,6 +145,12 @@
  api_key=self.api_key,
  model=self.model,
  )
+ elif self.provider == LLMProvider.OAICOMPAT:
+ self.client = OAICompatClient(
+ api_key="EMPTY",  # Local endpoints typically don't require an API key
+ model=self.model,
+ provider_base_url=self.provider_base_url,
+ )
  else:
  raise ValueError(f"Unsupported provider: {self.provider}")
  
@@ -171,6 +181,12 @@
  api_key=self.api_key,
  model=self.model,
  )
+ elif self.provider == LLMProvider.OAICOMPAT:
+ self.client = OAICompatClient(
+ api_key="EMPTY",  # Local endpoints typically don't require an API key
+ model=self.model,
+ provider_base_url=self.provider_base_url,
+ )
  else:
  raise ValueError(f"Unsupported provider: {self.provider}")
  
@@ -388,6 +404,14 @@
  except (KeyError, TypeError, IndexError) as e:
  logger.error(f"Invalid response format: {str(e)}")
  return True, action_screenshot_saved
+ elif self.provider == LLMProvider.OAICOMPAT:
+ try:
+ # OpenAI-compatible response format
+ raw_text = response["choices"][0]["message"]["content"]
+ standard_content = [{"type": "text", "text": raw_text}]
+ except (KeyError, TypeError, IndexError) as e:
+ logger.error(f"Invalid response format: {str(e)}")
+ return True, action_screenshot_saved
  else:
  # Assume OpenAI or compatible format
  try:

{cua_agent-0.1.22 → cua_agent-0.1.24}/agent/providers/omni/tools/manager.py

@@ -7,7 +7,7 @@ from ....core.tools import BaseToolManager, ToolResult
  from ....core.tools.collection import ToolCollection
  from .computer import ComputerTool
  from .bash import BashTool
- from ..types import LLMProvider
+ from ....core.types import LLMProvider
  
  
  class ToolManager(BaseToolManager):

cua_agent-0.1.24/agent/ui/__init__.py

@@ -0,0 +1 @@
+ """UI modules for the Computer-Use Agent."""

cua_agent-0.1.24/agent/ui/gradio/__init__.py

@@ -0,0 +1,21 @@
+ """Gradio UI for Computer-Use Agent."""
+ 
+ import gradio as gr
+ from typing import Optional
+ 
+ from .app import create_gradio_ui
+ 
+ 
+ def registry(name: str = "cua:gpt-4o") -> gr.Blocks:
+ """Create and register a Gradio UI for the Computer-Use Agent.
+ 
+ Args:
+ name: The name to use for the Gradio app, in format 'provider:model'
+ 
+ Returns:
+ A Gradio Blocks application
+ """
+ provider, model = name.split(":", 1) if ":" in name else ("openai", name)
+ 
+ # Create and return the Gradio UI
+ return create_gradio_ui(provider_name=provider, model_name=model)
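
Given the split logic above, `registry()` accepts either a bare model name (the provider then defaults to `openai`) or a `provider:model` pair. A quick sketch; the model ids are examples:

```python
from agent.ui.gradio import registry

# "anthropic:claude-3-7-sonnet-20250219" -> provider="anthropic", model="claude-3-7-sonnet-20250219"
demo = registry(name="anthropic:claude-3-7-sonnet-20250219")

# A bare name falls back to the "openai" provider.
# demo = registry(name="gpt-4o")

demo.launch()
```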