cua-agent 0.1.29__tar.gz → 0.1.31__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of cua-agent might be problematic.

Files changed (84)
  1. {cua_agent-0.1.29 → cua_agent-0.1.31}/PKG-INFO +20 -19
  2. {cua_agent-0.1.29 → cua_agent-0.1.31}/README.md +17 -18
  3. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/core/factory.py +19 -0
  4. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/core/types.py +1 -0
  5. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/omni/clients/oaicompat.py +12 -2
  6. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/openai/tools/computer.py +3 -7
  7. cua_agent-0.1.31/agent/providers/uitars/__init__.py +1 -0
  8. cua_agent-0.1.31/agent/providers/uitars/clients/base.py +35 -0
  9. cua_agent-0.1.31/agent/providers/uitars/clients/oaicompat.py +216 -0
  10. cua_agent-0.1.31/agent/providers/uitars/loop.py +598 -0
  11. cua_agent-0.1.31/agent/providers/uitars/prompts.py +63 -0
  12. cua_agent-0.1.31/agent/providers/uitars/tools/__init__.py +1 -0
  13. cua_agent-0.1.31/agent/providers/uitars/tools/computer.py +283 -0
  14. cua_agent-0.1.31/agent/providers/uitars/tools/manager.py +60 -0
  15. cua_agent-0.1.31/agent/providers/uitars/utils.py +153 -0
  16. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/ui/gradio/app.py +12 -2
  17. {cua_agent-0.1.29 → cua_agent-0.1.31}/pyproject.toml +6 -3
  18. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/__init__.py +0 -0
  19. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/core/__init__.py +0 -0
  20. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/core/agent.py +0 -0
  21. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/core/base.py +0 -0
  22. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/core/callbacks.py +0 -0
  23. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/core/experiment.py +0 -0
  24. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/core/messages.py +0 -0
  25. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/core/provider_config.py +0 -0
  26. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/core/telemetry.py +0 -0
  27. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/core/tools/__init__.py +0 -0
  28. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/core/tools/base.py +0 -0
  29. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/core/tools/bash.py +0 -0
  30. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/core/tools/collection.py +0 -0
  31. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/core/tools/computer.py +0 -0
  32. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/core/tools/edit.py +0 -0
  33. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/core/tools/manager.py +0 -0
  34. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/core/tools.py +0 -0
  35. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/core/visualization.py +0 -0
  36. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/__init__.py +0 -0
  37. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/anthropic/__init__.py +0 -0
  38. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/anthropic/api/client.py +0 -0
  39. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/anthropic/api/logging.py +0 -0
  40. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/anthropic/api_handler.py +0 -0
  41. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/anthropic/callbacks/__init__.py +0 -0
  42. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/anthropic/callbacks/manager.py +0 -0
  43. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/anthropic/loop.py +0 -0
  44. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/anthropic/prompts.py +0 -0
  45. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/anthropic/response_handler.py +0 -0
  46. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/anthropic/tools/__init__.py +0 -0
  47. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/anthropic/tools/base.py +0 -0
  48. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/anthropic/tools/bash.py +0 -0
  49. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/anthropic/tools/collection.py +0 -0
  50. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/anthropic/tools/computer.py +0 -0
  51. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/anthropic/tools/edit.py +0 -0
  52. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/anthropic/tools/manager.py +0 -0
  53. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/anthropic/tools/run.py +0 -0
  54. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/anthropic/types.py +0 -0
  55. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/anthropic/utils.py +0 -0
  56. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/omni/__init__.py +0 -0
  57. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/omni/api_handler.py +0 -0
  58. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/omni/clients/anthropic.py +0 -0
  59. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/omni/clients/base.py +0 -0
  60. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/omni/clients/ollama.py +0 -0
  61. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/omni/clients/openai.py +0 -0
  62. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/omni/clients/utils.py +0 -0
  63. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/omni/image_utils.py +0 -0
  64. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/omni/loop.py +0 -0
  65. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/omni/parser.py +0 -0
  66. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/omni/prompts.py +0 -0
  67. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/omni/tools/__init__.py +0 -0
  68. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/omni/tools/base.py +0 -0
  69. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/omni/tools/bash.py +0 -0
  70. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/omni/tools/computer.py +0 -0
  71. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/omni/tools/manager.py +0 -0
  72. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/omni/utils.py +0 -0
  73. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/openai/__init__.py +0 -0
  74. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/openai/api_handler.py +0 -0
  75. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/openai/loop.py +0 -0
  76. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/openai/response_handler.py +0 -0
  77. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/openai/tools/__init__.py +0 -0
  78. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/openai/tools/base.py +0 -0
  79. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/openai/tools/manager.py +0 -0
  80. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/openai/types.py +0 -0
  81. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/openai/utils.py +0 -0
  82. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/telemetry.py +0 -0
  83. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/ui/__init__.py +0 -0
  84. {cua_agent-0.1.29 → cua_agent-0.1.31}/agent/ui/gradio/__init__.py +0 -0
{cua_agent-0.1.29 → cua_agent-0.1.31}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: cua-agent
- Version: 0.1.29
+ Version: 0.1.31
  Summary: CUA (Computer Use) Agent for AI-driven computer interaction
  Author-Email: TryCua <gh@trycua.com>
  Requires-Python: >=3.10
@@ -21,6 +21,8 @@ Requires-Dist: boto3<2.0.0,>=1.35.81; extra == "anthropic"
  Provides-Extra: openai
  Requires-Dist: openai<2.0.0,>=1.14.0; extra == "openai"
  Requires-Dist: httpx<0.29.0,>=0.27.0; extra == "openai"
+ Provides-Extra: uitars
+ Requires-Dist: httpx<0.29.0,>=0.27.0; extra == "uitars"
  Provides-Extra: ui
  Requires-Dist: gradio<6.0.0,>=5.23.3; extra == "ui"
  Requires-Dist: python-dotenv<2.0.0,>=1.0.1; extra == "ui"
@@ -99,6 +101,7 @@ pip install "cua-agent[all]"
  # or install specific loop providers
  pip install "cua-agent[openai]" # OpenAI Cua Loop
  pip install "cua-agent[anthropic]" # Anthropic Cua Loop
+ pip install "cua-agent[uitars]" # UI-Tars support
  pip install "cua-agent[omni]" # Cua Loop based on OmniParser (includes Ollama for local models)
  pip install "cua-agent[ui]" # Gradio UI for the agent
  ```
@@ -118,6 +121,9 @@ async with Computer() as macos_computer:
  # or
  # loop=AgentLoop.OMNI,
  # model=LLM(provider=LLMProvider.OLLAMA, model="gemma3")
+ # or
+ # loop=AgentLoop.UITARS,
+ # model=LLM(provider=LLMProvider.OAICOMPAT, model="tgi", provider_base_url="https://**************.us-east-1.aws.endpoints.huggingface.cloud/v1")
  )

  tasks = [
@@ -143,7 +149,13 @@ Refer to these notebooks for step-by-step guides on how to use the Computer-Use

  ## Using the Gradio UI

- The agent includes a Gradio-based user interface for easy interaction. To use it:
+ The agent includes a Gradio-based user interface for easier interaction.
+
+ <div align="center">
+ <img src="../../img/agent_gradio_ui.png"/>
+ </div>
+
+ To use it:

  ```bash
  # Install with Gradio support
@@ -192,6 +204,10 @@ The Gradio UI provides:
  - Configuration of agent parameters
  - Chat interface for interacting with the agent

+ ### Using UI-TARS
+
+ You can use UI-TARS by first following the [deployment guide](https://github.com/bytedance/UI-TARS/blob/main/README_deploy.md). This will give you a provider URL like this: `https://**************.us-east-1.aws.endpoints.huggingface.cloud/v1` which you can use in the gradio UI.
+
  ## Agent Loops

  The `cua-agent` package provides three agent loops variations, based on different CUA models providers and techniques:
@@ -200,6 +216,7 @@ The `cua-agent` package provides three agent loops variations, based on differen
  |:-----------|:-----------------|:------------|:-------------|
  | `AgentLoop.OPENAI` | • `computer_use_preview` | Use OpenAI Operator CUA model | Not Required |
  | `AgentLoop.ANTHROPIC` | • `claude-3-5-sonnet-20240620`<br>• `claude-3-7-sonnet-20250219` | Use Anthropic Computer-Use | Not Required |
+ | `AgentLoop.UITARS` | • `ByteDance-Seed/UI-TARS-1.5-7B` | Uses ByteDance's UI-TARS 1.5 model | Not Required |
  | `AgentLoop.OMNI` | • `claude-3-5-sonnet-20240620`<br>• `claude-3-7-sonnet-20250219`<br>• `gpt-4.5-preview`<br>• `gpt-4o`<br>• `gpt-4`<br>• `phi4`<br>• `phi4-mini`<br>• `gemma3`<br>• `...`<br>• `Any Ollama or OpenAI-compatible model` | Use OmniParser for element pixel-detection (SoM) and any VLMs for UI Grounding and Reasoning | OmniParser |

  ## AgentResponse
@@ -241,25 +258,9 @@ async for result in agent.run(task):
  print(output)
  ```

- ### Gradio UI
-
- You can also interact with the agent using a Gradio interface.
-
- ```python
- # Ensure environment variables (e.g., API keys) are loaded
- # You might need a helper function like load_dotenv_files() if using .env
- # from utils import load_dotenv_files
- # load_dotenv_files()
-
- from agent.ui.gradio.app import create_gradio_ui
-
- app = create_gradio_ui()
- app.launch(share=False)
- ```
-
  **Note on Settings Persistence:**

  * The Gradio UI automatically saves your configuration (Agent Loop, Model Choice, Custom Base URL, Save Trajectory state, Recent Images count) to a file named `.gradio_settings.json` in the project's root directory when you successfully run a task.
  * This allows your preferences to persist between sessions.
  * API keys entered into the custom provider field are **not** saved in this file for security reasons. Manage API keys using environment variables (e.g., `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`) or a `.env` file.
- * It's recommended to add `.gradio_settings.json` to your `.gitignore` file.
+ * It's recommended to add `.gradio_settings.json` to your `.gitignore` file.

{cua_agent-0.1.29 → cua_agent-0.1.31}/README.md

@@ -31,6 +31,7 @@ pip install "cua-agent[all]"
  # or install specific loop providers
  pip install "cua-agent[openai]" # OpenAI Cua Loop
  pip install "cua-agent[anthropic]" # Anthropic Cua Loop
+ pip install "cua-agent[uitars]" # UI-Tars support
  pip install "cua-agent[omni]" # Cua Loop based on OmniParser (includes Ollama for local models)
  pip install "cua-agent[ui]" # Gradio UI for the agent
  ```
@@ -50,6 +51,9 @@ async with Computer() as macos_computer:
  # or
  # loop=AgentLoop.OMNI,
  # model=LLM(provider=LLMProvider.OLLAMA, model="gemma3")
+ # or
+ # loop=AgentLoop.UITARS,
+ # model=LLM(provider=LLMProvider.OAICOMPAT, model="tgi", provider_base_url="https://**************.us-east-1.aws.endpoints.huggingface.cloud/v1")
  )

  tasks = [
@@ -75,7 +79,13 @@ Refer to these notebooks for step-by-step guides on how to use the Computer-Use

  ## Using the Gradio UI

- The agent includes a Gradio-based user interface for easy interaction. To use it:
+ The agent includes a Gradio-based user interface for easier interaction.
+
+ <div align="center">
+ <img src="../../img/agent_gradio_ui.png"/>
+ </div>
+
+ To use it:

  ```bash
  # Install with Gradio support
@@ -124,6 +134,10 @@ The Gradio UI provides:
  - Configuration of agent parameters
  - Chat interface for interacting with the agent

+ ### Using UI-TARS
+
+ You can use UI-TARS by first following the [deployment guide](https://github.com/bytedance/UI-TARS/blob/main/README_deploy.md). This will give you a provider URL like this: `https://**************.us-east-1.aws.endpoints.huggingface.cloud/v1` which you can use in the gradio UI.
+
  ## Agent Loops

  The `cua-agent` package provides three agent loops variations, based on different CUA models providers and techniques:
@@ -132,6 +146,7 @@ The `cua-agent` package provides three agent loops variations, based on differen
  |:-----------|:-----------------|:------------|:-------------|
  | `AgentLoop.OPENAI` | • `computer_use_preview` | Use OpenAI Operator CUA model | Not Required |
  | `AgentLoop.ANTHROPIC` | • `claude-3-5-sonnet-20240620`<br>• `claude-3-7-sonnet-20250219` | Use Anthropic Computer-Use | Not Required |
+ | `AgentLoop.UITARS` | • `ByteDance-Seed/UI-TARS-1.5-7B` | Uses ByteDance's UI-TARS 1.5 model | Not Required |
  | `AgentLoop.OMNI` | • `claude-3-5-sonnet-20240620`<br>• `claude-3-7-sonnet-20250219`<br>• `gpt-4.5-preview`<br>• `gpt-4o`<br>• `gpt-4`<br>• `phi4`<br>• `phi4-mini`<br>• `gemma3`<br>• `...`<br>• `Any Ollama or OpenAI-compatible model` | Use OmniParser for element pixel-detection (SoM) and any VLMs for UI Grounding and Reasoning | OmniParser |

  ## AgentResponse
@@ -173,25 +188,9 @@ async for result in agent.run(task):
  print(output)
  ```

- ### Gradio UI
-
- You can also interact with the agent using a Gradio interface.
-
- ```python
- # Ensure environment variables (e.g., API keys) are loaded
- # You might need a helper function like load_dotenv_files() if using .env
- # from utils import load_dotenv_files
- # load_dotenv_files()
-
- from agent.ui.gradio.app import create_gradio_ui
-
- app = create_gradio_ui()
- app.launch(share=False)
- ```
-
  **Note on Settings Persistence:**

  * The Gradio UI automatically saves your configuration (Agent Loop, Model Choice, Custom Base URL, Save Trajectory state, Recent Images count) to a file named `.gradio_settings.json` in the project's root directory when you successfully run a task.
  * This allows your preferences to persist between sessions.
  * API keys entered into the custom provider field are **not** saved in this file for security reasons. Manage API keys using environment variables (e.g., `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`) or a `.env` file.
- * It's recommended to add `.gradio_settings.json` to your `.gitignore` file.
+ * It's recommended to add `.gradio_settings.json` to your `.gitignore` file.

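Taken together, the README additions above introduce a fourth way to drive the agent: point an OpenAI-compatible `LLM` at a UI-TARS deployment and select `AgentLoop.UITARS`. The sketch below assembles those pieces into a runnable form; it is an illustration based on the commented-out snippet in the README diff, not code taken from the package, and the `ComputerAgent`/`Computer` import paths, the task string, and the endpoint URL placeholder are assumptions.

```python
# Hedged sketch: wiring up the new UI-TARS loop shown in the README diff above.
# Import paths, the endpoint URL, and the task are illustrative assumptions.
import asyncio

from computer import Computer                                 # assumed import path
from agent import AgentLoop, ComputerAgent, LLM, LLMProvider  # assumed import path


async def main() -> None:
    async with Computer() as macos_computer:
        agent = ComputerAgent(
            computer=macos_computer,
            loop=AgentLoop.UITARS,
            model=LLM(
                provider=LLMProvider.OAICOMPAT,
                model="tgi",
                provider_base_url="https://<your-endpoint>.endpoints.huggingface.cloud/v1",
            ),
        )
        # agent.run() is an async generator, as in the existing README examples.
        async for result in agent.run("Open a browser and search for 'trycua'"):
            print(result)


if __name__ == "__main__":
    asyncio.run(main())
```
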
{cua_agent-0.1.29 → cua_agent-0.1.31}/agent/core/factory.py

@@ -98,5 +98,24 @@ class LoopFactory:
  parser=OmniParser(),
  provider_base_url=provider_base_url,
  )
+ elif loop_type == AgentLoop.UITARS:
+ # Lazy import UITARSLoop only when needed
+ try:
+ from ..providers.uitars.loop import UITARSLoop
+ except ImportError:
+ raise ImportError(
+ "The 'uitars' provider is not installed. "
+ "Install it with 'pip install cua-agent[all]'"
+ )
+
+ return UITARSLoop(
+ api_key=api_key,
+ model=model_name,
+ computer=computer,
+ save_trajectory=save_trajectory,
+ base_dir=trajectory_dir,
+ only_n_most_recent_images=only_n_most_recent_images,
+ provider_base_url=provider_base_url,
+ )
  else:
  raise ValueError(f"Unsupported loop type: {loop_type}")

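The new factory branch mirrors the pattern used for the other loops: the provider module is imported only when that loop type is actually requested, so a missing optional dependency surfaces as an actionable `ImportError` at loop-construction time rather than at package import time. A minimal, generic illustration of that pattern (not the package's actual factory code, and with made-up names) is:

```python
# Generic lazy-import pattern, as used in the factory diff above.
# "optional_provider" and "build_loop" are illustrative names only.
from typing import Any


def build_loop(loop_type: str, **kwargs: Any) -> Any:
    if loop_type == "uitars":
        try:
            # Deferred import: only needed if this loop type is selected.
            from optional_provider.loop import UITARSLoop  # hypothetical module
        except ImportError as exc:
            raise ImportError(
                "The 'uitars' provider is not installed. "
                'Install it with: pip install "cua-agent[all]"'
            ) from exc
        return UITARSLoop(**kwargs)
    raise ValueError(f"Unsupported loop type: {loop_type}")
```
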
{cua_agent-0.1.29 → cua_agent-0.1.31}/agent/core/types.py

@@ -12,6 +12,7 @@ class AgentLoop(Enum):
  OMNI = auto() # OmniLoop implementation
  OPENAI = auto() # OpenAI implementation
  OLLAMA = auto() # OLLAMA implementation
+ UITARS = auto() # UI-TARS implementation
  # Add more loop types as needed

{cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/omni/clients/oaicompat.py

@@ -93,7 +93,14 @@ class OAICompatClient(BaseOmniClient):
  """
  headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_key}"}

- final_messages = [{"role": "system", "content": system}]
+ final_messages = [
+ {
+ "role": "system",
+ "content": [
+ { "type": "text", "text": system }
+ ]
+ }
+ ]

  # Process messages
  for item in messages:
@@ -117,7 +124,10 @@ class OAICompatClient(BaseOmniClient):
  else:
  message = {
  "role": item["role"],
- "content": [{"type": "text", "text": item["content"]}],
+ "content": [{
+ "type": "text",
+ "text": item["content"]
+ }],
  }
  final_messages.append(message)
  else:

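The first hunk changes the request payload shape: the system prompt, previously sent as a bare string, is now wrapped in a list of typed content blocks, matching the block format this client already uses for text and image user messages (the second hunk only reflows that existing structure across lines). A small before/after illustration with a placeholder prompt:

```python
# Illustration of the message-shape change above; "system" is a placeholder.
system = "You are a helpful computer-use agent."

# 0.1.29: system prompt sent as a plain string.
old_message = {"role": "system", "content": system}

# 0.1.31: system prompt sent as a list of typed content blocks, the same
# structure used for the text/image_url blocks elsewhere in this client.
new_message = {
    "role": "system",
    "content": [{"type": "text", "text": system}],
}
```
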
{cua_agent-0.1.29 → cua_agent-0.1.31}/agent/providers/openai/tools/computer.py

@@ -162,8 +162,8 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
  y = kwargs.get("y")
  if x is None or y is None:
  raise ToolError("x and y coordinates are required for scroll action")
- scroll_x = kwargs.get("scroll_x", 0) // 20
- scroll_y = kwargs.get("scroll_y", 0) // 20
+ scroll_x = kwargs.get("scroll_x", 0) // 50
+ scroll_y = kwargs.get("scroll_y", 0) // 50
  return await self.handle_scroll(x, y, scroll_x, scroll_y)
  elif type == "screenshot":
  return await self.screenshot()
@@ -240,11 +240,7 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):

  if len(mapped_keys) > 1:
  # For key combinations (like Ctrl+C)
- for k in mapped_keys:
- await self.computer.interface.press_key(k)
- await asyncio.sleep(0.1)
- for k in reversed(mapped_keys):
- await self.computer.interface.press_key(k)
+ await self.computer.interface.hotkey(*mapped_keys)
  else:
  # Single key press
  await self.computer.interface.press_key(mapped_keys[0])

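Two behavioural tweaks are visible in this file: model-requested scroll deltas are now divided by 50 instead of 20 (so a given pixel delta produces fewer scroll ticks), and multi-key shortcuts are sent as a single chord via `hotkey()` instead of pressing each key individually in sequence. A small sketch of the new key-combination path, with a stand-in `interface` argument for the real computer interface:

```python
# Sketch of the updated keypress handling; "interface" stands in for the
# computer interface object used in the diff above.
from typing import List


async def send_keys(interface, mapped_keys: List[str]) -> None:
    if len(mapped_keys) > 1:
        # 0.1.31: the whole combination (e.g. Ctrl+C) is sent as one chord.
        await interface.hotkey(*mapped_keys)
    else:
        # Single key press path is unchanged.
        await interface.press_key(mapped_keys[0])
```
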
cua_agent-0.1.31/agent/providers/uitars/__init__.py (new file)

@@ -0,0 +1 @@
+ """UI-TARS Agent provider package."""

cua_agent-0.1.31/agent/providers/uitars/clients/base.py (new file)

@@ -0,0 +1,35 @@
+ """Base client implementation for Omni providers."""
+
+ import logging
+ from typing import Dict, List, Optional, Any, Tuple
+
+ logger = logging.getLogger(__name__)
+
+
+ class BaseUITarsClient:
+     """Base class for provider-specific clients."""
+
+     def __init__(self, api_key: Optional[str] = None, model: Optional[str] = None):
+         """Initialize base client.
+
+         Args:
+             api_key: Optional API key
+             model: Optional model name
+         """
+         self.api_key = api_key
+         self.model = model
+
+     async def run_interleaved(
+         self, messages: List[Dict[str, Any]], system: str, max_tokens: Optional[int] = None
+     ) -> Dict[str, Any]:
+         """Run interleaved chat completion.
+
+         Args:
+             messages: List of message dicts
+             system: System prompt
+             max_tokens: Optional max tokens override
+
+         Returns:
+             Response dict
+         """
+         raise NotImplementedError

cua_agent-0.1.31/agent/providers/uitars/clients/oaicompat.py (new file)

@@ -0,0 +1,216 @@
+ """OpenAI-compatible client implementation."""
+
+ import os
+ import logging
+ from typing import Dict, List, Optional, Any
+ import aiohttp
+ import re
+ from .base import BaseUITarsClient
+
+ logger = logging.getLogger(__name__)
+
+
+ # OpenAI-compatible client for the UI_Tars
+ class OAICompatClient(BaseUITarsClient):
+     """OpenAI-compatible API client implementation.
+
+     This client can be used with any service that implements the OpenAI API protocol, including:
+     - Huggingface Text Generation Interface endpoints
+     - vLLM
+     - LM Studio
+     - LocalAI
+     - Ollama (with OpenAI compatibility)
+     - Text Generation WebUI
+     - Any other service with OpenAI API compatibility
+     """
+
+     def __init__(
+         self,
+         api_key: Optional[str] = None,
+         model: str = "Qwen2.5-VL-7B-Instruct",
+         provider_base_url: Optional[str] = "http://localhost:8000/v1",
+         max_tokens: int = 4096,
+         temperature: float = 0.0,
+     ):
+         """Initialize the OpenAI-compatible client.
+
+         Args:
+             api_key: Not used for local endpoints, usually set to "EMPTY"
+             model: Model name to use
+             provider_base_url: API base URL. Typically in the format "http://localhost:PORT/v1"
+                 Examples:
+                 - vLLM: "http://localhost:8000/v1"
+                 - LM Studio: "http://localhost:1234/v1"
+                 - LocalAI: "http://localhost:8080/v1"
+                 - Ollama: "http://localhost:11434/v1"
+             max_tokens: Maximum tokens to generate
+             temperature: Generation temperature
+         """
+         super().__init__(api_key=api_key or "EMPTY", model=model)
+         self.api_key = api_key or "EMPTY" # Local endpoints typically don't require an API key
+         self.model = model
+         self.provider_base_url = (
+             provider_base_url or "http://localhost:8000/v1"
+         ) # Use default if None
+         self.max_tokens = max_tokens
+         self.temperature = temperature
+
+     def _extract_base64_image(self, text: str) -> Optional[str]:
+         """Extract base64 image data from an HTML img tag."""
+         pattern = r'data:image/[^;]+;base64,([^"]+)'
+         match = re.search(pattern, text)
+         return match.group(1) if match else None
+
+     def _get_loggable_messages(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+         """Create a loggable version of messages with image data truncated."""
+         loggable_messages = []
+         for msg in messages:
+             if isinstance(msg.get("content"), list):
+                 new_content = []
+                 for content in msg["content"]:
+                     if content.get("type") == "image":
+                         new_content.append(
+                             {"type": "image", "image_url": {"url": "[BASE64_IMAGE_DATA]"}}
+                         )
+                     else:
+                         new_content.append(content)
+                 loggable_messages.append({"role": msg["role"], "content": new_content})
+             else:
+                 loggable_messages.append(msg)
+         return loggable_messages
+
+     async def run_interleaved(
+         self, messages: List[Dict[str, Any]], system: str, max_tokens: Optional[int] = None
+     ) -> Dict[str, Any]:
+         """Run interleaved chat completion.
+
+         Args:
+             messages: List of message dicts
+             system: System prompt
+             max_tokens: Optional max tokens override
+
+         Returns:
+             Response dict
+         """
+         headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_key}"}
+
+         final_messages = [
+             {
+                 "role": "system",
+                 "content": [
+                     { "type": "text", "text": system }
+                 ]
+             }
+         ]
+
+         # Process messages
+         for item in messages:
+             if isinstance(item, dict):
+                 if isinstance(item["content"], list):
+                     # Content is already in the correct format
+                     final_messages.append(item)
+                 else:
+                     # Single string content, check for image
+                     base64_img = self._extract_base64_image(item["content"])
+                     if base64_img:
+                         message = {
+                             "role": item["role"],
+                             "content": [
+                                 {
+                                     "type": "image_url",
+                                     "image_url": {"url": f"data:image/jpeg;base64,{base64_img}"},
+                                 }
+                             ],
+                         }
+                     else:
+                         message = {
+                             "role": item["role"],
+                             "content": [{"type": "text", "text": item["content"]}],
+                         }
+                     final_messages.append(message)
+             else:
+                 # String content, check for image
+                 base64_img = self._extract_base64_image(item)
+                 if base64_img:
+                     message = {
+                         "role": "user",
+                         "content": [
+                             {
+                                 "type": "image_url",
+                                 "image_url": {"url": f"data:image/jpeg;base64,{base64_img}"},
+                             }
+                         ],
+                     }
+                 else:
+                     message = {"role": "user", "content": [{"type": "text", "text": item}]}
+                 final_messages.append(message)
+
+         payload = {
+             "model": self.model,
+             "messages": final_messages,
+             "max_tokens": max_tokens or self.max_tokens,
+             "temperature": self.temperature,
+             "top_p": 0.7,
+         }
+
+         try:
+             async with aiohttp.ClientSession() as session:
+                 # Use default base URL if none provided
+                 base_url = self.provider_base_url or "http://localhost:8000/v1"
+
+                 # Check if the base URL already includes the chat/completions endpoint
+
+                 endpoint_url = base_url
+                 if not endpoint_url.endswith("/chat/completions"):
+                     # If URL is RunPod format, make it OpenAI compatible
+                     if endpoint_url.startswith("https://api.runpod.ai/v2/"):
+                         # Extract RunPod endpoint ID
+                         parts = endpoint_url.split("/")
+                         if len(parts) >= 5:
+                             runpod_id = parts[4]
+                             endpoint_url = f"https://api.runpod.ai/v2/{runpod_id}/openai/v1/chat/completions"
+                     # If the URL ends with /v1, append /chat/completions
+                     elif endpoint_url.endswith("/v1"):
+                         endpoint_url = f"{endpoint_url}/chat/completions"
+                     # If the URL doesn't end with /v1, make sure it has a proper structure
+                     elif not endpoint_url.endswith("/"):
+                         endpoint_url = f"{endpoint_url}/chat/completions"
+                     else:
+                         endpoint_url = f"{endpoint_url}chat/completions"
+
+                 # Log the endpoint URL for debugging
+                 logger.debug(f"Using endpoint URL: {endpoint_url}")
+
+                 async with session.post(endpoint_url, headers=headers, json=payload) as response:
+                     # Log the status and content type
+                     logger.debug(f"Status: {response.status}")
+                     logger.debug(f"Content-Type: {response.headers.get('Content-Type')}")
+
+                     # Get the raw text of the response
+                     response_text = await response.text()
+                     logger.debug(f"Response content: {response_text}")
+
+                     # Try to parse as JSON if the content type is appropriate
+                     if "application/json" in response.headers.get('Content-Type', ''):
+                         response_json = await response.json()
+                     else:
+                         raise Exception(f"Response is not JSON format")
+                         # # Optionally try to parse it anyway
+                         # try:
+                         # import json
+                         # response_json = json.loads(response_text)
+                         # except json.JSONDecodeError as e:
+                         # print(f"Failed to parse response as JSON: {e}")
+
+                     if response.status != 200:
+                         error_msg = response_json.get("error", {}).get(
+                             "message", str(response_json)
+                         )
+                         logger.error(f"Error in API call: {error_msg}")
+                         raise Exception(f"API error: {error_msg}")
+
+                     return response_json
+
+         except Exception as e:
+             logger.error(f"Error in API call: {str(e)}")
+             raise

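As a rough usage reference for the new client, the snippet below instantiates `OAICompatClient` against a locally served OpenAI-compatible endpoint and calls `run_interleaved()`; the endpoint URL and prompts are placeholders, and the assumption that the returned dict follows the standard OpenAI `choices` layout comes from the endpoint, not from this diff.

```python
# Hedged usage sketch for the new UI-TARS OAICompatClient defined above.
# Endpoint URL, prompts, and the response layout are assumptions.
import asyncio

from agent.providers.uitars.clients.oaicompat import OAICompatClient


async def main() -> None:
    client = OAICompatClient(
        api_key="EMPTY",  # local endpoints typically ignore the key
        model="Qwen2.5-VL-7B-Instruct",
        provider_base_url="http://localhost:8000/v1",
    )
    response = await client.run_interleaved(
        messages=[{"role": "user", "content": "Describe the current screen."}],
        system="You are a GUI agent.",
    )
    # OpenAI-compatible servers normally return a "choices" array.
    print(response["choices"][0]["message"]["content"])


if __name__ == "__main__":
    asyncio.run(main())
```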