cua-agent 0.1.35__tar.gz → 0.1.37__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: the registry has flagged this version of cua-agent; see the package's registry page for details.

Files changed (85)
  1. {cua_agent-0.1.35 → cua_agent-0.1.37}/PKG-INFO +30 -3
  2. {cua_agent-0.1.35 → cua_agent-0.1.37}/README.md +28 -2
  3. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/core/factory.py +1 -0
  4. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/core/provider_config.py +2 -0
  5. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/core/types.py +1 -0
  6. cua_agent-0.1.37/agent/providers/uitars/clients/mlxvlm.py +263 -0
  7. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/providers/uitars/loop.py +25 -12
  8. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/providers/uitars/utils.py +1 -1
  9. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/ui/gradio/app.py +19 -5
  10. {cua_agent-0.1.35 → cua_agent-0.1.37}/pyproject.toml +4 -3
  11. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/__init__.py +0 -0
  12. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/core/__init__.py +0 -0
  13. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/core/agent.py +0 -0
  14. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/core/base.py +0 -0
  15. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/core/callbacks.py +0 -0
  16. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/core/experiment.py +0 -0
  17. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/core/messages.py +0 -0
  18. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/core/telemetry.py +0 -0
  19. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/core/tools/__init__.py +0 -0
  20. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/core/tools/base.py +0 -0
  21. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/core/tools/bash.py +0 -0
  22. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/core/tools/collection.py +0 -0
  23. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/core/tools/computer.py +0 -0
  24. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/core/tools/edit.py +0 -0
  25. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/core/tools/manager.py +0 -0
  26. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/core/tools.py +0 -0
  27. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/core/visualization.py +0 -0
  28. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/providers/__init__.py +0 -0
  29. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/providers/anthropic/__init__.py +0 -0
  30. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/providers/anthropic/api/client.py +0 -0
  31. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/providers/anthropic/api/logging.py +0 -0
  32. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/providers/anthropic/api_handler.py +0 -0
  33. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/providers/anthropic/callbacks/__init__.py +0 -0
  34. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/providers/anthropic/callbacks/manager.py +0 -0
  35. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/providers/anthropic/loop.py +0 -0
  36. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/providers/anthropic/prompts.py +0 -0
  37. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/providers/anthropic/response_handler.py +0 -0
  38. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/providers/anthropic/tools/__init__.py +0 -0
  39. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/providers/anthropic/tools/base.py +0 -0
  40. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/providers/anthropic/tools/bash.py +0 -0
  41. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/providers/anthropic/tools/collection.py +0 -0
  42. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/providers/anthropic/tools/computer.py +0 -0
  43. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/providers/anthropic/tools/edit.py +0 -0
  44. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/providers/anthropic/tools/manager.py +0 -0
  45. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/providers/anthropic/tools/run.py +0 -0
  46. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/providers/anthropic/types.py +0 -0
  47. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/providers/anthropic/utils.py +0 -0
  48. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/providers/omni/__init__.py +0 -0
  49. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/providers/omni/api_handler.py +0 -0
  50. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/providers/omni/clients/anthropic.py +0 -0
  51. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/providers/omni/clients/base.py +0 -0
  52. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/providers/omni/clients/oaicompat.py +0 -0
  53. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/providers/omni/clients/ollama.py +0 -0
  54. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/providers/omni/clients/openai.py +0 -0
  55. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/providers/omni/clients/utils.py +0 -0
  56. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/providers/omni/image_utils.py +0 -0
  57. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/providers/omni/loop.py +0 -0
  58. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/providers/omni/parser.py +0 -0
  59. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/providers/omni/prompts.py +0 -0
  60. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/providers/omni/tools/__init__.py +0 -0
  61. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/providers/omni/tools/base.py +0 -0
  62. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/providers/omni/tools/bash.py +0 -0
  63. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/providers/omni/tools/computer.py +0 -0
  64. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/providers/omni/tools/manager.py +0 -0
  65. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/providers/omni/utils.py +0 -0
  66. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/providers/openai/__init__.py +0 -0
  67. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/providers/openai/api_handler.py +0 -0
  68. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/providers/openai/loop.py +0 -0
  69. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/providers/openai/response_handler.py +0 -0
  70. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/providers/openai/tools/__init__.py +0 -0
  71. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/providers/openai/tools/base.py +0 -0
  72. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/providers/openai/tools/computer.py +0 -0
  73. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/providers/openai/tools/manager.py +0 -0
  74. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/providers/openai/types.py +0 -0
  75. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/providers/openai/utils.py +0 -0
  76. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/providers/uitars/__init__.py +0 -0
  77. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/providers/uitars/clients/base.py +0 -0
  78. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/providers/uitars/clients/oaicompat.py +0 -0
  79. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/providers/uitars/prompts.py +0 -0
  80. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/providers/uitars/tools/__init__.py +0 -0
  81. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/providers/uitars/tools/computer.py +0 -0
  82. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/providers/uitars/tools/manager.py +0 -0
  83. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/telemetry.py +0 -0
  84. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/ui/__init__.py +0 -0
  85. {cua_agent-0.1.35 → cua_agent-0.1.37}/agent/ui/gradio/__init__.py +0 -0
{cua_agent-0.1.35 → cua_agent-0.1.37}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cua-agent
-Version: 0.1.35
+Version: 0.1.37
 Summary: CUA (Computer Use) Agent for AI-driven computer interaction
 Author-Email: TryCua <gh@trycua.com>
 Requires-Python: >=3.10
@@ -23,6 +23,7 @@ Requires-Dist: openai<2.0.0,>=1.14.0; extra == "openai"
 Requires-Dist: httpx<0.29.0,>=0.27.0; extra == "openai"
 Provides-Extra: uitars
 Requires-Dist: httpx<0.29.0,>=0.27.0; extra == "uitars"
+Provides-Extra: uitars-mlx
 Provides-Extra: ui
 Requires-Dist: gradio<6.0.0,>=5.23.3; extra == "ui"
 Requires-Dist: python-dotenv<2.0.0,>=1.0.1; extra == "ui"
@@ -102,6 +103,7 @@ pip install "cua-agent[all]"
 pip install "cua-agent[openai]" # OpenAI Cua Loop
 pip install "cua-agent[anthropic]" # Anthropic Cua Loop
 pip install "cua-agent[uitars]" # UI-Tars support
+pip install "cua-agent[uitars-mlx]" # local UI-Tars support with MLXVLM
 pip install "cua-agent[omni]" # Cua Loop based on OmniParser (includes Ollama for local models)
 pip install "cua-agent[ui]" # Gradio UI for the agent
 ```
@@ -206,7 +208,32 @@ The Gradio UI provides:
 
 ### Using UI-TARS
 
-You can use UI-TARS by first following the [deployment guide](https://github.com/bytedance/UI-TARS/blob/main/README_deploy.md). This will give you a provider URL like this: `https://**************.us-east-1.aws.endpoints.huggingface.cloud/v1` which you can use in the gradio UI.
+The UI-TARS models are available in two forms:
+
+1. **MLX UI-TARS models** (Default): These models run locally using MLXVLM provider
+   - `mlx-community/UI-TARS-1.5-7B-4bit` (default) - 4-bit quantized version
+   - `mlx-community/UI-TARS-1.5-7B-6bit` - 6-bit quantized version for higher quality
+
+   ```python
+   agent = ComputerAgent(
+       computer=macos_computer,
+       loop=AgentLoop.UITARS,
+       model=LLM(provider=LLMProvider.MLXVLM, name="mlx-community/UI-TARS-1.5-7B-4bit")
+   )
+   ```
+
+2. **OpenAI-compatible UI-TARS**: For using the original ByteDance model
+   - If you want to use the original ByteDance UI-TARS model via an OpenAI-compatible API, follow the [deployment guide](https://github.com/bytedance/UI-TARS/blob/main/README_deploy.md)
+   - This will give you a provider URL like `https://**************.us-east-1.aws.endpoints.huggingface.cloud/v1` which you can use in the code or Gradio UI:
+
+   ```python
+   agent = ComputerAgent(
+       computer=macos_computer,
+       loop=AgentLoop.UITARS,
+       model=LLM(provider=LLMProvider.OAICOMPAT, name="tgi",
+                 provider_base_url="https://**************.us-east-1.aws.endpoints.huggingface.cloud/v1")
+   )
+   ```
 
 ## Agent Loops
 
@@ -216,7 +243,7 @@ The `cua-agent` package provides three agent loops variations, based on differen
 |:-----------|:-----------------|:------------|:-------------|
 | `AgentLoop.OPENAI` | • `computer_use_preview` | Use OpenAI Operator CUA model | Not Required |
 | `AgentLoop.ANTHROPIC` | • `claude-3-5-sonnet-20240620`<br>• `claude-3-7-sonnet-20250219` | Use Anthropic Computer-Use | Not Required |
-| `AgentLoop.UITARS` | • `ByteDance-Seed/UI-TARS-1.5-7B` | Uses ByteDance's UI-TARS 1.5 model | Not Required |
+| `AgentLoop.UITARS` | • `mlx-community/UI-TARS-1.5-7B-4bit` (default)<br>• `mlx-community/UI-TARS-1.5-7B-6bit`<br>• `ByteDance-Seed/UI-TARS-1.5-7B` (via openAI-compatible endpoint) | Uses UI-TARS models with MLXVLM (default) or OAICOMPAT providers | Not Required |
 | `AgentLoop.OMNI` | • `claude-3-5-sonnet-20240620`<br>• `claude-3-7-sonnet-20250219`<br>• `gpt-4.5-preview`<br>• `gpt-4o`<br>• `gpt-4`<br>• `phi4`<br>• `phi4-mini`<br>• `gemma3`<br>• `...`<br>• `Any Ollama or OpenAI-compatible model` | Use OmniParser for element pixel-detection (SoM) and any VLMs for UI Grounding and Reasoning | OmniParser |
 
 ## AgentResponse
{cua_agent-0.1.35 → cua_agent-0.1.37}/README.md

@@ -32,6 +32,7 @@ pip install "cua-agent[all]"
 pip install "cua-agent[openai]" # OpenAI Cua Loop
 pip install "cua-agent[anthropic]" # Anthropic Cua Loop
 pip install "cua-agent[uitars]" # UI-Tars support
+pip install "cua-agent[uitars-mlx]" # local UI-Tars support with MLXVLM
 pip install "cua-agent[omni]" # Cua Loop based on OmniParser (includes Ollama for local models)
 pip install "cua-agent[ui]" # Gradio UI for the agent
 ```
@@ -136,7 +137,32 @@ The Gradio UI provides:
 
 ### Using UI-TARS
 
-You can use UI-TARS by first following the [deployment guide](https://github.com/bytedance/UI-TARS/blob/main/README_deploy.md). This will give you a provider URL like this: `https://**************.us-east-1.aws.endpoints.huggingface.cloud/v1` which you can use in the gradio UI.
+The UI-TARS models are available in two forms:
+
+1. **MLX UI-TARS models** (Default): These models run locally using MLXVLM provider
+   - `mlx-community/UI-TARS-1.5-7B-4bit` (default) - 4-bit quantized version
+   - `mlx-community/UI-TARS-1.5-7B-6bit` - 6-bit quantized version for higher quality
+
+   ```python
+   agent = ComputerAgent(
+       computer=macos_computer,
+       loop=AgentLoop.UITARS,
+       model=LLM(provider=LLMProvider.MLXVLM, name="mlx-community/UI-TARS-1.5-7B-4bit")
+   )
+   ```
+
+2. **OpenAI-compatible UI-TARS**: For using the original ByteDance model
+   - If you want to use the original ByteDance UI-TARS model via an OpenAI-compatible API, follow the [deployment guide](https://github.com/bytedance/UI-TARS/blob/main/README_deploy.md)
+   - This will give you a provider URL like `https://**************.us-east-1.aws.endpoints.huggingface.cloud/v1` which you can use in the code or Gradio UI:
+
+   ```python
+   agent = ComputerAgent(
+       computer=macos_computer,
+       loop=AgentLoop.UITARS,
+       model=LLM(provider=LLMProvider.OAICOMPAT, name="tgi",
+                 provider_base_url="https://**************.us-east-1.aws.endpoints.huggingface.cloud/v1")
+   )
+   ```
 
 ## Agent Loops
 
@@ -146,7 +172,7 @@ The `cua-agent` package provides three agent loops variations, based on differen
 |:-----------|:-----------------|:------------|:-------------|
 | `AgentLoop.OPENAI` | • `computer_use_preview` | Use OpenAI Operator CUA model | Not Required |
 | `AgentLoop.ANTHROPIC` | • `claude-3-5-sonnet-20240620`<br>• `claude-3-7-sonnet-20250219` | Use Anthropic Computer-Use | Not Required |
-| `AgentLoop.UITARS` | • `ByteDance-Seed/UI-TARS-1.5-7B` | Uses ByteDance's UI-TARS 1.5 model | Not Required |
+| `AgentLoop.UITARS` | • `mlx-community/UI-TARS-1.5-7B-4bit` (default)<br>• `mlx-community/UI-TARS-1.5-7B-6bit`<br>• `ByteDance-Seed/UI-TARS-1.5-7B` (via openAI-compatible endpoint) | Uses UI-TARS models with MLXVLM (default) or OAICOMPAT providers | Not Required |
 | `AgentLoop.OMNI` | • `claude-3-5-sonnet-20240620`<br>• `claude-3-7-sonnet-20250219`<br>• `gpt-4.5-preview`<br>• `gpt-4o`<br>• `gpt-4`<br>• `phi4`<br>• `phi4-mini`<br>• `gemma3`<br>• `...`<br>• `Any Ollama or OpenAI-compatible model` | Use OmniParser for element pixel-detection (SoM) and any VLMs for UI Grounding and Reasoning | OmniParser |
 
 ## AgentResponse
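For orientation, here is a minimal end-to-end sketch of how the new `uitars-mlx` extra and the MLXVLM provider fit together. The `Computer` context manager and the async `agent.run(...)` iterator are assumed from the wider cua-agent/cua-computer documentation rather than this diff, and the import paths and task string are illustrative only.

```python
# pip install "cua-agent[uitars-mlx]"   (plus cua-computer for the Computer class)
import asyncio

from computer import Computer                                # assumed import path
from agent import ComputerAgent, AgentLoop, LLM, LLMProvider  # assumed import path

async def main() -> None:
    async with Computer() as macos_computer:                  # assumed context-manager API
        agent = ComputerAgent(
            computer=macos_computer,
            loop=AgentLoop.UITARS,
            # Runs locally via MLXVLM; no API key or provider_base_url needed.
            model=LLM(provider=LLMProvider.MLXVLM, name="mlx-community/UI-TARS-1.5-7B-4bit"),
        )
        async for result in agent.run("Open Safari and search for trycua"):  # assumed iterator API
            print(result)

asyncio.run(main())
```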
{cua_agent-0.1.35 → cua_agent-0.1.37}/agent/core/factory.py

@@ -116,6 +116,7 @@ class LoopFactory:
                 base_dir=trajectory_dir,
                 only_n_most_recent_images=only_n_most_recent_images,
                 provider_base_url=provider_base_url,
+                provider=provider,
             )
         else:
             raise ValueError(f"Unsupported loop type: {loop_type}")
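The hunk above only shows the keyword arguments near the change. As context, a hedged sketch of what the surrounding UI-TARS branch of the factory presumably looks like after this release; every name outside the hunk is illustrative:

```python
# Sketch only: the branch that builds the UI-TARS loop now forwards the caller's
# provider (e.g. LLMProvider.MLXVLM) instead of leaving the loop to assume OAICOMPAT.
loop = UITARSLoop(
    computer=computer,                 # illustrative; not shown in the hunk
    api_key=api_key,
    model=model_name,
    base_dir=trajectory_dir,
    only_n_most_recent_images=only_n_most_recent_images,
    provider_base_url=provider_base_url,
    provider=provider,                 # new keyword added in 0.1.37
)
```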
{cua_agent-0.1.35 → cua_agent-0.1.37}/agent/core/provider_config.py

@@ -8,6 +8,7 @@ DEFAULT_MODELS = {
     LLMProvider.ANTHROPIC: "claude-3-7-sonnet-20250219",
     LLMProvider.OLLAMA: "gemma3:4b-it-q4_K_M",
     LLMProvider.OAICOMPAT: "Qwen2.5-VL-7B-Instruct",
+    LLMProvider.MLXVLM: "mlx-community/UI-TARS-1.5-7B-4bit",
 }
 
 # Map providers to their environment variable names
@@ -16,4 +17,5 @@ ENV_VARS = {
     LLMProvider.ANTHROPIC: "ANTHROPIC_API_KEY",
     LLMProvider.OLLAMA: "none",
     LLMProvider.OAICOMPAT: "none",  # OpenAI-compatible API typically doesn't require an API key
+    LLMProvider.MLXVLM: "none",  # MLX VLM typically doesn't require an API key
 }
{cua_agent-0.1.35 → cua_agent-0.1.37}/agent/core/types.py

@@ -23,6 +23,7 @@ class LLMProvider(StrEnum):
     OPENAI = "openai"
     OLLAMA = "ollama"
     OAICOMPAT = "oaicompat"
+    MLXVLM = "mlxvlm"
 
 
 @dataclass
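Taken together, the two config changes above mean an MLXVLM selection can be resolved to a default model without any API key. A small sketch of what they imply; `pick_default_model` is a hypothetical helper, and the import paths are assumed from the file locations in this diff:

```python
from agent.core.types import LLM, LLMProvider                     # import paths assumed
from agent.core.provider_config import DEFAULT_MODELS, ENV_VARS

def pick_default_model(provider: LLMProvider) -> LLM:
    """Hypothetical helper: build an LLM spec from the defaults table."""
    return LLM(provider=provider, name=DEFAULT_MODELS[provider])

llm = pick_default_model(LLMProvider.MLXVLM)
print(llm.name)                       # mlx-community/UI-TARS-1.5-7B-4bit
print(ENV_VARS[LLMProvider.MLXVLM])   # "none": no API key variable is expected for local MLX inference
```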
cua_agent-0.1.37/agent/providers/uitars/clients/mlxvlm.py (new file)

@@ -0,0 +1,263 @@
+"""MLX LVM client implementation."""
+
+import io
+import logging
+import base64
+import tempfile
+import os
+import re
+import math
+from typing import Dict, List, Optional, Any, cast, Tuple
+from PIL import Image
+
+from .base import BaseUITarsClient
+import mlx.core as mx
+from mlx_vlm import load, generate
+from mlx_vlm.prompt_utils import apply_chat_template
+from mlx_vlm.utils import load_config
+from transformers.tokenization_utils import PreTrainedTokenizer
+
+logger = logging.getLogger(__name__)
+
+# Constants for smart_resize
+IMAGE_FACTOR = 28
+MIN_PIXELS = 100 * 28 * 28
+MAX_PIXELS = 16384 * 28 * 28
+MAX_RATIO = 200
+
+def round_by_factor(number: float, factor: int) -> int:
+    """Returns the closest integer to 'number' that is divisible by 'factor'."""
+    return round(number / factor) * factor
+
+def ceil_by_factor(number: float, factor: int) -> int:
+    """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'."""
+    return math.ceil(number / factor) * factor
+
+def floor_by_factor(number: float, factor: int) -> int:
+    """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
+    return math.floor(number / factor) * factor
+
+def smart_resize(
+    height: int, width: int, factor: int = IMAGE_FACTOR, min_pixels: int = MIN_PIXELS, max_pixels: int = MAX_PIXELS
+) -> tuple[int, int]:
+    """
+    Rescales the image so that the following conditions are met:
+
+    1. Both dimensions (height and width) are divisible by 'factor'.
+    2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
+    3. The aspect ratio of the image is maintained as closely as possible.
+    """
+    if max(height, width) / min(height, width) > MAX_RATIO:
+        raise ValueError(
+            f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(height, width) / min(height, width)}"
+        )
+    h_bar = max(factor, round_by_factor(height, factor))
+    w_bar = max(factor, round_by_factor(width, factor))
+    if h_bar * w_bar > max_pixels:
+        beta = math.sqrt((height * width) / max_pixels)
+        h_bar = floor_by_factor(height / beta, factor)
+        w_bar = floor_by_factor(width / beta, factor)
+    elif h_bar * w_bar < min_pixels:
+        beta = math.sqrt(min_pixels / (height * width))
+        h_bar = ceil_by_factor(height * beta, factor)
+        w_bar = ceil_by_factor(width * beta, factor)
+    return h_bar, w_bar
+
+class MLXVLMUITarsClient(BaseUITarsClient):
+    """MLX LVM client implementation class."""
+
+    def __init__(
+        self,
+        model: str = "mlx-community/UI-TARS-1.5-7B-4bit"
+    ):
+        """Initialize MLX LVM client.
+
+        Args:
+            model: Model name or path (defaults to mlx-community/UI-TARS-1.5-7B-4bit)
+        """
+        # Load model and processor
+        model_obj, processor = load(
+            model,
+            processor_kwargs={"min_pixels": MIN_PIXELS, "max_pixels": MAX_PIXELS}
+        )
+        self.config = load_config(model)
+        self.model = model_obj
+        self.processor = processor
+        self.model_name = model
+
+    def _process_coordinates(self, text: str, original_size: Tuple[int, int], model_size: Tuple[int, int]) -> str:
+        """Process coordinates in box tokens based on image resizing using smart_resize approach.
+
+        Args:
+            text: Text containing box tokens
+            original_size: Original image size (width, height)
+            model_size: Model processed image size (width, height)
+
+        Returns:
+            Text with processed coordinates
+        """
+        # Find all box tokens
+        box_pattern = r"<\|box_start\|>\((\d+),\s*(\d+)\)<\|box_end\|>"
+
+        def process_coords(match):
+            model_x, model_y = int(match.group(1)), int(match.group(2))
+            # Scale coordinates from model space to original image space
+            # Both original_size and model_size are in (width, height) format
+            new_x = int(model_x * original_size[0] / model_size[0])  # Width
+            new_y = int(model_y * original_size[1] / model_size[1])  # Height
+            return f"<|box_start|>({new_x},{new_y})<|box_end|>"
+
+        return re.sub(box_pattern, process_coords, text)
+
+    async def run_interleaved(
+        self, messages: List[Dict[str, Any]], system: str, max_tokens: Optional[int] = None
+    ) -> Dict[str, Any]:
+        """Run interleaved chat completion.
+
+        Args:
+            messages: List of message dicts
+            system: System prompt
+            max_tokens: Optional max tokens override
+
+        Returns:
+            Response dict
+        """
+        # Ensure the system message is included
+        if not any(msg.get("role") == "system" for msg in messages):
+            messages = [{"role": "system", "content": system}] + messages
+
+        # Create a deep copy of messages to avoid modifying the original
+        processed_messages = messages.copy()
+
+        # Extract images and process messages
+        images = []
+        original_sizes = {}  # Track original sizes of images for coordinate mapping
+        model_sizes = {}  # Track model processed sizes
+        image_index = 0
+
+        for msg_idx, msg in enumerate(messages):
+            content = msg.get("content", [])
+            if not isinstance(content, list):
+                continue
+
+            # Create a copy of the content list to modify
+            processed_content = []
+
+            for item_idx, item in enumerate(content):
+                if item.get("type") == "image_url":
+                    image_url = item.get("image_url", {}).get("url", "")
+                    pil_image = None
+
+                    if image_url.startswith("data:image/"):
+                        # Extract base64 data
+                        base64_data = image_url.split(',')[1]
+                        # Convert base64 to PIL Image
+                        image_data = base64.b64decode(base64_data)
+                        pil_image = Image.open(io.BytesIO(image_data))
+                    else:
+                        # Handle file path or URL
+                        pil_image = Image.open(image_url)
+
+                    # Store original image size for coordinate mapping
+                    original_size = pil_image.size
+                    original_sizes[image_index] = original_size
+
+                    # Use smart_resize to determine model size
+                    # Note: smart_resize expects (height, width) but PIL gives (width, height)
+                    height, width = original_size[1], original_size[0]
+                    new_height, new_width = smart_resize(height, width)
+                    # Store model size in (width, height) format for consistent coordinate processing
+                    model_sizes[image_index] = (new_width, new_height)
+
+                    # Resize the image using the calculated dimensions from smart_resize
+                    resized_image = pil_image.resize((new_width, new_height))
+                    images.append(resized_image)
+                    image_index += 1
+
+                # Copy items to processed content list
+                processed_content.append(item.copy())
+
+            # Update the processed message content
+            processed_messages[msg_idx] = msg.copy()
+            processed_messages[msg_idx]["content"] = processed_content
+
+        logger.info(f"resized {len(images)} from {original_sizes[0]} to {model_sizes[0]}")
+
+        # Process user text input with box coordinates after image processing
+        # Swap original_size and model_size arguments for inverse transformation
+        for msg_idx, msg in enumerate(processed_messages):
+            if msg.get("role") == "user" and isinstance(msg.get("content"), str):
+                if "<|box_start|>" in msg.get("content") and original_sizes and model_sizes and 0 in original_sizes and 0 in model_sizes:
+                    orig_size = original_sizes[0]
+                    model_size = model_sizes[0]
+                    # Swap arguments to perform inverse transformation for user input
+                    processed_messages[msg_idx]["content"] = self._process_coordinates(msg["content"], model_size, orig_size)
+
+        try:
+            # Format prompt according to model requirements using the processor directly
+            prompt = self.processor.apply_chat_template(
+                processed_messages,
+                tokenize=False,
+                add_generation_prompt=True
+            )
+            tokenizer = cast(PreTrainedTokenizer, self.processor)
+
+            print("generating response...")
+
+            # Generate response
+            text_content, usage = generate(
+                self.model,
+                tokenizer,
+                str(prompt),
+                images,
+                verbose=False,
+                max_tokens=max_tokens
+            )
+
+            from pprint import pprint
+            print("DEBUG - AGENT GENERATION --------")
+            pprint(text_content)
+            print("DEBUG - AGENT GENERATION --------")
+        except Exception as e:
+            logger.error(f"Error generating response: {str(e)}")
+            return {
+                "choices": [
+                    {
+                        "message": {
+                            "role": "assistant",
+                            "content": f"Error generating response: {str(e)}"
+                        },
+                        "finish_reason": "error"
+                    }
+                ],
+                "model": self.model_name,
+                "error": str(e)
+            }
+
+        # Process coordinates in the response back to original image space
+        if original_sizes and model_sizes and 0 in original_sizes and 0 in model_sizes:
+            # Get original image size and model size (using the first image)
+            orig_size = original_sizes[0]
+            model_size = model_sizes[0]
+
+            # Check if output contains box tokens that need processing
+            if "<|box_start|>" in text_content:
+                # Process coordinates from model space back to original image space
+                text_content = self._process_coordinates(text_content, orig_size, model_size)
+
+        # Format response to match OpenAI format
+        response = {
+            "choices": [
+                {
+                    "message": {
+                        "role": "assistant",
+                        "content": text_content
+                    },
+                    "finish_reason": "stop"
+                }
+            ],
+            "model": self.model_name,
+            "usage": usage
+        }
+
+        return response
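To make the resizing and coordinate handling in the new client concrete, here is a self-contained sketch that reproduces the same `smart_resize` rounding and box-token rescaling outside the package. The 1512x982 screenshot size and the sample model output are made up for illustration:

```python
# Standalone illustration mirroring agent/providers/uitars/clients/mlxvlm.py; values are hypothetical.
import math
import re

IMAGE_FACTOR = 28
MIN_PIXELS = 100 * 28 * 28
MAX_PIXELS = 16384 * 28 * 28

def smart_resize(height: int, width: int, factor: int = IMAGE_FACTOR) -> tuple[int, int]:
    """Round both sides to multiples of `factor`, keeping the pixel count in range."""
    h_bar = max(factor, round(height / factor) * factor)
    w_bar = max(factor, round(width / factor) * factor)
    if h_bar * w_bar > MAX_PIXELS:
        beta = math.sqrt((height * width) / MAX_PIXELS)
        h_bar = math.floor(height / beta / factor) * factor
        w_bar = math.floor(width / beta / factor) * factor
    elif h_bar * w_bar < MIN_PIXELS:
        beta = math.sqrt(MIN_PIXELS / (height * width))
        h_bar = math.ceil(height * beta / factor) * factor
        w_bar = math.ceil(width * beta / factor) * factor
    return h_bar, w_bar

def rescale_boxes(text: str, src: tuple[int, int], dst: tuple[int, int]) -> str:
    """Map <|box_start|>(x,y)<|box_end|> points from `src` (w, h) space to `dst` (w, h) space."""
    pattern = r"<\|box_start\|>\((\d+),\s*(\d+)\)<\|box_end\|>"
    def scale(m: re.Match) -> str:
        x, y = int(m.group(1)), int(m.group(2))
        return f"<|box_start|>({int(x * dst[0] / src[0])},{int(y * dst[1] / src[1])})<|box_end|>"
    return re.sub(pattern, scale, text)

# Hypothetical screenshot size (width, height); any size works.
orig_w, orig_h = 1512, 982
new_h, new_w = smart_resize(orig_h, orig_w)            # -> (980, 1512): both multiples of 28
model_out = "click <|box_start|>(756,490)<|box_end|>"  # model speaks in resized-image pixels
print(rescale_boxes(model_out, (new_w, new_h), (orig_w, orig_h)))
# click <|box_start|>(756,491)<|box_end|>, i.e. the point mapped back to the original screenshot
```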
{cua_agent-0.1.35 → cua_agent-0.1.37}/agent/providers/uitars/loop.py

@@ -23,6 +23,7 @@ from .tools.computer import ToolResult
 from .prompts import COMPUTER_USE, SYSTEM_PROMPT, MAC_SPECIFIC_NOTES
 
 from .clients.oaicompat import OAICompatClient
+from .clients.mlxvlm import MLXVLMUITarsClient
 
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -44,6 +45,7 @@ class UITARSLoop(BaseLoop):
         computer: Computer,
         api_key: str,
         model: str,
+        provider: Optional[LLMProvider] = None,
         provider_base_url: Optional[str] = "http://localhost:8000/v1",
         only_n_most_recent_images: Optional[int] = 2,
         base_dir: Optional[str] = "trajectories",
@@ -64,9 +66,10 @@ class UITARSLoop(BaseLoop):
             max_retries: Maximum number of retries for API calls
             retry_delay: Delay between retries in seconds
             save_trajectory: Whether to save trajectory data
+            provider: The LLM provider to use (defaults to OAICOMPAT if not specified)
         """
         # Set provider before initializing base class
-        self.provider = LLMProvider.OAICOMPAT
+        self.provider = provider or LLMProvider.OAICOMPAT
         self.provider_base_url = provider_base_url
 
         # Initialize message manager with image retention config
@@ -113,7 +116,7 @@ class UITARSLoop(BaseLoop):
             logger.error(f"Error initializing tool manager: {str(e)}")
             logger.warning("Will attempt to initialize tools on first use.")
 
-        # Initialize client for the OAICompat provider
+        # Initialize client for the selected provider
         try:
             await self.initialize_client()
         except Exception as e:
@@ -128,18 +131,28 @@ class UITARSLoop(BaseLoop):
         """Initialize the appropriate client.
 
         Implements abstract method from BaseLoop to set up the specific
-        provider client (OAICompat for UI-TARS).
+        provider client based on the configured provider.
         """
         try:
-            logger.info(f"Initializing OAICompat client for UI-TARS with model {self.model}...")
-
-            self.client = OAICompatClient(
-                api_key=self.api_key or "EMPTY",  # Local endpoints typically don't require an API key
-                model=self.model,
-                provider_base_url=self.provider_base_url,
-            )
-
-            logger.info(f"Initialized OAICompat client with model {self.model}")
+            if self.provider == LLMProvider.MLXVLM:
+                logger.info(f"Initializing MLX VLM client for UI-TARS with model {self.model}...")
+
+                self.client = MLXVLMUITarsClient(
+                    model=self.model,
+                )
+
+                logger.info(f"Initialized MLX VLM client with model {self.model}")
+            else:
+                # Default to OAICompat client for other providers
+                logger.info(f"Initializing OAICompat client for UI-TARS with model {self.model}...")
+
+                self.client = OAICompatClient(
+                    api_key=self.api_key or "EMPTY",  # Local endpoints typically don't require an API key
+                    model=self.model,
+                    provider_base_url=self.provider_base_url,
+                )
+
+                logger.info(f"Initialized OAICompat client with model {self.model}")
         except Exception as e:
             logger.error(f"Error initializing client: {str(e)}")
             self.client = None
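In practice the new `provider` argument is the only thing that changes which client `initialize_client()` builds. A hedged sketch; the computer, API key and endpoint values are placeholders, while the constructor keywords come from the hunks above:

```python
# Local inference: the MLXVLM provider routes to MLXVLMUITarsClient, no endpoint or key needed.
mlx_loop = UITARSLoop(
    computer=computer,            # placeholder
    api_key="",
    model="mlx-community/UI-TARS-1.5-7B-4bit",
    provider=LLMProvider.MLXVLM,
)

# Hosted inference: omitting provider keeps the 0.1.35 behaviour (OAICOMPAT -> OAICompatClient).
hosted_loop = UITARSLoop(
    computer=computer,            # placeholder
    api_key="EMPTY",
    model="tgi",
    provider_base_url="https://<your-endpoint>/v1",   # placeholder endpoint
)
```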
{cua_agent-0.1.35 → cua_agent-0.1.37}/agent/providers/uitars/utils.py

@@ -105,7 +105,7 @@ async def to_agent_response_format(
            }
        ],
        truncation="auto",
-        usage=response["usage"],
+        usage=response.get("usage", {}),
        user=None,
        metadata={},
        response=response
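The switch to `.get` matters because not every client response carries a `usage` field; the MLX client's error path above, for instance, returns `choices`, `model` and `error` but no `usage`. A short illustration:

```python
# Shape of the MLX client's error response (see mlxvlm.py earlier in this diff).
error_response = {"choices": [], "model": "mlx-community/UI-TARS-1.5-7B-4bit", "error": "boom"}

print(error_response.get("usage", {}))  # -> {}; error_response["usage"] would raise KeyError here
```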
{cua_agent-0.1.35 → cua_agent-0.1.37}/agent/ui/gradio/app.py

@@ -164,8 +164,10 @@ MODEL_MAPPINGS = {
         "claude-3-7-sonnet-20250219": "claude-3-7-sonnet-20250219",
     },
     "uitars": {
-        # UI-TARS models default to custom endpoint
-        "default": "ByteDance-Seed/UI-TARS-1.5-7B",
+        # UI-TARS models using MLXVLM provider
+        "default": "mlx-community/UI-TARS-1.5-7B-4bit",
+        "mlx-community/UI-TARS-1.5-7B-4bit": "mlx-community/UI-TARS-1.5-7B-4bit",
+        "mlx-community/UI-TARS-1.5-7B-6bit": "mlx-community/UI-TARS-1.5-7B-6bit"
     },
     "ollama": {
         # For Ollama models, we keep the original name
@@ -288,8 +290,16 @@ def get_provider_and_model(model_name: str, loop_provider: str) -> tuple:
         model_name_to_use = cleaned_model_name
         # agent_loop remains AgentLoop.OMNI
     elif agent_loop == AgentLoop.UITARS:
-        provider = LLMProvider.OAICOMPAT
-        model_name_to_use = MODEL_MAPPINGS["uitars"]["default"]  # Default
+        # For UITARS, use MLXVLM provider for the MLX models, OAICOMPAT for custom
+        if model_name == "Custom model...":
+            provider = LLMProvider.OAICOMPAT
+            model_name_to_use = "tgi"
+        else:
+            provider = LLMProvider.MLXVLM
+            # Get the model name from the mappings or use as-is if not found
+            model_name_to_use = MODEL_MAPPINGS["uitars"].get(
+                model_name, model_name if model_name else MODEL_MAPPINGS["uitars"]["default"]
+            )
     else:
         # Default to OpenAI if unrecognized loop
         provider = LLMProvider.OPENAI
@@ -440,7 +450,11 @@ def create_gradio_ui(
         "OPENAI": openai_models,
         "ANTHROPIC": anthropic_models,
         "OMNI": omni_models + ["Custom model..."],  # Add custom model option
-        "UITARS": ["Custom model..."],  # UI-TARS options
+        "UITARS": [
+            "mlx-community/UI-TARS-1.5-7B-4bit",
+            "mlx-community/UI-TARS-1.5-7B-6bit",
+            "Custom model..."
+        ],  # UI-TARS options with MLX models
     }
 
     # --- Apply Saved Settings (override defaults if available) ---
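A self-contained sketch of the UITARS routing that `get_provider_and_model` implements above: the "Custom model..." dropdown entry keeps the OpenAI-compatible path with the placeholder model name "tgi", while any other entry is served locally through MLXVLM. The function and dict names here are hypothetical stand-ins; the values come from the diff:

```python
UITARS_MODELS = {
    "default": "mlx-community/UI-TARS-1.5-7B-4bit",
    "mlx-community/UI-TARS-1.5-7B-4bit": "mlx-community/UI-TARS-1.5-7B-4bit",
    "mlx-community/UI-TARS-1.5-7B-6bit": "mlx-community/UI-TARS-1.5-7B-6bit",
}

def route_uitars_model(model_name: str) -> tuple[str, str]:
    """Hypothetical stand-in for the UITARS branch of get_provider_and_model()."""
    if model_name == "Custom model...":
        return "oaicompat", "tgi"   # the user supplies provider_base_url in the UI
    return "mlxvlm", UITARS_MODELS.get(model_name, model_name or UITARS_MODELS["default"])

print(route_uitars_model("mlx-community/UI-TARS-1.5-7B-6bit"))  # ('mlxvlm', 'mlx-community/UI-TARS-1.5-7B-6bit')
print(route_uitars_model("Custom model..."))                    # ('oaicompat', 'tgi')
```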
{cua_agent-0.1.35 → cua_agent-0.1.37}/pyproject.toml

@@ -6,7 +6,7 @@ build-backend = "pdm.backend"
 
 [project]
 name = "cua-agent"
-version = "0.1.35"
+version = "0.1.37"
 description = "CUA (Computer Use) Agent for AI-driven computer interaction"
 readme = "README.md"
 authors = [
@@ -39,6 +39,7 @@ openai = [
 uitars = [
     "httpx>=0.27.0,<0.29.0",
 ]
+uitars-mlx = []
 ui = [
     "gradio>=5.23.3,<6.0.0",
     "python-dotenv>=1.0.1,<2.0.0",
@@ -108,7 +109,7 @@ target-version = [
 
 [tool.ruff]
 line-length = 100
-target-version = "0.1.35"
+target-version = "0.1.37"
 select = [
     "E",
     "F",
@@ -122,7 +123,7 @@ docstring-code-format = true
 
 [tool.mypy]
 strict = true
-python_version = "0.1.35"
+python_version = "0.1.37"
 ignore_missing_imports = true
 disallow_untyped_defs = true
 check_untyped_defs = true
The remaining files listed above (items 11-85) are unchanged between the two versions.