cua-agent 0.4.14__py3-none-any.whl → 0.7.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of cua-agent might be problematic.

Files changed (82)
  1. agent/__init__.py +4 -19
  2. agent/__main__.py +2 -1
  3. agent/adapters/__init__.py +6 -0
  4. agent/adapters/azure_ml_adapter.py +283 -0
  5. agent/adapters/cua_adapter.py +161 -0
  6. agent/adapters/huggingfacelocal_adapter.py +67 -125
  7. agent/adapters/human_adapter.py +116 -114
  8. agent/adapters/mlxvlm_adapter.py +370 -0
  9. agent/adapters/models/__init__.py +41 -0
  10. agent/adapters/models/generic.py +78 -0
  11. agent/adapters/models/internvl.py +290 -0
  12. agent/adapters/models/opencua.py +115 -0
  13. agent/adapters/models/qwen2_5_vl.py +78 -0
  14. agent/agent.py +431 -241
  15. agent/callbacks/__init__.py +10 -3
  16. agent/callbacks/base.py +45 -31
  17. agent/callbacks/budget_manager.py +22 -10
  18. agent/callbacks/image_retention.py +54 -98
  19. agent/callbacks/logging.py +55 -42
  20. agent/callbacks/operator_validator.py +140 -0
  21. agent/callbacks/otel.py +291 -0
  22. agent/callbacks/pii_anonymization.py +19 -16
  23. agent/callbacks/prompt_instructions.py +47 -0
  24. agent/callbacks/telemetry.py +106 -69
  25. agent/callbacks/trajectory_saver.py +178 -70
  26. agent/cli.py +269 -119
  27. agent/computers/__init__.py +14 -9
  28. agent/computers/base.py +32 -19
  29. agent/computers/cua.py +52 -25
  30. agent/computers/custom.py +78 -71
  31. agent/decorators.py +23 -14
  32. agent/human_tool/__init__.py +2 -7
  33. agent/human_tool/__main__.py +6 -2
  34. agent/human_tool/server.py +48 -37
  35. agent/human_tool/ui.py +359 -235
  36. agent/integrations/hud/__init__.py +164 -74
  37. agent/integrations/hud/agent.py +338 -342
  38. agent/integrations/hud/proxy.py +297 -0
  39. agent/loops/__init__.py +44 -14
  40. agent/loops/anthropic.py +590 -492
  41. agent/loops/base.py +19 -15
  42. agent/loops/composed_grounded.py +142 -144
  43. agent/loops/fara/__init__.py +8 -0
  44. agent/loops/fara/config.py +506 -0
  45. agent/loops/fara/helpers.py +357 -0
  46. agent/loops/fara/schema.py +143 -0
  47. agent/loops/gelato.py +183 -0
  48. agent/loops/gemini.py +935 -0
  49. agent/loops/generic_vlm.py +601 -0
  50. agent/loops/glm45v.py +140 -135
  51. agent/loops/gta1.py +48 -51
  52. agent/loops/holo.py +218 -0
  53. agent/loops/internvl.py +180 -0
  54. agent/loops/moondream3.py +493 -0
  55. agent/loops/omniparser.py +326 -226
  56. agent/loops/openai.py +63 -56
  57. agent/loops/opencua.py +134 -0
  58. agent/loops/uiins.py +175 -0
  59. agent/loops/uitars.py +262 -212
  60. agent/loops/uitars2.py +951 -0
  61. agent/playground/__init__.py +5 -0
  62. agent/playground/server.py +301 -0
  63. agent/proxy/examples.py +196 -0
  64. agent/proxy/handlers.py +255 -0
  65. agent/responses.py +486 -339
  66. agent/tools/__init__.py +24 -0
  67. agent/tools/base.py +253 -0
  68. agent/tools/browser_tool.py +423 -0
  69. agent/types.py +20 -5
  70. agent/ui/__init__.py +1 -1
  71. agent/ui/__main__.py +1 -1
  72. agent/ui/gradio/app.py +25 -22
  73. agent/ui/gradio/ui_components.py +314 -167
  74. cua_agent-0.7.16.dist-info/METADATA +85 -0
  75. cua_agent-0.7.16.dist-info/RECORD +79 -0
  76. {cua_agent-0.4.14.dist-info → cua_agent-0.7.16.dist-info}/WHEEL +1 -1
  77. agent/integrations/hud/adapter.py +0 -121
  78. agent/integrations/hud/computer_handler.py +0 -187
  79. agent/telemetry.py +0 -142
  80. cua_agent-0.4.14.dist-info/METADATA +0 -436
  81. cua_agent-0.4.14.dist-info/RECORD +0 -50
  82. {cua_agent-0.4.14.dist-info → cua_agent-0.7.16.dist-info}/entry_points.txt +0 -0
agent/adapters/huggingfacelocal_adapter.py
@@ -2,212 +2,155 @@ import asyncio
 import functools
 import warnings
 from concurrent.futures import ThreadPoolExecutor
-from typing import Iterator, AsyncIterator, Dict, List, Any, Optional
-from litellm.types.utils import GenericStreamingChunk, ModelResponse
+from typing import Any, AsyncIterator, Dict, Iterator, List, Optional
+
+from litellm import acompletion, completion
 from litellm.llms.custom_llm import CustomLLM
-from litellm import completion, acompletion
+from litellm.types.utils import GenericStreamingChunk, ModelResponse
 
 # Try to import HuggingFace dependencies
 try:
     import torch
     from transformers import AutoModelForImageTextToText, AutoProcessor
+
     HF_AVAILABLE = True
 except ImportError:
     HF_AVAILABLE = False
 
+from .models import load_model as load_model_handler
+
 
 class HuggingFaceLocalAdapter(CustomLLM):
     """HuggingFace Local Adapter for running vision-language models locally."""
-
-    def __init__(self, device: str = "auto", **kwargs):
+
+    def __init__(self, device: str = "auto", trust_remote_code: bool = False, **kwargs):
         """Initialize the adapter.
-
+
         Args:
             device: Device to load model on ("auto", "cuda", "cpu", etc.)
+            trust_remote_code: Whether to trust remote code
             **kwargs: Additional arguments
         """
         super().__init__()
         self.device = device
-        self.models = {} # Cache for loaded models
-        self.processors = {} # Cache for loaded processors
+        self.trust_remote_code = trust_remote_code
+        # Cache for model handlers keyed by model_name
+        self._handlers: Dict[str, Any] = {}
         self._executor = ThreadPoolExecutor(max_workers=1) # Single thread pool
-
-    def _load_model_and_processor(self, model_name: str):
-        """Load model and processor if not already cached.
-
-        Args:
-            model_name: Name of the model to load
-
-        Returns:
-            Tuple of (model, processor)
-        """
-        if model_name not in self.models:
-            # Load model
-            model = AutoModelForImageTextToText.from_pretrained(
-                model_name,
-                torch_dtype=torch.float16,
-                device_map=self.device,
-                attn_implementation="sdpa"
-            )
-
-            # Load processor
-            processor = AutoProcessor.from_pretrained(
-                model_name,
-                min_pixels=3136,
-                max_pixels=4096 * 2160,
-                device_map=self.device
+
+    def _get_handler(self, model_name: str):
+        """Get or create a model handler for the given model name."""
+        if model_name not in self._handlers:
+            self._handlers[model_name] = load_model_handler(
+                model_name=model_name, device=self.device, trust_remote_code=self.trust_remote_code
             )
-
-            # Cache them
-            self.models[model_name] = model
-            self.processors[model_name] = processor
-
-        return self.models[model_name], self.processors[model_name]
-
+        return self._handlers[model_name]
+
     def _convert_messages(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
         """Convert OpenAI format messages to HuggingFace format.
-
+
         Args:
            messages: Messages in OpenAI format
-
+
         Returns:
            Messages in HuggingFace format
         """
         converted_messages = []
-
+
         for message in messages:
-            converted_message = {
-                "role": message["role"],
-                "content": []
-            }
-
+            converted_message = {"role": message["role"], "content": []}
+
             content = message.get("content", [])
             if isinstance(content, str):
                 # Simple text content
-                converted_message["content"].append({
-                    "type": "text",
-                    "text": content
-                })
+                converted_message["content"].append({"type": "text", "text": content})
             elif isinstance(content, list):
                 # Multi-modal content
                 for item in content:
                     if item.get("type") == "text":
-                        converted_message["content"].append({
-                            "type": "text",
-                            "text": item.get("text", "")
-                        })
+                        converted_message["content"].append(
+                            {"type": "text", "text": item.get("text", "")}
+                        )
                     elif item.get("type") == "image_url":
                         # Convert image_url format to image format
                        image_url = item.get("image_url", {}).get("url", "")
-                        converted_message["content"].append({
-                            "type": "image",
-                            "image": image_url
-                        })
-
+                        converted_message["content"].append({"type": "image", "image": image_url})
+
             converted_messages.append(converted_message)
-
+
         return converted_messages
-
+
     def _generate(self, **kwargs) -> str:
         """Generate response using the local HuggingFace model.
-
+
         Args:
            **kwargs: Keyword arguments containing messages and model info
-
+
         Returns:
            Generated text response
         """
         if not HF_AVAILABLE:
             raise ImportError(
                 "HuggingFace transformers dependencies not found. "
-                "Please install with: pip install \"cua-agent[uitars-hf]\""
+                'Please install with: pip install "cua-agent[uitars-hf]"'
             )
-
+
         # Extract messages and model from kwargs
-        messages = kwargs.get('messages', [])
-        model_name = kwargs.get('model', 'ByteDance-Seed/UI-TARS-1.5-7B')
-        max_new_tokens = kwargs.get('max_tokens', 128)
-
+        messages = kwargs.get("messages", [])
+        model_name = kwargs.get("model", "ByteDance-Seed/UI-TARS-1.5-7B")
+        max_new_tokens = kwargs.get("max_tokens", 128)
+
         # Warn about ignored kwargs
-        ignored_kwargs = set(kwargs.keys()) - {'messages', 'model', 'max_tokens'}
+        ignored_kwargs = set(kwargs.keys()) - {"messages", "model", "max_tokens"}
         if ignored_kwargs:
             warnings.warn(f"Ignoring unsupported kwargs: {ignored_kwargs}")
-
-        # Load model and processor
-        model, processor = self._load_model_and_processor(model_name)
-
+
         # Convert messages to HuggingFace format
         hf_messages = self._convert_messages(messages)
-
-        # Apply chat template and tokenize
-        inputs = processor.apply_chat_template(
-            hf_messages,
-            add_generation_prompt=True,
-            tokenize=True,
-            return_dict=True,
-            return_tensors="pt"
-        )
-
-        # Move inputs to the same device as model
-        inputs = inputs.to(model.device)
-
-        # Generate response
-        with torch.no_grad():
-            generated_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
-
-        # Trim input tokens from output
-        generated_ids_trimmed = [
-            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-        ]
-
-        # Decode output
-        output_text = processor.batch_decode(
-            generated_ids_trimmed,
-            skip_special_tokens=True,
-            clean_up_tokenization_spaces=False
-        )
-
-        return output_text[0] if output_text else ""
-
+
+        # Delegate to model handler
+        handler = self._get_handler(model_name)
+        generated_text = handler.generate(hf_messages, max_new_tokens=max_new_tokens)
+        return generated_text
+
     def completion(self, *args, **kwargs) -> ModelResponse:
         """Synchronous completion method.
-
+
         Returns:
            ModelResponse with generated text
         """
         generated_text = self._generate(**kwargs)
-
+
         return completion(
             model=f"huggingface-local/{kwargs['model']}",
             mock_response=generated_text,
         )
-
+
     async def acompletion(self, *args, **kwargs) -> ModelResponse:
         """Asynchronous completion method.
-
+
         Returns:
            ModelResponse with generated text
         """
         # Run _generate in thread pool to avoid blocking
         loop = asyncio.get_event_loop()
         generated_text = await loop.run_in_executor(
-            self._executor,
-            functools.partial(self._generate, **kwargs)
+            self._executor, functools.partial(self._generate, **kwargs)
         )
-
+
         return await acompletion(
             model=f"huggingface-local/{kwargs['model']}",
             mock_response=generated_text,
         )
-
+
     def streaming(self, *args, **kwargs) -> Iterator[GenericStreamingChunk]:
         """Synchronous streaming method.
-
+
         Returns:
            Iterator of GenericStreamingChunk
         """
         generated_text = self._generate(**kwargs)
-
+
         generic_streaming_chunk: GenericStreamingChunk = {
             "finish_reason": "stop",
             "index": 0,
@@ -216,22 +159,21 @@ class HuggingFaceLocalAdapter(CustomLLM):
             "tool_use": None,
             "usage": {"completion_tokens": 0, "prompt_tokens": 0, "total_tokens": 0},
         }
-
+
         yield generic_streaming_chunk
-
+
     async def astreaming(self, *args, **kwargs) -> AsyncIterator[GenericStreamingChunk]:
         """Asynchronous streaming method.
-
+
         Returns:
            AsyncIterator of GenericStreamingChunk
         """
         # Run _generate in thread pool to avoid blocking
         loop = asyncio.get_event_loop()
         generated_text = await loop.run_in_executor(
-            self._executor,
-            functools.partial(self._generate, **kwargs)
+            self._executor, functools.partial(self._generate, **kwargs)
         )
-
+
         generic_streaming_chunk: GenericStreamingChunk = {
             "finish_reason": "stop",
             "index": 0,
@@ -240,5 +182,5 @@ class HuggingFaceLocalAdapter(CustomLLM):
             "tool_use": None,
             "usage": {"completion_tokens": 0, "prompt_tokens": 0, "total_tokens": 0},
         }
-
-        yield generic_streaming_chunk
+
+        yield generic_streaming_chunk
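
For readers wiring this adapter up themselves: the refactored class above is a litellm CustomLLM, so it plugs into litellm's documented custom-provider mechanism. The sketch below is an illustration, not code from the package — the "huggingface-local" provider prefix is inferred from the model=f"huggingface-local/{...}" strings in the diff, and the chosen model name, device, and registration details are assumptions.

# Hypothetical wiring of HuggingFaceLocalAdapter into litellm (not from the package).
# Assumes cua-agent is installed with its HuggingFace extras.
import litellm

from agent.adapters.huggingfacelocal_adapter import HuggingFaceLocalAdapter

# New in 0.7.16: the constructor accepts trust_remote_code, and per-model
# handlers are created lazily via load_model_handler and cached in _handlers.
adapter = HuggingFaceLocalAdapter(device="auto", trust_remote_code=True)

# litellm's documented CustomLLM registration: map a provider prefix to the handler.
litellm.custom_provider_map = [
    {"provider": "huggingface-local", "custom_handler": adapter}
]

# This routes to adapter.completion(), which runs _generate() locally and
# wraps the text in a ModelResponse via mock_response.
response = litellm.completion(
    model="huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B",
    messages=[{"role": "user", "content": "Describe this screen."}],
    max_tokens=128,
)
print(response.choices[0].message.content)

Note the design choice visible in the diff: completion() and acompletion() return the locally generated text through litellm's mock_response parameter, so the adapter produces a well-formed ModelResponse without any network call, and the async/streaming variants offload _generate() to a single-worker thread pool to avoid blocking the event loop.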