cua-agent 0.4.34__py3-none-any.whl → 0.4.35__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of cua-agent might be problematic.

Files changed (61)
  1. agent/__init__.py +4 -10
  2. agent/__main__.py +2 -1
  3. agent/adapters/huggingfacelocal_adapter.py +54 -61
  4. agent/adapters/human_adapter.py +116 -114
  5. agent/adapters/mlxvlm_adapter.py +110 -99
  6. agent/adapters/models/__init__.py +14 -6
  7. agent/adapters/models/generic.py +7 -4
  8. agent/adapters/models/internvl.py +66 -30
  9. agent/adapters/models/opencua.py +23 -8
  10. agent/adapters/models/qwen2_5_vl.py +7 -4
  11. agent/agent.py +184 -158
  12. agent/callbacks/__init__.py +4 -4
  13. agent/callbacks/base.py +45 -31
  14. agent/callbacks/budget_manager.py +22 -10
  15. agent/callbacks/image_retention.py +18 -13
  16. agent/callbacks/logging.py +55 -42
  17. agent/callbacks/operator_validator.py +3 -1
  18. agent/callbacks/pii_anonymization.py +19 -16
  19. agent/callbacks/telemetry.py +67 -61
  20. agent/callbacks/trajectory_saver.py +90 -70
  21. agent/cli.py +115 -110
  22. agent/computers/__init__.py +13 -8
  23. agent/computers/base.py +26 -17
  24. agent/computers/cua.py +27 -23
  25. agent/computers/custom.py +72 -69
  26. agent/decorators.py +23 -14
  27. agent/human_tool/__init__.py +2 -7
  28. agent/human_tool/__main__.py +6 -2
  29. agent/human_tool/server.py +48 -37
  30. agent/human_tool/ui.py +235 -185
  31. agent/integrations/hud/__init__.py +15 -21
  32. agent/integrations/hud/agent.py +101 -83
  33. agent/integrations/hud/proxy.py +90 -57
  34. agent/loops/__init__.py +25 -21
  35. agent/loops/anthropic.py +537 -483
  36. agent/loops/base.py +13 -14
  37. agent/loops/composed_grounded.py +135 -149
  38. agent/loops/gemini.py +31 -12
  39. agent/loops/glm45v.py +135 -133
  40. agent/loops/gta1.py +47 -50
  41. agent/loops/holo.py +4 -2
  42. agent/loops/internvl.py +6 -11
  43. agent/loops/moondream3.py +36 -12
  44. agent/loops/omniparser.py +212 -209
  45. agent/loops/openai.py +49 -50
  46. agent/loops/opencua.py +29 -41
  47. agent/loops/qwen.py +475 -0
  48. agent/loops/uitars.py +237 -202
  49. agent/proxy/examples.py +54 -50
  50. agent/proxy/handlers.py +27 -34
  51. agent/responses.py +330 -330
  52. agent/types.py +11 -5
  53. agent/ui/__init__.py +1 -1
  54. agent/ui/__main__.py +1 -1
  55. agent/ui/gradio/app.py +23 -18
  56. agent/ui/gradio/ui_components.py +310 -161
  57. {cua_agent-0.4.34.dist-info → cua_agent-0.4.35.dist-info}/METADATA +18 -10
  58. cua_agent-0.4.35.dist-info/RECORD +64 -0
  59. cua_agent-0.4.34.dist-info/RECORD +0 -63
  60. {cua_agent-0.4.34.dist-info → cua_agent-0.4.35.dist-info}/WHEEL +0 -0
  61. {cua_agent-0.4.34.dist-info → cua_agent-0.4.35.dist-info}/entry_points.txt +0 -0
agent/__init__.py CHANGED
@@ -5,19 +5,13 @@ agent - Decorator-based Computer Use Agent with liteLLM integration
 import logging
 import sys
 
-from .decorators import register_agent
-from .agent import ComputerAgent
-from .types import Messages, AgentResponse
-
 # Import loops to register them
 from . import loops
+from .agent import ComputerAgent
+from .decorators import register_agent
+from .types import AgentResponse, Messages
 
-__all__ = [
-    "register_agent",
-    "ComputerAgent",
-    "Messages",
-    "AgentResponse"
-]
+__all__ = ["register_agent", "ComputerAgent", "Messages", "AgentResponse"]
 
 __version__ = "0.4.0"
 
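The changes above are import sorting and formatting only; the package's public surface is unchanged. For orientation, a minimal usage sketch of the four re-exported names (illustrative only, not taken from the package docs):

    # The names re-exported by agent/__init__.py, per the flattened __all__ above.
    from agent import AgentResponse, ComputerAgent, Messages, register_agent
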
agent/__main__.py CHANGED
@@ -5,8 +5,9 @@ Usage:
     python -m agent.cli <model_string>
 """
 
-import sys
 import asyncio
+import sys
+
 from .cli import main
 
 if __name__ == "__main__":
agent/adapters/huggingfacelocal_adapter.py CHANGED
@@ -2,27 +2,30 @@ import asyncio
 import functools
 import warnings
 from concurrent.futures import ThreadPoolExecutor
-from typing import Iterator, AsyncIterator, Dict, List, Any, Optional
-from litellm.types.utils import GenericStreamingChunk, ModelResponse
+from typing import Any, AsyncIterator, Dict, Iterator, List, Optional
+
+from litellm import acompletion, completion
 from litellm.llms.custom_llm import CustomLLM
-from litellm import completion, acompletion
+from litellm.types.utils import GenericStreamingChunk, ModelResponse
 
 # Try to import HuggingFace dependencies
 try:
     import torch
     from transformers import AutoModelForImageTextToText, AutoProcessor
+
     HF_AVAILABLE = True
 except ImportError:
     HF_AVAILABLE = False
 
 from .models import load_model as load_model_handler
 
+
 class HuggingFaceLocalAdapter(CustomLLM):
     """HuggingFace Local Adapter for running vision-language models locally."""
-
+
     def __init__(self, device: str = "auto", trust_remote_code: bool = False, **kwargs):
         """Initialize the adapter.
-
+
         Args:
             device: Device to load model on ("auto", "cuda", "cpu", etc.)
             trust_remote_code: Whether to trust remote code
@@ -34,129 +37,120 @@ class HuggingFaceLocalAdapter(CustomLLM):
         # Cache for model handlers keyed by model_name
         self._handlers: Dict[str, Any] = {}
         self._executor = ThreadPoolExecutor(max_workers=1)  # Single thread pool
-
+
     def _get_handler(self, model_name: str):
         """Get or create a model handler for the given model name."""
         if model_name not in self._handlers:
-            self._handlers[model_name] = load_model_handler(model_name=model_name, device=self.device, trust_remote_code=self.trust_remote_code)
+            self._handlers[model_name] = load_model_handler(
+                model_name=model_name, device=self.device, trust_remote_code=self.trust_remote_code
+            )
         return self._handlers[model_name]
-
+
     def _convert_messages(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
         """Convert OpenAI format messages to HuggingFace format.
-
+
         Args:
             messages: Messages in OpenAI format
-
+
         Returns:
             Messages in HuggingFace format
         """
         converted_messages = []
-
+
         for message in messages:
-            converted_message = {
-                "role": message["role"],
-                "content": []
-            }
-
+            converted_message = {"role": message["role"], "content": []}
+
             content = message.get("content", [])
             if isinstance(content, str):
                 # Simple text content
-                converted_message["content"].append({
-                    "type": "text",
-                    "text": content
-                })
+                converted_message["content"].append({"type": "text", "text": content})
             elif isinstance(content, list):
                 # Multi-modal content
                 for item in content:
                     if item.get("type") == "text":
-                        converted_message["content"].append({
-                            "type": "text",
-                            "text": item.get("text", "")
-                        })
+                        converted_message["content"].append(
+                            {"type": "text", "text": item.get("text", "")}
+                        )
                     elif item.get("type") == "image_url":
                         # Convert image_url format to image format
                         image_url = item.get("image_url", {}).get("url", "")
-                        converted_message["content"].append({
-                            "type": "image",
-                            "image": image_url
-                        })
-
+                        converted_message["content"].append({"type": "image", "image": image_url})
+
             converted_messages.append(converted_message)
-
+
         return converted_messages
-
+
     def _generate(self, **kwargs) -> str:
         """Generate response using the local HuggingFace model.
-
+
         Args:
             **kwargs: Keyword arguments containing messages and model info
-
+
         Returns:
             Generated text response
         """
         if not HF_AVAILABLE:
             raise ImportError(
                 "HuggingFace transformers dependencies not found. "
-                "Please install with: pip install \"cua-agent[uitars-hf]\""
+                'Please install with: pip install "cua-agent[uitars-hf]"'
             )
-
+
         # Extract messages and model from kwargs
-        messages = kwargs.get('messages', [])
-        model_name = kwargs.get('model', 'ByteDance-Seed/UI-TARS-1.5-7B')
-        max_new_tokens = kwargs.get('max_tokens', 128)
-
+        messages = kwargs.get("messages", [])
+        model_name = kwargs.get("model", "ByteDance-Seed/UI-TARS-1.5-7B")
+        max_new_tokens = kwargs.get("max_tokens", 128)
+
         # Warn about ignored kwargs
-        ignored_kwargs = set(kwargs.keys()) - {'messages', 'model', 'max_tokens'}
+        ignored_kwargs = set(kwargs.keys()) - {"messages", "model", "max_tokens"}
         if ignored_kwargs:
             warnings.warn(f"Ignoring unsupported kwargs: {ignored_kwargs}")
-
+
         # Convert messages to HuggingFace format
         hf_messages = self._convert_messages(messages)
-
+
         # Delegate to model handler
         handler = self._get_handler(model_name)
         generated_text = handler.generate(hf_messages, max_new_tokens=max_new_tokens)
         return generated_text
-
+
     def completion(self, *args, **kwargs) -> ModelResponse:
         """Synchronous completion method.
-
+
         Returns:
             ModelResponse with generated text
         """
         generated_text = self._generate(**kwargs)
-
+
         return completion(
             model=f"huggingface-local/{kwargs['model']}",
             mock_response=generated_text,
         )
-
+
     async def acompletion(self, *args, **kwargs) -> ModelResponse:
         """Asynchronous completion method.
-
+
         Returns:
             ModelResponse with generated text
         """
         # Run _generate in thread pool to avoid blocking
         loop = asyncio.get_event_loop()
         generated_text = await loop.run_in_executor(
-            self._executor,
-            functools.partial(self._generate, **kwargs)
+            self._executor, functools.partial(self._generate, **kwargs)
         )
-
+
         return await acompletion(
             model=f"huggingface-local/{kwargs['model']}",
             mock_response=generated_text,
         )
-
+
     def streaming(self, *args, **kwargs) -> Iterator[GenericStreamingChunk]:
         """Synchronous streaming method.
-
+
         Returns:
             Iterator of GenericStreamingChunk
         """
         generated_text = self._generate(**kwargs)
-
+
         generic_streaming_chunk: GenericStreamingChunk = {
             "finish_reason": "stop",
             "index": 0,
@@ -165,22 +159,21 @@ class HuggingFaceLocalAdapter(CustomLLM):
             "tool_use": None,
             "usage": {"completion_tokens": 0, "prompt_tokens": 0, "total_tokens": 0},
         }
-
+
         yield generic_streaming_chunk
-
+
     async def astreaming(self, *args, **kwargs) -> AsyncIterator[GenericStreamingChunk]:
         """Asynchronous streaming method.
-
+
         Returns:
             AsyncIterator of GenericStreamingChunk
         """
         # Run _generate in thread pool to avoid blocking
         loop = asyncio.get_event_loop()
         generated_text = await loop.run_in_executor(
-            self._executor,
-            functools.partial(self._generate, **kwargs)
+            self._executor, functools.partial(self._generate, **kwargs)
         )
-
+
         generic_streaming_chunk: GenericStreamingChunk = {
             "finish_reason": "stop",
             "index": 0,
@@ -189,5 +182,5 @@ class HuggingFaceLocalAdapter(CustomLLM):
             "tool_use": None,
             "usage": {"completion_tokens": 0, "prompt_tokens": 0, "total_tokens": 0},
         }
-
-        yield generic_streaming_chunk
+
+        yield generic_streaming_chunk
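
Because the adapter subclasses litellm's CustomLLM, it is presumably wired up through litellm's custom-provider hook. A hedged sketch of that wiring (the "huggingface-local" provider name is inferred from the f"huggingface-local/{...}" model strings above; custom_provider_map is litellm's documented mechanism for CustomLLM handlers, not code from this package):

    import litellm

    from agent.adapters.huggingfacelocal_adapter import HuggingFaceLocalAdapter

    # Hypothetical registration: map the "huggingface-local" prefix to the adapter.
    adapter = HuggingFaceLocalAdapter(device="auto")
    litellm.custom_provider_map = [
        {"provider": "huggingface-local", "custom_handler": adapter}
    ]

    # Calls routed through litellm then run the local model inside the
    # adapter's single-worker thread pool.
    response = litellm.completion(
        model="huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B",
        messages=[{"role": "user", "content": "Describe the current screen."}],
    )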