cua-agent 0.4.34__py3-none-any.whl → 0.4.36__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic. Click here for more details.

Files changed (61)
  1. agent/__init__.py +4 -10
  2. agent/__main__.py +2 -1
  3. agent/adapters/huggingfacelocal_adapter.py +54 -61
  4. agent/adapters/human_adapter.py +116 -114
  5. agent/adapters/mlxvlm_adapter.py +110 -99
  6. agent/adapters/models/__init__.py +14 -6
  7. agent/adapters/models/generic.py +7 -4
  8. agent/adapters/models/internvl.py +66 -30
  9. agent/adapters/models/opencua.py +23 -8
  10. agent/adapters/models/qwen2_5_vl.py +7 -4
  11. agent/agent.py +184 -158
  12. agent/callbacks/__init__.py +4 -4
  13. agent/callbacks/base.py +45 -31
  14. agent/callbacks/budget_manager.py +22 -10
  15. agent/callbacks/image_retention.py +18 -13
  16. agent/callbacks/logging.py +55 -42
  17. agent/callbacks/operator_validator.py +3 -1
  18. agent/callbacks/pii_anonymization.py +19 -16
  19. agent/callbacks/telemetry.py +67 -61
  20. agent/callbacks/trajectory_saver.py +90 -70
  21. agent/cli.py +115 -110
  22. agent/computers/__init__.py +13 -8
  23. agent/computers/base.py +32 -19
  24. agent/computers/cua.py +33 -25
  25. agent/computers/custom.py +78 -71
  26. agent/decorators.py +23 -14
  27. agent/human_tool/__init__.py +2 -7
  28. agent/human_tool/__main__.py +6 -2
  29. agent/human_tool/server.py +48 -37
  30. agent/human_tool/ui.py +235 -185
  31. agent/integrations/hud/__init__.py +15 -21
  32. agent/integrations/hud/agent.py +101 -83
  33. agent/integrations/hud/proxy.py +90 -57
  34. agent/loops/__init__.py +25 -21
  35. agent/loops/anthropic.py +537 -483
  36. agent/loops/base.py +13 -14
  37. agent/loops/composed_grounded.py +135 -149
  38. agent/loops/gemini.py +31 -12
  39. agent/loops/glm45v.py +135 -133
  40. agent/loops/gta1.py +47 -50
  41. agent/loops/holo.py +4 -2
  42. agent/loops/internvl.py +6 -11
  43. agent/loops/moondream3.py +36 -12
  44. agent/loops/omniparser.py +215 -210
  45. agent/loops/openai.py +49 -50
  46. agent/loops/opencua.py +29 -41
  47. agent/loops/qwen.py +510 -0
  48. agent/loops/uitars.py +237 -202
  49. agent/proxy/examples.py +54 -50
  50. agent/proxy/handlers.py +27 -34
  51. agent/responses.py +330 -330
  52. agent/types.py +11 -5
  53. agent/ui/__init__.py +1 -1
  54. agent/ui/__main__.py +1 -1
  55. agent/ui/gradio/app.py +23 -18
  56. agent/ui/gradio/ui_components.py +310 -161
  57. {cua_agent-0.4.34.dist-info → cua_agent-0.4.36.dist-info}/METADATA +18 -10
  58. cua_agent-0.4.36.dist-info/RECORD +64 -0
  59. cua_agent-0.4.34.dist-info/RECORD +0 -63
  60. {cua_agent-0.4.34.dist-info → cua_agent-0.4.36.dist-info}/WHEEL +0 -0
  61. {cua_agent-0.4.34.dist-info → cua_agent-0.4.36.dist-info}/entry_points.txt +0 -0
@@ -2,38 +2,41 @@
2
2
  PII anonymization callback handler using Microsoft Presidio for text and image redaction.
3
3
  """
4
4
 
5
- from typing import List, Dict, Any, Optional, Tuple
6
- from .base import AsyncCallbackHandler
7
5
  import base64
8
6
  import io
9
7
  import logging
8
+ from typing import Any, Dict, List, Optional, Tuple
9
+
10
+ from .base import AsyncCallbackHandler
10
11
 
11
12
  try:
12
13
  # TODO: Add Presidio dependencies
13
14
  from PIL import Image
15
+
14
16
  PRESIDIO_AVAILABLE = True
15
17
  except ImportError:
16
18
  PRESIDIO_AVAILABLE = False
17
19
 
18
20
  logger = logging.getLogger(__name__)
19
21
 
22
+
20
23
  class PIIAnonymizationCallback(AsyncCallbackHandler):
21
24
  """
22
25
  Callback handler that anonymizes PII in text and images using Microsoft Presidio.
23
-
26
+
24
27
  This handler:
25
28
  1. Anonymizes PII in messages before sending to the agent loop
26
29
  2. Deanonymizes PII in tool calls and message outputs after the agent loop
27
30
  3. Redacts PII from images in computer_call_output messages
28
31
  """
29
-
32
+
30
33
  def __init__(
31
34
  self,
32
35
  # TODO: Any extra kwargs if needed
33
36
  ):
34
37
  """
35
38
  Initialize the PII anonymization callback.
36
-
39
+
37
40
  Args:
38
41
  anonymize_text: Whether to anonymize text content
39
42
  anonymize_images: Whether to redact images
@@ -46,16 +49,16 @@ class PIIAnonymizationCallback(AsyncCallbackHandler):
46
49
  "Presidio is not available. Install with: "
47
50
  "pip install cua-agent[pii-anonymization]"
48
51
  )
49
-
52
+
50
53
  # TODO: Implement __init__
51
-
54
+
52
55
  async def on_llm_start(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
53
56
  """
54
57
  Anonymize PII in messages before sending to agent loop.
55
-
58
+
56
59
  Args:
57
60
  messages: List of message dictionaries
58
-
61
+
59
62
  Returns:
60
63
  List of messages with PII anonymized
61
64
  """
@@ -63,16 +66,16 @@ class PIIAnonymizationCallback(AsyncCallbackHandler):
63
66
  for msg in messages:
64
67
  anonymized_msg = await self._anonymize_message(msg)
65
68
  anonymized_messages.append(anonymized_msg)
66
-
69
+
67
70
  return anonymized_messages
68
-
71
+
69
72
  async def on_llm_end(self, output: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
70
73
  """
71
74
  Deanonymize PII in tool calls and message outputs after agent loop.
72
-
75
+
73
76
  Args:
74
77
  output: List of output dictionaries
75
-
78
+
76
79
  Returns:
77
80
  List of output with PII deanonymized for tool calls
78
81
  """
@@ -84,13 +87,13 @@ class PIIAnonymizationCallback(AsyncCallbackHandler):
84
87
  deanonymized_output.append(deanonymized_item)
85
88
  else:
86
89
  deanonymized_output.append(item)
87
-
90
+
88
91
  return deanonymized_output
89
-
92
+
90
93
  async def _anonymize_message(self, message: Dict[str, Any]) -> Dict[str, Any]:
91
94
  # TODO: Implement _anonymize_message
92
95
  return message
93
-
96
+
94
97
  async def _deanonymize_item(self, item: Dict[str, Any]) -> Dict[str, Any]:
95
98
  # TODO: Implement _deanonymize_item
96
99
  return item
@@ -2,17 +2,17 @@
2
2
  Telemetry callback handler for Computer-Use Agent (cua-agent)
3
3
  """
4
4
 
5
+ import platform
5
6
  import time
6
7
  import uuid
7
- from typing import List, Dict, Any, Optional, Union
8
+ from typing import Any, Dict, List, Optional, Union
8
9
 
9
- from .base import AsyncCallbackHandler
10
10
  from core.telemetry import (
11
- record_event,
12
11
  is_telemetry_enabled,
12
+ record_event,
13
13
  )
14
14
 
15
- import platform
15
+ from .base import AsyncCallbackHandler
16
16
 
17
17
  SYSTEM_INFO = {
18
18
  "os": platform.system().lower(),
@@ -20,32 +20,29 @@ SYSTEM_INFO = {
20
20
  "python_version": platform.python_version(),
21
21
  }
22
22
 
23
+
23
24
  class TelemetryCallback(AsyncCallbackHandler):
24
25
  """
25
26
  Telemetry callback handler for Computer-Use Agent (cua-agent)
26
-
27
+
27
28
  Tracks agent usage, performance metrics, and optionally trajectory data.
28
29
  """
29
-
30
- def __init__(
31
- self,
32
- agent,
33
- log_trajectory: bool = False
34
- ):
30
+
31
+ def __init__(self, agent, log_trajectory: bool = False):
35
32
  """
36
33
  Initialize telemetry callback.
37
-
34
+
38
35
  Args:
39
36
  agent: The ComputerAgent instance
40
37
  log_trajectory: Whether to log full trajectory items (opt-in)
41
38
  """
42
39
  self.agent = agent
43
40
  self.log_trajectory = log_trajectory
44
-
41
+
45
42
  # Generate session/run IDs
46
43
  self.session_id = str(uuid.uuid4())
47
44
  self.run_id = None
48
-
45
+
49
46
  # Track timing and metrics
50
47
  self.run_start_time = None
51
48
  self.step_count = 0
@@ -54,126 +51,133 @@ class TelemetryCallback(AsyncCallbackHandler):
54
51
  "prompt_tokens": 0,
55
52
  "completion_tokens": 0,
56
53
  "total_tokens": 0,
57
- "response_cost": 0.0
54
+ "response_cost": 0.0,
58
55
  }
59
-
56
+
60
57
  # Record agent initialization
61
58
  if is_telemetry_enabled():
62
59
  self._record_agent_initialization()
63
-
60
+
64
61
  def _record_agent_initialization(self) -> None:
65
62
  """Record agent type/model and session initialization."""
66
63
  agent_info = {
67
64
  "session_id": self.session_id,
68
- "agent_type": self.agent.agent_loop.__name__ if hasattr(self.agent, 'agent_loop') else 'unknown',
69
- "model": getattr(self.agent, 'model', 'unknown'),
70
- **SYSTEM_INFO
65
+ "agent_type": (
66
+ self.agent.agent_loop.__name__ if hasattr(self.agent, "agent_loop") else "unknown"
67
+ ),
68
+ "model": getattr(self.agent, "model", "unknown"),
69
+ **SYSTEM_INFO,
71
70
  }
72
-
71
+
73
72
  record_event("agent_session_start", agent_info)
74
-
73
+
75
74
  async def on_run_start(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]]) -> None:
76
75
  """Called at the start of an agent run loop."""
77
76
  if not is_telemetry_enabled():
78
77
  return
79
-
78
+
80
79
  self.run_id = str(uuid.uuid4())
81
80
  self.run_start_time = time.time()
82
81
  self.step_count = 0
83
-
82
+
84
83
  # Calculate input context size
85
84
  input_context_size = self._calculate_context_size(old_items)
86
-
85
+
87
86
  run_data = {
88
87
  "session_id": self.session_id,
89
88
  "run_id": self.run_id,
90
89
  "start_time": self.run_start_time,
91
90
  "input_context_size": input_context_size,
92
- "num_existing_messages": len(old_items)
91
+ "num_existing_messages": len(old_items),
93
92
  }
94
-
93
+
95
94
  # Log trajectory if opted in
96
95
  if self.log_trajectory:
97
96
  trajectory = self._extract_trajectory(old_items)
98
97
  if trajectory:
99
98
  run_data["uploaded_trajectory"] = trajectory
100
-
99
+
101
100
  record_event("agent_run_start", run_data)
102
-
103
- async def on_run_end(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]], new_items: List[Dict[str, Any]]) -> None:
101
+
102
+ async def on_run_end(
103
+ self,
104
+ kwargs: Dict[str, Any],
105
+ old_items: List[Dict[str, Any]],
106
+ new_items: List[Dict[str, Any]],
107
+ ) -> None:
104
108
  """Called at the end of an agent run loop."""
105
109
  if not is_telemetry_enabled() or not self.run_start_time:
106
110
  return
107
-
111
+
108
112
  run_duration = time.time() - self.run_start_time
109
-
113
+
110
114
  run_data = {
111
115
  "session_id": self.session_id,
112
116
  "run_id": self.run_id,
113
117
  "end_time": time.time(),
114
118
  "duration_seconds": run_duration,
115
119
  "num_steps": self.step_count,
116
- "total_usage": self.total_usage.copy()
120
+ "total_usage": self.total_usage.copy(),
117
121
  }
118
-
122
+
119
123
  # Log trajectory if opted in
120
124
  if self.log_trajectory:
121
125
  trajectory = self._extract_trajectory(new_items)
122
126
  if trajectory:
123
127
  run_data["uploaded_trajectory"] = trajectory
124
-
128
+
125
129
  record_event("agent_run_end", run_data)
126
-
130
+
127
131
  async def on_usage(self, usage: Dict[str, Any]) -> None:
128
132
  """Called when usage information is received."""
129
133
  if not is_telemetry_enabled():
130
134
  return
131
-
135
+
132
136
  # Accumulate usage stats
133
137
  self.total_usage["prompt_tokens"] += usage.get("prompt_tokens", 0)
134
- self.total_usage["completion_tokens"] += usage.get("completion_tokens", 0)
138
+ self.total_usage["completion_tokens"] += usage.get("completion_tokens", 0)
135
139
  self.total_usage["total_tokens"] += usage.get("total_tokens", 0)
136
140
  self.total_usage["response_cost"] += usage.get("response_cost", 0.0)
137
-
141
+
138
142
  # Record individual usage event
139
143
  usage_data = {
140
144
  "session_id": self.session_id,
141
145
  "run_id": self.run_id,
142
146
  "step": self.step_count,
143
- **usage
147
+ **usage,
144
148
  }
145
-
149
+
146
150
  record_event("agent_usage", usage_data)
147
-
151
+
148
152
  async def on_responses(self, kwargs: Dict[str, Any], responses: Dict[str, Any]) -> None:
149
153
  """Called when responses are received."""
150
154
  if not is_telemetry_enabled():
151
155
  return
152
-
156
+
153
157
  self.step_count += 1
154
158
  step_duration = None
155
-
159
+
156
160
  if self.step_start_time:
157
161
  step_duration = time.time() - self.step_start_time
158
-
162
+
159
163
  self.step_start_time = time.time()
160
-
164
+
161
165
  step_data = {
162
166
  "session_id": self.session_id,
163
167
  "run_id": self.run_id,
164
168
  "step": self.step_count,
165
- "timestamp": self.step_start_time
169
+ "timestamp": self.step_start_time,
166
170
  }
167
-
171
+
168
172
  if step_duration is not None:
169
173
  step_data["duration_seconds"] = step_duration
170
-
174
+
171
175
  record_event("agent_step", step_data)
172
-
176
+
173
177
  def _calculate_context_size(self, items: List[Dict[str, Any]]) -> int:
174
178
  """Calculate approximate context size in tokens/characters."""
175
179
  total_size = 0
176
-
180
+
177
181
  for item in items:
178
182
  if item.get("type") == "message" and "content" in item:
179
183
  content = item["content"]
@@ -185,25 +189,27 @@ class TelemetryCallback(AsyncCallbackHandler):
185
189
  total_size += len(part["text"])
186
190
  elif "content" in item and isinstance(item["content"], str):
187
191
  total_size += len(item["content"])
188
-
192
+
189
193
  return total_size
190
-
194
+
191
195
  def _extract_trajectory(self, items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
192
196
  """Extract trajectory items that should be logged."""
193
197
  trajectory = []
194
-
198
+
195
199
  for item in items:
196
200
  # Include user messages, assistant messages, reasoning, computer calls, and computer outputs
197
201
  if (
198
- item.get("role") == "user" or # User inputs
199
- (item.get("type") == "message" and item.get("role") == "assistant") or # Model outputs
200
- item.get("type") == "reasoning" or # Reasoning traces
201
- item.get("type") == "computer_call" or # Computer actions
202
- item.get("type") == "computer_call_output" # Computer outputs
202
+ item.get("role") == "user" # User inputs
203
+ or (
204
+ item.get("type") == "message" and item.get("role") == "assistant"
205
+ ) # Model outputs
206
+ or item.get("type") == "reasoning" # Reasoning traces
207
+ or item.get("type") == "computer_call" # Computer actions
208
+ or item.get("type") == "computer_call_output" # Computer outputs
203
209
  ):
204
210
  # Create a copy of the item with timestamp
205
211
  trajectory_item = item.copy()
206
212
  trajectory_item["logged_at"] = time.time()
207
213
  trajectory.append(trajectory_item)
208
-
209
- return trajectory
214
+
215
+ return trajectory