cua-agent 0.4.22__py3-none-any.whl → 0.7.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic. Click here for more details.

Files changed (79)
  1. agent/__init__.py +4 -10
  2. agent/__main__.py +2 -1
  3. agent/adapters/__init__.py +4 -0
  4. agent/adapters/azure_ml_adapter.py +283 -0
  5. agent/adapters/cua_adapter.py +161 -0
  6. agent/adapters/huggingfacelocal_adapter.py +67 -125
  7. agent/adapters/human_adapter.py +116 -114
  8. agent/adapters/mlxvlm_adapter.py +110 -99
  9. agent/adapters/models/__init__.py +41 -0
  10. agent/adapters/models/generic.py +78 -0
  11. agent/adapters/models/internvl.py +290 -0
  12. agent/adapters/models/opencua.py +115 -0
  13. agent/adapters/models/qwen2_5_vl.py +78 -0
  14. agent/agent.py +337 -185
  15. agent/callbacks/__init__.py +9 -4
  16. agent/callbacks/base.py +45 -31
  17. agent/callbacks/budget_manager.py +22 -10
  18. agent/callbacks/image_retention.py +54 -98
  19. agent/callbacks/logging.py +55 -42
  20. agent/callbacks/operator_validator.py +35 -33
  21. agent/callbacks/otel.py +291 -0
  22. agent/callbacks/pii_anonymization.py +19 -16
  23. agent/callbacks/prompt_instructions.py +47 -0
  24. agent/callbacks/telemetry.py +99 -61
  25. agent/callbacks/trajectory_saver.py +95 -69
  26. agent/cli.py +269 -119
  27. agent/computers/__init__.py +14 -9
  28. agent/computers/base.py +32 -19
  29. agent/computers/cua.py +52 -25
  30. agent/computers/custom.py +78 -71
  31. agent/decorators.py +23 -14
  32. agent/human_tool/__init__.py +2 -7
  33. agent/human_tool/__main__.py +6 -2
  34. agent/human_tool/server.py +48 -37
  35. agent/human_tool/ui.py +359 -235
  36. agent/integrations/hud/__init__.py +38 -99
  37. agent/integrations/hud/agent.py +369 -0
  38. agent/integrations/hud/proxy.py +166 -52
  39. agent/loops/__init__.py +44 -14
  40. agent/loops/anthropic.py +579 -492
  41. agent/loops/base.py +19 -15
  42. agent/loops/composed_grounded.py +136 -150
  43. agent/loops/fara/__init__.py +8 -0
  44. agent/loops/fara/config.py +506 -0
  45. agent/loops/fara/helpers.py +357 -0
  46. agent/loops/fara/schema.py +143 -0
  47. agent/loops/gelato.py +183 -0
  48. agent/loops/gemini.py +935 -0
  49. agent/loops/generic_vlm.py +601 -0
  50. agent/loops/glm45v.py +140 -135
  51. agent/loops/gta1.py +48 -51
  52. agent/loops/holo.py +218 -0
  53. agent/loops/internvl.py +180 -0
  54. agent/loops/moondream3.py +493 -0
  55. agent/loops/omniparser.py +326 -226
  56. agent/loops/openai.py +50 -51
  57. agent/loops/opencua.py +134 -0
  58. agent/loops/uiins.py +175 -0
  59. agent/loops/uitars.py +247 -206
  60. agent/loops/uitars2.py +951 -0
  61. agent/playground/__init__.py +5 -0
  62. agent/playground/server.py +301 -0
  63. agent/proxy/examples.py +61 -57
  64. agent/proxy/handlers.py +46 -39
  65. agent/responses.py +447 -347
  66. agent/tools/__init__.py +24 -0
  67. agent/tools/base.py +253 -0
  68. agent/tools/browser_tool.py +423 -0
  69. agent/types.py +11 -5
  70. agent/ui/__init__.py +1 -1
  71. agent/ui/__main__.py +1 -1
  72. agent/ui/gradio/app.py +25 -22
  73. agent/ui/gradio/ui_components.py +314 -167
  74. cua_agent-0.7.16.dist-info/METADATA +85 -0
  75. cua_agent-0.7.16.dist-info/RECORD +79 -0
  76. {cua_agent-0.4.22.dist-info → cua_agent-0.7.16.dist-info}/WHEEL +1 -1
  77. cua_agent-0.4.22.dist-info/METADATA +0 -436
  78. cua_agent-0.4.22.dist-info/RECORD +0 -51
  79. {cua_agent-0.4.22.dist-info → cua_agent-0.7.16.dist-info}/entry_points.txt +0 -0
@@ -2,38 +2,41 @@
2
2
  PII anonymization callback handler using Microsoft Presidio for text and image redaction.
3
3
  """
4
4
 
5
- from typing import List, Dict, Any, Optional, Tuple
6
- from .base import AsyncCallbackHandler
7
5
  import base64
8
6
  import io
9
7
  import logging
8
+ from typing import Any, Dict, List, Optional, Tuple
9
+
10
+ from .base import AsyncCallbackHandler
10
11
 
11
12
  try:
12
13
  # TODO: Add Presidio dependencies
13
14
  from PIL import Image
15
+
14
16
  PRESIDIO_AVAILABLE = True
15
17
  except ImportError:
16
18
  PRESIDIO_AVAILABLE = False
17
19
 
18
20
  logger = logging.getLogger(__name__)
19
21
 
22
+
20
23
  class PIIAnonymizationCallback(AsyncCallbackHandler):
21
24
  """
22
25
  Callback handler that anonymizes PII in text and images using Microsoft Presidio.
23
-
26
+
24
27
  This handler:
25
28
  1. Anonymizes PII in messages before sending to the agent loop
26
29
  2. Deanonymizes PII in tool calls and message outputs after the agent loop
27
30
  3. Redacts PII from images in computer_call_output messages
28
31
  """
29
-
32
+
30
33
  def __init__(
31
34
  self,
32
35
  # TODO: Any extra kwargs if needed
33
36
  ):
34
37
  """
35
38
  Initialize the PII anonymization callback.
36
-
39
+
37
40
  Args:
38
41
  anonymize_text: Whether to anonymize text content
39
42
  anonymize_images: Whether to redact images
@@ -46,16 +49,16 @@ class PIIAnonymizationCallback(AsyncCallbackHandler):
46
49
  "Presidio is not available. Install with: "
47
50
  "pip install cua-agent[pii-anonymization]"
48
51
  )
49
-
52
+
50
53
  # TODO: Implement __init__
51
-
54
+
52
55
  async def on_llm_start(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
53
56
  """
54
57
  Anonymize PII in messages before sending to agent loop.
55
-
58
+
56
59
  Args:
57
60
  messages: List of message dictionaries
58
-
61
+
59
62
  Returns:
60
63
  List of messages with PII anonymized
61
64
  """
@@ -63,16 +66,16 @@ class PIIAnonymizationCallback(AsyncCallbackHandler):
63
66
  for msg in messages:
64
67
  anonymized_msg = await self._anonymize_message(msg)
65
68
  anonymized_messages.append(anonymized_msg)
66
-
69
+
67
70
  return anonymized_messages
68
-
71
+
69
72
  async def on_llm_end(self, output: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
70
73
  """
71
74
  Deanonymize PII in tool calls and message outputs after agent loop.
72
-
75
+
73
76
  Args:
74
77
  output: List of output dictionaries
75
-
78
+
76
79
  Returns:
77
80
  List of output with PII deanonymized for tool calls
78
81
  """
@@ -84,13 +87,13 @@ class PIIAnonymizationCallback(AsyncCallbackHandler):
84
87
  deanonymized_output.append(deanonymized_item)
85
88
  else:
86
89
  deanonymized_output.append(item)
87
-
90
+
88
91
  return deanonymized_output
89
-
92
+
90
93
  async def _anonymize_message(self, message: Dict[str, Any]) -> Dict[str, Any]:
91
94
  # TODO: Implement _anonymize_message
92
95
  return message
93
-
96
+
94
97
  async def _deanonymize_item(self, item: Dict[str, Any]) -> Dict[str, Any]:
95
98
  # TODO: Implement _deanonymize_item
96
99
  return item
@@ -0,0 +1,47 @@
1
+ """
2
+ Prompt instructions callback.
3
+
4
+ This callback allows simple prompt engineering by pre-pending a user
5
+ instructions message to the start of the conversation before each LLM call.
6
+
7
+ Usage:
8
+
9
+ from agent.callbacks import PromptInstructionsCallback
10
+ agent = ComputerAgent(
11
+ model="openai/computer-use-preview",
12
+ callbacks=[PromptInstructionsCallback("Follow these rules...")]
13
+ )
14
+
15
+ """
16
+
17
+ from typing import Any, Dict, List, Optional
18
+
19
+ from .base import AsyncCallbackHandler
20
+
21
+
22
+ class PromptInstructionsCallback(AsyncCallbackHandler):
23
+ """
24
+ Prepend a user instructions message to the message list.
25
+
26
+ This is a minimal, non-invasive way to guide the agent's behavior without
27
+ modifying agent loops or tools. It works with any provider/loop since it
28
+ only alters the messages array before sending to the model.
29
+ """
30
+
31
+ def __init__(self, instructions: Optional[str]) -> None:
32
+ self.instructions = instructions
33
+
34
+ async def on_llm_start(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
35
+ # Pre-pend instructions message
36
+ if not self.instructions:
37
+ return messages
38
+
39
+ # Ensure we don't duplicate if already present at the front
40
+ if messages and isinstance(messages[0], dict):
41
+ first = messages[0]
42
+ if first.get("role") == "user" and first.get("content") == self.instructions:
43
+ return messages
44
+
45
+ return [
46
+ {"role": "user", "content": self.instructions},
47
+ ] + messages
@@ -2,17 +2,17 @@
2
2
  Telemetry callback handler for Computer-Use Agent (cua-agent)
3
3
  """
4
4
 
5
+ import platform
5
6
  import time
6
7
  import uuid
7
- from typing import List, Dict, Any, Optional, Union
8
+ from typing import Any, Dict, List, Optional, Union
8
9
 
9
- from .base import AsyncCallbackHandler
10
10
  from core.telemetry import (
11
- record_event,
12
11
  is_telemetry_enabled,
12
+ record_event,
13
13
  )
14
14
 
15
- import platform
15
+ from .base import AsyncCallbackHandler
16
16
 
17
17
  SYSTEM_INFO = {
18
18
  "os": platform.system().lower(),
@@ -20,32 +20,29 @@ SYSTEM_INFO = {
20
20
  "python_version": platform.python_version(),
21
21
  }
22
22
 
23
+
23
24
  class TelemetryCallback(AsyncCallbackHandler):
24
25
  """
25
26
  Telemetry callback handler for Computer-Use Agent (cua-agent)
26
-
27
+
27
28
  Tracks agent usage, performance metrics, and optionally trajectory data.
28
29
  """
29
-
30
- def __init__(
31
- self,
32
- agent,
33
- log_trajectory: bool = False
34
- ):
30
+
31
+ def __init__(self, agent, log_trajectory: bool = False):
35
32
  """
36
33
  Initialize telemetry callback.
37
-
34
+
38
35
  Args:
39
36
  agent: The ComputerAgent instance
40
37
  log_trajectory: Whether to log full trajectory items (opt-in)
41
38
  """
42
39
  self.agent = agent
43
40
  self.log_trajectory = log_trajectory
44
-
41
+
45
42
  # Generate session/run IDs
46
43
  self.session_id = str(uuid.uuid4())
47
44
  self.run_id = None
48
-
45
+
49
46
  # Track timing and metrics
50
47
  self.run_start_time = None
51
48
  self.step_count = 0
@@ -54,126 +51,165 @@ class TelemetryCallback(AsyncCallbackHandler):
54
51
  "prompt_tokens": 0,
55
52
  "completion_tokens": 0,
56
53
  "total_tokens": 0,
57
- "response_cost": 0.0
54
+ "response_cost": 0.0,
58
55
  }
59
-
56
+
60
57
  # Record agent initialization
61
58
  if is_telemetry_enabled():
62
59
  self._record_agent_initialization()
63
-
60
+
64
61
  def _record_agent_initialization(self) -> None:
65
62
  """Record agent type/model and session initialization."""
63
+ # Get the agent loop type (class name)
64
+ agent_type = "unknown"
65
+ if hasattr(self.agent, "agent_loop") and self.agent.agent_loop is not None:
66
+ agent_type = type(self.agent.agent_loop).__name__
67
+
66
68
  agent_info = {
67
69
  "session_id": self.session_id,
68
- "agent_type": self.agent.agent_loop.__name__ if hasattr(self.agent, 'agent_loop') else 'unknown',
69
- "model": getattr(self.agent, 'model', 'unknown'),
70
- **SYSTEM_INFO
70
+ "agent_type": agent_type,
71
+ "model": getattr(self.agent, "model", "unknown"),
72
+ **SYSTEM_INFO,
71
73
  }
72
-
74
+
75
+ # Include VM name if available
76
+ vm_name = self._get_vm_name()
77
+ if vm_name:
78
+ agent_info["vm_name"] = vm_name
79
+
73
80
  record_event("agent_session_start", agent_info)
74
-
81
+
75
82
  async def on_run_start(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]]) -> None:
76
83
  """Called at the start of an agent run loop."""
77
84
  if not is_telemetry_enabled():
78
85
  return
79
-
86
+
80
87
  self.run_id = str(uuid.uuid4())
81
88
  self.run_start_time = time.time()
82
89
  self.step_count = 0
83
-
90
+
84
91
  # Calculate input context size
85
92
  input_context_size = self._calculate_context_size(old_items)
86
-
93
+
87
94
  run_data = {
88
95
  "session_id": self.session_id,
89
96
  "run_id": self.run_id,
90
97
  "start_time": self.run_start_time,
91
98
  "input_context_size": input_context_size,
92
- "num_existing_messages": len(old_items)
99
+ "num_existing_messages": len(old_items),
93
100
  }
94
-
101
+
102
+ # Include VM name if available
103
+ vm_name = self._get_vm_name()
104
+ if vm_name:
105
+ run_data["vm_name"] = vm_name
106
+
95
107
  # Log trajectory if opted in
96
108
  if self.log_trajectory:
97
109
  trajectory = self._extract_trajectory(old_items)
98
110
  if trajectory:
99
111
  run_data["uploaded_trajectory"] = trajectory
100
-
112
+
101
113
  record_event("agent_run_start", run_data)
102
-
103
- async def on_run_end(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]], new_items: List[Dict[str, Any]]) -> None:
114
+
115
+ async def on_run_end(
116
+ self,
117
+ kwargs: Dict[str, Any],
118
+ old_items: List[Dict[str, Any]],
119
+ new_items: List[Dict[str, Any]],
120
+ ) -> None:
104
121
  """Called at the end of an agent run loop."""
105
122
  if not is_telemetry_enabled() or not self.run_start_time:
106
123
  return
107
-
124
+
108
125
  run_duration = time.time() - self.run_start_time
109
-
126
+
110
127
  run_data = {
111
128
  "session_id": self.session_id,
112
129
  "run_id": self.run_id,
113
130
  "end_time": time.time(),
114
131
  "duration_seconds": run_duration,
115
132
  "num_steps": self.step_count,
116
- "total_usage": self.total_usage.copy()
133
+ "total_usage": self.total_usage.copy(),
117
134
  }
118
-
135
+
136
+ # Include VM name if available
137
+ vm_name = self._get_vm_name()
138
+ if vm_name:
139
+ run_data["vm_name"] = vm_name
140
+
119
141
  # Log trajectory if opted in
120
142
  if self.log_trajectory:
121
143
  trajectory = self._extract_trajectory(new_items)
122
144
  if trajectory:
123
145
  run_data["uploaded_trajectory"] = trajectory
124
-
146
+
125
147
  record_event("agent_run_end", run_data)
126
-
148
+
127
149
  async def on_usage(self, usage: Dict[str, Any]) -> None:
128
150
  """Called when usage information is received."""
129
151
  if not is_telemetry_enabled():
130
152
  return
131
-
153
+
132
154
  # Accumulate usage stats
133
155
  self.total_usage["prompt_tokens"] += usage.get("prompt_tokens", 0)
134
- self.total_usage["completion_tokens"] += usage.get("completion_tokens", 0)
156
+ self.total_usage["completion_tokens"] += usage.get("completion_tokens", 0)
135
157
  self.total_usage["total_tokens"] += usage.get("total_tokens", 0)
136
158
  self.total_usage["response_cost"] += usage.get("response_cost", 0.0)
137
-
159
+
138
160
  # Record individual usage event
139
161
  usage_data = {
140
162
  "session_id": self.session_id,
141
163
  "run_id": self.run_id,
142
164
  "step": self.step_count,
143
- **usage
165
+ **usage,
144
166
  }
145
-
167
+
146
168
  record_event("agent_usage", usage_data)
147
-
169
+
148
170
  async def on_responses(self, kwargs: Dict[str, Any], responses: Dict[str, Any]) -> None:
149
171
  """Called when responses are received."""
150
172
  if not is_telemetry_enabled():
151
173
  return
152
-
174
+
153
175
  self.step_count += 1
154
176
  step_duration = None
155
-
177
+
156
178
  if self.step_start_time:
157
179
  step_duration = time.time() - self.step_start_time
158
-
180
+
159
181
  self.step_start_time = time.time()
160
-
182
+
161
183
  step_data = {
162
184
  "session_id": self.session_id,
163
185
  "run_id": self.run_id,
164
186
  "step": self.step_count,
165
- "timestamp": self.step_start_time
187
+ "timestamp": self.step_start_time,
166
188
  }
167
-
189
+
168
190
  if step_duration is not None:
169
191
  step_data["duration_seconds"] = step_duration
170
-
192
+
171
193
  record_event("agent_step", step_data)
172
-
194
+
195
+ def _get_vm_name(self) -> Optional[str]:
196
+ """Extract VM name from agent's computer handler if available."""
197
+ try:
198
+ if hasattr(self.agent, "computer_handler") and self.agent.computer_handler:
199
+ handler = self.agent.computer_handler
200
+ # Check if it's a cuaComputerHandler with a cua_computer
201
+ if hasattr(handler, "cua_computer"):
202
+ computer = handler.cua_computer
203
+ if hasattr(computer, "config") and hasattr(computer.config, "name"):
204
+ return computer.config.name
205
+ except Exception:
206
+ pass
207
+ return None
208
+
173
209
  def _calculate_context_size(self, items: List[Dict[str, Any]]) -> int:
174
210
  """Calculate approximate context size in tokens/characters."""
175
211
  total_size = 0
176
-
212
+
177
213
  for item in items:
178
214
  if item.get("type") == "message" and "content" in item:
179
215
  content = item["content"]
@@ -185,25 +221,27 @@ class TelemetryCallback(AsyncCallbackHandler):
185
221
  total_size += len(part["text"])
186
222
  elif "content" in item and isinstance(item["content"], str):
187
223
  total_size += len(item["content"])
188
-
224
+
189
225
  return total_size
190
-
226
+
191
227
  def _extract_trajectory(self, items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
192
228
  """Extract trajectory items that should be logged."""
193
229
  trajectory = []
194
-
230
+
195
231
  for item in items:
196
232
  # Include user messages, assistant messages, reasoning, computer calls, and computer outputs
197
233
  if (
198
- item.get("role") == "user" or # User inputs
199
- (item.get("type") == "message" and item.get("role") == "assistant") or # Model outputs
200
- item.get("type") == "reasoning" or # Reasoning traces
201
- item.get("type") == "computer_call" or # Computer actions
202
- item.get("type") == "computer_call_output" # Computer outputs
234
+ item.get("role") == "user" # User inputs
235
+ or (
236
+ item.get("type") == "message" and item.get("role") == "assistant"
237
+ ) # Model outputs
238
+ or item.get("type") == "reasoning" # Reasoning traces
239
+ or item.get("type") == "computer_call" # Computer actions
240
+ or item.get("type") == "computer_call_output" # Computer outputs
203
241
  ):
204
242
  # Create a copy of the item with timestamp
205
243
  trajectory_item = item.copy()
206
244
  trajectory_item["logged_at"] = time.time()
207
245
  trajectory.append(trajectory_item)
208
-
209
- return trajectory
246
+
247
+ return trajectory