cua-agent 0.4.14__py3-none-any.whl → 0.7.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic; see the registry's advisory page for more details.

Files changed (82)
  1. agent/__init__.py +4 -19
  2. agent/__main__.py +2 -1
  3. agent/adapters/__init__.py +6 -0
  4. agent/adapters/azure_ml_adapter.py +283 -0
  5. agent/adapters/cua_adapter.py +161 -0
  6. agent/adapters/huggingfacelocal_adapter.py +67 -125
  7. agent/adapters/human_adapter.py +116 -114
  8. agent/adapters/mlxvlm_adapter.py +370 -0
  9. agent/adapters/models/__init__.py +41 -0
  10. agent/adapters/models/generic.py +78 -0
  11. agent/adapters/models/internvl.py +290 -0
  12. agent/adapters/models/opencua.py +115 -0
  13. agent/adapters/models/qwen2_5_vl.py +78 -0
  14. agent/agent.py +431 -241
  15. agent/callbacks/__init__.py +10 -3
  16. agent/callbacks/base.py +45 -31
  17. agent/callbacks/budget_manager.py +22 -10
  18. agent/callbacks/image_retention.py +54 -98
  19. agent/callbacks/logging.py +55 -42
  20. agent/callbacks/operator_validator.py +140 -0
  21. agent/callbacks/otel.py +291 -0
  22. agent/callbacks/pii_anonymization.py +19 -16
  23. agent/callbacks/prompt_instructions.py +47 -0
  24. agent/callbacks/telemetry.py +106 -69
  25. agent/callbacks/trajectory_saver.py +178 -70
  26. agent/cli.py +269 -119
  27. agent/computers/__init__.py +14 -9
  28. agent/computers/base.py +32 -19
  29. agent/computers/cua.py +52 -25
  30. agent/computers/custom.py +78 -71
  31. agent/decorators.py +23 -14
  32. agent/human_tool/__init__.py +2 -7
  33. agent/human_tool/__main__.py +6 -2
  34. agent/human_tool/server.py +48 -37
  35. agent/human_tool/ui.py +359 -235
  36. agent/integrations/hud/__init__.py +164 -74
  37. agent/integrations/hud/agent.py +338 -342
  38. agent/integrations/hud/proxy.py +297 -0
  39. agent/loops/__init__.py +44 -14
  40. agent/loops/anthropic.py +590 -492
  41. agent/loops/base.py +19 -15
  42. agent/loops/composed_grounded.py +142 -144
  43. agent/loops/fara/__init__.py +8 -0
  44. agent/loops/fara/config.py +506 -0
  45. agent/loops/fara/helpers.py +357 -0
  46. agent/loops/fara/schema.py +143 -0
  47. agent/loops/gelato.py +183 -0
  48. agent/loops/gemini.py +935 -0
  49. agent/loops/generic_vlm.py +601 -0
  50. agent/loops/glm45v.py +140 -135
  51. agent/loops/gta1.py +48 -51
  52. agent/loops/holo.py +218 -0
  53. agent/loops/internvl.py +180 -0
  54. agent/loops/moondream3.py +493 -0
  55. agent/loops/omniparser.py +326 -226
  56. agent/loops/openai.py +63 -56
  57. agent/loops/opencua.py +134 -0
  58. agent/loops/uiins.py +175 -0
  59. agent/loops/uitars.py +262 -212
  60. agent/loops/uitars2.py +951 -0
  61. agent/playground/__init__.py +5 -0
  62. agent/playground/server.py +301 -0
  63. agent/proxy/examples.py +196 -0
  64. agent/proxy/handlers.py +255 -0
  65. agent/responses.py +486 -339
  66. agent/tools/__init__.py +24 -0
  67. agent/tools/base.py +253 -0
  68. agent/tools/browser_tool.py +423 -0
  69. agent/types.py +20 -5
  70. agent/ui/__init__.py +1 -1
  71. agent/ui/__main__.py +1 -1
  72. agent/ui/gradio/app.py +25 -22
  73. agent/ui/gradio/ui_components.py +314 -167
  74. cua_agent-0.7.16.dist-info/METADATA +85 -0
  75. cua_agent-0.7.16.dist-info/RECORD +79 -0
  76. {cua_agent-0.4.14.dist-info → cua_agent-0.7.16.dist-info}/WHEEL +1 -1
  77. agent/integrations/hud/adapter.py +0 -121
  78. agent/integrations/hud/computer_handler.py +0 -187
  79. agent/telemetry.py +0 -142
  80. cua_agent-0.4.14.dist-info/METADATA +0 -436
  81. cua_agent-0.4.14.dist-info/RECORD +0 -50
  82. {cua_agent-0.4.14.dist-info → cua_agent-0.7.16.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,291 @@
1
+ """
2
+ OpenTelemetry callback handler for Computer-Use Agent (cua-agent).
3
+
4
+ Instruments agent operations for the Four Golden Signals:
5
+ - Latency: Operation duration
6
+ - Traffic: Operation counts
7
+ - Errors: Error counts
8
+ - Saturation: Concurrent operations
9
+ """
10
+
11
+ import time
12
+ from typing import Any, Dict, List, Optional
13
+
14
+ from .base import AsyncCallbackHandler
15
+
16
+ # Import OTEL functions - these are available when cua-core[telemetry] is installed
17
+ try:
18
+ from core.telemetry import (
19
+ add_breadcrumb,
20
+ capture_exception,
21
+ create_span,
22
+ is_otel_enabled,
23
+ record_error,
24
+ record_operation,
25
+ record_tokens,
26
+ set_context,
27
+ track_concurrent,
28
+ )
29
+
30
+ OTEL_AVAILABLE = True
31
+ except ImportError:
32
+ OTEL_AVAILABLE = False
33
+
34
+ def is_otel_enabled() -> bool:
35
+ return False
36
+
37
+
38
class OtelCallback(AsyncCallbackHandler):
    """
    OpenTelemetry callback handler for instrumentation.

    Tracks:
    - Agent session lifecycle (start/end)
    - Agent run lifecycle (start/end with duration)
    - Individual steps (with duration)
    - Computer actions (with duration)
    - Token usage
    - Errors

    All hooks are no-ops unless cua-core[telemetry] is installed and OTEL
    is enabled at runtime.
    """

    def __init__(self, agent: Any):
        """
        Initialize OTEL callback.

        Args:
            agent: The ComputerAgent instance
        """
        self.agent = agent
        self.model = getattr(agent, "model", "unknown")

        # Timing state (perf_counter timestamps; None = not currently timing)
        self.run_start_time: Optional[float] = None
        self.step_start_time: Optional[float] = None
        self.step_count = 0

        # Span management
        self._session_span: Optional[Any] = None
        self._run_span: Optional[Any] = None

        # Track concurrent sessions
        self._concurrent_tracker: Optional[Any] = None

        if OTEL_AVAILABLE and is_otel_enabled():
            # Set context for all events
            set_context(
                "agent",
                {
                    "model": self.model,
                    "agent_type": self._get_agent_type(),
                },
            )

    def _get_agent_type(self) -> str:
        """Get the agent loop type name (class name of agent.agent_loop)."""
        if hasattr(self.agent, "agent_loop") and self.agent.agent_loop is not None:
            return type(self.agent.agent_loop).__name__
        return "unknown"

    async def on_run_start(
        self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]]
    ) -> None:
        """Called at the start of an agent run loop."""
        if not OTEL_AVAILABLE or not is_otel_enabled():
            return

        self.run_start_time = time.perf_counter()
        # Seed the step timer from the run start. This both (a) records the
        # first step's latency (previously lost because step_start_time was
        # only ever set inside on_responses) and (b) discards any stale
        # timestamp left over from a previous run, which would otherwise
        # produce a bogus cross-run "step duration" for step 1.
        self.step_start_time = self.run_start_time
        self.step_count = 0

        # Add breadcrumb for debugging
        add_breadcrumb(
            category="agent",
            message=f"Agent run started with model {self.model}",
            level="info",
            data={
                "model": self.model,
                "agent_type": self._get_agent_type(),
                "input_messages": len(old_items),
            },
        )

    async def on_run_end(
        self,
        kwargs: Dict[str, Any],
        old_items: List[Dict[str, Any]],
        new_items: List[Dict[str, Any]],
    ) -> None:
        """Called at the end of an agent run loop."""
        if not OTEL_AVAILABLE or not is_otel_enabled():
            return

        if self.run_start_time is not None:
            duration = time.perf_counter() - self.run_start_time

            # Record run metrics
            record_operation(
                operation="agent.run",
                duration_seconds=duration,
                status="success",
                model=self.model,
                steps=self.step_count,
            )

            add_breadcrumb(
                category="agent",
                message=f"Agent run completed in {duration:.2f}s",
                level="info",
                data={
                    "duration_seconds": duration,
                    "steps": self.step_count,
                    "output_messages": len(new_items),
                },
            )

        self.run_start_time = None
        # Stop step timing; the next run re-seeds this in on_run_start.
        self.step_start_time = None

    async def on_responses(
        self, kwargs: Dict[str, Any], responses: Dict[str, Any]
    ) -> None:
        """Called when responses are received (each step)."""
        if not OTEL_AVAILABLE or not is_otel_enabled():
            return

        self.step_count += 1
        current_time = time.perf_counter()

        # Calculate step duration if we have a start time
        if self.step_start_time is not None:
            step_duration = current_time - self.step_start_time
            record_operation(
                operation="agent.step",
                duration_seconds=step_duration,
                status="success",
                model=self.model,
                step_number=self.step_count,
            )

        # Start timing next step
        self.step_start_time = current_time

        add_breadcrumb(
            category="agent",
            message=f"Agent step {self.step_count} completed",
            level="info",
            data={"step": self.step_count},
        )

    async def on_usage(self, usage: Dict[str, Any]) -> None:
        """Called when usage information is received.

        Records token counts only when at least one of prompt/completion
        tokens is non-zero, to avoid emitting empty metrics.
        """
        if not OTEL_AVAILABLE or not is_otel_enabled():
            return

        prompt_tokens = usage.get("prompt_tokens", 0)
        completion_tokens = usage.get("completion_tokens", 0)

        if prompt_tokens > 0 or completion_tokens > 0:
            record_tokens(
                prompt_tokens=prompt_tokens,
                completion_tokens=completion_tokens,
                model=self.model,
            )

    async def on_computer_call_start(self, item: Dict[str, Any]) -> None:
        """Called when a computer call is about to start."""
        if not OTEL_AVAILABLE or not is_otel_enabled():
            return

        action = item.get("action", {})
        action_type = action.get("type", "unknown")

        add_breadcrumb(
            category="computer",
            message=f"Computer action: {action_type}",
            level="info",
            data={"action_type": action_type},
        )

    async def on_computer_call_end(
        self, item: Dict[str, Any], result: List[Dict[str, Any]]
    ) -> None:
        """Called when a computer call has completed."""
        if not OTEL_AVAILABLE or not is_otel_enabled():
            return

        action = item.get("action", {})
        action_type = action.get("type", "unknown")

        # Record computer action metric
        # Note: We don't have precise timing here, so we record with 0 duration
        # The actual timing should be done in the computer module
        record_operation(
            operation=f"computer.action.{action_type}",
            duration_seconds=0,  # Timing handled elsewhere
            status="success",
            model=self.model,
        )

    async def on_api_start(self, kwargs: Dict[str, Any]) -> None:
        """Called when an LLM API call is about to start."""
        if not OTEL_AVAILABLE or not is_otel_enabled():
            return

        add_breadcrumb(
            category="llm",
            message="LLM API call started",
            level="info",
            data={"model": self.model},
        )

    async def on_api_end(self, kwargs: Dict[str, Any], result: Any) -> None:
        """Called when an LLM API call has completed."""
        if not OTEL_AVAILABLE or not is_otel_enabled():
            return

        add_breadcrumb(
            category="llm",
            message="LLM API call completed",
            level="info",
        )
249
+
250
+
251
class OtelErrorCallback(AsyncCallbackHandler):
    """
    Callback that captures errors and sends them to Sentry/OTEL.

    Should be added early in the callback chain to catch all errors.
    """

    def __init__(self, agent: Any):
        """
        Initialize error callback.

        Args:
            agent: The ComputerAgent instance
        """
        self.agent = agent
        self.model = getattr(agent, "model", "unknown")

    async def on_error(self, error: Exception, context: Dict[str, Any]) -> None:
        """Called when an error occurs during agent execution."""
        if not (OTEL_AVAILABLE and is_otel_enabled()):
            return

        op_name = context.get("operation", "unknown")

        # Emit the error counter metric first.
        record_error(
            error_type=type(error).__name__,
            operation=op_name,
            model=self.model,
        )

        # Then forward the exception (with everything else from the caller's
        # context, minus the already-promoted "operation" key) to Sentry.
        extra = {key: value for key, value in context.items() if key != "operation"}
        capture_exception(
            error,
            context={"model": self.model, "operation": op_name, **extra},
        )
@@ -2,38 +2,41 @@
2
2
  PII anonymization callback handler using Microsoft Presidio for text and image redaction.
3
3
  """
4
4
 
5
- from typing import List, Dict, Any, Optional, Tuple
6
- from .base import AsyncCallbackHandler
7
5
  import base64
8
6
  import io
9
7
  import logging
8
+ from typing import Any, Dict, List, Optional, Tuple
9
+
10
+ from .base import AsyncCallbackHandler
10
11
 
11
12
  try:
12
13
  # TODO: Add Presidio dependencies
13
14
  from PIL import Image
15
+
14
16
  PRESIDIO_AVAILABLE = True
15
17
  except ImportError:
16
18
  PRESIDIO_AVAILABLE = False
17
19
 
18
20
  logger = logging.getLogger(__name__)
19
21
 
22
+
20
23
  class PIIAnonymizationCallback(AsyncCallbackHandler):
21
24
  """
22
25
  Callback handler that anonymizes PII in text and images using Microsoft Presidio.
23
-
26
+
24
27
  This handler:
25
28
  1. Anonymizes PII in messages before sending to the agent loop
26
29
  2. Deanonymizes PII in tool calls and message outputs after the agent loop
27
30
  3. Redacts PII from images in computer_call_output messages
28
31
  """
29
-
32
+
30
33
  def __init__(
31
34
  self,
32
35
  # TODO: Any extra kwargs if needed
33
36
  ):
34
37
  """
35
38
  Initialize the PII anonymization callback.
36
-
39
+
37
40
  Args:
38
41
  anonymize_text: Whether to anonymize text content
39
42
  anonymize_images: Whether to redact images
@@ -46,16 +49,16 @@ class PIIAnonymizationCallback(AsyncCallbackHandler):
46
49
  "Presidio is not available. Install with: "
47
50
  "pip install cua-agent[pii-anonymization]"
48
51
  )
49
-
52
+
50
53
  # TODO: Implement __init__
51
-
54
+
52
55
  async def on_llm_start(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
53
56
  """
54
57
  Anonymize PII in messages before sending to agent loop.
55
-
58
+
56
59
  Args:
57
60
  messages: List of message dictionaries
58
-
61
+
59
62
  Returns:
60
63
  List of messages with PII anonymized
61
64
  """
@@ -63,16 +66,16 @@ class PIIAnonymizationCallback(AsyncCallbackHandler):
63
66
  for msg in messages:
64
67
  anonymized_msg = await self._anonymize_message(msg)
65
68
  anonymized_messages.append(anonymized_msg)
66
-
69
+
67
70
  return anonymized_messages
68
-
71
+
69
72
  async def on_llm_end(self, output: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
70
73
  """
71
74
  Deanonymize PII in tool calls and message outputs after agent loop.
72
-
75
+
73
76
  Args:
74
77
  output: List of output dictionaries
75
-
78
+
76
79
  Returns:
77
80
  List of output with PII deanonymized for tool calls
78
81
  """
@@ -84,13 +87,13 @@ class PIIAnonymizationCallback(AsyncCallbackHandler):
84
87
  deanonymized_output.append(deanonymized_item)
85
88
  else:
86
89
  deanonymized_output.append(item)
87
-
90
+
88
91
  return deanonymized_output
89
-
92
+
90
93
    async def _anonymize_message(self, message: Dict[str, Any]) -> Dict[str, Any]:
        """Return *message* with PII anonymized.

        Currently a pass-through stub: the message is returned unchanged
        until the Presidio-backed anonymization is implemented.
        """
        # TODO: Implement _anonymize_message
        return message
93
-
96
+
94
97
    async def _deanonymize_item(self, item: Dict[str, Any]) -> Dict[str, Any]:
        """Return *item* with previously anonymized PII restored.

        Currently a pass-through stub: the item is returned unchanged until
        the deanonymization mapping is implemented.
        """
        # TODO: Implement _deanonymize_item
        return item
@@ -0,0 +1,47 @@
1
+ """
2
+ Prompt instructions callback.
3
+
4
+ This callback allows simple prompt engineering by pre-pending a user
5
+ instructions message to the start of the conversation before each LLM call.
6
+
7
+ Usage:
8
+
9
+ from agent.callbacks import PromptInstructionsCallback
10
+ agent = ComputerAgent(
11
+ model="openai/computer-use-preview",
12
+ callbacks=[PromptInstructionsCallback("Follow these rules...")]
13
+ )
14
+
15
+ """
16
+
17
+ from typing import Any, Dict, List, Optional
18
+
19
+ from .base import AsyncCallbackHandler
20
+
21
+
22
class PromptInstructionsCallback(AsyncCallbackHandler):
    """
    Prepend a user instructions message to the message list.

    This is a minimal, non-invasive way to guide the agent's behavior without
    modifying agent loops or tools. It works with any provider/loop since it
    only alters the messages array before sending to the model.
    """

    def __init__(self, instructions: Optional[str]) -> None:
        self.instructions = instructions

    async def on_llm_start(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        instructions = self.instructions
        # Nothing to inject when instructions are None/empty.
        if not instructions:
            return messages

        # Skip injection when an identical instructions message already
        # sits at the front of the conversation.
        head = messages[0] if messages else None
        if (
            isinstance(head, dict)
            and head.get("role") == "user"
            and head.get("content") == instructions
        ):
            return messages

        return [{"role": "user", "content": instructions}, *messages]