cua-agent 0.4.22__py3-none-any.whl → 0.7.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic. Click here for more details.

Files changed (79)
  1. agent/__init__.py +4 -10
  2. agent/__main__.py +2 -1
  3. agent/adapters/__init__.py +4 -0
  4. agent/adapters/azure_ml_adapter.py +283 -0
  5. agent/adapters/cua_adapter.py +161 -0
  6. agent/adapters/huggingfacelocal_adapter.py +67 -125
  7. agent/adapters/human_adapter.py +116 -114
  8. agent/adapters/mlxvlm_adapter.py +110 -99
  9. agent/adapters/models/__init__.py +41 -0
  10. agent/adapters/models/generic.py +78 -0
  11. agent/adapters/models/internvl.py +290 -0
  12. agent/adapters/models/opencua.py +115 -0
  13. agent/adapters/models/qwen2_5_vl.py +78 -0
  14. agent/agent.py +337 -185
  15. agent/callbacks/__init__.py +9 -4
  16. agent/callbacks/base.py +45 -31
  17. agent/callbacks/budget_manager.py +22 -10
  18. agent/callbacks/image_retention.py +54 -98
  19. agent/callbacks/logging.py +55 -42
  20. agent/callbacks/operator_validator.py +35 -33
  21. agent/callbacks/otel.py +291 -0
  22. agent/callbacks/pii_anonymization.py +19 -16
  23. agent/callbacks/prompt_instructions.py +47 -0
  24. agent/callbacks/telemetry.py +99 -61
  25. agent/callbacks/trajectory_saver.py +95 -69
  26. agent/cli.py +269 -119
  27. agent/computers/__init__.py +14 -9
  28. agent/computers/base.py +32 -19
  29. agent/computers/cua.py +52 -25
  30. agent/computers/custom.py +78 -71
  31. agent/decorators.py +23 -14
  32. agent/human_tool/__init__.py +2 -7
  33. agent/human_tool/__main__.py +6 -2
  34. agent/human_tool/server.py +48 -37
  35. agent/human_tool/ui.py +359 -235
  36. agent/integrations/hud/__init__.py +38 -99
  37. agent/integrations/hud/agent.py +369 -0
  38. agent/integrations/hud/proxy.py +166 -52
  39. agent/loops/__init__.py +44 -14
  40. agent/loops/anthropic.py +579 -492
  41. agent/loops/base.py +19 -15
  42. agent/loops/composed_grounded.py +136 -150
  43. agent/loops/fara/__init__.py +8 -0
  44. agent/loops/fara/config.py +506 -0
  45. agent/loops/fara/helpers.py +357 -0
  46. agent/loops/fara/schema.py +143 -0
  47. agent/loops/gelato.py +183 -0
  48. agent/loops/gemini.py +935 -0
  49. agent/loops/generic_vlm.py +601 -0
  50. agent/loops/glm45v.py +140 -135
  51. agent/loops/gta1.py +48 -51
  52. agent/loops/holo.py +218 -0
  53. agent/loops/internvl.py +180 -0
  54. agent/loops/moondream3.py +493 -0
  55. agent/loops/omniparser.py +326 -226
  56. agent/loops/openai.py +50 -51
  57. agent/loops/opencua.py +134 -0
  58. agent/loops/uiins.py +175 -0
  59. agent/loops/uitars.py +247 -206
  60. agent/loops/uitars2.py +951 -0
  61. agent/playground/__init__.py +5 -0
  62. agent/playground/server.py +301 -0
  63. agent/proxy/examples.py +61 -57
  64. agent/proxy/handlers.py +46 -39
  65. agent/responses.py +447 -347
  66. agent/tools/__init__.py +24 -0
  67. agent/tools/base.py +253 -0
  68. agent/tools/browser_tool.py +423 -0
  69. agent/types.py +11 -5
  70. agent/ui/__init__.py +1 -1
  71. agent/ui/__main__.py +1 -1
  72. agent/ui/gradio/app.py +25 -22
  73. agent/ui/gradio/ui_components.py +314 -167
  74. cua_agent-0.7.16.dist-info/METADATA +85 -0
  75. cua_agent-0.7.16.dist-info/RECORD +79 -0
  76. {cua_agent-0.4.22.dist-info → cua_agent-0.7.16.dist-info}/WHEEL +1 -1
  77. cua_agent-0.4.22.dist-info/METADATA +0 -436
  78. cua_agent-0.4.22.dist-info/RECORD +0 -51
  79. {cua_agent-0.4.22.dist-info → cua_agent-0.7.16.dist-info}/entry_points.txt +0 -0
agent/callbacks/__init__.py CHANGED
@@ -3,19 +3,24 @@ Callback system for ComputerAgent preprocessing and postprocessing hooks.
3
3
  """
4
4
 
5
5
  from .base import AsyncCallbackHandler
6
+ from .budget_manager import BudgetManagerCallback
6
7
  from .image_retention import ImageRetentionCallback
7
8
  from .logging import LoggingCallback
8
- from .trajectory_saver import TrajectorySaverCallback
9
- from .budget_manager import BudgetManagerCallback
10
- from .telemetry import TelemetryCallback
9
+ from .otel import OtelCallback, OtelErrorCallback
11
10
  from .operator_validator import OperatorNormalizerCallback
11
+ from .prompt_instructions import PromptInstructionsCallback
12
+ from .telemetry import TelemetryCallback
13
+ from .trajectory_saver import TrajectorySaverCallback
12
14
 
13
15
  __all__ = [
14
16
  "AsyncCallbackHandler",
15
- "ImageRetentionCallback",
17
+ "ImageRetentionCallback",
16
18
  "LoggingCallback",
17
19
  "TrajectorySaverCallback",
18
20
  "BudgetManagerCallback",
19
21
  "TelemetryCallback",
22
+ "OtelCallback",
23
+ "OtelErrorCallback",
20
24
  "OperatorNormalizerCallback",
25
+ "PromptInstructionsCallback",
21
26
  ]
agent/callbacks/base.py CHANGED
@@ -3,7 +3,7 @@ Base callback handler interface for ComputerAgent preprocessing and postprocessi
3
3
  """
4
4
 
5
5
  from abc import ABC, abstractmethod
6
- from typing import List, Dict, Any, Optional, Union
6
+ from typing import Any, Dict, List, Optional, Union
7
7
 
8
8
 
9
9
  class AsyncCallbackHandler(ABC):
@@ -16,42 +16,52 @@ class AsyncCallbackHandler(ABC):
16
16
  """Called at the start of an agent run loop."""
17
17
  pass
18
18
 
19
- async def on_run_end(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]], new_items: List[Dict[str, Any]]) -> None:
19
+ async def on_run_end(
20
+ self,
21
+ kwargs: Dict[str, Any],
22
+ old_items: List[Dict[str, Any]],
23
+ new_items: List[Dict[str, Any]],
24
+ ) -> None:
20
25
  """Called at the end of an agent run loop."""
21
26
  pass
22
-
23
- async def on_run_continue(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]], new_items: List[Dict[str, Any]]) -> bool:
27
+
28
+ async def on_run_continue(
29
+ self,
30
+ kwargs: Dict[str, Any],
31
+ old_items: List[Dict[str, Any]],
32
+ new_items: List[Dict[str, Any]],
33
+ ) -> bool:
24
34
  """Called during agent run loop to determine if execution should continue.
25
-
35
+
26
36
  Args:
27
37
  kwargs: Run arguments
28
38
  old_items: Original messages
29
39
  new_items: New messages generated during run
30
-
40
+
31
41
  Returns:
32
42
  True to continue execution, False to stop
33
43
  """
34
44
  return True
35
-
45
+
36
46
  async def on_llm_start(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
37
47
  """
38
48
  Called before messages are sent to the agent loop.
39
-
49
+
40
50
  Args:
41
51
  messages: List of message dictionaries to preprocess
42
-
52
+
43
53
  Returns:
44
54
  List of preprocessed message dictionaries
45
55
  """
46
56
  return messages
47
-
57
+
48
58
  async def on_llm_end(self, output: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
49
59
  """
50
60
  Called after the agent loop returns output.
51
-
61
+
52
62
  Args:
53
63
  output: List of output message dictionaries to postprocess
54
-
64
+
55
65
  Returns:
56
66
  List of postprocessed output dictionaries
57
67
  """
@@ -60,63 +70,67 @@ class AsyncCallbackHandler(ABC):
60
70
  async def on_computer_call_start(self, item: Dict[str, Any]) -> None:
61
71
  """
62
72
  Called when a computer call is about to start.
63
-
73
+
64
74
  Args:
65
75
  item: The computer call item dictionary
66
76
  """
67
77
  pass
68
-
69
- async def on_computer_call_end(self, item: Dict[str, Any], result: List[Dict[str, Any]]) -> None:
78
+
79
+ async def on_computer_call_end(
80
+ self, item: Dict[str, Any], result: List[Dict[str, Any]]
81
+ ) -> None:
70
82
  """
71
83
  Called when a computer call has completed.
72
-
84
+
73
85
  Args:
74
86
  item: The computer call item dictionary
75
87
  result: The result of the computer call
76
88
  """
77
89
  pass
78
-
90
+
79
91
  async def on_function_call_start(self, item: Dict[str, Any]) -> None:
80
92
  """
81
93
  Called when a function call is about to start.
82
-
94
+
83
95
  Args:
84
96
  item: The function call item dictionary
85
97
  """
86
98
  pass
87
-
88
- async def on_function_call_end(self, item: Dict[str, Any], result: List[Dict[str, Any]]) -> None:
99
+
100
+ async def on_function_call_end(
101
+ self, item: Dict[str, Any], result: List[Dict[str, Any]]
102
+ ) -> None:
89
103
  """
90
104
  Called when a function call has completed.
91
-
105
+
92
106
  Args:
93
107
  item: The function call item dictionary
94
108
  result: The result of the function call
95
109
  """
96
110
  pass
97
-
111
+
98
112
  async def on_text(self, item: Dict[str, Any]) -> None:
99
113
  """
100
114
  Called when a text message is encountered.
101
-
115
+
102
116
  Args:
103
117
  item: The message item dictionary
104
118
  """
105
119
  pass
106
-
120
+
107
121
  async def on_api_start(self, kwargs: Dict[str, Any]) -> None:
108
122
  """
109
123
  Called when an API call is about to start.
110
-
124
+
111
125
  Args:
112
126
  kwargs: The kwargs being passed to the API call
113
127
  """
114
128
  pass
115
-
129
+
116
130
  async def on_api_end(self, kwargs: Dict[str, Any], result: Any) -> None:
117
131
  """
118
132
  Called when an API call has completed.
119
-
133
+
120
134
  Args:
121
135
  kwargs: The kwargs that were passed to the API call
122
136
  result: The result of the API call
@@ -126,7 +140,7 @@ class AsyncCallbackHandler(ABC):
126
140
  async def on_usage(self, usage: Dict[str, Any]) -> None:
127
141
  """
128
142
  Called when usage information is received.
129
-
143
+
130
144
  Args:
131
145
  usage: The usage information
132
146
  """
@@ -135,7 +149,7 @@ class AsyncCallbackHandler(ABC):
135
149
  async def on_screenshot(self, screenshot: Union[str, bytes], name: str = "screenshot") -> None:
136
150
  """
137
151
  Called when a screenshot is taken.
138
-
152
+
139
153
  Args:
140
154
  screenshot: The screenshot image
141
155
  name: The name of the screenshot
@@ -145,9 +159,9 @@ class AsyncCallbackHandler(ABC):
145
159
  async def on_responses(self, kwargs: Dict[str, Any], responses: Dict[str, Any]) -> None:
146
160
  """
147
161
  Called when responses are received.
148
-
162
+
149
163
  Args:
150
164
  kwargs: The kwargs being passed to the agent loop
151
165
  responses: The responses received
152
166
  """
153
- pass
167
+ pass
agent/callbacks/budget_manager.py CHANGED
@@ -1,17 +1,23 @@
1
- from typing import Dict, List, Any
1
+ from typing import Any, Dict, List
2
+
2
3
  from .base import AsyncCallbackHandler
3
4
 
5
+
4
6
  class BudgetExceededError(Exception):
5
7
  """Exception raised when budget is exceeded."""
8
+
6
9
  pass
7
10
 
11
+
8
12
  class BudgetManagerCallback(AsyncCallbackHandler):
9
13
  """Budget manager callback that tracks usage costs and can stop execution when budget is exceeded."""
10
-
11
- def __init__(self, max_budget: float, reset_after_each_run: bool = True, raise_error: bool = False):
14
+
15
+ def __init__(
16
+ self, max_budget: float, reset_after_each_run: bool = True, raise_error: bool = False
17
+ ):
12
18
  """
13
19
  Initialize BudgetManagerCallback.
14
-
20
+
15
21
  Args:
16
22
  max_budget: Maximum budget allowed
17
23
  reset_after_each_run: Whether to reset budget after each run
@@ -21,24 +27,30 @@ class BudgetManagerCallback(AsyncCallbackHandler):
21
27
  self.reset_after_each_run = reset_after_each_run
22
28
  self.raise_error = raise_error
23
29
  self.total_cost = 0.0
24
-
30
+
25
31
  async def on_run_start(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]]) -> None:
26
32
  """Reset budget if configured to do so."""
27
33
  if self.reset_after_each_run:
28
34
  self.total_cost = 0.0
29
-
35
+
30
36
  async def on_usage(self, usage: Dict[str, Any]) -> None:
31
37
  """Track usage costs."""
32
38
  if "response_cost" in usage:
33
39
  self.total_cost += usage["response_cost"]
34
-
35
- async def on_run_continue(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]], new_items: List[Dict[str, Any]]) -> bool:
40
+
41
+ async def on_run_continue(
42
+ self,
43
+ kwargs: Dict[str, Any],
44
+ old_items: List[Dict[str, Any]],
45
+ new_items: List[Dict[str, Any]],
46
+ ) -> bool:
36
47
  """Check if budget allows continuation."""
37
48
  if self.total_cost >= self.max_budget:
38
49
  if self.raise_error:
39
- raise BudgetExceededError(f"Budget exceeded: ${self.total_cost} >= ${self.max_budget}")
50
+ raise BudgetExceededError(
51
+ f"Budget exceeded: ${self.total_cost} >= ${self.max_budget}"
52
+ )
40
53
  else:
41
54
  print(f"Budget exceeded: ${self.total_cost} >= ${self.max_budget}")
42
55
  return False
43
56
  return True
44
-
agent/callbacks/image_retention.py CHANGED
@@ -2,7 +2,8 @@
2
2
  Image retention callback handler that limits the number of recent images in message history.
3
3
  """
4
4
 
5
- from typing import List, Dict, Any, Optional
5
+ from typing import Any, Dict, List, Optional
6
+
6
7
  from .base import AsyncCallbackHandler
7
8
 
8
9
 
@@ -11,129 +12,84 @@ class ImageRetentionCallback(AsyncCallbackHandler):
11
12
  Callback handler that applies image retention policy to limit the number
12
13
  of recent images in message history to prevent context window overflow.
13
14
  """
14
-
15
+
15
16
  def __init__(self, only_n_most_recent_images: Optional[int] = None):
16
17
  """
17
18
  Initialize the image retention callback.
18
-
19
+
19
20
  Args:
20
21
  only_n_most_recent_images: If set, only keep the N most recent images in message history
21
22
  """
22
23
  self.only_n_most_recent_images = only_n_most_recent_images
23
-
24
+
24
25
  async def on_llm_start(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
25
26
  """
26
27
  Apply image retention policy to messages before sending to agent loop.
27
-
28
+
28
29
  Args:
29
30
  messages: List of message dictionaries
30
-
31
+
31
32
  Returns:
32
33
  List of messages with image retention policy applied
33
34
  """
34
35
  if self.only_n_most_recent_images is None:
35
36
  return messages
36
-
37
+
37
38
  return self._apply_image_retention(messages)
38
-
39
+
39
40
  def _apply_image_retention(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
40
41
  """Apply image retention policy to keep only the N most recent images.
41
-
42
+
42
43
  Removes computer_call_output items with image_url and their corresponding computer_call items,
43
44
  keeping only the most recent N image pairs based on only_n_most_recent_images setting.
44
-
45
+
45
46
  Args:
46
47
  messages: List of message dictionaries
47
-
48
+
48
49
  Returns:
49
50
  Filtered list of messages with image retention applied
50
51
  """
51
52
  if self.only_n_most_recent_images is None:
52
53
  return messages
53
-
54
- # First pass: Assign call_id to reasoning items based on the next computer_call
55
- messages_with_call_ids = []
56
- for i, msg in enumerate(messages):
57
- msg_copy = msg.copy() if isinstance(msg, dict) else msg
58
-
59
- # If this is a reasoning item without a call_id, find the next computer_call
60
- if (msg_copy.get("type") == "reasoning" and
61
- not msg_copy.get("call_id")):
62
- # Look ahead for the next computer_call
63
- for j in range(i + 1, len(messages)):
64
- next_msg = messages[j]
65
- if (next_msg.get("type") == "computer_call" and
66
- next_msg.get("call_id")):
67
- msg_copy["call_id"] = next_msg.get("call_id")
68
- break
69
-
70
- messages_with_call_ids.append(msg_copy)
71
-
72
- # Find all computer_call_output items with images and their call_ids
73
- image_call_ids = []
74
- for msg in reversed(messages_with_call_ids): # Process in reverse to get most recent first
75
- if (msg.get("type") == "computer_call_output" and
76
- isinstance(msg.get("output"), dict) and
77
- "image_url" in msg.get("output", {})):
78
- call_id = msg.get("call_id")
79
- if call_id and call_id not in image_call_ids:
80
- image_call_ids.append(call_id)
81
- if len(image_call_ids) >= self.only_n_most_recent_images:
82
- break
83
-
84
- # Keep the most recent N image call_ids (reverse to get chronological order)
85
- keep_call_ids = set(image_call_ids[:self.only_n_most_recent_images])
86
-
87
- # Filter messages: remove computer_call, computer_call_output, and reasoning for old images
88
- filtered_messages = []
89
- for msg in messages_with_call_ids:
90
- msg_type = msg.get("type")
91
- call_id = msg.get("call_id")
92
-
93
- # Remove old computer_call items
94
- if msg_type == "computer_call" and call_id not in keep_call_ids:
95
- # Check if this call_id corresponds to an image call
96
- has_image_output = any(
97
- m.get("type") == "computer_call_output" and
98
- m.get("call_id") == call_id and
99
- isinstance(m.get("output"), dict) and
100
- "image_url" in m.get("output", {})
101
- for m in messages_with_call_ids
102
- )
103
- if has_image_output:
104
- continue # Skip this computer_call
105
-
106
- # Remove old computer_call_output items with images
107
- if (msg_type == "computer_call_output" and
108
- call_id not in keep_call_ids and
109
- isinstance(msg.get("output"), dict) and
110
- "image_url" in msg.get("output", {})):
111
- continue # Skip this computer_call_output
112
-
113
- # Remove old reasoning items that are paired with removed computer calls
114
- if (msg_type == "reasoning" and
115
- call_id and call_id not in keep_call_ids):
116
- # Check if this call_id corresponds to an image call that's being removed
117
- has_image_output = any(
118
- m.get("type") == "computer_call_output" and
119
- m.get("call_id") == call_id and
120
- isinstance(m.get("output"), dict) and
121
- "image_url" in m.get("output", {})
122
- for m in messages_with_call_ids
123
- )
124
- if has_image_output:
125
- continue # Skip this reasoning item
126
-
127
- filtered_messages.append(msg)
128
-
129
- # Clean up: Remove call_id from reasoning items before returning
130
- final_messages = []
131
- for msg in filtered_messages:
132
- if msg.get("type") == "reasoning" and "call_id" in msg:
133
- # Create a copy without call_id for reasoning items
134
- cleaned_msg = {k: v for k, v in msg.items() if k != "call_id"}
135
- final_messages.append(cleaned_msg)
136
- else:
137
- final_messages.append(msg)
138
-
139
- return final_messages
54
+
55
+ # Gather indices of all computer_call_output messages that contain an image_url
56
+ output_indices: List[int] = []
57
+ for idx, msg in enumerate(messages):
58
+ if msg.get("type") == "computer_call_output":
59
+ out = msg.get("output")
60
+ if isinstance(out, dict) and ("image_url" in out):
61
+ output_indices.append(idx)
62
+
63
+ # Nothing to trim
64
+ if len(output_indices) <= self.only_n_most_recent_images:
65
+ return messages
66
+
67
+ # Determine which outputs to keep (most recent N)
68
+ keep_output_indices = set(output_indices[-self.only_n_most_recent_images :])
69
+
70
+ # Build set of indices to remove in one pass
71
+ to_remove: set[int] = set()
72
+
73
+ for idx in output_indices:
74
+ if idx in keep_output_indices:
75
+ continue # keep this screenshot and its context
76
+
77
+ to_remove.add(idx) # remove the computer_call_output itself
78
+
79
+ # Remove the immediately preceding computer_call with matching call_id (if present)
80
+ call_id = messages[idx].get("call_id")
81
+ prev_idx = idx - 1
82
+ if (
83
+ prev_idx >= 0
84
+ and messages[prev_idx].get("type") == "computer_call"
85
+ and messages[prev_idx].get("call_id") == call_id
86
+ ):
87
+ to_remove.add(prev_idx)
88
+ # Check a single reasoning immediately before that computer_call
89
+ r_idx = prev_idx - 1
90
+ if r_idx >= 0 and messages[r_idx].get("type") == "reasoning":
91
+ to_remove.add(r_idx)
92
+
93
+ # Construct filtered list
94
+ filtered = [m for i, m in enumerate(messages) if i not in to_remove]
95
+ return filtered