cua-agent 0.4.18__tar.gz → 0.4.20__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic. See the original diff page for more details.

Files changed (50)
  1. {cua_agent-0.4.18 → cua_agent-0.4.20}/PKG-INFO +1 -1
  2. {cua_agent-0.4.18 → cua_agent-0.4.20}/agent/adapters/mlxvlm_adapter.py +3 -2
  3. {cua_agent-0.4.18 → cua_agent-0.4.20}/agent/agent.py +10 -2
  4. {cua_agent-0.4.18 → cua_agent-0.4.20}/agent/callbacks/trajectory_saver.py +83 -5
  5. {cua_agent-0.4.18 → cua_agent-0.4.20}/agent/integrations/hud/__init__.py +3 -3
  6. {cua_agent-0.4.18 → cua_agent-0.4.20}/pyproject.toml +1 -1
  7. {cua_agent-0.4.18 → cua_agent-0.4.20}/README.md +0 -0
  8. {cua_agent-0.4.18 → cua_agent-0.4.20}/agent/__init__.py +0 -0
  9. {cua_agent-0.4.18 → cua_agent-0.4.20}/agent/__main__.py +0 -0
  10. {cua_agent-0.4.18 → cua_agent-0.4.20}/agent/adapters/__init__.py +0 -0
  11. {cua_agent-0.4.18 → cua_agent-0.4.20}/agent/adapters/huggingfacelocal_adapter.py +0 -0
  12. {cua_agent-0.4.18 → cua_agent-0.4.20}/agent/adapters/human_adapter.py +0 -0
  13. {cua_agent-0.4.18 → cua_agent-0.4.20}/agent/callbacks/__init__.py +0 -0
  14. {cua_agent-0.4.18 → cua_agent-0.4.20}/agent/callbacks/base.py +0 -0
  15. {cua_agent-0.4.18 → cua_agent-0.4.20}/agent/callbacks/budget_manager.py +0 -0
  16. {cua_agent-0.4.18 → cua_agent-0.4.20}/agent/callbacks/image_retention.py +0 -0
  17. {cua_agent-0.4.18 → cua_agent-0.4.20}/agent/callbacks/logging.py +0 -0
  18. {cua_agent-0.4.18 → cua_agent-0.4.20}/agent/callbacks/operator_validator.py +0 -0
  19. {cua_agent-0.4.18 → cua_agent-0.4.20}/agent/callbacks/pii_anonymization.py +0 -0
  20. {cua_agent-0.4.18 → cua_agent-0.4.20}/agent/callbacks/telemetry.py +0 -0
  21. {cua_agent-0.4.18 → cua_agent-0.4.20}/agent/cli.py +0 -0
  22. {cua_agent-0.4.18 → cua_agent-0.4.20}/agent/computers/__init__.py +0 -0
  23. {cua_agent-0.4.18 → cua_agent-0.4.20}/agent/computers/base.py +0 -0
  24. {cua_agent-0.4.18 → cua_agent-0.4.20}/agent/computers/cua.py +0 -0
  25. {cua_agent-0.4.18 → cua_agent-0.4.20}/agent/computers/custom.py +0 -0
  26. {cua_agent-0.4.18 → cua_agent-0.4.20}/agent/decorators.py +0 -0
  27. {cua_agent-0.4.18 → cua_agent-0.4.20}/agent/human_tool/__init__.py +0 -0
  28. {cua_agent-0.4.18 → cua_agent-0.4.20}/agent/human_tool/__main__.py +0 -0
  29. {cua_agent-0.4.18 → cua_agent-0.4.20}/agent/human_tool/server.py +0 -0
  30. {cua_agent-0.4.18 → cua_agent-0.4.20}/agent/human_tool/ui.py +0 -0
  31. {cua_agent-0.4.18 → cua_agent-0.4.20}/agent/integrations/hud/proxy.py +0 -0
  32. {cua_agent-0.4.18 → cua_agent-0.4.20}/agent/loops/__init__.py +0 -0
  33. {cua_agent-0.4.18 → cua_agent-0.4.20}/agent/loops/anthropic.py +0 -0
  34. {cua_agent-0.4.18 → cua_agent-0.4.20}/agent/loops/base.py +0 -0
  35. {cua_agent-0.4.18 → cua_agent-0.4.20}/agent/loops/composed_grounded.py +0 -0
  36. {cua_agent-0.4.18 → cua_agent-0.4.20}/agent/loops/glm45v.py +0 -0
  37. {cua_agent-0.4.18 → cua_agent-0.4.20}/agent/loops/gta1.py +0 -0
  38. {cua_agent-0.4.18 → cua_agent-0.4.20}/agent/loops/model_types.csv +0 -0
  39. {cua_agent-0.4.18 → cua_agent-0.4.20}/agent/loops/omniparser.py +0 -0
  40. {cua_agent-0.4.18 → cua_agent-0.4.20}/agent/loops/openai.py +0 -0
  41. {cua_agent-0.4.18 → cua_agent-0.4.20}/agent/loops/uitars.py +0 -0
  42. {cua_agent-0.4.18 → cua_agent-0.4.20}/agent/proxy/examples.py +0 -0
  43. {cua_agent-0.4.18 → cua_agent-0.4.20}/agent/proxy/handlers.py +0 -0
  44. {cua_agent-0.4.18 → cua_agent-0.4.20}/agent/responses.py +0 -0
  45. {cua_agent-0.4.18 → cua_agent-0.4.20}/agent/types.py +0 -0
  46. {cua_agent-0.4.18 → cua_agent-0.4.20}/agent/ui/__init__.py +0 -0
  47. {cua_agent-0.4.18 → cua_agent-0.4.20}/agent/ui/__main__.py +0 -0
  48. {cua_agent-0.4.18 → cua_agent-0.4.20}/agent/ui/gradio/__init__.py +0 -0
  49. {cua_agent-0.4.18 → cua_agent-0.4.20}/agent/ui/gradio/app.py +0 -0
  50. {cua_agent-0.4.18 → cua_agent-0.4.20}/agent/ui/gradio/ui_components.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cua-agent
3
- Version: 0.4.18
3
+ Version: 0.4.20
4
4
  Summary: CUA (Computer Use) Agent for AI-driven computer interaction
5
5
  Author-Email: TryCua <gh@trycua.com>
6
6
  Requires-Python: >=3.12
@@ -78,8 +78,6 @@ class MLXVLMAdapter(CustomLLM):
78
78
  **kwargs: Additional arguments
79
79
  """
80
80
  super().__init__()
81
- if not MLX_AVAILABLE:
82
- raise ImportError("MLX VLM dependencies not available. Please install mlx-vlm.")
83
81
 
84
82
  self.models = {} # Cache for loaded models
85
83
  self.processors = {} # Cache for loaded processors
@@ -95,6 +93,9 @@ class MLXVLMAdapter(CustomLLM):
95
93
  Returns:
96
94
  Tuple of (model, processor, config)
97
95
  """
96
+ if not MLX_AVAILABLE:
97
+ raise ImportError("MLX VLM dependencies not available. Please install mlx-vlm.")
98
+
98
99
  if model_name not in self.models:
99
100
  # Load model and processor
100
101
  model_obj, processor = load(
@@ -3,6 +3,7 @@ ComputerAgent - Main agent class that selects and runs agent loops
3
3
  """
4
4
 
5
5
  import asyncio
6
+ from pathlib import Path
6
7
  from typing import Dict, List, Any, Optional, AsyncGenerator, Union, cast, Callable, Set, Tuple
7
8
 
8
9
  from litellm.responses.utils import Usage
@@ -162,7 +163,7 @@ class ComputerAgent:
162
163
  only_n_most_recent_images: Optional[int] = None,
163
164
  callbacks: Optional[List[Any]] = None,
164
165
  verbosity: Optional[int] = None,
165
- trajectory_dir: Optional[str] = None,
166
+ trajectory_dir: Optional[str | Path | dict] = None,
166
167
  max_retries: Optional[int] = 3,
167
168
  screenshot_delay: Optional[float | int] = 0.5,
168
169
  use_prompt_caching: Optional[bool] = False,
@@ -223,7 +224,10 @@ class ComputerAgent:
223
224
 
224
225
  # Add trajectory saver callback if trajectory_dir is set
225
226
  if self.trajectory_dir:
226
- self.callbacks.append(TrajectorySaverCallback(self.trajectory_dir))
227
+ if isinstance(self.trajectory_dir, dict):
228
+ self.callbacks.append(TrajectorySaverCallback(**self.trajectory_dir))
229
+ elif isinstance(self.trajectory_dir, (str, Path)):
230
+ self.callbacks.append(TrajectorySaverCallback(str(self.trajectory_dir)))
227
231
 
228
232
  # Add budget manager if max_trajectory_budget is set
229
233
  if max_trajectory_budget:
@@ -249,6 +253,10 @@ class ComputerAgent:
249
253
 
250
254
  # == Initialize computer agent ==
251
255
 
256
+ # If the loop is "human/human", we need to prefix a grounding model fallback
257
+ if self.agent_loop in ["human/human", "human"]:
258
+ self.agent_loop = "openai/computer-use-preview+human/human"
259
+
252
260
  # Find the appropriate agent loop
253
261
  if custom_loop:
254
262
  self.agent_loop = custom_loop
@@ -11,6 +11,8 @@ from pathlib import Path
11
11
  from typing import List, Dict, Any, Optional, Union, override
12
12
  from PIL import Image, ImageDraw
13
13
  import io
14
+ from copy import deepcopy
15
+
14
16
  from .base import AsyncCallbackHandler
15
17
 
16
18
  def sanitize_image_urls(data: Any) -> Any:
@@ -43,6 +45,64 @@ def sanitize_image_urls(data: Any) -> Any:
43
45
  return data
44
46
 
45
47
 
48
+ def extract_computer_call_outputs(items: List[Dict[str, Any]], screenshot_dir: Optional[Path]) -> List[Dict[str, Any]]:
49
+ """
50
+ Save any base64-encoded screenshots from computer_call_output entries to files and
51
+ replace their image_url with the saved file path when a call_id is present.
52
+
53
+ Only operates if screenshot_dir is provided and exists; otherwise returns items unchanged.
54
+
55
+ Args:
56
+ items: List of message/result dicts potentially containing computer_call_output entries
57
+ screenshot_dir: Directory to write screenshots into
58
+
59
+ Returns:
60
+ A new list with updated image_url fields when applicable.
61
+ """
62
+ if not items:
63
+ return items
64
+ if not screenshot_dir or not screenshot_dir.exists():
65
+ return items
66
+
67
+ updated: List[Dict[str, Any]] = []
68
+ for item in items:
69
+ # work on a shallow copy; deep copy nested 'output' if we modify it
70
+ msg = dict(item)
71
+ try:
72
+ if msg.get("type") == "computer_call_output":
73
+ call_id = msg.get("call_id")
74
+ output = msg.get("output", {})
75
+ image_url = output.get("image_url")
76
+ if call_id and isinstance(image_url, str) and image_url.startswith("data:"):
77
+ # derive extension from MIME type e.g. data:image/png;base64,
78
+ try:
79
+ ext = image_url.split(";", 1)[0].split("/")[-1]
80
+ if not ext:
81
+ ext = "png"
82
+ except Exception:
83
+ ext = "png"
84
+ out_path = screenshot_dir / f"{call_id}.{ext}"
85
+ # write file if it doesn't exist
86
+ if not out_path.exists():
87
+ try:
88
+ b64_payload = image_url.split(",", 1)[1]
89
+ img_bytes = base64.b64decode(b64_payload)
90
+ out_path.parent.mkdir(parents=True, exist_ok=True)
91
+ with open(out_path, "wb") as f:
92
+ f.write(img_bytes)
93
+ except Exception:
94
+ # if anything fails, skip modifying this message
95
+ pass
96
+ # update image_url to file path
97
+ new_output = dict(output)
98
+ new_output["image_url"] = str(out_path)
99
+ msg["output"] = new_output
100
+ except Exception:
101
+ # do not block on malformed entries; keep original
102
+ pass
103
+ updated.append(msg)
104
+ return updated
105
+
46
106
  class TrajectorySaverCallback(AsyncCallbackHandler):
47
107
  """
48
108
  Callback handler that saves agent trajectories to disk.
@@ -51,7 +111,7 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
51
111
  within the trajectory gets its own folder with screenshots and responses.
52
112
  """
53
113
 
54
- def __init__(self, trajectory_dir: str, reset_on_run: bool = True):
114
+ def __init__(self, trajectory_dir: str, reset_on_run: bool = True, screenshot_dir: Optional[str] = None):
55
115
  """
56
116
  Initialize trajectory saver.
57
117
 
@@ -67,10 +127,12 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
67
127
  self.model: Optional[str] = None
68
128
  self.total_usage: Dict[str, Any] = {}
69
129
  self.reset_on_run = reset_on_run
130
+ # Optional directory to store extracted screenshots from metadata/new_items
131
+ self.screenshot_dir: Optional[Path] = Path(screenshot_dir) if screenshot_dir else None
70
132
 
71
133
  # Ensure trajectory directory exists
72
134
  self.trajectory_dir.mkdir(parents=True, exist_ok=True)
73
-
135
+
74
136
  def _get_turn_dir(self) -> Path:
75
137
  """Get the directory for the current turn."""
76
138
  if not self.trajectory_id:
@@ -139,12 +201,21 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
139
201
  trajectory_path = self.trajectory_dir / self.trajectory_id
140
202
  trajectory_path.mkdir(parents=True, exist_ok=True)
141
203
 
142
- # Save trajectory metadata
204
+ # Save trajectory metadata (optionally extract screenshots to screenshot_dir)
205
+ kwargs_to_save = kwargs.copy()
206
+ try:
207
+ if "messages" in kwargs_to_save:
208
+ kwargs_to_save["messages"] = extract_computer_call_outputs(
209
+ kwargs_to_save["messages"], self.screenshot_dir
210
+ )
211
+ except Exception:
212
+ # If extraction fails, fall back to original messages
213
+ pass
143
214
  metadata = {
144
215
  "trajectory_id": self.trajectory_id,
145
216
  "created_at": str(uuid.uuid1().time),
146
217
  "status": "running",
147
- "kwargs": kwargs,
218
+ "kwargs": kwargs_to_save,
148
219
  }
149
220
 
150
221
  with open(trajectory_path / "metadata.json", "w") as f:
@@ -171,11 +242,18 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
171
242
  metadata = {}
172
243
 
173
244
  # Update metadata with completion info
245
+ # Optionally extract screenshots from new_items before persisting
246
+ new_items_to_save = new_items
247
+ try:
248
+ new_items_to_save = extract_computer_call_outputs(new_items, self.screenshot_dir)
249
+ except Exception:
250
+ pass
251
+
174
252
  metadata.update({
175
253
  "status": "completed",
176
254
  "completed_at": str(uuid.uuid1().time),
177
255
  "total_usage": self.total_usage,
178
- "new_items": new_items,
256
+ "new_items": new_items_to_save,
179
257
  "total_turns": self.current_turn
180
258
  })
181
259
 
@@ -41,7 +41,7 @@ class ProxyOperatorAgent(OperatorAgent):
41
41
  *,
42
42
  model: str | None = None,
43
43
  allowed_tools: list[str] | None = None,
44
- trajectory_dir: str | None = None,
44
+ trajectory_dir: str | dict | None = None,
45
45
  # === ComputerAgent kwargs ===
46
46
  tools: list[Any] | None = None,
47
47
  custom_loop: Any | None = None,
@@ -109,7 +109,7 @@ async def run_single_task(
109
109
  only_n_most_recent_images: int | None = None,
110
110
  callbacks: list[Any] | None = None,
111
111
  verbosity: int | None = None,
112
- trajectory_dir: str | None = None,
112
+ trajectory_dir: str | dict | None = None,
113
113
  max_retries: int | None = 3,
114
114
  screenshot_delay: float | int = 0.5,
115
115
  use_prompt_caching: bool | None = False,
@@ -167,7 +167,7 @@ async def run_full_dataset(
167
167
  max_concurrent: int = 30,
168
168
  max_steps: int = 50,
169
169
  split: str = "train",
170
- trajectory_dir: str | None = None,
170
+ trajectory_dir: str | dict | None = None,
171
171
  # === ComputerAgent kwargs ===
172
172
  tools: list[Any] | None = None,
173
173
  custom_loop: Any | None = None,
@@ -6,7 +6,7 @@ build-backend = "pdm.backend"
6
6
 
7
7
  [project]
8
8
  name = "cua-agent"
9
- version = "0.4.18"
9
+ version = "0.4.20"
10
10
  description = "CUA (Computer Use) Agent for AI-driven computer interaction"
11
11
  readme = "README.md"
12
12
  authors = [
File without changes
File without changes
File without changes
File without changes
File without changes