cua-agent 0.4.15__tar.gz → 0.4.16__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic. Click here for more details.

Files changed (48)
  1. {cua_agent-0.4.15 → cua_agent-0.4.16}/PKG-INFO +3 -3
  2. {cua_agent-0.4.15 → cua_agent-0.4.16}/README.md +2 -2
  3. {cua_agent-0.4.15 → cua_agent-0.4.16}/agent/agent.py +112 -88
  4. {cua_agent-0.4.15 → cua_agent-0.4.16}/agent/responses.py +47 -0
  5. {cua_agent-0.4.15 → cua_agent-0.4.16}/agent/types.py +9 -0
  6. {cua_agent-0.4.15 → cua_agent-0.4.16}/pyproject.toml +1 -1
  7. {cua_agent-0.4.15 → cua_agent-0.4.16}/agent/__init__.py +0 -0
  8. {cua_agent-0.4.15 → cua_agent-0.4.16}/agent/__main__.py +0 -0
  9. {cua_agent-0.4.15 → cua_agent-0.4.16}/agent/adapters/__init__.py +0 -0
  10. {cua_agent-0.4.15 → cua_agent-0.4.16}/agent/adapters/huggingfacelocal_adapter.py +0 -0
  11. {cua_agent-0.4.15 → cua_agent-0.4.16}/agent/adapters/human_adapter.py +0 -0
  12. {cua_agent-0.4.15 → cua_agent-0.4.16}/agent/callbacks/__init__.py +0 -0
  13. {cua_agent-0.4.15 → cua_agent-0.4.16}/agent/callbacks/base.py +0 -0
  14. {cua_agent-0.4.15 → cua_agent-0.4.16}/agent/callbacks/budget_manager.py +0 -0
  15. {cua_agent-0.4.15 → cua_agent-0.4.16}/agent/callbacks/image_retention.py +0 -0
  16. {cua_agent-0.4.15 → cua_agent-0.4.16}/agent/callbacks/logging.py +0 -0
  17. {cua_agent-0.4.15 → cua_agent-0.4.16}/agent/callbacks/pii_anonymization.py +0 -0
  18. {cua_agent-0.4.15 → cua_agent-0.4.16}/agent/callbacks/telemetry.py +0 -0
  19. {cua_agent-0.4.15 → cua_agent-0.4.16}/agent/callbacks/trajectory_saver.py +0 -0
  20. {cua_agent-0.4.15 → cua_agent-0.4.16}/agent/cli.py +0 -0
  21. {cua_agent-0.4.15 → cua_agent-0.4.16}/agent/computers/__init__.py +0 -0
  22. {cua_agent-0.4.15 → cua_agent-0.4.16}/agent/computers/base.py +0 -0
  23. {cua_agent-0.4.15 → cua_agent-0.4.16}/agent/computers/cua.py +0 -0
  24. {cua_agent-0.4.15 → cua_agent-0.4.16}/agent/computers/custom.py +0 -0
  25. {cua_agent-0.4.15 → cua_agent-0.4.16}/agent/decorators.py +0 -0
  26. {cua_agent-0.4.15 → cua_agent-0.4.16}/agent/human_tool/__init__.py +0 -0
  27. {cua_agent-0.4.15 → cua_agent-0.4.16}/agent/human_tool/__main__.py +0 -0
  28. {cua_agent-0.4.15 → cua_agent-0.4.16}/agent/human_tool/server.py +0 -0
  29. {cua_agent-0.4.15 → cua_agent-0.4.16}/agent/human_tool/ui.py +0 -0
  30. {cua_agent-0.4.15 → cua_agent-0.4.16}/agent/integrations/hud/__init__.py +0 -0
  31. {cua_agent-0.4.15 → cua_agent-0.4.16}/agent/integrations/hud/adapter.py +0 -0
  32. {cua_agent-0.4.15 → cua_agent-0.4.16}/agent/integrations/hud/agent.py +0 -0
  33. {cua_agent-0.4.15 → cua_agent-0.4.16}/agent/integrations/hud/computer_handler.py +0 -0
  34. {cua_agent-0.4.15 → cua_agent-0.4.16}/agent/loops/__init__.py +0 -0
  35. {cua_agent-0.4.15 → cua_agent-0.4.16}/agent/loops/anthropic.py +0 -0
  36. {cua_agent-0.4.15 → cua_agent-0.4.16}/agent/loops/base.py +0 -0
  37. {cua_agent-0.4.15 → cua_agent-0.4.16}/agent/loops/composed_grounded.py +0 -0
  38. {cua_agent-0.4.15 → cua_agent-0.4.16}/agent/loops/glm45v.py +0 -0
  39. {cua_agent-0.4.15 → cua_agent-0.4.16}/agent/loops/gta1.py +0 -0
  40. {cua_agent-0.4.15 → cua_agent-0.4.16}/agent/loops/model_types.csv +0 -0
  41. {cua_agent-0.4.15 → cua_agent-0.4.16}/agent/loops/omniparser.py +0 -0
  42. {cua_agent-0.4.15 → cua_agent-0.4.16}/agent/loops/openai.py +0 -0
  43. {cua_agent-0.4.15 → cua_agent-0.4.16}/agent/loops/uitars.py +0 -0
  44. {cua_agent-0.4.15 → cua_agent-0.4.16}/agent/ui/__init__.py +0 -0
  45. {cua_agent-0.4.15 → cua_agent-0.4.16}/agent/ui/__main__.py +0 -0
  46. {cua_agent-0.4.15 → cua_agent-0.4.16}/agent/ui/gradio/__init__.py +0 -0
  47. {cua_agent-0.4.15 → cua_agent-0.4.16}/agent/ui/gradio/app.py +0 -0
  48. {cua_agent-0.4.15 → cua_agent-0.4.16}/agent/ui/gradio/ui_components.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cua-agent
3
- Version: 0.4.15
3
+ Version: 0.4.16
4
4
  Summary: CUA (Computer Use) Agent for AI-driven computer interaction
5
5
  Author-Email: TryCua <gh@trycua.com>
6
6
  Requires-Python: >=3.12
@@ -56,8 +56,8 @@ Description-Content-Type: text/markdown
56
56
  <h1>
57
57
  <div class="image-wrapper" style="display: inline-block;">
58
58
  <picture>
59
- <source media="(prefers-color-scheme: dark)" alt="logo" height="150" srcset="../../../img/logo_white.png" style="display: block; margin: auto;">
60
- <source media="(prefers-color-scheme: light)" alt="logo" height="150" srcset="../../../img/logo_black.png" style="display: block; margin: auto;">
59
+ <source media="(prefers-color-scheme: dark)" alt="logo" height="150" srcset="https://raw.githubusercontent.com/trycua/cua/main/img/logo_white.png" style="display: block; margin: auto;">
60
+ <source media="(prefers-color-scheme: light)" alt="logo" height="150" srcset="https://raw.githubusercontent.com/trycua/cua/main/img/logo_black.png" style="display: block; margin: auto;">
61
61
  <img alt="Shows my svg">
62
62
  </picture>
63
63
  </div>
@@ -2,8 +2,8 @@
2
2
  <h1>
3
3
  <div class="image-wrapper" style="display: inline-block;">
4
4
  <picture>
5
- <source media="(prefers-color-scheme: dark)" alt="logo" height="150" srcset="../../../img/logo_white.png" style="display: block; margin: auto;">
6
- <source media="(prefers-color-scheme: light)" alt="logo" height="150" srcset="../../../img/logo_black.png" style="display: block; margin: auto;">
5
+ <source media="(prefers-color-scheme: dark)" alt="logo" height="150" srcset="https://raw.githubusercontent.com/trycua/cua/main/img/logo_white.png" style="display: block; margin: auto;">
6
+ <source media="(prefers-color-scheme: light)" alt="logo" height="150" srcset="https://raw.githubusercontent.com/trycua/cua/main/img/logo_black.png" style="display: block; margin: auto;">
7
7
  <img alt="Shows my svg">
8
8
  </picture>
9
9
  </div>
@@ -7,7 +7,13 @@ from typing import Dict, List, Any, Optional, AsyncGenerator, Union, cast, Calla
7
7
 
8
8
  from litellm.responses.utils import Usage
9
9
 
10
- from .types import Messages, AgentCapability
10
+ from .types import (
11
+ Messages,
12
+ AgentCapability,
13
+ ToolError,
14
+ IllegalArgumentError
15
+ )
16
+ from .responses import make_tool_error_item, replace_failed_computer_calls_with_function_calls
11
17
  from .decorators import find_agent_config
12
18
  import json
13
19
  import litellm
@@ -30,6 +36,15 @@ from .computers import (
30
36
  make_computer_handler
31
37
  )
32
38
 
39
+ def assert_callable_with(f, *args, **kwargs):
40
+ """Check if function can be called with given arguments."""
41
+ try:
42
+ inspect.signature(f).bind(*args, **kwargs)
43
+ return True
44
+ except TypeError as e:
45
+ sig = inspect.signature(f)
46
+ raise IllegalArgumentError(f"Expected {sig}, got args={args} kwargs={kwargs}") from e
47
+
33
48
  def get_json(obj: Any, max_depth: int = 10) -> Any:
34
49
  def custom_serializer(o: Any, depth: int = 0, seen: Optional[Set[int]] = None) -> Any:
35
50
  if seen is None:
@@ -405,7 +420,8 @@ class ComputerAgent:
405
420
 
406
421
  async def _handle_item(self, item: Any, computer: Optional[AsyncComputerHandler] = None, ignore_call_ids: Optional[List[str]] = None) -> List[Dict[str, Any]]:
407
422
  """Handle each item; may cause a computer action + screenshot."""
408
- if ignore_call_ids and item.get("call_id") and item.get("call_id") in ignore_call_ids:
423
+ call_id = item.get("call_id")
424
+ if ignore_call_ids and call_id and call_id in ignore_call_ids:
409
425
  return []
410
426
 
411
427
  item_type = item.get("type", None)
@@ -419,96 +435,103 @@ class ComputerAgent:
419
435
  # print(content_item.get("text"))
420
436
  return []
421
437
 
422
- if item_type == "computer_call":
423
- await self._on_computer_call_start(item)
424
- if not computer:
425
- raise ValueError("Computer handler is required for computer calls")
426
-
427
- # Perform computer actions
428
- action = item.get("action")
429
- action_type = action.get("type")
430
- if action_type is None:
431
- print(f"Action type cannot be `None`: action={action}, action_type={action_type}")
432
- return []
433
-
434
- # Extract action arguments (all fields except 'type')
435
- action_args = {k: v for k, v in action.items() if k != "type"}
436
-
437
- # print(f"{action_type}({action_args})")
438
-
439
- # Execute the computer action
440
- computer_method = getattr(computer, action_type, None)
441
- if computer_method:
442
- await computer_method(**action_args)
443
- else:
444
- print(f"Unknown computer action: {action_type}")
445
- return []
446
-
447
- # Take screenshot after action
448
- if self.screenshot_delay and self.screenshot_delay > 0:
449
- await asyncio.sleep(self.screenshot_delay)
450
- screenshot_base64 = await computer.screenshot()
451
- await self._on_screenshot(screenshot_base64, "screenshot_after")
438
+ try:
439
+ if item_type == "computer_call":
440
+ await self._on_computer_call_start(item)
441
+ if not computer:
442
+ raise ValueError("Computer handler is required for computer calls")
443
+
444
+ # Perform computer actions
445
+ action = item.get("action")
446
+ action_type = action.get("type")
447
+ if action_type is None:
448
+ print(f"Action type cannot be `None`: action={action}, action_type={action_type}")
449
+ return []
450
+
451
+ # Extract action arguments (all fields except 'type')
452
+ action_args = {k: v for k, v in action.items() if k != "type"}
453
+
454
+ # print(f"{action_type}({action_args})")
455
+
456
+ # Execute the computer action
457
+ computer_method = getattr(computer, action_type, None)
458
+ if computer_method:
459
+ assert_callable_with(computer_method, **action_args)
460
+ await computer_method(**action_args)
461
+ else:
462
+ print(f"Unknown computer action: {action_type}")
463
+ return []
464
+
465
+ # Take screenshot after action
466
+ if self.screenshot_delay and self.screenshot_delay > 0:
467
+ await asyncio.sleep(self.screenshot_delay)
468
+ screenshot_base64 = await computer.screenshot()
469
+ await self._on_screenshot(screenshot_base64, "screenshot_after")
470
+
471
+ # Handle safety checks
472
+ pending_checks = item.get("pending_safety_checks", [])
473
+ acknowledged_checks = []
474
+ for check in pending_checks:
475
+ check_message = check.get("message", str(check))
476
+ acknowledged_checks.append(check)
477
+ # TODO: implement a callback for safety checks
478
+ # if acknowledge_safety_check_callback(check_message, allow_always=True):
479
+ # acknowledged_checks.append(check)
480
+ # else:
481
+ # raise ValueError(f"Safety check failed: {check_message}")
482
+
483
+ # Create call output
484
+ call_output = {
485
+ "type": "computer_call_output",
486
+ "call_id": item.get("call_id"),
487
+ "acknowledged_safety_checks": acknowledged_checks,
488
+ "output": {
489
+ "type": "input_image",
490
+ "image_url": f"data:image/png;base64,{screenshot_base64}",
491
+ },
492
+ }
493
+
494
+ # # Additional URL safety checks for browser environments
495
+ # if await computer.get_environment() == "browser":
496
+ # current_url = await computer.get_current_url()
497
+ # call_output["output"]["current_url"] = current_url
498
+ # # TODO: implement a callback for URL safety checks
499
+ # # check_blocklisted_url(current_url)
500
+
501
+ result = [call_output]
502
+ await self._on_computer_call_end(item, result)
503
+ return result
452
504
 
453
- # Handle safety checks
454
- pending_checks = item.get("pending_safety_checks", [])
455
- acknowledged_checks = []
456
- for check in pending_checks:
457
- check_message = check.get("message", str(check))
458
- acknowledged_checks.append(check)
459
- # TODO: implement a callback for safety checks
460
- # if acknowledge_safety_check_callback(check_message, allow_always=True):
461
- # acknowledged_checks.append(check)
462
- # else:
463
- # raise ValueError(f"Safety check failed: {check_message}")
505
+ if item_type == "function_call":
506
+ await self._on_function_call_start(item)
507
+ # Perform function call
508
+ function = self._get_tool(item.get("name"))
509
+ if not function:
510
+ raise ValueError(f"Function {item.get("name")} not found")
464
511
 
465
- # Create call output
466
- call_output = {
467
- "type": "computer_call_output",
468
- "call_id": item.get("call_id"),
469
- "acknowledged_safety_checks": acknowledged_checks,
470
- "output": {
471
- "type": "input_image",
472
- "image_url": f"data:image/png;base64,{screenshot_base64}",
473
- },
474
- }
512
+ args = json.loads(item.get("arguments"))
513
+
514
+ # Validate arguments before execution
515
+ assert_callable_with(function, **args)
516
+
517
+ # Execute function - use asyncio.to_thread for non-async functions
518
+ if inspect.iscoroutinefunction(function):
519
+ result = await function(**args)
520
+ else:
521
+ result = await asyncio.to_thread(function, **args)
475
522
 
476
- # # Additional URL safety checks for browser environments
477
- # if await computer.get_environment() == "browser":
478
- # current_url = await computer.get_current_url()
479
- # call_output["output"]["current_url"] = current_url
480
- # # TODO: implement a callback for URL safety checks
481
- # # check_blocklisted_url(current_url)
523
+ # Create function call output
524
+ call_output = {
525
+ "type": "function_call_output",
526
+ "call_id": item.get("call_id"),
527
+ "output": str(result),
528
+ }
482
529
 
483
- result = [call_output]
484
- await self._on_computer_call_end(item, result)
485
- return result
486
-
487
- if item_type == "function_call":
488
- await self._on_function_call_start(item)
489
- # Perform function call
490
- function = self._get_tool(item.get("name"))
491
- if not function:
492
- raise ValueError(f"Function {item.get("name")} not found")
493
-
494
- args = json.loads(item.get("arguments"))
495
-
496
- # Execute function - use asyncio.to_thread for non-async functions
497
- if inspect.iscoroutinefunction(function):
498
- result = await function(**args)
499
- else:
500
- result = await asyncio.to_thread(function, **args)
501
-
502
- # Create function call output
503
- call_output = {
504
- "type": "function_call_output",
505
- "call_id": item.get("call_id"),
506
- "output": str(result),
507
- }
508
-
509
- result = [call_output]
510
- await self._on_function_call_end(item, result)
511
- return result
530
+ result = [call_output]
531
+ await self._on_function_call_end(item, result)
532
+ return result
533
+ except ToolError as e:
534
+ return [make_tool_error_item(repr(e), call_id)]
512
535
 
513
536
  return []
514
537
 
@@ -569,6 +592,7 @@ class ComputerAgent:
569
592
  # - PII anonymization
570
593
  # - Image retention policy
571
594
  combined_messages = old_items + new_items
595
+ combined_messages = replace_failed_computer_calls_with_function_calls(combined_messages)
572
596
  preprocessed_messages = await self._on_llm_start(combined_messages)
573
597
 
574
598
  loop_kwargs = {
@@ -252,6 +252,53 @@ def make_failed_tool_call_items(tool_name: str, tool_kwargs: Dict[str, Any], err
252
252
  }
253
253
  ]
254
254
 
255
+ def make_tool_error_item(error_message: str, call_id: Optional[str] = None) -> Dict[str, Any]:
256
+ call_id = call_id if call_id else random_id()
257
+ return {
258
+ "type": "function_call_output",
259
+ "call_id": call_id,
260
+ "output": json.dumps({"error": error_message}),
261
+ }
262
+
263
+ def replace_failed_computer_calls_with_function_calls(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
264
+ """
265
+ Replace computer_call items with function_call items if they share a call_id with a function_call_output.
266
+ This indicates the computer call failed and should be treated as a function call instead.
267
+ We do this because the computer_call_output items do not support text output.
268
+
269
+ Args:
270
+ messages: List of message items to process
271
+ """
272
+ messages = messages.copy()
273
+
274
+ # Find all call_ids that have function_call_output items
275
+ failed_call_ids = set()
276
+ for msg in messages:
277
+ if msg.get("type") == "function_call_output":
278
+ call_id = msg.get("call_id")
279
+ if call_id:
280
+ failed_call_ids.add(call_id)
281
+
282
+ # Replace computer_call items that have matching call_ids
283
+ for i, msg in enumerate(messages):
284
+ if (msg.get("type") == "computer_call" and
285
+ msg.get("call_id") in failed_call_ids):
286
+
287
+ # Extract action from computer_call
288
+ action = msg.get("action", {})
289
+ call_id = msg.get("call_id")
290
+
291
+ # Create function_call replacement
292
+ messages[i] = {
293
+ "type": "function_call",
294
+ "id": msg.get("id", random_id()),
295
+ "call_id": call_id,
296
+ "name": "computer",
297
+ "arguments": json.dumps(action),
298
+ }
299
+
300
+ return messages
301
+
255
302
  # Conversion functions between element descriptions and coordinates
256
303
  def convert_computer_calls_desc2xy(responses_items: List[Dict[str, Any]], desc2xy: Dict[str, tuple]) -> List[Dict[str, Any]]:
257
304
  """
@@ -16,6 +16,15 @@ Tools = Optional[Iterable[ToolParam]]
16
16
  AgentResponse = ResponsesAPIResponse
17
17
  AgentCapability = Literal["step", "click"]
18
18
 
19
+ # Exception types
20
+ class ToolError(RuntimeError):
21
+ """Base exception for tool-related errors"""
22
+ pass
23
+
24
+ class IllegalArgumentError(ToolError):
25
+ """Exception raised when function arguments are invalid"""
26
+ pass
27
+
19
28
 
20
29
  # Agent config registration
21
30
  class AgentConfigInfo(BaseModel):
@@ -6,7 +6,7 @@ build-backend = "pdm.backend"
6
6
 
7
7
  [project]
8
8
  name = "cua-agent"
9
- version = "0.4.15"
9
+ version = "0.4.16"
10
10
  description = "CUA (Computer Use) Agent for AI-driven computer interaction"
11
11
  readme = "README.md"
12
12
  authors = [
File without changes
File without changes
File without changes