cua-agent 0.4.7__tar.gz → 0.4.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of cua-agent might be problematic; see the registry's advisory page for details.

Files changed (39)
  1. {cua_agent-0.4.7 → cua_agent-0.4.8}/PKG-INFO +2 -2
  2. {cua_agent-0.4.7 → cua_agent-0.4.8}/agent/__init__.py +2 -2
  3. {cua_agent-0.4.7 → cua_agent-0.4.8}/agent/adapters/huggingfacelocal_adapter.py +5 -1
  4. {cua_agent-0.4.7 → cua_agent-0.4.8}/agent/agent.py +82 -15
  5. {cua_agent-0.4.7 → cua_agent-0.4.8}/agent/cli.py +9 -3
  6. {cua_agent-0.4.7 → cua_agent-0.4.8}/agent/computer_handler.py +3 -1
  7. cua_agent-0.4.8/agent/decorators.py +52 -0
  8. {cua_agent-0.4.7 → cua_agent-0.4.8}/agent/loops/__init__.py +3 -1
  9. {cua_agent-0.4.7 → cua_agent-0.4.8}/agent/loops/anthropic.py +200 -84
  10. cua_agent-0.4.8/agent/loops/base.py +76 -0
  11. cua_agent-0.4.8/agent/loops/composed_grounded.py +318 -0
  12. cua_agent-0.4.8/agent/loops/gta1.py +178 -0
  13. cua_agent-0.4.8/agent/loops/model_types.csv +6 -0
  14. {cua_agent-0.4.7 → cua_agent-0.4.8}/agent/loops/omniparser.py +178 -84
  15. cua_agent-0.4.8/agent/loops/openai.py +235 -0
  16. {cua_agent-0.4.7 → cua_agent-0.4.8}/agent/loops/uitars.py +305 -178
  17. cua_agent-0.4.8/agent/responses.py +683 -0
  18. {cua_agent-0.4.7 → cua_agent-0.4.8}/agent/types.py +7 -5
  19. {cua_agent-0.4.7 → cua_agent-0.4.8}/pyproject.toml +2 -2
  20. cua_agent-0.4.7/agent/decorators.py +0 -90
  21. cua_agent-0.4.7/agent/loops/openai.py +0 -95
  22. cua_agent-0.4.7/agent/responses.py +0 -207
  23. {cua_agent-0.4.7 → cua_agent-0.4.8}/README.md +0 -0
  24. {cua_agent-0.4.7 → cua_agent-0.4.8}/agent/__main__.py +0 -0
  25. {cua_agent-0.4.7 → cua_agent-0.4.8}/agent/adapters/__init__.py +0 -0
  26. {cua_agent-0.4.7 → cua_agent-0.4.8}/agent/callbacks/__init__.py +0 -0
  27. {cua_agent-0.4.7 → cua_agent-0.4.8}/agent/callbacks/base.py +0 -0
  28. {cua_agent-0.4.7 → cua_agent-0.4.8}/agent/callbacks/budget_manager.py +0 -0
  29. {cua_agent-0.4.7 → cua_agent-0.4.8}/agent/callbacks/image_retention.py +0 -0
  30. {cua_agent-0.4.7 → cua_agent-0.4.8}/agent/callbacks/logging.py +0 -0
  31. {cua_agent-0.4.7 → cua_agent-0.4.8}/agent/callbacks/pii_anonymization.py +0 -0
  32. {cua_agent-0.4.7 → cua_agent-0.4.8}/agent/callbacks/telemetry.py +0 -0
  33. {cua_agent-0.4.7 → cua_agent-0.4.8}/agent/callbacks/trajectory_saver.py +0 -0
  34. {cua_agent-0.4.7 → cua_agent-0.4.8}/agent/telemetry.py +0 -0
  35. {cua_agent-0.4.7 → cua_agent-0.4.8}/agent/ui/__init__.py +0 -0
  36. {cua_agent-0.4.7 → cua_agent-0.4.8}/agent/ui/__main__.py +0 -0
  37. {cua_agent-0.4.7 → cua_agent-0.4.8}/agent/ui/gradio/__init__.py +0 -0
  38. {cua_agent-0.4.7 → cua_agent-0.4.8}/agent/ui/gradio/app.py +0 -0
  39. {cua_agent-0.4.7 → cua_agent-0.4.8}/agent/ui/gradio/ui_components.py +0 -0
--- cua_agent-0.4.7/PKG-INFO
+++ cua_agent-0.4.8/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cua-agent
-Version: 0.4.7
+Version: 0.4.8
 Summary: CUA (Computer Use) Agent for AI-driven computer interaction
 Author-Email: TryCua <gh@trycua.com>
 Requires-Python: >=3.11
@@ -15,7 +15,7 @@ Requires-Dist: python-dotenv>=1.0.1
 Requires-Dist: cua-computer<0.5.0,>=0.4.0
 Requires-Dist: cua-core<0.2.0,>=0.1.8
 Requires-Dist: certifi>=2024.2.2
-Requires-Dist: litellm>=1.74.8
+Requires-Dist: litellm>=1.74.12
 Provides-Extra: openai
 Provides-Extra: anthropic
 Provides-Extra: omni
--- cua_agent-0.4.7/agent/__init__.py
+++ cua_agent-0.4.8/agent/__init__.py
@@ -5,7 +5,7 @@ agent - Decorator-based Computer Use Agent with liteLLM integration
 import logging
 import sys
 
-from .decorators import agent_loop
+from .decorators import register_agent
 from .agent import ComputerAgent
 from .types import Messages, AgentResponse
 
@@ -13,7 +13,7 @@ from .types import Messages, AgentResponse
 from . import loops
 
 __all__ = [
-    "agent_loop",
+    "register_agent",
     "ComputerAgent",
     "Messages",
     "AgentResponse"
--- cua_agent-0.4.7/agent/adapters/huggingfacelocal_adapter.py
+++ cua_agent-0.4.8/agent/adapters/huggingfacelocal_adapter.py
@@ -48,7 +48,11 @@ class HuggingFaceLocalAdapter(CustomLLM):
         )
 
         # Load processor
-        processor = AutoProcessor.from_pretrained(model_name)
+        processor = AutoProcessor.from_pretrained(
+            model_name,
+            min_pixels=3136,
+            max_pixels=4096 * 2160
+        )
 
         # Cache them
         self.models[model_name] = model
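Note: the new min_pixels/max_pixels bounds read like the Qwen-VL-style processor pixel budget (an inference from the values, not something this diff states): the floor is roughly four 28×28 vision patches, the ceiling a full 4K frame. A trivially runnable sketch of that arithmetic:

# Assumed interpretation of the new AutoProcessor kwargs (Qwen-VL-style
# pixel budget); the variable names are for illustration only.
min_pixels = 4 * 28 * 28    # 3136 px: about four 28x28 vision patches
max_pixels = 4096 * 2160    # 8,847,360 px: a full 4K frame
assert min_pixels == 3136
assert max_pixels == 8_847_360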
--- cua_agent-0.4.7/agent/agent.py
+++ cua_agent-0.4.8/agent/agent.py
@@ -3,12 +3,12 @@ ComputerAgent - Main agent class that selects and runs agent loops
 """
 
 import asyncio
-from typing import Dict, List, Any, Optional, AsyncGenerator, Union, cast, Callable, Set
+from typing import Dict, List, Any, Optional, AsyncGenerator, Union, cast, Callable, Set, Tuple
 
 from litellm.responses.utils import Usage
 
-from .types import Messages, Computer
-from .decorators import find_agent_loop
+from .types import Messages, Computer, AgentCapability
+from .decorators import find_agent_config
 from .computer_handler import OpenAIComputerHandler, acknowledge_safety_check_callback, check_blocklisted_url
 import json
 import litellm
@@ -117,6 +117,13 @@ def sanitize_message(msg: Any) -> Any:
         return sanitized
     return msg
 
+def get_output_call_ids(messages: List[Dict[str, Any]]) -> List[str]:
+    call_ids = []
+    for message in messages:
+        if message.get("type") == "computer_call_output" or message.get("type") == "function_call_output":
+            call_ids.append(message.get("call_id"))
+    return call_ids
+
 class ComputerAgent:
     """
     Main agent class that automatically selects the appropriate agent loop
@@ -207,19 +214,21 @@ class ComputerAgent:
            litellm.custom_provider_map = [
                {"provider": "huggingface-local", "custom_handler": hf_adapter}
            ]
+        litellm.suppress_debug_info = True
 
        # == Initialize computer agent ==
 
        # Find the appropriate agent loop
        if custom_loop:
            self.agent_loop = custom_loop
-           self.agent_loop_info = None
+           self.agent_config_info = None
        else:
-           loop_info = find_agent_loop(model)
-           if not loop_info:
-               raise ValueError(f"No agent loop found for model: {model}")
-           self.agent_loop = loop_info.func
-           self.agent_loop_info = loop_info
+           config_info = find_agent_config(model)
+           if not config_info:
+               raise ValueError(f"No agent config found for model: {model}")
+           # Instantiate the agent config class
+           self.agent_loop = config_info.agent_class()
+           self.agent_config_info = config_info
 
        self.tool_schemas = []
        self.computer_handler = None
@@ -389,8 +398,10 @@ class ComputerAgent:
    # AGENT OUTPUT PROCESSING
    # ============================================================================
 
-   async def _handle_item(self, item: Any, computer: Optional[Computer] = None) -> List[Dict[str, Any]]:
+   async def _handle_item(self, item: Any, computer: Optional[Computer] = None, ignore_call_ids: Optional[List[str]] = None) -> List[Dict[str, Any]]:
        """Handle each item; may cause a computer action + screenshot."""
+       if ignore_call_ids and item.get("call_id") and item.get("call_id") in ignore_call_ids:
+           return []
 
        item_type = item.get("type", None)
 
@@ -439,7 +450,7 @@ class ComputerAgent:
            acknowledged_checks = []
            for check in pending_checks:
                check_message = check.get("message", str(check))
-               if acknowledge_safety_check_callback(check_message):
+               if acknowledge_safety_check_callback(check_message, allow_always=True): # TODO: implement a callback for safety checks
                    acknowledged_checks.append(check)
                else:
                    raise ValueError(f"Safety check failed: {check_message}")
@@ -514,6 +525,12 @@ class ComputerAgent:
        Returns:
            AsyncGenerator that yields response chunks
        """
+       if not self.agent_config_info:
+           raise ValueError("Agent configuration not found")
+
+       capabilities = self.get_capabilities()
+       if "step" not in capabilities:
+           raise ValueError(f"Agent loop {self.agent_config_info.agent_class.__name__} does not support step predictions")
 
        await self._initialize_computers()
 
@@ -528,7 +545,7 @@ class ComputerAgent:
            "messages": messages,
            "stream": stream,
            "model": self.model,
-           "agent_loop": self.agent_loop.__name__,
+           "agent_loop": self.agent_config_info.agent_class.__name__,
            **merged_kwargs
        }
        await self._on_run_start(run_kwargs, old_items)
@@ -558,7 +575,7 @@ class ComputerAgent:
            }
 
            # Run agent loop iteration
-           result = await self.agent_loop(
+           result = await self.agent_loop.predict_step(
                **loop_kwargs,
                _on_api_start=self._on_api_start,
                _on_api_end=self._on_api_end,
@@ -579,9 +596,12 @@ class ComputerAgent:
            # Add agent response to new_items
            new_items += result.get("output")
 
+           # Get output call ids
+           output_call_ids = get_output_call_ids(result.get("output", []))
+
            # Handle computer actions
            for item in result.get("output"):
-               partial_items = await self._handle_item(item, self.computer_handler)
+               partial_items = await self._handle_item(item, self.computer_handler, ignore_call_ids=output_call_ids)
                new_items += partial_items
 
            # Yield partial response
@@ -594,4 +614,51 @@ class ComputerAgent:
                )
            }
 
-       await self._on_run_end(loop_kwargs, old_items, new_items)
+       await self._on_run_end(loop_kwargs, old_items, new_items)
+
+   async def predict_click(
+       self,
+       instruction: str,
+       image_b64: Optional[str] = None
+   ) -> Optional[Tuple[int, int]]:
+       """
+       Predict click coordinates based on image and instruction.
+
+       Args:
+           instruction: Instruction for where to click
+           image_b64: Base64 encoded image (optional, will take screenshot if not provided)
+
+       Returns:
+           None or tuple with (x, y) coordinates
+       """
+       if not self.agent_config_info:
+           raise ValueError("Agent configuration not found")
+
+       capabilities = self.get_capabilities()
+       if "click" not in capabilities:
+           raise ValueError(f"Agent loop {self.agent_config_info.agent_class.__name__} does not support click predictions")
+       if hasattr(self.agent_loop, 'predict_click'):
+           if not image_b64:
+               if not self.computer_handler:
+                   raise ValueError("Computer tool or image_b64 is required for predict_click")
+               image_b64 = await self.computer_handler.screenshot()
+           return await self.agent_loop.predict_click(
+               model=self.model,
+               image_b64=image_b64,
+               instruction=instruction
+           )
+       return None
+
+   def get_capabilities(self) -> List[AgentCapability]:
+       """
+       Get list of capabilities supported by the current agent config.
+
+       Returns:
+           List of capability strings (e.g., ["step", "click"])
+       """
+       if not self.agent_config_info:
+           raise ValueError("Agent configuration not found")
+
+       if hasattr(self.agent_loop, 'get_capabilities'):
+           return self.agent_loop.get_capabilities()
+       return ["step"]  # Default capability
--- cua_agent-0.4.7/agent/cli.py
+++ cua_agent-0.4.8/agent/cli.py
@@ -120,7 +120,7 @@ async def ainput(prompt: str = ""):
 
 async def chat_loop(agent, model: str, container_name: str, initial_prompt: str = "", show_usage: bool = True):
     """Main chat loop with the agent."""
-    print_welcome(model, agent.agent_loop.__name__, container_name)
+    print_welcome(model, agent.agent_config_info.agent_class.__name__, container_name)
 
     history = []
 
@@ -130,7 +130,7 @@ async def chat_loop(agent, model: str, container_name: str, initial_prompt: str
     total_cost = 0
 
     while True:
-        if history[-1].get("role") != "user":
+        if len(history) == 0 or history[-1].get("role") != "user":
            # Get user input with prompt
            print_colored("> ", end="")
            user_input = await ainput()
@@ -260,7 +260,12 @@ Examples:
        help="Show total cost of the agent runs"
    )
 
-
+   parser.add_argument(
+       "-r", "--max-retries",
+       type=int,
+       default=3,
+       help="Maximum number of retries for the LLM API calls"
+   )
 
    args = parser.parse_args()
 
@@ -327,6 +332,7 @@ Examples:
        "model": args.model,
        "tools": [computer],
        "verbosity": 20 if args.verbose else 30, # DEBUG vs WARNING
+       "max_retries": args.max_retries
    }
 
    if args.images > 0:
--- cua_agent-0.4.7/agent/computer_handler.py
+++ cua_agent-0.4.8/agent/computer_handler.py
@@ -93,8 +93,10 @@ class OpenAIComputerHandler:
        return ""
 
 
-def acknowledge_safety_check_callback(message: str) -> bool:
+def acknowledge_safety_check_callback(message: str, allow_always: bool = False) -> bool:
    """Safety check callback for user acknowledgment."""
+   if allow_always:
+       return True
    response = input(
        f"Safety Check Warning: {message}\nDo you want to acknowledge and proceed? (y/n): "
    ).lower()
--- /dev/null
+++ cua_agent-0.4.8/agent/decorators.py
@@ -0,0 +1,52 @@
+"""
+Decorators for agent - agent_loop decorator
+"""
+
+from typing import List, Optional
+from .types import AgentConfigInfo
+
+# Global registry
+_agent_configs: List[AgentConfigInfo] = []
+
+def register_agent(models: str, priority: int = 0):
+    """
+    Decorator to register an AsyncAgentConfig class.
+
+    Args:
+        models: Regex pattern to match supported models
+        priority: Priority for agent selection (higher = more priority)
+    """
+    def decorator(agent_class: type):
+        # Validate that the class implements AsyncAgentConfig protocol
+        if not hasattr(agent_class, 'predict_step'):
+            raise ValueError(f"Agent class {agent_class.__name__} must implement predict_step method")
+        if not hasattr(agent_class, 'predict_click'):
+            raise ValueError(f"Agent class {agent_class.__name__} must implement predict_click method")
+        if not hasattr(agent_class, 'get_capabilities'):
+            raise ValueError(f"Agent class {agent_class.__name__} must implement get_capabilities method")
+
+        # Register the agent config
+        config_info = AgentConfigInfo(
+            agent_class=agent_class,
+            models_regex=models,
+            priority=priority
+        )
+        _agent_configs.append(config_info)
+
+        # Sort by priority (highest first)
+        _agent_configs.sort(key=lambda x: x.priority, reverse=True)
+
+        return agent_class
+
+    return decorator
+
+def get_agent_configs() -> List[AgentConfigInfo]:
+    """Get all registered agent configs"""
+    return _agent_configs.copy()
+
+def find_agent_config(model: str) -> Optional[AgentConfigInfo]:
+    """Find the best matching agent config for a model"""
+    for config_info in _agent_configs:
+        if config_info.matches_model(model):
+            return config_info
+    return None
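Note: this new registry is the hinge of the release — register_agent validates and stores config classes, and find_agent_config resolves a model string against them in priority order. A hypothetical registration, sketched from the validation checks above (the class and regex are invented for illustration):

from typing import Any, Dict, List, Optional, Tuple
from agent.decorators import register_agent, find_agent_config

@register_agent(models=r".*my-model.*", priority=10)
class MyAgentConfig:
    # All three methods are mandatory; register_agent raises ValueError
    # if any of them is missing.
    async def predict_step(self, messages, model, tools=None, **kwargs) -> Dict[str, Any]:
        return {"output": [], "usage": {}}  # responses items + usage

    async def predict_click(self, model: str, image_b64: str, instruction: str, **kwargs) -> Optional[Tuple[int, int]]:
        return None  # no grounding support in this stub

    def get_capabilities(self) -> List[str]:
        return ["step"]

# Resolution walks the priority-sorted registry; matches_model() applies
# the stored regex to the model string.
config_info = find_agent_config("my-model-v1")
assert config_info is not None and config_info.agent_class is MyAgentConfig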
--- cua_agent-0.4.7/agent/loops/__init__.py
+++ cua_agent-0.4.8/agent/loops/__init__.py
@@ -7,5 +7,7 @@ from . import anthropic
 from . import openai
 from . import uitars
 from . import omniparser
+from . import gta1
+from . import composed_grounded
 
-__all__ = ["anthropic", "openai", "uitars", "omniparser"]
+__all__ = ["anthropic", "openai", "uitars", "omniparser", "gta1", "composed_grounded"]
--- cua_agent-0.4.7/agent/loops/anthropic.py
+++ cua_agent-0.4.8/agent/loops/anthropic.py
@@ -4,12 +4,13 @@ Anthropic hosted tools agent loop implementation using liteLLM
 
 import asyncio
 import json
-from typing import Dict, List, Any, AsyncGenerator, Union, Optional
+from typing import Dict, List, Any, AsyncGenerator, Union, Optional, Tuple
 import litellm
 from litellm.responses.litellm_completion_transformation.transformation import LiteLLMCompletionResponsesConfig
 
-from ..decorators import agent_loop
-from ..types import Messages, AgentResponse, Tools
+from ..decorators import register_agent
+from ..types import Messages, AgentResponse, Tools, AgentCapability
+from ..loops.base import AsyncAgentConfig
 from ..responses import (
     make_reasoning_item,
     make_output_text_item,
@@ -64,21 +65,28 @@ def _get_tool_config_for_model(model: str) -> Dict[str, str]:
        "beta_flag": "computer-use-2024-10-22"
    }
 
-def _map_computer_tool_to_anthropic(computer_tool: Any, tool_version: str) -> Dict[str, Any]:
+async def _map_computer_tool_to_anthropic(computer_tool: Any, tool_version: str) -> Dict[str, Any]:
    """Map a computer tool to Anthropic's hosted tool schema."""
+   # Get dimensions from the computer handler
+   try:
+       width, height = await computer_tool.get_dimensions()
+   except Exception:
+       # Fallback to default dimensions if method fails
+       width, height = 1024, 768
+
    return {
        "type": tool_version,
        "function": {
            "name": "computer",
            "parameters": {
-               "display_height_px": getattr(computer_tool, 'display_height', 768),
-               "display_width_px": getattr(computer_tool, 'display_width', 1024),
-               "display_number": getattr(computer_tool, 'display_number', 1),
+               "display_height_px": height,
+               "display_width_px": width,
+               "display_number": 1,
            },
        },
    }
 
-def _prepare_tools_for_anthropic(tool_schemas: List[Dict[str, Any]], model: str) -> Tools:
+async def _prepare_tools_for_anthropic(tool_schemas: List[Dict[str, Any]], model: str) -> Tools:
    """Prepare tools for Anthropic API format."""
    tool_config = _get_tool_config_for_model(model)
    anthropic_tools = []
@@ -86,7 +94,7 @@ def _prepare_tools_for_anthropic(tool_schemas: List[Dict[str, Any]], model: str)
    for schema in tool_schemas:
        if schema["type"] == "computer":
            # Map computer tool to Anthropic format
-           anthropic_tools.append(_map_computer_tool_to_anthropic(
+           anthropic_tools.append(await _map_computer_tool_to_anthropic(
                schema["computer"],
                tool_config["tool_version"]
            ))
@@ -1284,84 +1292,192 @@ def _merge_consecutive_text(content_list: List[Dict[str, Any]]) -> List[Dict[str
 
    return merged
 
-@agent_loop(models=r".*claude-.*", priority=5)
-async def anthropic_hosted_tools_loop(
-   messages: Messages,
-   model: str,
-   tools: Optional[List[Dict[str, Any]]] = None,
-   max_retries: Optional[int] = None,
-   stream: bool = False,
-   computer_handler=None,
-   use_prompt_caching: Optional[bool] = False,
-   _on_api_start=None,
-   _on_api_end=None,
-   _on_usage=None,
-   _on_screenshot=None,
-   **kwargs
-) -> Union[AgentResponse, AsyncGenerator[Dict[str, Any], None]]:
-   """
-   Anthropic hosted tools agent loop using liteLLM acompletion.
-
-   Supports Anthropic's computer use models with hosted tools.
-   """
-   tools = tools or []
-
-   # Get tool configuration for this model
-   tool_config = _get_tool_config_for_model(model)
+@register_agent(models=r".*claude-.*")
+class AnthropicHostedToolsConfig(AsyncAgentConfig):
+   """Anthropic hosted tools agent configuration implementing AsyncAgentConfig protocol."""
 
-   # Prepare tools for Anthropic API
-   anthropic_tools = _prepare_tools_for_anthropic(tools, model)
-
-   # Convert responses_items messages to completion format
-   completion_messages = _convert_responses_items_to_completion_messages(messages)
-   if use_prompt_caching:
-       # First combine messages to reduce number of blocks
-       completion_messages = _combine_completion_messages(completion_messages)
-       # Then add cache control, anthropic requires explicit "cache_control" dicts
-       completion_messages = _add_cache_control(completion_messages)
-
-   # Prepare API call kwargs
-   api_kwargs = {
-       "model": model,
-       "messages": completion_messages,
-       "tools": anthropic_tools if anthropic_tools else None,
-       "stream": stream,
-       "num_retries": max_retries,
+   async def predict_step(
+       self,
+       messages: Messages,
+       model: str,
+       tools: Optional[List[Dict[str, Any]]] = None,
+       max_retries: Optional[int] = None,
+       stream: bool = False,
+       computer_handler=None,
+       use_prompt_caching: Optional[bool] = False,
+       _on_api_start=None,
+       _on_api_end=None,
+       _on_usage=None,
+       _on_screenshot=None,
        **kwargs
-   }
-
-   # Add beta header for computer use
-   if anthropic_tools:
-       api_kwargs["headers"] = {
-           "anthropic-beta": tool_config["beta_flag"]
+   ) -> Dict[str, Any]:
+       """
+       Anthropic hosted tools agent loop using liteLLM acompletion.
+
+       Supports Anthropic's computer use models with hosted tools.
+       """
+       tools = tools or []
+
+       # Get tool configuration for this model
+       tool_config = _get_tool_config_for_model(model)
+
+       # Prepare tools for Anthropic API
+       anthropic_tools = await _prepare_tools_for_anthropic(tools, model)
+
+       # Convert responses_items messages to completion format
+       completion_messages = _convert_responses_items_to_completion_messages(messages)
+       if use_prompt_caching:
+           # First combine messages to reduce number of blocks
+           completion_messages = _combine_completion_messages(completion_messages)
+           # Then add cache control, anthropic requires explicit "cache_control" dicts
+           completion_messages = _add_cache_control(completion_messages)
+
+       # Prepare API call kwargs
+       api_kwargs = {
+           "model": model,
+           "messages": completion_messages,
+           "tools": anthropic_tools if anthropic_tools else None,
+           "stream": stream,
+           "num_retries": max_retries,
+           **kwargs
        }
-
-   # Call API start hook
-   if _on_api_start:
-       await _on_api_start(api_kwargs)
-
-   # Use liteLLM acompletion
-   response = await litellm.acompletion(**api_kwargs)
-
-   # Call API end hook
-   if _on_api_end:
-       await _on_api_end(api_kwargs, response)
-
-   # Convert response to responses_items format
-   responses_items = _convert_completion_to_responses_items(response)
+
+       # Add beta header for computer use
+       if anthropic_tools:
+           api_kwargs["headers"] = {
+               "anthropic-beta": tool_config["beta_flag"]
+           }
+
+       # Call API start hook
+       if _on_api_start:
+           await _on_api_start(api_kwargs)
+
+       # Use liteLLM acompletion
+       response = await litellm.acompletion(**api_kwargs)
+
+       # Call API end hook
+       if _on_api_end:
+           await _on_api_end(api_kwargs, response)
+
+       # Convert response to responses_items format
+       responses_items = _convert_completion_to_responses_items(response)
 
-   # Extract usage information
-   responses_usage = {
-       **LiteLLMCompletionResponsesConfig._transform_chat_completion_usage_to_responses_usage(response.usage).model_dump(),
-       "response_cost": response._hidden_params.get("response_cost", 0.0),
-   }
-   if _on_usage:
-       await _on_usage(responses_usage)
+       # Extract usage information
+       responses_usage = {
+           **LiteLLMCompletionResponsesConfig._transform_chat_completion_usage_to_responses_usage(response.usage).model_dump(),
+           "response_cost": response._hidden_params.get("response_cost", 0.0),
+       }
+       if _on_usage:
+           await _on_usage(responses_usage)
 
-   # Create agent response
-   agent_response = {
-       "output": responses_items,
-       "usage": responses_usage
-   }
+       # Return in AsyncAgentConfig format
+       return {
+           "output": responses_items,
+           "usage": responses_usage
+       }
+
+   async def predict_click(
+       self,
+       model: str,
+       image_b64: str,
+       instruction: str,
+       **kwargs
+   ) -> Optional[Tuple[int, int]]:
+       """
+       Predict click coordinates based on image and instruction.
+
+       Uses Anthropic's computer use models with a custom prompt that instructs
+       the agent to only output clicks.
+
+       Args:
+           model: Model name to use
+           image_b64: Base64 encoded image
+           instruction: Instruction for where to click
+
+       Returns:
+           Tuple of (x, y) coordinates or None if prediction fails
+       """
+       # Get image dimensions from base64 data
+       try:
+           import base64
+           from PIL import Image
+           from io import BytesIO
+
+           image_data = base64.b64decode(image_b64)
+           image = Image.open(BytesIO(image_data))
+           display_width, display_height = image.size
+       except Exception:
+           # Fallback to default dimensions if image parsing fails
+           display_width, display_height = 1024, 768
+
+       # Get tool configuration for this model
+       tool_config = _get_tool_config_for_model(model)
+
+       # Prepare computer tool for Anthropic format
+       computer_tool = {
+           "type": tool_config["tool_version"],
+           "function": {
+               "name": "computer",
+               "parameters": {
+                   "display_height_px": display_height,
+                   "display_width_px": display_width,
+                   "display_number": 1,
+               },
+           },
+       }
+
+       # Construct messages in OpenAI chat completion format for liteLLM
+       messages = [
+           {
+               "role": "user",
+               "content": [
+                   {
+                       "type": "text",
+                       "text": f"You are a UI grounding expert. Look at the image and {instruction}. Output ONLY a click action on the target element. No explanations, confirmations, or additional text."
+                   },
+                   {
+                       "type": "image_url",
+                       "image_url": {
+                           "url": f"data:image/png;base64,{image_b64}"
+                       }
+                   }
+               ]
+           }
+       ]
+
+       # Prepare API call kwargs
+       api_kwargs = {
+           "model": model,
+           "messages": messages,
+           "tools": [computer_tool],
+           "stream": False,
+           "max_tokens": 100,  # Keep response short for click prediction
+           "headers": {
+               "anthropic-beta": tool_config["beta_flag"]
+           }
+       }
+
+       # Use liteLLM acompletion
+       response = await litellm.acompletion(**api_kwargs)
+
+       # Convert response to responses_items format to extract click coordinates
+       responses_items = _convert_completion_to_responses_items(response)
+
+       # Look for computer_call with click action
+       for item in responses_items:
+           if (isinstance(item, dict) and
+               item.get("type") == "computer_call" and
+               isinstance(item.get("action"), dict)):
+
+               action = item["action"]
+               if action.get("type") == "click":
+                   x = action.get("x")
+                   y = action.get("y")
+                   if x is not None and y is not None:
+                       return (int(x), int(y))
+
+       return None
 
-   return agent_response
+   def get_capabilities(self) -> List[AgentCapability]:
+       """Return the capabilities supported by this agent."""
+       return ["click", "step"]