cua-agent 0.4.34__py3-none-any.whl → 0.4.35__py3-none-any.whl

This diff shows the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that registry.

Potentially problematic release.


This version of cua-agent might be problematic.

Files changed (61)
  1. agent/__init__.py +4 -10
  2. agent/__main__.py +2 -1
  3. agent/adapters/huggingfacelocal_adapter.py +54 -61
  4. agent/adapters/human_adapter.py +116 -114
  5. agent/adapters/mlxvlm_adapter.py +110 -99
  6. agent/adapters/models/__init__.py +14 -6
  7. agent/adapters/models/generic.py +7 -4
  8. agent/adapters/models/internvl.py +66 -30
  9. agent/adapters/models/opencua.py +23 -8
  10. agent/adapters/models/qwen2_5_vl.py +7 -4
  11. agent/agent.py +184 -158
  12. agent/callbacks/__init__.py +4 -4
  13. agent/callbacks/base.py +45 -31
  14. agent/callbacks/budget_manager.py +22 -10
  15. agent/callbacks/image_retention.py +18 -13
  16. agent/callbacks/logging.py +55 -42
  17. agent/callbacks/operator_validator.py +3 -1
  18. agent/callbacks/pii_anonymization.py +19 -16
  19. agent/callbacks/telemetry.py +67 -61
  20. agent/callbacks/trajectory_saver.py +90 -70
  21. agent/cli.py +115 -110
  22. agent/computers/__init__.py +13 -8
  23. agent/computers/base.py +26 -17
  24. agent/computers/cua.py +27 -23
  25. agent/computers/custom.py +72 -69
  26. agent/decorators.py +23 -14
  27. agent/human_tool/__init__.py +2 -7
  28. agent/human_tool/__main__.py +6 -2
  29. agent/human_tool/server.py +48 -37
  30. agent/human_tool/ui.py +235 -185
  31. agent/integrations/hud/__init__.py +15 -21
  32. agent/integrations/hud/agent.py +101 -83
  33. agent/integrations/hud/proxy.py +90 -57
  34. agent/loops/__init__.py +25 -21
  35. agent/loops/anthropic.py +537 -483
  36. agent/loops/base.py +13 -14
  37. agent/loops/composed_grounded.py +135 -149
  38. agent/loops/gemini.py +31 -12
  39. agent/loops/glm45v.py +135 -133
  40. agent/loops/gta1.py +47 -50
  41. agent/loops/holo.py +4 -2
  42. agent/loops/internvl.py +6 -11
  43. agent/loops/moondream3.py +36 -12
  44. agent/loops/omniparser.py +212 -209
  45. agent/loops/openai.py +49 -50
  46. agent/loops/opencua.py +29 -41
  47. agent/loops/qwen.py +475 -0
  48. agent/loops/uitars.py +237 -202
  49. agent/proxy/examples.py +54 -50
  50. agent/proxy/handlers.py +27 -34
  51. agent/responses.py +330 -330
  52. agent/types.py +11 -5
  53. agent/ui/__init__.py +1 -1
  54. agent/ui/__main__.py +1 -1
  55. agent/ui/gradio/app.py +23 -18
  56. agent/ui/gradio/ui_components.py +310 -161
  57. {cua_agent-0.4.34.dist-info → cua_agent-0.4.35.dist-info}/METADATA +18 -10
  58. cua_agent-0.4.35.dist-info/RECORD +64 -0
  59. cua_agent-0.4.34.dist-info/RECORD +0 -63
  60. {cua_agent-0.4.34.dist-info → cua_agent-0.4.35.dist-info}/WHEEL +0 -0
  61. {cua_agent-0.4.34.dist-info → cua_agent-0.4.35.dist-info}/entry_points.txt +0 -0
agent/loops/base.py CHANGED
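Editor's note: the hunks below are formatting-only (import ordering, trailing commas, whitespace); the AsyncAgentConfig protocol itself is unchanged. As orientation, here is a minimal sketch of a class that satisfies this protocol. The EchoAgentConfig name and its canned return values are hypothetical, and the keyword parameters of predict_step that the hunks elide (tools, computer_handler, the callback hooks) are collapsed into **kwargs.

from typing import Any, Dict, List, Optional, Tuple


class EchoAgentConfig:
    """Hypothetical loop used only to illustrate the AsyncAgentConfig shape."""

    async def predict_step(
        self,
        messages: List[Dict[str, Any]],
        model: str,
        **kwargs: Any,  # tools, computer_handler, callback hooks, ... (elided here)
    ) -> Dict[str, Any]:
        # Return Responses-format output items plus usage accounting.
        return {
            "output": [
                {
                    "type": "message",
                    "role": "assistant",
                    "content": [{"type": "output_text", "text": "No-op step."}],
                }
            ],
            "usage": {"prompt_tokens": 0, "completion_tokens": 0, "response_cost": 0.0},
        }

    async def predict_click(
        self, model: str, image_b64: str, instruction: str
    ) -> Optional[Tuple[int, int]]:
        # A grounding-capable loop would return pixel coordinates here.
        return None

    def get_capabilities(self) -> List[str]:
        # Capability values are strings such as "step" and "click".
        return ["step"]

Concrete loops in this package (anthropic.py, openai.py, the new qwen.py, and so on) follow the same three-method shape and appear to be registered through the register_agent decorator imported by the loop modules.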
@@ -2,13 +2,15 @@
 Base protocol for async agent configurations
 """

-from typing import Protocol, List, Dict, Any, Optional, Tuple, Union
 from abc import abstractmethod
+from typing import Any, Dict, List, Optional, Protocol, Tuple, Union
+
 from ..types import AgentCapability

+
 class AsyncAgentConfig(Protocol):
     """Protocol defining the interface for async agent configurations."""
-
+
     @abstractmethod
     async def predict_step(
         self,
@@ -22,11 +24,11 @@ class AsyncAgentConfig(Protocol):
         _on_api_end=None,
         _on_usage=None,
         _on_screenshot=None,
-        **kwargs
+        **kwargs,
     ) -> Dict[str, Any]:
         """
         Predict the next step based on input items.
-
+
         Args:
             messages: Input items following Responses format (message, function_call, computer_call)
             model: Model name to use
@@ -39,37 +41,34 @@ class AsyncAgentConfig(Protocol):
             _on_usage: Callback for usage tracking
             _on_screenshot: Callback for screenshot events
             **kwargs: Additional arguments
-
+
         Returns:
             Dictionary with "output" (output items) and "usage" array
         """
         ...
-
+
     @abstractmethod
     async def predict_click(
-        self,
-        model: str,
-        image_b64: str,
-        instruction: str
+        self, model: str, image_b64: str, instruction: str
     ) -> Optional[Tuple[int, int]]:
         """
         Predict click coordinates based on image and instruction.
-
+
         Args:
             model: Model name to use
             image_b64: Base64 encoded image
             instruction: Instruction for where to click
-
+
         Returns:
             None or tuple with (x, y) coordinates
        """
         ...
-
+
     @abstractmethod
     def get_capabilities(self) -> List[AgentCapability]:
         """
         Get list of capabilities supported by this agent config.
-
+
         Returns:
             List of capability strings (e.g., ["step", "click"])
         """
agent/loops/composed_grounded.py CHANGED
@@ -3,122 +3,117 @@ Composed-grounded agent loop implementation that combines grounding and thinking
 Uses a two-stage approach: grounding model for element detection, thinking model for reasoning.
 """

-import uuid
 import asyncio
-import json
 import base64
-from typing import Dict, List, Any, Optional, Tuple
+import json
+import uuid
 from io import BytesIO
-from PIL import Image
+from typing import Any, Dict, List, Optional, Tuple
+
 import litellm
+from PIL import Image

+from ..agent import find_agent_config
 from ..decorators import register_agent
-from ..types import Messages, AgentResponse, Tools, AgentCapability
 from ..loops.base import AsyncAgentConfig
 from ..responses import (
-    convert_computer_calls_xy2desc,
-    convert_responses_items_to_completion_messages,
     convert_completion_messages_to_responses_items,
     convert_computer_calls_desc2xy,
-    get_all_element_descriptions
+    convert_computer_calls_xy2desc,
+    convert_responses_items_to_completion_messages,
+    get_all_element_descriptions,
 )
-from ..agent import find_agent_config
+from ..types import AgentCapability, AgentResponse, Messages, Tools

 GROUNDED_COMPUTER_TOOL_SCHEMA = {
-    "type": "function",
-    "function": {
-        "name": "computer",
-        "description": "Control a computer by taking screenshots and interacting with UI elements. This tool uses element descriptions to locate and interact with UI elements on the screen (e.g., 'red submit button', 'search text field', 'hamburger menu icon', 'close button in top right corner').",
-        "parameters": {
-            "type": "object",
-            "properties": {
-                "action": {
-                    "type": "string",
-                    "enum": [
-                        "screenshot",
-                        "click",
-                        "double_click",
-                        "drag",
-                        "type",
-                        "keypress",
-                        "scroll",
-                        "move",
-                        "wait",
-                        "get_current_url",
-                        "get_dimensions",
-                        "get_environment"
-                    ],
-                    "description": "The action to perform (required for all actions)"
-                },
-                "element_description": {
-                    "type": "string",
-                    "description": "Description of the element to interact with (required for click, double_click, move, scroll actions)"
-                },
-                "start_element_description": {
-                    "type": "string",
-                    "description": "Description of the element to start dragging from (required for drag action)"
-                },
-                "end_element_description": {
-                    "type": "string",
-                    "description": "Description of the element to drag to (required for drag action)"
-                },
-                "text": {
-                    "type": "string",
-                    "description": "The text to type (required for type action)"
-                },
-                "keys": {
-                    "type": "array",
-                    "items": {
-                        "type": "string"
+    "type": "function",
+    "function": {
+        "name": "computer",
+        "description": "Control a computer by taking screenshots and interacting with UI elements. This tool uses element descriptions to locate and interact with UI elements on the screen (e.g., 'red submit button', 'search text field', 'hamburger menu icon', 'close button in top right corner').",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "action": {
+                    "type": "string",
+                    "enum": [
+                        "screenshot",
+                        "click",
+                        "double_click",
+                        "drag",
+                        "type",
+                        "keypress",
+                        "scroll",
+                        "move",
+                        "wait",
+                        "get_current_url",
+                        "get_dimensions",
+                        "get_environment",
+                    ],
+                    "description": "The action to perform (required for all actions)",
+                },
+                "element_description": {
+                    "type": "string",
+                    "description": "Description of the element to interact with (required for click, double_click, move, scroll actions)",
+                },
+                "start_element_description": {
+                    "type": "string",
+                    "description": "Description of the element to start dragging from (required for drag action)",
+                },
+                "end_element_description": {
+                    "type": "string",
+                    "description": "Description of the element to drag to (required for drag action)",
+                },
+                "text": {
+                    "type": "string",
+                    "description": "The text to type (required for type action)",
+                },
+                "keys": {
+                    "type": "array",
+                    "items": {"type": "string"},
+                    "description": "Key(s) to press (required for keypress action)",
+                },
+                "button": {
+                    "type": "string",
+                    "enum": ["left", "right", "wheel", "back", "forward"],
+                    "description": "The mouse button to use for click action (required for click and double_click action)",
+                },
+                "scroll_x": {
+                    "type": "integer",
+                    "description": "Horizontal scroll amount for scroll action (required for scroll action)",
+                },
+                "scroll_y": {
+                    "type": "integer",
+                    "description": "Vertical scroll amount for scroll action (required for scroll action)",
+                },
             },
-                    "description": "Key(s) to press (required for keypress action)"
-                },
-                "button": {
-                    "type": "string",
-                    "enum": [
-                        "left",
-                        "right",
-                        "wheel",
-                        "back",
-                        "forward"
-                    ],
-                    "description": "The mouse button to use for click action (required for click and double_click action)",
-                },
-                "scroll_x": {
-                    "type": "integer",
-                    "description": "Horizontal scroll amount for scroll action (required for scroll action)",
+            "required": ["action"],
         },
-                "scroll_y": {
-                    "type": "integer",
-                    "description": "Vertical scroll amount for scroll action (required for scroll action)",
-                },
-            },
-            "required": [
-                "action"
-            ]
-        }
-    }
+    },
 }

+
 def _prepare_tools_for_grounded(tool_schemas: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
     """Prepare tools for grounded API format"""
     grounded_tools = []
-
+
     for schema in tool_schemas:
         if schema["type"] == "computer":
             grounded_tools.append(GROUNDED_COMPUTER_TOOL_SCHEMA)
         else:
             grounded_tools.append(schema)
-
+
     return grounded_tools

+
 def get_last_computer_call_image(messages: List[Dict[str, Any]]) -> Optional[str]:
     """Get the last computer call output image from messages."""
     for message in reversed(messages):
-        if (isinstance(message, dict) and
-            message.get("type") == "computer_call_output" and
-            isinstance(message.get("output"), dict) and
-            message["output"].get("type") == "input_image"):
+        if (
+            isinstance(message, dict)
+            and message.get("type") == "computer_call_output"
+            and isinstance(message.get("output"), dict)
+            and message["output"].get("type") == "input_image"
+        ):
             image_url = message["output"].get("image_url", "")
             if image_url.startswith("data:image/png;base64,"):
                 return image_url.split(",", 1)[1]
@@ -129,14 +124,14 @@ def get_last_computer_call_image(messages: List[Dict[str, Any]]) -> Optional[str
 class ComposedGroundedConfig(AsyncAgentConfig):
     """
     Composed-grounded agent configuration that uses both grounding and thinking models.
-
+
     The model parameter should be in format: "grounding_model+thinking_model"
     e.g., "huggingface-local/HelloKKMe/GTA1-7B+gemini/gemini-1.5-pro"
     """
-
+
     def __init__(self):
         self.desc2xy: Dict[str, Tuple[float, float]] = {}
-
+
     async def predict_step(
         self,
         messages: List[Dict[str, Any]],
@@ -150,11 +145,11 @@ class ComposedGroundedConfig(AsyncAgentConfig):
         _on_api_end=None,
         _on_usage=None,
         _on_screenshot=None,
-        **kwargs
+        **kwargs,
     ) -> Dict[str, Any]:
         """
         Composed-grounded predict step implementation.
-
+
         Process:
         0. Store last computer call image, if none then take a screenshot
         1. Convert computer calls from xy to descriptions
@@ -167,18 +162,20 @@ class ComposedGroundedConfig(AsyncAgentConfig):
         """
         # Parse the composed model
         if "+" not in model:
-            raise ValueError(f"Composed model must be in format 'grounding_model+thinking_model', got: {model}")
+            raise ValueError(
+                f"Composed model must be in format 'grounding_model+thinking_model', got: {model}"
+            )
         grounding_model, thinking_model = model.split("+", 1)
-
+
         pre_output_items = []
-
+
         # Step 0: Store last computer call image, if none then take a screenshot
         last_image_b64 = get_last_computer_call_image(messages)
         if last_image_b64 is None:
             # Take a screenshot
-            screenshot_b64 = await computer_handler.screenshot() # type: ignore
+            screenshot_b64 = await computer_handler.screenshot()  # type: ignore
             if screenshot_b64:
-
+
                 call_id = uuid.uuid4().hex
                 pre_output_items += [
                     {
@@ -187,45 +184,42 @@ class ComposedGroundedConfig(AsyncAgentConfig):
                         "content": [
                             {
                                 "type": "output_text",
-                                "text": "Taking a screenshot to see the current computer screen."
+                                "text": "Taking a screenshot to see the current computer screen.",
                             }
-                        ]
+                        ],
                     },
                     {
-                        "action": {
-                            "type": "screenshot"
-                        },
+                        "action": {"type": "screenshot"},
                         "call_id": call_id,
                         "status": "completed",
-                        "type": "computer_call"
+                        "type": "computer_call",
                     },
                     {
                         "type": "computer_call_output",
                         "call_id": call_id,
                         "output": {
                             "type": "input_image",
-                            "image_url": f"data:image/png;base64,{screenshot_b64}"
-                        }
+                            "image_url": f"data:image/png;base64,{screenshot_b64}",
+                        },
                     },
                 ]
                 last_image_b64 = screenshot_b64
-
+
                 # Call screenshot callback if provided
                 if _on_screenshot:
                     await _on_screenshot(screenshot_b64)
-
-        tool_schemas = _prepare_tools_for_grounded(tools) # type: ignore
+
+        tool_schemas = _prepare_tools_for_grounded(tools)  # type: ignore

         # Step 1: Convert computer calls from xy to descriptions
         input_messages = messages + pre_output_items
         messages_with_descriptions = convert_computer_calls_xy2desc(input_messages, self.desc2xy)
-
+
         # Step 2: Convert responses items to completion messages
         completion_messages = convert_responses_items_to_completion_messages(
-            messages_with_descriptions,
-            allow_images_in_tool_results=False
+            messages_with_descriptions, allow_images_in_tool_results=False
         )
-
+
         # Step 3: Call thinking model with litellm.acompletion
         api_kwargs = {
             "model": thinking_model,
@@ -233,98 +227,90 @@ class ComposedGroundedConfig(AsyncAgentConfig):
             "tools": tool_schemas,
             "max_retries": max_retries,
             "stream": stream,
-            **kwargs
+            **kwargs,
         }

         if use_prompt_caching:
             api_kwargs["use_prompt_caching"] = use_prompt_caching
-
+
         # Call API start hook
         if _on_api_start:
             await _on_api_start(api_kwargs)
-
+
         # Make the completion call
         response = await litellm.acompletion(**api_kwargs)
-
+
         # Call API end hook
         if _on_api_end:
             await _on_api_end(api_kwargs, response)
-
+
         # Extract usage information
         usage = {
-            **response.usage.model_dump(), # type: ignore
+            **response.usage.model_dump(),  # type: ignore
             "response_cost": response._hidden_params.get("response_cost", 0.0),
         }
         if _on_usage:
             await _on_usage(usage)
-
+
         # Step 4: Convert completion messages back to responses items format
-        response_dict = response.model_dump() # type: ignore
+        response_dict = response.model_dump()  # type: ignore
         choice_messages = [choice["message"] for choice in response_dict["choices"]]
         thinking_output_items = []
-
+
         for choice_message in choice_messages:
-            thinking_output_items.extend(convert_completion_messages_to_responses_items([choice_message]))
-
+            thinking_output_items.extend(
+                convert_completion_messages_to_responses_items([choice_message])
+            )
+
         # Step 5: Get all element descriptions and populate desc2xy mapping
         element_descriptions = get_all_element_descriptions(thinking_output_items)
-
+
         if element_descriptions and last_image_b64:
             # Use grounding model to predict coordinates for each description
             grounding_agent_conf = find_agent_config(grounding_model)
             if grounding_agent_conf:
                 grounding_agent = grounding_agent_conf.agent_class()
-
+
                 for desc in element_descriptions:
-                    for _ in range(3): # try 3 times
+                    for _ in range(3):  # try 3 times
                         coords = await grounding_agent.predict_click(
-                            model=grounding_model,
-                            image_b64=last_image_b64,
-                            instruction=desc
+                            model=grounding_model, image_b64=last_image_b64, instruction=desc
                         )
                         if coords:
                             self.desc2xy[desc] = coords
                             break
-
+
         # Step 6: Convert computer calls from descriptions back to xy coordinates
         final_output_items = convert_computer_calls_desc2xy(thinking_output_items, self.desc2xy)
-
+
         # Step 7: Return output and usage
-        return {
-            "output": pre_output_items + final_output_items,
-            "usage": usage
-        }
-
+        return {"output": pre_output_items + final_output_items, "usage": usage}
+
     async def predict_click(
-        self,
-        model: str,
-        image_b64: str,
-        instruction: str,
-        **kwargs
+        self, model: str, image_b64: str, instruction: str, **kwargs
     ) -> Optional[Tuple[int, int]]:
         """
         Predict click coordinates using the grounding model.
-
+
         For composed models, uses only the grounding model part for click prediction.
         """
         # Parse the composed model to get grounding model
         if "+" not in model:
-            raise ValueError(f"Composed model must be in format 'grounding_model+thinking_model', got: {model}")
+            raise ValueError(
+                f"Composed model must be in format 'grounding_model+thinking_model', got: {model}"
+            )
         grounding_model, thinking_model = model.split("+", 1)
-
+
         # Find and use the grounding agent
         grounding_agent_conf = find_agent_config(grounding_model)
         if grounding_agent_conf:
             grounding_agent = grounding_agent_conf.agent_class()
             return await grounding_agent.predict_click(
-                model=grounding_model,
-                image_b64=image_b64,
-                instruction=instruction,
-                **kwargs
+                model=grounding_model, image_b64=image_b64, instruction=instruction, **kwargs
             )
-
+
         return None
-
+
     def get_capabilities(self) -> List[AgentCapability]:
         """Return the capabilities supported by this agent."""
         return ["click", "step"]
agent/loops/gemini.py CHANGED
@@ -29,6 +29,7 @@ def _lazy_import_genai():
     try:
         from google import genai # type: ignore
         from google.genai import types # type: ignore
+
         return genai, types
     except Exception as e: # pragma: no cover
         raise RuntimeError(
@@ -134,7 +135,13 @@ def _map_gemini_fc_to_computer_call(
             dx = magnitude
         elif direction == "left":
             dx = -magnitude
-        action = {"type": "scroll", "scroll_x": dx, "scroll_y": dy, "x": int(screen_w / 2), "y": int(screen_h / 2)}
+        action = {
+            "type": "scroll",
+            "scroll_x": dx,
+            "scroll_y": dy,
+            "x": int(screen_w / 2),
+            "y": int(screen_h / 2),
+        }
     elif name == "scroll_at":
         x = _denormalize(int(args.get("x", 500)), screen_w)
         y = _denormalize(int(args.get("y", 500)), screen_h)
@@ -155,7 +162,14 @@ def _map_gemini_fc_to_computer_call(
         y = _denormalize(int(args.get("y", 0)), screen_h)
         dx = _denormalize(int(args.get("destination_x", x)), screen_w)
         dy = _denormalize(int(args.get("destination_y", y)), screen_h)
-        action = {"type": "drag", "start_x": x, "start_y": y, "end_x": dx, "end_y": dy, "button": "left"}
+        action = {
+            "type": "drag",
+            "start_x": x,
+            "start_y": y,
+            "end_x": dx,
+            "end_y": dy,
+            "button": "left",
+        }
     elif name == "wait_5_seconds":
         action = {"type": "wait"}
     else:
@@ -242,20 +256,25 @@ class GeminiComputerUseConfig(AsyncAgentConfig):
         }

         if _on_api_start:
-            await _on_api_start({
-                "model": api_kwargs["model"],
-                # "contents": api_kwargs["contents"], # Disabled for now
-                "config": api_kwargs["config"],
-            })
+            await _on_api_start(
+                {
+                    "model": api_kwargs["model"],
+                    # "contents": api_kwargs["contents"], # Disabled for now
+                    "config": api_kwargs["config"],
+                }
+            )

         response = client.models.generate_content(**api_kwargs)

         if _on_api_end:
-            await _on_api_end({
-                "model": api_kwargs["model"],
-                # "contents": api_kwargs["contents"], # Disabled for now
-                "config": api_kwargs["config"],
-            }, response)
+            await _on_api_end(
+                {
+                    "model": api_kwargs["model"],
+                    # "contents": api_kwargs["contents"], # Disabled for now
+                    "config": api_kwargs["config"],
+                },
+                response,
+            )

         # Usage (Gemini SDK may not always provide token usage; populate when available)
         usage: Dict[str, Any] = {}
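The mapper above converts Gemini function calls into computer_call actions, passing x and y through _denormalize before use. That helper is not part of this diff; the sketch below is one plausible reading, assuming Gemini Computer Use reports coordinates on a 0-1000 normalized grid, which would explain the 500 defaults and the int(screen_w / 2) centre fallback for document scrolling.

# Hypothetical reconstruction of the _denormalize helper referenced above.
# Assumes coordinates arrive in a 0-1000 normalized space (an assumption, not
# something this diff shows) and maps them onto the real screen size in pixels.
def _denormalize(value: int, screen_dim: int, scale: int = 1000) -> int:
    return int(value * screen_dim / scale)


# Example: scroll_at defaults of (500, 500) land at the centre of a 1920x1080 screen.
print(_denormalize(500, 1920), _denormalize(500, 1080))  # 960 540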