cua-agent 0.4.34__py3-none-any.whl → 0.4.35__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of cua-agent has been flagged as a potentially problematic release.

Files changed (61)
  1. agent/__init__.py +4 -10
  2. agent/__main__.py +2 -1
  3. agent/adapters/huggingfacelocal_adapter.py +54 -61
  4. agent/adapters/human_adapter.py +116 -114
  5. agent/adapters/mlxvlm_adapter.py +110 -99
  6. agent/adapters/models/__init__.py +14 -6
  7. agent/adapters/models/generic.py +7 -4
  8. agent/adapters/models/internvl.py +66 -30
  9. agent/adapters/models/opencua.py +23 -8
  10. agent/adapters/models/qwen2_5_vl.py +7 -4
  11. agent/agent.py +184 -158
  12. agent/callbacks/__init__.py +4 -4
  13. agent/callbacks/base.py +45 -31
  14. agent/callbacks/budget_manager.py +22 -10
  15. agent/callbacks/image_retention.py +18 -13
  16. agent/callbacks/logging.py +55 -42
  17. agent/callbacks/operator_validator.py +3 -1
  18. agent/callbacks/pii_anonymization.py +19 -16
  19. agent/callbacks/telemetry.py +67 -61
  20. agent/callbacks/trajectory_saver.py +90 -70
  21. agent/cli.py +115 -110
  22. agent/computers/__init__.py +13 -8
  23. agent/computers/base.py +26 -17
  24. agent/computers/cua.py +27 -23
  25. agent/computers/custom.py +72 -69
  26. agent/decorators.py +23 -14
  27. agent/human_tool/__init__.py +2 -7
  28. agent/human_tool/__main__.py +6 -2
  29. agent/human_tool/server.py +48 -37
  30. agent/human_tool/ui.py +235 -185
  31. agent/integrations/hud/__init__.py +15 -21
  32. agent/integrations/hud/agent.py +101 -83
  33. agent/integrations/hud/proxy.py +90 -57
  34. agent/loops/__init__.py +25 -21
  35. agent/loops/anthropic.py +537 -483
  36. agent/loops/base.py +13 -14
  37. agent/loops/composed_grounded.py +135 -149
  38. agent/loops/gemini.py +31 -12
  39. agent/loops/glm45v.py +135 -133
  40. agent/loops/gta1.py +47 -50
  41. agent/loops/holo.py +4 -2
  42. agent/loops/internvl.py +6 -11
  43. agent/loops/moondream3.py +36 -12
  44. agent/loops/omniparser.py +212 -209
  45. agent/loops/openai.py +49 -50
  46. agent/loops/opencua.py +29 -41
  47. agent/loops/qwen.py +475 -0
  48. agent/loops/uitars.py +237 -202
  49. agent/proxy/examples.py +54 -50
  50. agent/proxy/handlers.py +27 -34
  51. agent/responses.py +330 -330
  52. agent/types.py +11 -5
  53. agent/ui/__init__.py +1 -1
  54. agent/ui/__main__.py +1 -1
  55. agent/ui/gradio/app.py +23 -18
  56. agent/ui/gradio/ui_components.py +310 -161
  57. {cua_agent-0.4.34.dist-info → cua_agent-0.4.35.dist-info}/METADATA +18 -10
  58. cua_agent-0.4.35.dist-info/RECORD +64 -0
  59. cua_agent-0.4.34.dist-info/RECORD +0 -63
  60. {cua_agent-0.4.34.dist-info → cua_agent-0.4.35.dist-info}/WHEEL +0 -0
  61. {cua_agent-0.4.34.dist-info → cua_agent-0.4.35.dist-info}/entry_points.txt +0 -0
--- a/agent/integrations/hud/__init__.py
+++ b/agent/integrations/hud/__init__.py
@@ -8,21 +8,22 @@ Exports:
 - run_full_dataset(dataset, ...)
 - MCPComputerAgent
 """
+
 import time
 from typing import Any, Optional
 
 from agent.computers import is_agent_computer
-from datasets import load_dataset, Dataset
-from hud.datasets import Task, run_dataset
+from datasets import Dataset, load_dataset
 from hud import trace
+from hud.datasets import Task, run_dataset
 
 from .agent import MCPComputerAgent
 
-
 # ---------------------------------------------------------------------------
 # Single-task runner
 # ---------------------------------------------------------------------------
 
+
 async def run_single_task(
     dataset: str | Dataset | list[dict[str, Any]],
     *,
@@ -47,24 +48,20 @@ async def run_single_task(
 
     # Load dataset and pick a sample
     if isinstance(dataset, str):
-        dataset = load_dataset(dataset, split="train") # type: ignore[arg-type]
+        dataset = load_dataset(dataset, split="train")  # type: ignore[arg-type]
     elif isinstance(dataset, list):
         dataset = dataset
     else:
         dataset = dataset["train"]
-
+
     sample_task = dataset[task_id]  # type: ignore[index]
     task_prompt = sample_task.get("prompt", f"Task {sample_task.get('id', 0)}")  # type: ignore[attr-defined]
 
     # Filter any existing Computer tools
     # The eval framework will add its own Computer tool per task
     if tools:
-        tools = [
-            tool
-            for tool in tools
-            if not is_agent_computer(tool)
-        ]
-
+        tools = [tool for tool in tools if not is_agent_computer(tool)]
+
     with trace(name=task_prompt):
         task = Task(**sample_task)  # type: ignore[arg-type]
 
@@ -87,13 +84,14 @@ async def run_single_task(
         )
         print(f"Running: {task_prompt}")
         result = await agent.run(task, max_steps=10)
-        print(f"✅ Reward: {getattr(result, 'reward')}")
+        print(f"✅ Reward: {result.reward}")
 
 
 # ---------------------------------------------------------------------------
 # Full-dataset runner
 # ---------------------------------------------------------------------------
 
+
 async def run_full_dataset(
     dataset: str | Dataset | list[dict[str, Any]],
     *,
@@ -121,9 +119,9 @@
 
     # Run with our MCP-based agent class.
     if isinstance(dataset, str):
-        dataset_name = dataset.split('/')[-1]
+        dataset_name = dataset.split("/")[-1]
         job_name = job_name or f"Evaluation {dataset_name}"
-        dataset = load_dataset(dataset, split=split) # type: ignore[arg-type]
+        dataset = load_dataset(dataset, split=split)  # type: ignore[arg-type]
     else:
         dataset_name = "custom"
         job_name = job_name or f"Evaluation {time.strftime('%H:%M %Y-%m-%d')}"
@@ -131,12 +129,8 @@
     # Filter any existing Computer tools
     # The eval framework will add its own Computer tool per task
     if tools:
-        tools = [
-            tool
-            for tool in tools
-            if not is_agent_computer(tool)
-        ]
-
+        tools = [tool for tool in tools if not is_agent_computer(tool)]
+
     # Execute evaluation
     return await run_dataset(
         name=job_name,
@@ -170,4 +164,4 @@ __all__ = [
     "run_single_task",
     "run_full_dataset",
     "MCPComputerAgent",
-]
+]
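For orientation, a minimal usage sketch of the two runners reformatted above. The dataset name is a placeholder, and any keyword argument other than dataset, task_id, split, and tools (the ones visible in the hunks) is an assumption rather than confirmed API:

    import asyncio

    from agent.integrations.hud import run_full_dataset, run_single_task


    async def main() -> None:
        # Run one sample; task_id indexes into the loaded "train" split.
        await run_single_task("hud-evals/example-tasks", task_id=0)

        # Run a whole split; job_name defaults to f"Evaluation {dataset_name}".
        results = await run_full_dataset("hud-evals/example-tasks", split="train")
        print(results)


    asyncio.run(main())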
--- a/agent/integrations/hud/agent.py
+++ b/agent/integrations/hud/agent.py
@@ -9,26 +9,26 @@ Key differences from the OpenAI OperatorAgent variant:
 - Planning is executed via `ComputerAgent.run(messages)`.
 - The first yielded result per step is returned as the agent response.
 """
+
 from __future__ import annotations
 
+import base64
 import io
+import uuid
+from pathlib import Path
 from typing import Any, ClassVar, Optional
 
+import hud
+import mcp.types as types
 from agent.agent import ComputerAgent as BaseComputerAgent
 from agent.callbacks import PromptInstructionsCallback
 from agent.callbacks.trajectory_saver import TrajectorySaverCallback
+from agent.computers import is_agent_computer
+from agent.responses import make_failed_tool_call_items
 from hud.agents import MCPAgent
 from hud.tools.computer.settings import computer_settings
 from hud.types import AgentResponse, MCPToolCall, MCPToolResult, Trace
-
-from agent.responses import make_failed_tool_call_items
-from agent.computers import is_agent_computer
 from PIL import Image
-import mcp.types as types
-import hud
-import uuid
-import base64
-from pathlib import Path
 
 
 class MCPComputerAgent(MCPAgent):
@@ -114,8 +114,10 @@ class MCPComputerAgent(MCPAgent):
         self.last_screenshot_b64 = None
 
         buffer = io.BytesIO()
-        Image.new('RGB', (self.metadata["display_width"], self.metadata["display_height"])).save(buffer, format='PNG')
-        self.last_screenshot_b64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
+        Image.new("RGB", (self.metadata["display_width"], self.metadata["display_height"])).save(
+            buffer, format="PNG"
+        )
+        self.last_screenshot_b64 = base64.b64encode(buffer.getvalue()).decode("utf-8")
 
         # Ensure a computer shim is present so width/height/environment are known
         computer_shim = {
@@ -128,12 +130,8 @@ class MCPComputerAgent(MCPAgent):
         }
         agent_tools: list[Any] = [computer_shim]
         if tools:
-            agent_tools.extend([
-                tool
-                for tool in tools
-                if not is_agent_computer(tool)
-            ])
-
+            agent_tools.extend([tool for tool in tools if not is_agent_computer(tool)])
+
         agent_kwargs = {
             "model": self.model,
             "trajectory_dir": trajectory_dir,
@@ -150,9 +148,7 @@ class MCPComputerAgent(MCPAgent):
             "telemetry_enabled": telemetry_enabled,
         }
 
-        self.computer_agent = BaseComputerAgent(
-            **agent_kwargs
-        )
+        self.computer_agent = BaseComputerAgent(**agent_kwargs)
 
     async def get_system_messages(self) -> list[Any]:
         """Create initial messages.
@@ -161,9 +157,7 @@ class MCPComputerAgent(MCPAgent):
         """
         return []
 
-    async def format_blocks(
-        self, blocks: list[types.ContentBlock]
-    ) -> list[dict[str, Any]]:
+    async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[dict[str, Any]]:
         """
         Format blocks for OpenAI input format.
 
@@ -200,42 +194,49 @@ class MCPComputerAgent(MCPAgent):
 
         # Call the ComputerAgent LLM API
         async for result in self.computer_agent.run(messages):  # type: ignore[arg-type]
-            items = result['output']
+            items = result["output"]
             if not items or tool_calls:
                 break
 
             for item in items:
-                if item['type'] in ['reasoning', 'message', 'computer_call', 'function_call', 'function_call_output']:
+                if item["type"] in [
+                    "reasoning",
+                    "message",
+                    "computer_call",
+                    "function_call",
+                    "function_call_output",
+                ]:
                     agent_result.append(item)
-
+
                 # Add messages to output text
-                if item['type'] == 'reasoning':
+                if item["type"] == "reasoning":
                     output_text.extend(
-                        f"Reasoning: {summary['text']}"
-                        for summary in item['summary']
+                        f"Reasoning: {summary['text']}" for summary in item["summary"]
                     )
-                elif item['type'] == 'message':
-                    if isinstance(item['content'], list):
+                elif item["type"] == "message":
+                    if isinstance(item["content"], list):
                         output_text.extend(
-                            item['text']
-                            for item in item['content']
-                            if item['type'] == 'output_text'
+                            item["text"]
+                            for item in item["content"]
+                            if item["type"] == "output_text"
                         )
-                    elif isinstance(item['content'], str):
-                        output_text.append(item['content'])
-
+                    elif isinstance(item["content"], str):
+                        output_text.append(item["content"])
+
                 # If we get a tool call, we're not done
-                if item['type'] == 'computer_call':
+                if item["type"] == "computer_call":
                     id = item["call_id"]
-                    tool_calls.append(MCPToolCall(
-                        name="openai_computer",
-                        arguments=item["action"],
-                        id=id,
-                    ))
+                    tool_calls.append(
+                        MCPToolCall(
+                            name="openai_computer",
+                            arguments=item["action"],
+                            id=id,
+                        )
+                    )
                     is_done = False
                     self.tool_call_inputs[id] = agent_result
                     break
-
+
             # if we have tool calls, we should exit the loop
             if tool_calls:
                 break
@@ -247,7 +248,7 @@ class MCPComputerAgent(MCPAgent):
             tool_calls=tool_calls,
             done=is_done,
         )
-
+
     def _log_image(self, image_b64: str):
         callbacks = self.computer_agent.callbacks
         for callback in callbacks:
@@ -257,9 +258,7 @@ class MCPComputerAgent(MCPAgent):
                 callback._save_artifact("screenshot_after", image_bytes)
 
     async def format_tool_results(
-        self,
-        tool_calls: list[MCPToolCall],
-        tool_results: list[MCPToolResult]
+        self, tool_calls: list[MCPToolCall], tool_results: list[MCPToolResult]
     ) -> list[dict[str, Any]]:
         """Extract latest screenshot from tool results in dict form.
 
@@ -274,45 +273,60 @@ class MCPComputerAgent(MCPAgent):
                 previous_output = self.previous_output.copy() or []
 
                 # First we need to remove any pending computer_calls from the end of previous_output
-                while previous_output and previous_output[-1]['type'] == 'computer_call':
+                while previous_output and previous_output[-1]["type"] == "computer_call":
                     previous_output.pop()
                 messages.extend(previous_output)
 
                 # If the call is a 'response', don't add the result
-                if call.name == 'response':
+                if call.name == "response":
                     continue
                 # Otherwise, if we have a result, we should add it to the messages
                 content = [
-                    { "type": "input_text", "text": content.text } if isinstance(content, types.TextContent)
-                    else { "type": "input_image", "image_url": f"data:image/png;base64,{content.data}" } if isinstance(content, types.ImageContent)
-                    else { "type": "input_text", "text": "" }
+                    (
+                        {"type": "input_text", "text": content.text}
+                        if isinstance(content, types.TextContent)
+                        else (
+                            {
+                                "type": "input_image",
+                                "image_url": f"data:image/png;base64,{content.data}",
+                            }
+                            if isinstance(content, types.ImageContent)
+                            else {"type": "input_text", "text": ""}
+                        )
+                    )
                    for content in result.content
                 ]
-                messages.append({
-                    "role": "user",
-                    "content": content,
-                })
+                messages.append(
+                    {
+                        "role": "user",
+                        "content": content,
+                    }
+                )
 
                 continue
-
+
             # Add the assistant's computer call
             messages.extend(self.tool_call_inputs[call.id])
-
+
             if result.isError:
-                error_text = "".join([
-                    content.text
-                    for content in result.content
-                    if isinstance(content, types.TextContent)
-                ])
+                error_text = "".join(
+                    [
+                        content.text
+                        for content in result.content
+                        if isinstance(content, types.TextContent)
+                    ]
+                )
 
                 # Replace computer call with failed tool call
                 messages.pop()
-                messages.extend(make_failed_tool_call_items(
-                    tool_name=call.name,
-                    tool_kwargs=call.arguments or {},
-                    error_message=error_text,
-                    call_id=call.id,
-                ))
+                messages.extend(
+                    make_failed_tool_call_items(
+                        tool_name=call.name,
+                        tool_kwargs=call.arguments or {},
+                        error_message=error_text,
+                        call_id=call.id,
+                    )
+                )
             else:
                 # Get the latest screenshot
                 screenshots = [
@@ -325,23 +339,27 @@ class MCPComputerAgent(MCPAgent):
                 if screenshots:
                     self._log_image(screenshots[0])
                     self.last_screenshot_b64 = screenshots[0]
-                    messages.append({
-                        "type": "computer_call_output",
-                        "call_id": call.id,
-                        "output": {
-                            "type": "input_image",
-                            "image_url": f"data:image/png;base64,{screenshots[0]}"
-                        },
-                    })
+                    messages.append(
+                        {
+                            "type": "computer_call_output",
+                            "call_id": call.id,
+                            "output": {
+                                "type": "input_image",
+                                "image_url": f"data:image/png;base64,{screenshots[0]}",
+                            },
+                        }
+                    )
                 else:
                     # Otherwise, replace computer call with failed tool call
                     messages.pop()
-                    messages.extend(make_failed_tool_call_items(
-                        tool_name=call.name,
-                        tool_kwargs=call.arguments or {},
-                        error_message="No screenshots returned.",
-                        call_id=call.id,
-                    ))
+                    messages.extend(
+                        make_failed_tool_call_items(
+                            tool_name=call.name,
+                            tool_kwargs=call.arguments or {},
+                            error_message="No screenshots returned.",
+                            call_id=call.id,
+                        )
+                    )
 
         return messages
 
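The get_response hunk above implements the single-step contract stated in the module docstring: drain ComputerAgent.run() until the first computer_call, then stop. Distilled into a standalone sketch (the helper name is hypothetical; MCPToolCall and the field names come straight from the hunk):

    from hud.types import MCPToolCall


    async def first_step_tool_calls(computer_agent, messages) -> list[MCPToolCall]:
        # Mirrors get_response: collect output items until a computer_call appears.
        tool_calls: list[MCPToolCall] = []
        async for result in computer_agent.run(messages):
            for item in result["output"]:
                if item["type"] == "computer_call":
                    tool_calls.append(
                        MCPToolCall(
                            name="openai_computer",
                            arguments=item["action"],
                            id=item["call_id"],
                        )
                    )
                    break  # stop scanning items after the first call
            if tool_calls:
                break  # one planning step per get_response invocation
        return tool_calls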
--- a/agent/integrations/hud/proxy.py
+++ b/agent/integrations/hud/proxy.py
@@ -7,30 +7,33 @@ OpenAI-like response blocks. We intentionally only support a single-step call
 by consuming the first yielded result from `ComputerAgent.run()`.
 """
 
-import traceback
 import time
+import traceback
 import uuid
 from typing import Any, Dict, List, Optional
 
 from agent.agent import ComputerAgent as BaseComputerAgent
 from agent.callbacks import PromptInstructionsCallback
-from hud.tools.computer.settings import computer_settings
-from PIL import Image
 from hud.agents import OperatorAgent
+from hud.tools.computer.settings import computer_settings
 
 # OpenAI Responses typed models (required)
 from openai.types.responses import (
     Response,
+    ResponseComputerToolCall,
     ResponseInputParam,
     ResponseOutputItem,
-    ResponseComputerToolCall,
     ResponseOutputMessage,
     ResponseOutputText,
     ResponseReasoningItem,
     ResponseUsage,
 )
+from PIL import Image
 
-def _map_agent_output_to_openai_blocks(output_items: List[Dict[str, Any]]) -> List[ResponseOutputItem]:
+
+def _map_agent_output_to_openai_blocks(
+    output_items: List[Dict[str, Any]],
+) -> List[ResponseOutputItem]:
     """Map our agent output items to OpenAI ResponseOutputItem typed models.
 
     Only a subset is supported: computer_call, assistant message (text), and reasoning.
@@ -40,14 +43,16 @@ def _map_agent_output_to_openai_blocks(output_items: List[Dict[str, Any]]) -> List[ResponseOutputItem]:
     for item in output_items or []:
         t = item.get("type")
         if t == "computer_call":
-            comp = ResponseComputerToolCall.model_validate({
-                "id": item.get("id") or f"cu_{uuid.uuid4().hex}",
-                "type": "computer_call",
-                "call_id": item["call_id"],
-                "action": item["action"],
-                "pending_safety_checks": item.get("pending_safety_checks", []),
-                "status": "completed",
-            })
+            comp = ResponseComputerToolCall.model_validate(
+                {
+                    "id": item.get("id") or f"cu_{uuid.uuid4().hex}",
+                    "type": "computer_call",
+                    "call_id": item["call_id"],
+                    "action": item["action"],
+                    "pending_safety_checks": item.get("pending_safety_checks", []),
+                    "status": "completed",
+                }
+            )
             blocks.append(comp)
             # we will exit early here as the responses api only supports a single step
             break
@@ -55,31 +60,38 @@ def _map_agent_output_to_openai_blocks(output_items: List[Dict[str, Any]]) -> List[ResponseOutputItem]:
             content_blocks: List[ResponseOutputText] = []
             for c in item.get("content", []) or []:
                 content_blocks.append(
-                    ResponseOutputText.model_validate({
-                        "type": "output_text",
-                        "text": c["text"],
-                        "annotations": [],
-                    })
+                    ResponseOutputText.model_validate(
+                        {
+                            "type": "output_text",
+                            "text": c["text"],
+                            "annotations": [],
+                        }
+                    )
                 )
             if content_blocks:
-                msg = ResponseOutputMessage.model_validate({
-                    "id": item.get("id") or f"msg_{uuid.uuid4()}",
-                    "type": "message",
-                    "role": "assistant",
-                    "status": "completed",
-                    "content": [ct.model_dump() for ct in content_blocks],
-                })
+                msg = ResponseOutputMessage.model_validate(
+                    {
+                        "id": item.get("id") or f"msg_{uuid.uuid4()}",
+                        "type": "message",
+                        "role": "assistant",
+                        "status": "completed",
+                        "content": [ct.model_dump() for ct in content_blocks],
+                    }
+                )
                 blocks.append(msg)
         elif t == "reasoning":
-            reasoning = ResponseReasoningItem.model_validate({
-                "id": item.get("id") or f"rsn_{uuid.uuid4()}",
-                "type": "reasoning",
-                "summary": item["summary"],
-            })
+            reasoning = ResponseReasoningItem.model_validate(
+                {
+                    "id": item.get("id") or f"rsn_{uuid.uuid4()}",
+                    "type": "reasoning",
+                    "summary": item["summary"],
+                }
+            )
             blocks.append(reasoning)
         # Unhandled types are ignored
     return blocks
 
+
 def _to_plain_dict_list(items: Any) -> List[Dict[str, Any]]:
     out: List[Dict[str, Any]] = []
     for it in list(items):
@@ -92,6 +104,7 @@ def _to_plain_dict_list(items: Any) -> List[Dict[str, Any]]:
         out.append(dict(it))  # may raise if not mapping
     return out
 
+
 class FakeAsyncOpenAI:
     """Minimal fake OpenAI client with only `responses.create` implemented.
 
@@ -132,10 +145,12 @@ class FakeAsyncOpenAI:
         # Pre-pend instructions message
         effective_input = full_input
         if instructions:
-            effective_input = [{
-                "role": "user",
-                "content": instructions,
-            }] + full_input
+            effective_input = [
+                {
+                    "role": "user",
+                    "content": instructions,
+                }
+            ] + full_input
 
         # Run a single iteration of the ComputerAgent
         agent_result: Optional[Dict[str, Any]] = None
@@ -152,32 +167,43 @@ class FakeAsyncOpenAI:
            blocks_to_cache = full_input + output
            for b in blocks_to_cache:
                bid = getattr(b, "id", None) or f"tmp-{hash(repr(b))}"
-                self.blocks_cache[bid] = b # type: ignore[assignment]
+                self.blocks_cache[bid] = b  # type: ignore[assignment]
                block_ids.append(bid)
            response_id = agent_result.get("id") or f"fake-{int(time.time()*1000)}"
            self.context_cache[response_id] = block_ids
 
            try:
-                return Response.model_validate({
-                    "id": response_id,
-                    "created_at": time.time(),
-                    "object": "response",
-                    "model": model,
-                    "output": output,
-                    "parallel_tool_calls": False,
-                    "tool_choice": "auto",
-                    "tools": [],
-                    "previous_response_id": previous_response_id,
-                    "usage": ResponseUsage.model_validate({
-                        "input_tokens": usage.get("input_tokens", 0),
-                        "output_tokens": usage.get("output_tokens", 0),
-                        "total_tokens": usage.get("total_tokens", 0),
-                        "input_tokens_details": usage.get("input_tokens_details", { "cached_tokens": 0 }),
-                        "output_tokens_details": usage.get("output_tokens_details", { "reasoning_tokens": 0 }),
-                    }),
-                })
+                return Response.model_validate(
+                    {
+                        "id": response_id,
+                        "created_at": time.time(),
+                        "object": "response",
+                        "model": model,
+                        "output": output,
+                        "parallel_tool_calls": False,
+                        "tool_choice": "auto",
+                        "tools": [],
+                        "previous_response_id": previous_response_id,
+                        "usage": ResponseUsage.model_validate(
+                            {
+                                "input_tokens": usage.get("input_tokens", 0),
+                                "output_tokens": usage.get("output_tokens", 0),
+                                "total_tokens": usage.get("total_tokens", 0),
+                                "input_tokens_details": usage.get(
+                                    "input_tokens_details", {"cached_tokens": 0}
+                                ),
+                                "output_tokens_details": usage.get(
+                                    "output_tokens_details", {"reasoning_tokens": 0}
+                                ),
+                            }
+                        ),
+                    }
+                )
            except Exception as e:
-                print(f"Error while validating agent response (attempt {attempt + 1}/{max_retries}): ", e)
+                print(
+                    f"Error while validating agent response (attempt {attempt + 1}/{max_retries}): ",
+                    e,
+                )
                if attempt == max_retries - 1:
                    print(traceback.format_exc())
                    raise e
@@ -221,9 +247,15 @@ class ProxyOperatorAgent(OperatorAgent):
         allowed_tools = allowed_tools or ["openai_computer"]
 
         computer_shim = {
-            'screenshot': lambda: Image.new('RGB', (computer_settings.OPENAI_COMPUTER_WIDTH, computer_settings.OPENAI_COMPUTER_HEIGHT)),
-            'environment': 'linux',
-            'dimensions': (computer_settings.OPENAI_COMPUTER_WIDTH, computer_settings.OPENAI_COMPUTER_HEIGHT)
+            "screenshot": lambda: Image.new(
+                "RGB",
+                (computer_settings.OPENAI_COMPUTER_WIDTH, computer_settings.OPENAI_COMPUTER_HEIGHT),
+            ),
+            "environment": "linux",
+            "dimensions": (
+                computer_settings.OPENAI_COMPUTER_WIDTH,
+                computer_settings.OPENAI_COMPUTER_HEIGHT,
+            ),
         }
         # Build tools ensuring the computer_shim is included
         agent_tools: list[Any] = [computer_shim]
@@ -258,6 +290,7 @@ class ProxyOperatorAgent(OperatorAgent):
             **kwargs,
         )
 
+
 __all__ = [
     "FakeAsyncOpenAI",
     "ProxyOperatorAgent",