cua-agent 0.3.2__py3-none-any.whl → 0.4.0b2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic. Click here for more details.

Files changed (111)
  1. agent/__init__.py +15 -51
  2. agent/__main__.py +21 -0
  3. agent/adapters/__init__.py +9 -0
  4. agent/adapters/huggingfacelocal_adapter.py +229 -0
  5. agent/agent.py +577 -0
  6. agent/callbacks/__init__.py +17 -0
  7. agent/callbacks/base.py +153 -0
  8. agent/callbacks/budget_manager.py +44 -0
  9. agent/callbacks/image_retention.py +139 -0
  10. agent/callbacks/logging.py +247 -0
  11. agent/callbacks/pii_anonymization.py +259 -0
  12. agent/callbacks/trajectory_saver.py +305 -0
  13. agent/cli.py +290 -0
  14. agent/computer_handler.py +107 -0
  15. agent/decorators.py +90 -0
  16. agent/loops/__init__.py +11 -0
  17. agent/loops/anthropic.py +728 -0
  18. agent/loops/omniparser.py +339 -0
  19. agent/loops/openai.py +95 -0
  20. agent/loops/uitars.py +688 -0
  21. agent/responses.py +207 -0
  22. agent/types.py +79 -0
  23. agent/ui/__init__.py +7 -1
  24. agent/ui/gradio/__init__.py +6 -19
  25. agent/ui/gradio/app.py +80 -1299
  26. agent/ui/gradio/ui_components.py +703 -0
  27. cua_agent-0.4.0b2.dist-info/METADATA +424 -0
  28. cua_agent-0.4.0b2.dist-info/RECORD +30 -0
  29. agent/core/__init__.py +0 -27
  30. agent/core/agent.py +0 -210
  31. agent/core/base.py +0 -217
  32. agent/core/callbacks.py +0 -200
  33. agent/core/experiment.py +0 -249
  34. agent/core/factory.py +0 -122
  35. agent/core/messages.py +0 -332
  36. agent/core/provider_config.py +0 -21
  37. agent/core/telemetry.py +0 -142
  38. agent/core/tools/__init__.py +0 -21
  39. agent/core/tools/base.py +0 -74
  40. agent/core/tools/bash.py +0 -52
  41. agent/core/tools/collection.py +0 -46
  42. agent/core/tools/computer.py +0 -113
  43. agent/core/tools/edit.py +0 -67
  44. agent/core/tools/manager.py +0 -56
  45. agent/core/tools.py +0 -32
  46. agent/core/types.py +0 -88
  47. agent/core/visualization.py +0 -197
  48. agent/providers/__init__.py +0 -4
  49. agent/providers/anthropic/__init__.py +0 -6
  50. agent/providers/anthropic/api/client.py +0 -360
  51. agent/providers/anthropic/api/logging.py +0 -150
  52. agent/providers/anthropic/api_handler.py +0 -140
  53. agent/providers/anthropic/callbacks/__init__.py +0 -5
  54. agent/providers/anthropic/callbacks/manager.py +0 -65
  55. agent/providers/anthropic/loop.py +0 -568
  56. agent/providers/anthropic/prompts.py +0 -23
  57. agent/providers/anthropic/response_handler.py +0 -226
  58. agent/providers/anthropic/tools/__init__.py +0 -33
  59. agent/providers/anthropic/tools/base.py +0 -88
  60. agent/providers/anthropic/tools/bash.py +0 -66
  61. agent/providers/anthropic/tools/collection.py +0 -34
  62. agent/providers/anthropic/tools/computer.py +0 -396
  63. agent/providers/anthropic/tools/edit.py +0 -326
  64. agent/providers/anthropic/tools/manager.py +0 -54
  65. agent/providers/anthropic/tools/run.py +0 -42
  66. agent/providers/anthropic/types.py +0 -16
  67. agent/providers/anthropic/utils.py +0 -381
  68. agent/providers/omni/__init__.py +0 -8
  69. agent/providers/omni/api_handler.py +0 -42
  70. agent/providers/omni/clients/anthropic.py +0 -103
  71. agent/providers/omni/clients/base.py +0 -35
  72. agent/providers/omni/clients/oaicompat.py +0 -195
  73. agent/providers/omni/clients/ollama.py +0 -122
  74. agent/providers/omni/clients/openai.py +0 -155
  75. agent/providers/omni/clients/utils.py +0 -25
  76. agent/providers/omni/image_utils.py +0 -34
  77. agent/providers/omni/loop.py +0 -990
  78. agent/providers/omni/parser.py +0 -307
  79. agent/providers/omni/prompts.py +0 -64
  80. agent/providers/omni/tools/__init__.py +0 -30
  81. agent/providers/omni/tools/base.py +0 -29
  82. agent/providers/omni/tools/bash.py +0 -74
  83. agent/providers/omni/tools/computer.py +0 -179
  84. agent/providers/omni/tools/manager.py +0 -61
  85. agent/providers/omni/utils.py +0 -236
  86. agent/providers/openai/__init__.py +0 -6
  87. agent/providers/openai/api_handler.py +0 -456
  88. agent/providers/openai/loop.py +0 -472
  89. agent/providers/openai/response_handler.py +0 -205
  90. agent/providers/openai/tools/__init__.py +0 -15
  91. agent/providers/openai/tools/base.py +0 -79
  92. agent/providers/openai/tools/computer.py +0 -326
  93. agent/providers/openai/tools/manager.py +0 -106
  94. agent/providers/openai/types.py +0 -36
  95. agent/providers/openai/utils.py +0 -98
  96. agent/providers/uitars/__init__.py +0 -1
  97. agent/providers/uitars/clients/base.py +0 -35
  98. agent/providers/uitars/clients/mlxvlm.py +0 -263
  99. agent/providers/uitars/clients/oaicompat.py +0 -214
  100. agent/providers/uitars/loop.py +0 -660
  101. agent/providers/uitars/prompts.py +0 -63
  102. agent/providers/uitars/tools/__init__.py +0 -1
  103. agent/providers/uitars/tools/computer.py +0 -283
  104. agent/providers/uitars/tools/manager.py +0 -60
  105. agent/providers/uitars/utils.py +0 -264
  106. agent/telemetry.py +0 -21
  107. agent/ui/__main__.py +0 -15
  108. cua_agent-0.3.2.dist-info/METADATA +0 -295
  109. cua_agent-0.3.2.dist-info/RECORD +0 -87
  110. {cua_agent-0.3.2.dist-info → cua_agent-0.4.0b2.dist-info}/WHEEL +0 -0
  111. {cua_agent-0.3.2.dist-info → cua_agent-0.4.0b2.dist-info}/entry_points.txt +0 -0
@@ -1,456 +0,0 @@
1
- """API handler for the OpenAI provider."""
2
-
3
- import logging
4
- import requests
5
- import os
6
- from typing import Any, Dict, List, Optional, TYPE_CHECKING
7
- from datetime import datetime
8
-
9
- if TYPE_CHECKING:
10
- from .loop import OpenAILoop
11
-
12
- logger = logging.getLogger(__name__)
13
-
14
-
15
class OpenAIAPIHandler:
    """Handler for OpenAI API interactions.

    Builds request payloads for the ``/responses`` endpoint, sends them with
    ``requests`` and reports every request, response and error through the
    owning loop's ``_log_api_call`` method.
    """

    # Values sent with every request. They are class attributes so that a
    # subclass or a single instance can override them without editing the
    # request-building code.
    MODEL = "computer-use-preview"
    TOOL_TYPE = "computer_use_preview"
    ENVIRONMENT = "mac"  # We're on macOS

    # (connect, read) timeout in seconds passed to ``requests.post``. Without a
    # timeout a stalled connection would block the caller indefinitely.
    REQUEST_TIMEOUT = (10.0, 300.0)

    def __init__(self, loop: "OpenAILoop"):
        """Initialize the API handler.

        Args:
            loop: OpenAI loop instance

        Raises:
            ValueError: If the ``OPENAI_API_KEY`` environment variable is not set.
        """
        self.loop = loop
        self.api_key = os.getenv("OPENAI_API_KEY")
        if not self.api_key:
            raise ValueError("OPENAI_API_KEY environment variable not set")

        self.api_base = "https://api.openai.com/v1"
        self.headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }

        # Add organization if specified
        org_id = os.getenv("OPENAI_ORG")
        if org_id:
            self.headers["OpenAI-Organization"] = org_id

        logger.info("Initialized OpenAI API handler")

    @staticmethod
    def _parse_display_size(display_width, display_height):
        """Return ``(width, height)`` as integers or raise ``ValueError``."""
        try:
            return int(display_width), int(display_height)
        except (ValueError, TypeError) as e:
            logger.error(f"Failed to convert display dimensions to integers: {str(e)}")
            # Chain the original error so the failing value stays visible in tracebacks.
            raise ValueError(
                f"Display dimensions must be integers: width={display_width}, height={display_height}"
            ) from e

    @staticmethod
    def _base64_from_image_item(item: Dict[str, Any]) -> Optional[str]:
        """Return the base64 payload of an ``image`` content item, or ``None``."""
        if item.get("type") != "image":
            return None
        source = item.get("source", {})
        if isinstance(source, dict) and source.get("type") == "base64" and "data" in source:
            return source["data"]
        return None

    @staticmethod
    def _image_url_from_item(item: Dict[str, Any]) -> Optional[str]:
        """Return the URL to send for an ``image`` or ``image_url`` content item.

        ``image`` items yield a PNG data URL built from their base64 source, or
        ``None`` when they carry no base64 data. ``image_url`` items yield their
        URL whether it is stored as ``{"url": ...}`` or as a plain string, and
        an empty string when it is missing.
        """
        if item.get("type") == "image":
            data = OpenAIAPIHandler._base64_from_image_item(item)
            return None if data is None else f"data:image/png;base64,{data}"

        image_url = item.get("image_url", {})
        if isinstance(image_url, dict):
            return image_url.get("url", "")
        return image_url if isinstance(image_url, str) else ""

    def _computer_tool(self, width: int, height: int) -> Dict[str, Any]:
        """Build the computer-use tool description for a request payload."""
        return {
            "type": self.TOOL_TYPE,
            "display_width": width,
            "display_height": height,
            "environment": self.ENVIRONMENT,
        }

    async def _post_responses(self, payload: Dict[str, Any], request_label: str) -> Dict[str, Any]:
        """POST ``payload`` to ``/responses`` and return the decoded JSON body.

        The blocking ``requests.post`` call runs in the default executor so the
        event loop stays responsive while waiting for the API.

        Args:
            payload: JSON-serializable request body
            request_label: Short description used in the "Sending ..." log line

        Returns:
            API response

        Raises:
            requests.RequestException: On transport failures, including timeouts.
            Exception: If the API answers with a status code other than 200.
        """
        # Function-scope imports: the module's import block does not provide these.
        import asyncio
        import functools

        # Log the request using the BaseLoop's log_api_call method
        self.loop._log_api_call("request", payload)

        logger.info(f"Sending {request_label} to OpenAI API")
        # Only build the sanitized copy when it will actually be emitted.
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(f"Request payload: {self._sanitize_response(payload)}")

        post = functools.partial(
            requests.post,
            f"{self.api_base}/responses",
            headers=self.headers,
            json=payload,
            timeout=self.REQUEST_TIMEOUT,
        )
        try:
            response = await asyncio.get_running_loop().run_in_executor(None, post)
        except requests.RequestException as exc:
            # Report transport failures the same way as HTTP errors, then re-raise.
            logger.error(f"OpenAI API request failed: {exc}")
            self.loop._log_api_call("error", payload, error=exc)
            raise

        if response.status_code != 200:
            error_message = f"OpenAI API error: {response.status_code} {response.text}"
            logger.error(error_message)
            # Log the error using the BaseLoop's log_api_call method
            self.loop._log_api_call("error", payload, error=Exception(error_message))
            raise Exception(error_message)

        response_data = response.json()

        # Log the response using the BaseLoop's log_api_call method
        self.loop._log_api_call("response", payload, response_data)

        logger.info("Received response from OpenAI API")
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(f"Response data: {self._sanitize_response(response_data)}")

        return response_data

    async def send_initial_request(
        self,
        messages: List[Dict[str, Any]],
        display_width: str,
        display_height: str,
        previous_response_id: Optional[str] = None,
    ) -> Dict[str, Any]:
        """Send an initial request to the OpenAI API with a screenshot.

        The most recent text and the most recent base64 screenshot found in
        ``messages`` are sent. The screenshot is omitted when
        ``previous_response_id`` is given.

        Args:
            messages: List of message objects in standard format
            display_width: Width of the display in pixels
            display_height: Height of the display in pixels
            previous_response_id: Optional ID of the previous response to link requests

        Returns:
            API response

        Raises:
            ValueError: If the display dimensions cannot be converted to integers.
            Exception: If the API answers with a status code other than 200.
        """
        width, height = self._parse_display_size(display_width, display_height)

        # Extract the latest text message and screenshot from messages
        latest_text = None
        latest_screenshot = None

        for msg in reversed(messages):
            if not isinstance(msg, dict):
                continue

            content = msg.get("content", [])

            if isinstance(content, str) and not latest_text:
                latest_text = content
                continue

            if not isinstance(content, list):
                continue

            for item in content:
                if not isinstance(item, dict):
                    continue

                # Look for text if we don't have it yet
                if not latest_text and item.get("type") == "text" and "text" in item:
                    latest_text = item.get("text", "")

                # Look for an image if we don't have it yet
                if not latest_screenshot:
                    latest_screenshot = self._base64_from_image_item(item)

        # Prepare the input array
        input_array = []

        # Add the text message if found
        if latest_text:
            input_array.append({"role": "user", "content": latest_text})

        # Add the screenshot if found and no previous_response_id is provided
        if latest_screenshot and not previous_response_id:
            input_array.append(
                {
                    "type": "message",
                    "role": "user",
                    "content": [
                        {
                            "type": "input_image",
                            "image_url": f"data:image/png;base64,{latest_screenshot}",
                        }
                    ],
                }
            )

        # Prepare the request payload - using minimal format from docs
        payload = {
            "model": self.MODEL,
            "tools": [self._computer_tool(width, height)],
            "input": input_array,
            "reasoning": {
                "generate_summary": "concise",
            },
            "truncation": "auto",
        }

        # Add previous_response_id if provided
        if previous_response_id:
            payload["previous_response_id"] = previous_response_id

        return await self._post_responses(payload, "initial request")

    async def send_computer_call_request(
        self,
        messages: List[Dict[str, Any]],
        display_width: str,
        display_height: str,
        previous_response_id: str,
    ) -> Dict[str, Any]:
        """Send a request to the OpenAI API with computer_call_output.

        Args:
            messages: List of message objects in standard format
            display_width: Width of the display in pixels
            display_height: Height of the display in pixels
            previous_response_id: ID of the previous response to link requests

        Returns:
            API response

        Raises:
            ValueError: If the display dimensions cannot be converted to integers,
                or if no call_id or no screenshot can be found in ``messages``.
            Exception: If the API answers with a status code other than 200.
        """
        width, height = self._parse_display_size(display_width, display_height)

        # Find the most recent computer_call_output with call_id
        call_id = None
        screenshot_base64 = None
        png_prefix = "data:image/png;base64,"

        # Look for call_id and screenshot in messages
        for msg in reversed(messages):
            if not isinstance(msg, dict):
                continue

            # Check if the message itself has a call_id
            if "call_id" in msg and not call_id:
                call_id = msg["call_id"]

            content = msg.get("content", [])
            if not isinstance(content, list):
                continue

            for item in content:
                if not isinstance(item, dict):
                    continue

                # Look for call_id
                if not call_id and "call_id" in item:
                    call_id = item["call_id"]

                # Look for screenshot in computer_call_output
                if not screenshot_base64 and item.get("type") == "computer_call_output":
                    output = item.get("output", {})
                    if isinstance(output, dict) and "image_url" in output:
                        image_url = output.get("image_url", "")
                        # Only PNG data URLs are recognized here.
                        if isinstance(image_url, str) and image_url.startswith(png_prefix):
                            screenshot_base64 = image_url[len(png_prefix) :]

                # Look for screenshot in image type
                if not screenshot_base64:
                    screenshot_base64 = self._base64_from_image_item(item)

        if not call_id or not screenshot_base64:
            logger.error("Missing call_id or screenshot for computer_call_output")
            logger.error(f"Last message: {messages[-1] if messages else None}")
            raise ValueError("Cannot create computer call request: missing call_id or screenshot")

        # Prepare the request payload using minimal format from docs
        payload = {
            "model": self.MODEL,
            "previous_response_id": previous_response_id,
            "tools": [self._computer_tool(width, height)],
            "input": [
                {
                    "type": "computer_call_output",
                    "call_id": call_id,
                    "output": {
                        "type": "input_image",
                        "image_url": f"data:image/png;base64,{screenshot_base64}",
                    },
                }
            ],
            "truncation": "auto",
        }

        return await self._post_responses(payload, "computer call request")

    def _format_messages_for_agent_response(
        self, messages: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """Format messages for the OpenAI Agent Response API.

        The Agent Response API requires specific content types:
        - For user messages: use "input_text", "input_image", etc.
        - For assistant messages: use "output_text" only

        Additionally, when using the computer tool, only one image can be sent,
        so only the most recent user image is kept.

        Entries that are empty or not dictionaries are skipped. String content
        is kept for the user, assistant and system roles; list content is kept
        for the user and assistant roles only.

        Args:
            messages: List of standard messages

        Returns:
            Messages formatted for the Agent Response API, in the original order
        """
        has_image = False  # Track if we've already included an image

        # Walk newest-to-oldest so the first image encountered is the most
        # recent one; the result is flipped back to the original order below.
        newest_first = []

        for msg in reversed(messages):
            if not msg or not isinstance(msg, dict):
                continue

            role = msg.get("role", "user")
            content = msg.get("content", "")

            if logger.isEnabledFor(logging.DEBUG):
                logger.debug(f"Processing message - Role: {role}, Content type: {type(content)}")
                if isinstance(content, list):
                    logger.debug(
                        f"List content items: {[item.get('type') for item in content if isinstance(item, dict)]}"
                    )

            if isinstance(content, str):
                # User and system text is input_text; assistant text must be output_text.
                if role in ("user", "system"):
                    text_type = "input_text"
                elif role == "assistant":
                    text_type = "output_text"
                else:
                    continue
                newest_first.append(
                    {"role": role, "content": [{"type": text_type, "text": content}]}
                )
                continue

            if not isinstance(content, list):
                continue

            # For list content, convert each item to the correct type based on role
            formatted_content = []

            for item in content:
                if not isinstance(item, dict):
                    continue

                item_type = item.get("type")

                if role == "user":
                    if item_type in ("text", "input_text"):
                        formatted_content.append(
                            {"type": "input_text", "text": item.get("text", "")}
                        )
                    elif item_type in ("image", "image_url") and not has_image:
                        # Only include the first/most recent image we encounter
                        image_url = self._image_url_from_item(item)
                        if image_url is not None:
                            formatted_content.append(
                                {"type": "input_image", "image_url": image_url}
                            )
                            has_image = True
                elif role == "assistant":
                    # Only output_text is supported for assistant messages
                    if item_type in ("text", "output_text"):
                        formatted_content.append(
                            {"type": "output_text", "text": item.get("text", "")}
                        )

            if formatted_content:
                newest_first.append({"role": role, "content": formatted_content})

        # Reverse back to original order
        formatted_messages = newest_first[::-1]

        # Log summary for debugging
        num_images = sum(
            1
            for msg in formatted_messages
            for item in msg["content"]
            if item.get("type") == "input_image"
        )
        logger.info(f"Formatted {len(messages)} messages for OpenAI API with {num_images} images")

        return formatted_messages

    def _sanitize_response(self, response: Dict[str, Any]) -> Dict[str, Any]:
        """Sanitize response for logging by removing large image data.

        Args:
            response: Response to sanitize

        Returns:
            Sanitized response
        """
        from .utils import sanitize_message

        # Shallow copy: only the top-level "output" key is rebound below, so the
        # caller's dictionary keeps its original list.
        sanitized = response.copy()

        # Sanitize output items if present
        # NOTE(review): request payloads carry their images under "input", which
        # is not touched here - confirm whether sanitize_message should also be
        # applied to that key.
        if "output" in sanitized and isinstance(sanitized["output"], list):
            sanitized["output"] = [sanitize_message(item) for item in sanitized["output"]]

        return sanitized