cua-agent 0.1.17__py3-none-any.whl → 0.1.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



@@ -0,0 +1,453 @@
+"""API handler for the OpenAI provider."""
+
+import logging
+import requests
+import os
+from typing import Any, Dict, List, Optional, TYPE_CHECKING
+from datetime import datetime
+
+if TYPE_CHECKING:
+    from .loop import OpenAILoop
+
+logger = logging.getLogger(__name__)
+
+
+class OpenAIAPIHandler:
+    """Handler for OpenAI API interactions."""
+
+    def __init__(self, loop: "OpenAILoop"):
+        """Initialize the API handler.
+
+        Args:
+            loop: OpenAI loop instance
+        """
+        self.loop = loop
+        self.api_key = os.getenv("OPENAI_API_KEY")
+        if not self.api_key:
+            raise ValueError("OPENAI_API_KEY environment variable not set")
+
+        self.api_base = "https://api.openai.com/v1"
+        self.headers = {
+            "Authorization": f"Bearer {self.api_key}",
+            "Content-Type": "application/json",
+        }
+
+        # Add organization if specified
+        org_id = os.getenv("OPENAI_ORG")
+        if org_id:
+            self.headers["OpenAI-Organization"] = org_id
+
+        logger.info("Initialized OpenAI API handler")
+
+    async def send_initial_request(
+        self,
+        messages: List[Dict[str, Any]],
+        display_width: str,
+        display_height: str,
+        previous_response_id: Optional[str] = None,
+    ) -> Dict[str, Any]:
+        """Send an initial request to the OpenAI API with a screenshot.
+
+        Args:
+            messages: List of message objects in standard format
+            display_width: Width of the display in pixels
+            display_height: Height of the display in pixels
+            previous_response_id: Optional ID of the previous response to link requests
+
+        Returns:
+            API response
+        """
+        # Convert display dimensions to integers
+        try:
+            width = int(display_width)
+            height = int(display_height)
+        except (ValueError, TypeError) as e:
+            logger.error(f"Failed to convert display dimensions to integers: {str(e)}")
+            raise ValueError(
+                f"Display dimensions must be integers: width={display_width}, height={display_height}"
+            )
+
+        # Extract the latest text message and screenshot from messages
+        latest_text = None
+        latest_screenshot = None
+
+        for msg in reversed(messages):
+            if not isinstance(msg, dict):
+                continue
+
+            content = msg.get("content", [])
+
+            if isinstance(content, str) and not latest_text:
+                latest_text = content
+                continue
+
+            if not isinstance(content, list):
+                continue
+
+            for item in content:
+                if not isinstance(item, dict):
+                    continue
+
+                # Look for text if we don't have it yet
+                if not latest_text and item.get("type") == "text" and "text" in item:
+                    latest_text = item.get("text", "")
+
+                # Look for an image if we don't have it yet
+                if not latest_screenshot and item.get("type") == "image":
+                    source = item.get("source", {})
+                    if source.get("type") == "base64" and "data" in source:
+                        latest_screenshot = source["data"]
+
+        # Prepare the input array
+        input_array = []
+
+        # Add the text message if found
+        if latest_text:
+            input_array.append({"role": "user", "content": latest_text})
+
+        # Add the screenshot if found and no previous_response_id is provided
+        if latest_screenshot and not previous_response_id:
+            input_array.append(
+                {
+                    "type": "message",
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "input_image",
+                            "image_url": f"data:image/png;base64,{latest_screenshot}",
+                        }
+                    ],
+                }
+            )
+
+        # Prepare the request payload - using minimal format from docs
+        payload = {
+            "model": "computer-use-preview",
+            "tools": [
+                {
+                    "type": "computer_use_preview",
+                    "display_width": width,
+                    "display_height": height,
+                    "environment": "mac",  # We're on macOS
+                }
+            ],
+            "input": input_array,
+            "truncation": "auto",
+        }
+
+        # Add previous_response_id if provided
+        if previous_response_id:
+            payload["previous_response_id"] = previous_response_id
+
+        # Log the request using the BaseLoop's log_api_call method
+        self.loop._log_api_call("request", payload)
+
+        # Log for debug purposes
+        logger.info("Sending initial request to OpenAI API")
+        logger.debug(f"Request payload: {self._sanitize_response(payload)}")
+
+        # Send the request
+        response = requests.post(
+            f"{self.api_base}/responses",
+            headers=self.headers,
+            json=payload,
+        )
+
+        if response.status_code != 200:
+            error_message = f"OpenAI API error: {response.status_code} {response.text}"
+            logger.error(error_message)
+            # Log the error using the BaseLoop's log_api_call method
+            self.loop._log_api_call("error", payload, error=Exception(error_message))
+            raise Exception(error_message)
+
+        response_data = response.json()
+
+        # Log the response using the BaseLoop's log_api_call method
+        self.loop._log_api_call("response", payload, response_data)
+
+        # Log for debug purposes
+        logger.info("Received response from OpenAI API")
+        logger.debug(f"Response data: {self._sanitize_response(response_data)}")
+
+        return response_data
+
+    async def send_computer_call_request(
+        self,
+        messages: List[Dict[str, Any]],
+        display_width: str,
+        display_height: str,
+        previous_response_id: str,
+    ) -> Dict[str, Any]:
+        """Send a request to the OpenAI API with computer_call_output.
+
+        Args:
+            messages: List of message objects in standard format
+            display_width: Width of the display in pixels
+            display_height: Height of the display in pixels
+            previous_response_id: ID of the previous response to link requests
+
+        Returns:
+            API response
+        """
+        # Convert display dimensions to integers
+        try:
+            width = int(display_width)
+            height = int(display_height)
+        except (ValueError, TypeError) as e:
+            logger.error(f"Failed to convert display dimensions to integers: {str(e)}")
+            raise ValueError(
+                f"Display dimensions must be integers: width={display_width}, height={display_height}"
+            )
+
+        # Find the most recent computer_call_output with call_id
+        call_id = None
+        screenshot_base64 = None
+
+        # Look for call_id and screenshot in messages
+        for msg in reversed(messages):
+            if not isinstance(msg, dict):
+                continue
+
+            # Check if the message itself has a call_id
+            if "call_id" in msg and not call_id:
+                call_id = msg["call_id"]
+
+            content = msg.get("content", [])
+            if not isinstance(content, list):
+                continue
+
+            for item in content:
+                if not isinstance(item, dict):
+                    continue
+
+                # Look for call_id
+                if not call_id and "call_id" in item:
+                    call_id = item["call_id"]
+
+                # Look for screenshot in computer_call_output
+                if not screenshot_base64 and item.get("type") == "computer_call_output":
+                    output = item.get("output", {})
+                    if isinstance(output, dict) and "image_url" in output:
+                        image_url = output.get("image_url", "")
+                        if image_url.startswith("data:image/png;base64,"):
+                            screenshot_base64 = image_url[len("data:image/png;base64,") :]
+
+                # Look for screenshot in image type
+                if not screenshot_base64 and item.get("type") == "image":
+                    source = item.get("source", {})
+                    if source.get("type") == "base64" and "data" in source:
+                        screenshot_base64 = source["data"]
+
+        if not call_id or not screenshot_base64:
+            logger.error("Missing call_id or screenshot for computer_call_output")
+            logger.error(f"Last message: {messages[-1] if messages else None}")
+            raise ValueError("Cannot create computer call request: missing call_id or screenshot")
+
+        # Prepare the request payload using minimal format from docs
+        payload = {
+            "model": "computer-use-preview",
+            "previous_response_id": previous_response_id,
+            "tools": [
+                {
+                    "type": "computer_use_preview",
+                    "display_width": width,
+                    "display_height": height,
+                    "environment": "mac",  # We're on macOS
+                }
+            ],
+            "input": [
+                {
+                    "type": "computer_call_output",
+                    "call_id": call_id,
+                    "output": {
+                        "type": "input_image",
+                        "image_url": f"data:image/png;base64,{screenshot_base64}",
+                    },
+                }
+            ],
+            "truncation": "auto",
+        }
+
+        # Log the request using the BaseLoop's log_api_call method
+        self.loop._log_api_call("request", payload)
+
+        # Log for debug purposes
+        logger.info("Sending computer call request to OpenAI API")
+        logger.debug(f"Request payload: {self._sanitize_response(payload)}")
+
+        # Send the request
+        response = requests.post(
+            f"{self.api_base}/responses",
+            headers=self.headers,
+            json=payload,
+        )
+
+        if response.status_code != 200:
+            error_message = f"OpenAI API error: {response.status_code} {response.text}"
+            logger.error(error_message)
+            # Log the error using the BaseLoop's log_api_call method
+            self.loop._log_api_call("error", payload, error=Exception(error_message))
+            raise Exception(error_message)
+
+        response_data = response.json()
+
+        # Log the response using the BaseLoop's log_api_call method
+        self.loop._log_api_call("response", payload, response_data)
+
+        # Log for debug purposes
+        logger.info("Received response from OpenAI API")
+        logger.debug(f"Response data: {self._sanitize_response(response_data)}")
+
+        return response_data
+
+    def _format_messages_for_agent_response(
+        self, messages: List[Dict[str, Any]]
+    ) -> List[Dict[str, Any]]:
+        """Format messages for the OpenAI Agent Response API.
+
+        The Agent Response API requires specific content types:
+        - For user messages: use "input_text", "input_image", etc.
+        - For assistant messages: use "output_text" only
+
+        Additionally, when using the computer tool, only one image can be sent.
+
+        Args:
+            messages: List of standard messages
+
+        Returns:
+            Messages formatted for the Agent Response API
+        """
+        formatted_messages = []
+        has_image = False  # Track if we've already included an image
+
+        # We need to process messages in reverse to ensure we keep the most recent image
+        # but preserve the original order in the final output
+        reversed_messages = list(reversed(messages))
+        temp_formatted = []
+
+        for msg in reversed_messages:
+            if not msg:
+                continue
+
+            role = msg.get("role", "user")
+            content = msg.get("content", "")
+
+            logger.debug(f"Processing message - Role: {role}, Content type: {type(content)}")
+            if isinstance(content, list):
+                logger.debug(
+                    f"List content items: {[item.get('type') for item in content if isinstance(item, dict)]}"
+                )
+
+            if isinstance(content, str):
+                # For string content, create a message with the appropriate text type
+                if role == "user":
+                    temp_formatted.append(
+                        {"role": role, "content": [{"type": "input_text", "text": content}]}
+                    )
+                elif role == "assistant":
+                    # For assistant, we need explicit output_text
+                    temp_formatted.append(
+                        {"role": role, "content": [{"type": "output_text", "text": content}]}
+                    )
+                elif role == "system":
+                    # System messages need to be formatted as input_text as well
+                    temp_formatted.append(
+                        {"role": role, "content": [{"type": "input_text", "text": content}]}
+                    )
+            elif isinstance(content, list):
+                # For list content, convert each item to the correct type based on role
+                formatted_content = []
+                has_image_in_this_message = False
+
+                for item in content:
+                    if not isinstance(item, dict):
+                        continue
+
+                    item_type = item.get("type")
+
+                    if role == "user":
+                        # Handle user message formatting
+                        if item_type == "text" or item_type == "input_text":
+                            # Text from user is input_text
+                            formatted_content.append(
+                                {"type": "input_text", "text": item.get("text", "")}
+                            )
+                        elif (item_type == "image" or item_type == "image_url") and not has_image:
+                            # Only include the first/most recent image we encounter
+                            if item_type == "image":
+                                # Image from user is input_image
+                                source = item.get("source", {})
+                                if source.get("type") == "base64" and "data" in source:
+                                    formatted_content.append(
+                                        {
+                                            "type": "input_image",
+                                            "image_url": f"data:image/png;base64,{source['data']}",
+                                        }
+                                    )
+                                    has_image = True
+                                    has_image_in_this_message = True
+                            elif item_type == "image_url":
+                                # Convert "image_url" to "input_image"
+                                formatted_content.append(
+                                    {
+                                        "type": "input_image",
+                                        "image_url": item.get("image_url", {}).get("url", ""),
+                                    }
+                                )
+                                has_image = True
+                                has_image_in_this_message = True
+                    elif role == "assistant":
+                        # Handle assistant message formatting - only output_text is supported
+                        if item_type == "text" or item_type == "output_text":
+                            formatted_content.append(
+                                {"type": "output_text", "text": item.get("text", "")}
+                            )
+
+                if formatted_content:
+                    # If this message had an image, mark it for inclusion
+                    temp_formatted.append(
+                        {
+                            "role": role,
+                            "content": formatted_content,
+                            "_had_image": has_image_in_this_message,  # Temporary marker
+                        }
+                    )
+
+        # Reverse back to original order and cleanup
+        for msg in reversed(temp_formatted):
+            # Remove our temporary marker
+            if "_had_image" in msg:
+                del msg["_had_image"]
+            formatted_messages.append(msg)
+
+        # Log summary for debugging
+        num_images = sum(
+            1
+            for msg in formatted_messages
+            for item in (msg.get("content", []) if isinstance(msg.get("content"), list) else [])
+            if isinstance(item, dict) and item.get("type") == "input_image"
+        )
+        logger.info(f"Formatted {len(messages)} messages for OpenAI API with {num_images} images")
+
+        return formatted_messages
+
+    def _sanitize_response(self, response: Dict[str, Any]) -> Dict[str, Any]:
+        """Sanitize response for logging by removing large image data.
+
+        Args:
+            response: Response to sanitize
+
+        Returns:
+            Sanitized response
+        """
+        from .utils import sanitize_message
+
+        # Shallow copy so the caller's top-level dict is not modified
+        sanitized = response.copy()
+
+        # Sanitize output items if present
+        if "output" in sanitized and isinstance(sanitized["output"], list):
+            sanitized["output"] = [sanitize_message(item) for item in sanitized["output"]]
+
+        return sanitized
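
For readers skimming the diff, the two request shapes this handler sends to POST {api_base}/responses are easy to reproduce outside the package. The sketch below is illustrative only and is not part of cua-agent: the helper names (initial_payload, follow_up_payload) and the placeholder values ("<b64>", "resp_123", "call_456") are invented for the example, while the field layout mirrors the payloads built in send_initial_request and send_computer_call_request above.

import json


def initial_payload(text: str, screenshot_b64: str, width: int, height: int) -> dict:
    """First turn: user text plus one input_image, with the computer tool declared."""
    return {
        "model": "computer-use-preview",
        "tools": [
            {
                "type": "computer_use_preview",
                "display_width": width,
                "display_height": height,
                "environment": "mac",
            }
        ],
        "input": [
            {"role": "user", "content": text},
            {
                "type": "message",
                "role": "user",
                "content": [
                    {
                        "type": "input_image",
                        "image_url": f"data:image/png;base64,{screenshot_b64}",
                    }
                ],
            },
        ],
        "truncation": "auto",
    }


def follow_up_payload(
    previous_response_id: str, call_id: str, screenshot_b64: str, width: int, height: int
) -> dict:
    """Later turns: return a computer_call_output for the action the model requested."""
    return {
        "model": "computer-use-preview",
        "previous_response_id": previous_response_id,
        "tools": [
            {
                "type": "computer_use_preview",
                "display_width": width,
                "display_height": height,
                "environment": "mac",
            }
        ],
        "input": [
            {
                "type": "computer_call_output",
                "call_id": call_id,
                "output": {
                    "type": "input_image",
                    "image_url": f"data:image/png;base64,{screenshot_b64}",
                },
            }
        ],
        "truncation": "auto",
    }


if __name__ == "__main__":
    # "<b64>" stands in for a real base64-encoded PNG screenshot.
    print(json.dumps(initial_payload("Open Safari", "<b64>", 1024, 768), indent=2)[:300])
    print(json.dumps(follow_up_payload("resp_123", "call_456", "<b64>", 1024, 768), indent=2)[:300])

In the package itself these payloads are posted with the Authorization and Content-Type headers set up in __init__, and each follow-up turn is linked to the previous one via previous_response_id.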