cua-agent 0.4.36__tar.gz → 0.4.37__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cua-agent might be problematic. Click here for more details.
- {cua_agent-0.4.36 → cua_agent-0.4.37}/PKG-INFO +1 -1
- {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/loops/omniparser.py +137 -75
- {cua_agent-0.4.36 → cua_agent-0.4.37}/pyproject.toml +1 -1
- {cua_agent-0.4.36 → cua_agent-0.4.37}/README.md +0 -0
- {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/__init__.py +0 -0
- {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/__main__.py +0 -0
- {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/adapters/__init__.py +0 -0
- {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/adapters/huggingfacelocal_adapter.py +0 -0
- {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/adapters/human_adapter.py +0 -0
- {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/adapters/mlxvlm_adapter.py +0 -0
- {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/adapters/models/__init__.py +0 -0
- {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/adapters/models/generic.py +0 -0
- {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/adapters/models/internvl.py +0 -0
- {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/adapters/models/opencua.py +0 -0
- {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/adapters/models/qwen2_5_vl.py +0 -0
- {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/agent.py +0 -0
- {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/callbacks/__init__.py +0 -0
- {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/callbacks/base.py +0 -0
- {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/callbacks/budget_manager.py +0 -0
- {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/callbacks/image_retention.py +0 -0
- {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/callbacks/logging.py +0 -0
- {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/callbacks/operator_validator.py +0 -0
- {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/callbacks/pii_anonymization.py +0 -0
- {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/callbacks/prompt_instructions.py +0 -0
- {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/callbacks/telemetry.py +0 -0
- {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/callbacks/trajectory_saver.py +0 -0
- {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/cli.py +0 -0
- {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/computers/__init__.py +0 -0
- {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/computers/base.py +0 -0
- {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/computers/cua.py +0 -0
- {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/computers/custom.py +0 -0
- {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/decorators.py +0 -0
- {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/human_tool/__init__.py +0 -0
- {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/human_tool/__main__.py +0 -0
- {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/human_tool/server.py +0 -0
- {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/human_tool/ui.py +0 -0
- {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/integrations/hud/__init__.py +0 -0
- {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/integrations/hud/agent.py +0 -0
- {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/integrations/hud/proxy.py +0 -0
- {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/loops/__init__.py +0 -0
- {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/loops/anthropic.py +0 -0
- {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/loops/base.py +0 -0
- {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/loops/composed_grounded.py +0 -0
- {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/loops/gemini.py +0 -0
- {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/loops/glm45v.py +0 -0
- {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/loops/gta1.py +0 -0
- {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/loops/holo.py +0 -0
- {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/loops/internvl.py +0 -0
- {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/loops/model_types.csv +0 -0
- {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/loops/moondream3.py +0 -0
- {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/loops/openai.py +0 -0
- {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/loops/opencua.py +0 -0
- {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/loops/qwen.py +0 -0
- {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/loops/uitars.py +0 -0
- {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/proxy/examples.py +0 -0
- {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/proxy/handlers.py +0 -0
- {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/responses.py +0 -0
- {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/types.py +0 -0
- {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/ui/__init__.py +0 -0
- {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/ui/__main__.py +0 -0
- {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/ui/gradio/__init__.py +0 -0
- {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/ui/gradio/app.py +0 -0
- {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/ui/gradio/ui_components.py +0 -0
- {cua_agent-0.4.36 → cua_agent-0.4.37}/tests/conftest.py +0 -0
- {cua_agent-0.4.36 → cua_agent-0.4.37}/tests/test_computer_agent.py +0 -0
|
@@ -14,67 +14,73 @@ import litellm
|
|
|
14
14
|
|
|
15
15
|
from ..decorators import register_agent
|
|
16
16
|
from ..loops.base import AsyncAgentConfig
|
|
17
|
+
from ..responses import (
|
|
18
|
+
convert_completion_messages_to_responses_items,
|
|
19
|
+
convert_responses_items_to_completion_messages,
|
|
20
|
+
)
|
|
17
21
|
from ..types import AgentCapability, AgentResponse, Messages, Tools
|
|
18
22
|
|
|
19
23
|
SOM_TOOL_SCHEMA = {
|
|
20
24
|
"type": "function",
|
|
21
|
-
"
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
"
|
|
25
|
-
|
|
26
|
-
"
|
|
27
|
-
"
|
|
28
|
-
|
|
29
|
-
"
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
"
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
"
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
"
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
"
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
"
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
"
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
"
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
"
|
|
74
|
-
|
|
25
|
+
"function": {
|
|
26
|
+
"name": "computer",
|
|
27
|
+
"description": "Control a computer by taking screenshots and interacting with UI elements. This tool shows screenshots with numbered elements overlaid on them. Each UI element has been assigned a unique ID number that you can see in the image. Use the element's ID number to interact with any element instead of pixel coordinates.",
|
|
28
|
+
"parameters": {
|
|
29
|
+
"type": "object",
|
|
30
|
+
"properties": {
|
|
31
|
+
"action": {
|
|
32
|
+
"type": "string",
|
|
33
|
+
"enum": [
|
|
34
|
+
"screenshot",
|
|
35
|
+
"click",
|
|
36
|
+
"double_click",
|
|
37
|
+
"drag",
|
|
38
|
+
"type",
|
|
39
|
+
"keypress",
|
|
40
|
+
"scroll",
|
|
41
|
+
"move",
|
|
42
|
+
"wait",
|
|
43
|
+
"get_current_url",
|
|
44
|
+
"get_dimensions",
|
|
45
|
+
"get_environment",
|
|
46
|
+
],
|
|
47
|
+
"description": "The action to perform",
|
|
48
|
+
},
|
|
49
|
+
"element_id": {
|
|
50
|
+
"type": "integer",
|
|
51
|
+
"description": "The ID of the element to interact with (required for click, double_click, move, scroll actions, and as start/end for drag)",
|
|
52
|
+
},
|
|
53
|
+
"start_element_id": {
|
|
54
|
+
"type": "integer",
|
|
55
|
+
"description": "The ID of the element to start dragging from (required for drag action)",
|
|
56
|
+
},
|
|
57
|
+
"end_element_id": {
|
|
58
|
+
"type": "integer",
|
|
59
|
+
"description": "The ID of the element to drag to (required for drag action)",
|
|
60
|
+
},
|
|
61
|
+
"text": {
|
|
62
|
+
"type": "string",
|
|
63
|
+
"description": "The text to type (required for type action)",
|
|
64
|
+
},
|
|
65
|
+
"keys": {
|
|
66
|
+
"type": "string",
|
|
67
|
+
"description": "Key combination to press (required for keypress action). Single key for individual key press, multiple keys for combinations (e.g., 'ctrl+c')",
|
|
68
|
+
},
|
|
69
|
+
"button": {
|
|
70
|
+
"type": "string",
|
|
71
|
+
"description": "The mouse button to use for click action (left, right, wheel, back, forward) Default: left",
|
|
72
|
+
},
|
|
73
|
+
"scroll_x": {
|
|
74
|
+
"type": "integer",
|
|
75
|
+
"description": "Horizontal scroll amount for scroll action (positive for right, negative for left)",
|
|
76
|
+
},
|
|
77
|
+
"scroll_y": {
|
|
78
|
+
"type": "integer",
|
|
79
|
+
"description": "Vertical scroll amount for scroll action (positive for down, negative for up)",
|
|
80
|
+
},
|
|
75
81
|
},
|
|
82
|
+
"required": ["action", "element_id"],
|
|
76
83
|
},
|
|
77
|
-
"required": ["action"],
|
|
78
84
|
},
|
|
79
85
|
}
|
|
80
86
|
|
|
@@ -256,7 +262,7 @@ async def replace_computer_call_with_function(
|
|
|
256
262
|
{
|
|
257
263
|
"type": "function_call_output",
|
|
258
264
|
"call_id": item.get("call_id"),
|
|
259
|
-
"output": output,
|
|
265
|
+
"output": item.get("output"),
|
|
260
266
|
"id": item.get("id"),
|
|
261
267
|
"status": "completed",
|
|
262
268
|
}
|
|
@@ -298,6 +304,13 @@ class OmniparserConfig(AsyncAgentConfig):
|
|
|
298
304
|
|
|
299
305
|
llm_model = model.split("+")[-1]
|
|
300
306
|
|
|
307
|
+
# Get screen dimensions from computer handler
|
|
308
|
+
try:
|
|
309
|
+
width, height = await computer_handler.get_dimensions()
|
|
310
|
+
except Exception:
|
|
311
|
+
# Fallback to default dimensions if method fails
|
|
312
|
+
width, height = 1024, 768
|
|
313
|
+
|
|
301
314
|
# Prepare tools for OpenAI API
|
|
302
315
|
openai_tools, id2xy = _prepare_tools_for_omniparser(tools)
|
|
303
316
|
|
|
@@ -311,27 +324,43 @@ class OmniparserConfig(AsyncAgentConfig):
|
|
|
311
324
|
result = parser.parse(image_data)
|
|
312
325
|
if _on_screenshot:
|
|
313
326
|
await _on_screenshot(result.annotated_image_base64, "annotated_image")
|
|
327
|
+
|
|
328
|
+
# Convert OmniParser normalized coordinates (0-1) to absolute pixels, convert to pixels
|
|
314
329
|
for element in result.elements:
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
)
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
330
|
+
norm_x = (element.bbox.x1 + element.bbox.x2) / 2
|
|
331
|
+
norm_y = (element.bbox.y1 + element.bbox.y2) / 2
|
|
332
|
+
pixel_x = int(norm_x * width)
|
|
333
|
+
pixel_y = int(norm_y * height)
|
|
334
|
+
id2xy[element.id] = (pixel_x, pixel_y)
|
|
335
|
+
|
|
336
|
+
# Replace the original screenshot with the annotated image
|
|
337
|
+
annotated_image_url = f"data:image/png;base64,{result.annotated_image_base64}"
|
|
338
|
+
last_computer_call_output["output"]["image_url"] = annotated_image_url
|
|
339
|
+
|
|
340
|
+
xy2id = {v: k for k, v in id2xy.items()}
|
|
341
|
+
messages_with_element_ids = []
|
|
342
|
+
for i, message in enumerate(messages):
|
|
323
343
|
if not isinstance(message, dict):
|
|
324
344
|
message = message.__dict__
|
|
325
|
-
|
|
326
|
-
|
|
345
|
+
|
|
346
|
+
msg_type = message.get("type")
|
|
347
|
+
|
|
348
|
+
if msg_type == "computer_call" and "action" in message:
|
|
349
|
+
action = message.get("action", {})
|
|
350
|
+
|
|
351
|
+
converted = await replace_computer_call_with_function(message, xy2id) # type: ignore
|
|
352
|
+
messages_with_element_ids += converted
|
|
353
|
+
|
|
354
|
+
completion_messages = convert_responses_items_to_completion_messages(
|
|
355
|
+
messages_with_element_ids, allow_images_in_tool_results=False
|
|
356
|
+
)
|
|
327
357
|
|
|
328
358
|
# Prepare API call kwargs
|
|
329
359
|
api_kwargs = {
|
|
330
360
|
"model": llm_model,
|
|
331
|
-
"
|
|
361
|
+
"messages": completion_messages,
|
|
332
362
|
"tools": openai_tools if openai_tools else None,
|
|
333
363
|
"stream": stream,
|
|
334
|
-
"truncation": "auto",
|
|
335
364
|
"num_retries": max_retries,
|
|
336
365
|
**kwargs,
|
|
337
366
|
}
|
|
@@ -342,8 +371,8 @@ class OmniparserConfig(AsyncAgentConfig):
|
|
|
342
371
|
|
|
343
372
|
print(str(api_kwargs)[:1000])
|
|
344
373
|
|
|
345
|
-
# Use liteLLM
|
|
346
|
-
response = await litellm.
|
|
374
|
+
# Use liteLLM completion
|
|
375
|
+
response = await litellm.acompletion(**api_kwargs)
|
|
347
376
|
|
|
348
377
|
# Call API end hook
|
|
349
378
|
if _on_api_end:
|
|
@@ -357,12 +386,45 @@ class OmniparserConfig(AsyncAgentConfig):
|
|
|
357
386
|
if _on_usage:
|
|
358
387
|
await _on_usage(usage)
|
|
359
388
|
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
389
|
+
response_dict = response.model_dump() # type: ignore
|
|
390
|
+
choice_messages = [choice["message"] for choice in response_dict["choices"]]
|
|
391
|
+
responses_items = []
|
|
392
|
+
for choice_message in choice_messages:
|
|
393
|
+
responses_items.extend(convert_completion_messages_to_responses_items([choice_message]))
|
|
394
|
+
|
|
395
|
+
# Convert element_id → x,y (similar to moondream's convert_computer_calls_desc2xy)
|
|
396
|
+
final_output = []
|
|
397
|
+
for item in responses_items:
|
|
398
|
+
if item.get("type") == "computer_call" and "action" in item:
|
|
399
|
+
action = item["action"].copy()
|
|
400
|
+
|
|
401
|
+
# Handle single element_id
|
|
402
|
+
if "element_id" in action:
|
|
403
|
+
element_id = action["element_id"]
|
|
404
|
+
if element_id in id2xy:
|
|
405
|
+
x, y = id2xy[element_id]
|
|
406
|
+
action["x"] = x
|
|
407
|
+
action["y"] = y
|
|
408
|
+
del action["element_id"]
|
|
409
|
+
|
|
410
|
+
# Handle start_element_id and end_element_id for drag operations
|
|
411
|
+
elif "start_element_id" in action and "end_element_id" in action:
|
|
412
|
+
start_id = action["start_element_id"]
|
|
413
|
+
end_id = action["end_element_id"]
|
|
414
|
+
if start_id in id2xy and end_id in id2xy:
|
|
415
|
+
start_x, start_y = id2xy[start_id]
|
|
416
|
+
end_x, end_y = id2xy[end_id]
|
|
417
|
+
action["path"] = [{"x": start_x, "y": start_y}, {"x": end_x, "y": end_y}]
|
|
418
|
+
del action["start_element_id"]
|
|
419
|
+
del action["end_element_id"]
|
|
420
|
+
|
|
421
|
+
converted_item = item.copy()
|
|
422
|
+
converted_item["action"] = action
|
|
423
|
+
final_output.append(converted_item)
|
|
424
|
+
else:
|
|
425
|
+
final_output.append(item)
|
|
364
426
|
|
|
365
|
-
return {"output":
|
|
427
|
+
return {"output": final_output, "usage": usage}
|
|
366
428
|
|
|
367
429
|
async def predict_click(
|
|
368
430
|
self, model: str, image_b64: str, instruction: str, **kwargs
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|