cua-agent 0.4.22__py3-none-any.whl → 0.7.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of cua-agent might be problematic.
- agent/__init__.py +4 -10
- agent/__main__.py +2 -1
- agent/adapters/__init__.py +4 -0
- agent/adapters/azure_ml_adapter.py +283 -0
- agent/adapters/cua_adapter.py +161 -0
- agent/adapters/huggingfacelocal_adapter.py +67 -125
- agent/adapters/human_adapter.py +116 -114
- agent/adapters/mlxvlm_adapter.py +110 -99
- agent/adapters/models/__init__.py +41 -0
- agent/adapters/models/generic.py +78 -0
- agent/adapters/models/internvl.py +290 -0
- agent/adapters/models/opencua.py +115 -0
- agent/adapters/models/qwen2_5_vl.py +78 -0
- agent/agent.py +337 -185
- agent/callbacks/__init__.py +9 -4
- agent/callbacks/base.py +45 -31
- agent/callbacks/budget_manager.py +22 -10
- agent/callbacks/image_retention.py +54 -98
- agent/callbacks/logging.py +55 -42
- agent/callbacks/operator_validator.py +35 -33
- agent/callbacks/otel.py +291 -0
- agent/callbacks/pii_anonymization.py +19 -16
- agent/callbacks/prompt_instructions.py +47 -0
- agent/callbacks/telemetry.py +99 -61
- agent/callbacks/trajectory_saver.py +95 -69
- agent/cli.py +269 -119
- agent/computers/__init__.py +14 -9
- agent/computers/base.py +32 -19
- agent/computers/cua.py +52 -25
- agent/computers/custom.py +78 -71
- agent/decorators.py +23 -14
- agent/human_tool/__init__.py +2 -7
- agent/human_tool/__main__.py +6 -2
- agent/human_tool/server.py +48 -37
- agent/human_tool/ui.py +359 -235
- agent/integrations/hud/__init__.py +38 -99
- agent/integrations/hud/agent.py +369 -0
- agent/integrations/hud/proxy.py +166 -52
- agent/loops/__init__.py +44 -14
- agent/loops/anthropic.py +579 -492
- agent/loops/base.py +19 -15
- agent/loops/composed_grounded.py +136 -150
- agent/loops/fara/__init__.py +8 -0
- agent/loops/fara/config.py +506 -0
- agent/loops/fara/helpers.py +357 -0
- agent/loops/fara/schema.py +143 -0
- agent/loops/gelato.py +183 -0
- agent/loops/gemini.py +935 -0
- agent/loops/generic_vlm.py +601 -0
- agent/loops/glm45v.py +140 -135
- agent/loops/gta1.py +48 -51
- agent/loops/holo.py +218 -0
- agent/loops/internvl.py +180 -0
- agent/loops/moondream3.py +493 -0
- agent/loops/omniparser.py +326 -226
- agent/loops/openai.py +50 -51
- agent/loops/opencua.py +134 -0
- agent/loops/uiins.py +175 -0
- agent/loops/uitars.py +247 -206
- agent/loops/uitars2.py +951 -0
- agent/playground/__init__.py +5 -0
- agent/playground/server.py +301 -0
- agent/proxy/examples.py +61 -57
- agent/proxy/handlers.py +46 -39
- agent/responses.py +447 -347
- agent/tools/__init__.py +24 -0
- agent/tools/base.py +253 -0
- agent/tools/browser_tool.py +423 -0
- agent/types.py +11 -5
- agent/ui/__init__.py +1 -1
- agent/ui/__main__.py +1 -1
- agent/ui/gradio/app.py +25 -22
- agent/ui/gradio/ui_components.py +314 -167
- cua_agent-0.7.16.dist-info/METADATA +85 -0
- cua_agent-0.7.16.dist-info/RECORD +79 -0
- {cua_agent-0.4.22.dist-info → cua_agent-0.7.16.dist-info}/WHEEL +1 -1
- cua_agent-0.4.22.dist-info/METADATA +0 -436
- cua_agent-0.4.22.dist-info/RECORD +0 -51
- {cua_agent-0.4.22.dist-info → cua_agent-0.7.16.dist-info}/entry_points.txt +0 -0
agent/loops/glm45v.py
CHANGED
@@ -4,33 +4,36 @@ Supports vision-language models for computer control with bounding box parsing.
 """
 
 import asyncio
-import json
 import base64
+import json
 import re
-from typing import Dict, List, Any, Optional, Tuple
 from io import BytesIO
-from PIL import Image
+from typing import Any, Dict, List, Optional, Tuple
+
 import litellm
+from litellm.responses.litellm_completion_transformation.transformation import (
+    LiteLLMCompletionResponsesConfig,
+)
 from litellm.types.utils import ModelResponse
-from litellm.responses.litellm_completion_transformation.transformation import LiteLLMCompletionResponsesConfig
+from PIL import Image
 
 from ..decorators import register_agent
-from ..types import Messages, AgentResponse, Tools, AgentCapability
 from ..loops.base import AsyncAgentConfig
 from ..responses import (
-    convert_responses_items_to_completion_messages,
     convert_completion_messages_to_responses_items,
-
-    make_output_text_item,
+    convert_responses_items_to_completion_messages,
     make_click_item,
     make_double_click_item,
     make_drag_item,
+    make_input_image_item,
     make_keypress_item,
+    make_output_text_item,
+    make_reasoning_item,
     make_scroll_item,
     make_type_item,
     make_wait_item,
-    make_input_image_item
 )
+from ..types import AgentCapability, AgentResponse, Messages, Tools
 
 # GLM-4.5V specific constants
 GLM_ACTION_SPACE = """
@@ -251,16 +254,18 @@ Call rule: `FAIL()`
 }
 }"""
 
+
 def encode_image_to_base64(image_path: str) -> str:
     """Encode image file to base64 string with data URI."""
     with open(image_path, "rb") as image_file:
         encoded_string = base64.b64encode(image_file.read()).decode("utf-8")
     return f"data:image/png;base64,{encoded_string}"
 
+
 def parse_glm_response(response: str) -> Dict[str, Any]:
     """
     Parse GLM-4.5V response to extract action and memory.
-
+
     The special tokens <|begin_of_box|> and <|end_of_box|> mark bounding boxes.
     Coordinates are normalized values between 0 and 1000.
     """
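For reference, a minimal self-contained sketch of the encode_image_to_base64 helper above; the temporary file path is illustrative:

import base64

from PIL import Image

def encode_image_to_base64(image_path: str) -> str:
    # Mirrors the helper in the hunk above: raw PNG bytes -> base64 data URI.
    with open(image_path, "rb") as image_file:
        encoded = base64.b64encode(image_file.read()).decode("utf-8")
    return f"data:image/png;base64,{encoded}"

Image.new("RGB", (1, 1)).save("/tmp/screenshot.png")  # illustrative path
assert encode_image_to_base64("/tmp/screenshot.png").startswith("data:image/png;base64,")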
@@ -274,26 +279,23 @@ def parse_glm_response(response: str) -> Dict[str, Any]:
     action_pattern = r"[\w_]+\([^)]*\)"
     matches = re.findall(action_pattern, response)
     action = matches[0] if matches else None
-
+
     # Extract memory section
     memory_pattern = r"Memory:(.*?)$"
     memory_match = re.search(memory_pattern, response, re.DOTALL)
     memory = memory_match.group(1).strip() if memory_match else "[]"
-
+
     # Extract action text (everything before Memory:)
-    action_text_pattern = r'^(.*?)Memory:'
+    action_text_pattern = r"^(.*?)Memory:"
     action_text_match = re.search(action_text_pattern, response, re.DOTALL)
     action_text = action_text_match.group(1).strip() if action_text_match else response
-
+
     # Clean up action text by removing special tokens
     if action_text:
         action_text = action_text.replace("<|begin_of_box|>", "").replace("<|end_of_box|>", "")
-
-    return {
-        "action": action,
-        "action_text": action_text,
-        "memory": memory
-    }
+
+    return {"action": action, "action_text": action_text, "memory": memory}
+
 
 def get_last_image_from_messages(messages: Messages) -> Optional[str]:
     """Extract the last image from messages for processing."""
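A self-contained sketch of what parse_glm_response extracts from a typical reply; the sample string is fabricated:

import re

sample = (
    "I will click the submit button."
    "<|begin_of_box|>left_click(start_box='[512,384]')<|end_of_box|>\n"
    "Memory: []"
)

matches = re.findall(r"[\w_]+\([^)]*\)", sample)  # first function-style call
action = matches[0] if matches else None
memory_match = re.search(r"Memory:(.*?)$", sample, re.DOTALL)
memory = memory_match.group(1).strip() if memory_match else "[]"
text_match = re.search(r"^(.*?)Memory:", sample, re.DOTALL)
action_text = text_match.group(1).strip() if text_match else sample
action_text = action_text.replace("<|begin_of_box|>", "").replace("<|end_of_box|>", "")

print(action)  # left_click(start_box='[512,384]')
print(memory)  # []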
@@ -314,23 +316,28 @@ def get_last_image_from_messages(messages: Messages) -> Optional[str]:
                 image_url_obj = item.get("image_url", {})
                 if isinstance(image_url_obj, dict):
                     image_url = image_url_obj.get("url", "")
-                    if isinstance(image_url, str) and image_url.startswith("data:image/"):
+                    if isinstance(image_url, str) and image_url.startswith(
+                        "data:image/"
+                    ):
                         return image_url.split(",", 1)[1]
     return None
 
-def convert_responses_items_to_glm45v_pc_prompt(messages: Messages, task: str, memory: str = "") -> List[Dict[str, Any]]:
+
+def convert_responses_items_to_glm45v_pc_prompt(
+    messages: Messages, task: str, memory: str = ""
+) -> List[Dict[str, Any]]:
     """Convert responses items to GLM-4.5V PC prompt format with historical actions.
-
+
     Args:
         messages: List of message items from the conversation
         task: The task description
         memory: Current memory state
-
+
     Returns:
         List of content items for the prompt (text and image_url items)
     """
     action_space = GLM_ACTION_SPACE
-
+
     # Template head
     head_text = f"""You are a GUI Agent, and your primary task is to respond accurately to user requests or questions. In addition to directly answering the user's queries, you can also use tools or perform GUI operations directly until you fulfill the user's request or provide a correct answer. You should carefully read and understand the images and questions provided by the user, and engage in thinking and reflection when appropriate. The coordinates involved are all represented in thousandths (0-999).
 
@@ -345,7 +352,7 @@ Ubuntu
 
 # Historical Actions and Current Memory
 History:"""
-
+
     # Template tail
     tail_text = f"""
 Memory:
@@ -363,18 +370,18 @@ Memory:
 
 Current Screenshot:
 """
-
+
     # Build history from messages
     history = []
     history_images = []
-
+
     # Group messages into steps
     current_step = []
     step_num = 0
-
+
     for message in messages:
         msg_type = message.get("type")
-
+
         if msg_type == "reasoning":
             current_step.append(message)
         elif msg_type == "message" and message.get("role") == "assistant":
@@ -386,7 +393,7 @@ Current Screenshot:
             # End of step - process it
             if current_step:
                 step_num += 1
-
+
                 # Extract bot thought from message content
                 bot_thought = ""
                 for item in current_step:
@@ -397,14 +404,14 @@ Current Screenshot:
                             bot_thought = content_item.get("text", "")
                             break
                     break
-
+
                 # Extract action from computer_call
                 action_text = ""
                 for item in current_step:
                     if item.get("type") == "computer_call":
                         action = item.get("action", {})
                         action_type = action.get("type", "")
-
+
                         if action_type == "click":
                             x, y = action.get("x", 0), action.get("y", 0)
                             # Convert to 0-999 range (assuming screen dimensions)
@@ -436,7 +443,7 @@ Current Screenshot:
                         elif action_type == "wait":
                             action_text = "WAIT()"
                         break
-
+
                 # Extract screenshot from computer_call_output
                 screenshot_url = None
                 for item in current_step:
@@ -445,34 +452,34 @@ Current Screenshot:
                         if output.get("type") == "input_image":
                             screenshot_url = output.get("image_url", "")
                         break
-
+
                 # Store step info
                 step_info = {
                     "step_num": step_num,
                     "bot_thought": bot_thought,
                     "action_text": action_text,
-                    "screenshot_url": screenshot_url
+                    "screenshot_url": screenshot_url,
                 }
                 history.append(step_info)
-
+
                 # Store screenshot for last 4 steps
                 if screenshot_url:
                     history_images.append(screenshot_url)
-
+
                 current_step = []
-
+
     # Build content array with head, history, and tail
     content = []
     current_text = head_text
-
+
     total_history_steps = len(history)
     history_image_count = min(4, len(history_images))  # Last 4 images
-
+
     for step_idx, step_info in enumerate(history):
         step_num = step_info["step_num"]
         bot_thought = step_info["bot_thought"]
         action_text = step_info["action_text"]
-
+
         if step_idx < total_history_steps - history_image_count:
             # For steps beyond the last 4, use text placeholder
             current_text += f"\nstep {step_num}: Screenshot:(Omitted in context.) Thought: {bot_thought}\nAction: {action_text}"
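The hunk above only inlines screenshots for the most recent steps; a minimal sketch of that windowing, with fabricated step data:

history = [{"step_num": i, "screenshot_url": f"img{i}.png"} for i in range(1, 7)]
history_images = [step["screenshot_url"] for step in history]

history_image_count = min(4, len(history_images))  # at most 4 screenshots inline
total_history_steps = len(history)

for step_idx, step in enumerate(history):
    if step_idx < total_history_steps - history_image_count:
        print(f"step {step['step_num']}: Screenshot:(Omitted in context.)")
    else:
        print(f"step {step['step_num']}: Screenshot: [{history_images[step_idx]}]")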
@@ -480,20 +487,21 @@ Current Screenshot:
             # For the last 4 steps, insert images
             current_text += f"\nstep {step_num}: Screenshot:"
             content.append({"type": "text", "text": current_text})
-
+
             # Add image
             img_idx = step_idx - (total_history_steps - history_image_count)
             if img_idx < len(history_images):
                 content.append({"type": "image_url", "image_url": {"url": history_images[img_idx]}})
-
+
             current_text = f" Thought: {bot_thought}\nAction: {action_text}"
-
+
     # Add tail
     current_text += tail_text
     content.append({"type": "text", "text": current_text})
-
+
     return content
 
+
 def model_dump(obj) -> Dict[str, Any]:
     if isinstance(obj, dict):
         return {k: model_dump(v) for k, v in obj.items()}
@@ -502,58 +510,61 @@ def model_dump(obj) -> Dict[str, Any]:
     else:
         return obj
 
-def convert_glm_completion_to_responses_items(response: ModelResponse, image_width: int, image_height: int) -> List[Dict[str, Any]]:
+
+def convert_glm_completion_to_responses_items(
+    response: ModelResponse, image_width: int, image_height: int
+) -> List[Dict[str, Any]]:
     """
     Convert GLM-4.5V completion response to responses items format.
-
+
     Args:
         response: LiteLLM ModelResponse from GLM-4.5V
        image_width: Original image width for coordinate scaling
        image_height: Original image height for coordinate scaling
-
+
     Returns:
         List of response items in the proper format
     """
     import uuid
-
+
     response_items = []
-
+
     if not response.choices or not response.choices[0].message:
         return response_items
-
+
     message = response.choices[0].message
     content = message.content or ""
-    reasoning_content = getattr(message, 'reasoning_content', None)
-
+    reasoning_content = getattr(message, "reasoning_content", None)
+
     # Add reasoning item if present
     if reasoning_content:
         reasoning_item = model_dump(make_reasoning_item(reasoning_content))
         response_items.append(reasoning_item)
-
+
     # Parse the content to extract action and text
     parsed_response = parse_glm_response(content)
     action = parsed_response.get("action", "")
     action_text = parsed_response.get("action_text", "")
-
+
     # Add message item with text content (excluding action and memory)
     if action_text:
         # Remove action from action_text if it's there
         clean_text = action_text
         if action and action in clean_text:
             clean_text = clean_text.replace(action, "").strip()
-
+
         # Remove memory section
         memory_pattern = r"Memory:\s*\[.*?\]\s*$"
         clean_text = re.sub(memory_pattern, "", clean_text, flags=re.DOTALL).strip()
-
+
         if clean_text:
             message_item = model_dump(make_output_text_item(clean_text))
             response_items.append(message_item)
-
+
     # Convert action to computer call if present
     if action:
         call_id = f"call_{uuid.uuid4().hex[:8]}"
-
+
         # Parse different action types and create appropriate computer calls
         if action.startswith("left_click"):
             coord_match = re.search(r"start_box='?\[(\d+),\s*(\d+)\]'?", action)
@@ -566,7 +577,7 @@ def convert_glm_completion_to_responses_items(response: ModelResponse, image_wid
                 computer_call["call_id"] = call_id
                 computer_call["status"] = "completed"
                 response_items.append(computer_call)
-
+
         elif action.startswith("right_click"):
             coord_match = re.search(r"start_box='?\[(\d+),\s*(\d+)\]'?", action)
             if coord_match:
@@ -577,7 +588,7 @@ def convert_glm_completion_to_responses_items(response: ModelResponse, image_wid
                 computer_call["call_id"] = call_id
                 computer_call["status"] = "completed"
                 response_items.append(computer_call)
-
+
         elif action.startswith("left_double_click"):
             coord_match = re.search(r"start_box='?\[(\d+),\s*(\d+)\]'?", action)
             if coord_match:
@@ -588,7 +599,7 @@ def convert_glm_completion_to_responses_items(response: ModelResponse, image_wid
                 computer_call["call_id"] = call_id
                 computer_call["status"] = "completed"
                 response_items.append(computer_call)
-
+
         elif action.startswith("left_drag"):
             start_match = re.search(r"start_box='?\[(\d+),\s*(\d+)\]'?", action)
             end_match = re.search(r"end_box='?\[(\d+),\s*(\d+)\]'?", action)
@@ -605,18 +616,18 @@ def convert_glm_completion_to_responses_items(response: ModelResponse, image_wid
                 computer_call["call_id"] = call_id
                 computer_call["status"] = "completed"
                 response_items.append(computer_call)
-
+
         elif action.startswith("key"):
             key_match = re.search(r"keys='([^']+)'", action)
             if key_match:
                 keys = key_match.group(1)
                 # Split keys by '+' for key combinations, or use as single key
-                key_list = keys.split('+') if '+' in keys else [keys]
+                key_list = keys.split("+") if "+" in keys else [keys]
                 computer_call = model_dump(make_keypress_item(key_list))
                 computer_call["call_id"] = call_id
                 computer_call["status"] = "completed"
                 response_items.append(computer_call)
-
+
         elif action.startswith("type"):
             content_match = re.search(r"content='([^']*)'", action)
             if content_match:
@@ -625,7 +636,7 @@ def convert_glm_completion_to_responses_items(response: ModelResponse, image_wid
                 computer_call["call_id"] = call_id
                 computer_call["status"] = "completed"
                 response_items.append(computer_call)
-
+
         elif action.startswith("scroll"):
             coord_match = re.search(r"start_box='?\[(\d+),\s*(\d+)\]'?", action)
             direction_match = re.search(r"direction='([^']+)'", action)
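All of these branches pull coordinates and arguments out of the action string with the same few regexes; a compact sketch over fabricated action strings:

import re

for action in [
    "left_click(start_box='[512,384]')",
    "key(keys='ctrl+c')",
    "type(content='hello world')",
]:
    if coord := re.search(r"start_box='?\[(\d+),\s*(\d+)\]'?", action):
        print("coords:", int(coord.group(1)), int(coord.group(2)))
    elif keys := re.search(r"keys='([^']+)'", action):
        combo = keys.group(1)
        print("keys:", combo.split("+") if "+" in combo else [combo])
    elif text := re.search(r"content='([^']*)'", action):
        print("text:", text.group(1))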
@@ -648,15 +659,16 @@ def convert_glm_completion_to_responses_items(response: ModelResponse, image_wid
                 computer_call["call_id"] = call_id
                 computer_call["status"] = "completed"
                 response_items.append(computer_call)
-
+
         elif action == "WAIT()":
             computer_call = model_dump(make_wait_item())
             computer_call["call_id"] = call_id
             computer_call["status"] = "completed"
             response_items.append(computer_call)
-
+
     return response_items
 
+
 @register_agent(models=r"(?i).*GLM-4\.5V.*")
 class Glm4vConfig(AsyncAgentConfig):
     """GLM-4.5V agent configuration using liteLLM."""
@@ -674,11 +686,11 @@ class Glm4vConfig(AsyncAgentConfig):
         _on_api_end=None,
         _on_usage=None,
         _on_screenshot=None,
-        **kwargs
+        **kwargs,
     ) -> Dict[str, Any]:
         """
         Predict the next step using GLM-4.5V model.
-
+
         Args:
             messages: Input messages following Responses format
             model: Model name to use
@@ -691,7 +703,7 @@ class Glm4vConfig(AsyncAgentConfig):
            _on_api_end: Callback for API end
            _on_usage: Callback for usage tracking
            _on_screenshot: Callback for screenshot events
-
+
         Returns:
             Dict with "output" and "usage" keys
         """
@@ -708,7 +720,7 @@ class Glm4vConfig(AsyncAgentConfig):
                         user_instruction = item.get("text", "")
                         break
                 break
-
+
         # Get the last image for processing
         last_image_b64 = get_last_image_from_messages(messages)
         if not last_image_b64 and computer_handler:
@@ -718,35 +730,28 @@ class Glm4vConfig(AsyncAgentConfig):
                 last_image_b64 = screenshot_b64
                 if _on_screenshot:
                     await _on_screenshot(screenshot_b64)
-
+
         if not last_image_b64:
             raise ValueError("No image available for GLM-4.5V processing")
-
+
         # Convert responses items to GLM-4.5V PC prompt format with historical actions
         prompt_content = convert_responses_items_to_glm45v_pc_prompt(
             messages=messages,
             task=user_instruction,
-            memory="[]"  # Initialize with empty memory for now
+            memory="[]",  # Initialize with empty memory for now
         )
-
+
         # Add the current screenshot to the end
-        prompt_content.append({
-            "type": "image_url",
-            "image_url": {"url": f"data:image/png;base64,{last_image_b64}"}
-        })
-
+        prompt_content.append(
+            {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{last_image_b64}"}}
+        )
+
         # Prepare messages for liteLLM
         litellm_messages = [
-            {
-                "role": "system",
-                "content": "You are a helpful GUI agent assistant."
-            },
-            {
-                "role": "user",
-                "content": prompt_content
-            }
+            {"role": "system", "content": "You are a helpful GUI agent assistant."},
+            {"role": "user", "content": prompt_content},
         ]
-
+
         # Prepare API call kwargs
         api_kwargs = {
             "model": model,
@@ -757,20 +762,21 @@ class Glm4vConfig(AsyncAgentConfig):
             # "skip_special_tokens": False,
             # }
         }
-
+        api_kwargs.update({k: v for k, v in (kwargs or {}).items()})
+
         # Add API callbacks
         if _on_api_start:
             await _on_api_start(api_kwargs)
-
+
         # Call liteLLM
         response = await litellm.acompletion(**api_kwargs)
-
+
         if _on_api_end:
             await _on_api_end(api_kwargs, response)
-
+
         # Get image dimensions for coordinate scaling
         image_width, image_height = 1920, 1080  # Default dimensions
-
+
         # Try to get actual dimensions from the image
         try:
             image_data = base64.b64decode(last_image_b64)
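The new api_kwargs.update(...) line lets caller-supplied kwargs override the defaults; a small sketch with illustrative values:

api_kwargs = {"model": "glm-4.5v", "temperature": 0.001}  # illustrative defaults
kwargs = {"temperature": 0.2, "top_p": 0.9}  # illustrative caller overrides
api_kwargs.update({k: v for k, v in (kwargs or {}).items()})
print(api_kwargs["temperature"], api_kwargs["top_p"])  # 0.2 0.9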
@@ -778,41 +784,38 @@ class Glm4vConfig(AsyncAgentConfig):
             image_width, image_height = image.size
         except Exception:
             pass  # Use default dimensions
-
+
         # Convert GLM completion response to responses items
-        response_items = convert_glm_completion_to_responses_items(response, image_width, image_height)
-
+        response_items = convert_glm_completion_to_responses_items(
+            response, image_width, image_height
+        )
+
         # Extract usage information
         response_usage = {
-            **LiteLLMCompletionResponsesConfig._transform_chat_completion_usage_to_responses_usage(response.usage).model_dump(),
+            **LiteLLMCompletionResponsesConfig._transform_chat_completion_usage_to_responses_usage(
+                response.usage
+            ).model_dump(),
             "response_cost": response._hidden_params.get("response_cost", 0.0),
         }
         if _on_usage:
             await _on_usage(response_usage)
-
+
         # Create agent response
-        agent_response = {
-            "output": response_items,
-            "usage": response_usage
-        }
-
+        agent_response = {"output": response_items, "usage": response_usage}
+
         return agent_response
 
     async def predict_click(
-        self,
-        model: str,
-        image_b64: str,
-        instruction: str,
-        **kwargs
+        self, model: str, image_b64: str, instruction: str, **kwargs
     ) -> Optional[Tuple[int, int]]:
         """
         Predict click coordinates using GLM-4.5V model.
-
+
         Args:
             model: Model name to use
             image_b64: Base64 encoded image
             instruction: Instruction for where to click
-
+
         Returns:
             Tuple with (x, y) coordinates or None
         """
@@ -824,52 +827,54 @@ Respond with a single click action in this format:
 left_click(start_box='[x,y]')
 
 Where x,y are coordinates normalized to 0-999 range."""
-
+
             # Prepare messages for liteLLM
             litellm_messages = [
-                {
-                    "role": "system",
-                    "content": "You are a helpful GUI agent assistant."
-                },
+                {"role": "system", "content": "You are a helpful GUI agent assistant."},
                 {
                     "role": "user",
                     "content": [
                         {"type": "text", "text": click_prompt},
-                        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_b64}"}}
-                    ]
-                }
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": f"data:image/png;base64,{image_b64}"},
+                        },
+                    ],
+                },
             ]
-
+
             # Prepare API call kwargs
             api_kwargs = {
                 "model": model,
                 "messages": litellm_messages,
-                "max_tokens":
+                "max_tokens": 2056,
                 "temperature": 0.001,
                 "extra_body": {
                     "skip_special_tokens": False,
-                }
+                },
             }
-
+            api_kwargs.update({k: v for k, v in (kwargs or {}).items()})
+
             # Call liteLLM
             response = await litellm.acompletion(**api_kwargs)
-
+
             # Extract response content
             response_content = response.choices[0].message.content.strip()
-
+            print(response)
+
            # Parse response for click coordinates
            # Look for coordinates in the response, handling special tokens
            coord_pattern = r"<\|begin_of_box\|>.*?left_click\(start_box='?\[(\d+),(\d+)\]'?\).*?<\|end_of_box\|>"
            match = re.search(coord_pattern, response_content)
-
+
            if not match:
                # Fallback: look for coordinates without special tokens
                coord_pattern = r"left_click\(start_box='?\[(\d+),(\d+)\]'?\)"
                match = re.search(coord_pattern, response_content)
-
+
            if match:
                x, y = int(match.group(1)), int(match.group(2))
-
+
                # Get actual image dimensions for scaling
                try:
                    image_data = base64.b64decode(image_b64)
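A worked example of the 0-999 to pixel scaling used above, assuming a 1920x1080 screenshot:

x, y = 512, 384  # model output in thousandths
image_width, image_height = 1920, 1080  # assumed screenshot size
actual_x = int((x / 999.0) * image_width)   # 984
actual_y = int((y / 999.0) * image_height)  # 415
print(actual_x, actual_y)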
@@ -878,15 +883,15 @@ Where x,y are coordinates normalized to 0-999 range."""
                except Exception:
                    # Use default dimensions
                    image_width, image_height = 1920, 1080
-
+
                # Convert from 0-999 normalized coordinates to actual pixel coordinates
                actual_x = int((x / 999.0) * image_width)
                actual_y = int((y / 999.0) * image_height)
-
+
                return (actual_x, actual_y)
-
+
            return None
-
+
        except Exception as e:
            # Log error and return None
            print(f"Error in predict_click: {e}")
@@ -895,7 +900,7 @@ Where x,y are coordinates normalized to 0-999 range."""
     def get_capabilities(self) -> List[AgentCapability]:
         """
         Get list of capabilities supported by this agent config.
-
+
         Returns:
             List of capability strings
         """