cua-agent 0.4.34__py3-none-any.whl → 0.4.35__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their public registry.
Potentially problematic release.
This version of cua-agent might be problematic.
- agent/__init__.py +4 -10
- agent/__main__.py +2 -1
- agent/adapters/huggingfacelocal_adapter.py +54 -61
- agent/adapters/human_adapter.py +116 -114
- agent/adapters/mlxvlm_adapter.py +110 -99
- agent/adapters/models/__init__.py +14 -6
- agent/adapters/models/generic.py +7 -4
- agent/adapters/models/internvl.py +66 -30
- agent/adapters/models/opencua.py +23 -8
- agent/adapters/models/qwen2_5_vl.py +7 -4
- agent/agent.py +184 -158
- agent/callbacks/__init__.py +4 -4
- agent/callbacks/base.py +45 -31
- agent/callbacks/budget_manager.py +22 -10
- agent/callbacks/image_retention.py +18 -13
- agent/callbacks/logging.py +55 -42
- agent/callbacks/operator_validator.py +3 -1
- agent/callbacks/pii_anonymization.py +19 -16
- agent/callbacks/telemetry.py +67 -61
- agent/callbacks/trajectory_saver.py +90 -70
- agent/cli.py +115 -110
- agent/computers/__init__.py +13 -8
- agent/computers/base.py +26 -17
- agent/computers/cua.py +27 -23
- agent/computers/custom.py +72 -69
- agent/decorators.py +23 -14
- agent/human_tool/__init__.py +2 -7
- agent/human_tool/__main__.py +6 -2
- agent/human_tool/server.py +48 -37
- agent/human_tool/ui.py +235 -185
- agent/integrations/hud/__init__.py +15 -21
- agent/integrations/hud/agent.py +101 -83
- agent/integrations/hud/proxy.py +90 -57
- agent/loops/__init__.py +25 -21
- agent/loops/anthropic.py +537 -483
- agent/loops/base.py +13 -14
- agent/loops/composed_grounded.py +135 -149
- agent/loops/gemini.py +31 -12
- agent/loops/glm45v.py +135 -133
- agent/loops/gta1.py +47 -50
- agent/loops/holo.py +4 -2
- agent/loops/internvl.py +6 -11
- agent/loops/moondream3.py +36 -12
- agent/loops/omniparser.py +212 -209
- agent/loops/openai.py +49 -50
- agent/loops/opencua.py +29 -41
- agent/loops/qwen.py +475 -0
- agent/loops/uitars.py +237 -202
- agent/proxy/examples.py +54 -50
- agent/proxy/handlers.py +27 -34
- agent/responses.py +330 -330
- agent/types.py +11 -5
- agent/ui/__init__.py +1 -1
- agent/ui/__main__.py +1 -1
- agent/ui/gradio/app.py +23 -18
- agent/ui/gradio/ui_components.py +310 -161
- {cua_agent-0.4.34.dist-info → cua_agent-0.4.35.dist-info}/METADATA +18 -10
- cua_agent-0.4.35.dist-info/RECORD +64 -0
- cua_agent-0.4.34.dist-info/RECORD +0 -63
- {cua_agent-0.4.34.dist-info → cua_agent-0.4.35.dist-info}/WHEEL +0 -0
- {cua_agent-0.4.34.dist-info → cua_agent-0.4.35.dist-info}/entry_points.txt +0 -0
agent/loops/glm45v.py
CHANGED
@@ -4,33 +4,36 @@ Supports vision-language models for computer control with bounding box parsing.
 """
 
 import asyncio
-import json
 import base64
+import json
 import re
-from typing import Dict, List, Any, Optional, Tuple
 from io import BytesIO
-from PIL import Image
+from typing import Any, Dict, List, Optional, Tuple
+
 import litellm
+from litellm.responses.litellm_completion_transformation.transformation import (
+    LiteLLMCompletionResponsesConfig,
+)
 from litellm.types.utils import ModelResponse
-from litellm.responses.litellm_completion_transformation.transformation import LiteLLMCompletionResponsesConfig
+from PIL import Image
 
 from ..decorators import register_agent
-from ..types import Messages, AgentResponse, Tools, AgentCapability
 from ..loops.base import AsyncAgentConfig
 from ..responses import (
-    convert_responses_items_to_completion_messages,
     convert_completion_messages_to_responses_items,
-
-    make_output_text_item,
+    convert_responses_items_to_completion_messages,
     make_click_item,
     make_double_click_item,
     make_drag_item,
+    make_input_image_item,
     make_keypress_item,
+    make_output_text_item,
+    make_reasoning_item,
     make_scroll_item,
     make_type_item,
     make_wait_item,
-    make_input_image_item
 )
+from ..types import AgentCapability, AgentResponse, Messages, Tools
 
 # GLM-4.5V specific constants
 GLM_ACTION_SPACE = """
@@ -251,16 +254,18 @@ Call rule: `FAIL()`
 }
 }"""
 
+
 def encode_image_to_base64(image_path: str) -> str:
     """Encode image file to base64 string with data URI."""
     with open(image_path, "rb") as image_file:
         encoded_string = base64.b64encode(image_file.read()).decode("utf-8")
     return f"data:image/png;base64,{encoded_string}"
 
+
 def parse_glm_response(response: str) -> Dict[str, Any]:
     """
     Parse GLM-4.5V response to extract action and memory.
-
+
     The special tokens <|begin_of_box|> and <|end_of_box|> mark bounding boxes.
     Coordinates are normalized values between 0 and 1000.
     """
@@ -274,26 +279,23 @@ def parse_glm_response(response: str) -> Dict[str, Any]:
     action_pattern = r"[\w_]+\([^)]*\)"
     matches = re.findall(action_pattern, response)
     action = matches[0] if matches else None
-
+
     # Extract memory section
     memory_pattern = r"Memory:(.*?)$"
     memory_match = re.search(memory_pattern, response, re.DOTALL)
     memory = memory_match.group(1).strip() if memory_match else "[]"
-
+
     # Extract action text (everything before Memory:)
-    action_text_pattern = r'^(.*?)Memory:'
+    action_text_pattern = r"^(.*?)Memory:"
     action_text_match = re.search(action_text_pattern, response, re.DOTALL)
     action_text = action_text_match.group(1).strip() if action_text_match else response
-
+
     # Clean up action text by removing special tokens
     if action_text:
         action_text = action_text.replace("<|begin_of_box|>", "").replace("<|end_of_box|>", "")
-
-    return {
-        "action": action,
-        "action_text": action_text,
-        "memory": memory
-    }
+
+    return {"action": action, "action_text": action_text, "memory": memory}
+
 
 def get_last_image_from_messages(messages: Messages) -> Optional[str]:
     """Extract the last image from messages for processing."""
@@ -314,23 +316,28 @@ def get_last_image_from_messages(messages: Messages) -> Optional[str]:
                 image_url_obj = item.get("image_url", {})
                 if isinstance(image_url_obj, dict):
                     image_url = image_url_obj.get("url", "")
-                    if isinstance(image_url, str) and image_url.startswith("data:image/"):
+                    if isinstance(image_url, str) and image_url.startswith(
+                        "data:image/"
+                    ):
                         return image_url.split(",", 1)[1]
     return None
 
-def convert_responses_items_to_glm45v_pc_prompt(messages: Messages, task: str, memory: str = "") -> List[Dict[str, Any]]:
+
+def convert_responses_items_to_glm45v_pc_prompt(
+    messages: Messages, task: str, memory: str = ""
+) -> List[Dict[str, Any]]:
     """Convert responses items to GLM-4.5V PC prompt format with historical actions.
-
+
     Args:
         messages: List of message items from the conversation
         task: The task description
         memory: Current memory state
-
+
     Returns:
         List of content items for the prompt (text and image_url items)
     """
     action_space = GLM_ACTION_SPACE
-
+
     # Template head
     head_text = f"""You are a GUI Agent, and your primary task is to respond accurately to user requests or questions. In addition to directly answering the user's queries, you can also use tools or perform GUI operations directly until you fulfill the user's request or provide a correct answer. You should carefully read and understand the images and questions provided by the user, and engage in thinking and reflection when appropriate. The coordinates involved are all represented in thousandths (0-999).
 
@@ -345,7 +352,7 @@ Ubuntu
 
 # Historical Actions and Current Memory
 History:"""
-
+
     # Template tail
     tail_text = f"""
 Memory:
@@ -363,18 +370,18 @@ Memory:
 
 Current Screenshot:
 """
-
+
     # Build history from messages
     history = []
     history_images = []
-
+
    # Group messages into steps
     current_step = []
     step_num = 0
-
+
     for message in messages:
         msg_type = message.get("type")
-
+
         if msg_type == "reasoning":
             current_step.append(message)
         elif msg_type == "message" and message.get("role") == "assistant":
@@ -386,7 +393,7 @@ Current Screenshot:
             # End of step - process it
             if current_step:
                 step_num += 1
-
+
                 # Extract bot thought from message content
                 bot_thought = ""
                 for item in current_step:
@@ -397,14 +404,14 @@ Current Screenshot:
                             bot_thought = content_item.get("text", "")
                             break
                     break
-
+
                 # Extract action from computer_call
                 action_text = ""
                 for item in current_step:
                     if item.get("type") == "computer_call":
                         action = item.get("action", {})
                         action_type = action.get("type", "")
-
+
                         if action_type == "click":
                             x, y = action.get("x", 0), action.get("y", 0)
                             # Convert to 0-999 range (assuming screen dimensions)
@@ -436,7 +443,7 @@ Current Screenshot:
                         elif action_type == "wait":
                             action_text = "WAIT()"
                         break
-
+
                 # Extract screenshot from computer_call_output
                 screenshot_url = None
                 for item in current_step:
@@ -445,34 +452,34 @@ Current Screenshot:
                         if output.get("type") == "input_image":
                             screenshot_url = output.get("image_url", "")
                         break
-
+
                 # Store step info
                 step_info = {
                     "step_num": step_num,
                     "bot_thought": bot_thought,
                     "action_text": action_text,
-                    "screenshot_url": screenshot_url
+                    "screenshot_url": screenshot_url,
                 }
                 history.append(step_info)
-
+
                 # Store screenshot for last 4 steps
                 if screenshot_url:
                     history_images.append(screenshot_url)
-
+
                 current_step = []
-
+
     # Build content array with head, history, and tail
     content = []
     current_text = head_text
-
+
     total_history_steps = len(history)
     history_image_count = min(4, len(history_images))  # Last 4 images
-
+
     for step_idx, step_info in enumerate(history):
         step_num = step_info["step_num"]
         bot_thought = step_info["bot_thought"]
         action_text = step_info["action_text"]
-
+
         if step_idx < total_history_steps - history_image_count:
             # For steps beyond the last 4, use text placeholder
             current_text += f"\nstep {step_num}: Screenshot:(Omitted in context.) Thought: {bot_thought}\nAction: {action_text}"
@@ -480,20 +487,21 @@ Current Screenshot:
             # For the last 4 steps, insert images
             current_text += f"\nstep {step_num}: Screenshot:"
             content.append({"type": "text", "text": current_text})
-
+
             # Add image
             img_idx = step_idx - (total_history_steps - history_image_count)
             if img_idx < len(history_images):
                 content.append({"type": "image_url", "image_url": {"url": history_images[img_idx]}})
-
+
             current_text = f" Thought: {bot_thought}\nAction: {action_text}"
-
+
     # Add tail
     current_text += tail_text
     content.append({"type": "text", "text": current_text})
-
+
     return content
 
+
 def model_dump(obj) -> Dict[str, Any]:
     if isinstance(obj, dict):
         return {k: model_dump(v) for k, v in obj.items()}
@@ -502,58 +510,61 @@ def model_dump(obj) -> Dict[str, Any]:
     else:
         return obj
 
-def convert_glm_completion_to_responses_items(response: ModelResponse, image_width: int, image_height: int) -> List[Dict[str, Any]]:
+
+def convert_glm_completion_to_responses_items(
+    response: ModelResponse, image_width: int, image_height: int
+) -> List[Dict[str, Any]]:
     """
     Convert GLM-4.5V completion response to responses items format.
-
+
     Args:
         response: LiteLLM ModelResponse from GLM-4.5V
         image_width: Original image width for coordinate scaling
         image_height: Original image height for coordinate scaling
-
+
     Returns:
         List of response items in the proper format
     """
     import uuid
-
+
     response_items = []
-
+
     if not response.choices or not response.choices[0].message:
         return response_items
-
+
     message = response.choices[0].message
     content = message.content or ""
-    reasoning_content = getattr(message, 'reasoning_content', None)
-
+    reasoning_content = getattr(message, "reasoning_content", None)
+
     # Add reasoning item if present
     if reasoning_content:
         reasoning_item = model_dump(make_reasoning_item(reasoning_content))
         response_items.append(reasoning_item)
-
+
     # Parse the content to extract action and text
     parsed_response = parse_glm_response(content)
     action = parsed_response.get("action", "")
     action_text = parsed_response.get("action_text", "")
-
+
     # Add message item with text content (excluding action and memory)
     if action_text:
         # Remove action from action_text if it's there
         clean_text = action_text
         if action and action in clean_text:
             clean_text = clean_text.replace(action, "").strip()
-
+
         # Remove memory section
         memory_pattern = r"Memory:\s*\[.*?\]\s*$"
         clean_text = re.sub(memory_pattern, "", clean_text, flags=re.DOTALL).strip()
-
+
         if clean_text:
             message_item = model_dump(make_output_text_item(clean_text))
            response_items.append(message_item)
-
+
     # Convert action to computer call if present
     if action:
         call_id = f"call_{uuid.uuid4().hex[:8]}"
-
+
         # Parse different action types and create appropriate computer calls
         if action.startswith("left_click"):
             coord_match = re.search(r"start_box='?\[(\d+),\s*(\d+)\]'?", action)
@@ -566,7 +577,7 @@ def convert_glm_completion_to_responses_items(response: ModelResponse, image_wid
                 computer_call["call_id"] = call_id
                 computer_call["status"] = "completed"
                 response_items.append(computer_call)
-
+
         elif action.startswith("right_click"):
             coord_match = re.search(r"start_box='?\[(\d+),\s*(\d+)\]'?", action)
             if coord_match:
@@ -577,7 +588,7 @@ def convert_glm_completion_to_responses_items(response: ModelResponse, image_wid
                 computer_call["call_id"] = call_id
                 computer_call["status"] = "completed"
                 response_items.append(computer_call)
-
+
         elif action.startswith("left_double_click"):
             coord_match = re.search(r"start_box='?\[(\d+),\s*(\d+)\]'?", action)
             if coord_match:
@@ -588,7 +599,7 @@
                 computer_call["call_id"] = call_id
                 computer_call["status"] = "completed"
                 response_items.append(computer_call)
-
+
         elif action.startswith("left_drag"):
             start_match = re.search(r"start_box='?\[(\d+),\s*(\d+)\]'?", action)
             end_match = re.search(r"end_box='?\[(\d+),\s*(\d+)\]'?", action)
@@ -605,18 +616,18 @@
                 computer_call["call_id"] = call_id
                 computer_call["status"] = "completed"
                 response_items.append(computer_call)
-
+
         elif action.startswith("key"):
             key_match = re.search(r"keys='([^']+)'", action)
             if key_match:
                 keys = key_match.group(1)
                 # Split keys by '+' for key combinations, or use as single key
-                key_list = keys.split('+') if '+' in keys else [keys]
+                key_list = keys.split("+") if "+" in keys else [keys]
                 computer_call = model_dump(make_keypress_item(key_list))
                 computer_call["call_id"] = call_id
                 computer_call["status"] = "completed"
                 response_items.append(computer_call)
-
+
         elif action.startswith("type"):
             content_match = re.search(r"content='([^']*)'", action)
             if content_match:
@@ -625,7 +636,7 @@
                 computer_call["call_id"] = call_id
                 computer_call["status"] = "completed"
                 response_items.append(computer_call)
-
+
         elif action.startswith("scroll"):
             coord_match = re.search(r"start_box='?\[(\d+),\s*(\d+)\]'?", action)
             direction_match = re.search(r"direction='([^']+)'", action)
@@ -648,15 +659,16 @@
                 computer_call["call_id"] = call_id
                 computer_call["status"] = "completed"
                 response_items.append(computer_call)
-
+
         elif action == "WAIT()":
             computer_call = model_dump(make_wait_item())
             computer_call["call_id"] = call_id
             computer_call["status"] = "completed"
             response_items.append(computer_call)
-
+
     return response_items
 
+
 @register_agent(models=r"(?i).*GLM-4\.5V.*")
 class Glm4vConfig(AsyncAgentConfig):
     """GLM-4.5V agent configuration using liteLLM."""
@@ -674,11 +686,11 @@ class Glm4vConfig(AsyncAgentConfig):
         _on_api_end=None,
         _on_usage=None,
         _on_screenshot=None,
-        **kwargs
+        **kwargs,
     ) -> Dict[str, Any]:
         """
         Predict the next step using GLM-4.5V model.
-
+
         Args:
             messages: Input messages following Responses format
             model: Model name to use
@@ -691,7 +703,7 @@
            _on_api_end: Callback for API end
             _on_usage: Callback for usage tracking
             _on_screenshot: Callback for screenshot events
-
+
         Returns:
             Dict with "output" and "usage" keys
         """
@@ -708,7 +720,7 @@
                         user_instruction = item.get("text", "")
                         break
                 break
-
+
         # Get the last image for processing
         last_image_b64 = get_last_image_from_messages(messages)
         if not last_image_b64 and computer_handler:
@@ -718,35 +730,28 @@
                 last_image_b64 = screenshot_b64
                 if _on_screenshot:
                     await _on_screenshot(screenshot_b64)
-
+
         if not last_image_b64:
             raise ValueError("No image available for GLM-4.5V processing")
-
+
         # Convert responses items to GLM-4.5V PC prompt format with historical actions
         prompt_content = convert_responses_items_to_glm45v_pc_prompt(
             messages=messages,
             task=user_instruction,
-            memory="[]"  # Initialize with empty memory for now
+            memory="[]",  # Initialize with empty memory for now
         )
-
+
         # Add the current screenshot to the end
-        prompt_content.append(
-            "type": "image_url",
-
-
-
+        prompt_content.append(
+            {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{last_image_b64}"}}
+        )
+
         # Prepare messages for liteLLM
         litellm_messages = [
-            {
-                "role": "system",
-                "content": "You are a helpful GUI agent assistant."
-            },
-            {
-                "role": "user",
-                "content": prompt_content
-            }
+            {"role": "system", "content": "You are a helpful GUI agent assistant."},
+            {"role": "user", "content": prompt_content},
         ]
-
+
         # Prepare API call kwargs
         api_kwargs = {
             "model": model,
@@ -757,20 +762,20 @@
             #     "skip_special_tokens": False,
             # }
         }
-
+
         # Add API callbacks
         if _on_api_start:
             await _on_api_start(api_kwargs)
-
+
         # Call liteLLM
         response = await litellm.acompletion(**api_kwargs)
-
+
         if _on_api_end:
             await _on_api_end(api_kwargs, response)
-
+
        # Get image dimensions for coordinate scaling
         image_width, image_height = 1920, 1080  # Default dimensions
-
+
         # Try to get actual dimensions from the image
         try:
             image_data = base64.b64decode(last_image_b64)
@@ -778,41 +783,38 @@
             image_width, image_height = image.size
         except Exception:
             pass  # Use default dimensions
-
+
         # Convert GLM completion response to responses items
-        response_items = convert_glm_completion_to_responses_items(response, image_width, image_height)
-
+        response_items = convert_glm_completion_to_responses_items(
+            response, image_width, image_height
+        )
+
         # Extract usage information
         response_usage = {
-            **LiteLLMCompletionResponsesConfig._transform_chat_completion_usage_to_responses_usage(response.usage).model_dump(),
+            **LiteLLMCompletionResponsesConfig._transform_chat_completion_usage_to_responses_usage(
+                response.usage
+            ).model_dump(),
             "response_cost": response._hidden_params.get("response_cost", 0.0),
         }
         if _on_usage:
             await _on_usage(response_usage)
-
+
         # Create agent response
-        agent_response = {
-            "output": response_items,
-            "usage": response_usage
-        }
-
+        agent_response = {"output": response_items, "usage": response_usage}
+
         return agent_response
 
     async def predict_click(
-        self,
-        model: str,
-        image_b64: str,
-        instruction: str,
-        **kwargs
+        self, model: str, image_b64: str, instruction: str, **kwargs
     ) -> Optional[Tuple[int, int]]:
         """
         Predict click coordinates using GLM-4.5V model.
-
+
         Args:
            model: Model name to use
            image_b64: Base64 encoded image
            instruction: Instruction for where to click
-
+
         Returns:
            Tuple with (x, y) coordinates or None
        """
@@ -824,22 +826,22 @@ Respond with a single click action in this format:
 left_click(start_box='[x,y]')
 
 Where x,y are coordinates normalized to 0-999 range."""
-
+
             # Prepare messages for liteLLM
             litellm_messages = [
-                {
-                    "role": "system",
-                    "content": "You are a helpful GUI agent assistant."
-                },
+                {"role": "system", "content": "You are a helpful GUI agent assistant."},
                 {
                     "role": "user",
                     "content": [
                         {"type": "text", "text": click_prompt},
-                        {
-
-
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": f"data:image/png;base64,{image_b64}"},
+                        },
+                    ],
+                },
             ]
-
+
             # Prepare API call kwargs
             api_kwargs = {
                 "model": model,
@@ -848,21 +850,21 @@ Where x,y are coordinates normalized to 0-999 range."""
                 "temperature": 0.001,
                 "extra_body": {
                     "skip_special_tokens": False,
-                }
+                },
             }
-
+
             # Call liteLLM
            response = await litellm.acompletion(**api_kwargs)
-
+
            # Extract response content
            response_content = response.choices[0].message.content.strip()
            print(response)
-
+
            # Parse response for click coordinates
            # Look for coordinates in the response, handling special tokens
            coord_pattern = r"<\|begin_of_box\|>.*?left_click\(start_box='?\[(\d+),(\d+)\]'?\).*?<\|end_of_box\|>"
            match = re.search(coord_pattern, response_content)
-
+
            if not match:
                # Fallback: look for coordinates without special tokens
                coord_pattern = r"left_click\(start_box='?\[(\d+),(\d+)\]'?\)"
@@ -870,7 +872,7 @@
 
            if match:
                x, y = int(match.group(1)), int(match.group(2))
-
+
                # Get actual image dimensions for scaling
                try:
                    image_data = base64.b64decode(image_b64)
@@ -879,15 +881,15 @@
                except Exception:
                    # Use default dimensions
                    image_width, image_height = 1920, 1080
-
+
                # Convert from 0-999 normalized coordinates to actual pixel coordinates
                actual_x = int((x / 999.0) * image_width)
                actual_y = int((y / 999.0) * image_height)
-
+
                return (actual_x, actual_y)
-
+
            return None
-
+
        except Exception as e:
            # Log error and return None
            print(f"Error in predict_click: {e}")
@@ -896,7 +898,7 @@
     def get_capabilities(self) -> List[AgentCapability]:
         """
         Get list of capabilities supported by this agent config.
-
+
         Returns:
             List of capability strings
         """