cua-agent 0.4.10__py3-none-any.whl → 0.4.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of cua-agent might be problematic.

agent/loops/glm45v.py ADDED
@@ -0,0 +1,902 @@
+ """
+ GLM-4.5V agent loop implementation using liteLLM.
+ Supports vision-language models for computer control with bounding box parsing.
+ """
+
+ import asyncio
+ import json
+ import base64
+ import re
+ from typing import Dict, List, Any, Optional, Tuple
+ from io import BytesIO
+ from PIL import Image
+ import litellm
+ from litellm.types.utils import ModelResponse
+ from litellm.responses.litellm_completion_transformation.transformation import LiteLLMCompletionResponsesConfig
+
+ from ..decorators import register_agent
+ from ..types import Messages, AgentResponse, Tools, AgentCapability
+ from ..loops.base import AsyncAgentConfig
+ from ..responses import (
+     convert_responses_items_to_completion_messages,
+     convert_completion_messages_to_responses_items,
+     make_reasoning_item,
+     make_output_text_item,
+     make_click_item,
+     make_double_click_item,
+     make_drag_item,
+     make_keypress_item,
+     make_scroll_item,
+     make_type_item,
+     make_wait_item,
+     make_input_image_item
+ )
+
+ # GLM-4.5V specific constants
+ GLM_ACTION_SPACE = """
+ ### {left,right,middle}_click
+
+ Call rule: `{left,right,middle}_click(start_box='[x,y]', element_info='')`
+ {
+     'name': ['left_click', 'right_click', 'middle_click'],
+     'description': 'Perform a left/right/middle mouse click at the specified coordinates on the screen.',
+     'parameters': {
+         'type': 'object',
+         'properties': {
+             'start_box': {
+                 'type': 'array',
+                 'items': {
+                     'type': 'integer'
+                 },
+                 'description': 'Coordinates [x,y] where to perform the click, normalized to 0-999 range.'
+             },
+             'element_info': {
+                 'type': 'string',
+                 'description': 'Optional text description of the UI element being clicked.'
+             }
+         },
+         'required': ['start_box']
+     }
+ }
+
+ ### hover
+
+ Call rule: `hover(start_box='[x,y]', element_info='')`
+ {
+     'name': 'hover',
+     'description': 'Move the mouse pointer to the specified coordinates without performing any click action.',
+     'parameters': {
+         'type': 'object',
+         'properties': {
+             'start_box': {
+                 'type': 'array',
+                 'items': {
+                     'type': 'integer'
+                 },
+                 'description': 'Coordinates [x,y] where to move the mouse pointer, normalized to 0-999 range.'
+             },
+             'element_info': {
+                 'type': 'string',
+                 'description': 'Optional text description of the UI element being hovered over.'
+             }
+         },
+         'required': ['start_box']
+     }
+ }
+
+ ### left_double_click
+
+ Call rule: `left_double_click(start_box='[x,y]', element_info='')`
+ {
+     'name': 'left_double_click',
+     'description': 'Perform a left mouse double-click at the specified coordinates on the screen.',
+     'parameters': {
+         'type': 'object',
+         'properties': {
+             'start_box': {
+                 'type': 'array',
+                 'items': {
+                     'type': 'integer'
+                 },
+                 'description': 'Coordinates [x,y] where to perform the double-click, normalized to 0-999 range.'
+             },
+             'element_info': {
+                 'type': 'string',
+                 'description': 'Optional text description of the UI element being double-clicked.'
+             }
+         },
+         'required': ['start_box']
+     }
+ }
+
+ ### left_drag
+
+ Call rule: `left_drag(start_box='[x1,y1]', end_box='[x2,y2]', element_info='')`
+ {
+     'name': 'left_drag',
+     'description': 'Drag the mouse from starting coordinates to ending coordinates while holding the left mouse button.',
+     'parameters': {
+         'type': 'object',
+         'properties': {
+             'start_box': {
+                 'type': 'array',
+                 'items': {
+                     'type': 'integer'
+                 },
+                 'description': 'Starting coordinates [x1,y1] for the drag operation, normalized to 0-999 range.'
+             },
+             'end_box': {
+                 'type': 'array',
+                 'items': {
+                     'type': 'integer'
+                 },
+                 'description': 'Ending coordinates [x2,y2] for the drag operation, normalized to 0-999 range.'
+             },
+             'element_info': {
+                 'type': 'string',
+                 'description': 'Optional text description of the UI element being dragged.'
+             }
+         },
+         'required': ['start_box', 'end_box']
+     }
+ }
+
+ ### key
+
+ Call rule: `key(keys='')`
+ {
+     'name': 'key',
+     'description': 'Simulate pressing a single key or combination of keys on the keyboard.',
+     'parameters': {
+         'type': 'object',
+         'properties': {
+             'keys': {
+                 'type': 'string',
+                 'description': 'The key or key combination to press. Use '+' to separate keys in combinations (e.g., 'ctrl+c', 'alt+tab').'
+             }
+         },
+         'required': ['keys']
+     }
+ }
+
+ ### type
+
+ Call rule: `type(content='')`
+ {
+     'name': 'type',
+     'description': 'Type text content into the currently focused text input field. This action only performs typing and does not handle field activation or clearing.',
+     'parameters': {
+         'type': 'object',
+         'properties': {
+             'content': {
+                 'type': 'string',
+                 'description': 'The text content to be typed into the active text field.'
+             }
+         },
+         'required': ['content']
+     }
+ }
+
+ ### scroll
+
+ Call rule: `scroll(start_box='[x,y]', direction='', step=5, element_info='')`
+ {
+     'name': 'scroll',
+     'description': 'Scroll an element at the specified coordinates in the specified direction by a given number of wheel steps.',
+     'parameters': {
+         'type': 'object',
+         'properties': {
+             'start_box': {
+                 'type': 'array',
+                 'items': {
+                     'type': 'integer'
+                 },
+                 'description': 'Coordinates [x,y] of the element or area to scroll, normalized to 0-999 range.'
+             },
+             'direction': {
+                 'type': 'string',
+                 'enum': ['down', 'up'],
+                 'description': 'The direction to scroll: 'down' or 'up'.'
+             },
+             'step': {
+                 'type': 'integer',
+                 'default': 5,
+                 'description': 'Number of wheel steps to scroll, default is 5.'
+             },
+             'element_info': {
+                 'type': 'string',
+                 'description': 'Optional text description of the UI element being scrolled.'
+             }
+         },
+         'required': ['start_box', 'direction']
+     }
+ }
+
+ ### WAIT
+
+ Call rule: `WAIT()`
+ {
+     'name': 'WAIT',
+     'description': 'Wait for 5 seconds before proceeding to the next action.',
+     'parameters': {
+         'type': 'object',
+         'properties': {},
+         'required': []
+     }
+ }
+
+ ### DONE
+
+ Call rule: `DONE()`
+ {
+     'name': 'DONE',
+     'description': 'Indicate that the current task has been completed successfully and no further actions are needed.',
+     'parameters': {
+         'type': 'object',
+         'properties': {},
+         'required': []
+     }
+ }
+
+ ### FAIL
+
+ Call rule: `FAIL()`
+ {
+     'name': 'FAIL',
+     'description': 'Indicate that the current task cannot be completed or is impossible to accomplish.',
+     'parameters': {
+         'type': 'object',
+         'properties': {},
+         'required': []
+     }
+ }"""
+
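In practice the model answers with free-form reasoning, one boxed action drawn from this space, and a trailing memory block. A made-up example of the expected shape (the task and coordinates are illustrative):

    The Save button is in the toolbar, so I will click it.
    <|begin_of_box|>left_click(start_box='[512,80]', element_info='Save button')<|end_of_box|>
    Memory:
    [{"saved_file": "report.odt"}]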
+ def encode_image_to_base64(image_path: str) -> str:
+     """Encode image file to base64 string with data URI."""
+     with open(image_path, "rb") as image_file:
+         encoded_string = base64.b64encode(image_file.read()).decode("utf-8")
+     return f"data:image/png;base64,{encoded_string}"
+
+ def parse_glm_response(response: str) -> Dict[str, Any]:
+     """
+     Parse GLM-4.5V response to extract action and memory.
+
+     The special tokens <|begin_of_box|> and <|end_of_box|> mark bounding boxes.
+     Coordinates are normalized values in the 0-999 range.
+     """
+     # Extract action from between special tokens
+     pattern = r"<\|begin_of_box\|>(.*?)<\|end_of_box\|>"
+     match = re.search(pattern, response)
+     if match:
+         action = match.group(1).strip()
+     else:
+         # Fallback: look for function call patterns
+         action_pattern = r"[\w_]+\([^)]*\)"
+         matches = re.findall(action_pattern, response)
+         action = matches[0] if matches else None
+
+     # Extract memory section
+     memory_pattern = r"Memory:(.*?)$"
+     memory_match = re.search(memory_pattern, response, re.DOTALL)
+     memory = memory_match.group(1).strip() if memory_match else "[]"
+
+     # Extract action text (everything before "Memory:")
+     action_text_pattern = r'^(.*?)Memory:'
+     action_text_match = re.search(action_text_pattern, response, re.DOTALL)
+     action_text = action_text_match.group(1).strip() if action_text_match else response
+
+     # Clean up action text by removing special tokens
+     if action_text:
+         action_text = action_text.replace("<|begin_of_box|>", "").replace("<|end_of_box|>", "")
+
+     return {
+         "action": action,
+         "action_text": action_text,
+         "memory": memory
+     }
+
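For a concrete sense of the contract, here is what parse_glm_response yields for a typical reply (the reply text is a made-up example; note that action_text keeps the de-tokenized action string, since only the special tokens are stripped):

    >>> parse_glm_response(
    ...     "I need to open the terminal.\n"
    ...     "<|begin_of_box|>left_click(start_box='[512,980]')<|end_of_box|>\n"
    ...     "Memory:\n[]"
    ... )
    {'action': "left_click(start_box='[512,980]')",
     'action_text': "I need to open the terminal.\nleft_click(start_box='[512,980]')",
     'memory': '[]'}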
+ def get_last_image_from_messages(messages: Messages) -> Optional[str]:
+     """Extract the last image from messages for processing."""
+     for message in reversed(messages):
+         if isinstance(message, dict):
+             if message.get("type") == "computer_call_output":
+                 output = message.get("output", {})
+                 if isinstance(output, dict) and output.get("type") == "input_image":
+                     image_url = output.get("image_url", "")
+                     if isinstance(image_url, str) and image_url.startswith("data:image/"):
+                         # Extract base64 part
+                         return image_url.split(",", 1)[1]
+             elif message.get("role") == "user":
+                 content = message.get("content", [])
+                 if isinstance(content, list):
+                     for item in reversed(content):
+                         if isinstance(item, dict) and item.get("type") == "image_url":
+                             image_url_obj = item.get("image_url", {})
+                             if isinstance(image_url_obj, dict):
+                                 image_url = image_url_obj.get("url", "")
+                                 if isinstance(image_url, str) and image_url.startswith("data:image/"):
+                                     return image_url.split(",", 1)[1]
+     return None
+
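The helper recognizes two message shapes, a Responses-style computer_call_output item and a chat-style user message with image parts; roughly (illustrative payloads, base64 truncated):

    {"type": "computer_call_output",
     "output": {"type": "input_image", "image_url": "data:image/png;base64,iVBORw0..."}}

    {"role": "user",
     "content": [{"type": "image_url",
                  "image_url": {"url": "data:image/png;base64,iVBORw0..."}}]}

In both cases only the base64 payload after the comma is returned.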
+ def convert_responses_items_to_glm45v_pc_prompt(messages: Messages, task: str, memory: str = "") -> List[Dict[str, Any]]:
+     """Convert responses items to GLM-4.5V PC prompt format with historical actions.
+
+     Args:
+         messages: List of message items from the conversation
+         task: The task description
+         memory: Current memory state
+
+     Returns:
+         List of content items for the prompt (text and image_url items)
+     """
+     action_space = GLM_ACTION_SPACE
+
+     # Template head
+     head_text = f"""You are a GUI Agent, and your primary task is to respond accurately to user requests or questions. In addition to directly answering the user's queries, you can also use tools or perform GUI operations directly until you fulfill the user's request or provide a correct answer. You should carefully read and understand the images and questions provided by the user, and engage in thinking and reflection when appropriate. The coordinates involved are all represented in thousandths (0-999).
+
+ # Task:
+ {task}
+
+ # Task Platform
+ Ubuntu
+
+ # Action Space
+ {action_space}
+
+ # Historical Actions and Current Memory
+ History:"""
+
+     # Template tail
+     tail_text = f"""
+ Memory:
+ {memory}
+ # Output Format
+ Plain text explanation with action(param='...')
+ Memory:
+ [{{"key": "value"}}, ...]
+
+ # Some Additional Notes
+ - I'll give you the most recent 4 history screenshots (shrunk to 50%*50%) along with the historical action steps.
+ - You should put the key information you *have to remember* in a separate memory part and I'll give it to you in the next round. The content in this part should be a dict list. If you no longer need some given information, you should remove it from the memory. Even if you don't need to remember anything, you should also output an empty list.
+ - My computer's password is "password", feel free to use it when you need sudo rights.
+ - For the thunderbird account "anonym-x2024@outlook.com", the password is "gTCI";=@y7|QJ0nDa_kN3Sb&>".
+
+ Current Screenshot:
+ """
+
+     # Build history from messages
+     history = []
+     history_images = []
+
+     # Group messages into steps
+     current_step = []
+     step_num = 0
+
+     for message in messages:
+         msg_type = message.get("type")
+
+         if msg_type == "reasoning":
+             current_step.append(message)
+         elif msg_type == "message" and message.get("role") == "assistant":
+             current_step.append(message)
+         elif msg_type == "computer_call":
+             current_step.append(message)
+         elif msg_type == "computer_call_output":
+             current_step.append(message)
+             # End of step - process it
+             if current_step:
+                 step_num += 1
+
+                 # Extract bot thought from message content
+                 bot_thought = ""
+                 for item in current_step:
+                     if item.get("type") == "message" and item.get("role") == "assistant":
+                         content = item.get("content", [])
+                         for content_item in content:
+                             if content_item.get("type") == "output_text":
+                                 bot_thought = content_item.get("text", "")
+                                 break
+                         break
+
+                 # Extract action from computer_call
+                 action_text = ""
+                 for item in current_step:
+                     if item.get("type") == "computer_call":
+                         action = item.get("action", {})
+                         action_type = action.get("type", "")
+
+                         if action_type == "click":
+                             x, y = action.get("x", 0), action.get("y", 0)
+                             # Convert to 0-999 range (assuming screen dimensions)
+                             # For now, use direct coordinates - this may need adjustment
+                             action_text = f"left_click(start_box='[{x},{y}]')"
+                         elif action_type == "double_click":
+                             x, y = action.get("x", 0), action.get("y", 0)
+                             action_text = f"left_double_click(start_box='[{x},{y}]')"
+                         elif action_type == "right_click":
+                             x, y = action.get("x", 0), action.get("y", 0)
+                             action_text = f"right_click(start_box='[{x},{y}]')"
+                         elif action_type == "drag":
+                             # Handle drag with path
+                             path = action.get("path", [])
+                             if len(path) >= 2:
+                                 start = path[0]
+                                 end = path[-1]
+                                 action_text = f"left_drag(start_box='[{start.get('x', 0)},{start.get('y', 0)}]', end_box='[{end.get('x', 0)},{end.get('y', 0)}]')"
+                         elif action_type == "keypress":
+                             key = action.get("key", "")
+                             action_text = f"key(keys='{key}')"
+                         elif action_type == "type":
+                             text = action.get("text", "")
+                             action_text = f"type(content='{text}')"
+                         elif action_type == "scroll":
+                             x, y = action.get("x", 0), action.get("y", 0)
+                             direction = action.get("direction", "down")
+                             action_text = f"scroll(start_box='[{x},{y}]', direction='{direction}')"
+                         elif action_type == "wait":
+                             action_text = "WAIT()"
+                         break
+
+                 # Extract screenshot from computer_call_output
+                 screenshot_url = None
+                 for item in current_step:
+                     if item.get("type") == "computer_call_output":
+                         output = item.get("output", {})
+                         if output.get("type") == "input_image":
+                             screenshot_url = output.get("image_url", "")
+                             break
+
+                 # Store step info
+                 step_info = {
+                     "step_num": step_num,
+                     "bot_thought": bot_thought,
+                     "action_text": action_text,
+                     "screenshot_url": screenshot_url
+                 }
+                 history.append(step_info)
+
+                 # Collect screenshots (only the last 4 are inlined as images)
+                 if screenshot_url:
+                     history_images.append(screenshot_url)
+
+                 current_step = []
+
+     # Build content array with head, history, and tail
+     content = []
+     current_text = head_text
+
+     total_history_steps = len(history)
+     history_image_count = min(4, len(history_images))  # Last 4 images
+
+     for step_idx, step_info in enumerate(history):
+         step_num = step_info["step_num"]
+         bot_thought = step_info["bot_thought"]
+         action_text = step_info["action_text"]
+
+         if step_idx < total_history_steps - history_image_count:
+             # For steps beyond the last 4, use a text placeholder
+             current_text += f"\nstep {step_num}: Screenshot:(Omitted in context.) Thought: {bot_thought}\nAction: {action_text}"
+         else:
+             # For the last 4 steps, insert images
+             current_text += f"\nstep {step_num}: Screenshot:"
+             content.append({"type": "text", "text": current_text})
+
+             # Add image from the most recent screenshots (history_images may
+             # hold more than history_image_count entries)
+             img_idx = step_idx - (total_history_steps - history_image_count)
+             recent_images = history_images[-history_image_count:]
+             if img_idx < len(recent_images):
+                 content.append({"type": "image_url", "image_url": {"url": recent_images[img_idx]}})
+
+             current_text = f" Thought: {bot_thought}\nAction: {action_text}"
+
+     # Add tail
+     current_text += tail_text
+     content.append({"type": "text", "text": current_text})
+
+     return content
+
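The returned list interleaves text segments with the inlined history screenshots, so the final multimodal user message looks roughly like this (illustrative, heavily truncated):

    [
        {"type": "text", "text": "You are a GUI Agent... History:\nstep 1: Screenshot:"},
        {"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}},
        {"type": "text", "text": " Thought: ...\nAction: left_click(start_box='[512,980]')\n\nMemory:\n[]\n... Current Screenshot:\n"},
    ]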
+ def model_dump(obj) -> Dict[str, Any]:
+     """Recursively convert pydantic models (and dicts/lists of them) to plain dicts."""
+     if isinstance(obj, dict):
+         return {k: model_dump(v) for k, v in obj.items()}
+     elif isinstance(obj, list):
+         return [model_dump(v) for v in obj]
+     elif hasattr(obj, "model_dump"):
+         return obj.model_dump()
+     else:
+         return obj
+
+ def convert_glm_completion_to_responses_items(response: ModelResponse, image_width: int, image_height: int) -> List[Dict[str, Any]]:
+     """
+     Convert GLM-4.5V completion response to responses items format.
+
+     Args:
+         response: LiteLLM ModelResponse from GLM-4.5V
+         image_width: Original image width for coordinate scaling
+         image_height: Original image height for coordinate scaling
+
+     Returns:
+         List of response items in the proper format
+     """
+     import uuid
+
+     response_items = []
+
+     if not response.choices or not response.choices[0].message:
+         return response_items
+
+     message = response.choices[0].message
+     content = message.content or ""
+     reasoning_content = getattr(message, 'reasoning_content', None)
+
+     # Add reasoning item if present
+     if reasoning_content:
+         reasoning_item = model_dump(make_reasoning_item(reasoning_content))
+         response_items.append(reasoning_item)
+
+     # Parse the content to extract action and text
+     parsed_response = parse_glm_response(content)
+     action = parsed_response.get("action", "")
+     action_text = parsed_response.get("action_text", "")
+
+     # Add message item with text content (excluding action and memory)
+     if action_text:
+         # Remove action from action_text if it's there
+         clean_text = action_text
+         if action and action in clean_text:
+             clean_text = clean_text.replace(action, "").strip()
+
+         # Remove memory section
+         memory_pattern = r"Memory:\s*\[.*?\]\s*$"
+         clean_text = re.sub(memory_pattern, "", clean_text, flags=re.DOTALL).strip()
+
+         if clean_text:
+             message_item = model_dump(make_output_text_item(clean_text))
+             response_items.append(message_item)
+
+     # Convert action to computer call if present
+     if action:
+         call_id = f"call_{uuid.uuid4().hex[:8]}"
+
+         # Parse different action types and create appropriate computer calls
+         if action.startswith("left_click"):
+             coord_match = re.search(r"start_box='?\[(\d+),\s*(\d+)\]'?", action)
+             if coord_match:
+                 x, y = int(coord_match.group(1)), int(coord_match.group(2))
+                 # Convert from 0-999 to actual pixel coordinates
+                 actual_x = int((x / 999.0) * image_width)
+                 actual_y = int((y / 999.0) * image_height)
+                 computer_call = model_dump(make_click_item(actual_x, actual_y))
+                 computer_call["call_id"] = call_id
+                 computer_call["status"] = "completed"
+                 response_items.append(computer_call)
+
+         elif action.startswith("right_click"):
+             coord_match = re.search(r"start_box='?\[(\d+),\s*(\d+)\]'?", action)
+             if coord_match:
+                 x, y = int(coord_match.group(1)), int(coord_match.group(2))
+                 actual_x = int((x / 999.0) * image_width)
+                 actual_y = int((y / 999.0) * image_height)
+                 computer_call = model_dump(make_click_item(actual_x, actual_y, button="right"))
+                 computer_call["call_id"] = call_id
+                 computer_call["status"] = "completed"
+                 response_items.append(computer_call)
+
+         elif action.startswith("left_double_click"):
+             coord_match = re.search(r"start_box='?\[(\d+),\s*(\d+)\]'?", action)
+             if coord_match:
+                 x, y = int(coord_match.group(1)), int(coord_match.group(2))
+                 actual_x = int((x / 999.0) * image_width)
+                 actual_y = int((y / 999.0) * image_height)
+                 computer_call = model_dump(make_double_click_item(actual_x, actual_y))
+                 computer_call["call_id"] = call_id
+                 computer_call["status"] = "completed"
+                 response_items.append(computer_call)
+
+         elif action.startswith("left_drag"):
+             start_match = re.search(r"start_box='?\[(\d+),\s*(\d+)\]'?", action)
+             end_match = re.search(r"end_box='?\[(\d+),\s*(\d+)\]'?", action)
+             if start_match and end_match:
+                 x1, y1 = int(start_match.group(1)), int(start_match.group(2))
+                 x2, y2 = int(end_match.group(1)), int(end_match.group(2))
+                 actual_x1 = int((x1 / 999.0) * image_width)
+                 actual_y1 = int((y1 / 999.0) * image_height)
+                 actual_x2 = int((x2 / 999.0) * image_width)
+                 actual_y2 = int((y2 / 999.0) * image_height)
+                 # Create path for drag operation
+                 drag_path = [{"x": actual_x1, "y": actual_y1}, {"x": actual_x2, "y": actual_y2}]
+                 computer_call = model_dump(make_drag_item(drag_path))
+                 computer_call["call_id"] = call_id
+                 computer_call["status"] = "completed"
+                 response_items.append(computer_call)
+
+         elif action.startswith("key"):
+             key_match = re.search(r"keys='([^']+)'", action)
+             if key_match:
+                 keys = key_match.group(1)
+                 # Split keys by '+' for key combinations, or use as single key
+                 key_list = keys.split('+') if '+' in keys else [keys]
+                 computer_call = model_dump(make_keypress_item(key_list))
+                 computer_call["call_id"] = call_id
+                 computer_call["status"] = "completed"
+                 response_items.append(computer_call)
+
+         elif action.startswith("type"):
+             content_match = re.search(r"content='([^']*)'", action)
+             if content_match:
+                 content = content_match.group(1)
+                 computer_call = model_dump(make_type_item(content))
+                 computer_call["call_id"] = call_id
+                 computer_call["status"] = "completed"
+                 response_items.append(computer_call)
+
+         elif action.startswith("scroll"):
+             coord_match = re.search(r"start_box='?\[(\d+),\s*(\d+)\]'?", action)
+             direction_match = re.search(r"direction='([^']+)'", action)
+             if coord_match and direction_match:
+                 x, y = int(coord_match.group(1)), int(coord_match.group(2))
+                 direction = direction_match.group(1)
+                 actual_x = int((x / 999.0) * image_width)
+                 actual_y = int((y / 999.0) * image_height)
+                 # Convert direction to scroll amounts
+                 scroll_x, scroll_y = 0, 0
+                 if direction == "up":
+                     scroll_y = -5
+                 elif direction == "down":
+                     scroll_y = 5
+                 elif direction == "left":
+                     scroll_x = -5
+                 elif direction == "right":
+                     scroll_x = 5
+                 computer_call = model_dump(make_scroll_item(actual_x, actual_y, scroll_x, scroll_y))
+                 computer_call["call_id"] = call_id
+                 computer_call["status"] = "completed"
+                 response_items.append(computer_call)
+
+         elif action == "WAIT()":
+             computer_call = model_dump(make_wait_item())
+             computer_call["call_id"] = call_id
+             computer_call["status"] = "completed"
+             response_items.append(computer_call)
+
+     return response_items
+
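Coordinate scaling here is the inverse of the prompt's thousandths convention; for example, on a 1920x1080 screenshot (numbers are illustrative):

    x_norm, y_norm = 512, 980                # model output on the 0-999 grid
    actual_x = int((x_norm / 999.0) * 1920)  # -> 984
    actual_y = int((y_norm / 999.0) * 1080)  # -> 1059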
+ @register_agent(models=r"(?i).*GLM-4\.5V.*")
+ class Glm4vConfig(AsyncAgentConfig):
+     """GLM-4.5V agent configuration using liteLLM."""
+
+     async def predict_step(
+         self,
+         messages: List[Dict[str, Any]],
+         model: str,
+         tools: Optional[List[Dict[str, Any]]] = None,
+         max_retries: Optional[int] = None,
+         stream: bool = False,
+         computer_handler=None,
+         use_prompt_caching: Optional[bool] = False,
+         _on_api_start=None,
+         _on_api_end=None,
+         _on_usage=None,
+         _on_screenshot=None,
+         **kwargs
+     ) -> Dict[str, Any]:
+         """
+         Predict the next step using GLM-4.5V model.
+
+         Args:
+             messages: Input messages following Responses format
+             model: Model name to use
+             tools: Optional list of tool schemas
+             max_retries: Maximum number of retries for API calls
+             stream: Whether to stream the response
+             computer_handler: Computer handler for taking screenshots
+             use_prompt_caching: Whether to use prompt caching
+             _on_api_start: Callback for API start
+             _on_api_end: Callback for API end
+             _on_usage: Callback for usage tracking
+             _on_screenshot: Callback for screenshot events
+
+         Returns:
+             Dict with "output" and "usage" keys
+         """
+         # Get the user instruction from the last user message
+         user_instruction = ""
+         for message in reversed(messages):
+             if isinstance(message, dict) and message.get("role") == "user":
+                 content = message.get("content", "")
+                 if isinstance(content, str):
+                     user_instruction = content
+                 elif isinstance(content, list):
+                     for item in content:
+                         if isinstance(item, dict) and item.get("type") == "text":
+                             user_instruction = item.get("text", "")
+                             break
+                 break
+
+         # Get the last image for processing
+         last_image_b64 = get_last_image_from_messages(messages)
+         if not last_image_b64 and computer_handler:
+             # Take a screenshot if no image available
+             screenshot_b64 = await computer_handler.screenshot()
+             if screenshot_b64:
+                 last_image_b64 = screenshot_b64
+                 if _on_screenshot:
+                     await _on_screenshot(screenshot_b64)
+
+         if not last_image_b64:
+             raise ValueError("No image available for GLM-4.5V processing")
+
+         # Convert responses items to GLM-4.5V PC prompt format with historical actions
+         prompt_content = convert_responses_items_to_glm45v_pc_prompt(
+             messages=messages,
+             task=user_instruction,
+             memory="[]"  # Initialize with empty memory for now
+         )
+
+         # Add the current screenshot to the end
+         prompt_content.append({
+             "type": "image_url",
+             "image_url": {"url": f"data:image/png;base64,{last_image_b64}"}
+         })
+
+         # Prepare messages for liteLLM
+         litellm_messages = [
+             {
+                 "role": "system",
+                 "content": "You are a helpful GUI agent assistant."
+             },
+             {
+                 "role": "user",
+                 "content": prompt_content
+             }
+         ]
+
+         # Prepare API call kwargs
+         api_kwargs = {
+             "model": model,
+             "messages": litellm_messages,
+             # "max_tokens": 2048,
+             # "temperature": 0.001,
+             # "extra_body": {
+             #     "skip_special_tokens": False,
+             # }
+         }
+
+         # Add API callbacks
+         if _on_api_start:
+             await _on_api_start(api_kwargs)
+
+         # Call liteLLM
+         response = await litellm.acompletion(**api_kwargs)
+
+         if _on_api_end:
+             await _on_api_end(api_kwargs, response)
+
+         # Get image dimensions for coordinate scaling
+         image_width, image_height = 1920, 1080  # Default dimensions
+
+         # Try to get actual dimensions from the image
+         try:
+             image_data = base64.b64decode(last_image_b64)
+             image = Image.open(BytesIO(image_data))
+             image_width, image_height = image.size
+         except Exception:
+             pass  # Use default dimensions
+
+         # Convert GLM completion response to responses items
+         response_items = convert_glm_completion_to_responses_items(response, image_width, image_height)
+
+         # Extract usage information
+         response_usage = {
+             **LiteLLMCompletionResponsesConfig._transform_chat_completion_usage_to_responses_usage(response.usage).model_dump(),
+             "response_cost": response._hidden_params.get("response_cost", 0.0),
+         }
+         if _on_usage:
+             await _on_usage(response_usage)
+
+         # Create agent response
+         agent_response = {
+             "output": response_items,
+             "usage": response_usage
+         }
+
+         return agent_response
+
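Downstream, the returned dict carries the parsed Responses items plus usage; roughly (shape only, with hypothetical caller-side names `history` and `computer`):

    result = await Glm4vConfig().predict_step(messages=history, model=model, computer_handler=computer)
    # result["output"] -> e.g. [{"type": "reasoning", ...},
    #                           {"type": "message", ...},
    #                           {"type": "computer_call", ...}]
    # result["usage"]  -> token counts plus "response_cost"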
+     async def predict_click(
+         self,
+         model: str,
+         image_b64: str,
+         instruction: str,
+         **kwargs
+     ) -> Optional[Tuple[int, int]]:
+         """
+         Predict click coordinates using GLM-4.5V model.
+
+         Args:
+             model: Model name to use
+             image_b64: Base64 encoded image
+             instruction: Instruction for where to click
+
+         Returns:
+             Tuple with (x, y) coordinates or None
+         """
+         try:
+             # Create a simple click instruction prompt
+             click_prompt = f"""You are a GUI agent. Look at the screenshot and identify where to click for: {instruction}
+
+ Respond with a single click action in this format:
+ left_click(start_box='[x,y]')
+
+ Where x,y are coordinates normalized to 0-999 range."""
+
+             # Prepare messages for liteLLM
+             litellm_messages = [
+                 {
+                     "role": "system",
+                     "content": "You are a helpful GUI agent assistant."
+                 },
+                 {
+                     "role": "user",
+                     "content": [
+                         {"type": "text", "text": click_prompt},
+                         {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_b64}"}}
+                     ]
+                 }
+             ]
+
+             # Prepare API call kwargs
+             api_kwargs = {
+                 "model": model,
+                 "messages": litellm_messages,
+                 "max_tokens": 100,
+                 "temperature": 0.001,
+                 "extra_body": {
+                     "skip_special_tokens": False,
+                 }
+             }
+
+             # Call liteLLM
+             response = await litellm.acompletion(**api_kwargs)
+
+             # Extract response content
+             response_content = response.choices[0].message.content.strip()
+
+             # Parse response for click coordinates
+             # Look for coordinates in the response, handling special tokens
+             # (allow optional whitespace after the comma, as elsewhere)
+             coord_pattern = r"<\|begin_of_box\|>.*?left_click\(start_box='?\[(\d+),\s*(\d+)\]'?\).*?<\|end_of_box\|>"
+             match = re.search(coord_pattern, response_content)
+
+             if not match:
+                 # Fallback: look for coordinates without special tokens
+                 coord_pattern = r"left_click\(start_box='?\[(\d+),\s*(\d+)\]'?\)"
+                 match = re.search(coord_pattern, response_content)
+
+             if match:
+                 x, y = int(match.group(1)), int(match.group(2))
+
+                 # Get actual image dimensions for scaling
+                 try:
+                     image_data = base64.b64decode(image_b64)
+                     image = Image.open(BytesIO(image_data))
+                     image_width, image_height = image.size
+                 except Exception:
+                     # Use default dimensions
+                     image_width, image_height = 1920, 1080
+
+                 # Convert from 0-999 normalized coordinates to actual pixel coordinates
+                 actual_x = int((x / 999.0) * image_width)
+                 actual_y = int((y / 999.0) * image_height)
+
+                 return (actual_x, actual_y)
+
+             return None
+
+         except Exception as e:
+             # Log error and return None
+             print(f"Error in predict_click: {e}")
+             return None
+
894
+
895
+ def get_capabilities(self) -> List[AgentCapability]:
896
+ """
897
+ Get list of capabilities supported by this agent config.
898
+
899
+ Returns:
900
+ List of capability strings
901
+ """
902
+ return ["step", "click"]