cua-agent 0.4.34__py3-none-any.whl → 0.4.36__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic. Click here for more details.

Files changed (61)
  1. agent/__init__.py +4 -10
  2. agent/__main__.py +2 -1
  3. agent/adapters/huggingfacelocal_adapter.py +54 -61
  4. agent/adapters/human_adapter.py +116 -114
  5. agent/adapters/mlxvlm_adapter.py +110 -99
  6. agent/adapters/models/__init__.py +14 -6
  7. agent/adapters/models/generic.py +7 -4
  8. agent/adapters/models/internvl.py +66 -30
  9. agent/adapters/models/opencua.py +23 -8
  10. agent/adapters/models/qwen2_5_vl.py +7 -4
  11. agent/agent.py +184 -158
  12. agent/callbacks/__init__.py +4 -4
  13. agent/callbacks/base.py +45 -31
  14. agent/callbacks/budget_manager.py +22 -10
  15. agent/callbacks/image_retention.py +18 -13
  16. agent/callbacks/logging.py +55 -42
  17. agent/callbacks/operator_validator.py +3 -1
  18. agent/callbacks/pii_anonymization.py +19 -16
  19. agent/callbacks/telemetry.py +67 -61
  20. agent/callbacks/trajectory_saver.py +90 -70
  21. agent/cli.py +115 -110
  22. agent/computers/__init__.py +13 -8
  23. agent/computers/base.py +32 -19
  24. agent/computers/cua.py +33 -25
  25. agent/computers/custom.py +78 -71
  26. agent/decorators.py +23 -14
  27. agent/human_tool/__init__.py +2 -7
  28. agent/human_tool/__main__.py +6 -2
  29. agent/human_tool/server.py +48 -37
  30. agent/human_tool/ui.py +235 -185
  31. agent/integrations/hud/__init__.py +15 -21
  32. agent/integrations/hud/agent.py +101 -83
  33. agent/integrations/hud/proxy.py +90 -57
  34. agent/loops/__init__.py +25 -21
  35. agent/loops/anthropic.py +537 -483
  36. agent/loops/base.py +13 -14
  37. agent/loops/composed_grounded.py +135 -149
  38. agent/loops/gemini.py +31 -12
  39. agent/loops/glm45v.py +135 -133
  40. agent/loops/gta1.py +47 -50
  41. agent/loops/holo.py +4 -2
  42. agent/loops/internvl.py +6 -11
  43. agent/loops/moondream3.py +36 -12
  44. agent/loops/omniparser.py +215 -210
  45. agent/loops/openai.py +49 -50
  46. agent/loops/opencua.py +29 -41
  47. agent/loops/qwen.py +510 -0
  48. agent/loops/uitars.py +237 -202
  49. agent/proxy/examples.py +54 -50
  50. agent/proxy/handlers.py +27 -34
  51. agent/responses.py +330 -330
  52. agent/types.py +11 -5
  53. agent/ui/__init__.py +1 -1
  54. agent/ui/__main__.py +1 -1
  55. agent/ui/gradio/app.py +23 -18
  56. agent/ui/gradio/ui_components.py +310 -161
  57. {cua_agent-0.4.34.dist-info → cua_agent-0.4.36.dist-info}/METADATA +18 -10
  58. cua_agent-0.4.36.dist-info/RECORD +64 -0
  59. cua_agent-0.4.34.dist-info/RECORD +0 -63
  60. {cua_agent-0.4.34.dist-info → cua_agent-0.4.36.dist-info}/WHEEL +0 -0
  61. {cua_agent-0.4.34.dist-info → cua_agent-0.4.36.dist-info}/entry_points.txt +0 -0
agent/loops/glm45v.py CHANGED
@@ -4,33 +4,36 @@ Supports vision-language models for computer control with bounding box parsing.
4
4
  """
5
5
 
6
6
  import asyncio
7
- import json
8
7
  import base64
8
+ import json
9
9
  import re
10
- from typing import Dict, List, Any, Optional, Tuple
11
10
  from io import BytesIO
12
- from PIL import Image
11
+ from typing import Any, Dict, List, Optional, Tuple
12
+
13
13
  import litellm
14
+ from litellm.responses.litellm_completion_transformation.transformation import (
15
+ LiteLLMCompletionResponsesConfig,
16
+ )
14
17
  from litellm.types.utils import ModelResponse
15
- from litellm.responses.litellm_completion_transformation.transformation import LiteLLMCompletionResponsesConfig
18
+ from PIL import Image
16
19
 
17
20
  from ..decorators import register_agent
18
- from ..types import Messages, AgentResponse, Tools, AgentCapability
19
21
  from ..loops.base import AsyncAgentConfig
20
22
  from ..responses import (
21
- convert_responses_items_to_completion_messages,
22
23
  convert_completion_messages_to_responses_items,
23
- make_reasoning_item,
24
- make_output_text_item,
24
+ convert_responses_items_to_completion_messages,
25
25
  make_click_item,
26
26
  make_double_click_item,
27
27
  make_drag_item,
28
+ make_input_image_item,
28
29
  make_keypress_item,
30
+ make_output_text_item,
31
+ make_reasoning_item,
29
32
  make_scroll_item,
30
33
  make_type_item,
31
34
  make_wait_item,
32
- make_input_image_item
33
35
  )
36
+ from ..types import AgentCapability, AgentResponse, Messages, Tools
34
37
 
35
38
  # GLM-4.5V specific constants
36
39
  GLM_ACTION_SPACE = """
@@ -251,16 +254,18 @@ Call rule: `FAIL()`
251
254
  }
252
255
  }"""
253
256
 
257
+
254
258
  def encode_image_to_base64(image_path: str) -> str:
255
259
  """Encode image file to base64 string with data URI."""
256
260
  with open(image_path, "rb") as image_file:
257
261
  encoded_string = base64.b64encode(image_file.read()).decode("utf-8")
258
262
  return f"data:image/png;base64,{encoded_string}"
259
263
 
264
+
260
265
  def parse_glm_response(response: str) -> Dict[str, Any]:
261
266
  """
262
267
  Parse GLM-4.5V response to extract action and memory.
263
-
268
+
264
269
  The special tokens <|begin_of_box|> and <|end_of_box|> mark bounding boxes.
265
270
  Coordinates are normalized values between 0 and 1000.
266
271
  """
@@ -274,26 +279,23 @@ def parse_glm_response(response: str) -> Dict[str, Any]:
274
279
  action_pattern = r"[\w_]+\([^)]*\)"
275
280
  matches = re.findall(action_pattern, response)
276
281
  action = matches[0] if matches else None
277
-
282
+
278
283
  # Extract memory section
279
284
  memory_pattern = r"Memory:(.*?)$"
280
285
  memory_match = re.search(memory_pattern, response, re.DOTALL)
281
286
  memory = memory_match.group(1).strip() if memory_match else "[]"
282
-
287
+
283
288
  # Extract action text (everything before Memory:)
284
- action_text_pattern = r'^(.*?)Memory:'
289
+ action_text_pattern = r"^(.*?)Memory:"
285
290
  action_text_match = re.search(action_text_pattern, response, re.DOTALL)
286
291
  action_text = action_text_match.group(1).strip() if action_text_match else response
287
-
292
+
288
293
  # Clean up action text by removing special tokens
289
294
  if action_text:
290
295
  action_text = action_text.replace("<|begin_of_box|>", "").replace("<|end_of_box|>", "")
291
-
292
- return {
293
- "action": action,
294
- "action_text": action_text,
295
- "memory": memory
296
- }
296
+
297
+ return {"action": action, "action_text": action_text, "memory": memory}
298
+
297
299
 
298
300
  def get_last_image_from_messages(messages: Messages) -> Optional[str]:
299
301
  """Extract the last image from messages for processing."""
@@ -314,23 +316,28 @@ def get_last_image_from_messages(messages: Messages) -> Optional[str]:
314
316
  image_url_obj = item.get("image_url", {})
315
317
  if isinstance(image_url_obj, dict):
316
318
  image_url = image_url_obj.get("url", "")
317
- if isinstance(image_url, str) and image_url.startswith("data:image/"):
319
+ if isinstance(image_url, str) and image_url.startswith(
320
+ "data:image/"
321
+ ):
318
322
  return image_url.split(",", 1)[1]
319
323
  return None
320
324
 
321
- def convert_responses_items_to_glm45v_pc_prompt(messages: Messages, task: str, memory: str = "") -> List[Dict[str, Any]]:
325
+
326
+ def convert_responses_items_to_glm45v_pc_prompt(
327
+ messages: Messages, task: str, memory: str = ""
328
+ ) -> List[Dict[str, Any]]:
322
329
  """Convert responses items to GLM-4.5V PC prompt format with historical actions.
323
-
330
+
324
331
  Args:
325
332
  messages: List of message items from the conversation
326
333
  task: The task description
327
334
  memory: Current memory state
328
-
335
+
329
336
  Returns:
330
337
  List of content items for the prompt (text and image_url items)
331
338
  """
332
339
  action_space = GLM_ACTION_SPACE
333
-
340
+
334
341
  # Template head
335
342
  head_text = f"""You are a GUI Agent, and your primary task is to respond accurately to user requests or questions. In addition to directly answering the user's queries, you can also use tools or perform GUI operations directly until you fulfill the user's request or provide a correct answer. You should carefully read and understand the images and questions provided by the user, and engage in thinking and reflection when appropriate. The coordinates involved are all represented in thousandths (0-999).
336
343
 
@@ -345,7 +352,7 @@ Ubuntu
345
352
 
346
353
  # Historical Actions and Current Memory
347
354
  History:"""
348
-
355
+
349
356
  # Template tail
350
357
  tail_text = f"""
351
358
  Memory:
@@ -363,18 +370,18 @@ Memory:
363
370
 
364
371
  Current Screenshot:
365
372
  """
366
-
373
+
367
374
  # Build history from messages
368
375
  history = []
369
376
  history_images = []
370
-
377
+
371
378
  # Group messages into steps
372
379
  current_step = []
373
380
  step_num = 0
374
-
381
+
375
382
  for message in messages:
376
383
  msg_type = message.get("type")
377
-
384
+
378
385
  if msg_type == "reasoning":
379
386
  current_step.append(message)
380
387
  elif msg_type == "message" and message.get("role") == "assistant":
@@ -386,7 +393,7 @@ Current Screenshot:
386
393
  # End of step - process it
387
394
  if current_step:
388
395
  step_num += 1
389
-
396
+
390
397
  # Extract bot thought from message content
391
398
  bot_thought = ""
392
399
  for item in current_step:
@@ -397,14 +404,14 @@ Current Screenshot:
397
404
  bot_thought = content_item.get("text", "")
398
405
  break
399
406
  break
400
-
407
+
401
408
  # Extract action from computer_call
402
409
  action_text = ""
403
410
  for item in current_step:
404
411
  if item.get("type") == "computer_call":
405
412
  action = item.get("action", {})
406
413
  action_type = action.get("type", "")
407
-
414
+
408
415
  if action_type == "click":
409
416
  x, y = action.get("x", 0), action.get("y", 0)
410
417
  # Convert to 0-999 range (assuming screen dimensions)
@@ -436,7 +443,7 @@ Current Screenshot:
436
443
  elif action_type == "wait":
437
444
  action_text = "WAIT()"
438
445
  break
439
-
446
+
440
447
  # Extract screenshot from computer_call_output
441
448
  screenshot_url = None
442
449
  for item in current_step:
@@ -445,34 +452,34 @@ Current Screenshot:
445
452
  if output.get("type") == "input_image":
446
453
  screenshot_url = output.get("image_url", "")
447
454
  break
448
-
455
+
449
456
  # Store step info
450
457
  step_info = {
451
458
  "step_num": step_num,
452
459
  "bot_thought": bot_thought,
453
460
  "action_text": action_text,
454
- "screenshot_url": screenshot_url
461
+ "screenshot_url": screenshot_url,
455
462
  }
456
463
  history.append(step_info)
457
-
464
+
458
465
  # Store screenshot for last 4 steps
459
466
  if screenshot_url:
460
467
  history_images.append(screenshot_url)
461
-
468
+
462
469
  current_step = []
463
-
470
+
464
471
  # Build content array with head, history, and tail
465
472
  content = []
466
473
  current_text = head_text
467
-
474
+
468
475
  total_history_steps = len(history)
469
476
  history_image_count = min(4, len(history_images)) # Last 4 images
470
-
477
+
471
478
  for step_idx, step_info in enumerate(history):
472
479
  step_num = step_info["step_num"]
473
480
  bot_thought = step_info["bot_thought"]
474
481
  action_text = step_info["action_text"]
475
-
482
+
476
483
  if step_idx < total_history_steps - history_image_count:
477
484
  # For steps beyond the last 4, use text placeholder
478
485
  current_text += f"\nstep {step_num}: Screenshot:(Omitted in context.) Thought: {bot_thought}\nAction: {action_text}"
@@ -480,20 +487,21 @@ Current Screenshot:
480
487
  # For the last 4 steps, insert images
481
488
  current_text += f"\nstep {step_num}: Screenshot:"
482
489
  content.append({"type": "text", "text": current_text})
483
-
490
+
484
491
  # Add image
485
492
  img_idx = step_idx - (total_history_steps - history_image_count)
486
493
  if img_idx < len(history_images):
487
494
  content.append({"type": "image_url", "image_url": {"url": history_images[img_idx]}})
488
-
495
+
489
496
  current_text = f" Thought: {bot_thought}\nAction: {action_text}"
490
-
497
+
491
498
  # Add tail
492
499
  current_text += tail_text
493
500
  content.append({"type": "text", "text": current_text})
494
-
501
+
495
502
  return content
496
503
 
504
+
497
505
  def model_dump(obj) -> Dict[str, Any]:
498
506
  if isinstance(obj, dict):
499
507
  return {k: model_dump(v) for k, v in obj.items()}
@@ -502,58 +510,61 @@ def model_dump(obj) -> Dict[str, Any]:
502
510
  else:
503
511
  return obj
504
512
 
505
- def convert_glm_completion_to_responses_items(response: ModelResponse, image_width: int, image_height: int) -> List[Dict[str, Any]]:
513
+
514
+ def convert_glm_completion_to_responses_items(
515
+ response: ModelResponse, image_width: int, image_height: int
516
+ ) -> List[Dict[str, Any]]:
506
517
  """
507
518
  Convert GLM-4.5V completion response to responses items format.
508
-
519
+
509
520
  Args:
510
521
  response: LiteLLM ModelResponse from GLM-4.5V
511
522
  image_width: Original image width for coordinate scaling
512
523
  image_height: Original image height for coordinate scaling
513
-
524
+
514
525
  Returns:
515
526
  List of response items in the proper format
516
527
  """
517
528
  import uuid
518
-
529
+
519
530
  response_items = []
520
-
531
+
521
532
  if not response.choices or not response.choices[0].message:
522
533
  return response_items
523
-
534
+
524
535
  message = response.choices[0].message
525
536
  content = message.content or ""
526
- reasoning_content = getattr(message, 'reasoning_content', None)
527
-
537
+ reasoning_content = getattr(message, "reasoning_content", None)
538
+
528
539
  # Add reasoning item if present
529
540
  if reasoning_content:
530
541
  reasoning_item = model_dump(make_reasoning_item(reasoning_content))
531
542
  response_items.append(reasoning_item)
532
-
543
+
533
544
  # Parse the content to extract action and text
534
545
  parsed_response = parse_glm_response(content)
535
546
  action = parsed_response.get("action", "")
536
547
  action_text = parsed_response.get("action_text", "")
537
-
548
+
538
549
  # Add message item with text content (excluding action and memory)
539
550
  if action_text:
540
551
  # Remove action from action_text if it's there
541
552
  clean_text = action_text
542
553
  if action and action in clean_text:
543
554
  clean_text = clean_text.replace(action, "").strip()
544
-
555
+
545
556
  # Remove memory section
546
557
  memory_pattern = r"Memory:\s*\[.*?\]\s*$"
547
558
  clean_text = re.sub(memory_pattern, "", clean_text, flags=re.DOTALL).strip()
548
-
559
+
549
560
  if clean_text:
550
561
  message_item = model_dump(make_output_text_item(clean_text))
551
562
  response_items.append(message_item)
552
-
563
+
553
564
  # Convert action to computer call if present
554
565
  if action:
555
566
  call_id = f"call_{uuid.uuid4().hex[:8]}"
556
-
567
+
557
568
  # Parse different action types and create appropriate computer calls
558
569
  if action.startswith("left_click"):
559
570
  coord_match = re.search(r"start_box='?\[(\d+),\s*(\d+)\]'?", action)
@@ -566,7 +577,7 @@ def convert_glm_completion_to_responses_items(response: ModelResponse, image_wid
566
577
  computer_call["call_id"] = call_id
567
578
  computer_call["status"] = "completed"
568
579
  response_items.append(computer_call)
569
-
580
+
570
581
  elif action.startswith("right_click"):
571
582
  coord_match = re.search(r"start_box='?\[(\d+),\s*(\d+)\]'?", action)
572
583
  if coord_match:
@@ -577,7 +588,7 @@ def convert_glm_completion_to_responses_items(response: ModelResponse, image_wid
577
588
  computer_call["call_id"] = call_id
578
589
  computer_call["status"] = "completed"
579
590
  response_items.append(computer_call)
580
-
591
+
581
592
  elif action.startswith("left_double_click"):
582
593
  coord_match = re.search(r"start_box='?\[(\d+),\s*(\d+)\]'?", action)
583
594
  if coord_match:
@@ -588,7 +599,7 @@ def convert_glm_completion_to_responses_items(response: ModelResponse, image_wid
588
599
  computer_call["call_id"] = call_id
589
600
  computer_call["status"] = "completed"
590
601
  response_items.append(computer_call)
591
-
602
+
592
603
  elif action.startswith("left_drag"):
593
604
  start_match = re.search(r"start_box='?\[(\d+),\s*(\d+)\]'?", action)
594
605
  end_match = re.search(r"end_box='?\[(\d+),\s*(\d+)\]'?", action)
@@ -605,18 +616,18 @@ def convert_glm_completion_to_responses_items(response: ModelResponse, image_wid
605
616
  computer_call["call_id"] = call_id
606
617
  computer_call["status"] = "completed"
607
618
  response_items.append(computer_call)
608
-
619
+
609
620
  elif action.startswith("key"):
610
621
  key_match = re.search(r"keys='([^']+)'", action)
611
622
  if key_match:
612
623
  keys = key_match.group(1)
613
624
  # Split keys by '+' for key combinations, or use as single key
614
- key_list = keys.split('+') if '+' in keys else [keys]
625
+ key_list = keys.split("+") if "+" in keys else [keys]
615
626
  computer_call = model_dump(make_keypress_item(key_list))
616
627
  computer_call["call_id"] = call_id
617
628
  computer_call["status"] = "completed"
618
629
  response_items.append(computer_call)
619
-
630
+
620
631
  elif action.startswith("type"):
621
632
  content_match = re.search(r"content='([^']*)'", action)
622
633
  if content_match:
@@ -625,7 +636,7 @@ def convert_glm_completion_to_responses_items(response: ModelResponse, image_wid
625
636
  computer_call["call_id"] = call_id
626
637
  computer_call["status"] = "completed"
627
638
  response_items.append(computer_call)
628
-
639
+
629
640
  elif action.startswith("scroll"):
630
641
  coord_match = re.search(r"start_box='?\[(\d+),\s*(\d+)\]'?", action)
631
642
  direction_match = re.search(r"direction='([^']+)'", action)
@@ -648,15 +659,16 @@ def convert_glm_completion_to_responses_items(response: ModelResponse, image_wid
648
659
  computer_call["call_id"] = call_id
649
660
  computer_call["status"] = "completed"
650
661
  response_items.append(computer_call)
651
-
662
+
652
663
  elif action == "WAIT()":
653
664
  computer_call = model_dump(make_wait_item())
654
665
  computer_call["call_id"] = call_id
655
666
  computer_call["status"] = "completed"
656
667
  response_items.append(computer_call)
657
-
668
+
658
669
  return response_items
659
670
 
671
+
660
672
  @register_agent(models=r"(?i).*GLM-4\.5V.*")
661
673
  class Glm4vConfig(AsyncAgentConfig):
662
674
  """GLM-4.5V agent configuration using liteLLM."""
@@ -674,11 +686,11 @@ class Glm4vConfig(AsyncAgentConfig):
674
686
  _on_api_end=None,
675
687
  _on_usage=None,
676
688
  _on_screenshot=None,
677
- **kwargs
689
+ **kwargs,
678
690
  ) -> Dict[str, Any]:
679
691
  """
680
692
  Predict the next step using GLM-4.5V model.
681
-
693
+
682
694
  Args:
683
695
  messages: Input messages following Responses format
684
696
  model: Model name to use
@@ -691,7 +703,7 @@ class Glm4vConfig(AsyncAgentConfig):
691
703
  _on_api_end: Callback for API end
692
704
  _on_usage: Callback for usage tracking
693
705
  _on_screenshot: Callback for screenshot events
694
-
706
+
695
707
  Returns:
696
708
  Dict with "output" and "usage" keys
697
709
  """
@@ -708,7 +720,7 @@ class Glm4vConfig(AsyncAgentConfig):
708
720
  user_instruction = item.get("text", "")
709
721
  break
710
722
  break
711
-
723
+
712
724
  # Get the last image for processing
713
725
  last_image_b64 = get_last_image_from_messages(messages)
714
726
  if not last_image_b64 and computer_handler:
@@ -718,35 +730,28 @@ class Glm4vConfig(AsyncAgentConfig):
718
730
  last_image_b64 = screenshot_b64
719
731
  if _on_screenshot:
720
732
  await _on_screenshot(screenshot_b64)
721
-
733
+
722
734
  if not last_image_b64:
723
735
  raise ValueError("No image available for GLM-4.5V processing")
724
-
736
+
725
737
  # Convert responses items to GLM-4.5V PC prompt format with historical actions
726
738
  prompt_content = convert_responses_items_to_glm45v_pc_prompt(
727
739
  messages=messages,
728
740
  task=user_instruction,
729
- memory="[]" # Initialize with empty memory for now
741
+ memory="[]", # Initialize with empty memory for now
730
742
  )
731
-
743
+
732
744
  # Add the current screenshot to the end
733
- prompt_content.append({
734
- "type": "image_url",
735
- "image_url": {"url": f"data:image/png;base64,{last_image_b64}"}
736
- })
737
-
745
+ prompt_content.append(
746
+ {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{last_image_b64}"}}
747
+ )
748
+
738
749
  # Prepare messages for liteLLM
739
750
  litellm_messages = [
740
- {
741
- "role": "system",
742
- "content": "You are a helpful GUI agent assistant."
743
- },
744
- {
745
- "role": "user",
746
- "content": prompt_content
747
- }
751
+ {"role": "system", "content": "You are a helpful GUI agent assistant."},
752
+ {"role": "user", "content": prompt_content},
748
753
  ]
749
-
754
+
750
755
  # Prepare API call kwargs
751
756
  api_kwargs = {
752
757
  "model": model,
@@ -757,20 +762,20 @@ class Glm4vConfig(AsyncAgentConfig):
757
762
  # "skip_special_tokens": False,
758
763
  # }
759
764
  }
760
-
765
+
761
766
  # Add API callbacks
762
767
  if _on_api_start:
763
768
  await _on_api_start(api_kwargs)
764
-
769
+
765
770
  # Call liteLLM
766
771
  response = await litellm.acompletion(**api_kwargs)
767
-
772
+
768
773
  if _on_api_end:
769
774
  await _on_api_end(api_kwargs, response)
770
-
775
+
771
776
  # Get image dimensions for coordinate scaling
772
777
  image_width, image_height = 1920, 1080 # Default dimensions
773
-
778
+
774
779
  # Try to get actual dimensions from the image
775
780
  try:
776
781
  image_data = base64.b64decode(last_image_b64)
@@ -778,41 +783,38 @@ class Glm4vConfig(AsyncAgentConfig):
778
783
  image_width, image_height = image.size
779
784
  except Exception:
780
785
  pass # Use default dimensions
781
-
786
+
782
787
  # Convert GLM completion response to responses items
783
- response_items = convert_glm_completion_to_responses_items(response, image_width, image_height)
784
-
788
+ response_items = convert_glm_completion_to_responses_items(
789
+ response, image_width, image_height
790
+ )
791
+
785
792
  # Extract usage information
786
793
  response_usage = {
787
- **LiteLLMCompletionResponsesConfig._transform_chat_completion_usage_to_responses_usage(response.usage).model_dump(),
794
+ **LiteLLMCompletionResponsesConfig._transform_chat_completion_usage_to_responses_usage(
795
+ response.usage
796
+ ).model_dump(),
788
797
  "response_cost": response._hidden_params.get("response_cost", 0.0),
789
798
  }
790
799
  if _on_usage:
791
800
  await _on_usage(response_usage)
792
-
801
+
793
802
  # Create agent response
794
- agent_response = {
795
- "output": response_items,
796
- "usage": response_usage
797
- }
798
-
803
+ agent_response = {"output": response_items, "usage": response_usage}
804
+
799
805
  return agent_response
800
806
 
801
807
  async def predict_click(
802
- self,
803
- model: str,
804
- image_b64: str,
805
- instruction: str,
806
- **kwargs
808
+ self, model: str, image_b64: str, instruction: str, **kwargs
807
809
  ) -> Optional[Tuple[int, int]]:
808
810
  """
809
811
  Predict click coordinates using GLM-4.5V model.
810
-
812
+
811
813
  Args:
812
814
  model: Model name to use
813
815
  image_b64: Base64 encoded image
814
816
  instruction: Instruction for where to click
815
-
817
+
816
818
  Returns:
817
819
  Tuple with (x, y) coordinates or None
818
820
  """
@@ -824,22 +826,22 @@ Respond with a single click action in this format:
824
826
  left_click(start_box='[x,y]')
825
827
 
826
828
  Where x,y are coordinates normalized to 0-999 range."""
827
-
829
+
828
830
  # Prepare messages for liteLLM
829
831
  litellm_messages = [
830
- {
831
- "role": "system",
832
- "content": "You are a helpful GUI agent assistant."
833
- },
832
+ {"role": "system", "content": "You are a helpful GUI agent assistant."},
834
833
  {
835
834
  "role": "user",
836
835
  "content": [
837
836
  {"type": "text", "text": click_prompt},
838
- {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_b64}"}}
839
- ]
840
- }
837
+ {
838
+ "type": "image_url",
839
+ "image_url": {"url": f"data:image/png;base64,{image_b64}"},
840
+ },
841
+ ],
842
+ },
841
843
  ]
842
-
844
+
843
845
  # Prepare API call kwargs
844
846
  api_kwargs = {
845
847
  "model": model,
@@ -848,21 +850,21 @@ Where x,y are coordinates normalized to 0-999 range."""
848
850
  "temperature": 0.001,
849
851
  "extra_body": {
850
852
  "skip_special_tokens": False,
851
- }
853
+ },
852
854
  }
853
-
855
+
854
856
  # Call liteLLM
855
857
  response = await litellm.acompletion(**api_kwargs)
856
-
858
+
857
859
  # Extract response content
858
860
  response_content = response.choices[0].message.content.strip()
859
861
  print(response)
860
-
862
+
861
863
  # Parse response for click coordinates
862
864
  # Look for coordinates in the response, handling special tokens
863
865
  coord_pattern = r"<\|begin_of_box\|>.*?left_click\(start_box='?\[(\d+),(\d+)\]'?\).*?<\|end_of_box\|>"
864
866
  match = re.search(coord_pattern, response_content)
865
-
867
+
866
868
  if not match:
867
869
  # Fallback: look for coordinates without special tokens
868
870
  coord_pattern = r"left_click\(start_box='?\[(\d+),(\d+)\]'?\)"
@@ -870,7 +872,7 @@ Where x,y are coordinates normalized to 0-999 range."""
870
872
 
871
873
  if match:
872
874
  x, y = int(match.group(1)), int(match.group(2))
873
-
875
+
874
876
  # Get actual image dimensions for scaling
875
877
  try:
876
878
  image_data = base64.b64decode(image_b64)
@@ -879,15 +881,15 @@ Where x,y are coordinates normalized to 0-999 range."""
879
881
  except Exception:
880
882
  # Use default dimensions
881
883
  image_width, image_height = 1920, 1080
882
-
884
+
883
885
  # Convert from 0-999 normalized coordinates to actual pixel coordinates
884
886
  actual_x = int((x / 999.0) * image_width)
885
887
  actual_y = int((y / 999.0) * image_height)
886
-
888
+
887
889
  return (actual_x, actual_y)
888
-
890
+
889
891
  return None
890
-
892
+
891
893
  except Exception as e:
892
894
  # Log error and return None
893
895
  print(f"Error in predict_click: {e}")
@@ -896,7 +898,7 @@ Where x,y are coordinates normalized to 0-999 range."""
896
898
  def get_capabilities(self) -> List[AgentCapability]:
897
899
  """
898
900
  Get list of capabilities supported by this agent config.
899
-
901
+
900
902
  Returns:
901
903
  List of capability strings
902
904
  """