lybic-guiagents 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of lybic-guiagents might be problematic. Click here for more details.

Files changed (85):
  1. desktop_env/__init__.py +1 -0
  2. desktop_env/actions.py +203 -0
  3. desktop_env/controllers/__init__.py +0 -0
  4. desktop_env/controllers/python.py +471 -0
  5. desktop_env/controllers/setup.py +882 -0
  6. desktop_env/desktop_env.py +509 -0
  7. desktop_env/evaluators/__init__.py +5 -0
  8. desktop_env/evaluators/getters/__init__.py +41 -0
  9. desktop_env/evaluators/getters/calc.py +15 -0
  10. desktop_env/evaluators/getters/chrome.py +1774 -0
  11. desktop_env/evaluators/getters/file.py +154 -0
  12. desktop_env/evaluators/getters/general.py +42 -0
  13. desktop_env/evaluators/getters/gimp.py +38 -0
  14. desktop_env/evaluators/getters/impress.py +126 -0
  15. desktop_env/evaluators/getters/info.py +24 -0
  16. desktop_env/evaluators/getters/misc.py +406 -0
  17. desktop_env/evaluators/getters/replay.py +20 -0
  18. desktop_env/evaluators/getters/vlc.py +86 -0
  19. desktop_env/evaluators/getters/vscode.py +35 -0
  20. desktop_env/evaluators/metrics/__init__.py +160 -0
  21. desktop_env/evaluators/metrics/basic_os.py +68 -0
  22. desktop_env/evaluators/metrics/chrome.py +493 -0
  23. desktop_env/evaluators/metrics/docs.py +1011 -0
  24. desktop_env/evaluators/metrics/general.py +665 -0
  25. desktop_env/evaluators/metrics/gimp.py +637 -0
  26. desktop_env/evaluators/metrics/libreoffice.py +28 -0
  27. desktop_env/evaluators/metrics/others.py +92 -0
  28. desktop_env/evaluators/metrics/pdf.py +31 -0
  29. desktop_env/evaluators/metrics/slides.py +957 -0
  30. desktop_env/evaluators/metrics/table.py +585 -0
  31. desktop_env/evaluators/metrics/thunderbird.py +176 -0
  32. desktop_env/evaluators/metrics/utils.py +719 -0
  33. desktop_env/evaluators/metrics/vlc.py +524 -0
  34. desktop_env/evaluators/metrics/vscode.py +283 -0
  35. desktop_env/providers/__init__.py +35 -0
  36. desktop_env/providers/aws/__init__.py +0 -0
  37. desktop_env/providers/aws/manager.py +278 -0
  38. desktop_env/providers/aws/provider.py +186 -0
  39. desktop_env/providers/aws/provider_with_proxy.py +315 -0
  40. desktop_env/providers/aws/proxy_pool.py +193 -0
  41. desktop_env/providers/azure/__init__.py +0 -0
  42. desktop_env/providers/azure/manager.py +87 -0
  43. desktop_env/providers/azure/provider.py +207 -0
  44. desktop_env/providers/base.py +97 -0
  45. desktop_env/providers/gcp/__init__.py +0 -0
  46. desktop_env/providers/gcp/manager.py +0 -0
  47. desktop_env/providers/gcp/provider.py +0 -0
  48. desktop_env/providers/virtualbox/__init__.py +0 -0
  49. desktop_env/providers/virtualbox/manager.py +463 -0
  50. desktop_env/providers/virtualbox/provider.py +124 -0
  51. desktop_env/providers/vmware/__init__.py +0 -0
  52. desktop_env/providers/vmware/manager.py +455 -0
  53. desktop_env/providers/vmware/provider.py +105 -0
  54. gui_agents/__init__.py +0 -0
  55. gui_agents/agents/Action.py +209 -0
  56. gui_agents/agents/__init__.py +0 -0
  57. gui_agents/agents/agent_s.py +832 -0
  58. gui_agents/agents/global_state.py +610 -0
  59. gui_agents/agents/grounding.py +651 -0
  60. gui_agents/agents/hardware_interface.py +129 -0
  61. gui_agents/agents/manager.py +568 -0
  62. gui_agents/agents/translator.py +132 -0
  63. gui_agents/agents/worker.py +355 -0
  64. gui_agents/cli_app.py +560 -0
  65. gui_agents/core/__init__.py +0 -0
  66. gui_agents/core/engine.py +1496 -0
  67. gui_agents/core/knowledge.py +449 -0
  68. gui_agents/core/mllm.py +555 -0
  69. gui_agents/tools/__init__.py +0 -0
  70. gui_agents/tools/tools.py +727 -0
  71. gui_agents/unit_test/__init__.py +0 -0
  72. gui_agents/unit_test/run_tests.py +65 -0
  73. gui_agents/unit_test/test_manager.py +330 -0
  74. gui_agents/unit_test/test_worker.py +269 -0
  75. gui_agents/utils/__init__.py +0 -0
  76. gui_agents/utils/analyze_display.py +301 -0
  77. gui_agents/utils/common_utils.py +263 -0
  78. gui_agents/utils/display_viewer.py +281 -0
  79. gui_agents/utils/embedding_manager.py +53 -0
  80. gui_agents/utils/image_axis_utils.py +27 -0
  81. lybic_guiagents-0.1.0.dist-info/METADATA +416 -0
  82. lybic_guiagents-0.1.0.dist-info/RECORD +85 -0
  83. lybic_guiagents-0.1.0.dist-info/WHEEL +5 -0
  84. lybic_guiagents-0.1.0.dist-info/licenses/LICENSE +201 -0
  85. lybic_guiagents-0.1.0.dist-info/top_level.txt +2 -0
@@ -0,0 +1,651 @@
1
+ import ast
2
+ import re
3
+ import logging
4
+ from collections import defaultdict
5
+ from io import BytesIO
6
+ from typing import Any, Dict, List, Optional, Tuple, Union
7
+ import time
8
+ import pytesseract
9
+ from PIL import Image
10
+ from pytesseract import Output
11
+
12
+ from gui_agents.tools.tools import Tools
13
+ from gui_agents.utils.common_utils import parse_single_code_from_string
14
+ from gui_agents.store.registry import Registry
15
+ from gui_agents.agents.global_state import GlobalState
16
+
17
+ logger = logging.getLogger("desktopenv.agent")
18
+
19
+
20
class ACI:
    """Base class for agent-computer interfaces.

    The only state it owns is ``notes``, a list of strings that starts empty.
    """

    def __init__(self) -> None:
        """Create the instance with an empty ``notes`` list."""
        # A fresh list per instance, so notes are never shared.
        self.notes: List[str] = list()
24
+
25
+
26
def agent_action(func):
    """Mark ``func`` as an agent action and hand it back unchanged.

    The decorator does not wrap the callable; it only sets the attribute
    ``is_agent_action`` to ``True`` on it.

    Args:
        func: The callable to tag.

    Returns:
        The same ``func`` object that was passed in.
    """
    setattr(func, "is_agent_action", True)
    return func
29
+
30
+
31
class Grounding(ACI):
    """Agent-computer interface that grounds element descriptions to points.

    Coordinate-based actions (``click``, ``doubleclick``, ``move``,
    ``scroll``, ``drag``) read the points cached in ``coords1``/``coords2``
    by :meth:`assign_coordinates`, rescale them from the grounding
    resolution to the screen resolution, and return a plain ``dict``
    describing the action. Most actions also append a "passive" entry to the
    global state's agent log.
    """

    def __init__(
        self,
        Tools_dict: Dict,
        platform: str,
        width: int = 1920,
        height: int = 1080,
    ):
        """Register the grounding and text-span tools and fetch global state.

        Args:
            Tools_dict: Tool configuration. The ``"grounding"`` and
                ``"text_span"`` entries must each provide ``"provider"`` and
                ``"model"`` keys.
            platform: Platform identifier; stored as-is.
            width: Screen width used when rescaling coordinates.
            height: Screen height used when rescaling coordinates.
        """
        # Run the base initializer so ``self.notes`` exists on instances.
        super().__init__()
        self.platform = platform
        self.Tools_dict = Tools_dict
        self.width = width
        self.height = height
        # Points produced by the most recent assign_coordinates() call.
        self.coords1 = None
        self.coords2 = None

        self.grounding_model = Tools()
        self.grounding_model.register_tool(
            "grounding", self.Tools_dict["grounding"]["provider"],
            self.Tools_dict["grounding"]["model"])

        self.grounding_width, self.grounding_height = self.grounding_model.tools[
            "grounding"].get_grounding_wh()
        # When the tool reports no grounding resolution, fall back to the
        # screen size so that resize_coordinates() becomes an identity scale.
        if self.grounding_width is None or self.grounding_height is None:
            self.grounding_width = self.width
            self.grounding_height = self.height

        self.text_span_agent = Tools()
        self.text_span_agent.register_tool(
            "text_span", self.Tools_dict["text_span"]["provider"],
            self.Tools_dict["text_span"]["model"])

        self.global_state: GlobalState = Registry.get(
            "GlobalStateStore")  # type: ignore

    @staticmethod
    def _copy_hold_keys(hold_keys):
        """Return a shallow copy of a list argument; pass anything else through.

        Copying keeps the shared ``holdKey=[]`` default (and any caller-owned
        list) from being aliased into the returned action dict.
        """
        return list(hold_keys) if isinstance(hold_keys, list) else hold_keys

    def generate_coords(self, ref_expr: str, obs: Dict) -> List[int]:
        """Ask the grounding tool for the point matching ``ref_expr``.

        The tool's ``llm_agent`` is reset before the call. Token usage, cost,
        duration and the raw response are logged and also recorded through
        ``global_state.log_operation``.

        Args:
            ref_expr: Description of the target, interpolated into the prompt.
            obs: Observation; ``obs["screenshot"]`` is sent as the image input.

        Returns:
            ``[x, y]`` built from the first two digit runs found in the
            response. The values are not rescaled here.

        Raises:
            AssertionError: If the response has fewer than two digit runs.
        """
        grounding_start_time = time.time()
        self.grounding_model.tools["grounding"].llm_agent.reset()
        prompt = (
            f"Task: Visual Grounding - Locate and return coordinates\n Query:{ref_expr}\n Instructions: 1. Carefully analyze the provided screenshot image \n 2. Locate the EXACT element/area described in the query \n 3. Return ONLY the pixel coordinates [x, y] of one representative point within the target area \n 4. Choose a point that is clearly inside the described element/region \n 5. Coordinates must be integers representing pixel positions on the image \n 6. If the described element has multiple instances, select the most prominent or central one 7. - If this appears to be for dragging (selecting text, moving items, etc.): * For START points: Position slightly to the LEFT of text/content in empty space * For END points: Position slightly to the RIGHT of text/content in empty space * Avoid placing coordinates directly ON text characters to prevent text selection issues * Keep offset minimal (3-5 pixels) - don't go too far from the target area * Still return only ONE coordinate as requested \n Output Format: Return only two integers separated by comma, like: (900, 400)\n Important Notes: - Focus on the main descriptive elements in the query (colors, positions, objects) - Ignore any additional context that doesn't help locate the target - The returned point should be clickable/actionable within the target area \n CRITICAL REQUIREMENTS: - MUST return exactly ONE coordinate pair under ALL circumstances - NO explanations, NO multiple coordinates, NO additional text \n"
        )
        response, total_tokens, cost_string = self.grounding_model.execute_tool(
            "grounding", {
                "str_input": prompt,
                "img_input": obs["screenshot"]
            })
        logger.info(
            f"Grounding model tokens: {total_tokens}, cost: {cost_string}")
        grounding_end_time = time.time()
        grounding_duration = grounding_end_time - grounding_start_time
        logger.info(
            f"Grounding model execution time: {grounding_duration:.2f} seconds")
        logger.info(f"RAW GROUNDING MODEL RESPONSE: {response}")
        self.global_state.log_operation(module="grounding",
                                        operation="grounding_model_response",
                                        data={
                                            "tokens": total_tokens,
                                            "cost": cost_string,
                                            "content": response,
                                            "duration": grounding_duration
                                        })
        numericals = re.findall(r"\d+", response)
        if len(numericals) < 2:
            # Explicit raise instead of `assert` so the check survives
            # `python -O`; AssertionError is kept so existing handlers match.
            raise AssertionError(
                "Grounding response does not contain two integers: "
                f"{response!r}")
        return [int(numericals[0]), int(numericals[1])]

    def assign_coordinates(self, plan: str, obs: Dict):
        """Pre-compute the coordinates needed by the action found in ``plan``.

        The text after the last ``"Grounded Action"`` marker is parsed into a
        single call. For click/doubleclick/move/scroll the first argument is
        grounded into ``coords1``; for drag the first two arguments are
        grounded into ``coords1`` and ``coords2``. Other actions leave both
        as ``None``.

        Args:
            plan: Planner output containing the grounded action.
            obs: Observation forwarded to :meth:`generate_coords`.

        Raises:
            RuntimeError: If the grounded action cannot be parsed.
        """
        self.coords1, self.coords2 = None, None
        try:
            action = parse_single_code_from_string(
                plan.split("Grounded Action")[-1])
            function_name = re.match(r"(\w+\.\w+)\(",
                                     action).group(1)  # type: ignore
            args = self.parse_function_args(action)
        except Exception as e:
            raise RuntimeError(f"Error in parsing grounded action: {e}") from e

        if (function_name in [
                "agent.click", "agent.doubleclick", "agent.move", "agent.scroll"
        ] and len(args) >= 1 and args[0] is not None):
            self.coords1 = self.generate_coords(args[0], obs)
        elif function_name == "agent.drag" and len(args) >= 2:
            self.coords1 = self.generate_coords(args[0], obs)
            self.coords2 = self.generate_coords(args[1], obs)

    def reset_screen_size(self, width: int, height: int):
        """Replace the screen size used when rescaling coordinates."""
        self.width = width
        self.height = height

    def resize_coordinates(self, coordinates: List[int]) -> List[int]:
        """Scale ``[x, y]`` from grounding resolution to screen resolution.

        Each axis is scaled independently and rounded to an integer.
        """
        return [
            round(coordinates[0] * self.width / self.grounding_width),
            round(coordinates[1] * self.height / self.grounding_height),
        ]

    def resize_coordinates_with_padding(self,
                                        coordinates: List[int]) -> List[int]:
        """Scale ``[x, y]`` by the larger dimensions, then remove the padding.

        Both axes use the same factor (larger screen side over larger
        grounding side). Half of the difference between the larger screen
        side and the width/height is then subtracted from x/y respectively.
        NOTE(review): presumably this undoes centering the screenshot on a
        square canvas before grounding -- confirm against the caller.
        """
        grounding_size = max(self.grounding_width, self.grounding_height)
        original_size = max(self.width, self.height)
        coordinates = [
            round(coordinates[0] * original_size / grounding_size),
            round(coordinates[1] * original_size / grounding_size),
        ]
        padding_left = round((original_size - self.width) / 2)
        padding_top = round((original_size - self.height) / 2)
        return [
            coordinates[0] - padding_left,
            coordinates[1] - padding_top,
        ]

    def parse_function_args(self, function: str) -> List[Any]:
        """Extract the argument values from the source text of a single call.

        A regular expression handles the common shapes ``a.b("text")``,
        ``a.b("text", 3)`` and ``a.b(, 3)`` first. Anything else is parsed
        with :mod:`ast`.

        Args:
            function: Source text of the call, e.g. ``agent.click("OK", 1)``.

        Returns:
            A list of argument values. On the ``ast`` path, keyword arguments
            whose name contains ``"description"`` come first, followed by all
            positional arguments; constants are returned as their values and
            any other expression as its source text. An empty list is
            returned for empty/non-string input or unparsable text.
        """
        if not function or not isinstance(function, str):
            return []
        pattern = r'(\w+\.\w+)\((?:"([^"]*)")?(?:,\s*(\d+))?\)'
        match = re.match(pattern, function)
        if match:
            args = []
            if match.group(2) is not None:
                args.append(match.group(2))
            if match.group(3) is not None:
                args.append(int(match.group(3)))
            if args:
                return args
        try:
            tree = ast.parse(function)
        except Exception:
            # Best effort: text that is not valid Python yields no arguments.
            return []
        if not tree.body or not hasattr(tree.body[0], 'value'):
            return []
        call_node = tree.body[0].value  # type: ignore
        if not isinstance(call_node, ast.Call):
            return []

        def safe_eval(node):
            # ast.Constant covers every literal, so the deprecated ast.Str
            # check is no longer needed.
            if isinstance(node, ast.Constant):
                return node.value
            try:
                return ast.unparse(node)
            except Exception:
                return str(node)

        positional_args = [safe_eval(arg) for arg in call_node.args]
        keyword_args = {
            kw.arg: safe_eval(kw.value) for kw in call_node.keywords
        }
        # ``kw.arg`` is None for ``**mapping``; the truthiness test skips it.
        res = [
            val for key, val in keyword_args.items()
            if key and "description" in key
        ]
        res.extend(positional_args)
        return res

    def _record_passive_memory(self, action_type: str, action_details: str):
        """Append a "passive" agent-log entry describing an executed action."""
        memory_content = f"Hardware action `{action_type}` has been executed. Details: {action_details}"
        self.global_state.add_agent_log({
            "type": "passive",
            "content": memory_content
        })

    @agent_action
    def click(
        self,
        element_description: str,
        button: int = 1,
        holdKey: List[str] = [],
    ):
        """Build a ``Click`` action at the rescaled ``coords1`` point.

        Args:
            element_description: Description of the target, echoed in the
                action and the memory entry.
            button: Button value placed in the action unchanged.
            holdKey: Keys value placed in the action (lists are copied).

        Returns:
            The action dict with keys ``type``, ``x``, ``y``,
            ``element_description``, ``button`` and ``holdKey``.
        """
        x, y = self.resize_coordinates(self.coords1)  # type: ignore
        actionDict = {
            "type": "Click",
            "x": x,
            "y": y,
            "element_description": element_description,
            "button": button,
            "holdKey": self._copy_hold_keys(holdKey)
        }
        action_details = f"Clicked at coordinates ({x}, {y}) with button {button}, element: {element_description}"
        self._record_passive_memory("Click", action_details)
        return actionDict

    @agent_action
    def doubleclick(
        self,
        element_description: str,
        button: int = 1,
        holdKey: List[str] = [],
    ):
        """Build a ``DoubleClick`` action at the rescaled ``coords1`` point.

        Parameters and returned keys mirror :meth:`click`.
        """
        x, y = self.resize_coordinates(self.coords1)  # type: ignore
        actionDict = {
            "type": "DoubleClick",
            "x": x,
            "y": y,
            "element_description": element_description,
            "button": button,
            "holdKey": self._copy_hold_keys(holdKey)
        }
        action_details = f"Double clicked at coordinates ({x}, {y}) with button {button}, element: {element_description}"
        self._record_passive_memory("DoubleClick", action_details)
        return actionDict

    @agent_action
    def move(
        self,
        element_description: str,
        holdKey: List[str] = [],
    ):
        """Build a ``Move`` action to the rescaled ``coords1`` point.

        Returns:
            The action dict with keys ``type``, ``x``, ``y``,
            ``element_description`` and ``holdKey``.
        """
        x, y = self.resize_coordinates(self.coords1)  # type: ignore
        actionDict = {
            "type": "Move",
            "x": x,
            "y": y,
            "element_description": element_description,
            "holdKey": self._copy_hold_keys(holdKey)
        }
        action_details = f"Moved to coordinates ({x}, {y}), element: {element_description}"
        self._record_passive_memory("Move", action_details)
        return actionDict

    @agent_action
    def scroll(
        self,
        element_description: str,
        clicks: int,
        vertical: bool = True,
        holdKey: List[str] = [],
    ):
        """Build a ``Scroll`` action at the rescaled ``coords1`` point.

        Args:
            element_description: Description of the target.
            clicks: Scroll amount, stored under ``stepVertical`` or
                ``stepHorizontal``.
            vertical: Selects which of the two step keys is used.
            holdKey: Keys value placed in the action (lists are copied).

        Returns:
            The action dict.
        """
        x, y = self.resize_coordinates(self.coords1)  # type: ignore
        step_key = "stepVertical" if vertical else "stepHorizontal"
        direction = "vertically" if vertical else "horizontally"
        actionDict = {
            "type": "Scroll",
            "x": x,
            "y": y,
            "element_description": element_description,
            step_key: clicks,
            "holdKey": self._copy_hold_keys(holdKey)
        }
        action_details = f"Scrolled {direction} at coordinates ({x}, {y}) with {clicks} clicks, element: {element_description}"
        self._record_passive_memory("Scroll", action_details)
        return actionDict

    @agent_action
    def drag(
        self,
        starting_description: str,
        ending_description: str,
        holdKey: List[str] = [],
    ):
        """Build a ``Drag`` action from rescaled ``coords1`` to ``coords2``.

        Returns:
            The action dict with keys ``type``, ``startX``, ``startY``,
            ``endX``, ``endY``, ``holdKey``, ``starting_description`` and
            ``ending_description``.
        """
        x1, y1 = self.resize_coordinates(self.coords1)  # type: ignore
        x2, y2 = self.resize_coordinates(self.coords2)  # type: ignore
        actionDict = {
            "type": "Drag",
            "startX": x1,
            "startY": y1,
            "endX": x2,
            "endY": y2,
            "holdKey": self._copy_hold_keys(holdKey),
            "starting_description": starting_description,
            "ending_description": ending_description
        }
        action_details = f"Dragged from ({x1}, {y1}) to ({x2}, {y2}), starting: {starting_description}, ending: {ending_description}"
        self._record_passive_memory("Drag", action_details)
        return actionDict

    @agent_action
    def type(
        self,
        text: str = "",
    ):
        """Build a ``TypeText`` action carrying ``text``."""
        actionDict = {
            "type": "TypeText",
            "text": text,
        }
        action_details = f"Typed text: {text}"
        self._record_passive_memory("TypeText", action_details)
        return actionDict

    @agent_action
    def hotkey(
        self,
        keys: List[str] = [],
        duration: int = 0,
    ):
        """Build a ``Hotkey`` action for ``keys``.

        Every key is converted to ``str``. ``duration`` is included in the
        action only when it lies in ``1..5000`` (the memory entry reports it
        as milliseconds).
        """
        keys = [f"{key}" for key in keys]
        actionDict = {
            "type": "Hotkey",
            "keys": keys,
        }
        action_details = f"Pressed hotkey combination: {', '.join(keys)}"
        if 1 <= duration <= 5000:
            actionDict["duration"] = duration
            action_details = f"Pressed hotkey combination: {', '.join(keys)} with duration {duration}ms"
        self._record_passive_memory("Hotkey", action_details)
        return actionDict

    @agent_action
    def wait(self, duration: int):
        """Build a ``Wait`` action (memory entry reports milliseconds)."""
        actionDict = {"type": "Wait", "duration": duration}
        action_details = f"Waited for {duration} milliseconds"
        self._record_passive_memory("Wait", action_details)
        return actionDict

    @agent_action
    def done(
        self,
        message: str = '',
    ):
        """Store ``message`` in ``returned_info`` and build a ``Done`` action."""
        self.returned_info = message
        actionDict = {"type": "Done", "message": message}
        return actionDict

    @agent_action
    def fail(
        self,
        message: str = '',
    ):
        """Build a ``Failed`` action carrying ``message``."""
        actionDict = {"type": "Failed", "message": message}
        return actionDict

    @agent_action
    def memorize(
        self,
        information: str,
        memory_type: str = "active",
    ):
        """Add ``information`` to the agent log and build a ``Memorize`` action.

        Args:
            information: Content of the log entry.
            memory_type: Value stored under the entry's ``"type"`` key.
        """
        self.global_state.add_agent_log({
            "type": memory_type,
            "content": information
        })
        actionDict = {
            "type": "Memorize",
            "information": information,
        }
        return actionDict

    @agent_action
    def passive_memorize(
        self,
        information: str,
    ):
        """Shortcut for :meth:`memorize` with ``memory_type="passive"``."""
        return self.memorize(information, memory_type="passive")

    @agent_action
    def user_takeover(
        self,
        message: str = '',
    ):
        """Set the running state to ``"stopped"`` and build ``UserTakeover``."""
        self.global_state.set_running_state("stopped")
        actionDict = {"type": "UserTakeover", "message": message}
        return actionDict
405
+
406
+
407
class FastGrounding(ACI):
    """Agent-computer interface whose actions take explicit coordinates.

    No grounding tool is called: callers pass coordinates directly, and they
    are rescaled from ``grounding_width``/``grounding_height`` to
    ``width``/``height`` before being placed in the returned action dict.
    Most actions also append a "passive" entry to the global state's agent
    log.
    """

    def __init__(
        self,
        Tools_dict: Dict,
        platform: str,
        width: int = 1920,
        height: int = 1080,
        grounding_width: int = 1920,
        grounding_height: int = 1080,
    ):
        """Store the configuration and fetch the global state.

        Args:
            Tools_dict: Tool configuration; stored as-is.
            platform: Platform identifier; stored as-is.
            width: Screen width used when rescaling coordinates.
            height: Screen height used when rescaling coordinates.
            grounding_width: Width of the coordinate space of incoming points.
            grounding_height: Height of the coordinate space of incoming
                points.
        """
        # Run the base initializer so ``self.notes`` exists on instances.
        super().__init__()
        self.platform = platform
        self.Tools_dict = Tools_dict
        self.width = width
        self.height = height
        self.grounding_width = grounding_width
        self.grounding_height = grounding_height
        self.global_state: GlobalState = Registry.get(
            "GlobalStateStore")  # type: ignore

    @staticmethod
    def _copy_hold_keys(hold_keys):
        """Return a shallow copy of a list argument; pass anything else through.

        Copying keeps the shared ``holdKey=[]`` default (and any caller-owned
        list) from being aliased into the returned action dict.
        """
        return list(hold_keys) if isinstance(hold_keys, list) else hold_keys

    def reset_screen_size(self, width: int, height: int):
        """Replace the screen size used when rescaling coordinates."""
        self.width = width
        self.height = height

    def resize_coordinates(self, coordinates: List[int]) -> List[int]:
        """Scale ``[x, y]`` from grounding resolution to screen resolution.

        Each axis is scaled independently and rounded to an integer.
        """
        return [
            round(coordinates[0] * self.width / self.grounding_width),
            round(coordinates[1] * self.height / self.grounding_height),
        ]

    def _record_passive_memory(self, action_type: str, action_details: str):
        """Append a "passive" agent-log entry describing an executed action."""
        memory_content = f"Hardware action `{action_type}` has been executed. Details: {action_details}"
        self.global_state.add_agent_log({
            "type": "passive",
            "content": memory_content
        })

    @agent_action
    def click(
        self,
        x: int,
        y: int,
        element_description: str = "",
        button: int = 1,
        holdKey: List[str] = [],
    ):
        """Build a ``Click`` action at the rescaled ``(x, y)``.

        Args:
            x: Horizontal coordinate in grounding resolution.
            y: Vertical coordinate in grounding resolution.
            element_description: Description of the target; when empty, a
                ``"Coordinates (x, y)"`` label with the rescaled values is
                used instead.
            button: Button value placed in the action unchanged.
            holdKey: Keys value placed in the action (lists are copied).

        Returns:
            The action dict with keys ``type``, ``x``, ``y``,
            ``element_description``, ``button`` and ``holdKey``.
        """
        x, y = self.resize_coordinates([x, y])
        label = element_description or f"Coordinates ({x}, {y})"
        actionDict = {
            "type": "Click",
            "x": x,
            "y": y,
            "element_description": label,
            "button": button,
            "holdKey": self._copy_hold_keys(holdKey)
        }
        action_details = f"Clicked at coordinates ({x}, {y}) with button {button}, element: {label}"
        self._record_passive_memory("Click", action_details)
        return actionDict

    @agent_action
    def doubleclick(
        self,
        x: int,
        y: int,
        element_description: str = "",
        button: int = 1,
        holdKey: List[str] = [],
    ):
        """Build a ``DoubleClick`` action at the rescaled ``(x, y)``.

        Parameters and returned keys mirror :meth:`click`.
        """
        x, y = self.resize_coordinates([x, y])
        label = element_description or f"Coordinates ({x}, {y})"
        actionDict = {
            "type": "DoubleClick",
            "x": x,
            "y": y,
            "element_description": label,
            "button": button,
            "holdKey": self._copy_hold_keys(holdKey)
        }
        action_details = f"Double clicked at coordinates ({x}, {y}) with button {button}, element: {label}"
        self._record_passive_memory("DoubleClick", action_details)
        return actionDict

    @agent_action
    def move(
        self,
        x: int,
        y: int,
        element_description: str = "",
        holdKey: List[str] = [],
    ):
        """Build a ``Move`` action to the rescaled ``(x, y)``.

        Returns:
            The action dict with keys ``type``, ``x``, ``y``,
            ``element_description`` and ``holdKey``.
        """
        x, y = self.resize_coordinates([x, y])
        label = element_description or f"Coordinates ({x}, {y})"
        actionDict = {
            "type": "Move",
            "x": x,
            "y": y,
            "element_description": label,
            "holdKey": self._copy_hold_keys(holdKey)
        }
        action_details = f"Moved to coordinates ({x}, {y}), element: {label}"
        self._record_passive_memory("Move", action_details)
        return actionDict

    @agent_action
    def scroll(
        self,
        x: int,
        y: int,
        clicks: int,
        element_description: str = "",
        vertical: bool = True,
        holdKey: List[str] = [],
    ):
        """Build a ``Scroll`` action at the rescaled ``(x, y)``.

        Args:
            x: Horizontal coordinate in grounding resolution.
            y: Vertical coordinate in grounding resolution.
            clicks: Scroll amount, stored under ``stepVertical`` or
                ``stepHorizontal``.
            element_description: Description of the target; falls back to a
                ``"Coordinates (x, y)"`` label when empty.
            vertical: Selects which of the two step keys is used.
            holdKey: Keys value placed in the action (lists are copied).

        Returns:
            The action dict.
        """
        x, y = self.resize_coordinates([x, y])
        label = element_description or f"Coordinates ({x}, {y})"
        step_key = "stepVertical" if vertical else "stepHorizontal"
        direction = "vertically" if vertical else "horizontally"
        actionDict = {
            "type": "Scroll",
            "x": x,
            "y": y,
            "element_description": label,
            step_key: clicks,
            "holdKey": self._copy_hold_keys(holdKey)
        }
        action_details = f"Scrolled {direction} at coordinates ({x}, {y}) with {clicks} clicks, element: {label}"
        self._record_passive_memory("Scroll", action_details)
        return actionDict

    @agent_action
    def drag(
        self,
        startX: int,
        startY: int,
        endX: int,
        endY: int,
        starting_description: str = "",
        ending_description: str = "",
        holdKey: List[str] = [],
    ):
        """Build a ``Drag`` action between two rescaled points.

        Empty descriptions fall back to ``"Coordinates (x, y)"`` labels built
        from the rescaled start/end points.

        Returns:
            The action dict with keys ``type``, ``startX``, ``startY``,
            ``endX``, ``endY``, ``holdKey``, ``starting_description`` and
            ``ending_description``.
        """
        startX, startY = self.resize_coordinates([startX, startY])
        endX, endY = self.resize_coordinates([endX, endY])
        start_label = starting_description or f"Coordinates ({startX}, {startY})"
        end_label = ending_description or f"Coordinates ({endX}, {endY})"
        actionDict = {
            "type": "Drag",
            "startX": startX,
            "startY": startY,
            "endX": endX,
            "endY": endY,
            "holdKey": self._copy_hold_keys(holdKey),
            "starting_description": start_label,
            "ending_description": end_label
        }
        action_details = f"Dragged from ({startX}, {startY}) to ({endX}, {endY}), starting: {start_label}, ending: {end_label}"
        self._record_passive_memory("Drag", action_details)
        return actionDict

    @agent_action
    def type(
        self,
        text: str = "",
    ):
        """Build a ``TypeText`` action carrying ``text``."""
        actionDict = {
            "type": "TypeText",
            "text": text,
        }
        action_details = f"Typed text: {text}"
        self._record_passive_memory("TypeText", action_details)
        return actionDict

    @agent_action
    def hotkey(
        self,
        keys: List[str] = [],
        duration: int = 0,
    ):
        """Build a ``Hotkey`` action for ``keys``.

        Every key is converted to ``str``. ``duration`` is included in the
        action only when it lies in ``1..5000`` (the memory entry reports it
        as milliseconds).
        """
        keys = [f"{key}" for key in keys]
        actionDict = {
            "type": "Hotkey",
            "keys": keys,
        }
        action_details = f"Pressed hotkey combination: {', '.join(keys)}"
        if 1 <= duration <= 5000:
            actionDict["duration"] = duration
            action_details = f"Pressed hotkey combination: {', '.join(keys)} with duration {duration}ms"
        self._record_passive_memory("Hotkey", action_details)
        return actionDict

    @agent_action
    def wait(self, duration: int):
        """Build a ``Wait`` action (memory entry reports milliseconds)."""
        actionDict = {"type": "Wait", "duration": duration}
        action_details = f"Waited for {duration} milliseconds"
        self._record_passive_memory("Wait", action_details)
        return actionDict

    @agent_action
    def done(
        self,
        message: str = '',
    ):
        """Store ``message`` in ``returned_info`` and build a ``Done`` action."""
        self.returned_info = message
        actionDict = {"type": "Done", "message": message}
        return actionDict

    @agent_action
    def fail(
        self,
        message: str = '',
    ):
        """Build a ``Failed`` action carrying ``message``."""
        actionDict = {"type": "Failed", "message": message}
        return actionDict

    @agent_action
    def memorize(
        self,
        information: str,
    ):
        """Add an "active" agent-log entry and build a ``Memorize`` action."""
        self.global_state.add_agent_log({
            "type": "active",
            "content": information
        })
        actionDict = {
            "type": "Memorize",
            "information": information,
        }
        return actionDict

    @agent_action
    def user_takeover(
        self,
        message: str = '',
    ):
        """Set the running state to ``"stopped"`` and build ``UserTakeover``."""
        self.global_state.set_running_state("stopped")
        actionDict = {"type": "UserTakeover", "message": message}
        return actionDict