lybic-guiagents 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of lybic-guiagents might be problematic. Click here for more details.
- desktop_env/__init__.py +1 -0
- desktop_env/actions.py +203 -0
- desktop_env/controllers/__init__.py +0 -0
- desktop_env/controllers/python.py +471 -0
- desktop_env/controllers/setup.py +882 -0
- desktop_env/desktop_env.py +509 -0
- desktop_env/evaluators/__init__.py +5 -0
- desktop_env/evaluators/getters/__init__.py +41 -0
- desktop_env/evaluators/getters/calc.py +15 -0
- desktop_env/evaluators/getters/chrome.py +1774 -0
- desktop_env/evaluators/getters/file.py +154 -0
- desktop_env/evaluators/getters/general.py +42 -0
- desktop_env/evaluators/getters/gimp.py +38 -0
- desktop_env/evaluators/getters/impress.py +126 -0
- desktop_env/evaluators/getters/info.py +24 -0
- desktop_env/evaluators/getters/misc.py +406 -0
- desktop_env/evaluators/getters/replay.py +20 -0
- desktop_env/evaluators/getters/vlc.py +86 -0
- desktop_env/evaluators/getters/vscode.py +35 -0
- desktop_env/evaluators/metrics/__init__.py +160 -0
- desktop_env/evaluators/metrics/basic_os.py +68 -0
- desktop_env/evaluators/metrics/chrome.py +493 -0
- desktop_env/evaluators/metrics/docs.py +1011 -0
- desktop_env/evaluators/metrics/general.py +665 -0
- desktop_env/evaluators/metrics/gimp.py +637 -0
- desktop_env/evaluators/metrics/libreoffice.py +28 -0
- desktop_env/evaluators/metrics/others.py +92 -0
- desktop_env/evaluators/metrics/pdf.py +31 -0
- desktop_env/evaluators/metrics/slides.py +957 -0
- desktop_env/evaluators/metrics/table.py +585 -0
- desktop_env/evaluators/metrics/thunderbird.py +176 -0
- desktop_env/evaluators/metrics/utils.py +719 -0
- desktop_env/evaluators/metrics/vlc.py +524 -0
- desktop_env/evaluators/metrics/vscode.py +283 -0
- desktop_env/providers/__init__.py +35 -0
- desktop_env/providers/aws/__init__.py +0 -0
- desktop_env/providers/aws/manager.py +278 -0
- desktop_env/providers/aws/provider.py +186 -0
- desktop_env/providers/aws/provider_with_proxy.py +315 -0
- desktop_env/providers/aws/proxy_pool.py +193 -0
- desktop_env/providers/azure/__init__.py +0 -0
- desktop_env/providers/azure/manager.py +87 -0
- desktop_env/providers/azure/provider.py +207 -0
- desktop_env/providers/base.py +97 -0
- desktop_env/providers/gcp/__init__.py +0 -0
- desktop_env/providers/gcp/manager.py +0 -0
- desktop_env/providers/gcp/provider.py +0 -0
- desktop_env/providers/virtualbox/__init__.py +0 -0
- desktop_env/providers/virtualbox/manager.py +463 -0
- desktop_env/providers/virtualbox/provider.py +124 -0
- desktop_env/providers/vmware/__init__.py +0 -0
- desktop_env/providers/vmware/manager.py +455 -0
- desktop_env/providers/vmware/provider.py +105 -0
- gui_agents/__init__.py +0 -0
- gui_agents/agents/Action.py +209 -0
- gui_agents/agents/__init__.py +0 -0
- gui_agents/agents/agent_s.py +832 -0
- gui_agents/agents/global_state.py +610 -0
- gui_agents/agents/grounding.py +651 -0
- gui_agents/agents/hardware_interface.py +129 -0
- gui_agents/agents/manager.py +568 -0
- gui_agents/agents/translator.py +132 -0
- gui_agents/agents/worker.py +355 -0
- gui_agents/cli_app.py +560 -0
- gui_agents/core/__init__.py +0 -0
- gui_agents/core/engine.py +1496 -0
- gui_agents/core/knowledge.py +449 -0
- gui_agents/core/mllm.py +555 -0
- gui_agents/tools/__init__.py +0 -0
- gui_agents/tools/tools.py +727 -0
- gui_agents/unit_test/__init__.py +0 -0
- gui_agents/unit_test/run_tests.py +65 -0
- gui_agents/unit_test/test_manager.py +330 -0
- gui_agents/unit_test/test_worker.py +269 -0
- gui_agents/utils/__init__.py +0 -0
- gui_agents/utils/analyze_display.py +301 -0
- gui_agents/utils/common_utils.py +263 -0
- gui_agents/utils/display_viewer.py +281 -0
- gui_agents/utils/embedding_manager.py +53 -0
- gui_agents/utils/image_axis_utils.py +27 -0
- lybic_guiagents-0.1.0.dist-info/METADATA +416 -0
- lybic_guiagents-0.1.0.dist-info/RECORD +85 -0
- lybic_guiagents-0.1.0.dist-info/WHEEL +5 -0
- lybic_guiagents-0.1.0.dist-info/licenses/LICENSE +201 -0
- lybic_guiagents-0.1.0.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,651 @@
|
|
|
1
|
+
import ast
|
|
2
|
+
import re
|
|
3
|
+
import logging
|
|
4
|
+
from collections import defaultdict
|
|
5
|
+
from io import BytesIO
|
|
6
|
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
|
7
|
+
import time
|
|
8
|
+
import pytesseract
|
|
9
|
+
from PIL import Image
|
|
10
|
+
from pytesseract import Output
|
|
11
|
+
|
|
12
|
+
from gui_agents.tools.tools import Tools
|
|
13
|
+
from gui_agents.utils.common_utils import parse_single_code_from_string
|
|
14
|
+
from gui_agents.store.registry import Registry
|
|
15
|
+
from gui_agents.agents.global_state import GlobalState
|
|
16
|
+
|
|
17
|
+
logger = logging.getLogger("desktopenv.agent")
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class ACI:
    """Base class for agent-computer interfaces.

    The only state it owns is ``notes``, a list of strings that starts empty.
    """

    def __init__(self):
        """Create the interface with an empty, per-instance ``notes`` list."""
        # A fresh list per instance so notes are never shared between objects.
        self.notes: List[str] = list()
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def agent_action(func):
    """Mark ``func`` as an agent action and hand it back unchanged.

    The decorator does not wrap the callable; it only attaches the boolean
    attribute ``is_agent_action = True`` to it and returns the same object.
    """
    setattr(func, "is_agent_action", True)
    return func
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class Grounding(ACI):
    """Agent-computer interface that grounds descriptions to screen points.

    ``assign_coordinates`` parses a plan's grounded action, asks the
    registered "grounding" tool for the pixel location of each described
    element and stores the result in ``coords1`` / ``coords2``.  The
    ``@agent_action`` methods then turn those stored coordinates (rescaled
    from the grounding model's resolution to the screen resolution) into
    plain action dictionaries and log a "passive" memory entry for each
    hardware action.

    NOTE(review): if another module builds prompts from the signatures or
    docstrings of the ``@agent_action`` methods, re-check those prompts
    after applying this change -- that usage is not visible from this file.
    """

    # Actions whose first argument is a single element description.
    _SINGLE_POINT_ACTIONS = (
        "agent.click",
        "agent.doubleclick",
        "agent.move",
        "agent.scroll",
    )

    def __init__(
        self,
        Tools_dict: Dict,
        platform: str,
        width: int = 1920,
        height: int = 1080,
    ):
        """Register the grounding and text-span tools and fetch global state.

        Args:
            Tools_dict: Tool configuration; must contain "grounding" and
                "text_span" entries, each with "provider" and "model" keys.
            platform: Platform identifier, stored as-is.
            width: Screen width used when rescaling coordinates.
            height: Screen height used when rescaling coordinates.
        """
        # Chain to ACI so that ``self.notes`` exists on every instance.
        super().__init__()
        self.platform = platform
        self.Tools_dict = Tools_dict
        self.width = width
        self.height = height
        self.coords1 = None
        self.coords2 = None

        self.grounding_model = Tools()
        grounding_cfg = self.Tools_dict["grounding"]
        self.grounding_model.register_tool(
            "grounding", grounding_cfg["provider"], grounding_cfg["model"])

        self.grounding_width, self.grounding_height = self.grounding_model.tools[
            "grounding"].get_grounding_wh()
        # Fall back to the screen size when the tool reports no resolution.
        if self.grounding_width is None or self.grounding_height is None:
            self.grounding_width = self.width
            self.grounding_height = self.height

        self.text_span_agent = Tools()
        text_span_cfg = self.Tools_dict["text_span"]
        self.text_span_agent.register_tool(
            "text_span", text_span_cfg["provider"], text_span_cfg["model"])

        self.global_state: GlobalState = Registry.get(
            "GlobalStateStore")  # type: ignore

    @staticmethod
    def _hold_keys(holdKey: Optional[List[str]]) -> List[str]:
        """Return ``holdKey`` unchanged, or a fresh empty list when it is None.

        Using ``None`` as the default avoids sharing one mutable default list
        between every emitted action dictionary.
        """
        return [] if holdKey is None else holdKey

    def generate_coords(self, ref_expr: str, obs: Dict) -> List[int]:
        """Ask the grounding tool where ``ref_expr`` is on the screenshot.

        Args:
            ref_expr: Description of the element/area to locate.
            obs: Observation dict; ``obs["screenshot"]`` is sent to the tool.

        Returns:
            ``[x, y]`` built from the first two integers in the tool response
            (in the grounding model's coordinate space).

        Raises:
            ValueError: If the response contains fewer than two integers.
                (Previously an ``assert``, which is skipped under ``-O``.)
        """
        grounding_start_time = time.time()
        self.grounding_model.tools["grounding"].llm_agent.reset()
        prompt = (
            "Task: Visual Grounding - Locate and return coordinates\n"
            f" Query:{ref_expr}\n"
            " Instructions: 1. Carefully analyze the provided screenshot image \n"
            " 2. Locate the EXACT element/area described in the query \n"
            " 3. Return ONLY the pixel coordinates [x, y] of one representative point within the target area \n"
            " 4. Choose a point that is clearly inside the described element/region \n"
            " 5. Coordinates must be integers representing pixel positions on the image \n"
            " 6. If the described element has multiple instances, select the most prominent or central one 7. - If this appears to be for dragging (selecting text, moving items, etc.): * For START points: Position slightly to the LEFT of text/content in empty space * For END points: Position slightly to the RIGHT of text/content in empty space * Avoid placing coordinates directly ON text characters to prevent text selection issues * Keep offset minimal (3-5 pixels) - don't go too far from the target area * Still return only ONE coordinate as requested \n"
            " Output Format: Return only two integers separated by comma, like: (900, 400)\n"
            " Important Notes: - Focus on the main descriptive elements in the query (colors, positions, objects) - Ignore any additional context that doesn't help locate the target - The returned point should be clickable/actionable within the target area \n"
            " CRITICAL REQUIREMENTS: - MUST return exactly ONE coordinate pair under ALL circumstances - NO explanations, NO multiple coordinates, NO additional text \n"
        )
        response, total_tokens, cost_string = self.grounding_model.execute_tool(
            "grounding", {
                "str_input": prompt,
                "img_input": obs["screenshot"]
            })
        logger.info(
            f"Grounding model tokens: {total_tokens}, cost: {cost_string}")
        grounding_duration = time.time() - grounding_start_time
        logger.info(
            f"Grounding model execution time: {grounding_duration:.2f} seconds")
        logger.info(f"RAW GROUNDING MODEL RESPONSE: {response}")
        self.global_state.log_operation(module="grounding",
                                        operation="grounding_model_response",
                                        data={
                                            "tokens": total_tokens,
                                            "cost": cost_string,
                                            "content": response,
                                            "duration": grounding_duration
                                        })
        numericals = re.findall(r"\d+", response)
        if len(numericals) < 2:
            raise ValueError(
                "Grounding model response does not contain two integer "
                f"coordinates: {response!r}")
        return [int(numericals[0]), int(numericals[1])]

    def assign_coordinates(self, plan: str, obs: Dict):
        """Parse the grounded action in ``plan`` and ground its descriptions.

        Resets ``coords1``/``coords2`` to ``None``, then fills ``coords1`` for
        click/doubleclick/move/scroll and both for drag.  Other actions leave
        the coordinates as ``None``.

        Raises:
            RuntimeError: If the grounded action cannot be parsed.
        """
        self.coords1, self.coords2 = None, None
        try:
            action = parse_single_code_from_string(
                plan.split("Grounded Action")[-1])
            function_name = re.match(r"(\w+\.\w+)\(",
                                     action).group(1)  # type: ignore
            args = self.parse_function_args(action)
        except Exception as e:
            raise RuntimeError(f"Error in parsing grounded action: {e}") from e

        if (function_name in self._SINGLE_POINT_ACTIONS and len(args) >= 1
                and args[0] is not None):
            self.coords1 = self.generate_coords(args[0], obs)
        elif function_name == "agent.drag" and len(args) >= 2:
            self.coords1 = self.generate_coords(args[0], obs)
            self.coords2 = self.generate_coords(args[1], obs)

    def reset_screen_size(self, width: int, height: int):
        """Update the screen size used when rescaling coordinates."""
        self.width = width
        self.height = height

    def resize_coordinates(self, coordinates: List[int]) -> List[int]:
        """Scale ``[x, y]`` from grounding resolution to screen resolution.

        Raises:
            TypeError: If ``coordinates`` is ``None`` (no coordinates were
                assigned for the current action).
        """
        if coordinates is None:
            # Same exception type as the original subscript failure, but with
            # a message that points at the actual cause.
            raise TypeError(
                "coordinates is None; assign_coordinates() did not produce "
                "coordinates for this action")
        return [
            round(coordinates[0] * self.width / self.grounding_width),
            round(coordinates[1] * self.height / self.grounding_height),
        ]

    def resize_coordinates_with_padding(self,
                                        coordinates: List[int]) -> List[int]:
        """Rescale ``[x, y]`` using the longer side, then remove the padding.

        Both axes are scaled by ``max(width, height) / max(grounding_width,
        grounding_height)``; half of the difference between that longer side
        and each screen dimension is then subtracted as left/top padding.
        """
        grounding_size = max(self.grounding_width, self.grounding_height)
        original_size = max(self.width, self.height)
        scaled_x = round(coordinates[0] * original_size / grounding_size)
        scaled_y = round(coordinates[1] * original_size / grounding_size)
        padding_left = round((original_size - self.width) / 2)
        padding_top = round((original_size - self.height) / 2)
        return [scaled_x - padding_left, scaled_y - padding_top]

    def parse_function_args(self, function: str) -> List[Any]:
        """Extract the arguments relevant for grounding from a call string.

        A regex fast path handles ``name.method("text")`` and
        ``name.method("text", 123)``.  Anything else is parsed with ``ast``:
        constants are returned as Python values, other expressions as source
        text.  Positional arguments come first, followed by keyword arguments
        whose name contains "description" (a call's positional arguments
        always precede its keywords, so this keeps e.g. a drag's start before
        its end).

        Returns:
            The extracted arguments, or ``[]`` when ``function`` is empty, not
            a string, unparsable, or not a call expression.
        """
        if not function or not isinstance(function, str):
            return []

        pattern = r'(\w+\.\w+)\((?:"([^"]*)")?(?:,\s*(\d+))?\)'
        match = re.match(pattern, function)
        if match:
            args: List[Any] = []
            if match.group(2) is not None:
                args.append(match.group(2))
            if match.group(3) is not None:
                args.append(int(match.group(3)))
            if args:
                return args

        try:
            tree = ast.parse(function)
        except Exception:
            return []
        if not tree.body:
            return []
        call_node = getattr(tree.body[0], "value", None)
        if not isinstance(call_node, ast.Call):
            return []

        def safe_eval(node):
            # Literals become Python values; everything else is kept as text.
            # (On Python 3.8+ every literal is an ast.Constant, so the old
            # ast.Str branch was unreachable.)
            if isinstance(node, ast.Constant):
                return node.value
            try:
                return ast.unparse(node)
            except Exception:
                return str(node)

        res: List[Any] = [safe_eval(arg) for arg in call_node.args]
        for kw in call_node.keywords:
            # kw.arg is None for ``**mapping`` style arguments.
            if kw.arg and "description" in kw.arg:
                res.append(safe_eval(kw.value))
        return res

    def _record_passive_memory(self, action_type: str, action_details: str):
        """Append a "passive" agent-log entry describing an executed action."""
        memory_content = f"Hardware action `{action_type}` has been executed. Details: {action_details}"
        self.global_state.add_agent_log({
            "type": "passive",
            "content": memory_content
        })

    @agent_action
    def click(
        self,
        element_description: str,
        button: int = 1,
        holdKey: Optional[List[str]] = None,
    ):
        """Build a "Click" action at the grounded ``coords1`` location.

        Args:
            element_description: Description of the clicked element.
            button: Button value forwarded unchanged in the action.
            holdKey: Value forwarded as "holdKey"; defaults to a new ``[]``.
        """
        x, y = self.resize_coordinates(self.coords1)  # type: ignore
        actionDict = {
            "type": "Click",
            "x": x,
            "y": y,
            "element_description": element_description,
            "button": button,
            "holdKey": self._hold_keys(holdKey)
        }
        action_details = f"Clicked at coordinates ({x}, {y}) with button {button}, element: {element_description}"
        self._record_passive_memory("Click", action_details)
        return actionDict

    @agent_action
    def doubleclick(
        self,
        element_description: str,
        button: int = 1,
        holdKey: Optional[List[str]] = None,
    ):
        """Build a "DoubleClick" action at the grounded ``coords1`` location."""
        x, y = self.resize_coordinates(self.coords1)  # type: ignore
        actionDict = {
            "type": "DoubleClick",
            "x": x,
            "y": y,
            "element_description": element_description,
            "button": button,
            "holdKey": self._hold_keys(holdKey)
        }
        action_details = f"Double clicked at coordinates ({x}, {y}) with button {button}, element: {element_description}"
        self._record_passive_memory("DoubleClick", action_details)
        return actionDict

    @agent_action
    def move(
        self,
        element_description: str,
        holdKey: Optional[List[str]] = None,
    ):
        """Build a "Move" action to the grounded ``coords1`` location."""
        x, y = self.resize_coordinates(self.coords1)  # type: ignore
        actionDict = {
            "type": "Move",
            "x": x,
            "y": y,
            "element_description": element_description,
            "holdKey": self._hold_keys(holdKey)
        }
        action_details = f"Moved to coordinates ({x}, {y}), element: {element_description}"
        self._record_passive_memory("Move", action_details)
        return actionDict

    @agent_action
    def scroll(
        self,
        element_description: str,
        clicks: int,
        vertical: bool = True,
        holdKey: Optional[List[str]] = None,
    ):
        """Build a "Scroll" action at the grounded ``coords1`` location.

        ``clicks`` is emitted under "stepVertical" when ``vertical`` is truthy
        and under "stepHorizontal" otherwise.
        """
        x, y = self.resize_coordinates(self.coords1)  # type: ignore
        if vertical:
            step_key, direction = "stepVertical", "vertically"
        else:
            step_key, direction = "stepHorizontal", "horizontally"
        actionDict = {
            "type": "Scroll",
            "x": x,
            "y": y,
            "element_description": element_description,
            step_key: clicks,
            "holdKey": self._hold_keys(holdKey)
        }
        action_details = f"Scrolled {direction} at coordinates ({x}, {y}) with {clicks} clicks, element: {element_description}"
        self._record_passive_memory("Scroll", action_details)
        return actionDict

    @agent_action
    def drag(
        self,
        starting_description: str,
        ending_description: str,
        holdKey: Optional[List[str]] = None,
    ):
        """Build a "Drag" action from grounded ``coords1`` to ``coords2``."""
        x1, y1 = self.resize_coordinates(self.coords1)  # type: ignore
        x2, y2 = self.resize_coordinates(self.coords2)  # type: ignore
        actionDict = {
            "type": "Drag",
            "startX": x1,
            "startY": y1,
            "endX": x2,
            "endY": y2,
            "holdKey": self._hold_keys(holdKey),
            "starting_description": starting_description,
            "ending_description": ending_description
        }
        action_details = f"Dragged from ({x1}, {y1}) to ({x2}, {y2}), starting: {starting_description}, ending: {ending_description}"
        self._record_passive_memory("Drag", action_details)
        return actionDict

    @agent_action
    def type(
        self,
        text: str = "",
    ):
        """Build a "TypeText" action carrying ``text``."""
        actionDict = {
            "type": "TypeText",
            "text": text,
        }
        action_details = f"Typed text: {text}"
        self._record_passive_memory("TypeText", action_details)
        return actionDict

    @agent_action
    def hotkey(
        self,
        keys: Optional[List[str]] = None,
        duration: int = 0,
    ):
        """Build a "Hotkey" action for ``keys``.

        Each key is converted to its string form.  "duration" is included in
        the action only when it lies in ``1..5000`` (logged as milliseconds).
        """
        keys = [f"{key}" for key in (keys or [])]
        actionDict = {
            "type": "Hotkey",
            "keys": keys,
        }
        action_details = f"Pressed hotkey combination: {', '.join(keys)}"
        if 1 <= duration <= 5000:
            actionDict["duration"] = duration
            action_details += f" with duration {duration}ms"
        self._record_passive_memory("Hotkey", action_details)
        return actionDict

    @agent_action
    def wait(self, duration: int):
        """Build a "Wait" action; ``duration`` is logged as milliseconds."""
        actionDict = {"type": "Wait", "duration": duration}
        action_details = f"Waited for {duration} milliseconds"
        self._record_passive_memory("Wait", action_details)
        return actionDict

    @agent_action
    def done(
        self,
        message: str = '',
    ):
        """Build a "Done" action and remember ``message`` in ``returned_info``."""
        self.returned_info = message
        actionDict = {"type": "Done", "message": message}
        return actionDict

    @agent_action
    def fail(
        self,
        message: str = '',
    ):
        """Build a "Failed" action carrying ``message``."""
        actionDict = {"type": "Failed", "message": message}
        return actionDict

    @agent_action
    def memorize(
        self,
        information: str,
        memory_type: str = "active",
    ):
        """Log ``information`` to the agent log and build a "Memorize" action.

        Args:
            information: Content stored in the agent log.
            memory_type: Value written as the log entry's "type".
        """
        self.global_state.add_agent_log({
            "type": memory_type,
            "content": information
        })
        actionDict = {
            "type": "Memorize",
            "information": information,
        }
        return actionDict

    @agent_action
    def passive_memorize(
        self,
        information: str,
    ):
        """Shortcut for ``memorize(information, memory_type="passive")``."""
        return self.memorize(information, memory_type="passive")

    @agent_action
    def user_takeover(
        self,
        message: str = '',
    ):
        """Set the running state to "stopped" and build a "UserTakeover" action."""
        self.global_state.set_running_state("stopped")
        actionDict = {"type": "UserTakeover", "message": message}
        return actionDict
|
|
405
|
+
|
|
406
|
+
|
|
407
|
+
class FastGrounding(ACI):
    """Agent-computer interface whose actions take explicit coordinates.

    Unlike a description-based interface, every pointer action receives its
    ``x``/``y`` values directly.  The values are rescaled from the grounding
    resolution to the screen resolution, turned into a plain action
    dictionary, and logged as a "passive" agent-log entry.

    NOTE(review): if another module builds prompts from the signatures or
    docstrings of the ``@agent_action`` methods, re-check those prompts
    after applying this change -- that usage is not visible from this file.
    """

    def __init__(
        self,
        Tools_dict: Dict,
        platform: str,
        width: int = 1920,
        height: int = 1080,
        grounding_width: int = 1920,
        grounding_height: int = 1080,
    ):
        """Store the configuration and fetch the shared global state.

        Args:
            Tools_dict: Tool configuration, stored as-is.
            platform: Platform identifier, stored as-is.
            width: Screen width used when rescaling coordinates.
            height: Screen height used when rescaling coordinates.
            grounding_width: Width of the incoming coordinate space.
            grounding_height: Height of the incoming coordinate space.
        """
        # Chain to ACI so that ``self.notes`` exists on every instance.
        super().__init__()
        self.platform = platform
        self.Tools_dict = Tools_dict
        self.width = width
        self.height = height
        self.grounding_width = grounding_width
        self.grounding_height = grounding_height
        self.global_state: GlobalState = Registry.get(
            "GlobalStateStore")  # type: ignore

    @staticmethod
    def _hold_keys(holdKey: Optional[List[str]]) -> List[str]:
        """Return ``holdKey`` unchanged, or a fresh empty list when it is None.

        Using ``None`` as the default avoids sharing one mutable default list
        between every emitted action dictionary.
        """
        return [] if holdKey is None else holdKey

    def reset_screen_size(self, width: int, height: int):
        """Update the screen size used when rescaling coordinates."""
        self.width = width
        self.height = height

    def resize_coordinates(self, coordinates: List[int]) -> List[int]:
        """Scale ``[x, y]`` from grounding resolution to screen resolution."""
        return [
            round(coordinates[0] * self.width / self.grounding_width),
            round(coordinates[1] * self.height / self.grounding_height),
        ]

    def _record_passive_memory(self, action_type: str, action_details: str):
        """Append a "passive" agent-log entry describing an executed action."""
        memory_content = f"Hardware action `{action_type}` has been executed. Details: {action_details}"
        self.global_state.add_agent_log({
            "type": "passive",
            "content": memory_content
        })

    @agent_action
    def click(
        self,
        x: int,
        y: int,
        element_description: str = "",
        button: int = 1,
        holdKey: Optional[List[str]] = None,
    ):
        """Build a "Click" action at the rescaled ``(x, y)``.

        When ``element_description`` is empty, "Coordinates (x, y)" (with the
        rescaled values) is used in its place.
        """
        x, y = self.resize_coordinates([x, y])
        label = element_description or f"Coordinates ({x}, {y})"
        actionDict = {
            "type": "Click",
            "x": x,
            "y": y,
            "element_description": label,
            "button": button,
            "holdKey": self._hold_keys(holdKey)
        }
        action_details = f"Clicked at coordinates ({x}, {y}) with button {button}, element: {label}"
        self._record_passive_memory("Click", action_details)
        return actionDict

    @agent_action
    def doubleclick(
        self,
        x: int,
        y: int,
        element_description: str = "",
        button: int = 1,
        holdKey: Optional[List[str]] = None,
    ):
        """Build a "DoubleClick" action at the rescaled ``(x, y)``."""
        x, y = self.resize_coordinates([x, y])
        label = element_description or f"Coordinates ({x}, {y})"
        actionDict = {
            "type": "DoubleClick",
            "x": x,
            "y": y,
            "element_description": label,
            "button": button,
            "holdKey": self._hold_keys(holdKey)
        }
        action_details = f"Double clicked at coordinates ({x}, {y}) with button {button}, element: {label}"
        self._record_passive_memory("DoubleClick", action_details)
        return actionDict

    @agent_action
    def move(
        self,
        x: int,
        y: int,
        element_description: str = "",
        holdKey: Optional[List[str]] = None,
    ):
        """Build a "Move" action to the rescaled ``(x, y)``."""
        x, y = self.resize_coordinates([x, y])
        label = element_description or f"Coordinates ({x}, {y})"
        actionDict = {
            "type": "Move",
            "x": x,
            "y": y,
            "element_description": label,
            "holdKey": self._hold_keys(holdKey)
        }
        action_details = f"Moved to coordinates ({x}, {y}), element: {label}"
        self._record_passive_memory("Move", action_details)
        return actionDict

    @agent_action
    def scroll(
        self,
        x: int,
        y: int,
        clicks: int,
        element_description: str = "",
        vertical: bool = True,
        holdKey: Optional[List[str]] = None,
    ):
        """Build a "Scroll" action at the rescaled ``(x, y)``.

        ``clicks`` is emitted under "stepVertical" when ``vertical`` is truthy
        and under "stepHorizontal" otherwise.
        """
        x, y = self.resize_coordinates([x, y])
        label = element_description or f"Coordinates ({x}, {y})"
        if vertical:
            step_key, direction = "stepVertical", "vertically"
        else:
            step_key, direction = "stepHorizontal", "horizontally"
        actionDict = {
            "type": "Scroll",
            "x": x,
            "y": y,
            "element_description": label,
            step_key: clicks,
            "holdKey": self._hold_keys(holdKey)
        }
        action_details = f"Scrolled {direction} at coordinates ({x}, {y}) with {clicks} clicks, element: {label}"
        self._record_passive_memory("Scroll", action_details)
        return actionDict

    @agent_action
    def drag(
        self,
        startX: int,
        startY: int,
        endX: int,
        endY: int,
        starting_description: str = "",
        ending_description: str = "",
        holdKey: Optional[List[str]] = None,
    ):
        """Build a "Drag" action between two rescaled points.

        Empty descriptions fall back to "Coordinates (x, y)" using the
        rescaled start/end values.
        """
        startX, startY = self.resize_coordinates([startX, startY])
        endX, endY = self.resize_coordinates([endX, endY])
        start_label = starting_description or f"Coordinates ({startX}, {startY})"
        end_label = ending_description or f"Coordinates ({endX}, {endY})"
        actionDict = {
            "type": "Drag",
            "startX": startX,
            "startY": startY,
            "endX": endX,
            "endY": endY,
            "holdKey": self._hold_keys(holdKey),
            "starting_description": start_label,
            "ending_description": end_label
        }
        action_details = f"Dragged from ({startX}, {startY}) to ({endX}, {endY}), starting: {start_label}, ending: {end_label}"
        self._record_passive_memory("Drag", action_details)
        return actionDict

    @agent_action
    def type(
        self,
        text: str = "",
    ):
        """Build a "TypeText" action carrying ``text``."""
        actionDict = {
            "type": "TypeText",
            "text": text,
        }
        action_details = f"Typed text: {text}"
        self._record_passive_memory("TypeText", action_details)
        return actionDict

    @agent_action
    def hotkey(
        self,
        keys: Optional[List[str]] = None,
        duration: int = 0,
    ):
        """Build a "Hotkey" action for ``keys``.

        Each key is converted to its string form.  "duration" is included in
        the action only when it lies in ``1..5000`` (logged as milliseconds).
        """
        keys = [f"{key}" for key in (keys or [])]
        actionDict = {
            "type": "Hotkey",
            "keys": keys,
        }
        action_details = f"Pressed hotkey combination: {', '.join(keys)}"
        if 1 <= duration <= 5000:
            actionDict["duration"] = duration
            action_details += f" with duration {duration}ms"
        self._record_passive_memory("Hotkey", action_details)
        return actionDict

    @agent_action
    def wait(self, duration: int):
        """Build a "Wait" action; ``duration`` is logged as milliseconds."""
        actionDict = {"type": "Wait", "duration": duration}
        action_details = f"Waited for {duration} milliseconds"
        self._record_passive_memory("Wait", action_details)
        return actionDict

    @agent_action
    def done(
        self,
        message: str = '',
    ):
        """Build a "Done" action and remember ``message`` in ``returned_info``."""
        self.returned_info = message
        actionDict = {"type": "Done", "message": message}
        return actionDict

    @agent_action
    def fail(
        self,
        message: str = '',
    ):
        """Build a "Failed" action carrying ``message``."""
        actionDict = {"type": "Failed", "message": message}
        return actionDict

    @agent_action
    def memorize(
        self,
        information: str,
    ):
        """Log ``information`` as an "active" entry and build a "Memorize" action."""
        self.global_state.add_agent_log({
            "type": "active",
            "content": information
        })
        actionDict = {
            "type": "Memorize",
            "information": information,
        }
        return actionDict

    @agent_action
    def user_takeover(
        self,
        message: str = '',
    ):
        """Set the running state to "stopped" and build a "UserTakeover" action."""
        self.global_state.set_running_state("stopped")
        actionDict = {"type": "UserTakeover", "message": message}
        return actionDict
|